├── .gitignore
├── .gitmodules
├── .travis.yml
├── COPYING
├── Dockerfile
├── Pipfile
├── Pipfile.lock
├── README.md
├── align.py
├── examples
    ├── data
    │   ├── lucier.mp3
    │   └── lucier.txt
    └── gentle_curl.sh
├── ext
    ├── Makefile
    ├── install_kaldi.sh
    ├── k3.cc
    └── m3.cc
├── gentle
    ├── __init__.py
    ├── __version__.py
    ├── diff_align.py
    ├── forced_aligner.py
    ├── full_transcriber.py
    ├── kaldi_queue.py
    ├── language_model.py
    ├── metasentence.py
    ├── multipass.py
    ├── resample.py
    ├── resources.py
    ├── rpc.py
    ├── standard_kaldi.py
    ├── transcriber.py
    ├── transcription.py
    └── util
    │   ├── __init__.py
    │   ├── cyst.py
    │   └── paths.py
├── install.sh
├── install_deps.sh
├── install_models.sh
├── pylintrc
├── serve.py
├── setup.py
├── tests
    ├── __init__.py
    ├── base.py
    └── transcriber.py
└── www
    ├── index.html
    ├── preloader.gif
    └── view_alignment.html


/.gitignore:
--------------------------------------------------------------------------------
 1 | PROTO_LANGDIR/
 2 | data/
 3 | ext/standard_kaldi
 4 | ext/mkgraph
 5 | ext/*-test
 6 | ext/*.o
 7 | ext/.depend.mk
 8 | *.pyc
 9 | *.swp
10 | *~
11 | webdata/
12 | gentle.egg-info/
13 | exp/
14 | ext/k3
15 | ext/m3
16 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "kaldi"]
2 | 	path = ext/kaldi
3 | 	url = https://github.com/kaldi-asr/kaldi
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: required
 2 | language: generic
 3 | 
 4 | services:
 5 |   - docker
 6 | 
 7 | install:
 8 |   - docker build -t lowerquality/gentle .
 9 | 
10 | script:
11 |   - docker run --rm lowerquality/gentle sh -c 'cd /gentle && python3 setup.py test'
12 | 
13 | after_success:
14 |   - if [ "$TRAVIS_BRANCH" == "master" ]; then
15 |     docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD";
16 |     docker push lowerquality/gentle:latest;
17 |     fi
18 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Robert M Ochshorn
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:18.04
 2 | 
 3 | RUN DEBIAN_FRONTEND=noninteractive && \
 4 | 	apt-get update && \
 5 | 	apt-get install -y \
 6 | 		gcc g++ gfortran \
 7 | 		libc++-dev \
 8 | 		libstdc++-6-dev zlib1g-dev \
 9 | 		automake autoconf libtool \
10 | 		git subversion \
11 | 		libatlas3-base \
12 | 		nvidia-cuda-dev \
13 | 		ffmpeg \
14 | 		python3 python3-dev python3-pip \
15 | 		python python-dev python-pip \
16 | 		wget unzip && \
17 | 	apt-get clean
18 | 
19 | ADD ext /gentle/ext
20 | RUN export MAKEFLAGS=' -j8' &&  cd /gentle/ext && \
21 | 	./install_kaldi.sh && \
22 | 	make depend && make && rm -rf kaldi *.o
23 | 
24 | ADD . /gentle
25 | RUN cd /gentle && python3 setup.py develop
26 | RUN cd /gentle && ./install_models.sh
27 | 
28 | EXPOSE 8765
29 | 
30 | VOLUME /gentle/webdata
31 | 
32 | CMD cd /gentle && python3 serve.py
33 | 


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | twisted = "*"
 8 | 
 9 | [dev-packages]
10 | 
11 | [requires]
12 | python_version = "3.10"
13 | 


--------------------------------------------------------------------------------
/Pipfile.lock:
--------------------------------------------------------------------------------
  1 | {
  2 |     "_meta": {
  3 |         "hash": {
  4 |             "sha256": "57dfbe0c4be1b1cd383325a23df3b92e8fb7c4bd94e927838f1c8b381fae885c"
  5 |         },
  6 |         "pipfile-spec": 6,
  7 |         "requires": {
  8 |             "python_version": "3.10"
  9 |         },
 10 |         "sources": [
 11 |             {
 12 |                 "name": "pypi",
 13 |                 "url": "https://pypi.org/simple",
 14 |                 "verify_ssl": true
 15 |             }
 16 |         ]
 17 |     },
 18 |     "default": {
 19 |         "attrs": {
 20 |             "hashes": [
 21 |                 "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e",
 22 |                 "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"
 23 |             ],
 24 |             "markers": "python_version >= '3.8'",
 25 |             "version": "==25.1.0"
 26 |         },
 27 |         "automat": {
 28 |             "hashes": [
 29 |                 "sha256:b34227cf63f6325b8ad2399ede780675083e439b20c323d376373d8ee6306d88",
 30 |                 "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a"
 31 |             ],
 32 |             "markers": "python_version >= '3.8'",
 33 |             "version": "==24.8.1"
 34 |         },
 35 |         "constantly": {
 36 |             "hashes": [
 37 |                 "sha256:3fd9b4d1c3dc1ec9757f3c52aef7e53ad9323dbe39f51dfd4c43853b68dfa3f9",
 38 |                 "sha256:aa92b70a33e2ac0bb33cd745eb61776594dc48764b06c35e0efd050b7f1c7cbd"
 39 |             ],
 40 |             "markers": "python_version >= '3.8'",
 41 |             "version": "==23.10.4"
 42 |         },
 43 |         "hyperlink": {
 44 |             "hashes": [
 45 |                 "sha256:427af957daa58bc909471c6c40f74c5450fa123dd093fc53efd2e91d2705a56b",
 46 |                 "sha256:e6b14c37ecb73e89c77d78cdb4c2cc8f3fb59a885c5b3f819ff4ed80f25af1b4"
 47 |             ],
 48 |             "version": "==21.0.0"
 49 |         },
 50 |         "idna": {
 51 |             "hashes": [
 52 |                 "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9",
 53 |                 "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"
 54 |             ],
 55 |             "markers": "python_version >= '3.6'",
 56 |             "version": "==3.10"
 57 |         },
 58 |         "incremental": {
 59 |             "hashes": [
 60 |                 "sha256:8cb2c3431530bec48ad70513931a760f446ad6c25e8333ca5d95e24b0ed7b8fe",
 61 |                 "sha256:fb4f1d47ee60efe87d4f6f0ebb5f70b9760db2b2574c59c8e8912be4ebd464c9"
 62 |             ],
 63 |             "markers": "python_version >= '3.8'",
 64 |             "version": "==24.7.2"
 65 |         },
 66 |         "setuptools": {
 67 |             "hashes": [
 68 |                 "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6",
 69 |                 "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3"
 70 |             ],
 71 |             "markers": "python_version >= '3.9'",
 72 |             "version": "==75.8.0"
 73 |         },
 74 |         "tomli": {
 75 |             "hashes": [
 76 |                 "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6",
 77 |                 "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd",
 78 |                 "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c",
 79 |                 "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b",
 80 |                 "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8",
 81 |                 "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6",
 82 |                 "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77",
 83 |                 "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff",
 84 |                 "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea",
 85 |                 "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192",
 86 |                 "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249",
 87 |                 "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee",
 88 |                 "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4",
 89 |                 "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98",
 90 |                 "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8",
 91 |                 "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4",
 92 |                 "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281",
 93 |                 "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744",
 94 |                 "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69",
 95 |                 "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13",
 96 |                 "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140",
 97 |                 "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e",
 98 |                 "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e",
 99 |                 "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc",
100 |                 "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff",
101 |                 "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec",
102 |                 "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2",
103 |                 "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222",
104 |                 "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106",
105 |                 "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272",
106 |                 "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a",
107 |                 "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7"
108 |             ],
109 |             "markers": "python_version >= '3.8'",
110 |             "version": "==2.2.1"
111 |         },
112 |         "twisted": {
113 |             "hashes": [
114 |                 "sha256:695d0556d5ec579dcc464d2856b634880ed1319f45b10d19043f2b57eb0115b5",
115 |                 "sha256:fe403076c71f04d5d2d789a755b687c5637ec3bcd3b2b8252d76f2ba65f54261"
116 |             ],
117 |             "index": "pypi",
118 |             "markers": "python_full_version >= '3.8.0'",
119 |             "version": "==24.11.0"
120 |         },
121 |         "typing-extensions": {
122 |             "hashes": [
123 |                 "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d",
124 |                 "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"
125 |             ],
126 |             "markers": "python_version >= '3.8'",
127 |             "version": "==4.12.2"
128 |         },
129 |         "zope-interface": {
130 |             "hashes": [
131 |                 "sha256:033b3923b63474800b04cba480b70f6e6243a62208071fc148354f3f89cc01b7",
132 |                 "sha256:05b910a5afe03256b58ab2ba6288960a2892dfeef01336dc4be6f1b9ed02ab0a",
133 |                 "sha256:086ee2f51eaef1e4a52bd7d3111a0404081dadae87f84c0ad4ce2649d4f708b7",
134 |                 "sha256:0ef9e2f865721553c6f22a9ff97da0f0216c074bd02b25cf0d3af60ea4d6931d",
135 |                 "sha256:1090c60116b3da3bfdd0c03406e2f14a1ff53e5771aebe33fec1edc0a350175d",
136 |                 "sha256:144964649eba4c5e4410bb0ee290d338e78f179cdbfd15813de1a664e7649b3b",
137 |                 "sha256:15398c000c094b8855d7d74f4fdc9e73aa02d4d0d5c775acdef98cdb1119768d",
138 |                 "sha256:1909f52a00c8c3dcab6c4fad5d13de2285a4b3c7be063b239b8dc15ddfb73bd2",
139 |                 "sha256:21328fcc9d5b80768bf051faa35ab98fb979080c18e6f84ab3f27ce703bce465",
140 |                 "sha256:224b7b0314f919e751f2bca17d15aad00ddbb1eadf1cb0190fa8175edb7ede62",
141 |                 "sha256:25e6a61dcb184453bb00eafa733169ab6d903e46f5c2ace4ad275386f9ab327a",
142 |                 "sha256:27f926f0dcb058211a3bb3e0e501c69759613b17a553788b2caeb991bed3b61d",
143 |                 "sha256:29caad142a2355ce7cfea48725aa8bcf0067e2b5cc63fcf5cd9f97ad12d6afb5",
144 |                 "sha256:2ad9913fd858274db8dd867012ebe544ef18d218f6f7d1e3c3e6d98000f14b75",
145 |                 "sha256:31d06db13a30303c08d61d5fb32154be51dfcbdb8438d2374ae27b4e069aac40",
146 |                 "sha256:3e0350b51e88658d5ad126c6a57502b19d5f559f6cb0a628e3dc90442b53dd98",
147 |                 "sha256:3f6771d1647b1fc543d37640b45c06b34832a943c80d1db214a37c31161a93f1",
148 |                 "sha256:4893395d5dd2ba655c38ceb13014fd65667740f09fa5bb01caa1e6284e48c0cd",
149 |                 "sha256:52e446f9955195440e787596dccd1411f543743c359eeb26e9b2c02b077b0519",
150 |                 "sha256:550f1c6588ecc368c9ce13c44a49b8d6b6f3ca7588873c679bd8fd88a1b557b6",
151 |                 "sha256:72cd1790b48c16db85d51fbbd12d20949d7339ad84fd971427cf00d990c1f137",
152 |                 "sha256:7bd449c306ba006c65799ea7912adbbfed071089461a19091a228998b82b1fdb",
153 |                 "sha256:7dc5016e0133c1a1ec212fc87a4f7e7e562054549a99c73c8896fa3a9e80cbc7",
154 |                 "sha256:802176a9f99bd8cc276dcd3b8512808716492f6f557c11196d42e26c01a69a4c",
155 |                 "sha256:80ecf2451596f19fd607bb09953f426588fc1e79e93f5968ecf3367550396b22",
156 |                 "sha256:8b49f1a3d1ee4cdaf5b32d2e738362c7f5e40ac8b46dd7d1a65e82a4872728fe",
157 |                 "sha256:8e7da17f53e25d1a3bde5da4601e026adc9e8071f9f6f936d0fe3fe84ace6d54",
158 |                 "sha256:a102424e28c6b47c67923a1f337ede4a4c2bba3965b01cf707978a801fc7442c",
159 |                 "sha256:a19a6cc9c6ce4b1e7e3d319a473cf0ee989cbbe2b39201d7c19e214d2dfb80c7",
160 |                 "sha256:a71a5b541078d0ebe373a81a3b7e71432c61d12e660f1d67896ca62d9628045b",
161 |                 "sha256:baf95683cde5bc7d0e12d8e7588a3eb754d7c4fa714548adcd96bdf90169f021",
162 |                 "sha256:cab15ff4832580aa440dc9790b8a6128abd0b88b7ee4dd56abacbc52f212209d",
163 |                 "sha256:ce290e62229964715f1011c3dbeab7a4a1e4971fd6f31324c4519464473ef9f2",
164 |                 "sha256:d3a8ffec2a50d8ec470143ea3d15c0c52d73df882eef92de7537e8ce13475e8a",
165 |                 "sha256:e204937f67b28d2dca73ca936d3039a144a081fc47a07598d44854ea2a106239",
166 |                 "sha256:eb23f58a446a7f09db85eda09521a498e109f137b85fb278edb2e34841055398",
167 |                 "sha256:f6dd02ec01f4468da0f234da9d9c8545c5412fef80bc590cc51d8dd084138a89"
168 |             ],
169 |             "markers": "python_version >= '3.8'",
170 |             "version": "==7.2"
171 |         }
172 |     },
173 |     "develop": {}
174 | }
175 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Gentle
 2 | **Robust yet lenient forced-aligner built on Kaldi. A tool for aligning speech with text.**
 3 | 
 4 | ## Getting Started
 5 | 
 6 | There are three ways to install Gentle.
 7 | 
 8 | 1. Download the [pre-built Mac application](https://github.com/lowerquality/gentle/releases/latest). This package includes a GUI that will start the server and a browser. It only works on Mac OS.
 9 | 
10 | 2. Use the [Docker](https://www.docker.com/) image. Just run ```docker run -P lowerquality/gentle```. This works on all platforms supported by Docker.
11 | 
12 | 3. Download the source code and run ```./install.sh```. Then run ```python3 serve.py``` to start the server. This works on Mac and Linux.
13 | 
14 | ## Using Gentle
15 | 
16 | By default, the aligner listens at http://localhost:8765. That page has a graphical interface for transcribing audio, viewing results, and downloading data.
17 | 
18 | There is also a REST API so you can use Gentle in your programs. Here's an example of how to use the API with CURL:
19 | 
20 | ```bash
21 | curl -F "audio=@audio.mp3" -F "transcript=@words.txt" "http://localhost:8765/transcriptions?async=false"
22 | ```
23 | 
24 | If you've downloaded the source code you can also run the aligner as a command line program:
25 | 
26 | ```bash
27 | git clone https://github.com/lowerquality/gentle.git
28 | cd gentle
29 | ./install.sh
30 | python3 align.py audio.mp3 words.txt
31 | ```
32 | 
33 | The default behaviour outputs the JSON to stdout.  See `python3 align.py --help` for options.
34 | 


--------------------------------------------------------------------------------
/align.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import logging
 3 | import multiprocessing
 4 | import os
 5 | import sys
 6 | 
 7 | import gentle
 8 | 
 9 | parser = argparse.ArgumentParser(
10 |         description='Align a transcript to audio by generating a new language model.  Outputs JSON')
11 | parser.add_argument(
12 |         '--nthreads', default=multiprocessing.cpu_count(), type=int,
13 |         help='number of alignment threads')
14 | parser.add_argument(
15 |         '-o', '--output', metavar='output', type=str, 
16 |         help='output filename')
17 | parser.add_argument(
18 |         '--conservative', dest='conservative', action='store_true',
19 |         help='conservative alignment')
20 | parser.set_defaults(conservative=False)
21 | parser.add_argument(
22 |         '--disfluency', dest='disfluency', action='store_true',
23 |         help='include disfluencies (uh, um) in alignment')
24 | parser.set_defaults(disfluency=False)
25 | parser.add_argument(
26 |         '--log', default="INFO",
27 |         help='the log level (DEBUG, INFO, WARNING, ERROR, or CRITICAL)')
28 | parser.add_argument(
29 |         'audiofile', type=str,
30 |         help='audio file')
31 | parser.add_argument(
32 |         'txtfile', type=str,
33 |         help='transcript text file')
34 | args = parser.parse_args()
35 | 
36 | log_level = args.log.upper()
37 | logging.getLogger().setLevel(log_level)
38 | 
39 | disfluencies = set(['uh', 'um'])
40 | 
41 | def on_progress(p):
42 |     for k,v in p.items():
43 |         logging.debug("%s: %s" % (k, v))
44 | 
45 | 
46 | with open(args.txtfile, encoding="utf-8") as fh:
47 |     transcript = fh.read()
48 | 
49 | resources = gentle.Resources()
50 | logging.info("converting audio to 8K sampled wav")
51 | 
52 | with gentle.resampled(args.audiofile) as wavfile:
53 |     logging.info("starting alignment")
54 |     aligner = gentle.ForcedAligner(resources, transcript, nthreads=args.nthreads, disfluency=args.disfluency, conservative=args.conservative, disfluencies=disfluencies)
55 |     result = aligner.transcribe(wavfile, progress_cb=on_progress, logging=logging)
56 | 
57 | fh = open(args.output, 'w', encoding="utf-8") if args.output else sys.stdout
58 | fh.write(result.to_json(indent=2))
59 | if args.output:
60 |     logging.info("output written to %s" % (args.output))
61 | 


--------------------------------------------------------------------------------
/examples/data/lucier.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strob/gentle/087a3b738bad9ebafd0570561486252cb624ba3a/examples/data/lucier.mp3


--------------------------------------------------------------------------------
/examples/data/lucier.txt:
--------------------------------------------------------------------------------
1 | I am sitting in a room different from the one you are in now. I am recording the sound of my speaking voice and I am going to play it back into the room again and again until the resonant frequencies of the room reinforce themselves so that any semblance of my speech, with perhaps the exception of rhythm, is destroyed. What you will hear, then, are the natural resonant frequencies of the room articulated by speech. I regard this activity not so much as a demonstration of a physical fact, but more as a way to smooth out any irregularities my speech might have.


--------------------------------------------------------------------------------
/examples/gentle_curl.sh:
--------------------------------------------------------------------------------
1 | curl -X POST -F 'audio=@examples/data/lucier.mp3' -F 'transcript=<examples/data/lucier.txt' 'http://localhost:8765/transcriptions?async=false'
2 | 


--------------------------------------------------------------------------------
/ext/Makefile:
--------------------------------------------------------------------------------
 1 | KALDI_BASE = kaldi/src/
 2 | 
 3 | all:
 4 | EXTRA_CXXFLAGS = -Wno-sign-compare
 5 | include $(KALDI_BASE)kaldi.mk
 6 | 
 7 | BINFILES = k3 m3
 8 | 
 9 | OBJFILES = 
10 | 
11 | CXXFLAGS += -I$(KALDI_BASE) -O3 -DNDEBUG
12 | 
13 | ADDLIBS = $(KALDI_BASE)online2/kaldi-online2.a $(KALDI_BASE)ivector/kaldi-ivector.a \
14 |           $(KALDI_BASE)nnet3/kaldi-nnet3.a $(KALDI_BASE)chain/kaldi-chain.a \
15 |           $(KALDI_BASE)nnet2/kaldi-nnet2.a $(KALDI_BASE)lat/kaldi-lat.a \
16 |           $(KALDI_BASE)decoder/kaldi-decoder.a  $(KALDI_BASE)cudamatrix/kaldi-cudamatrix.a \
17 |           $(KALDI_BASE)feat/kaldi-feat.a $(KALDI_BASE)transform/kaldi-transform.a $(KALDI_BASE)gmm/kaldi-gmm.a \
18 |           $(KALDI_BASE)hmm/kaldi-hmm.a $(KALDI_BASE)tree/kaldi-tree.a \
19 |           $(KALDI_BASE)matrix/kaldi-matrix.a $(KALDI_BASE)fstext/kaldi-fstext.a \
20 |           $(KALDI_BASE)util/kaldi-util.a $(KALDI_BASE)base/kaldi-base.a
21 | 
22 | ifeq ($(CUDA), true)
23 |     LDFLAGS += $(CUDA_LDFLAGS)
24 |     LDLIBS += $(CUDA_LDLIBS)
25 |     $(info cuda libraries: $(CUDA_LDFLAGS) $(CUDA_LDLIBS))
26 | else
27 |     $(info Not building with cuda!!!)
28 | endif
29 | 
30 | include $(KALDI_BASE)makefiles/default_rules.mk
31 | 


--------------------------------------------------------------------------------
/ext/install_kaldi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Prepare Kaldi
 4 | cd kaldi/tools
 5 | make clean
 6 | make
 7 | cd ../src
 8 | # make clean (sometimes helpful after upgrading upstream?)
 9 | ./configure --static --static-math=yes --static-fst=yes --use-cuda=no
10 | make depend
11 | cd ../../
12 | 


--------------------------------------------------------------------------------
/ext/k3.cc:
--------------------------------------------------------------------------------
  1 | // refactor of online2-wav-nnet3-latgen-faster.cc
  2 | 
  3 | #include "online2/online-nnet3-decoding.h"
  4 | #include "online2/online-nnet2-feature-pipeline.h"
  5 | #include "online2/onlinebin-util.h"
  6 | #include "online2/online-timing.h"
  7 | #include "online2/online-endpoint.h"
  8 | #include "fstext/fstext-lib.h"
  9 | #include "lat/lattice-functions.h"
 10 | #include "lat/lattice-functions-transition-model.h"
 11 | #include "lat/word-align-lattice.h"
 12 | #include "nnet3/decodable-simple-looped.h"
 13 | 
 14 | #ifdef HAVE_CUDA
 15 | #include "cudamatrix/cu-device.h"
 16 | #endif
 17 | 
 18 | const int arate = 8000;
 19 | 
 20 | void ConfigFeatureInfo(kaldi::OnlineNnet2FeaturePipelineInfo& info,
 21 |                        std::string ivector_model_dir) {
 22 |     // Configure inline to avoid absolute paths in ".conf" files
 23 | 
 24 |     info.feature_type = "mfcc";
 25 |     info.use_ivectors = true;
 26 | 
 27 |     // ivector_extractor.conf
 28 |     ReadKaldiObject(ivector_model_dir + "/final.mat",
 29 |                     &info.ivector_extractor_info.lda_mat);
 30 |     ReadKaldiObject(ivector_model_dir + "/global_cmvn.stats",
 31 |                     &info.ivector_extractor_info.global_cmvn_stats);
 32 |     ReadKaldiObject(ivector_model_dir + "/final.dubm",
 33 |                     &info.ivector_extractor_info.diag_ubm);
 34 |     ReadKaldiObject(ivector_model_dir + "/final.ie",
 35 |                     &info.ivector_extractor_info.extractor);
 36 | 
 37 |     info.ivector_extractor_info.num_gselect = 5;
 38 |     info.ivector_extractor_info.min_post = 0.025;
 39 |     info.ivector_extractor_info.posterior_scale = 0.1;
 40 |     info.ivector_extractor_info.max_remembered_frames = 1000;
 41 |     info.ivector_extractor_info.max_count = 100; // changed from 0.0 (?)
 42 | 
 43 |     // XXX: Where do these come from?
 44 |     info.ivector_extractor_info.greedy_ivector_extractor = true;
 45 |     info.ivector_extractor_info.ivector_period = 10;
 46 |     info.ivector_extractor_info.num_cg_iters = 15;
 47 |     info.ivector_extractor_info.use_most_recent_ivector = true;
 48 | 
 49 |     // splice.conf
 50 |     info.ivector_extractor_info.splice_opts.left_context = 3;
 51 |     info.ivector_extractor_info.splice_opts.right_context = 3;
 52 | 
 53 |     // mfcc.conf
 54 |     info.mfcc_opts.frame_opts.samp_freq = arate;
 55 |     info.mfcc_opts.use_energy = false;
 56 |     info.mfcc_opts.num_ceps = 40;
 57 |     info.mfcc_opts.mel_opts.num_bins = 40;
 58 |     info.mfcc_opts.mel_opts.low_freq = 40;
 59 |     info.mfcc_opts.mel_opts.high_freq = -200;
 60 | 
 61 |     info.ivector_extractor_info.Check();
 62 | }
 63 | 
 64 | void ConfigDecoding(kaldi::LatticeFasterDecoderConfig& config) {
 65 |   config.lattice_beam = 6.0;
 66 |   config.beam = 15.0;
 67 |   config.max_active = 7000;
 68 | }
 69 | 
 70 | void ConfigEndpoint(kaldi::OnlineEndpointConfig& config) {
 71 |   config.silence_phones = "1:2:3:4:5:6:7:8:9:10:11:12:13:14:15:16:17:18:19:20";
 72 | }
 73 | void usage() {
 74 |   fprintf(stderr, "usage: k3 [nnet_dir hclg_path]\n");
 75 | }
 76 | 
 77 | int main(int argc, char *argv[]) {
 78 |     using namespace kaldi;
 79 |     using namespace fst;
 80 | 
 81 |     setbuf(stdout, NULL);
 82 | 
 83 |     std::string nnet_dir = "exp/tdnn_7b_chain_online";
 84 |     std::string graph_dir = nnet_dir + "/graph_pp";
 85 |     std::string fst_rxfilename = graph_dir + "/HCLG.fst";
 86 | 
 87 |     if(argc == 3) {
 88 |       nnet_dir = argv[1];
 89 |       graph_dir = nnet_dir + "/graph_pp";
 90 |       fst_rxfilename = argv[2];
 91 |     }
 92 |     else if(argc != 1) {
 93 |       usage();
 94 |       return EXIT_FAILURE;
 95 |     }
 96 | 
 97 | #ifdef HAVE_CUDA
 98 |     fprintf(stdout, "Cuda enabled\n");
 99 |     CuDevice &cu_device = CuDevice::Instantiate();
100 |     cu_device.SetVerbose(true);
101 |     cu_device.SelectGpuId("yes");
102 |     fprintf(stdout, "active gpu: %d\n", cu_device.ActiveGpuId());
103 | #endif
104 |     const std::string ivector_model_dir = nnet_dir + "/ivector_extractor";
105 |     const std::string nnet3_rxfilename = nnet_dir + "/final.mdl";
106 | 
107 |     const std::string word_syms_rxfilename = graph_dir + "/words.txt";
108 |     const string word_boundary_filename = graph_dir + "/phones/word_boundary.int";
109 |     const string phone_syms_rxfilename = graph_dir + "/phones.txt";
110 | 
111 |     WordBoundaryInfoNewOpts opts; // use default opts
112 |     WordBoundaryInfo word_boundary_info(opts, word_boundary_filename);
113 | 
114 |     OnlineNnet2FeaturePipelineInfo feature_info;
115 |     ConfigFeatureInfo(feature_info, ivector_model_dir);
116 |     LatticeFasterDecoderConfig nnet3_decoding_config;
117 |     ConfigDecoding(nnet3_decoding_config);
118 |     OnlineEndpointConfig endpoint_config;
119 |     ConfigEndpoint(endpoint_config);
120 | 
121 | 
122 |     BaseFloat frame_shift = feature_info.FrameShiftInSeconds();
123 | 
124 |     TransitionModel trans_model;
125 |     nnet3::AmNnetSimple am_nnet;
126 |     {
127 |       bool binary;
128 |       Input ki(nnet3_rxfilename, &binary);
129 |       trans_model.Read(ki.Stream(), binary);
130 |       am_nnet.Read(ki.Stream(), binary);
131 |     }
132 | 
133 |     nnet3::NnetSimpleLoopedComputationOptions nnet_simple_looped_opts;
134 |     nnet_simple_looped_opts.acoustic_scale = 1.0; // changed from 0.1?
135 | 
136 |     nnet3::DecodableNnetSimpleLoopedInfo de_nnet_simple_looped_info(nnet_simple_looped_opts, &am_nnet);
137 | 
138 |     fst::Fst<fst::StdArc> *decode_fst = ReadFstKaldi(fst_rxfilename);
139 | 
140 |     fst::SymbolTable *word_syms =
141 |       fst::SymbolTable::ReadText(word_syms_rxfilename);
142 | 
143 |     fst::SymbolTable* phone_syms =
144 |       fst::SymbolTable::ReadText(phone_syms_rxfilename);
145 | 
146 | 
147 |     OnlineIvectorExtractorAdaptationState adaptation_state(feature_info.ivector_extractor_info);
148 | 
149 |     OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
150 |     feature_pipeline.SetAdaptationState(adaptation_state);
151 | 
152 |     OnlineSilenceWeighting silence_weighting(
153 |                                              trans_model,
154 |                                              feature_info.silence_weighting_config);
155 | 
156 |     SingleUtteranceNnet3Decoder decoder(nnet3_decoding_config,
157 |                                         trans_model,
158 | 					de_nnet_simple_looped_info,
159 |                                         //am_nnet, // kaldi::nnet3::DecodableNnetSimpleLoopedInfo
160 |                                         *decode_fst,
161 |                                         &feature_pipeline);
162 | 
163 | 
164 |   char cmd[1024];
165 | 
166 |   while(true) {
167 |     // Let the client decide what we should do...
168 |     fgets(cmd, sizeof(cmd), stdin);
169 | 
170 |     if(strcmp(cmd,"stop\n") == 0) {
171 |       break;
172 |     }
173 |     else if(strcmp(cmd,"reset\n") == 0) {
174 |       feature_pipeline.~OnlineNnet2FeaturePipeline();
175 |       new (&feature_pipeline) OnlineNnet2FeaturePipeline(feature_info);
176 | 
177 |       decoder.~SingleUtteranceNnet3Decoder();
178 |       new (&decoder) SingleUtteranceNnet3Decoder(nnet3_decoding_config,
179 |                                                  trans_model,
180 | 						 de_nnet_simple_looped_info,
181 |                                                  //am_nnet,
182 |                                                  *decode_fst,
183 |                                                  &feature_pipeline);
184 |     }
185 |     else if(strcmp(cmd,"push-chunk\n") == 0) {
186 | 
187 |       // Get chunk length from python
188 |       int chunk_len;
189 |       fgets(cmd, sizeof(cmd), stdin);
190 |       sscanf(cmd, "%d\n", &chunk_len);
191 | 
192 |       int16_t audio_chunk[chunk_len];
193 |       Vector<BaseFloat> wave_part = Vector<BaseFloat>(chunk_len);
194 | 
195 |       fread(&audio_chunk, 2, chunk_len, stdin);
196 | 
197 |       // We need to copy this into the `wave_part' Vector<BaseFloat> thing.
198 |       // From `gst-audio-source.cc' in gst-kaldi-nnet2
199 |       for (int i = 0; i < chunk_len ; ++i) {
200 |         (wave_part)(i) = static_cast<BaseFloat>(audio_chunk[i]);
201 |       }
202 | 
203 |       feature_pipeline.AcceptWaveform(arate, wave_part);
204 | 
205 |       std::vector<std::pair<int32, BaseFloat> > delta_weights;
206 |       if (silence_weighting.Active()) {
207 |         silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
208 |         silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(),
209 |                                           &delta_weights);
210 |         feature_pipeline.IvectorFeature()->UpdateFrameWeights(delta_weights);
211 |       }
212 | 
213 |       decoder.AdvanceDecoding();
214 | 
215 |       fprintf(stdout, "ok\n");
216 |     }
217 |     else if(strcmp(cmd, "get-final\n") == 0) {
218 |       feature_pipeline.InputFinished(); // Computes last few frames of input
219 |       decoder.AdvanceDecoding();        // Decodes remaining frames
220 |       decoder.FinalizeDecoding();
221 | 
222 |       Lattice final_lat;
223 |       decoder.GetBestPath(true, &final_lat);
224 |       CompactLattice clat;
225 |       ConvertLattice(final_lat, &clat);
226 | 
227 |       // Compute prons alignment (see: kaldi/latbin/nbest-to-prons.cc)
228 |       CompactLattice aligned_clat;
229 | 
230 |       std::vector<int32> words, times, lengths;
231 |       std::vector<std::vector<int32> > prons;
232 |       std::vector<std::vector<int32> > phone_lengths;
233 | 
234 |       WordAlignLattice(clat, trans_model, word_boundary_info,
235 |                        0, &aligned_clat);
236 | 
237 |       CompactLatticeToWordProns(trans_model, aligned_clat, &words, &times,
238 |                                 &lengths, &prons, &phone_lengths);
239 | 
240 |       for (int i = 0; i < words.size(); i++) {
241 |         if(words[i] == 0) {
242 |           // <eps> links - silence
243 |           continue;
244 |         }
245 |         fprintf(stdout, "word: %s / start: %f / duration: %f\n",
246 |                 word_syms->Find(words[i]).c_str(),
247 |                 times[i] * frame_shift,
248 |                 lengths[i] * frame_shift);
249 |         // Print out the phonemes for this word
250 |         for(size_t j=0; j<phone_lengths[i].size(); j++) {
251 |           fprintf(stdout, "phone: %s / duration: %f\n",
252 |                   phone_syms->Find(prons[i][j]).c_str(),
253 |                   phone_lengths[i][j] * frame_shift);
254 |         }
255 |       }
256 | 
257 |       fprintf(stdout, "done with words\n");
258 | 
259 |     }
260 |     else {
261 | 
262 |       fprintf(stderr, "unknown command %s\n", cmd);
263 | 
264 |     }
265 |   }
266 | }
267 | 


--------------------------------------------------------------------------------
/ext/m3.cc:
--------------------------------------------------------------------------------
  1 | #include "fstext/context-fst.h"
  2 | #include "fstext/fstext-utils.h"
  3 | #include "fstext/kaldi-fst-io.h"
  4 | #include "fstext/table-matcher.h"
  5 | #include "hmm/hmm-utils.h"
  6 | #include "hmm/transition-model.h"
  7 | #include "tree/context-dep.h"
  8 | #include "lat/lattice-functions-transition-model.h"
  9 | #include "util/common-utils.h"
 10 | #include <fst/script/arcsort.h>
 11 | #include <fst/script/compile.h>
 12 | 
 13 | int main(int argc, char *argv[]) {
 14 | 	using namespace kaldi;
 15 | 	using namespace fst;
 16 | 	using fst::script::ArcSort;
 17 | 	try {
 18 | 		const char *usage = "Usage: ./mkgraph [options] <proto-dir> <grammar-fst> <out-fst>\n";
 19 | 
 20 | 		ParseOptions po(usage);
 21 | 		po.Read(argc, argv);
 22 | 		if (po.NumArgs() != 3) {
 23 | 			po.PrintUsage();
 24 | 			return 1;
 25 | 		}
 26 | 
 27 | 		int32 N = 3, P = 1;
 28 | 		float transition_scale = 1.0;
 29 | 		float self_loop_scale = 0.1;
 30 | 
 31 | 		std::string proto_dir = po.GetArg(1),
 32 | 					grammar_fst_filename = po.GetArg(2),
 33 | 					out_filename = po.GetArg(3);
 34 | 
 35 | 		std::string lang_fst_filename = proto_dir + "/langdir/L.fst",
 36 | 			lang_disambig_fst_filename = proto_dir + "/langdir/L_disambig.fst",
 37 | 			disambig_phones_filename = proto_dir + "/langdir/phones/disambig.int",
 38 | 			model_filename = proto_dir + "/tdnn_7b_chain_online/final.mdl",
 39 | 			tree_filename = proto_dir + "/tdnn_7b_chain_online/tree",
 40 | 			words_filename = proto_dir + "/tdnn_7b_chain_online/graph_pp/words.txt";
 41 | 
 42 | 		if (!std::ifstream(lang_fst_filename.c_str())) {
 43 | 			std::cerr << "expected " << lang_fst_filename << " to exist" << std::endl;
 44 | 			return 1;
 45 | 		}
 46 | 		if (!std::ifstream(lang_disambig_fst_filename.c_str())) {
 47 | 			std::cerr << "expected " << lang_disambig_fst_filename << " to exist" << std::endl;
 48 | 			return 1;
 49 | 		}
 50 | 		if (!std::ifstream(grammar_fst_filename.c_str())) {
 51 | 			std::cerr << "expected " << grammar_fst_filename << " to exist" << std::endl;
 52 | 			return 1;
 53 | 		}
 54 | 		if (!std::ifstream(disambig_phones_filename.c_str())) {
 55 | 			std::cerr << "expected " << disambig_phones_filename << " to exist" << std::endl;
 56 | 			return 1;
 57 | 		}
 58 | 		if (!std::ifstream(model_filename.c_str())) {
 59 | 			std::cerr << "expected " << model_filename << " to exist" << std::endl;
 60 | 			return 1;
 61 | 		}
 62 | 		if (!std::ifstream(tree_filename.c_str())) {
 63 | 			std::cerr << "expected " << tree_filename << " to exist" << std::endl;
 64 | 			return 1;
 65 | 		}
 66 | 
 67 | 		// fstcompile
 68 | 		const SymbolTable *ssyms = 0;
 69 | 		fst::SymbolTableTextOptions opts;
 70 | 		const SymbolTable *isyms = SymbolTable::ReadText(words_filename, opts);
 71 | 		if (!isyms) { return 1; }
 72 | 		const SymbolTable *osyms = SymbolTable::ReadText(words_filename, opts);
 73 | 		if (!osyms) { return 1; }
 74 | 		std::ifstream grammar_fst_file(grammar_fst_filename.c_str());
 75 | 		FstCompiler<StdArc> fstcompiler(grammar_fst_file, "", isyms,
 76 | 			osyms, ssyms,
 77 | 			false, false,
 78 | 			false, false,
 79 | 			false);
 80 | 		VectorFst<StdArc> grammar_fst = fstcompiler.Fst();
 81 | 
 82 | 		// fsttablecompose
 83 | 		VectorFst<StdArc> *lang_disambig_fst = ReadFstKaldi(lang_disambig_fst_filename);
 84 | 		if (lang_disambig_fst->Properties(fst::kOLabelSorted, true) == 0) {
 85 | 			KALDI_WARN << "L_disambig.fst is not olabel sorted.";
 86 | 		}
 87 | 		TableComposeOptions table_opts;
 88 | 		VectorFst<StdArc> lg_fst;
 89 | 		TableCompose(*lang_disambig_fst, grammar_fst, &lg_fst, table_opts);
 90 | 		delete lang_disambig_fst;
 91 | 
 92 | 		// fstdeterminizestar --use-log
 93 | 		ArcSort(&lg_fst, ILabelCompare<StdArc>());
 94 | 		int max_states = -1;
 95 | 		bool debug_location = false;
 96 | 		DeterminizeStarInLog(&lg_fst, kDelta, &debug_location, max_states);
 97 | 
 98 | 		// fstminimizeencoded
 99 | 		MinimizeEncoded(&lg_fst, kDelta);
100 | 
101 | 		// fstarcsort --sort_type=ilabel
102 | 		ArcSort(&lg_fst, ILabelCompare<StdArc>());
103 | 
104 | 		// fstisstochastic
105 | 		StdArc::Weight min, max;
106 | 		if (!IsStochasticFst(lg_fst, 0.01, &min, &max)) {
107 | 			std::cerr << "[info]: LG not stochastic." << std::endl;
108 | 		}
109 | 
110 | 		// fstcomposecontext
111 | 		std::vector<int32> disambig_symbols;
112 | 		ReadIntegerVectorSimple(disambig_phones_filename, &disambig_symbols);
113 | 		if (disambig_symbols.empty()) {
114 | 			KALDI_WARN << "Disambiguation symbols list is empty; this likely "
115 | 				<< "indicates an error in data preparation.";
116 | 		}
117 | 		std::vector<std::vector<int32> > ilabels;
118 | 		VectorFst<StdArc> clg_fst;
119 | 		ComposeContext(disambig_symbols, N, P, &lg_fst, &clg_fst, &ilabels);
120 | 
121 | 		// fstarcsort --sort_type=ilabel
122 | 		ArcSort(&clg_fst, ILabelCompare<StdArc>());
123 | 
124 | 		// fstisstochastic
125 | 		if (!IsStochasticFst(clg_fst, 0.01, &min, &max)) {
126 | 			std::cerr << "[info]: CLG not stochastic." << std::endl;
127 | 		}
128 | 
129 | 		// make-h-transducer
130 | 		HTransducerConfig hcfg;
131 | 		hcfg.transition_scale = transition_scale;
132 | 		ContextDependency ctx_dep;
133 | 		ReadKaldiObject(tree_filename, &ctx_dep);
134 | 		TransitionModel trans_model;
135 | 		ReadKaldiObject(model_filename, &trans_model);
136 | 		std::vector<int32> disambig_tid;
137 | 		fst::VectorFst<fst::StdArc> *ha_fst = GetHTransducer(
138 | 			ilabels,
139 | 			ctx_dep,
140 | 			trans_model,
141 | 			hcfg,
142 | 			&disambig_tid);
143 | 
144 | 		// fsttablecompose
145 | 		VectorFst<StdArc> hclga_fst;
146 | 		TableComposeOptions hclga_table_opts;
147 | 		TableCompose(*ha_fst, clg_fst, &hclga_fst, hclga_table_opts);
148 | 
149 | 		// fstdeterminizestar --use-log=true
150 | 		ArcSort(&hclga_fst, ILabelCompare<StdArc>());
151 | 		DeterminizeStarInLog(&hclga_fst, kDelta, &debug_location, max_states);
152 | 
153 | 		// fstrmsymbols
154 | 		RemoveSomeInputSymbols(disambig_tid, &hclga_fst);
155 | 
156 | 		// fstrmepslocal
157 | 		RemoveEpsLocal(&hclga_fst);
158 | 
159 | 		// fstminimizeencoded
160 | 		MinimizeEncoded(&hclga_fst, kDelta);
161 | 
162 | 		// fstisstochastic
163 | 		if (!IsStochasticFst(hclga_fst, 0.01, &min, &max)) {
164 | 			std::cerr << "[info]: HCLGa is not stochastic." << std::endl;
165 | 		}
166 | 
167 | 		VectorFst<StdArc> hclg_fst = hclga_fst;
168 | 
169 | 		// add-self-loops
170 | 		std::vector<int32> null_disambig_syms;
171 | 		AddSelfLoops(trans_model,
172 | 	                 null_disambig_syms,
173 | 	                 self_loop_scale,
174 | 	                 true,
175 | 			 true,
176 | 	                 &hclg_fst);
177 | 
178 | 	    // fstisstochastic
179 | 		if (transition_scale == 1.0 &&
180 | 			self_loop_scale == 1.0 &&
181 | 			!IsStochasticFst(hclg_fst, 0.01, &min, &max)) {
182 | 			std::cerr << "[info]: final HCLG is not stochastic." << std::endl;
183 | 		}
184 | 
185 | 	    if (!hclg_fst.Write(out_filename)) {
186 | 			KALDI_ERR << "error writing FST to " << out_filename;
187 | 	    }
188 | 	} catch(const std::exception &e) {
189 | 		std::cerr << e.what();
190 | 		return -1;
191 | 	}
192 | }
193 | 


--------------------------------------------------------------------------------
/gentle/__init__.py:
--------------------------------------------------------------------------------
1 | from .__version__ import __version__
2 | from .resources import Resources
3 | from .forced_aligner import ForcedAligner
4 | from .full_transcriber import FullTranscriber
5 | from .resample import resample, resampled
6 | from .transcription import Transcription
7 | 


--------------------------------------------------------------------------------
/gentle/__version__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.11.0'
2 | 


--------------------------------------------------------------------------------
/gentle/diff_align.py:
--------------------------------------------------------------------------------
  1 | import difflib
  2 | import json
  3 | import os
  4 | import sys
  5 | 
  6 | from gentle import metasentence
  7 | from gentle import language_model
  8 | from gentle import standard_kaldi
  9 | from gentle import transcription
 10 | from gentle.resources import Resources
 11 | 
 12 | 
 13 | # TODO(maxhawkins): try using the (apparently-superior) time-mediated dynamic
 14 | # programming algorithm used in sclite's alignment process:
 15 | # http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/sclite.htm#time-mediated
 16 | def align(alignment, ms, **kwargs):
 17 |     '''Use the diff algorithm to align the raw tokens recognized by Kaldi
 18 |     to the words in the transcript (tokenized by MetaSentence).
 19 | 
 20 |     The output combines information about the timing and alignment of
 21 |     correctly-aligned words as well as words that Kaldi failed to recognize
 22 |     and extra words not found in the original transcript.
 23 |     '''
 24 |     disfluency = kwargs['disfluency'] if 'disfluency' in kwargs else False
 25 |     disfluencies = kwargs['disfluencies'] if 'disfluencies' in kwargs else []
 26 | 
 27 |     hypothesis = [X.word for X in alignment]
 28 |     reference = ms.get_kaldi_sequence()
 29 | 
 30 |     display_seq = ms.get_display_sequence()
 31 |     txt_offsets = ms.get_text_offsets()
 32 | 
 33 |     out = []
 34 |     for op, a, b in word_diff(hypothesis, reference):
 35 | 
 36 |         if op == 'delete':
 37 |             word = hypothesis[a]
 38 |             if disfluency and word in disfluencies:
 39 |                 hyp_token = alignment[a]
 40 |                 phones = hyp_token.phones or []
 41 | 
 42 |                 out.append(transcription.Word(
 43 |                     case=transcription.Word.NOT_FOUND_IN_TRANSCRIPT,
 44 |                     phones=phones,
 45 |                     start=hyp_token.start,
 46 |                     duration=hyp_token.duration,
 47 |                     word=word))
 48 |             continue
 49 | 
 50 |         display_word = display_seq[b]
 51 |         start_offset, end_offset = txt_offsets[b]
 52 | 
 53 |         if op == 'equal':
 54 |             hyp_word = hypothesis[a]
 55 |             hyp_token = alignment[a]
 56 |             phones = hyp_token.phones or []
 57 | 
 58 |             out.append(transcription.Word(
 59 |                 case=transcription.Word.SUCCESS,
 60 |                 startOffset=start_offset,
 61 |                 endOffset=end_offset,
 62 |                 word=display_word,
 63 |                 alignedWord=hyp_word,
 64 |                 phones=phones,
 65 |                 start=hyp_token.start,
 66 |                 duration=hyp_token.duration))
 67 | 
 68 |         elif op in ['insert', 'replace']:
 69 |             out.append(transcription.Word(
 70 |                 case=transcription.Word.NOT_FOUND_IN_AUDIO,
 71 |                 startOffset=start_offset,
 72 |                 endOffset=end_offset,
 73 |                 word=display_word))
 74 |     return out
 75 | 
 76 | def word_diff(a, b):
 77 |     '''Like difflib.SequenceMatcher but it only compares one word
 78 |     at a time. Returns an iterator whose elements are like
 79 |     (operation, index in a, index in b)'''
 80 |     matcher = difflib.SequenceMatcher(a=a, b=b)
 81 |     for op, a_idx, _, b_idx, _ in by_word(matcher.get_opcodes()):
 82 |         yield (op, a_idx, b_idx)
 83 | 
 84 | def by_word(opcodes):
 85 |     '''Take difflib.SequenceMatcher.get_opcodes() output and
 86 |     return an equivalent opcode sequence that only modifies
 87 |     one word at a time'''
 88 |     for op, s1, e1, s2, e2 in opcodes:
 89 |         if op == 'delete':
 90 |             for i in range(s1, e1):
 91 |                 yield (op, i, i+1, s2, s2)
 92 |         elif op == 'insert':
 93 |             for i in range(s2, e2):
 94 |                 yield (op, s1, s1, i, i+1)
 95 |         else:
 96 |             len1 = e1-s1
 97 |             len2 = e2-s2
 98 |             for i1, i2 in zip(range(s1, e1), range(s2, e2)):
 99 |                 yield (op, i1, i1 + 1, i2, i2 + 1)
100 |             if len1 > len2:
101 |                 for i in range(s1 + len2, e1):
102 |                     yield ('delete', i, i+1, e2, e2)
103 |             if len2 > len1:
104 |                 for i in range(s2 + len1, e2):
105 |                     yield ('insert', s1, s1, i, i+1)
106 | 
107 | if __name__=='__main__':
108 |     TEXT_FILE = sys.argv[1]
109 |     JSON_FILE = sys.argv[2]
110 |     OUTPUT_FILE = sys.argv[3]
111 | 
112 |     ms = metasentence.MetaSentence(open(TEXT_FILE).read(), Resources().vocab)
113 |     alignment = json.load(open(JSON_FILE))['words']
114 | 
115 |     out = align(alignment, ms)
116 | 
117 |     json.dump(out, open(OUTPUT_FILE, 'w'), indent=2)
118 | 


--------------------------------------------------------------------------------
/gentle/forced_aligner.py:
--------------------------------------------------------------------------------
  1 | from gentle import diff_align
  2 | from gentle import kaldi_queue
  3 | from gentle import language_model
  4 | from gentle import metasentence
  5 | from gentle import multipass
  6 | from gentle.transcriber import MultiThreadedTranscriber
  7 | from gentle.transcription import Transcription
  8 | 
  9 | class ForcedAligner():
 10 | 
 11 |     def __init__(self, resources, transcript, nthreads=4, **kwargs):
 12 |         self.kwargs = kwargs
 13 |         self.nthreads = nthreads
 14 |         self.transcript = transcript
 15 |         self.resources = resources
 16 |         self.ms = metasentence.MetaSentence(transcript, resources.vocab)
 17 |         ks = self.ms.get_kaldi_sequence()
 18 |         gen_hclg_filename = language_model.make_bigram_language_model(ks, resources.proto_langdir, **kwargs)
 19 |         self.queue = kaldi_queue.build(resources, hclg_path=gen_hclg_filename, nthreads=nthreads)
 20 |         self.mtt = MultiThreadedTranscriber(self.queue, nthreads=nthreads)
 21 | 
 22 |     def transcribe(self, wavfile, progress_cb=None, logging=None):
 23 |         words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb)
 24 | 
 25 |         # Clear queue (would this be gc'ed?)
 26 |         for i in range(self.nthreads):
 27 |             k = self.queue.get()
 28 |             k.stop()
 29 | 
 30 |         # Align words
 31 |         words = diff_align.align(words, self.ms, **self.kwargs)
 32 | 
 33 |         # Perform a second-pass with unaligned words
 34 |         if logging is not None:
 35 |             logging.info("%d unaligned words (of %d)" % (len([X for X in words if X.not_found_in_audio()]), len(words)))
 36 | 
 37 |         if progress_cb is not None:
 38 |             progress_cb({'status': 'ALIGNING'})
 39 | 
 40 |         words = multipass.realign(wavfile, words, self.ms, resources=self.resources, nthreads=self.nthreads, progress_cb=progress_cb)
 41 | 
 42 |         if logging is not None:
 43 |             logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in words if X.not_found_in_audio()]), len(words)))
 44 | 
 45 |         words = AdjacencyOptimizer(words, duration).optimize()
 46 | 
 47 |         return Transcription(words=words, transcript=self.transcript)
 48 | 
 49 | 
 50 | class AdjacencyOptimizer():
 51 | 
 52 |     '''
 53 |     Sometimes there are ambiguous possible placements of not-found-in-audio
 54 |     words.  The word-based diff doesn't take into account intra-word timings
 55 |     when it does insertion, so can create strange results.  E.g. if the audio
 56 |     contains these words with timings like
 57 | 
 58 |         "She climbed on the bed and jumped on the mattress"
 59 |             0     1    2   3   4    5   6    7   8     9
 60 | 
 61 |     and suppose the speaker mumbled or there was noise obscuring the words
 62 |     "on the bed and jumped", so the hypothesis is just "She climbed on the mattress".
 63 | 
 64 |     The intended alignment would be to insert the missing out-of-audio words:
 65 | 
 66 |         "She climbed [on the bed and jumped] on the mattress"
 67 |             0     1                            7   8     9
 68 | 
 69 |     But the word-based diff might instead align "on the" with the first
 70 |     occurrence, and so insert out-of-audio words like this:
 71 | 
 72 |         "She climbed on the [bed and jumped on the] mattress"
 73 |             0     1    7   8                             9
 74 | 
 75 |     with a big gap in between "climbed" and "on" and no time available for
 76 |     "[bend and jumped on the]".
 77 | 
 78 |     Or imagine a case such as "I really really really really want to do
 79 |     this", where only one of the "really"s is in the hypothesis, so again
 80 |     the choice word-based choice of which to align it with is arbitrary.
 81 | 
 82 |     This method cleans those up, by checking each not-found-in-audio sequence
 83 |     of words to see if its neighbor(s) are candidates for moving inward and
 84 |     whether doing so would improve adjacent intra-word distances.
 85 |     '''
 86 | 
 87 |     def __init__(self, words, duration):
 88 |         self.words = words
 89 |         self.duration = duration
 90 | 
 91 |     def out_of_audio_sequence(self, i):
 92 |         j = i
 93 |         while 0 <= j < len(self.words) and self.words[j].not_found_in_audio():
 94 |             j += 1
 95 |         return None if j == i else j
 96 | 
 97 |     def tend(self, i):
 98 |         for word in reversed(self.words[:i]):
 99 |             if word.success():
100 |                 return word.end
101 |         return 0
102 | 
103 |     def tstart(self, i):
104 |         for word in self.words[i:]:
105 |             if word.success():
106 |                 return word.start
107 |         return self.duration
108 | 
109 |     def find_subseq(self, i, j, p, n):
110 |         for k in range(i, j-n+1):
111 |             for m in range(p, p+n):
112 |                 if self.words[k].word != self.words[m].word:
113 |                     break
114 |             else:
115 |                 return k
116 |         return None
117 | 
118 |     def swap_adjacent_if_better(self, i, j, n, side):
119 |         '''Given an out-of-audio sequence at [i,j), looks to see if the adjacent n words
120 |         can be beneficially swapped with a subsequence.'''
121 | 
122 |         # construct adjacent candidate words and their gap relative to their
123 |         # opposite neighbors
124 |         if side == "left":
125 |             p, q = (i-n, i)
126 |             if p < 0: return False
127 |             opp_gap = self.tstart(p) - self.tend(p)
128 |         else:
129 |             p, q = (j, j+n)
130 |             if q > len(self.words): return False
131 |             opp_gap = self.tstart(q) - self.tend(q)
132 | 
133 |         # is there a matching subsequence?
134 |         k = self.find_subseq(i, j, p, n)
135 |         if k is None: return False
136 | 
137 |         # if the opposite gap isn't bigger than the sequence gap, no benefit to
138 |         # potential swap
139 |         seq_gap = self.tstart(j) - self.tend(i)
140 |         if opp_gap <= seq_gap: return False
141 | 
142 |         # swap subsequences at p and k
143 |         for m in range(0, n):
144 |             self.words[k+m].swap_alignment(self.words[p+m])
145 | 
146 |         return True
147 | 
148 |     def optimize_adjacent(self, i, j):
149 |         '''Given an out-of-audio sequence at [i,j), looks for an opportunity to
150 |         swap a sub-sequence with adjacent words at [p, i) or [j, p)'''
151 | 
152 |         for n in reversed(range(1, (j-i)+1)): # consider larger moves first
153 |             if self.swap_adjacent_if_better(i, j, n, "left"): return True
154 |             if self.swap_adjacent_if_better(i, j, n, "right"): return True
155 | 
156 |     def optimize(self):
157 |         i = 0
158 |         while i < len(self.words):
159 |             j = self.out_of_audio_sequence(i)
160 |             if j is None:
161 |                 i += 1
162 | 
163 |             elif self.optimize_adjacent(i, j):
164 |                 # back up to rescan in case we swapped left
165 |                 while i >= 0 and self.words[i].not_found_in_audio():
166 |                     i -= 1
167 | 
168 |             else:
169 |                 i = j # skip past this sequence
170 | 
171 |         return self.words
172 | 


--------------------------------------------------------------------------------
/gentle/full_transcriber.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from gentle import kaldi_queue
 4 | from gentle import transcription
 5 | from gentle.transcriber import MultiThreadedTranscriber
 6 | from gentle.transcription import Transcription
 7 | 
 8 | class FullTranscriber():
 9 | 
10 |     def __init__(self, resources, nthreads=2):
11 |         self.available = False
12 |         if nthreads <= 0: return
13 |         if not os.path.exists(resources.full_hclg_path): return
14 | 
15 |         queue = kaldi_queue.build(resources, nthreads=nthreads)
16 |         self.mtt = MultiThreadedTranscriber(queue, nthreads=nthreads)
17 |         self.available = True
18 | 
19 |     def transcribe(self, wavfile, progress_cb=None, logging=None):
20 |         words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb)
21 |         return self.make_transcription_alignment(words)
22 | 
23 |     @staticmethod
24 |     def make_transcription_alignment(trans):
25 |         # Spoof the `diff_align` output format
26 |         transcript = ""
27 |         words = []
28 |         for t_wd in trans:
29 |             word = transcription.Word(
30 |                 case=transcription.Word.SUCCESS,
31 |                 startOffset=len(transcript),
32 |                 endOffset=len(transcript) + len(t_wd.word),
33 |                 word=t_wd.word,
34 |                 alignedWord=t_wd.word,
35 |                 phones=t_wd.phones,
36 |                 start=t_wd.start,
37 |                 end=t_wd.end)
38 |             words.append(word)
39 | 
40 |             transcript += word.word + " "
41 | 
42 |         return Transcription(words=words, transcript=transcript)
43 | 


--------------------------------------------------------------------------------
/gentle/kaldi_queue.py:
--------------------------------------------------------------------------------
 1 | from queue import Queue
 2 | from gentle import standard_kaldi
 3 | 
 4 | def build(resources, nthreads=4, hclg_path=None):
 5 | 
 6 |     if hclg_path is None: hclg_path = resources.full_hclg_path
 7 | 
 8 |     kaldi_queue = Queue()
 9 |     for i in range(nthreads):
10 |         kaldi_queue.put(standard_kaldi.Kaldi(
11 |             resources.nnet_gpu_path,
12 |             hclg_path,
13 |             resources.proto_langdir)
14 |         )
15 |     return kaldi_queue
16 | 


--------------------------------------------------------------------------------
/gentle/language_model.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import math
  3 | import os
  4 | import shutil
  5 | import subprocess
  6 | import sys
  7 | import tempfile
  8 | 
  9 | from .util.paths import get_binary
 10 | from .metasentence import MetaSentence
 11 | from .resources import Resources
 12 | 
 13 | MKGRAPH_PATH = get_binary("ext/m3")
 14 | 
 15 | # [oov] no longer in words.txt
 16 | OOV_TERM = '<unk>'
 17 | 
 18 | def make_bigram_lm_fst(word_sequences, **kwargs):
 19 |     '''
 20 |     Use the given token sequence to make a bigram language model
 21 |     in OpenFST plain text format.
 22 | 
 23 |     When the "conservative" flag is set, an [oov] is interleaved
 24 |     between successive words.
 25 | 
 26 |     When the "disfluency" flag is set, a small set of disfluencies is
 27 |     interleaved between successive words
 28 | 
 29 |     `Word sequence` is a list of lists, each valid as a start
 30 |     '''
 31 | 
 32 |     if len(word_sequences) == 0 or type(word_sequences[0]) != list:
 33 |         word_sequences = [word_sequences]
 34 | 
 35 |     conservative = kwargs['conservative'] if 'conservative' in kwargs else False
 36 |     disfluency = kwargs['disfluency'] if 'disfluency' in kwargs else False
 37 |     disfluencies = kwargs['disfluencies'] if 'disfluencies' in kwargs else []
 38 | 
 39 |     bigrams = {OOV_TERM: set([OOV_TERM])}
 40 | 
 41 |     for word_sequence in word_sequences:
 42 |         if len(word_sequence) == 0:
 43 |             continue
 44 | 
 45 |         prev_word = word_sequence[0]
 46 |         bigrams[OOV_TERM].add(prev_word) # valid start (?)
 47 | 
 48 |         if disfluency:
 49 |             bigrams[OOV_TERM].update(disfluencies)
 50 | 
 51 |             for dis in disfluencies:
 52 |                 bigrams.setdefault(dis, set()).add(prev_word)
 53 |                 bigrams[dis].add(OOV_TERM)
 54 | 
 55 |         for word in word_sequence[1:]:
 56 |             bigrams.setdefault(prev_word, set()).add(word)
 57 | 
 58 |             if conservative:
 59 |                 bigrams[prev_word].add(OOV_TERM)
 60 | 
 61 |             if disfluency:
 62 |                 bigrams[prev_word].update(disfluencies)
 63 | 
 64 |                 for dis in disfluencies:
 65 |                     bigrams[dis].add(word)
 66 | 
 67 |             prev_word = word
 68 | 
 69 |         # ...valid end
 70 |         bigrams.setdefault(prev_word, set()).add(OOV_TERM)
 71 | 
 72 |     node_ids = {}
 73 |     def get_node_id(word):
 74 |         node_id = node_ids.get(word, len(node_ids) + 1)
 75 |         node_ids[word] = node_id
 76 |         return node_id
 77 | 
 78 |     output = ""
 79 |     for from_word in sorted(bigrams.keys()):
 80 |         from_id = get_node_id(from_word)
 81 | 
 82 |         successors = bigrams[from_word]
 83 |         if len(successors) > 0:
 84 |             weight = -math.log(1.0 / len(successors))
 85 |         else:
 86 |             weight = 0
 87 | 
 88 |         for to_word in sorted(successors):
 89 |             to_id = get_node_id(to_word)
 90 |             output += '%d    %d    %s    %s    %f' % (from_id, to_id, to_word, to_word, weight)
 91 |             output += "\n"
 92 | 
 93 |     output += "%d    0\n" % (len(node_ids))
 94 | 
 95 |     return output.encode()
 96 | 
 97 | def make_bigram_language_model(kaldi_seq, proto_langdir, **kwargs):
 98 |     """Generates a language model to fit the text.
 99 | 
100 |     Returns the filename of the generated language model FST.
101 |     The caller is resposible for removing the generated file.
102 | 
103 |     `proto_langdir` is a path to a directory containing prototype model data
104 |     `kaldi_seq` is a list of words within kaldi's vocabulary.
105 |     """
106 | 
107 |     # Generate a textual FST
108 |     txt_fst = make_bigram_lm_fst(kaldi_seq, **kwargs)
109 |     txt_fst_file = tempfile.NamedTemporaryFile(delete=False)
110 |     txt_fst_file.write(txt_fst)
111 |     txt_fst_file.close()
112 | 
113 |     hclg_filename = tempfile.mktemp(suffix='_HCLG.fst')
114 |     try:
115 |         devnull = open(os.devnull, 'wb')
116 |         subprocess.check_output([MKGRAPH_PATH,
117 |                         proto_langdir,
118 |                         txt_fst_file.name,
119 |                         hclg_filename],
120 |                         stderr=devnull)
121 |     except Exception as e:
122 |         try:
123 |             os.unlink(hclg_filename)
124 |         except:
125 |             pass
126 |         raise e
127 |     finally:
128 |         os.unlink(txt_fst_file.name)
129 | 
130 |     return hclg_filename
131 | 
132 | if __name__=='__main__':
133 |     import sys
134 |     make_bigram_language_model(open(sys.argv[1]).read(), Resources().proto_langdir)
135 | 


--------------------------------------------------------------------------------
/gentle/metasentence.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import re
 3 | 
 4 | # [oov] no longer in words.txt
 5 | OOV_TERM = '<unk>'
 6 | 
 7 | def load_vocabulary(words_file):
 8 |     '''Load vocabulary words from an OpenFST SymbolTable formatted text file'''
 9 |     return set(x.split(' ')[0] for x in words_file if x != '')
10 | 
11 | def kaldi_normalize(word, vocab):
12 |     """
13 |     Take a token extracted from a transcript by MetaSentence and
14 |     transform it to use the same format as Kaldi's vocabulary files.
15 |     Removes fancy punctuation and strips out-of-vocabulary words.
16 |     """
17 |     # lowercase
18 |     norm = word.lower()
19 |     # Turn fancy apostrophes into simpler apostrophes
20 |     norm = norm.replace("’", "'")
21 |     if len(norm) > 0 and not norm in vocab:
22 |         norm = OOV_TERM
23 |     return norm
24 | 
25 | class MetaSentence:
26 |     """Maintain two parallel representations of a sentence: one for
27 |     Kaldi's benefit, and the other in human-legible form.
28 |     """
29 | 
30 |     def __init__(self, sentence, vocab):
31 |         self.raw_sentence = sentence
32 | 
33 |         if type(sentence) == bytes:
34 |             self.raw_sentence = sentence.decode('utf-8')
35 |         self.vocab = vocab
36 | 
37 |         self._tokenize()
38 | 
39 |     def _tokenize(self):
40 |         self._seq = []
41 |         for m in re.finditer(r'(\w|\’\w|\'\w)+', self.raw_sentence, re.UNICODE):
42 |             start, end = m.span()
43 |             word = m.group()
44 |             token = kaldi_normalize(word, self.vocab)
45 |             self._seq.append({
46 |                 "start": start, # as unicode codepoint offset
47 |                 "end": end, # as unicode codepoint offset
48 |                 "token": token,
49 |             })
50 | 
51 |     def get_kaldi_sequence(self):
52 |         return [x["token"] for x in self._seq]
53 | 
54 |     def get_display_sequence(self):
55 |         display_sequence = []
56 |         for x in self._seq:
57 |             start, end = x["start"], x["end"]
58 |             word = self.raw_sentence[start:end]
59 |             display_sequence.append(word)
60 |         return display_sequence
61 | 
62 |     def get_text_offsets(self):
63 |         return [(x["start"], x["end"]) for x in self._seq]
64 | 


--------------------------------------------------------------------------------
/gentle/multipass.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from multiprocessing.pool import ThreadPool as Pool
  3 | import os
  4 | import wave
  5 | 
  6 | from gentle import standard_kaldi
  7 | from gentle import metasentence
  8 | from gentle import language_model
  9 | from gentle import diff_align
 10 | from gentle import transcription
 11 | 
 12 | def prepare_multipass(alignment):
 13 |     to_realign = []
 14 |     last_aligned_word = None
 15 |     cur_unaligned_words = []
 16 | 
 17 |     for wd_idx,wd in enumerate(alignment):
 18 |         if wd.not_found_in_audio():
 19 |             cur_unaligned_words.append(wd)
 20 |         elif wd.success():
 21 |             if len(cur_unaligned_words) > 0:
 22 |                 to_realign.append({
 23 |                     "start": last_aligned_word,
 24 |                     "end": wd,
 25 |                     "words": cur_unaligned_words})
 26 |                 cur_unaligned_words = []
 27 | 
 28 |             last_aligned_word = wd
 29 | 
 30 |     if len(cur_unaligned_words) > 0:
 31 |         to_realign.append({
 32 |             "start": last_aligned_word,
 33 |             "end": None,
 34 |             "words": cur_unaligned_words})
 35 | 
 36 |     return to_realign
 37 |     
 38 | def realign(wavfile, alignment, ms, resources, nthreads=4, progress_cb=None):
 39 |     to_realign = prepare_multipass(alignment)
 40 |     realignments = []
 41 | 
 42 |     def realign(chunk):
 43 |         wav_obj = wave.open(wavfile, 'rb')
 44 | 
 45 |         if chunk["start"] is None:
 46 |             start_t = 0
 47 |         else:
 48 |             start_t = chunk["start"].end
 49 | 
 50 |         if chunk["end"] is None:
 51 |             end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
 52 |         else:
 53 |             end_t = chunk["end"].start
 54 | 
 55 |         duration = end_t - start_t
 56 |         # XXX: the minimum length seems bigger now (?)
 57 |         if duration < 0.75 or duration > 60:
 58 |             logging.debug("cannot realign %d words with duration %f" % (len(chunk['words']), duration))
 59 |             return
 60 | 
 61 |         # Create a language model
 62 |         offset_offset = chunk['words'][0].startOffset
 63 |         chunk_len = chunk['words'][-1].endOffset - offset_offset
 64 |         chunk_transcript = ms.raw_sentence[offset_offset:offset_offset+chunk_len].encode("utf-8")
 65 |         chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
 66 |         chunk_ks = chunk_ms.get_kaldi_sequence()
 67 | 
 68 |         chunk_gen_hclg_filename = language_model.make_bigram_language_model(chunk_ks, resources.proto_langdir)
 69 |         k = standard_kaldi.Kaldi(
 70 |             resources.nnet_gpu_path,
 71 |             chunk_gen_hclg_filename,
 72 |             resources.proto_langdir)
 73 | 
 74 |         wav_obj = wave.open(wavfile, 'rb')
 75 |         wav_obj.setpos(int(start_t * wav_obj.getframerate()))
 76 |         buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
 77 | 
 78 |         k.push_chunk(buf)
 79 |         ret = [transcription.Word(**wd) for wd in k.get_final()]
 80 |         k.stop()
 81 | 
 82 |         word_alignment = diff_align.align(ret, chunk_ms)
 83 | 
 84 |         for wd in word_alignment:
 85 |             wd.shift(time=start_t, offset=offset_offset)
 86 | 
 87 |         # "chunk" should be replaced by "words"
 88 |         realignments.append({"chunk": chunk, "words": word_alignment})
 89 | 
 90 |         if progress_cb is not None:
 91 |             progress_cb({"percent": len(realignments) / float(len(to_realign))})
 92 | 
 93 |     pool = Pool(nthreads)
 94 |     pool.map(realign, to_realign)
 95 |     pool.close()
 96 | 
 97 |     # Sub in the replacements
 98 |     o_words = alignment
 99 |     for ret in realignments:
100 |         st_idx = o_words.index(ret["chunk"]["words"][0])
101 |         end_idx= o_words.index(ret["chunk"]["words"][-1])+1
102 |         #logging.debug('splice in: "%s' % (str(ret["words"])))
103 |         #logging.debug('splice out: "%s' % (str(o_words[st_idx:end_idx])))
104 |         o_words = o_words[:st_idx] + ret["words"] + o_words[end_idx:]
105 | 
106 |     return o_words
107 | 


--------------------------------------------------------------------------------
/gentle/resample.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import subprocess
 4 | import tempfile
 5 | 
 6 | from contextlib import contextmanager
 7 | 
 8 | 
 9 | from .util.paths import get_binary
10 | 
11 | FFMPEG = get_binary("ffmpeg")
12 | SOX = get_binary("sox")
13 | 
14 | def resample_ffmpeg(infile, outfile, offset=None, duration=None):
15 |     '''
16 |     Use FFMPEG to convert a media file to a wav file sampled at 8K
17 |     '''
18 |     if offset is None:
19 |         offset = []
20 |     else:
21 |         offset = ['-ss', str(offset)]
22 |     if duration is None:
23 |         duration = []
24 |     else:
25 |         duration = ['-t', str(duration)]
26 | 
27 |     cmd = [
28 |         FFMPEG,
29 |         '-loglevel', 'panic',
30 |         '-y',
31 |     ] + offset + [
32 |         '-i', infile,
33 |     ] + duration + [
34 |         '-ac', '1', '-ar', '8000',
35 |         '-acodec', 'pcm_s16le',
36 |         outfile
37 |     ]
38 |     return subprocess.call(cmd)
39 | 
40 | def resample_sox(infile, outfile, offset=None, duration=None):
41 |     '''
42 |     Use SoX to convert a media file to a wav file sampled at 8K
43 |     '''
44 |     if offset is None and duration is None:
45 |         trim = []
46 |     else:
47 |         if offset is None:
48 |             offset = 0
49 |         trim = ['trim', str(offset)]
50 |         if duration is not None:
51 |             trim += [str(duration)]
52 | 
53 |     cmd = [
54 |         SOX,
55 |         '--clobber',
56 |         '-q',
57 |         '-V1',
58 |         infile,
59 |         '-b', '16',
60 |         '-c', '1',
61 |         '-e', 'signed-integer',
62 |         '-r', '8000',
63 |         '-L',
64 |         outfile
65 |     ] + trim
66 |     return subprocess.call(cmd)
67 | 
68 | def resample(infile, outfile, offset=None, duration=None):
69 |     if not os.path.isfile(infile):
70 |         raise IOError("Not a file: %s" % infile)
71 |     if shutil.which(FFMPEG) or os.path.exists(FFMPEG):
72 |         return resample_ffmpeg(infile, outfile, offset, duration)
73 |     else:
74 |         return resample_sox(infile, outfile, offset, duration)
75 | 
76 | @contextmanager
77 | def resampled(infile, offset=None, duration=None):
78 |     with tempfile.NamedTemporaryFile(suffix='.wav') as fp:
79 |         if resample(infile, fp.name, offset, duration) != 0:
80 |             raise RuntimeError("Unable to resample/encode '%s'" % infile)
81 |         yield fp.name
82 | 


--------------------------------------------------------------------------------
/gentle/resources.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | 
 4 | from .util.paths import get_resource, ENV_VAR
 5 | from . import metasentence
 6 | 
 7 | class Resources():
 8 | 
 9 |     def __init__(self):
10 |         self.proto_langdir = get_resource('exp')
11 |         self.nnet_gpu_path = get_resource('exp/tdnn_7b_chain_online/')
12 |         self.full_hclg_path = get_resource('exp/tdnn_7b_chain_online/graph_pp/HCLG.fst')
13 | 
14 |         def require_dir(path):
15 |             if not os.path.isdir(path):
16 |                 raise RuntimeError("No resource directory %s.  Check %s environment variable?" % (path, ENV_VAR))
17 | 
18 | 
19 |         require_dir(self.proto_langdir)
20 |         require_dir(self.nnet_gpu_path)
21 | 
22 |         with open(os.path.join(self.proto_langdir, "langdir", "words.txt")) as fh:
23 |             self.vocab = metasentence.load_vocabulary(fh)
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/gentle/rpc.py:
--------------------------------------------------------------------------------
 1 | class RPCProtocol(object):
 2 |     '''RPCProtocol is the wire protocol we use to communicate with the
 3 |     standard_kaldi subprocess. It's a mixed text/binary protocol
 4 |     because we need to send binary audio chunks, but text is simpler.'''
 5 | 
 6 |     def __init__(self, send_pipe, recv_pipe):
 7 |         '''Initializes the RPCProtocol and reads from recv_pipe until the startup
 8 |         message is received.'''
 9 |         self.send_pipe = send_pipe
10 |         self.recv_pipe = recv_pipe
11 | 
12 |         # don't wait for startup
13 |         # body, _ = self._read_reply()
14 |         # if body != 'loaded':
15 |         #     raise RuntimeError('unexpected message from standard_kaldi on load')
16 | 
17 |     def do(self, method, *args, **kwargs):
18 |         '''Performs the method requested and returns the response body.
19 |         The body keyword argument can be used to provide a binary request
20 |         body. Throws an RPCError when the RPC returns an error.'''
21 |         body = kwargs.get('body', None)
22 |         self._write_request(method, args, body)
23 |         return self._read_reply()
24 | 
25 |     def _write_request(self, method, args, body):
26 |         '''Writes a request to the stream.
27 |         Request format:
28 |         MSG_SIZE\n
29 |         METHOD <ARG1> <ARG2> ... <ARGN>\n
30 |         BODY\n
31 |         '''
32 |         data = method
33 |         for arg in args:
34 |             data += ' ' + arg
35 |         data += '\n'
36 |         if body:
37 |             data += body
38 | 
39 |         try:
40 |             self.send_pipe.write('%d\n' % len(data))
41 |             self.send_pipe.write(data)
42 |             self.send_pipe.write('\n')
43 |         except IOError as _:
44 |             raise IOError("Lost connection with standard_kaldi subprocess")
45 | 
46 |     def _read_reply(self):
47 |         '''Reads a reply from the stream.
48 |         Reply format:
49 |         MSG_SIZE\n
50 |         STATUS\n
51 |         BODY\n
52 |         '''
53 |         try:
54 |             msg_size = int(self.recv_pipe.readline())
55 |             data = self.recv_pipe.read(msg_size)
56 |             self.recv_pipe.read(1) # trailing newline
57 | 
58 |             status_str, body = data.split('\n', 1)
59 |             status = int(status_str)
60 |         except IOError as _:
61 |             raise IOError("Lost connection with standard_kaldi subprocess")
62 | 
63 |         if status < 200 or status >= 300:
64 |             raise RPCError(status, body)
65 | 
66 |         return body, status
67 | 
68 | class RPCError(Exception):
69 |     '''Error thrown when standard_kaldi returns an error (in-band)'''
70 |     def __init__(self, status, why):
71 |         self.status = status
72 |         self.why = why
73 |     def __str__(self):
74 |         return 'standard_kaldi: error %d: %s' % (self.status, self.why)
75 | 


--------------------------------------------------------------------------------
/gentle/standard_kaldi.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import os
 3 | import logging
 4 | 
 5 | from .util.paths import get_binary
 6 | 
 7 | EXECUTABLE_PATH = get_binary("ext/k3")
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | STDERR = subprocess.DEVNULL
11 | 
12 | class Kaldi:
13 |     def __init__(self, nnet_dir=None, hclg_path=None, proto_langdir=None):
14 |         cmd = [EXECUTABLE_PATH]
15 |         
16 |         if nnet_dir is not None:
17 |             cmd.append(nnet_dir)
18 |             cmd.append(hclg_path)
19 | 
20 |         if not os.path.exists(hclg_path):
21 |             logger.error('hclg_path does not exist: %s', hclg_path)
22 |         self._p = subprocess.Popen(cmd,
23 |                                    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
24 |                                    stderr=STDERR, bufsize=0)
25 |         self.finished = False
26 | 
27 |     def _cmd(self, c):
28 |         self._p.stdin.write(("%s\n" % (c)).encode())
29 |         self._p.stdin.flush()
30 | 
31 |     def push_chunk(self, buf):
32 |         # Wait until we're ready
33 |         self._cmd("push-chunk")
34 |         
35 |         cnt = int(len(buf)/2)
36 |         self._cmd(str(cnt))
37 |         self._p.stdin.write(buf) #arr.tostring())
38 |         status = self._p.stdout.readline().strip().decode()
39 |         return status == 'ok'
40 | 
41 |     def get_final(self):
42 |         self._cmd("get-final")
43 |         words = []
44 |         while True:
45 |             line = self._p.stdout.readline().decode()
46 |             if line.startswith("done"):
47 |                 break
48 |             parts = line.split(' / ')
49 |             if line.startswith('word'):
50 |                 wd = {}
51 |                 wd['word'] = parts[0].split(': ')[1]
52 |                 wd['start'] = float(parts[1].split(': ')[1])
53 |                 wd['duration'] = float(parts[2].split(': ')[1])
54 |                 wd['phones'] = []
55 |                 words.append(wd)
56 |             elif line.startswith('phone'):
57 |                 ph = {}
58 |                 ph['phone'] = parts[0].split(': ')[1]
59 |                 ph['duration'] = float(parts[1].split(': ')[1])
60 |                 words[-1]['phones'].append(ph)
61 | 
62 |         self._reset()
63 |         return words
64 | 
65 |     def _reset(self):
66 |         self._cmd("reset")
67 | 
68 |     def stop(self):
69 |         if not self.finished:
70 |             self.finished = True
71 |             self._cmd("stop")
72 |             self._p.stdin.close()
73 |             self._p.stdout.close()
74 |             self._p.wait()
75 | 
76 |     def __del__(self):
77 |         self.stop()
78 | 
79 | if __name__=='__main__':
80 |     import numm3
81 |     import sys
82 | 
83 |     infile = sys.argv[1]
84 |     
85 |     k = Kaldi()
86 | 
87 |     buf = numm3.sound2np(infile, nchannels=1, R=8000)
88 |     print('loaded_buf', len(buf))
89 |     
90 |     idx=0
91 |     while idx < len(buf):
92 |         k.push_chunk(buf[idx:idx+160000].tostring())
93 |         print(k.get_final())
94 |         idx += 160000
95 | 


--------------------------------------------------------------------------------
/gentle/transcriber.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import logging
  3 | import wave
  4 | 
  5 | from gentle import transcription
  6 | 
  7 | from multiprocessing.pool import ThreadPool as Pool
  8 | 
  9 | class MultiThreadedTranscriber:
 10 |     def __init__(self, kaldi_queue, chunk_len=20, overlap_t=2, nthreads=4):
 11 |         self.chunk_len = chunk_len
 12 |         self.overlap_t = overlap_t
 13 |         self.nthreads = nthreads
 14 |             
 15 |         self.kaldi_queue = kaldi_queue
 16 | 
 17 |     def transcribe(self, wavfile, progress_cb=None):
 18 |         wav_obj = wave.open(wavfile, 'rb')
 19 |         duration = wav_obj.getnframes() / float(wav_obj.getframerate())
 20 |         n_chunks = int(math.ceil(duration / float(self.chunk_len - self.overlap_t)))
 21 | 
 22 |         chunks = []
 23 | 
 24 | 
 25 |         def transcribe_chunk(idx):
 26 |             wav_obj = wave.open(wavfile, 'rb')
 27 |             start_t = idx * (self.chunk_len - self.overlap_t)
 28 |             # Seek
 29 |             wav_obj.setpos(int(start_t * wav_obj.getframerate()))
 30 |             # Read frames
 31 |             buf = wav_obj.readframes(int(self.chunk_len * wav_obj.getframerate()))
 32 | 
 33 |             if len(buf) < 4000:
 34 |                 logging.info('Short segment - ignored %d' % (idx))
 35 |                 ret = []
 36 |             else:
 37 |                 k = self.kaldi_queue.get()
 38 |                 k.push_chunk(buf)
 39 |                 ret = k.get_final()
 40 |                 # k.reset() (no longer needed)
 41 |                 self.kaldi_queue.put(k)
 42 | 
 43 |             chunks.append({"start": start_t, "words": ret})
 44 |             logging.info('%d/%d' % (len(chunks), n_chunks))
 45 |             if progress_cb is not None:
 46 |                 progress_cb({"message": ' '.join([X['word'] for X in ret]),
 47 |                              "percent": len(chunks) / float(n_chunks)})
 48 | 
 49 | 
 50 |         pool = Pool(min(n_chunks, self.nthreads))
 51 |         pool.map(transcribe_chunk, range(n_chunks))
 52 |         pool.close()
 53 |         
 54 |         chunks.sort(key=lambda x: x['start'])
 55 | 
 56 |         # Combine chunks
 57 |         words = []
 58 |         for c in chunks:
 59 |             chunk_start = c['start']
 60 |             chunk_end = chunk_start + self.chunk_len
 61 | 
 62 |             chunk_words = [transcription.Word(**wd).shift(time=chunk_start) for wd in c['words']]
 63 | 
 64 |             # At chunk boundary cut points the audio often contains part of a
 65 |             # word, which can get erroneously identified as one or more different
 66 |             # in-vocabulary words.  So discard one or more words near the cut points
 67 |             # (they'll be covered by the ovlerap anyway).
 68 |             #
 69 |             trim = min(0.25 * self.overlap_t, 0.5)
 70 |             if c is not chunks[0]:
 71 |                 while len(chunk_words) > 1:
 72 |                     chunk_words.pop(0)
 73 |                     if chunk_words[0].end > chunk_start + trim:
 74 |                         break
 75 |             if c is not chunks[-1]:
 76 |                 while len(chunk_words) > 1:
 77 |                     chunk_words.pop()
 78 |                     if chunk_words[-1].start < chunk_end - trim:
 79 |                         break
 80 | 
 81 |             words.extend(chunk_words)
 82 | 
 83 |         # Remove overlap:  Sort by time, then filter out any Word entries in
 84 |         # the list that are adjacent to another entry corresponding to the same
 85 |         # word in the audio.
 86 |         words.sort(key=lambda word: word.start)
 87 |         words.append(transcription.Word(word="__dummy__"))
 88 |         words = [words[i] for i in range(len(words)-1) if not words[i].corresponds(words[i+1])]
 89 | 
 90 |         return words, duration
 91 | 
 92 | 
 93 | if __name__=='__main__':
 94 |     # full transcription
 95 |     import json
 96 |     import sys
 97 | 
 98 |     import logging
 99 |     logging.getLogger().setLevel('INFO')
100 | 
101 |     import gentle
102 |     from gentle import standard_kaldi
103 |     from gentle import kaldi_queue
104 | 
105 |     resources = gentle.Resources()
106 | 
107 |     k_queue = kaldi_queue.build(resources, 3)
108 |     trans = MultiThreadedTranscriber(k_queue)
109 | 
110 |     with gentle.resampled(sys.argv[1]) as filename:
111 |         words, duration = trans.transcribe(filename)
112 | 
113 |     open(sys.argv[2], 'w').write(transcription.Transcription(words=words).to_json())
114 | 
115 | 


--------------------------------------------------------------------------------
/gentle/transcription.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import io
  3 | import json
  4 | 
  5 | from collections import defaultdict
  6 | 
  7 | class Word:
  8 | 
  9 |     SUCCESS = 'success'
 10 |     NOT_FOUND_IN_AUDIO = 'not-found-in-audio'
 11 |     NOT_FOUND_IN_TRANSCRIPT = 'not-found-in-transcript'
 12 | 
 13 |     def __init__(self, case=None, startOffset=None, endOffset=None, word=None, alignedWord=None, phones=None, start=None, end=None, duration=None):
 14 |         self.case = case
 15 |         self.startOffset = startOffset
 16 |         self.endOffset = endOffset
 17 |         self.word = word
 18 |         self.alignedWord = alignedWord
 19 |         self.phones = phones
 20 |         self.start = start
 21 |         self.duration = duration
 22 |         self.end = end
 23 |         if start is not None:
 24 |             if end is None:
 25 |                 self.end = start + duration
 26 |             elif duration is None:
 27 |                 self.duration = end - start
 28 | 
 29 |     def success(self):
 30 |         return self.case == Word.SUCCESS
 31 | 
 32 |     def not_found_in_audio(self):
 33 |         return self.case == Word.NOT_FOUND_IN_AUDIO
 34 | 
 35 |     def as_dict(self, without=None):
 36 |         return { key:val for key, val in self.__dict__.items() if (val is not None) and (key != without)}
 37 | 
 38 |     def __eq__(self, other):
 39 |         return self.__dict__ == other.__dict__
 40 | 
 41 |     def __ne__(self, other):
 42 |         return not self == other
 43 | 
 44 |     def __repr__(self):
 45 |         return "Word(" + " ".join(sorted([key + "=" + str(val) for key, val in self.as_dict(without="phones").items()])) + ")"
 46 | 
 47 |     def shift(self, time=None, offset=None):
 48 |         if self.start is not None and time is not None:
 49 |             self.start += time
 50 |             self.end += time
 51 | 
 52 |         if self.startOffset is not None and offset is not None:
 53 |             self.startOffset += offset
 54 |             self.endOffset += offset
 55 | 
 56 |         return self # for easy chaining
 57 | 
 58 |     def swap_alignment(self, other):
 59 |         '''Swaps the alignment info of two words, but does not swap the offset'''
 60 |         self.case, other.case = other.case, self.case
 61 |         self.alignedWord, other.alignedWord = other.alignedWord, self.alignedWord
 62 |         self.phones, other.phones = other.phones, self.phones
 63 |         self.start, other.start = other.start, self.start
 64 |         self.end, other.end = other.end, self.end
 65 |         self.duration, other.duration = other.duration, self.duration
 66 | 
 67 |     def corresponds(self, other):
 68 |         '''Returns true if self and other refer to the same word, at the same position in the audio (within a small tolerance)'''
 69 |         if self.word != other.word: return False
 70 |         return abs(self.start - other.start) / (self.duration + other.duration) < 0.1
 71 | 
 72 | class Transcription:
 73 | 
 74 |     def __init__(self, transcript=None, words=None):
 75 |         self.transcript = transcript
 76 |         self.words = words
 77 | 
 78 |     def __eq__(self, other):
 79 |         return self.transcript == other.transcript and self.words == other.words
 80 | 
 81 |     def to_json(self, **kwargs):
 82 |         '''Return a JSON representation of the aligned transcript'''
 83 |         options = {
 84 |                 'sort_keys':    True,
 85 |                 'indent':       4,
 86 |                 'separators':   (',', ': '),
 87 |                 }
 88 |         options.update(kwargs)
 89 | 
 90 |         container = {}
 91 |         if self.transcript:
 92 |             container['transcript'] = self.transcript
 93 |         if self.words: 
 94 |             container['words'] = [word.as_dict(without="duration") for word in self.words]
 95 |         return json.dumps(container, **options)
 96 | 
 97 |     @classmethod
 98 |     def from_json(cls, json_str):
 99 |         return cls._from_jsondata(json.loads(json_str))
100 | 
101 |     @classmethod
102 |     def from_jsonfile(cls, filename):
103 |         with open(filename) as fh:
104 |             return cls._from_jsondata(json.load(fh))
105 | 
106 |     @classmethod
107 |     def _from_jsondata(cls, data):
108 |         return cls(transcript = data['transcript'], words = [Word(**wd) for wd in data['words']])
109 | 
110 |     def to_csv(self):
111 |         '''Return a CSV representation of the aligned transcript. Format:
112 |         <word> <token> <start seconds> <end seconds>
113 |         '''
114 |         if not self.words:
115 |             return ''
116 |         buf = io.StringIO()
117 |         w = csv.writer(buf)
118 |         for X in self.words:
119 |             if X.case not in (Word.SUCCESS, Word.NOT_FOUND_IN_AUDIO):
120 |                 continue
121 |             row = [X.word,
122 |                 X.alignedWord,
123 |                 X.start,
124 |                 X.end
125 |             ]
126 |             w.writerow(row)
127 |         return buf.getvalue()
128 | 
129 |     def stats(self):
130 |         counts = defaultdict(int)
131 |         for word in self.words:
132 |             counts[word.case] += 1
133 |         stats = {}
134 |         stats['total'] = len(self.words)
135 |         for key, val in counts.items():
136 |             stats[key] = val
137 |         return stats
138 | 
139 | Transcription.Word = Word
140 | 


--------------------------------------------------------------------------------
/gentle/util/__init__.py:
--------------------------------------------------------------------------------
1 | # nothing here right now
2 | 


--------------------------------------------------------------------------------
/gentle/util/cyst.py:
--------------------------------------------------------------------------------
 1 | # Twisted lazy computations
 2 | # (from rmo-sketchbook/cyst/cyst.py)
 3 | 
 4 | import mimetypes
 5 | import os
 6 | 
 7 | from twisted.web.static import File
 8 | from twisted.web.resource import Resource
 9 | from twisted.web.server import Site, NOT_DONE_YET
10 | from twisted.internet import reactor
11 | 
12 | class Insist(Resource):
13 |     isLeaf = True
14 | 
15 |     def __init__(self, cacheloc):
16 |         self.cacheloc = cacheloc
17 |         self.cachefile = None
18 |         if os.path.exists(cacheloc):
19 |             self.cachefile = File(cacheloc)
20 |         self.reqs_waiting = []
21 |         self.started = False
22 |         Resource.__init__(self)
23 | 
24 |     def render_GET(self, req):
25 |         # Check if someone else has created the file somehow
26 |         if self.cachefile is None and os.path.exists(self.cacheloc):
27 |             self.cachefile = File(self.cacheloc)
28 |         # Check if someone else has *deleted* the file
29 |         elif self.cachefile is not None and not os.path.exists(self.cacheloc):
30 |             self.cachefile = None
31 | 
32 |         if self.cachefile is not None:
33 |             return self.cachefile.render_GET(req)
34 |         else:
35 |             self.reqs_waiting.append(req)
36 |             req.notifyFinish().addErrback(
37 |                 self._nevermind, req)
38 |             if not self.started:
39 |                 self.started = True
40 |                 reactor.callInThread(self.desist)
41 |             return NOT_DONE_YET
42 | 
43 |     def _nevermind(self, _err, req):
44 |         self.reqs_waiting.remove(req)
45 | 
46 |     def desist(self):
47 |         self.serialize_computation(self.cacheloc)
48 |         reactor.callFromThread(self.resist)
49 | 
50 |     def _get_mime(self):
51 |         return mimetypes.guess_type(self.cacheloc)[0]
52 | 
53 |     def resist(self):
54 |         if not os.path.exists(self.cacheloc):
55 |             # Error!
56 |             print("%s does not exist - rendering fail!" % (self.cacheloc))
57 |             for req in self.reqs_waiting:
58 |                 req.headers[b"Content-Type"] = b"text/plain"
59 |                 req.write(b"cyst error")
60 |                 req.finish()
61 |             return
62 | 
63 |         self.cachefile = File(self.cacheloc)
64 | 
65 |         # Send content to all interested parties
66 |         for req in self.reqs_waiting:
67 |             self.cachefile.render(req)
68 | 
69 |     def serialize_computation(self, outpath):
70 |         raise NotImplemented
71 | 
72 | class HelloCyst(Insist):
73 |     def serialize_computation(self, outpath):
74 |         import time
75 |         time.sleep(10)
76 |         open(outpath, "w").write("Hello, World")
77 | 
78 | if __name__=='__main__':
79 |     import sys
80 |     c = HelloCyst(sys.argv[1])
81 |     site = Site(c)
82 |     port = 7984
83 |     reactor.listenTCP(port, site)
84 |     print("http://localhost:%d" % (port))
85 |     reactor.run()
86 | 


--------------------------------------------------------------------------------
/gentle/util/paths.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import glob
 3 | import logging
 4 | import shutil
 5 | import sys
 6 | 
 7 | ENV_VAR = 'GENTLE_RESOURCES_ROOT'
 8 | 
 9 | class SourceResolver:
10 |     def __init__(self):
11 |         self.project_root = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir))
12 | 
13 |     def get_binary(self, name):
14 |         path_in_project = os.path.join(self.project_root, name)
15 |         if os.path.exists(path_in_project):
16 |             return path_in_project
17 |         else:
18 |             return name
19 | 
20 |     def get_resource(self, name):
21 |         root = os.environ.get(ENV_VAR) or self.project_root
22 |         return os.path.join(root, name)
23 | 
24 |     def get_datadir(self, name):
25 |         return self.get_resource(name)
26 | 
27 | class PyinstallResolver:
28 |     def __init__(self):
29 |         self.root = os.path.abspath(os.path.join(getattr(sys, '_MEIPASS', ''), os.pardir, 'Resources'))
30 | 
31 |     def get_binary(self, name):
32 |         return os.path.join(self.root, name)
33 | 
34 |     def get_resource(self, name):
35 |         rpath = os.path.join(self.root, name)
36 |         if os.path.exists(rpath):
37 |             return rpath
38 |         else:
39 |             return get_datadir(name) # DMG may be read-only; fall-back to datadir (ie. so language models can be added)
40 | 
41 |     def get_datadir(self, path):
42 |         return os.path.join(os.environ['HOME'], '.gentle', path)
43 | 
44 | RESOLVER = PyinstallResolver() if hasattr(sys, "frozen") else SourceResolver()
45 | 
46 | 
47 | def get_binary(name):
48 |     return RESOLVER.get_binary(name)
49 | 
50 | def get_resource(path):
51 |     return RESOLVER.get_resource(path)
52 | 
53 | def get_datadir(path):
54 |     return RESOLVER.get_datadir(path)
55 | 


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | git submodule init
 6 | git submodule update
 7 | 
 8 | ./install_deps.sh
 9 | (cd ext && ./install_kaldi.sh)
10 | ./install_models.sh
11 | cd ext && make depend && make
12 | 


--------------------------------------------------------------------------------
/install_deps.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | echo "Installing dependencies..."
 6 | 
 7 | # Install OS-specific dependencies
 8 | if [[ "$OSTYPE" == "linux-gnu" ]]; then
 9 | 	apt-get update -qq
10 | 	apt-get install -y zlib1g-dev automake autoconf git \
11 | 		libtool subversion libatlas3-base python3-pip \
12 | 		python3-dev wget unzip python3
13 | 	apt-get install -y ffmpeg || echo -n  "\n\nYou have to install ffmpeg from a PPA or from https://ffmpeg.org before you can run gentle\n\n"
14 | 	python3 setup.py develop
15 | elif [[ "$OSTYPE" == "darwin"* ]]; then
16 | 	brew install ffmpeg libtool automake autoconf wget python3
17 | 	sudo python3 setup.py develop
18 | fi
19 | 


--------------------------------------------------------------------------------
/install_models.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | VERSION="0.04"
 6 | 
 7 | download_models() {
 8 | 	local version="$1"
 9 | 	local filename="kaldi-models-$version.zip"
10 | 	local url="https://rmozone.com/gentle/$filename"
11 | 	wget -O $filename $url
12 | 	unzip $filename
13 | 	rm $filename
14 | }
15 | 
16 | echo "Downloading models for v$VERSION..." 1>&2
17 | download_models $VERSION
18 | 


--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | [MESSAGES CONTROL]
2 | disable=locally-disabled
3 | 


--------------------------------------------------------------------------------
/serve.py:
--------------------------------------------------------------------------------
  1 | from twisted.web.static import File
  2 | from twisted.web.resource import Resource
  3 | from twisted.web.server import Site, NOT_DONE_YET
  4 | from twisted.internet import reactor, threads
  5 | from twisted.web._responses import FOUND
  6 | 
  7 | import json
  8 | import logging
  9 | import multiprocessing
 10 | import os
 11 | import shutil
 12 | import uuid
 13 | import wave
 14 | 
 15 | from gentle.util.paths import get_resource, get_datadir
 16 | from gentle.util.cyst import Insist
 17 | 
 18 | import gentle
 19 | 
 20 | class TranscriptionStatus(Resource):
 21 |     def __init__(self, status_dict):
 22 |         self.status_dict = status_dict
 23 |         Resource.__init__(self)
 24 | 
 25 |     def render_GET(self, req):
 26 |         req.setHeader(b"Content-Type", "application/json")
 27 |         return json.dumps(self.status_dict).encode()
 28 | 
 29 | class Transcriber():
 30 |     def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
 31 |         self.data_dir = data_dir
 32 |         self.nthreads = nthreads
 33 |         self.ntranscriptionthreads = ntranscriptionthreads
 34 |         self.resources = gentle.Resources()
 35 | 
 36 |         self.full_transcriber = gentle.FullTranscriber(self.resources, nthreads=ntranscriptionthreads)
 37 |         self._status_dicts = {}
 38 | 
 39 |     def get_status(self, uid):
 40 |         return self._status_dicts.setdefault(uid, {})
 41 | 
 42 |     def out_dir(self, uid):
 43 |         return os.path.join(self.data_dir, 'transcriptions', uid)
 44 | 
 45 |     # TODO(maxhawkins): refactor so this is returned by transcribe()
 46 |     def next_id(self):
 47 |         uid = None
 48 |         while uid is None or os.path.exists(os.path.join(self.data_dir, uid)):
 49 |             uid = uuid.uuid4().hex[:8]
 50 |         return uid
 51 | 
 52 |     def transcribe(self, uid, transcript, audio, async_mode, **kwargs):
 53 | 
 54 |         status = self.get_status(uid)
 55 | 
 56 |         status['status'] = 'STARTED'
 57 |         output = {
 58 |             'transcript': transcript
 59 |         }
 60 | 
 61 |         outdir = os.path.join(self.data_dir, 'transcriptions', uid)
 62 | 
 63 |         tran_path = os.path.join(outdir, 'transcript.txt')
 64 |         with open(tran_path, 'w') as tranfile:
 65 |             tranfile.write(transcript)
 66 |         audio_path = os.path.join(outdir, 'upload')
 67 |         with open(audio_path, 'wb') as wavfile:
 68 |             wavfile.write(audio)
 69 | 
 70 |         status['status'] = 'ENCODING'
 71 | 
 72 |         wavfile = os.path.join(outdir, 'a.wav')
 73 |         if gentle.resample(os.path.join(outdir, 'upload'), wavfile) != 0:
 74 |             status['status'] = 'ERROR'
 75 |             status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
 76 |             # Save the status so that errors are recovered on restart of the server
 77 |             # XXX: This won't work, because the endpoint will override this file
 78 |             with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
 79 |                 json.dump(status, jsfile, indent=2)
 80 |             return
 81 | 
 82 |         #XXX: Maybe we should pass this wave object instead of the
 83 |         # file path to align_progress
 84 |         wav_obj = wave.open(wavfile, 'rb')
 85 |         status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate())
 86 |         status['status'] = 'TRANSCRIBING'
 87 | 
 88 |         def on_progress(p):
 89 |             print(p)
 90 |             for k,v in p.items():
 91 |                 status[k] = v
 92 | 
 93 |         if len(transcript.strip()) > 0:
 94 |             trans = gentle.ForcedAligner(self.resources, transcript, nthreads=self.nthreads, **kwargs)
 95 |         elif self.full_transcriber.available:
 96 |             trans = self.full_transcriber
 97 |         else:
 98 |             status['status'] = 'ERROR'
 99 |             status['error']  = 'No transcript provided and no language model for full transcription'
100 |             return
101 | 
102 |         output = trans.transcribe(wavfile, progress_cb=on_progress, logging=logging)
103 | 
104 |         # ...remove the original upload
105 |         os.unlink(os.path.join(outdir, 'upload'))
106 | 
107 |         # Save
108 |         with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
109 |             jsfile.write(output.to_json(indent=2))
110 |         with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
111 |             csvfile.write(output.to_csv())
112 | 
113 |         # Inline the alignment into the index.html file.
114 |         htmltxt = open(get_resource('www/view_alignment.html')).read()
115 |         htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (output.to_json()));
116 |         open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt)
117 | 
118 |         status['status'] = 'OK'
119 | 
120 |         logging.info('done with transcription.')
121 | 
122 |         return output
123 | 
124 | class TranscriptionsController(Resource):
125 |     def __init__(self, transcriber):
126 |         Resource.__init__(self)
127 |         self.transcriber = transcriber
128 | 
129 |     def getChild(self, uid, req):
130 |         uid = uid.decode()
131 |         out_dir = self.transcriber.out_dir(uid)
132 |         trans_ctrl = File(out_dir)
133 | 
134 |         # Add a Status endpoint to the file
135 |         trans_status = TranscriptionStatus(self.transcriber.get_status(uid))
136 |         trans_ctrl.putChild(b"status.json", trans_status)
137 | 
138 |         return trans_ctrl
139 | 
140 |     def render_POST(self, req):
141 |         uid = self.transcriber.next_id()
142 | 
143 |         tran = req.args.get(b'transcript', [b''])[0].decode()
144 |         audio = req.args[b'audio'][0]
145 | 
146 |         disfluency = True if b'disfluency' in req.args else False
147 |         conservative = True if b'conservative' in req.args else False
148 |         kwargs = {'disfluency': disfluency,
149 |                   'conservative': conservative,
150 |                   'disfluencies': set(['uh', 'um'])}
151 | 
152 |         async_mode = True
153 |         if b'async' in req.args and req.args[b'async'][0] == b'false':
154 |             async_mode = False
155 | 
156 |         # We need to make the transcription directory here, so that
157 |         # when we redirect the user we are sure that there's a place
158 |         # for them to go.
159 |         outdir = os.path.join(self.transcriber.data_dir, 'transcriptions', uid)
160 |         os.makedirs(outdir)
161 | 
162 |         # Copy over the HTML
163 |         shutil.copy(get_resource('www/view_alignment.html'), os.path.join(outdir, 'index.html'))
164 | 
165 |         result_promise = threads.deferToThreadPool(
166 |             reactor, reactor.getThreadPool(),
167 |             self.transcriber.transcribe,
168 |             uid, tran, audio, async_mode, **kwargs)
169 | 
170 |         if not async_mode:
171 |             def write_result(result):
172 |                 '''Write JSON to client on completion'''
173 |                 req.setHeader("Content-Type", "application/json")
174 |                 req.write(result.to_json(indent=2).encode())
175 |                 req.finish()
176 |             result_promise.addCallback(write_result)
177 |             result_promise.addErrback(lambda _: None) # ignore errors
178 | 
179 |             req.notifyFinish().addErrback(lambda _: result_promise.cancel())
180 | 
181 |             return NOT_DONE_YET
182 | 
183 |         req.setResponseCode(FOUND)
184 |         req.setHeader(b"Location", "/transcriptions/%s" % (uid))
185 |         return b''
186 | 
187 | class LazyZipper(Insist):
188 |     def __init__(self, cachedir, transcriber, uid):
189 |         self.transcriber = transcriber
190 |         self.uid = uid
191 |         Insist.__init__(self, os.path.join(cachedir, '%s.zip' % (uid)))
192 | 
193 |     def serialize_computation(self, outpath):
194 |         shutil.make_archive('.'.join(outpath.split('.')[:-1]), # We need to strip the ".zip" from the end
195 |                             "zip",                             # ...because `shutil.make_archive` adds it back
196 |                             os.path.join(self.transcriber.out_dir(self.uid)))
197 | 
198 | class TranscriptionZipper(Resource):
199 |     def __init__(self, cachedir, transcriber):
200 |         self.cachedir = cachedir
201 |         self.transcriber = transcriber
202 |         Resource.__init__(self)
203 | 
204 |     def getChild(self, path, req):
205 |         uid = path.decode().split('.')[0]
206 |         t_dir = self.transcriber.out_dir(uid)
207 |         if os.path.exists(t_dir):
208 |             # TODO: Check that "status" is complete and only create a LazyZipper if so
209 |             # Otherwise, we could have incomplete transcriptions that get permanently zipped.
210 |             # For now, a solution will be hiding the button in the client until it's done.
211 |             lz = LazyZipper(self.cachedir, self.transcriber, uid)
212 |             if not isinstance(path, bytes):
213 |                 path = path.encode()
214 |             self.putChild(path, lz)
215 |             return lz
216 |         else:
217 |             return Resource.getChild(self, path, req)
218 | 
219 | def serve(port=8765, interface='0.0.0.0', installSignalHandlers=0, nthreads=4, ntranscriptionthreads=2, data_dir=get_datadir('webdata')):
220 |     logging.info("SERVE %d, %s, %d", port, interface, installSignalHandlers)
221 | 
222 |     if not os.path.exists(data_dir):
223 |         os.makedirs(data_dir)
224 | 
225 |     zip_dir = os.path.join(data_dir, 'zip')
226 |     if not os.path.exists(zip_dir):
227 |         os.makedirs(zip_dir)
228 | 
229 |     f = File(data_dir)
230 | 
231 |     f.putChild(b'', File(get_resource('www/index.html')))
232 |     f.putChild(b'status.html', File(get_resource('www/status.html')))
233 |     f.putChild(b'preloader.gif', File(get_resource('www/preloader.gif')))
234 | 
235 |     trans = Transcriber(data_dir, nthreads=nthreads, ntranscriptionthreads=ntranscriptionthreads)
236 |     trans_ctrl = TranscriptionsController(trans)
237 |     f.putChild(b'transcriptions', trans_ctrl)
238 | 
239 |     trans_zippr = TranscriptionZipper(zip_dir, trans)
240 |     f.putChild(b'zip', trans_zippr)
241 | 
242 |     s = Site(f)
243 |     logging.info("about to listen")
244 |     reactor.listenTCP(port, s, interface=interface)
245 |     logging.info("listening")
246 | 
247 |     reactor.run(installSignalHandlers=installSignalHandlers)
248 | 
249 | 
250 | if __name__=='__main__':
251 |     import argparse
252 | 
253 |     parser = argparse.ArgumentParser(
254 |         description='Align a transcript to audio by generating a new language model.')
255 |     parser.add_argument('--host', default="0.0.0.0",
256 |                        help='host to run http server on')
257 |     parser.add_argument('--port', default=8765, type=int,
258 |                         help='port number to run http server on')
259 |     parser.add_argument('--nthreads', default=multiprocessing.cpu_count(), type=int,
260 |                         help='number of alignment threads')
261 |     parser.add_argument('--ntranscriptionthreads', default=2, type=int,
262 |                         help='number of full-transcription threads (memory intensive)')
263 |     parser.add_argument('--log', default="INFO",
264 |                         help='the log level (DEBUG, INFO, WARNING, ERROR, or CRITICAL)')
265 | 
266 |     args = parser.parse_args()
267 | 
268 |     log_level = args.log.upper()
269 |     logging.getLogger().setLevel(log_level)
270 | 
271 |     logging.info('gentle %s' % (gentle.__version__))
272 |     logging.info('listening at %s:%d\n' % (args.host, args.port))
273 | 
274 |     serve(args.port, args.host, nthreads=args.nthreads, ntranscriptionthreads=args.ntranscriptionthreads, installSignalHandlers=1)
275 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | from gentle import __version__
 3 | 
 4 | setup(
 5 |     app=['serve.py'],
 6 |     data_files=[],
 7 |     options={'py2app': {
 8 |         'argv_emulation': False,
 9 |         'resources': 'k3,m3,ffmpeg,www,exp'
10 |     }},
11 |     name='gentle',
12 |     version=__version__,
13 |     description='Robust yet lenient forced-aligner built on Kaldi.',
14 |     url='http://lowerquality.com/gentle',
15 |     author='Robert M Ochshorn',
16 |     license='MIT',
17 |     packages=['gentle'],
18 |     install_requires=['twisted'],
19 |     test_suite='tests',
20 | )
21 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strob/gentle/087a3b738bad9ebafd0570561486252cb624ba3a/tests/__init__.py


--------------------------------------------------------------------------------
/tests/base.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | class Base(unittest.TestCase):
 4 | 
 5 |     def test_import(self):
 6 |         import gentle
 7 | 
 8 |     def test_resources(self):
 9 |         import gentle
10 |         resources = gentle.Resources()
11 |         import gentle.util.paths
12 |         self.assertNotEqual(gentle.util.paths.get_binary("ext/k3"), "ext/k3")
13 | 


--------------------------------------------------------------------------------
/tests/transcriber.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | class Aligner(unittest.TestCase):
 5 |     audio = 'examples/data/lucier.mp3'
 6 |     transcript = "i am sitting in a room"
 7 | 
 8 |     def test_resources(self):
 9 |         from gentle import Resources
10 |         from gentle.util.paths import get_binary
11 | 
12 |         resources = Resources()
13 |         k3 = get_binary("ext/k3")
14 |         model = get_binary("exp/tdnn_7b_chain_online/final.mdl" )       
15 | 
16 |         self.assertEqual(os.path.exists(self.audio), True)
17 |         self.assertEqual(os.path.exists(k3), True)
18 |         self.assertEqual(os.path.exists(model), True)
19 | 
20 |     def test_aligner(self):
21 |         import subprocess
22 |         from gentle import resampled, standard_kaldi, Resources
23 |         from gentle.forced_aligner import ForcedAligner
24 |         from gentle.transcription import Word
25 | 
26 |         standard_kaldi.STDERR = subprocess.STDOUT
27 | 
28 |         resources = Resources()
29 |         align = ForcedAligner(resources, self.transcript, nthreads=1)
30 | 
31 |         with resampled(self.audio, 5.0, 5.0) as filename:
32 |             transcription = align.transcribe(filename)
33 |             words = transcription.words
34 |         self.assertEqual(words[0].word, "i")
35 |         self.assertEqual(words[1].word, "am")
36 |         self.assertEqual(words[1].case, Word.SUCCESS)        
37 | 


--------------------------------------------------------------------------------
/www/index.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |   <head>
 3 |     <meta charset="utf-8" />
 4 |     <style>
 5 |       body {font-family: sans-serif; padding-top: 70px; }
 6 |       textarea { width: 500px; height: 20em; }
 7 |       input, textarea { margin: 1em 0; }
 8 |       #header {
 9 |           position: fixed;
10 |           top: 0;
11 |           left: 0;
12 |           height: 50px;
13 |           line-height: 50px;
14 |           width: 100%;
15 |           background-color: #999;
16 |           box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
17 |           font-family: Helvetica, sans-serif;
18 |       }
19 |       #header, #header a {
20 |           color: white;
21 |       }
22 |       .home {
23 |           margin: 0;
24 |           font-size: 125%;
25 |           font-weight: lighter;
26 |           text-transform: lowercase;
27 |       }
28 |       .home a {
29 |           margin: 0;
30 |           background: #666;
31 |           padding-left: 25px;
32 |           padding-right: 30px;
33 |           margin-right: 20px;
34 |           float: left;
35 |           text-decoration: none;
36 |       }
37 |       .home:hover a {
38 |           background: #555;
39 |       }
40 |       #align-button {
41 |         background: #CCC;
42 |         border: 0;
43 |         font-size: 18px;
44 |         padding: 10px 30px;
45 |         cursor: pointer;
46 |       }
47 |       #alignment-flags {
48 |         background: #CCC;
49 |         border: 0;
50 |         font-size: 18px;
51 |         padding: 10px 30px;
52 |       }
53 |       #footer {
54 |         margin-top: 100px;
55 |         border-top: 1px dotted black;
56 |         font-size: 8pt;
57 |       font-style: italic;
58 |       padding: 10px;
59 |       }
60 |     </style>
61 |   </head>
62 |   <body>
63 |     <div id="header">
64 |       <h1 class="home"><a href="/">Gentle</a></h1>
65 |     </div>
66 |     <form action="/transcriptions" method="POST" enctype="multipart/form-data">
67 |       Audio:<br>
68 |       <input type=file name=audio><br>
69 |       <br>
70 |       Transcript:<br>
71 |       <textarea name="transcript"></textarea><br>
72 |       <input id=alignment-flags name=conservative type=checkbox> Conservative<br>
73 |       <input id=alignment-flags name=disfluency type=checkbox> Include disfluencies<br>
74 |       <input id="align-button" type=submit value=Align>
75 |     </form>
76 |     <div id="footer">
77 |       <a href="https://rmozone.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://rmozone.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
78 |     </div>
79 |   </body>
80 | </html>
81 | 


--------------------------------------------------------------------------------
/www/preloader.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strob/gentle/087a3b738bad9ebafd0570561486252cb624ba3a/www/preloader.gif


--------------------------------------------------------------------------------
/www/view_alignment.html:
--------------------------------------------------------------------------------
  1 | <html>
  2 |   <head>
  3 |     <meta charset="utf-8" />
  4 |     <style>
  5 | html, body {
  6 |     margin: 0;
  7 |     padding: 0;
  8 | }
  9 | #header {
 10 |     position: fixed;
 11 |     top: 0;
 12 |     left: 0;
 13 |     height: 50px;
 14 |     line-height: 50px;
 15 |     width: 100%;
 16 |     background-color: #999;
 17 |     box-shadow: 0px 0px 5px 0px rgba(0,0,0,0.5);
 18 |     font-family: Helvetica, sans-serif;
 19 | }
 20 | #header, #header a {
 21 |     color: white;
 22 | }
 23 | #downloads {
 24 |     float: right;
 25 |     background: #999;
 26 | }
 27 | .download {
 28 |     float: right;
 29 |     background: #999;
 30 |     padding: 0 5px;
 31 | }
 32 | .home {
 33 |   margin: 0;
 34 |   font-size: 125%;
 35 |   font-weight: lighter;
 36 |   text-transform: lowercase;
 37 | }
 38 | .home a {
 39 |   margin: 0;
 40 |   background: #666;
 41 |   padding-left: 25px;
 42 |   padding-right: 30px;
 43 |   margin-right: 20px;
 44 |   float: left;
 45 |   text-decoration: none;
 46 | }
 47 | .home:hover a {
 48 |   background: #555;
 49 | }
 50 | #audio {
 51 |     margin-top: 9px;
 52 |     width: 50%;
 53 |     display: inline-block;
 54 | }
 55 | #transcript {
 56 |     margin: 0 15px;
 57 |     margin-top: 70px;
 58 |     margin-bottom: 5em;
 59 |     white-space: pre-wrap;
 60 |     line-height: 2em;
 61 |     max-width: 600px;
 62 |     color: #999;
 63 | }
 64 | #transcript.status {
 65 |     background-color: #333;
 66 |     color: #fff;
 67 |     font-family: Courier, mono;
 68 |     line-height: 1em;
 69 |     font-size: 10pt;
 70 |     max-width: 100%;
 71 | }
 72 | #transcript.status h2 {
 73 |     padding: 10px;
 74 | }
 75 | #transcript.status .entry {
 76 |     margin-bottom: 10px;
 77 |     padding: 10px;
 78 | }
 79 | #transcript.status progress {
 80 |     width: 100%;
 81 |     height: 30px;
 82 |     margin-bottom: 20px;
 83 | }
 84 | .success {
 85 |     color: black;
 86 | }
 87 | .success:hover {
 88 |     text-decoration: underline;
 89 | }
 90 | .active {
 91 |     color: magenta;
 92 | }
 93 | #preloader {
 94 |     visibility: hidden;
 95 | }
 96 | .phactive {
 97 |     text-decoration: underline;
 98 | }
 99 | .phones {
100 |     position: absolute;
101 |     color: #333;
102 | }
103 | .phones .phone {
104 |     margin-right: 5px;
105 |     font-family: Helvetica, sans-serif;
106 |     text-transform: uppercase;
107 |     font-size: 50%;
108 | }
109 | .phones .phone:last-child {
110 |     margin-right: 0;
111 | }
112 | #footer {
113 |   margin-top: 100px;
114 |   border-top: 1px dotted black;
115 |   font-size: 8pt;
116 |   font-style: italic;
117 |   font-family: Helvetica, sans-serif;
118 |   padding: 10px;
119 | }
120 |     </style>
121 |   </head>
122 |   <body>
123 |     <div id="header">
124 |       <h1 class="home"><a href="/">Gentle</a></h1>
125 |       <audio id="audio" src="a.wav" controls="true"></audio>
126 |       <img src="/preloader.gif" id="preloader" alt="loading...">
127 |       <span id="downloads"> </div>
128 |     </div>
129 |     <div id="transcript"></div>
130 |     <div id="footer">
131 |       <a href="https://lowerquality.com/gentle">Gentle</a> is free software released under the <a href="https://opensource.org/licenses/MIT">MIT license</a>. <a href="https://lowerquality.com/gentle">Homepage</a> | <a href="https://github.com/lowerquality/gentle">Source code</a>.
132 |     </div>
133 | 
134 |     <script>
135 | 
136 | function get(url, cb) {
137 |     var xhr = new XMLHttpRequest();
138 |     xhr.open("GET", url, true);
139 |     xhr.onload = function() {
140 |         cb(this.responseText);
141 |     }
142 |     xhr.send();
143 | }
144 | function get_json(url, cb) {
145 |     get(url, function(x) {
146 |         cb(JSON.parse(x));
147 |     });
148 | }
149 | 
150 | var $a = document.getElementById("audio");
151 | window.onkeydown = function(ev) {
152 |     if(ev.keyCode == 32) {
153 |         ev.preventDefault();
154 |         $a.pause();
155 |     }
156 | }
157 | 
158 | var $trans = document.getElementById("transcript");
159 | var $preloader = document.getElementById('preloader');
160 | 
161 | var wds = [];
162 | var cur_wd;
163 | 
164 | var $phones = document.createElement("div");
165 | $phones.className = "phones";
166 | document.body.appendChild($phones);
167 | 
168 | var cur_phones$ = [];           // List of phoneme $divs
169 | var $active_phone;
170 | 
171 | function render_phones(wd) {
172 |     cur_phones$ = [];
173 |     $phones.innerHTML = "";
174 |     $active_phone = null;
175 | 
176 |     $phones.style.top = wd.$div.offsetTop + 18;
177 |     $phones.style.left = wd.$div.offsetLeft;
178 | 
179 |     var dur = wd.end - wd.start;
180 | 
181 |     var start_x = wd.$div.offsetLeft;
182 |     
183 |     wd.phones
184 |         .forEach(function(ph){
185 |             var $p = document.createElement("span");
186 |             $p.className = "phone";
187 |             $p.textContent = ph.phone.split("_")[0];
188 |             
189 |             $phones.appendChild($p);
190 |             cur_phones$.push($p);
191 |         });
192 | 
193 |     var offsetToCenter = (wd.$div.offsetWidth - $phones.offsetWidth) / 2;
194 |     $phones.style.left = wd.$div.offsetLeft + offsetToCenter;
195 | }
196 | function highlight_phone(t) {
197 |     if(!cur_wd) {
198 |         $phones.innerHTML = "";
199 |         return;
200 |     }
201 |     var hit;
202 |     var cur_t = cur_wd.start;
203 |     
204 |     cur_wd.phones.forEach(function(ph, idx) {
205 |         if(cur_t <= t && cur_t + ph.duration >= t) {
206 |             hit = idx;
207 |         }
208 |         cur_t += ph.duration;
209 |     });
210 | 
211 |     if(hit) {
212 |         var $ph = cur_phones$[hit];
213 |         if($ph != $active_phone) {
214 |             if($active_phone) {
215 |                 $active_phone.classList.remove("phactive");
216 |             }
217 |             if($ph) {
218 |                 $ph.classList.add("phactive");
219 |             }
220 |         }
221 |         $active_phone = $ph;
222 |     }
223 | }
224 | 
225 | function highlight_word() {
226 |     var t = $a.currentTime;
227 |     // XXX: O(N); use binary search
228 |     var hits = wds.filter(function(x) {
229 |         return (t - x.start) > 0.01 && (x.end - t) > 0.01;
230 |     }, wds);
231 |     var next_wd = hits[hits.length - 1];
232 | 
233 |     if(cur_wd != next_wd) {
234 |         var active = document.querySelectorAll('.active');
235 |         for(var i = 0; i < active.length; i++) {
236 |             active[i].classList.remove('active');
237 |         }
238 |         if(next_wd && next_wd.$div) {
239 |             next_wd.$div.classList.add('active');
240 |             render_phones(next_wd);
241 |         }
242 |     }
243 |     cur_wd = next_wd;
244 |     highlight_phone(t);
245 | 
246 |     window.requestAnimationFrame(highlight_word);
247 | }
248 | window.requestAnimationFrame(highlight_word);
249 | 
250 | $trans.innerHTML = "Loading...";
251 | 
252 | function render(ret) {
253 |     wds = ret['words'] || [];
254 |     transcript = ret['transcript'];
255 | 
256 |     $trans.innerHTML = '';
257 | 
258 |     var currentOffset = 0;
259 | 
260 |     wds.forEach(function(wd) {
261 |         if(wd.case == 'not-found-in-transcript') {
262 |             // TODO: show phonemes somewhere
263 |             var txt = ' ' + wd.word;
264 |             var $plaintext = document.createTextNode(txt);
265 |             $trans.appendChild($plaintext);
266 |             return;
267 |         }
268 | 
269 |         // Add non-linked text
270 |         if(wd.startOffset > currentOffset) {
271 |             var txt = transcript.slice(currentOffset, wd.startOffset);
272 |             var $plaintext = document.createTextNode(txt);
273 |             $trans.appendChild($plaintext);
274 |             currentOffset = wd.startOffset;
275 |         }
276 | 
277 |         var $wd = document.createElement('span');
278 |         var txt = transcript.slice(wd.startOffset, wd.endOffset);
279 |         var $wdText = document.createTextNode(txt);
280 |         $wd.appendChild($wdText);
281 |         wd.$div = $wd;
282 |         if(wd.start !== undefined) {
283 |             $wd.className = 'success';
284 |         }
285 |         $wd.onclick = function() {
286 |             if(wd.start !== undefined) {
287 |                 console.log(wd.start);
288 |                 $a.currentTime = wd.start;
289 |                 $a.play();
290 |             }
291 |         };
292 |         $trans.appendChild($wd);
293 |         currentOffset = wd.endOffset;
294 |     });
295 | 
296 |     var txt = transcript.slice(currentOffset, transcript.length);
297 |     var $plaintext = document.createTextNode(txt);
298 |     $trans.appendChild($plaintext);
299 |     currentOffset = transcript.length;
300 | }
301 | 
302 | function show_downloads() {
303 |     var $d = document.getElementById("downloads");
304 |     $d.textContent = "Download as: ";
305 |     var uid = window.location.pathname.split("/")[2];
306 |     // Name, path, title, inhibit-on-file:///
307 |     [["CSV", "align.csv", "Word alignment CSV"],
308 |      ["JSON", "align.json", "JSON word/phoneme alignment data"],
309 |      ["Zip", "/zip/" + uid + ".zip", "Standalone zipfile", true]]
310 |         .forEach(function(x) {
311 |             var $a = document.createElement("a");
312 |             $a.className = "download";
313 |             $a.textContent = x[0];
314 |             $a.href = x[1];
315 |             $a.title = x[2];
316 |             if(!x[3] || window.location.protocol != "file:") {
317 |                 $d.appendChild($a);
318 |             }
319 |         });
320 | }
321 | 
322 | var status_init = false;
323 | var status_log  = [];		// [ status ]
324 | var $status_pro;
325 | 
326 | function render_status(ret) {
327 |     if(!status_init) {
328 | 	// Clobber the $trans div and use it for status updates
329 | 	$trans.innerHTML = "<h2>transcription in progress</h2>";
330 | 	$trans.className = "status";
331 | 	$status_pro = document.createElement("progress");
332 | 	$status_pro.setAttribute("min", "0");
333 | 	$status_pro.setAttribute("max", "100");
334 | 	$status_pro.value = 0;
335 | 	$trans.appendChild($status_pro);
336 | 	
337 | 	status_init = true;
338 |     }
339 |     if(ret.status !== "TRANSCRIBING") {
340 | 	if(ret.percent) {
341 | 	    $status_pro.value = (100*ret.percent);
342 | 	}
343 |     }
344 |     else if(ret.percent && (status_log.length == 0 || status_log[status_log.length-1].percent+0.0001 < ret.percent)) {
345 | 	// New entry
346 | 	var $entry = document.createElement("div");
347 | 	$entry.className = "entry";
348 | 	$entry.textContent = ret.message;
349 | 	ret.$div = $entry;
350 | 
351 | 	if(ret.percent) {
352 | 	    $status_pro.value = (100*ret.percent);
353 | 	}
354 | 
355 | 	if(status_log.length > 0) {
356 | 	    $trans.insertBefore($entry, status_log[status_log.length-1].$div);
357 | 	}
358 | 	else {
359 | 	    $trans.appendChild($entry);
360 | 	}
361 | 	status_log.push(ret);
362 |     }
363 | }
364 | 
365 | function update() {
366 |     if(INLINE_JSON) {
367 |         // We want this to work from file:/// domains, so we provide a
368 |         // mechanism for inlining the alignment data.
369 |         render(INLINE_JSON);
370 |         show_downloads();
371 |     }
372 |     else  {
373 | 	// Show the status
374 |         get_json('status.json', function(ret) {
375 | 	    $a.style.visibility = 'hidden';
376 |             if (ret.status == 'ERROR') {
377 |                 $preloader.style.visibility = 'hidden';
378 |                 $trans.innerHTML = '<b>' + ret.status + ': ' + ret.error + '</b>';
379 |             } else if (ret.status == 'TRANSCRIBING' || ret.status == 'ALIGNING') {
380 |                 $preloader.style.visibility = 'visible';
381 |                 render_status(ret);
382 |                 setTimeout(update, 2000);
383 |             } else if (ret.status == 'OK') {
384 |                 $preloader.style.visibility = 'hidden';
385 | 		// XXX: should we fetch the align.json?
386 | 		window.location.reload();
387 |                 show_downloads();
388 |             } else if (ret.status == 'ENCODING' || ret.status == 'STARTED') {
389 |                 $preloader.style.visibility = 'visible';
390 |                 $trans.innerHTML = 'Encoding, please wait...';
391 |                 setTimeout(update, 2000);
392 |             } else {
393 | 		console.log("unknown status", ret);
394 |                 $preloader.style.visibility = 'hidden';
395 |                 $trans.innerHTML = ret.status + '...';
396 |                 setTimeout(update, 5000);		
397 |             }
398 |         });
399 |     }
400 | }
401 | 
402 | var INLINE_JSON;
403 | 
404 | update();
405 | 
406 | </script></body></html>
407 | 


--------------------------------------------------------------------------------