├── .gitignore ├── .gitmodules ├── .travis.yml ├── COPYING ├── Dockerfile ├── Pipfile ├── Pipfile.lock ├── README.md ├── align.py ├── examples ├── data │ ├── lucier.mp3 │ └── lucier.txt └── gentle_curl.sh ├── ext ├── Makefile ├── install_kaldi.sh ├── k3.cc └── m3.cc ├── gentle ├── __init__.py ├── __version__.py ├── diff_align.py ├── forced_aligner.py ├── full_transcriber.py ├── kaldi_queue.py ├── language_model.py ├── metasentence.py ├── multipass.py ├── resample.py ├── resources.py ├── rpc.py ├── standard_kaldi.py ├── transcriber.py ├── transcription.py └── util │ ├── __init__.py │ ├── cyst.py │ └── paths.py ├── install.sh ├── install_deps.sh ├── install_models.sh ├── pylintrc ├── serve.py ├── setup.py ├── tests ├── __init__.py ├── base.py └── transcriber.py └── www ├── index.html ├── preloader.gif └── view_alignment.html /.gitignore: -------------------------------------------------------------------------------- 1 | PROTO_LANGDIR/ 2 | data/ 3 | ext/standard_kaldi 4 | ext/mkgraph 5 | ext/*-test 6 | ext/*.o 7 | ext/.depend.mk 8 | *.pyc 9 | *.swp 10 | *~ 11 | webdata/ 12 | gentle.egg-info/ 13 | exp/ 14 | ext/k3 15 | ext/m3 16 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "kaldi"] 2 | path = ext/kaldi 3 | url = https://github.com/kaldi-asr/kaldi 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | language: generic 3 | 4 | services: 5 | - docker 6 | 7 | install: 8 | - docker build -t lowerquality/gentle . 9 | 10 | script: 11 | - docker run --rm lowerquality/gentle sh -c 'cd /gentle && python3 setup.py test' 12 | 13 | after_success: 14 | - if [ "$TRAVIS_BRANCH" == "master" ]; then 15 | docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"; 16 | docker push lowerquality/gentle:latest; 17 | fi 18 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Robert M Ochshorn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | RUN DEBIAN_FRONTEND=noninteractive && \ 4 | apt-get update && \ 5 | apt-get install -y \ 6 | gcc g++ gfortran \ 7 | libc++-dev \ 8 | libstdc++-6-dev zlib1g-dev \ 9 | automake autoconf libtool \ 10 | git subversion \ 11 | libatlas3-base \ 12 | nvidia-cuda-dev \ 13 | ffmpeg \ 14 | python3 python3-dev python3-pip \ 15 | python python-dev python-pip \ 16 | wget unzip && \ 17 | apt-get clean 18 | 19 | ADD ext /gentle/ext 20 | RUN export MAKEFLAGS=' -j8' && cd /gentle/ext && \ 21 | ./install_kaldi.sh && \ 22 | make depend && make && rm -rf kaldi *.o 23 | 24 | ADD . /gentle 25 | RUN cd /gentle && python3 setup.py develop 26 | RUN cd /gentle && ./install_models.sh 27 | 28 | EXPOSE 8765 29 | 30 | VOLUME /gentle/webdata 31 | 32 | CMD cd /gentle && python3 serve.py 33 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | twisted = "*" 8 | 9 | [dev-packages] 10 | 11 | [requires] 12 | python_version = "3.10" 13 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "57dfbe0c4be1b1cd383325a23df3b92e8fb7c4bd94e927838f1c8b381fae885c" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.10" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "attrs": { 20 | "hashes": [ 21 | "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e", 22 | "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a" 23 | ], 24 | "markers": "python_version >= '3.8'", 25 | "version": "==25.1.0" 26 | }, 27 | "automat": { 28 | "hashes": [ 29 | "sha256:b34227cf63f6325b8ad2399ede780675083e439b20c323d376373d8ee6306d88", 30 | "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a" 31 | ], 32 | "markers": "python_version >= '3.8'", 33 | "version": "==24.8.1" 34 | }, 35 | "constantly": { 36 | "hashes": [ 37 | "sha256:3fd9b4d1c3dc1ec9757f3c52aef7e53ad9323dbe39f51dfd4c43853b68dfa3f9", 38 | "sha256:aa92b70a33e2ac0bb33cd745eb61776594dc48764b06c35e0efd050b7f1c7cbd" 39 | ], 40 | "markers": "python_version >= '3.8'", 41 | "version": "==23.10.4" 42 | }, 43 | "hyperlink": { 44 | "hashes": [ 45 | "sha256:427af957daa58bc909471c6c40f74c5450fa123dd093fc53efd2e91d2705a56b", 46 | "sha256:e6b14c37ecb73e89c77d78cdb4c2cc8f3fb59a885c5b3f819ff4ed80f25af1b4" 47 | ], 48 | "version": "==21.0.0" 49 | }, 50 | "idna": { 51 | "hashes": [ 52 | "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", 53 | "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3" 54 | ], 55 | "markers": "python_version >= '3.6'", 56 | "version": "==3.10" 57 | }, 58 | "incremental": { 59 | "hashes": [ 60 | "sha256:8cb2c3431530bec48ad70513931a760f446ad6c25e8333ca5d95e24b0ed7b8fe", 61 | "sha256:fb4f1d47ee60efe87d4f6f0ebb5f70b9760db2b2574c59c8e8912be4ebd464c9" 62 | ], 63 | "markers": "python_version >= '3.8'", 64 | "version": "==24.7.2" 65 | }, 66 | "setuptools": { 67 | 
"hashes": [ 68 | "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6", 69 | "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3" 70 | ], 71 | "markers": "python_version >= '3.9'", 72 | "version": "==75.8.0" 73 | }, 74 | "tomli": { 75 | "hashes": [ 76 | "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", 77 | "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", 78 | "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", 79 | "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", 80 | "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", 81 | "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", 82 | "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", 83 | "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", 84 | "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", 85 | "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", 86 | "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", 87 | "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", 88 | "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", 89 | "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", 90 | "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", 91 | "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", 92 | "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", 93 | "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", 94 | "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", 95 | "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", 96 | "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", 97 | "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", 98 | "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", 99 | "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", 100 | "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", 101 | "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", 102 | "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", 103 | "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", 104 | "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", 105 | "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", 106 | "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", 107 | "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7" 108 | ], 109 | "markers": "python_version >= '3.8'", 110 | "version": "==2.2.1" 111 | }, 112 | "twisted": { 113 | "hashes": [ 114 | "sha256:695d0556d5ec579dcc464d2856b634880ed1319f45b10d19043f2b57eb0115b5", 115 | "sha256:fe403076c71f04d5d2d789a755b687c5637ec3bcd3b2b8252d76f2ba65f54261" 116 | ], 117 | "index": "pypi", 118 | "markers": "python_full_version >= '3.8.0'", 119 | "version": "==24.11.0" 120 | }, 121 | "typing-extensions": { 122 | "hashes": [ 123 | "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", 124 | "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8" 125 | ], 126 | "markers": "python_version >= '3.8'", 127 | "version": "==4.12.2" 
128 | }, 129 | "zope-interface": { 130 | "hashes": [ 131 | "sha256:033b3923b63474800b04cba480b70f6e6243a62208071fc148354f3f89cc01b7", 132 | "sha256:05b910a5afe03256b58ab2ba6288960a2892dfeef01336dc4be6f1b9ed02ab0a", 133 | "sha256:086ee2f51eaef1e4a52bd7d3111a0404081dadae87f84c0ad4ce2649d4f708b7", 134 | "sha256:0ef9e2f865721553c6f22a9ff97da0f0216c074bd02b25cf0d3af60ea4d6931d", 135 | "sha256:1090c60116b3da3bfdd0c03406e2f14a1ff53e5771aebe33fec1edc0a350175d", 136 | "sha256:144964649eba4c5e4410bb0ee290d338e78f179cdbfd15813de1a664e7649b3b", 137 | "sha256:15398c000c094b8855d7d74f4fdc9e73aa02d4d0d5c775acdef98cdb1119768d", 138 | "sha256:1909f52a00c8c3dcab6c4fad5d13de2285a4b3c7be063b239b8dc15ddfb73bd2", 139 | "sha256:21328fcc9d5b80768bf051faa35ab98fb979080c18e6f84ab3f27ce703bce465", 140 | "sha256:224b7b0314f919e751f2bca17d15aad00ddbb1eadf1cb0190fa8175edb7ede62", 141 | "sha256:25e6a61dcb184453bb00eafa733169ab6d903e46f5c2ace4ad275386f9ab327a", 142 | "sha256:27f926f0dcb058211a3bb3e0e501c69759613b17a553788b2caeb991bed3b61d", 143 | "sha256:29caad142a2355ce7cfea48725aa8bcf0067e2b5cc63fcf5cd9f97ad12d6afb5", 144 | "sha256:2ad9913fd858274db8dd867012ebe544ef18d218f6f7d1e3c3e6d98000f14b75", 145 | "sha256:31d06db13a30303c08d61d5fb32154be51dfcbdb8438d2374ae27b4e069aac40", 146 | "sha256:3e0350b51e88658d5ad126c6a57502b19d5f559f6cb0a628e3dc90442b53dd98", 147 | "sha256:3f6771d1647b1fc543d37640b45c06b34832a943c80d1db214a37c31161a93f1", 148 | "sha256:4893395d5dd2ba655c38ceb13014fd65667740f09fa5bb01caa1e6284e48c0cd", 149 | "sha256:52e446f9955195440e787596dccd1411f543743c359eeb26e9b2c02b077b0519", 150 | "sha256:550f1c6588ecc368c9ce13c44a49b8d6b6f3ca7588873c679bd8fd88a1b557b6", 151 | "sha256:72cd1790b48c16db85d51fbbd12d20949d7339ad84fd971427cf00d990c1f137", 152 | "sha256:7bd449c306ba006c65799ea7912adbbfed071089461a19091a228998b82b1fdb", 153 | "sha256:7dc5016e0133c1a1ec212fc87a4f7e7e562054549a99c73c8896fa3a9e80cbc7", 154 | "sha256:802176a9f99bd8cc276dcd3b8512808716492f6f557c11196d42e26c01a69a4c", 155 | "sha256:80ecf2451596f19fd607bb09953f426588fc1e79e93f5968ecf3367550396b22", 156 | "sha256:8b49f1a3d1ee4cdaf5b32d2e738362c7f5e40ac8b46dd7d1a65e82a4872728fe", 157 | "sha256:8e7da17f53e25d1a3bde5da4601e026adc9e8071f9f6f936d0fe3fe84ace6d54", 158 | "sha256:a102424e28c6b47c67923a1f337ede4a4c2bba3965b01cf707978a801fc7442c", 159 | "sha256:a19a6cc9c6ce4b1e7e3d319a473cf0ee989cbbe2b39201d7c19e214d2dfb80c7", 160 | "sha256:a71a5b541078d0ebe373a81a3b7e71432c61d12e660f1d67896ca62d9628045b", 161 | "sha256:baf95683cde5bc7d0e12d8e7588a3eb754d7c4fa714548adcd96bdf90169f021", 162 | "sha256:cab15ff4832580aa440dc9790b8a6128abd0b88b7ee4dd56abacbc52f212209d", 163 | "sha256:ce290e62229964715f1011c3dbeab7a4a1e4971fd6f31324c4519464473ef9f2", 164 | "sha256:d3a8ffec2a50d8ec470143ea3d15c0c52d73df882eef92de7537e8ce13475e8a", 165 | "sha256:e204937f67b28d2dca73ca936d3039a144a081fc47a07598d44854ea2a106239", 166 | "sha256:eb23f58a446a7f09db85eda09521a498e109f137b85fb278edb2e34841055398", 167 | "sha256:f6dd02ec01f4468da0f234da9d9c8545c5412fef80bc590cc51d8dd084138a89" 168 | ], 169 | "markers": "python_version >= '3.8'", 170 | "version": "==7.2" 171 | } 172 | }, 173 | "develop": {} 174 | } 175 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gentle 2 | **Robust yet lenient forced-aligner built on Kaldi. A tool for aligning speech with text.** 3 | 4 | ## Getting Started 5 | 6 | There are three ways to install Gentle. 7 | 8 | 1. 
Download the [pre-built Mac application](https://github.com/lowerquality/gentle/releases/latest). This package includes a GUI that will start the server and a browser. It only works on Mac OS. 9 | 10 | 2. Use the [Docker](https://www.docker.com/) image. Just run ```docker run -P lowerquality/gentle```. This works on all platforms supported by Docker. 11 | 12 | 3. Download the source code and run ```./install.sh```. Then run ```python3 serve.py``` to start the server. This works on Mac and Linux. 13 | 14 | ## Using Gentle 15 | 16 | By default, the aligner listens at http://localhost:8765. That page has a graphical interface for transcribing audio, viewing results, and downloading data. 17 | 18 | There is also a REST API so you can use Gentle in your programs. Here's an example of how to use the API with CURL: 19 | 20 | ```bash 21 | curl -F "audio=@audio.mp3" -F "transcript=@words.txt" "http://localhost:8765/transcriptions?async=false" 22 | ``` 23 | 24 | If you've downloaded the source code you can also run the aligner as a command line program: 25 | 26 | ```bash 27 | git clone https://github.com/lowerquality/gentle.git 28 | cd gentle 29 | ./install.sh 30 | python3 align.py audio.mp3 words.txt 31 | ``` 32 | 33 | The default behaviour outputs the JSON to stdout. See `python3 align.py --help` for options. 34 | -------------------------------------------------------------------------------- /align.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import multiprocessing 4 | import os 5 | import sys 6 | 7 | import gentle 8 | 9 | parser = argparse.ArgumentParser( 10 | description='Align a transcript to audio by generating a new language model. Outputs JSON') 11 | parser.add_argument( 12 | '--nthreads', default=multiprocessing.cpu_count(), type=int, 13 | help='number of alignment threads') 14 | parser.add_argument( 15 | '-o', '--output', metavar='output', type=str, 16 | help='output filename') 17 | parser.add_argument( 18 | '--conservative', dest='conservative', action='store_true', 19 | help='conservative alignment') 20 | parser.set_defaults(conservative=False) 21 | parser.add_argument( 22 | '--disfluency', dest='disfluency', action='store_true', 23 | help='include disfluencies (uh, um) in alignment') 24 | parser.set_defaults(disfluency=False) 25 | parser.add_argument( 26 | '--log', default="INFO", 27 | help='the log level (DEBUG, INFO, WARNING, ERROR, or CRITICAL)') 28 | parser.add_argument( 29 | 'audiofile', type=str, 30 | help='audio file') 31 | parser.add_argument( 32 | 'txtfile', type=str, 33 | help='transcript text file') 34 | args = parser.parse_args() 35 | 36 | log_level = args.log.upper() 37 | logging.getLogger().setLevel(log_level) 38 | 39 | disfluencies = set(['uh', 'um']) 40 | 41 | def on_progress(p): 42 | for k,v in p.items(): 43 | logging.debug("%s: %s" % (k, v)) 44 | 45 | 46 | with open(args.txtfile, encoding="utf-8") as fh: 47 | transcript = fh.read() 48 | 49 | resources = gentle.Resources() 50 | logging.info("converting audio to 8K sampled wav") 51 | 52 | with gentle.resampled(args.audiofile) as wavfile: 53 | logging.info("starting alignment") 54 | aligner = gentle.ForcedAligner(resources, transcript, nthreads=args.nthreads, disfluency=args.disfluency, conservative=args.conservative, disfluencies=disfluencies) 55 | result = aligner.transcribe(wavfile, progress_cb=on_progress, logging=logging) 56 | 57 | fh = open(args.output, 'w', encoding="utf-8") if args.output else sys.stdout 58 | 
fh.write(result.to_json(indent=2)) 59 | if args.output: 60 | logging.info("output written to %s" % (args.output)) 61 |
-------------------------------------------------------------------------------- /examples/data/lucier.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strob/gentle/087a3b738bad9ebafd0570561486252cb624ba3a/examples/data/lucier.mp3
-------------------------------------------------------------------------------- /examples/data/lucier.txt: -------------------------------------------------------------------------------- 1 | I am sitting in a room different from the one you are in now. I am recording the sound of my speaking voice and I am going to play it back into the room again and again until the resonant frequencies of the room reinforce themselves so that any semblance of my speech, with perhaps the exception of rhythm, is destroyed. What you will hear, then, are the natural resonant frequencies of the room articulated by speech. I regard this activity not so much as a demonstration of a physical fact, but more as a way to smooth out any irregularities my speech might have.
-------------------------------------------------------------------------------- /examples/gentle_curl.sh: -------------------------------------------------------------------------------- 1 | curl -X POST -F 'audio=@examples/data/lucier.mp3' -F 'transcript=<examples/data/lucier.txt' 'http://localhost:8765/transcriptions?async=false'
-------------------------------------------------------------------------------- /ext/k3.cc: -------------------------------------------------------------------------------- 138 | fst::Fst<fst::StdArc> *decode_fst = ReadFstKaldi(fst_rxfilename); 139 | 140 | fst::SymbolTable *word_syms = 141 | fst::SymbolTable::ReadText(word_syms_rxfilename); 142 | 143 | fst::SymbolTable* phone_syms = 144 | fst::SymbolTable::ReadText(phone_syms_rxfilename); 145 | 146 | 147 | OnlineIvectorExtractorAdaptationState adaptation_state(feature_info.ivector_extractor_info); 148 | 149 | OnlineNnet2FeaturePipeline feature_pipeline(feature_info); 150 | feature_pipeline.SetAdaptationState(adaptation_state); 151 | 152 | OnlineSilenceWeighting silence_weighting( 153 | trans_model, 154 | feature_info.silence_weighting_config); 155 | 156 | SingleUtteranceNnet3Decoder decoder(nnet3_decoding_config, 157 | trans_model, 158 | de_nnet_simple_looped_info, 159 | //am_nnet, // kaldi::nnet3::DecodableNnetSimpleLoopedInfo 160 | *decode_fst, 161 | &feature_pipeline); 162 | 163 | 164 | char cmd[1024]; 165 | 166 | while(true) { 167 | // Let the client decide what we should do... 168 | fgets(cmd, sizeof(cmd), stdin); 169 | 170 | if(strcmp(cmd,"stop\n") == 0) { 171 | break; 172 | } 173 | else if(strcmp(cmd,"reset\n") == 0) { 174 | feature_pipeline.~OnlineNnet2FeaturePipeline(); 175 | new (&feature_pipeline) OnlineNnet2FeaturePipeline(feature_info); 176 | 177 | decoder.~SingleUtteranceNnet3Decoder(); 178 | new (&decoder) SingleUtteranceNnet3Decoder(nnet3_decoding_config, 179 | trans_model, 180 | de_nnet_simple_looped_info, 181 | //am_nnet, 182 | *decode_fst, 183 | &feature_pipeline); 184 | } 185 | else if(strcmp(cmd,"push-chunk\n") == 0) { 186 | 187 | // Get chunk length from python 188 | int chunk_len; 189 | fgets(cmd, sizeof(cmd), stdin); 190 | sscanf(cmd, "%d\n", &chunk_len); 191 | 192 | int16_t audio_chunk[chunk_len]; 193 | Vector<BaseFloat> wave_part = Vector<BaseFloat>(chunk_len); 194 | 195 | fread(&audio_chunk, 2, chunk_len, stdin); 196 | 197 | // We need to copy this into the `wave_part' Vector thing.
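// (A note on scaling: Kaldi's feature pipeline expects BaseFloat samples but keeps the raw 16-bit integer range, so the cast below deliberately does no rescaling to [-1, 1].)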
198 | // From `gst-audio-source.cc' in gst-kaldi-nnet2 199 | for (int i = 0; i < chunk_len ; ++i) { 200 | (wave_part)(i) = static_cast<BaseFloat>(audio_chunk[i]); 201 | } 202 | 203 | feature_pipeline.AcceptWaveform(arate, wave_part); 204 | 205 | std::vector<std::pair<int32, BaseFloat> > delta_weights; 206 | if (silence_weighting.Active()) { 207 | silence_weighting.ComputeCurrentTraceback(decoder.Decoder()); 208 | silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(), 209 | &delta_weights); 210 | feature_pipeline.IvectorFeature()->UpdateFrameWeights(delta_weights); 211 | } 212 | 213 | decoder.AdvanceDecoding(); 214 | 215 | fprintf(stdout, "ok\n"); 216 | } 217 | else if(strcmp(cmd, "get-final\n") == 0) { 218 | feature_pipeline.InputFinished(); // Computes last few frames of input 219 | decoder.AdvanceDecoding(); // Decodes remaining frames 220 | decoder.FinalizeDecoding(); 221 | 222 | Lattice final_lat; 223 | decoder.GetBestPath(true, &final_lat); 224 | CompactLattice clat; 225 | ConvertLattice(final_lat, &clat); 226 | 227 | // Compute prons alignment (see: kaldi/latbin/nbest-to-prons.cc) 228 | CompactLattice aligned_clat; 229 | 230 | std::vector<int32> words, times, lengths; 231 | std::vector<std::vector<int32> > prons; 232 | std::vector<std::vector<int32> > phone_lengths; 233 | 234 | WordAlignLattice(clat, trans_model, word_boundary_info, 235 | 0, &aligned_clat); 236 | 237 | CompactLatticeToWordProns(trans_model, aligned_clat, &words, &times, 238 | &lengths, &prons, &phone_lengths); 239 | 240 | for (int i = 0; i < words.size(); i++) { 241 | if(words[i] == 0) { 242 | // links - silence 243 | continue; 244 | } 245 | fprintf(stdout, "word: %s / start: %f / duration: %f\n", 246 | word_syms->Find(words[i]).c_str(), 247 | times[i] * frame_shift, 248 | lengths[i] * frame_shift); 249 | // Print out the phonemes for this word 250 | for(size_t j=0; j<prons[i].size(); j++) { 251 | fprintf(stdout, "phone: %s / duration: %f\n", 252 | phone_syms->Find(prons[i][j]).c_str(), 253 | phone_lengths[i][j] * frame_shift); 254 | } 255 | } 256 | 257 | fprintf(stdout, "done with words\n"); 258 | 259 | } 260 | else { 261 | 262 | fprintf(stderr, "unknown command %s\n", cmd); 263 | 264 | } 265 | } 266 | } 267 |
-------------------------------------------------------------------------------- /ext/m3.cc: -------------------------------------------------------------------------------- 1 | #include "fstext/context-fst.h" 2 | #include "fstext/fstext-utils.h" 3 | #include "fstext/kaldi-fst-io.h" 4 | #include "fstext/table-matcher.h" 5 | #include "hmm/hmm-utils.h" 6 | #include "hmm/transition-model.h" 7 | #include "tree/context-dep.h" 8 | #include "lat/lattice-functions-transition-model.h" 9 | #include "util/common-utils.h" 10 | #include <fstream> 11 | #include <iostream> 12 | 13 | int main(int argc, char *argv[]) { 14 | using namespace kaldi; 15 | using namespace fst; 16 | using fst::script::ArcSort; 17 | try { 18 | const char *usage = "Usage: ./mkgraph [options] <proto-dir> <grammar-fst> <out-file>\n"; 19 | 20 | ParseOptions po(usage); 21 | po.Read(argc, argv); 22 | if (po.NumArgs() != 3) { 23 | po.PrintUsage(); 24 | return 1; 25 | } 26 | 27 | int32 N = 3, P = 1; 28 | float transition_scale = 1.0; 29 | float self_loop_scale = 0.1; 30 | 31 | std::string proto_dir = po.GetArg(1), 32 | grammar_fst_filename = po.GetArg(2), 33 | out_filename = po.GetArg(3); 34 | 35 | std::string lang_fst_filename = proto_dir + "/langdir/L.fst", 36 | lang_disambig_fst_filename = proto_dir + "/langdir/L_disambig.fst", 37 | disambig_phones_filename = proto_dir + "/langdir/phones/disambig.int", 38 | model_filename = proto_dir + "/tdnn_7b_chain_online/final.mdl", 39 | tree_filename = proto_dir + "/tdnn_7b_chain_online/tree", 40 | words_filename = proto_dir + "/tdnn_7b_chain_online/graph_pp/words.txt";
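// The steps below replicate Kaldi's utils/mkgraph.sh in-process: compile the grammar G, compose it with the lexicon (L_disambig), determinize and minimize, add phonetic context (C) and the HMM transducer (H), and finally attach self-loops, yielding the HCLG decoding graph written to out_filename.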
"/tdnn_7b_chain_online/graph_pp/words.txt"; 41 | 42 | if (!std::ifstream(lang_fst_filename.c_str())) { 43 | std::cerr << "expected " << lang_fst_filename << " to exist" << std::endl; 44 | return 1; 45 | } 46 | if (!std::ifstream(lang_disambig_fst_filename.c_str())) { 47 | std::cerr << "expected " << lang_disambig_fst_filename << " to exist" << std::endl; 48 | return 1; 49 | } 50 | if (!std::ifstream(grammar_fst_filename.c_str())) { 51 | std::cerr << "expected " << grammar_fst_filename << " to exist" << std::endl; 52 | return 1; 53 | } 54 | if (!std::ifstream(disambig_phones_filename.c_str())) { 55 | std::cerr << "expected " << disambig_phones_filename << " to exist" << std::endl; 56 | return 1; 57 | } 58 | if (!std::ifstream(model_filename.c_str())) { 59 | std::cerr << "expected " << model_filename << " to exist" << std::endl; 60 | return 1; 61 | } 62 | if (!std::ifstream(tree_filename.c_str())) { 63 | std::cerr << "expected " << tree_filename << " to exist" << std::endl; 64 | return 1; 65 | } 66 | 67 | // fstcompile 68 | const SymbolTable *ssyms = 0; 69 | fst::SymbolTableTextOptions opts; 70 | const SymbolTable *isyms = SymbolTable::ReadText(words_filename, opts); 71 | if (!isyms) { return 1; } 72 | const SymbolTable *osyms = SymbolTable::ReadText(words_filename, opts); 73 | if (!osyms) { return 1; } 74 | std::ifstream grammar_fst_file(grammar_fst_filename.c_str()); 75 | FstCompiler fstcompiler(grammar_fst_file, "", isyms, 76 | osyms, ssyms, 77 | false, false, 78 | false, false, 79 | false); 80 | VectorFst grammar_fst = fstcompiler.Fst(); 81 | 82 | // fsttablecompose 83 | VectorFst *lang_disambig_fst = ReadFstKaldi(lang_disambig_fst_filename); 84 | if (lang_disambig_fst->Properties(fst::kOLabelSorted, true) == 0) { 85 | KALDI_WARN << "L_disambig.fst is not olabel sorted."; 86 | } 87 | TableComposeOptions table_opts; 88 | VectorFst lg_fst; 89 | TableCompose(*lang_disambig_fst, grammar_fst, &lg_fst, table_opts); 90 | delete lang_disambig_fst; 91 | 92 | // fstdeterminizestar --use-log 93 | ArcSort(&lg_fst, ILabelCompare()); 94 | int max_states = -1; 95 | bool debug_location = false; 96 | DeterminizeStarInLog(&lg_fst, kDelta, &debug_location, max_states); 97 | 98 | // fstminimizeencoded 99 | MinimizeEncoded(&lg_fst, kDelta); 100 | 101 | // fstarcsort --sort_type=ilabel 102 | ArcSort(&lg_fst, ILabelCompare()); 103 | 104 | // fstisstochastic 105 | StdArc::Weight min, max; 106 | if (!IsStochasticFst(lg_fst, 0.01, &min, &max)) { 107 | std::cerr << "[info]: LG not stochastic." << std::endl; 108 | } 109 | 110 | // fstcomposecontext 111 | std::vector disambig_symbols; 112 | ReadIntegerVectorSimple(disambig_phones_filename, &disambig_symbols); 113 | if (disambig_symbols.empty()) { 114 | KALDI_WARN << "Disambiguation symbols list is empty; this likely " 115 | << "indicates an error in data preparation."; 116 | } 117 | std::vector > ilabels; 118 | VectorFst clg_fst; 119 | ComposeContext(disambig_symbols, N, P, &lg_fst, &clg_fst, &ilabels); 120 | 121 | // fstarcsort --sort_type=ilabel 122 | ArcSort(&clg_fst, ILabelCompare()); 123 | 124 | // fstisstochastic 125 | if (!IsStochasticFst(clg_fst, 0.01, &min, &max)) { 126 | std::cerr << "[info]: CLG not stochastic." 
129 | // make-h-transducer 130 | HTransducerConfig hcfg; 131 | hcfg.transition_scale = transition_scale; 132 | ContextDependency ctx_dep; 133 | ReadKaldiObject(tree_filename, &ctx_dep); 134 | TransitionModel trans_model; 135 | ReadKaldiObject(model_filename, &trans_model); 136 | std::vector<int32> disambig_tid; 137 | fst::VectorFst<fst::StdArc> *ha_fst = GetHTransducer( 138 | ilabels, 139 | ctx_dep, 140 | trans_model, 141 | hcfg, 142 | &disambig_tid); 143 | 144 | // fsttablecompose 145 | VectorFst<StdArc> hclga_fst; 146 | TableComposeOptions hclga_table_opts; 147 | TableCompose(*ha_fst, clg_fst, &hclga_fst, hclga_table_opts); 148 | 149 | // fstdeterminizestar --use-log=true 150 | ArcSort(&hclga_fst, ILabelCompare<StdArc>()); 151 | DeterminizeStarInLog(&hclga_fst, kDelta, &debug_location, max_states); 152 | 153 | // fstrmsymbols 154 | RemoveSomeInputSymbols(disambig_tid, &hclga_fst); 155 | 156 | // fstrmepslocal 157 | RemoveEpsLocal(&hclga_fst); 158 | 159 | // fstminimizeencoded 160 | MinimizeEncoded(&hclga_fst, kDelta); 161 | 162 | // fstisstochastic 163 | if (!IsStochasticFst(hclga_fst, 0.01, &min, &max)) { 164 | std::cerr << "[info]: HCLGa is not stochastic." << std::endl; 165 | } 166 | 167 | VectorFst<StdArc> hclg_fst = hclga_fst; 168 | 169 | // add-self-loops 170 | std::vector<int32> null_disambig_syms; 171 | AddSelfLoops(trans_model, 172 | null_disambig_syms, 173 | self_loop_scale, 174 | true, 175 | true, 176 | &hclg_fst); 177 | 178 | // fstisstochastic 179 | if (transition_scale == 1.0 && 180 | self_loop_scale == 1.0 && 181 | !IsStochasticFst(hclg_fst, 0.01, &min, &max)) { 182 | std::cerr << "[info]: final HCLG is not stochastic." << std::endl; 183 | } 184 | 185 | if (!hclg_fst.Write(out_filename)) { 186 | KALDI_ERR << "error writing FST to " << out_filename; 187 | } 188 | } catch(const std::exception &e) { 189 | std::cerr << e.what(); 190 | return -1; 191 | } 192 | } 193 |
-------------------------------------------------------------------------------- /gentle/__init__.py: -------------------------------------------------------------------------------- 1 | from .__version__ import __version__ 2 | from .resources import Resources 3 | from .forced_aligner import ForcedAligner 4 | from .full_transcriber import FullTranscriber 5 | from .resample import resample, resampled 6 | from .transcription import Transcription 7 |
-------------------------------------------------------------------------------- /gentle/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.11.0' 2 |
-------------------------------------------------------------------------------- /gentle/diff_align.py: -------------------------------------------------------------------------------- 1 | import difflib 2 | import json 3 | import os 4 | import sys 5 | 6 | from gentle import metasentence 7 | from gentle import language_model 8 | from gentle import standard_kaldi 9 | from gentle import transcription 10 | from gentle.resources import Resources 11 | 12 | 13 | # TODO(maxhawkins): try using the (apparently-superior) time-mediated dynamic 14 | # programming algorithm used in sclite's alignment process: 15 | # http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/sclite.htm#time-mediated 16 | def align(alignment, ms, **kwargs): 17 | '''Use the diff algorithm to align the raw tokens recognized by Kaldi 18 | to the words in the transcript (tokenized by MetaSentence).
19 | 20 | The output combines information about the timing and alignment of 21 | correctly-aligned words as well as words that Kaldi failed to recognize 22 | and extra words not found in the original transcript. 23 | ''' 24 | disfluency = kwargs['disfluency'] if 'disfluency' in kwargs else False 25 | disfluencies = kwargs['disfluencies'] if 'disfluencies' in kwargs else [] 26 | 27 | hypothesis = [X.word for X in alignment] 28 | reference = ms.get_kaldi_sequence() 29 | 30 | display_seq = ms.get_display_sequence() 31 | txt_offsets = ms.get_text_offsets() 32 | 33 | out = [] 34 | for op, a, b in word_diff(hypothesis, reference): 35 | 36 | if op == 'delete': 37 | word = hypothesis[a] 38 | if disfluency and word in disfluencies: 39 | hyp_token = alignment[a] 40 | phones = hyp_token.phones or [] 41 | 42 | out.append(transcription.Word( 43 | case=transcription.Word.NOT_FOUND_IN_TRANSCRIPT, 44 | phones=phones, 45 | start=hyp_token.start, 46 | duration=hyp_token.duration, 47 | word=word)) 48 | continue 49 | 50 | display_word = display_seq[b] 51 | start_offset, end_offset = txt_offsets[b] 52 | 53 | if op == 'equal': 54 | hyp_word = hypothesis[a] 55 | hyp_token = alignment[a] 56 | phones = hyp_token.phones or [] 57 | 58 | out.append(transcription.Word( 59 | case=transcription.Word.SUCCESS, 60 | startOffset=start_offset, 61 | endOffset=end_offset, 62 | word=display_word, 63 | alignedWord=hyp_word, 64 | phones=phones, 65 | start=hyp_token.start, 66 | duration=hyp_token.duration)) 67 | 68 | elif op in ['insert', 'replace']: 69 | out.append(transcription.Word( 70 | case=transcription.Word.NOT_FOUND_IN_AUDIO, 71 | startOffset=start_offset, 72 | endOffset=end_offset, 73 | word=display_word)) 74 | return out 75 | 76 | def word_diff(a, b): 77 | '''Like difflib.SequenceMatcher but it only compares one word 78 | at a time. 
Returns an iterator whose elements are like 79 | (operation, index in a, index in b)''' 80 | matcher = difflib.SequenceMatcher(a=a, b=b) 81 | for op, a_idx, _, b_idx, _ in by_word(matcher.get_opcodes()): 82 | yield (op, a_idx, b_idx) 83 | 84 | def by_word(opcodes): 85 | '''Take difflib.SequenceMatcher.get_opcodes() output and 86 | return an equivalent opcode sequence that only modifies 87 | one word at a time''' 88 | for op, s1, e1, s2, e2 in opcodes: 89 | if op == 'delete': 90 | for i in range(s1, e1): 91 | yield (op, i, i+1, s2, s2) 92 | elif op == 'insert': 93 | for i in range(s2, e2): 94 | yield (op, s1, s1, i, i+1) 95 | else: 96 | len1 = e1-s1 97 | len2 = e2-s2 98 | for i1, i2 in zip(range(s1, e1), range(s2, e2)): 99 | yield (op, i1, i1 + 1, i2, i2 + 1) 100 | if len1 > len2: 101 | for i in range(s1 + len2, e1): 102 | yield ('delete', i, i+1, e2, e2) 103 | if len2 > len1: 104 | for i in range(s2 + len1, e2): 105 | yield ('insert', s1, s1, i, i+1) 106 | 107 | if __name__=='__main__': 108 | TEXT_FILE = sys.argv[1] 109 | JSON_FILE = sys.argv[2] 110 | OUTPUT_FILE = sys.argv[3] 111 | 112 | ms = metasentence.MetaSentence(open(TEXT_FILE).read(), Resources().vocab) 113 | alignment = json.load(open(JSON_FILE))['words'] 114 | 115 | out = align(alignment, ms) 116 | 117 | json.dump(out, open(OUTPUT_FILE, 'w'), indent=2) 118 | -------------------------------------------------------------------------------- /gentle/forced_aligner.py: -------------------------------------------------------------------------------- 1 | from gentle import diff_align 2 | from gentle import kaldi_queue 3 | from gentle import language_model 4 | from gentle import metasentence 5 | from gentle import multipass 6 | from gentle.transcriber import MultiThreadedTranscriber 7 | from gentle.transcription import Transcription 8 | 9 | class ForcedAligner(): 10 | 11 | def __init__(self, resources, transcript, nthreads=4, **kwargs): 12 | self.kwargs = kwargs 13 | self.nthreads = nthreads 14 | self.transcript = transcript 15 | self.resources = resources 16 | self.ms = metasentence.MetaSentence(transcript, resources.vocab) 17 | ks = self.ms.get_kaldi_sequence() 18 | gen_hclg_filename = language_model.make_bigram_language_model(ks, resources.proto_langdir, **kwargs) 19 | self.queue = kaldi_queue.build(resources, hclg_path=gen_hclg_filename, nthreads=nthreads) 20 | self.mtt = MultiThreadedTranscriber(self.queue, nthreads=nthreads) 21 | 22 | def transcribe(self, wavfile, progress_cb=None, logging=None): 23 | words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb) 24 | 25 | # Clear queue (would this be gc'ed?) 
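# (Each queue entry wraps a live k3 decoder subprocess; stopping them explicitly closes their pipes promptly rather than waiting for the garbage collector.)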
26 | for i in range(self.nthreads): 27 | k = self.queue.get() 28 | k.stop() 29 | 30 | # Align words 31 | words = diff_align.align(words, self.ms, **self.kwargs) 32 | 33 | # Perform a second-pass with unaligned words 34 | if logging is not None: 35 | logging.info("%d unaligned words (of %d)" % (len([X for X in words if X.not_found_in_audio()]), len(words))) 36 | 37 | if progress_cb is not None: 38 | progress_cb({'status': 'ALIGNING'}) 39 | 40 | words = multipass.realign(wavfile, words, self.ms, resources=self.resources, nthreads=self.nthreads, progress_cb=progress_cb) 41 | 42 | if logging is not None: 43 | logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in words if X.not_found_in_audio()]), len(words))) 44 | 45 | words = AdjacencyOptimizer(words, duration).optimize() 46 | 47 | return Transcription(words=words, transcript=self.transcript) 48 | 49 | 50 | class AdjacencyOptimizer(): 51 | 52 | ''' 53 | Sometimes there are ambiguous possible placements of not-found-in-audio 54 | words. The word-based diff doesn't take into account intra-word timings 55 | when it does insertion, so can create strange results. E.g. if the audio 56 | contains these words with timings like 57 | 58 | "She climbed on the bed and jumped on the mattress" 59 | 0 1 2 3 4 5 6 7 8 9 60 | 61 | and suppose the speaker mumbled or there was noise obscuring the words 62 | "on the bed and jumped", so the hypothesis is just "She climbed on the mattress". 63 | 64 | The intended alignment would be to insert the missing out-of-audio words: 65 | 66 | "She climbed [on the bed and jumped] on the mattress" 67 | 0 1 7 8 9 68 | 69 | But the word-based diff might instead align "on the" with the first 70 | occurrence, and so insert out-of-audio words like this: 71 | 72 | "She climbed on the [bed and jumped on the] mattress" 73 | 0 1 7 8 9 74 | 75 | with a big gap in between "climbed" and "on" and no time available for 76 | "[bed and jumped on the]". 77 | 78 | Or imagine a case such as "I really really really really want to do 79 | this", where only one of the "really"s is in the hypothesis, so again 80 | the word-based choice of which to align it with is arbitrary. 81 | 82 | This method cleans those up, by checking each not-found-in-audio sequence 83 | of words to see if its neighbor(s) are candidates for moving inward and 84 | whether doing so would improve adjacent intra-word distances.
85 | ''' 86 | 87 | def __init__(self, words, duration): 88 | self.words = words 89 | self.duration = duration 90 | 91 | def out_of_audio_sequence(self, i): 92 | j = i 93 | while 0 <= j < len(self.words) and self.words[j].not_found_in_audio(): 94 | j += 1 95 | return None if j == i else j 96 | 97 | def tend(self, i): 98 | for word in reversed(self.words[:i]): 99 | if word.success(): 100 | return word.end 101 | return 0 102 | 103 | def tstart(self, i): 104 | for word in self.words[i:]: 105 | if word.success(): 106 | return word.start 107 | return self.duration 108 | 109 | def find_subseq(self, i, j, p, n): 110 | for k in range(i, j-n+1): 111 | for m in range(p, p+n): 112 | if self.words[k].word != self.words[m].word: 113 | break 114 | else: 115 | return k 116 | return None 117 | 118 | def swap_adjacent_if_better(self, i, j, n, side): 119 | '''Given an out-of-audio sequence at [i,j), looks to see if the adjacent n words 120 | can be beneficially swapped with a subsequence.''' 121 | 122 | # construct adjacent candidate words and their gap relative to their 123 | # opposite neighbors 124 | if side == "left": 125 | p, q = (i-n, i) 126 | if p < 0: return False 127 | opp_gap = self.tstart(p) - self.tend(p) 128 | else: 129 | p, q = (j, j+n) 130 | if q > len(self.words): return False 131 | opp_gap = self.tstart(q) - self.tend(q) 132 | 133 | # is there a matching subsequence? 134 | k = self.find_subseq(i, j, p, n) 135 | if k is None: return False 136 | 137 | # if the opposite gap isn't bigger than the sequence gap, no benefit to 138 | # potential swap 139 | seq_gap = self.tstart(j) - self.tend(i) 140 | if opp_gap <= seq_gap: return False 141 | 142 | # swap subsequences at p and k 143 | for m in range(0, n): 144 | self.words[k+m].swap_alignment(self.words[p+m]) 145 | 146 | return True 147 | 148 | def optimize_adjacent(self, i, j): 149 | '''Given an out-of-audio sequence at [i,j), looks for an opportunity to 150 | swap a sub-sequence with adjacent words at [p, i) or [j, p)''' 151 | 152 | for n in reversed(range(1, (j-i)+1)): # consider larger moves first 153 | if self.swap_adjacent_if_better(i, j, n, "left"): return True 154 | if self.swap_adjacent_if_better(i, j, n, "right"): return True 155 | 156 | def optimize(self): 157 | i = 0 158 | while i < len(self.words): 159 | j = self.out_of_audio_sequence(i) 160 | if j is None: 161 | i += 1 162 | 163 | elif self.optimize_adjacent(i, j): 164 | # back up to rescan in case we swapped left 165 | while i >= 0 and self.words[i].not_found_in_audio(): 166 | i -= 1 167 | 168 | else: 169 | i = j # skip past this sequence 170 | 171 | return self.words 172 | -------------------------------------------------------------------------------- /gentle/full_transcriber.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from gentle import kaldi_queue 4 | from gentle import transcription 5 | from gentle.transcriber import MultiThreadedTranscriber 6 | from gentle.transcription import Transcription 7 | 8 | class FullTranscriber(): 9 | 10 | def __init__(self, resources, nthreads=2): 11 | self.available = False 12 | if nthreads <= 0: return 13 | if not os.path.exists(resources.full_hclg_path): return 14 | 15 | queue = kaldi_queue.build(resources, nthreads=nthreads) 16 | self.mtt = MultiThreadedTranscriber(queue, nthreads=nthreads) 17 | self.available = True 18 | 19 | def transcribe(self, wavfile, progress_cb=None, logging=None): 20 | words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb) 21 | return 
self.make_transcription_alignment(words) 22 | 23 | @staticmethod 24 | def make_transcription_alignment(trans): 25 | # Spoof the `diff_align` output format 26 | transcript = "" 27 | words = [] 28 | for t_wd in trans: 29 | word = transcription.Word( 30 | case=transcription.Word.SUCCESS, 31 | startOffset=len(transcript), 32 | endOffset=len(transcript) + len(t_wd.word), 33 | word=t_wd.word, 34 | alignedWord=t_wd.word, 35 | phones=t_wd.phones, 36 | start=t_wd.start, 37 | end=t_wd.end) 38 | words.append(word) 39 | 40 | transcript += word.word + " " 41 | 42 | return Transcription(words=words, transcript=transcript) 43 |
-------------------------------------------------------------------------------- /gentle/kaldi_queue.py: -------------------------------------------------------------------------------- 1 | from queue import Queue 2 | from gentle import standard_kaldi 3 | 4 | def build(resources, nthreads=4, hclg_path=None): 5 | 6 | if hclg_path is None: hclg_path = resources.full_hclg_path 7 | 8 | kaldi_queue = Queue() 9 | for i in range(nthreads): 10 | kaldi_queue.put(standard_kaldi.Kaldi( 11 | resources.nnet_gpu_path, 12 | hclg_path, 13 | resources.proto_langdir) 14 | ) 15 | return kaldi_queue 16 |
-------------------------------------------------------------------------------- /gentle/language_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import os 4 | import shutil 5 | import subprocess 6 | import sys 7 | import tempfile 8 | 9 | from .util.paths import get_binary 10 | from .metasentence import MetaSentence 11 | from .resources import Resources 12 | 13 | MKGRAPH_PATH = get_binary("ext/m3") 14 | 15 | # [oov] no longer in words.txt 16 | OOV_TERM = '<unk>' 17 | 18 | def make_bigram_lm_fst(word_sequences, **kwargs): 19 | ''' 20 | Use the given token sequence to make a bigram language model 21 | in OpenFST plain text format. 22 | 23 | When the "conservative" flag is set, an [oov] is interleaved 24 | between successive words. 25 | 26 | When the "disfluency" flag is set, a small set of disfluencies is 27 | interleaved between successive words 28 | 29 | `Word sequence` is a list of lists, each valid as a start 30 | ''' 31 | 32 | if len(word_sequences) == 0 or type(word_sequences[0]) != list: 33 | word_sequences = [word_sequences] 34 | 35 | conservative = kwargs['conservative'] if 'conservative' in kwargs else False 36 | disfluency = kwargs['disfluency'] if 'disfluency' in kwargs else False 37 | disfluencies = kwargs['disfluencies'] if 'disfluencies' in kwargs else [] 38 | 39 | bigrams = {OOV_TERM: set([OOV_TERM])} 40 | 41 | for word_sequence in word_sequences: 42 | if len(word_sequence) == 0: 43 | continue 44 | 45 | prev_word = word_sequence[0] 46 | bigrams[OOV_TERM].add(prev_word) # valid start (?) 47 | 48 | if disfluency: 49 | bigrams[OOV_TERM].update(disfluencies) 50 | 51 | for dis in disfluencies: 52 | bigrams.setdefault(dis, set()).add(prev_word) 53 | bigrams[dis].add(OOV_TERM) 54 | 55 | for word in word_sequence[1:]: 56 | bigrams.setdefault(prev_word, set()).add(word) 57 | 58 | if conservative: 59 | bigrams[prev_word].add(OOV_TERM) 60 | 61 | if disfluency: 62 | bigrams[prev_word].update(disfluencies) 63 | 64 | for dis in disfluencies: 65 | bigrams[dis].add(word) 66 | 67 | prev_word = word 68 | 69 | # ...valid end 70 | bigrams.setdefault(prev_word, set()).add(OOV_TERM) 71 |
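# The loop below serializes `bigrams` in OpenFST's plain-text format: one "src dst ilabel olabel weight" arc per line, followed by a final-state line. As an illustrative sketch (not part of the source), the input ["hello", "world"] yields roughly: # 1 1 <unk> <unk> 0.6931 # 1 2 hello hello 0.6931 # 2 3 world world 0.0000 # 3 1 <unk> <unk> 0.0000 # 3 0 # where state 1 is the <unk> start state and each word's successors share probability mass uniformly.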
72 | node_ids = {} 73 | def get_node_id(word): 74 | node_id = node_ids.get(word, len(node_ids) + 1) 75 | node_ids[word] = node_id 76 | return node_id 77 | 78 | output = "" 79 | for from_word in sorted(bigrams.keys()): 80 | from_id = get_node_id(from_word) 81 | 82 | successors = bigrams[from_word] 83 | if len(successors) > 0: 84 | weight = -math.log(1.0 / len(successors)) 85 | else: 86 | weight = 0 87 | 88 | for to_word in sorted(successors): 89 | to_id = get_node_id(to_word) 90 | output += '%d %d %s %s %f' % (from_id, to_id, to_word, to_word, weight) 91 | output += "\n" 92 | 93 | output += "%d 0\n" % (len(node_ids)) 94 | 95 | return output.encode() 96 | 97 | def make_bigram_language_model(kaldi_seq, proto_langdir, **kwargs): 98 | """Generates a language model to fit the text. 99 | 100 | Returns the filename of the generated language model FST. 101 | The caller is responsible for removing the generated file. 102 | 103 | `proto_langdir` is a path to a directory containing prototype model data 104 | `kaldi_seq` is a list of words within kaldi's vocabulary. 105 | """ 106 | 107 | # Generate a textual FST 108 | txt_fst = make_bigram_lm_fst(kaldi_seq, **kwargs) 109 | txt_fst_file = tempfile.NamedTemporaryFile(delete=False) 110 | txt_fst_file.write(txt_fst) 111 | txt_fst_file.close() 112 | 113 | hclg_filename = tempfile.mktemp(suffix='_HCLG.fst') 114 | try: 115 | devnull = open(os.devnull, 'wb') 116 | subprocess.check_output([MKGRAPH_PATH, 117 | proto_langdir, 118 | txt_fst_file.name, 119 | hclg_filename], 120 | stderr=devnull) 121 | except Exception as e: 122 | try: 123 | os.unlink(hclg_filename) 124 | except: 125 | pass 126 | raise e 127 | finally: 128 | os.unlink(txt_fst_file.name) 129 | 130 | return hclg_filename 131 | 132 | if __name__=='__main__': 133 | import sys 134 | make_bigram_language_model(open(sys.argv[1]).read(), Resources().proto_langdir) 135 |
-------------------------------------------------------------------------------- /gentle/metasentence.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import re 3 | 4 | # [oov] no longer in words.txt 5 | OOV_TERM = '<unk>' 6 | 7 | def load_vocabulary(words_file): 8 | '''Load vocabulary words from an OpenFST SymbolTable formatted text file''' 9 | return set(x.split(' ')[0] for x in words_file if x != '') 10 | 11 | def kaldi_normalize(word, vocab): 12 | """ 13 | Take a token extracted from a transcript by MetaSentence and 14 | transform it to use the same format as Kaldi's vocabulary files. 15 | Removes fancy punctuation and strips out-of-vocabulary words.
16 | """ 17 | # lowercase 18 | norm = word.lower() 19 | # Turn fancy apostrophes into simpler apostrophes 20 | norm = norm.replace("’", "'") 21 | if len(norm) > 0 and not norm in vocab: 22 | norm = OOV_TERM 23 | return norm 24 | 25 | class MetaSentence: 26 | """Maintain two parallel representations of a sentence: one for 27 | Kaldi's benefit, and the other in human-legible form. 28 | """ 29 | 30 | def __init__(self, sentence, vocab): 31 | self.raw_sentence = sentence 32 | 33 | if type(sentence) == bytes: 34 | self.raw_sentence = sentence.decode('utf-8') 35 | self.vocab = vocab 36 | 37 | self._tokenize() 38 | 39 | def _tokenize(self): 40 | self._seq = [] 41 | for m in re.finditer(r'(\w|\’\w|\'\w)+', self.raw_sentence, re.UNICODE): 42 | start, end = m.span() 43 | word = m.group() 44 | token = kaldi_normalize(word, self.vocab) 45 | self._seq.append({ 46 | "start": start, # as unicode codepoint offset 47 | "end": end, # as unicode codepoint offset 48 | "token": token, 49 | }) 50 | 51 | def get_kaldi_sequence(self): 52 | return [x["token"] for x in self._seq] 53 | 54 | def get_display_sequence(self): 55 | display_sequence = [] 56 | for x in self._seq: 57 | start, end = x["start"], x["end"] 58 | word = self.raw_sentence[start:end] 59 | display_sequence.append(word) 60 | return display_sequence 61 | 62 | def get_text_offsets(self): 63 | return [(x["start"], x["end"]) for x in self._seq] 64 | -------------------------------------------------------------------------------- /gentle/multipass.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from multiprocessing.pool import ThreadPool as Pool 3 | import os 4 | import wave 5 | 6 | from gentle import standard_kaldi 7 | from gentle import metasentence 8 | from gentle import language_model 9 | from gentle import diff_align 10 | from gentle import transcription 11 | 12 | def prepare_multipass(alignment): 13 | to_realign = [] 14 | last_aligned_word = None 15 | cur_unaligned_words = [] 16 | 17 | for wd_idx,wd in enumerate(alignment): 18 | if wd.not_found_in_audio(): 19 | cur_unaligned_words.append(wd) 20 | elif wd.success(): 21 | if len(cur_unaligned_words) > 0: 22 | to_realign.append({ 23 | "start": last_aligned_word, 24 | "end": wd, 25 | "words": cur_unaligned_words}) 26 | cur_unaligned_words = [] 27 | 28 | last_aligned_word = wd 29 | 30 | if len(cur_unaligned_words) > 0: 31 | to_realign.append({ 32 | "start": last_aligned_word, 33 | "end": None, 34 | "words": cur_unaligned_words}) 35 | 36 | return to_realign 37 | 38 | def realign(wavfile, alignment, ms, resources, nthreads=4, progress_cb=None): 39 | to_realign = prepare_multipass(alignment) 40 | realignments = [] 41 | 42 | def realign(chunk): 43 | wav_obj = wave.open(wavfile, 'rb') 44 | 45 | if chunk["start"] is None: 46 | start_t = 0 47 | else: 48 | start_t = chunk["start"].end 49 | 50 | if chunk["end"] is None: 51 | end_t = wav_obj.getnframes() / float(wav_obj.getframerate()) 52 | else: 53 | end_t = chunk["end"].start 54 | 55 | duration = end_t - start_t 56 | # XXX: the minimum length seems bigger now (?) 
57 | if duration < 0.75 or duration > 60: 58 | logging.debug("cannot realign %d words with duration %f" % (len(chunk['words']), duration)) 59 | return 60 | 61 | # Create a language model 62 | offset_offset = chunk['words'][0].startOffset 63 | chunk_len = chunk['words'][-1].endOffset - offset_offset 64 | chunk_transcript = ms.raw_sentence[offset_offset:offset_offset+chunk_len].encode("utf-8") 65 | chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab) 66 | chunk_ks = chunk_ms.get_kaldi_sequence() 67 | 68 | chunk_gen_hclg_filename = language_model.make_bigram_language_model(chunk_ks, resources.proto_langdir) 69 | k = standard_kaldi.Kaldi( 70 | resources.nnet_gpu_path, 71 | chunk_gen_hclg_filename, 72 | resources.proto_langdir) 73 | 74 | wav_obj = wave.open(wavfile, 'rb') 75 | wav_obj.setpos(int(start_t * wav_obj.getframerate())) 76 | buf = wav_obj.readframes(int(duration * wav_obj.getframerate())) 77 | 78 | k.push_chunk(buf) 79 | ret = [transcription.Word(**wd) for wd in k.get_final()] 80 | k.stop() 81 | 82 | word_alignment = diff_align.align(ret, chunk_ms) 83 | 84 | for wd in word_alignment: 85 | wd.shift(time=start_t, offset=offset_offset) 86 | 87 | # "chunk" should be replaced by "words" 88 | realignments.append({"chunk": chunk, "words": word_alignment}) 89 | 90 | if progress_cb is not None: 91 | progress_cb({"percent": len(realignments) / float(len(to_realign))}) 92 | 93 | pool = Pool(nthreads) 94 | pool.map(realign, to_realign) 95 | pool.close() 96 | 97 | # Sub in the replacements 98 | o_words = alignment 99 | for ret in realignments: 100 | st_idx = o_words.index(ret["chunk"]["words"][0]) 101 | end_idx= o_words.index(ret["chunk"]["words"][-1])+1 102 | #logging.debug('splice in: "%s' % (str(ret["words"]))) 103 | #logging.debug('splice out: "%s' % (str(o_words[st_idx:end_idx]))) 104 | o_words = o_words[:st_idx] + ret["words"] + o_words[end_idx:] 105 | 106 | return o_words 107 | -------------------------------------------------------------------------------- /gentle/resample.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | import tempfile 5 | 6 | from contextlib import contextmanager 7 | 8 | 9 | from .util.paths import get_binary 10 | 11 | FFMPEG = get_binary("ffmpeg") 12 | SOX = get_binary("sox") 13 | 14 | def resample_ffmpeg(infile, outfile, offset=None, duration=None): 15 | ''' 16 | Use FFMPEG to convert a media file to a wav file sampled at 8K 17 | ''' 18 | if offset is None: 19 | offset = [] 20 | else: 21 | offset = ['-ss', str(offset)] 22 | if duration is None: 23 | duration = [] 24 | else: 25 | duration = ['-t', str(duration)] 26 | 27 | cmd = [ 28 | FFMPEG, 29 | '-loglevel', 'panic', 30 | '-y', 31 | ] + offset + [ 32 | '-i', infile, 33 | ] + duration + [ 34 | '-ac', '1', '-ar', '8000', 35 | '-acodec', 'pcm_s16le', 36 | outfile 37 | ] 38 | return subprocess.call(cmd) 39 | 40 | def resample_sox(infile, outfile, offset=None, duration=None): 41 | ''' 42 | Use SoX to convert a media file to a wav file sampled at 8K 43 | ''' 44 | if offset is None and duration is None: 45 | trim = [] 46 | else: 47 | if offset is None: 48 | offset = 0 49 | trim = ['trim', str(offset)] 50 | if duration is not None: 51 | trim += [str(duration)] 52 | 53 | cmd = [ 54 | SOX, 55 | '--clobber', 56 | '-q', 57 | '-V1', 58 | infile, 59 | '-b', '16', 60 | '-c', '1', 61 | '-e', 'signed-integer', 62 | '-r', '8000', 63 | '-L', 64 | outfile 65 | ] + trim 66 | return subprocess.call(cmd) 67 | 68 | def 
resample(infile, outfile, offset=None, duration=None): 69 | if not os.path.isfile(infile): 70 | raise IOError("Not a file: %s" % infile) 71 | if shutil.which(FFMPEG) or os.path.exists(FFMPEG): 72 | return resample_ffmpeg(infile, outfile, offset, duration) 73 | else: 74 | return resample_sox(infile, outfile, offset, duration) 75 | 76 | @contextmanager 77 | def resampled(infile, offset=None, duration=None): 78 | with tempfile.NamedTemporaryFile(suffix='.wav') as fp: 79 | if resample(infile, fp.name, offset, duration) != 0: 80 | raise RuntimeError("Unable to resample/encode '%s'" % infile) 81 | yield fp.name 82 |
-------------------------------------------------------------------------------- /gentle/resources.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from .util.paths import get_resource, ENV_VAR 5 | from . import metasentence 6 | 7 | class Resources(): 8 | 9 | def __init__(self): 10 | self.proto_langdir = get_resource('exp') 11 | self.nnet_gpu_path = get_resource('exp/tdnn_7b_chain_online/') 12 | self.full_hclg_path = get_resource('exp/tdnn_7b_chain_online/graph_pp/HCLG.fst') 13 | 14 | def require_dir(path): 15 | if not os.path.isdir(path): 16 | raise RuntimeError("No resource directory %s. Check %s environment variable?" % (path, ENV_VAR)) 17 | 18 | 19 | require_dir(self.proto_langdir) 20 | require_dir(self.nnet_gpu_path) 21 | 22 | with open(os.path.join(self.proto_langdir, "langdir", "words.txt")) as fh: 23 | self.vocab = metasentence.load_vocabulary(fh) 24 | 25 | 26 |
-------------------------------------------------------------------------------- /gentle/rpc.py: -------------------------------------------------------------------------------- 1 | class RPCProtocol(object): 2 | '''RPCProtocol is the wire protocol we use to communicate with the 3 | standard_kaldi subprocess. It's a mixed text/binary protocol 4 | because we need to send binary audio chunks, but text is simpler.''' 5 | 6 | def __init__(self, send_pipe, recv_pipe): 7 | '''Initializes the RPCProtocol and reads from recv_pipe until the startup 8 | message is received.''' 9 | self.send_pipe = send_pipe 10 | self.recv_pipe = recv_pipe 11 | 12 | # don't wait for startup 13 | # body, _ = self._read_reply() 14 | # if body != 'loaded': 15 | # raise RuntimeError('unexpected message from standard_kaldi on load') 16 | 17 | def do(self, method, *args, **kwargs): 18 | '''Performs the method requested and returns the response body. 19 | The body keyword argument can be used to provide a binary request 20 | body. Throws an RPCError when the RPC returns an error.''' 21 | body = kwargs.get('body', None) 22 | self._write_request(method, args, body) 23 | return self._read_reply() 24 | 25 | def _write_request(self, method, args, body): 26 | '''Writes a request to the stream. 27 | Request format: 28 | MSG_SIZE\n 29 | METHOD <ARG1> <ARG2> ... <ARGN>\n 30 | BODY\n 31 | ''' 32 | data = method 33 | for arg in args: 34 | data += ' ' + arg 35 | data += '\n' 36 | if body: 37 | data += body 38 | 39 | try: 40 | self.send_pipe.write('%d\n' % len(data)) 41 | self.send_pipe.write(data) 42 | self.send_pipe.write('\n') 43 | except IOError as _: 44 | raise IOError("Lost connection with standard_kaldi subprocess") 45 | 46 | def _read_reply(self): 47 | '''Reads a reply from the stream.
48 | Reply format:
49 | MSG_SIZE\n
50 | STATUS\n
51 | BODY\n
52 | '''
53 | try:
54 | msg_size = int(self.recv_pipe.readline())
55 | data = self.recv_pipe.read(msg_size)
56 | self.recv_pipe.read(1) # trailing newline
57 |
58 | status_str, body = data.split('\n', 1)
59 | status = int(status_str)
60 | except IOError:
61 | raise IOError("Lost connection with standard_kaldi subprocess")
62 |
63 | if status < 200 or status >= 300:
64 | raise RPCError(status, body)
65 |
66 | return body, status
67 |
68 | class RPCError(Exception):
69 | '''Error thrown when standard_kaldi returns an error (in-band)'''
70 | def __init__(self, status, why):
71 | self.status = status
72 | self.why = why
73 | def __str__(self):
74 | return 'standard_kaldi: error %d: %s' % (self.status, self.why)
75 |
--------------------------------------------------------------------------------
/gentle/standard_kaldi.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import os
3 | import logging
4 |
5 | from .util.paths import get_binary
6 |
7 | EXECUTABLE_PATH = get_binary("ext/k3")
8 | logger = logging.getLogger(__name__)
9 |
10 | STDERR = subprocess.DEVNULL
11 |
12 | class Kaldi:
13 | def __init__(self, nnet_dir=None, hclg_path=None, proto_langdir=None):
14 | cmd = [EXECUTABLE_PATH]
15 |
16 | if nnet_dir is not None:
17 | cmd.append(nnet_dir)
18 | cmd.append(hclg_path)
19 |
20 | if hclg_path is not None and not os.path.exists(hclg_path):
21 | logger.error('hclg_path does not exist: %s', hclg_path)
22 | self._p = subprocess.Popen(cmd,
23 | stdin=subprocess.PIPE, stdout=subprocess.PIPE,
24 | stderr=STDERR, bufsize=0)
25 | self.finished = False
26 |
27 | def _cmd(self, c):
28 | self._p.stdin.write(("%s\n" % (c)).encode())
29 | self._p.stdin.flush()
30 |
31 | def push_chunk(self, buf):
32 | # Send the raw 16-bit samples, prefixed by the sample count
33 | self._cmd("push-chunk")
34 |
35 | cnt = int(len(buf)/2)
36 | self._cmd(str(cnt))
37 | self._p.stdin.write(buf)
38 | status = self._p.stdout.readline().strip().decode()
39 | return status == 'ok'
40 |
41 | def get_final(self):
42 | self._cmd("get-final")
43 | words = []
44 | while True:
45 | line = self._p.stdout.readline().decode()
46 | if line.startswith("done"):
47 | break
48 | parts = line.split(' / ')
49 | if line.startswith('word'):
50 | wd = {}
51 | wd['word'] = parts[0].split(': ')[1]
52 | wd['start'] = float(parts[1].split(': ')[1])
53 | wd['duration'] = float(parts[2].split(': ')[1])
54 | wd['phones'] = []
55 | words.append(wd)
56 | elif line.startswith('phone'):
57 | ph = {}
58 | ph['phone'] = parts[0].split(': ')[1]
59 | ph['duration'] = float(parts[1].split(': ')[1])
60 | words[-1]['phones'].append(ph)
61 |
62 | self._reset()
63 | return words
64 |
65 | def _reset(self):
66 | self._cmd("reset")
67 |
68 | def stop(self):
69 | if not self.finished:
70 | self.finished = True
71 | self._cmd("stop")
72 | self._p.stdin.close()
73 | self._p.stdout.close()
74 | self._p.wait()
75 |
76 | def __del__(self):
77 | self.stop()
78 |
79 | if __name__=='__main__':
80 | import numm3
81 | import sys
82 |
83 | infile = sys.argv[1]
84 |
85 | k = Kaldi()
86 |
87 | buf = numm3.sound2np(infile, nchannels=1, R=8000)
88 | print('loaded_buf', len(buf))
89 |
90 | idx = 0
91 | while idx < len(buf):
92 | k.push_chunk(buf[idx:idx+160000].tobytes())
93 | print(k.get_final())
94 | idx += 160000
95 |
--------------------------------------------------------------------------------
/gentle/transcriber.py:
--------------------------------------------------------------------------------
1 | import math
2 | import logging
3 | import wave
4 |
5 | from gentle import transcription
6 |
7 | from multiprocessing.pool import ThreadPool as Pool
8 |
9 | class MultiThreadedTranscriber:
10 | def __init__(self, kaldi_queue, chunk_len=20, overlap_t=2, nthreads=4):
11 | self.chunk_len = chunk_len
12 | self.overlap_t = overlap_t
13 | self.nthreads = nthreads
14 |
15 | self.kaldi_queue = kaldi_queue
16 |
17 | def transcribe(self, wavfile, progress_cb=None):
18 | wav_obj = wave.open(wavfile, 'rb')
19 | duration = wav_obj.getnframes() / float(wav_obj.getframerate())
20 | n_chunks = int(math.ceil(duration / float(self.chunk_len - self.overlap_t)))
21 |
22 | chunks = []
23 |
24 |
25 | def transcribe_chunk(idx):
26 | wav_obj = wave.open(wavfile, 'rb')
27 | start_t = idx * (self.chunk_len - self.overlap_t)
28 | # Seek
29 | wav_obj.setpos(int(start_t * wav_obj.getframerate()))
30 | # Read frames
31 | buf = wav_obj.readframes(int(self.chunk_len * wav_obj.getframerate()))
32 |
33 | if len(buf) < 4000:
34 | logging.info('Short segment - ignored %d' % (idx))
35 | ret = []
36 | else:
37 | k = self.kaldi_queue.get()
38 | k.push_chunk(buf)
39 | ret = k.get_final()
40 | # get_final() leaves the decoder reset, so the worker can go straight back on the queue
41 | self.kaldi_queue.put(k)
42 |
43 | chunks.append({"start": start_t, "words": ret})
44 | logging.info('%d/%d' % (len(chunks), n_chunks))
45 | if progress_cb is not None:
46 | progress_cb({"message": ' '.join([X['word'] for X in ret]),
47 | "percent": len(chunks) / float(n_chunks)})
48 |
49 |
50 | pool = Pool(min(n_chunks, self.nthreads))
51 | pool.map(transcribe_chunk, range(n_chunks))
52 | pool.close()
53 |
54 | chunks.sort(key=lambda x: x['start'])
55 |
56 | # Combine chunks
57 | words = []
58 | for c in chunks:
59 | chunk_start = c['start']
60 | chunk_end = chunk_start + self.chunk_len
61 |
62 | chunk_words = [transcription.Word(**wd).shift(time=chunk_start) for wd in c['words']]
63 |
64 | # At chunk boundary cut points the audio often contains part of a
65 | # word, which can get erroneously identified as one or more different
66 | # in-vocabulary words. So discard one or more words near the cut points
67 | # (they'll be covered by the overlap anyway).
68 | #
69 | trim = min(0.25 * self.overlap_t, 0.5)
70 | if c is not chunks[0]:
71 | while len(chunk_words) > 1:
72 | chunk_words.pop(0)
73 | if chunk_words[0].end > chunk_start + trim:
74 | break
75 | if c is not chunks[-1]:
76 | while len(chunk_words) > 1:
77 | chunk_words.pop()
78 | if chunk_words[-1].start < chunk_end - trim:
79 | break
80 |
81 | words.extend(chunk_words)
82 |
83 | # Remove overlap: Sort by time, then filter out any Word entries in
84 | # the list that are adjacent to another entry corresponding to the same
85 | # word in the audio.
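# (Worked example with the defaults chunk_len=20 and overlap_t=2: chunks start
# at t = 0, 18, 36, ..., so each cut point is covered by two seconds of doubly
# transcribed audio. trim = min(0.25 * 2, 0.5) = 0.5, so words ending within
# half a second of a chunk's start, or starting within half a second of its
# end, are discarded above; any word still transcribed twice is then collapsed
# by the corresponds() filter below.)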
86 | words.sort(key=lambda word: word.start)
87 | words.append(transcription.Word(word="__dummy__"))
88 | words = [words[i] for i in range(len(words)-1) if not words[i].corresponds(words[i+1])]
89 |
90 | return words, duration
91 |
92 |
93 | if __name__=='__main__':
94 | # full transcription
95 | import json
96 | import sys
97 |
98 | import logging
99 | logging.getLogger().setLevel('INFO')
100 |
101 | import gentle
102 | from gentle import standard_kaldi
103 | from gentle import kaldi_queue
104 |
105 | resources = gentle.Resources()
106 |
107 | k_queue = kaldi_queue.build(resources, 3)
108 | trans = MultiThreadedTranscriber(k_queue)
109 |
110 | with gentle.resampled(sys.argv[1]) as filename:
111 | words, duration = trans.transcribe(filename)
112 |
113 | with open(sys.argv[2], 'w') as out:
114 | out.write(transcription.Transcription(words=words).to_json())
115 |
--------------------------------------------------------------------------------
/gentle/transcription.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import io
3 | import json
4 |
5 | from collections import defaultdict
6 |
7 | class Word:
8 |
9 | SUCCESS = 'success'
10 | NOT_FOUND_IN_AUDIO = 'not-found-in-audio'
11 | NOT_FOUND_IN_TRANSCRIPT = 'not-found-in-transcript'
12 |
13 | def __init__(self, case=None, startOffset=None, endOffset=None, word=None, alignedWord=None, phones=None, start=None, end=None, duration=None):
14 | self.case = case
15 | self.startOffset = startOffset
16 | self.endOffset = endOffset
17 | self.word = word
18 | self.alignedWord = alignedWord
19 | self.phones = phones
20 | self.start = start
21 | self.duration = duration
22 | self.end = end
23 | if start is not None:
24 | if end is None:
25 | self.end = start + duration
26 | elif duration is None:
27 | self.duration = end - start
28 |
29 | def success(self):
30 | return self.case == Word.SUCCESS
31 |
32 | def not_found_in_audio(self):
33 | return self.case == Word.NOT_FOUND_IN_AUDIO
34 |
35 | def as_dict(self, without=None):
36 | return {key: val for key, val in self.__dict__.items() if (val is not None) and (key != without)}
37 |
38 | def __eq__(self, other):
39 | return self.__dict__ == other.__dict__
40 |
41 | def __ne__(self, other):
42 | return not self == other
43 |
44 | def __repr__(self):
45 | return "Word(" + " ".join(sorted([key + "=" + str(val) for key, val in self.as_dict(without="phones").items()])) + ")"
46 |
47 | def shift(self, time=None, offset=None):
48 | if self.start is not None and time is not None:
49 | self.start += time
50 | self.end += time
51 |
52 | if self.startOffset is not None and offset is not None:
53 | self.startOffset += offset
54 | self.endOffset += offset
55 |
56 | return self # for easy chaining
57 |
58 | def swap_alignment(self, other):
59 | '''Swaps the alignment info of two words, but does not swap the offset'''
60 | self.case, other.case = other.case, self.case
61 | self.alignedWord, other.alignedWord = other.alignedWord, self.alignedWord
62 | self.phones, other.phones = other.phones, self.phones
63 | self.start, other.start = other.start, self.start
64 | self.end, other.end = other.end, self.end
65 | self.duration, other.duration = other.duration, self.duration
66 |
67 | def corresponds(self, other):
68 | '''Returns true if self and other refer to the same word, at the same position in the audio (within a small tolerance)'''
69 | if self.word != other.word: return False
70 | return abs(self.start - other.start) / (self.duration + other.duration) < 0.1
71 |
72 | class Transcription:
73 |
74 | def __init__(self, transcript=None, words=None):
75 | self.transcript = transcript
76 | self.words = words
77 |
78 | def __eq__(self, other):
79 | return self.transcript == other.transcript and self.words == other.words
80 |
81 | def to_json(self, **kwargs):
82 | '''Return a JSON representation of the aligned transcript'''
83 | options = {
84 | 'sort_keys': True,
85 | 'indent': 4,
86 | 'separators': (',', ': '),
87 | }
88 | options.update(kwargs)
89 |
90 | container = {}
91 | if self.transcript:
92 | container['transcript'] = self.transcript
93 | if self.words:
94 | container['words'] = [word.as_dict(without="duration") for word in self.words]
95 | return json.dumps(container, **options)
96 |
97 | @classmethod
98 | def from_json(cls, json_str):
99 | return cls._from_jsondata(json.loads(json_str))
100 |
101 | @classmethod
102 | def from_jsonfile(cls, filename):
103 | with open(filename) as fh:
104 | return cls._from_jsondata(json.load(fh))
105 |
106 | @classmethod
107 | def _from_jsondata(cls, data):
108 | return cls(transcript=data.get('transcript'), words=[Word(**wd) for wd in data.get('words', [])])
109 |
110 | def to_csv(self):
111 | '''Return a CSV representation of the aligned transcript. Format:
112 | <word>, <alignedWord>, <start seconds>, <end seconds>
113 | '''
114 | if not self.words:
115 | return ''
116 | buf = io.StringIO()
117 | w = csv.writer(buf)
118 | for X in self.words:
119 | if X.case not in (Word.SUCCESS, Word.NOT_FOUND_IN_AUDIO):
120 | continue
121 | row = [X.word,
122 | X.alignedWord,
123 | X.start,
124 | X.end
125 | ]
126 | w.writerow(row)
127 | return buf.getvalue()
128 |
129 | def stats(self):
130 | counts = defaultdict(int)
131 | for word in self.words:
132 | counts[word.case] += 1
133 | stats = {}
134 | stats['total'] = len(self.words)
135 | for key, val in counts.items():
136 | stats[key] = val
137 | return stats
138 |
139 | Transcription.Word = Word
--------------------------------------------------------------------------------
/gentle/util/__init__.py:
--------------------------------------------------------------------------------
1 | # nothing here right now
2 |
--------------------------------------------------------------------------------
/gentle/util/cyst.py:
--------------------------------------------------------------------------------
1 | # Twisted lazy computations
2 | # (from rmo-sketchbook/cyst/cyst.py)
3 |
4 | import mimetypes
5 | import os
6 |
7 | from twisted.web.static import File
8 | from twisted.web.resource import Resource
9 | from twisted.web.server import Site, NOT_DONE_YET
10 | from twisted.internet import reactor
11 |
12 | class Insist(Resource):
13 | isLeaf = True
14 |
15 | def __init__(self, cacheloc):
16 | self.cacheloc = cacheloc
17 | self.cachefile = None
18 | if os.path.exists(cacheloc):
19 | self.cachefile = File(cacheloc)
20 | self.reqs_waiting = []
21 | self.started = False
22 | Resource.__init__(self)
23 |
24 | def render_GET(self, req):
25 | # Check if someone else has created the file somehow
26 | if self.cachefile is None and os.path.exists(self.cacheloc):
27 | self.cachefile = File(self.cacheloc)
28 | # Check if someone else has *deleted* the file
29 | elif self.cachefile is not None and not os.path.exists(self.cacheloc):
30 | self.cachefile = None
31 |
32 | if self.cachefile is not None:
33 | return self.cachefile.render_GET(req)
34 | else:
35 | self.reqs_waiting.append(req)
36 | req.notifyFinish().addErrback(
37 | self._nevermind, req)
38 | if not self.started:
39 | self.started = True
40 | reactor.callInThread(self.desist)
41 | return NOT_DONE_YET
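# (Lifecycle of an Insist resource: the first GET schedules
# serialize_computation() on a reactor thread via desist(); requests arriving
# before the cache file exists queue up in reqs_waiting, and resist() then
# replays the finished file to all of them. Once the file exists, later GETs
# are served directly from the cached File resource above.)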
42 |
43 | def _nevermind(self, _err, req):
44 | self.reqs_waiting.remove(req)
45 |
46 | def desist(self):
47 | self.serialize_computation(self.cacheloc)
48 | reactor.callFromThread(self.resist)
49 |
50 | def _get_mime(self):
51 | return mimetypes.guess_type(self.cacheloc)[0]
52 |
53 | def resist(self):
54 | if not os.path.exists(self.cacheloc):
55 | # Error!
56 | print("%s does not exist - rendering fail!" % (self.cacheloc))
57 | for req in self.reqs_waiting:
58 | req.setHeader(b"Content-Type", b"text/plain")
59 | req.write(b"cyst error")
60 | req.finish()
61 | return
62 |
63 | self.cachefile = File(self.cacheloc)
64 |
65 | # Send content to all interested parties
66 | for req in self.reqs_waiting:
67 | self.cachefile.render(req)
68 |
69 | def serialize_computation(self, outpath):
70 | raise NotImplementedError
71 |
72 | class HelloCyst(Insist):
73 | def serialize_computation(self, outpath):
74 | import time
75 | time.sleep(10)
76 | with open(outpath, "w") as fh: fh.write("Hello, World")
77 |
78 | if __name__=='__main__':
79 | import sys
80 | c = HelloCyst(sys.argv[1])
81 | site = Site(c)
82 | port = 7984
83 | reactor.listenTCP(port, site)
84 | print("http://localhost:%d" % (port))
85 | reactor.run()
86 |
--------------------------------------------------------------------------------
/gentle/util/paths.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import logging
4 | import shutil
5 | import sys
6 |
7 | ENV_VAR = 'GENTLE_RESOURCES_ROOT'
8 |
9 | class SourceResolver:
10 | def __init__(self):
11 | self.project_root = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir))
12 |
13 | def get_binary(self, name):
14 | path_in_project = os.path.join(self.project_root, name)
15 | if os.path.exists(path_in_project):
16 | return path_in_project
17 | else:
18 | return name
19 |
20 | def get_resource(self, name):
21 | root = os.environ.get(ENV_VAR) or self.project_root
22 | return os.path.join(root, name)
23 |
24 | def get_datadir(self, name):
25 | return self.get_resource(name)
26 |
27 | class PyinstallResolver:
28 | def __init__(self):
29 | self.root = os.path.abspath(os.path.join(getattr(sys, '_MEIPASS', ''), os.pardir, 'Resources'))
30 |
31 | def get_binary(self, name):
32 | return os.path.join(self.root, name)
33 |
34 | def get_resource(self, name):
35 | rpath = os.path.join(self.root, name)
36 | if os.path.exists(rpath):
37 | return rpath
38 | else:
39 | return self.get_datadir(name) # DMG may be read-only; fall back to datadir (i.e. so language models can be added)
40 |
41 | def get_datadir(self, path):
42 | return os.path.join(os.environ['HOME'], '.gentle', path)
43 |
44 | RESOLVER = PyinstallResolver() if hasattr(sys, "frozen") else SourceResolver()
45 |
46 |
47 | def get_binary(name):
48 | return RESOLVER.get_binary(name)
49 |
50 | def get_resource(path):
51 | return RESOLVER.get_resource(path)
52 |
53 | def get_datadir(path):
54 | return RESOLVER.get_datadir(path)
55 |
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | git submodule init
6 | git submodule update
7 |
8 | ./install_deps.sh
9 | (cd ext && ./install_kaldi.sh)
10 | ./install_models.sh
11 | cd ext && make depend && make
12 |
--------------------------------------------------------------------------------
/install_deps.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | echo "Installing dependencies..."
6 |
7 | # Install OS-specific dependencies
8 | if [[ "$OSTYPE" == "linux-gnu" ]]; then
9 | apt-get update -qq
10 | apt-get install -y zlib1g-dev automake autoconf git \
11 | libtool subversion libatlas3-base python3-pip \
12 | python3-dev wget unzip python3
13 | apt-get install -y ffmpeg || printf "\n\nYou have to install ffmpeg from a PPA or from https://ffmpeg.org before you can run gentle\n\n"
14 | python3 setup.py develop
15 | elif [[ "$OSTYPE" == "darwin"* ]]; then
16 | brew install ffmpeg libtool automake autoconf wget python3
17 | sudo python3 setup.py develop
18 | fi
19 |
--------------------------------------------------------------------------------
/install_models.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | VERSION="0.04"
6 |
7 | download_models() {
8 | local version="$1"
9 | local filename="kaldi-models-$version.zip"
10 | local url="https://rmozone.com/gentle/$filename"
11 | wget -O "$filename" "$url"
12 | unzip "$filename"
13 | rm "$filename"
14 | }
15 |
16 | echo "Downloading models for v$VERSION..." 1>&2
17 | download_models "$VERSION"
18 |
--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | [MESSAGES CONTROL]
2 | disable=locally-disabled
3 |
--------------------------------------------------------------------------------
/serve.py:
--------------------------------------------------------------------------------
1 | from twisted.web.static import File
2 | from twisted.web.resource import Resource
3 | from twisted.web.server import Site, NOT_DONE_YET
4 | from twisted.internet import reactor, threads
5 | from twisted.web._responses import FOUND
6 |
7 | import json
8 | import logging
9 | import multiprocessing
10 | import os
11 | import shutil
12 | import uuid
13 | import wave
14 |
15 | from gentle.util.paths import get_resource, get_datadir
16 | from gentle.util.cyst import Insist
17 |
18 | import gentle
19 |
20 | class TranscriptionStatus(Resource):
21 | def __init__(self, status_dict):
22 | self.status_dict = status_dict
23 | Resource.__init__(self)
24 |
25 | def render_GET(self, req):
26 | req.setHeader(b"Content-Type", b"application/json")
27 | return json.dumps(self.status_dict).encode()
28 |
29 | class Transcriber():
30 | def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
31 | self.data_dir = data_dir
32 | self.nthreads = nthreads
33 | self.ntranscriptionthreads = ntranscriptionthreads
34 | self.resources = gentle.Resources()
35 |
36 | self.full_transcriber = gentle.FullTranscriber(self.resources, nthreads=ntranscriptionthreads)
37 | self._status_dicts = {}
38 |
39 | def get_status(self, uid):
40 | return self._status_dicts.setdefault(uid, {})
41 |
42 | def out_dir(self, uid):
43 | return os.path.join(self.data_dir, 'transcriptions', uid)
44 |
45 | # TODO(maxhawkins): refactor so this is returned by transcribe()
46 | def next_id(self):
47 | uid = None
48 | while uid is None or os.path.exists(os.path.join(self.data_dir, uid)):
49 | uid = uuid.uuid4().hex[:8]
50 | return uid
51 |
52 | def transcribe(self, uid, transcript, audio, async_mode, **kwargs):
53 |
54 | status = self.get_status(uid)
55 |
56 | status['status'] = 'STARTED'
57 |
58 |
59 |
60 |
61 | outdir = os.path.join(self.data_dir, 'transcriptions', uid)
62 |
63 | tran_path = os.path.join(outdir, 'transcript.txt')
64 | with open(tran_path, 'w') as tranfile:
65 | tranfile.write(transcript)
66 | audio_path = os.path.join(outdir, 'upload')
67 | with open(audio_path, 'wb') as wavfile:
68 | wavfile.write(audio)
69 |
70 | status['status'] = 'ENCODING'
71 |
72 | wavfile = os.path.join(outdir, 'a.wav')
73 | if gentle.resample(os.path.join(outdir, 'upload'), wavfile) != 0:
74 | status['status'] = 'ERROR'
75 | status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
76 | # Save the status so that errors are recovered on restart of the server
77 | # XXX: This won't work, because the endpoint will override this file
78 | with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
79 | json.dump(status, jsfile, indent=2)
80 | return
81 |
82 | # XXX: Maybe we should pass this wave object instead of the
83 | # file path to align_progress
84 | wav_obj = wave.open(wavfile, 'rb')
85 | status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate())
86 | status['status'] = 'TRANSCRIBING'
87 |
88 | def on_progress(p):
89 | logging.debug(p)
90 | for k, v in p.items():
91 | status[k] = v
92 |
93 | if len(transcript.strip()) > 0:
94 | trans = gentle.ForcedAligner(self.resources, transcript, nthreads=self.nthreads, **kwargs)
95 | elif self.full_transcriber.available:
96 | trans = self.full_transcriber
97 | else:
98 | status['status'] = 'ERROR'
99 | status['error'] = 'No transcript provided and no language model for full transcription'
100 | return
101 |
102 | output = trans.transcribe(wavfile, progress_cb=on_progress, logging=logging)
103 |
104 | # ...remove the original upload
105 | os.unlink(os.path.join(outdir, 'upload'))
106 |
107 | # Save
108 | with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
109 | jsfile.write(output.to_json(indent=2))
110 | with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
111 | csvfile.write(output.to_csv())
112 |
113 | # Inline the alignment into the index.html file.
114 | with open(get_resource('www/view_alignment.html')) as htmlfile:
115 | htmltxt = htmlfile.read().replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (output.to_json()))
116 | with open(os.path.join(outdir, 'index.html'), 'w') as htmlout:
117 | htmlout.write(htmltxt)
118 | status['status'] = 'OK'
119 |
120 | logging.info('done with transcription.')
121 |
122 | return output
123 |
124 | class TranscriptionsController(Resource):
125 | def __init__(self, transcriber):
126 | Resource.__init__(self)
127 | self.transcriber = transcriber
128 |
129 | def getChild(self, uid, req):
130 | uid = uid.decode()
131 | out_dir = self.transcriber.out_dir(uid)
132 | trans_ctrl = File(out_dir)
133 |
134 | # Add a Status endpoint to the file
135 | trans_status = TranscriptionStatus(self.transcriber.get_status(uid))
136 | trans_ctrl.putChild(b"status.json", trans_status)
137 |
138 | return trans_ctrl
139 |
140 | def render_POST(self, req):
141 | uid = self.transcriber.next_id()
142 |
143 | tran = req.args.get(b'transcript', [b''])[0].decode()
144 | audio = req.args[b'audio'][0]
145 |
146 | disfluency = b'disfluency' in req.args
147 | conservative = b'conservative' in req.args
148 | kwargs = {'disfluency': disfluency,
149 | 'conservative': conservative,
150 | 'disfluencies': set(['uh', 'um'])}
151 |
152 | async_mode = True
153 | if b'async' in req.args and req.args[b'async'][0] == b'false':
154 | async_mode = False
155 |
156 | # We need to make the transcription directory here, so that
157 | # when we redirect the user we are sure that there's a place
158 | # for them to go.
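# (Response protocol: with ?async=false the write_result callback below
# streams the alignment JSON back on this same request; otherwise the
# handler falls through to a 302 redirect, and the client polls
# /transcriptions/<uid>/status.json for progress.)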
159 | outdir = os.path.join(self.transcriber.data_dir, 'transcriptions', uid)
160 | os.makedirs(outdir)
161 |
162 | # Copy over the HTML
163 | shutil.copy(get_resource('www/view_alignment.html'), os.path.join(outdir, 'index.html'))
164 |
165 | result_promise = threads.deferToThreadPool(
166 | reactor, reactor.getThreadPool(),
167 | self.transcriber.transcribe,
168 | uid, tran, audio, async_mode, **kwargs)
169 |
170 | if not async_mode:
171 | def write_result(result):
172 | '''Write JSON to client on completion'''
173 | req.setHeader(b"Content-Type", b"application/json")
174 | req.write(result.to_json(indent=2).encode())
175 | req.finish()
176 | result_promise.addCallback(write_result)
177 | result_promise.addErrback(lambda _: None) # ignore errors
178 |
179 | req.notifyFinish().addErrback(lambda _: result_promise.cancel())
180 |
181 | return NOT_DONE_YET
182 |
183 | req.setResponseCode(FOUND)
184 | req.setHeader(b"Location", ("/transcriptions/%s" % (uid)).encode())
185 | return b''
186 |
187 | class LazyZipper(Insist):
188 | def __init__(self, cachedir, transcriber, uid):
189 | self.transcriber = transcriber
190 | self.uid = uid
191 | Insist.__init__(self, os.path.join(cachedir, '%s.zip' % (uid)))
192 |
193 | def serialize_computation(self, outpath):
194 | shutil.make_archive('.'.join(outpath.split('.')[:-1]), # We need to strip the ".zip" from the end
195 | "zip", # ...because `shutil.make_archive` adds it back
196 | self.transcriber.out_dir(self.uid))
197 |
198 | class TranscriptionZipper(Resource):
199 | def __init__(self, cachedir, transcriber):
200 | self.cachedir = cachedir
201 | self.transcriber = transcriber
202 | Resource.__init__(self)
203 |
204 | def getChild(self, path, req):
205 | uid = path.decode().split('.')[0]
206 | t_dir = self.transcriber.out_dir(uid)
207 | if os.path.exists(t_dir):
208 | # TODO: Check that "status" is complete and only create a LazyZipper if so.
209 | # Otherwise, we could have incomplete transcriptions that get permanently zipped.
210 | # For now, the stopgap is to hide the download button in the client until the transcription is done.
211 | lz = LazyZipper(self.cachedir, self.transcriber, uid) 212 | if not isinstance(path, bytes): 213 | path = path.encode() 214 | self.putChild(path, lz) 215 | return lz 216 | else: 217 | return Resource.getChild(self, path, req) 218 | 219 | def serve(port=8765, interface='0.0.0.0', installSignalHandlers=0, nthreads=4, ntranscriptionthreads=2, data_dir=get_datadir('webdata')): 220 | logging.info("SERVE %d, %s, %d", port, interface, installSignalHandlers) 221 | 222 | if not os.path.exists(data_dir): 223 | os.makedirs(data_dir) 224 | 225 | zip_dir = os.path.join(data_dir, 'zip') 226 | if not os.path.exists(zip_dir): 227 | os.makedirs(zip_dir) 228 | 229 | f = File(data_dir) 230 | 231 | f.putChild(b'', File(get_resource('www/index.html'))) 232 | f.putChild(b'status.html', File(get_resource('www/status.html'))) 233 | f.putChild(b'preloader.gif', File(get_resource('www/preloader.gif'))) 234 | 235 | trans = Transcriber(data_dir, nthreads=nthreads, ntranscriptionthreads=ntranscriptionthreads) 236 | trans_ctrl = TranscriptionsController(trans) 237 | f.putChild(b'transcriptions', trans_ctrl) 238 | 239 | trans_zippr = TranscriptionZipper(zip_dir, trans) 240 | f.putChild(b'zip', trans_zippr) 241 | 242 | s = Site(f) 243 | logging.info("about to listen") 244 | reactor.listenTCP(port, s, interface=interface) 245 | logging.info("listening") 246 | 247 | reactor.run(installSignalHandlers=installSignalHandlers) 248 | 249 | 250 | if __name__=='__main__': 251 | import argparse 252 | 253 | parser = argparse.ArgumentParser( 254 | description='Align a transcript to audio by generating a new language model.') 255 | parser.add_argument('--host', default="0.0.0.0", 256 | help='host to run http server on') 257 | parser.add_argument('--port', default=8765, type=int, 258 | help='port number to run http server on') 259 | parser.add_argument('--nthreads', default=multiprocessing.cpu_count(), type=int, 260 | help='number of alignment threads') 261 | parser.add_argument('--ntranscriptionthreads', default=2, type=int, 262 | help='number of full-transcription threads (memory intensive)') 263 | parser.add_argument('--log', default="INFO", 264 | help='the log level (DEBUG, INFO, WARNING, ERROR, or CRITICAL)') 265 | 266 | args = parser.parse_args() 267 | 268 | log_level = args.log.upper() 269 | logging.getLogger().setLevel(log_level) 270 | 271 | logging.info('gentle %s' % (gentle.__version__)) 272 | logging.info('listening at %s:%d\n' % (args.host, args.port)) 273 | 274 | serve(args.port, args.host, nthreads=args.nthreads, ntranscriptionthreads=args.ntranscriptionthreads, installSignalHandlers=1) 275 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from gentle import __version__ 3 | 4 | setup( 5 | app=['serve.py'], 6 | data_files=[], 7 | options={'py2app': { 8 | 'argv_emulation': False, 9 | 'resources': 'k3,m3,ffmpeg,www,exp' 10 | }}, 11 | name='gentle', 12 | version=__version__, 13 | description='Robust yet lenient forced-aligner built on Kaldi.', 14 | url='http://lowerquality.com/gentle', 15 | author='Robert M Ochshorn', 16 | license='MIT', 17 | packages=['gentle'], 18 | install_requires=['twisted'], 19 | test_suite='tests', 20 | ) 21 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/strob/gentle/087a3b738bad9ebafd0570561486252cb624ba3a/tests/__init__.py
--------------------------------------------------------------------------------
/tests/base.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | class Base(unittest.TestCase):
4 |
5 | def test_import(self):
6 | import gentle
7 |
8 | def test_resources(self):
9 | import gentle
10 | resources = gentle.Resources()
11 | import gentle.util.paths
12 | self.assertNotEqual(gentle.util.paths.get_binary("ext/k3"), "ext/k3")
13 |
--------------------------------------------------------------------------------
/tests/transcriber.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | class Aligner(unittest.TestCase):
5 | audio = 'examples/data/lucier.mp3'
6 | transcript = "i am sitting in a room"
7 |
8 | def test_resources(self):
9 | from gentle import Resources
10 | from gentle.util.paths import get_binary
11 |
12 | resources = Resources()
13 | k3 = get_binary("ext/k3")
14 | model = get_binary("exp/tdnn_7b_chain_online/final.mdl")
15 |
16 | self.assertEqual(os.path.exists(self.audio), True)
17 | self.assertEqual(os.path.exists(k3), True)
18 | self.assertEqual(os.path.exists(model), True)
19 |
20 | def test_aligner(self):
21 | import subprocess
22 | from gentle import resampled, standard_kaldi, Resources
23 | from gentle.forced_aligner import ForcedAligner
24 | from gentle.transcription import Word
25 |
26 | standard_kaldi.STDERR = subprocess.STDOUT
27 |
28 | resources = Resources()
29 | align = ForcedAligner(resources, self.transcript, nthreads=1)
30 |
31 | with resampled(self.audio, 5.0, 5.0) as filename:
32 | transcription = align.transcribe(filename)
33 | words = transcription.words
34 | self.assertEqual(words[0].word, "i")
35 | self.assertEqual(words[1].word, "am")
36 | self.assertEqual(words[1].case, Word.SUCCESS)
37 |
--------------------------------------------------------------------------------
/www/index.html:
--------------------------------------------------------------------------------
[Markup stripped in extraction. Recoverable from the residue: this is the
upload page, with an "Audio:" file input, a "Transcript:" text area,
"Conservative" and "Include disfluencies" checkboxes, and a submit button that
POSTs the form to /transcriptions.]
--------------------------------------------------------------------------------
/www/preloader.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/strob/gentle/087a3b738bad9ebafd0570561486252cb624ba3a/www/preloader.gif
--------------------------------------------------------------------------------
/www/view_alignment.html:
--------------------------------------------------------------------------------
[Markup and scripts stripped in extraction. This is the alignment viewer page:
serve.py inlines results into it by replacing "var INLINE_JSON;", and the
transcription endpoint copies it into each transcription directory as
index.html.]
--------------------------------------------------------------------------------
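A short end-to-end sketch of the library API that serve.py and
tests/transcriber.py exercise above. The transcript and audio paths are
hypothetical placeholders; every call used here appears elsewhere in this
repository.

# align_example.py (hypothetical filename)
import gentle

# Locate the Kaldi models and binaries (honors GENTLE_RESOURCES_ROOT).
resources = gentle.Resources()

with open("words.txt") as fh:  # hypothetical transcript file
    transcript = fh.read()

# ForcedAligner generates a transcript-specific language model, and
# resampled() converts the media to the 8 kHz mono wav the decoder expects.
aligner = gentle.ForcedAligner(resources, transcript, nthreads=4)
with gentle.resampled("audio.mp3") as wavfile:  # hypothetical media file
    result = aligner.transcribe(wavfile)

# result is a gentle.transcription.Transcription; dump word/time pairs.
print(result.to_csv())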