├── Dockerfile
├── LICENSE
├── README.md
├── asr.wav
├── asr_client.py
├── docker-build.bash
├── docker-run.bash
└── environment.yml
/Dockerfile:
--------------------------------------------------------------------------------
# Image for the demo STT service: the py-kaldi-asr base image plus one
# pre-trained Kaldi model unpacked to /opt/kaldi-model.
FROM quay.io/mpuels/docker-py-kaldi-asr:0.4.1

# xz-utils is needed to unpack the .tar.xz model archive below.
# `apt-get update` runs in the same layer as the install so the step does not
# depend on package lists that may be stale or absent in the base image; the
# lists are removed again afterwards to keep the layer small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends xz-utils && \
    apt-get clean && \
    apt-get autoremove -y && \
    rm -rf /var/lib/apt/lists/*

# Model baked into the image; override with `--build-arg MODEL_NAME=...`.
# docker-build.bash and docker-run.bash grep this exact line to derive the
# image tag, so keep it in the form `ARG MODEL_NAME=<name>` at column 0.
ARG MODEL_NAME=kaldi-generic-en-tdnn_250-r20180815

WORKDIR /opt
# NOTE(review): the archive is fetched over plain HTTP and is not verified;
# consider pinning a SHA-256 of the model archive and checking it here.
RUN wget -q "http://goofy.zamia.org/zamia-speech/asr-models/${MODEL_NAME}.tar.xz" && \
    tar xf "${MODEL_NAME}.tar.xz" && \
    mv "${MODEL_NAME}" kaldi-model && \
    rm "${MODEL_NAME}.tar.xz"

EXPOSE 80

# asr_server.py is not part of this repository; presumably it is provided by
# the base image under /opt/asr_server -- TODO confirm.
WORKDIR /opt/asr_server
CMD ["python", "asr_server.py"]
20 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # STT Service based on Kaldi ASR
2 |
3 | This image contains a demo STT service based on
4 | [Kaldi ASR](https://github.com/kaldi-asr/kaldi) and
5 | [py-kaldi-asr](https://github.com/gooofy/py-kaldi-asr). Try it out by following
6 | these steps.
7 |
8 | To start the STT service on your local machine, execute:
9 |
10 |     $ docker pull quay.io/mpuels/docker-py-kaldi-asr-and-model:kaldi-generic-en-tdnn_sp-r20180815
11 |     $ docker run --rm -p 127.0.0.1:8080:80/tcp quay.io/mpuels/docker-py-kaldi-asr-and-model:kaldi-generic-en-tdnn_sp-r20180815
12 |
13 | To transfer an audio file for transcription to the service, in a second
14 | terminal, execute:
15 |
16 |     $ conda env create -f environment.yml
17 |     $ source activate py-kaldi-asr-client
18 |     $ ./asr_client.py asr.wav
19 |
20 | For a list of available Kaldi models packaged in Docker containers, see
21 | https://quay.io/repository/mpuels/docker-py-kaldi-asr-and-model?tab=tags .
22 |
23 | For a description of the available models, see
24 | https://github.com/gooofy/zamia-speech#asr-models .
25 |
26 | Docker image tags are named according to the format
27 |
28 |     kaldi-generic-<LANG>-tdnn_<SIZE>-<RELEASE_DATE>
29 |
30 | 1. `<LANG>`: There are models for English (`en`) and German (`de`).
31 | 2. `<SIZE>`: Kaldi models come in two sizes: `sp` (standard size) and `250`
32 |    (smaller size, suitable for realtime decoding on Raspberry Pi).
33 | 3. `<RELEASE_DATE>`: Usually, models released later are trained on more data
34 |    and hence have a lower word error rate.
35 |
36 | The image is part of [Zamia Speech](https://github.com/gooofy/zamia-speech).
37 |
--------------------------------------------------------------------------------
/asr.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mpuels/docker-py-kaldi-asr-and-model/b95879a0bc8eebaedb21c6066c3ecad06df7d89c/asr.wav
--------------------------------------------------------------------------------
/asr_client.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | #
5 | # Copyright 2017 Guenter Bartsch
6 | #
7 | # This program is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Lesser General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # This program is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Lesser General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Lesser General Public License
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 | #
20 | #
21 | # very basic example client for our example speech asr server
22 | #
23 |
24 |
25 | import os
26 | import sys
27 | import logging
28 | import traceback
29 | import json
30 | import wave
31 | import struct
32 | import requests
33 |
34 | from time import time
35 | from optparse import OptionParser
36 |
# Where the ASR server is reached unless -H / -p say otherwise.
DEFAULT_HOST = 'localhost'
DEFAULT_PORT = 8080

#
# commandline
#

cli = OptionParser("usage: %prog [options] foo.wav")

cli.add_option("-v", "--verbose", dest="verbose", action="store_true",
               help="verbose output")

cli.add_option("-H", "--host", dest="host", type="string",
               default=DEFAULT_HOST,
               help="host, default: %s" % DEFAULT_HOST)

cli.add_option("-p", "--port", dest="port", type="int",
               default=DEFAULT_PORT,
               help="port, default: %d" % DEFAULT_PORT)

(options, positional) = cli.parse_args()

# -v switches to DEBUG output; otherwise log at INFO and additionally silence
# the "requests" logger below WARNING.
if options.verbose:
    log_level = logging.DEBUG
else:
    log_level = logging.INFO
    logging.getLogger("requests").setLevel(logging.WARNING)
logging.basicConfig(level=log_level)

# Exactly one positional argument is accepted: the wave file to transcribe.
if len(positional) != 1:
    cli.print_help()
    sys.exit(1)

(wavfn,) = positional

# Endpoint that receives the audio chunks.
url = 'http://%s:%d/decode' % (options.host, options.port)
71 |
72 | #
73 | # read samples from wave file, hand them over to asr server incrementally to simulate online decoding
74 | #
75 |
# Wall-clock reference for the per-chunk progress lines and the final summary.
time_start = time()

wavf = wave.open(wavfn, 'rb')
try:
    # The samples are unpacked below as little-endian signed 16 bit values, so
    # only mono files with 2 bytes per sample are accepted.  Explicit checks
    # replace `assert`, which is stripped when Python runs with -O.
    if wavf.getnchannels() != 1 or wavf.getsampwidth() != 2:
        logging.error("%s: expected a mono wave file with 16 bit samples", wavfn)
        sys.exit(1)

    framerate = wavf.getframerate()
    tot_frames = wavf.getnframes()

    if tot_frames == 0:
        # Without this guard the loop below never runs and `response` would be
        # undefined when the result is read after the loop.
        logging.error("%s: wave file contains no audio frames", wavfn)
        sys.exit(1)

    # process file in 250ms chunks
    # Floor division keeps the chunk size an integer on Python 3 as well
    # (readframes() and the struct format need one); max() prevents a zero
    # chunk size, which would make the loop spin forever.
    chunk_frames = max(1, 250 * framerate // 1000)

    num_frames = 0
    while num_frames < tot_frames:

        remaining = tot_frames - num_frames
        # The chunk that reaches the end of the file tells the server to
        # finalize decoding.
        finalize = chunk_frames >= remaining
        nframes = remaining if finalize else chunk_frames

        frames = wavf.readframes(nframes)
        num_frames += nframes
        # Size the unpack by what was actually read: a truncated file can
        # deliver fewer frames than its header announces.
        samples = struct.unpack_from('<%dh' % (len(frames) // 2), frames)

        data = {'audio'      : samples,
                'do_record'  : False,
                'do_asr'     : True,
                'do_finalize': finalize}

        response = requests.post(url, data=json.dumps(data))

        logging.info("%6.3fs: %5d frames (%6.3fs) decoded, status=%d.",
                     time() - time_start,
                     num_frames,
                     float(num_frames) / float(framerate),
                     response.status_code)
        if response.status_code != 200:
            logging.error("server returned HTTP status %d, aborting",
                          response.status_code)
            sys.exit(1)
finally:
    # Runs on success, on sys.exit() and on any exception alike.
    wavf.close()

# The response to the last (finalizing) chunk carries the decoding result.
data = response.json()

logging.debug("raw response data: %s", repr(data))

logging.info("*****************************************************************")
logging.info("** wavfn : %s", wavfn)
logging.info("** hstr : %s", data['hstr'])
logging.info("** confidence : %f", data['confidence'])
logging.info("** decoding time : %8.2fs", time() - time_start)
logging.info("*****************************************************************")
129 |
130 |
--------------------------------------------------------------------------------
/docker-build.bash:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Build the Docker image for the model named by the Dockerfile's
# `ARG MODEL_NAME=` default and tag it
# <user>/docker-py-kaldi-asr-and-model:<model name>.

set -euo pipefail

# Run from the script's own directory so that the Dockerfile and the build
# context are found no matter where the script is invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")"

# The model name doubles as the image tag; -m1 keeps only the first match.
# `|| true` defers the "not found" case to the explicit check below instead of
# letting `set -e` abort without a message.
MODEL_NAME=$(grep -m1 '^ARG MODEL_NAME=' Dockerfile | cut -f2 -d=) || true
if [[ -z "${MODEL_NAME}" ]]; then
    echo "error: no 'ARG MODEL_NAME=<name>' line found in Dockerfile" >&2
    exit 1
fi

# USER is not exported in every environment (minimal containers, some CI
# shells); fall back to the effective user name.
TAG="${USER:-$(id -un)}/docker-py-kaldi-asr-and-model:${MODEL_NAME}"

echo "Building ${TAG}"
docker build -t "${TAG}" .
8 |
--------------------------------------------------------------------------------
/docker-run.bash:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Start a container from the image tagged
# <user>/docker-py-kaldi-asr-and-model:<model name>, publishing container
# port 80 on 127.0.0.1:8080. The model name is read from the Dockerfile's
# `ARG MODEL_NAME=` default.

set -euo pipefail

# Run from the script's own directory so that the Dockerfile is found no
# matter where the script is invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")"

# The model name doubles as the image tag; -m1 keeps only the first match.
# `|| true` defers the "not found" case to the explicit check below instead of
# letting `set -e` abort without a message.
MODEL_NAME=$(grep -m1 '^ARG MODEL_NAME=' Dockerfile | cut -f2 -d=) || true
if [[ -z "${MODEL_NAME}" ]]; then
    echo "error: no 'ARG MODEL_NAME=<name>' line found in Dockerfile" >&2
    exit 1
fi

# USER is not exported in every environment (minimal containers, some CI
# shells); fall back to the effective user name.
TAG="${USER:-$(id -un)}/docker-py-kaldi-asr-and-model:${MODEL_NAME}"

echo "Starting ${TAG}"
docker run --rm -p 127.0.0.1:8080:80/tcp "${TAG}"
8 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
# Conda environment for the demo client (asr_client.py); create it with
# `conda env create -f environment.yml`.
#
# Versions are pinned, build strings are not: a build string such as
# `py27h9745a5d_0` names one exact build and makes the environment
# unsolvable wherever that build is unavailable.
# NOTE(review): libgcc-ng and libstdcxx-ng look like Linux runtime packages,
# so this list was presumably exported on Linux -- confirm before relying on
# it on other platforms.
name: py-kaldi-asr-client
channels:
  - defaults
dependencies:
  - asn1crypto=0.24.0
  - ca-certificates=2018.03.07
  - certifi=2018.4.16
  - cffi=1.11.5
  - chardet=3.0.4
  - cryptography=2.2.2
  - enum34=1.1.6
  - idna=2.6
  - ipaddress=1.0.22
  - libedit=3.1.20170329
  - libffi=3.2.1
  - libgcc-ng=7.2.0
  - libstdcxx-ng=7.2.0
  - ncurses=6.1
  - openssl=1.0.2o
  - pip=10.0.1
  - pycparser=2.18
  - pyopenssl=18.0.0
  - pysocks=1.6.8
  - python=2.7.15
  - readline=7.0
  - requests=2.18.4
  - setuptools=39.2.0
  - six=1.11.0
  - sqlite=3.23.1
  - tk=8.6.7
  - urllib3=1.22
  - wheel=0.31.1
  - zlib=1.2.11
34 |
--------------------------------------------------------------------------------