├── requirements.in ├── README.md ├── .gitignore ├── speech └── asr │ ├── deepspeech │ ├── ds-transcriber.py │ └── mozilla_deepspeech_api_notebook.ipynb │ └── python_speech_recognition_notebook.ipynb ├── requirements.txt └── gpt ├── nlp_with_gpt_notebook.ipynb └── translate_natural_language_query_to_sql_notebook.ipynb /requirements.in: -------------------------------------------------------------------------------- 1 | # Python 3.10.14 2 | dask==2024.9.1 3 | datetime==5.5 4 | faker==30.8.0 5 | fastapi[all]==0.111.1 6 | gpustat==1.0.0 7 | httpx==0.27.0 8 | jax-metal==0.1.0 9 | jax==0.4.26 10 | jaxlib==0.4.26 11 | jsonschema==4.22.0 12 | line-profiler==4.1.3 13 | matplotlib==3.7.5 14 | networkx==3.3 15 | numba==0.60.0 16 | numpy==1.26.4 17 | openai[datalib]==1.35.14 18 | openpyxl==3.1.5 19 | pandarallel==1.6.5 20 | pandas==2.2.3 21 | perfplot==0.10.2 22 | pypdf==5.0.1 23 | python-dotenv==1.0.1 24 | SQLAlchemy==2.0.30 25 | swifter==1.4.0 26 | tensorflow-macos==2.16.1 27 | tensorflow-metal==1.1.0 28 | tensorflow==2.16.1 29 | torch==2.4.0 30 | torchaudio==2.4.0 31 | torchvision==0.19.0 32 | xlrd==2.0.1 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML4Devs — Notebooks 2 | 3 | This repository hosts companion notebooks and code snippets for [ML4Devs.com](https://www.ml4devs.com) website: 4 | 5 | - GPT 6 | - [`nlp_with_gpt_notebook.ipynb`](gpt/nlp_with_gpt_notebook.ipynb) : Sentiment Analysis, Language Translation, and Named-Entity Recognition with GPT 7 | - [`translate_natural_language_query_to_sql_notebook.ipynb`](gpt/translate_natural_language_query_to_sql_notebook.ipynb) : Translate Natural Language Queries to SQL, and execute it on a database 8 | - Pandas 9 | - [`pandas_apply_fn_on_dataframe.ipynb`](pandas/pandas_apply_fn_on_dataframe.ipynb) : Better ways to [apply a function to each row in Pandas DataFrame](https://www.ml4devs.com/articles/pandas-dataframe-apply-function-iterate-over-rows/) 10 | - Speech 11 | - [`mozilla_deepspeech_api_notebook.ipynb`](speech/asr/deepspeech/mozilla_deepspeech_api_notebook.ipynb) : Automatic [speech recognition with Mozilla DeepSpeech](https://www.ml4devs.com/articles/how-to-build-python-transcriber-using-mozilla-deepspeech/) 12 | - [`ds-transcriber.py`](speech/asr/deepspeech/ds-transcriber.py) : Code for a [Python transcriber](https://www.ml4devs.com/articles/how-to-build-python-transcriber-using-mozilla-deepspeech/) that process streaming audio with DeepSpeech 13 | - [`python_speech_recognition_notebook.ipynb`](speech/asr/python_speech_recognition_notebook.ipynb) : Comparing most prominent alternatives for [speech recognition with Python](https://www.ml4devs.com/articles/speech-recognition-with-python/) 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Databases 2 | *.db 3 | *.zip 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python 
script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /speech/asr/deepspeech/ds-transcriber.py: -------------------------------------------------------------------------------- 1 | # (c) Copyright 2020-2022 Satish Chandra Gupta 2 | # 3 | # MIT License 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 
22 | # 23 | 24 | 25 | # For more explanation, check following blog posts: 26 | # - https://www.ml4devs.com/articles/how-to-build-python-transcriber-using-mozilla-deepspeech/ 27 | # - https://www.ml4devs.com/articles/speech-recognition-with-python/ 28 | 29 | import stt 30 | import numpy as np 31 | import os 32 | import pyaudio 33 | import time 34 | 35 | # DeepSpeech parameters 36 | DEEPSPEECH_MODEL_DIR = 'coqui-stt-1.0.0-models' 37 | MODEL_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'model.tflite') 38 | SCORER_FILE_PATH = os.path.join(DEEPSPEECH_MODEL_DIR, 'large_vocabulary.scorer') 39 | LM_ALPHA = 0.75 40 | LM_BETA = 1.85 41 | BEAM_WIDTH = 500 42 | 43 | # Make DeepSpeech Model 44 | model = stt.Model(MODEL_FILE_PATH) 45 | model.enableExternalScorer(SCORER_FILE_PATH) 46 | model.setScorerAlphaBeta(LM_ALPHA, LM_BETA) 47 | model.setBeamWidth(BEAM_WIDTH) 48 | 49 | # Create a Streaming session 50 | stt_stream = model.createStream() 51 | 52 | # Encapsulate DeepSpeech audio feeding into a callback for PyAudio 53 | text_so_far = '' 54 | def process_audio(in_data, frame_count, time_info, status): 55 | global text_so_far 56 | data16 = np.frombuffer(in_data, dtype=np.int16) 57 | stt_stream.feedAudioContent(data16) 58 | text = stt_stream.intermediateDecode() 59 | if text != text_so_far: 60 | print('Interim text = {}'.format(text)) 61 | text_so_far = text 62 | return (in_data, pyaudio.paContinue) 63 | 64 | # PyAudio parameters 65 | FORMAT = pyaudio.paInt16 66 | CHANNELS = 1 67 | RATE = 16000 68 | CHUNK_SIZE = 1024 69 | 70 | # Feed audio to deepspeech in a callback to PyAudio 71 | audio = pyaudio.PyAudio() 72 | stream = audio.open( 73 | format=FORMAT, 74 | channels=CHANNELS, 75 | rate=RATE, 76 | input=True, 77 | frames_per_buffer=CHUNK_SIZE, 78 | stream_callback=process_audio 79 | ) 80 | 81 | print('Please start speaking, when done press Ctrl-C ...') 82 | stream.start_stream() 83 | 84 | try: 85 | while stream.is_active(): 86 | time.sleep(0.1) 87 | except KeyboardInterrupt: 88 | # PyAudio 89 | stream.stop_stream() 90 | stream.close() 91 | audio.terminate() 92 | print('Finished recording.') 93 | # DeepSpeech 94 | text = stt_stream.finishStream() 95 | print('Final text = {}'.format(text)) 96 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.10 3 | # by the following command: 4 | # 5 | # pip-compile requirements.in 6 | # 7 | absl-py==2.1.0 8 | # via 9 | # keras 10 | # tensorboard 11 | # tensorflow 12 | annotated-types==0.7.0 13 | # via pydantic 14 | anyio==4.6.2.post1 15 | # via 16 | # httpx 17 | # openai 18 | # starlette 19 | # watchfiles 20 | astunparse==1.6.3 21 | # via tensorflow 22 | attrs==24.2.0 23 | # via 24 | # jsonschema 25 | # referencing 26 | blessed==1.20.0 27 | # via gpustat 28 | certifi==2024.8.30 29 | # via 30 | # httpcore 31 | # httpx 32 | # requests 33 | charset-normalizer==3.4.0 34 | # via requests 35 | click==8.1.7 36 | # via 37 | # dask 38 | # typer 39 | # uvicorn 40 | cloudpickle==3.1.0 41 | # via dask 42 | contourpy==1.3.0 43 | # via matplotlib 44 | cycler==0.12.1 45 | # via matplotlib 46 | dask[array,dataframe]==2024.9.1 47 | # via 48 | # -r requirements.in 49 | # dask-expr 50 | # swifter 51 | dask-expr==1.1.15 52 | # via dask 53 | datetime==5.5 54 | # via -r requirements.in 55 | dill==0.3.9 56 | # via pandarallel 57 | distro==1.9.0 58 | # via openai 59 | dnspython==2.7.0 60 | # via 
email-validator 61 | email-validator==2.2.0 62 | # via fastapi 63 | et-xmlfile==2.0.0 64 | # via openpyxl 65 | exceptiongroup==1.2.2 66 | # via anyio 67 | faker==30.8.0 68 | # via -r requirements.in 69 | fastapi[all]==0.111.1 70 | # via -r requirements.in 71 | fastapi-cli==0.0.5 72 | # via fastapi 73 | filelock==3.16.1 74 | # via torch 75 | flatbuffers==24.3.25 76 | # via tensorflow 77 | fonttools==4.54.1 78 | # via matplotlib 79 | fsspec==2024.10.0 80 | # via 81 | # dask 82 | # torch 83 | gast==0.6.0 84 | # via tensorflow 85 | google-pasta==0.2.0 86 | # via tensorflow 87 | gpustat==1.0.0 88 | # via -r requirements.in 89 | grpcio==1.67.1 90 | # via 91 | # tensorboard 92 | # tensorflow 93 | h11==0.14.0 94 | # via 95 | # httpcore 96 | # uvicorn 97 | h5py==3.12.1 98 | # via 99 | # keras 100 | # tensorflow 101 | httpcore==1.0.6 102 | # via httpx 103 | httptools==0.6.4 104 | # via uvicorn 105 | httpx==0.27.0 106 | # via 107 | # -r requirements.in 108 | # fastapi 109 | # openai 110 | idna==3.10 111 | # via 112 | # anyio 113 | # email-validator 114 | # httpx 115 | # requests 116 | importlib-metadata==8.5.0 117 | # via dask 118 | itsdangerous==2.2.0 119 | # via fastapi 120 | jax==0.4.26 121 | # via 122 | # -r requirements.in 123 | # jax-metal 124 | jax-metal==0.1.0 125 | # via -r requirements.in 126 | jaxlib==0.4.26 127 | # via 128 | # -r requirements.in 129 | # jax-metal 130 | jinja2==3.1.4 131 | # via 132 | # fastapi 133 | # torch 134 | jsonschema==4.22.0 135 | # via -r requirements.in 136 | jsonschema-specifications==2024.10.1 137 | # via jsonschema 138 | keras==3.6.0 139 | # via tensorflow 140 | kiwisolver==1.4.7 141 | # via matplotlib 142 | libclang==18.1.1 143 | # via tensorflow 144 | line-profiler==4.1.3 145 | # via -r requirements.in 146 | llvmlite==0.43.0 147 | # via numba 148 | locket==1.0.0 149 | # via partd 150 | markdown==3.7 151 | # via tensorboard 152 | markdown-it-py==3.0.0 153 | # via rich 154 | markupsafe==3.0.2 155 | # via 156 | # jinja2 157 | # werkzeug 158 | matplotlib==3.7.5 159 | # via 160 | # -r requirements.in 161 | # matplotx 162 | # perfplot 163 | matplotx==0.3.10 164 | # via perfplot 165 | mdurl==0.1.2 166 | # via markdown-it-py 167 | ml-dtypes==0.3.2 168 | # via 169 | # jax 170 | # jaxlib 171 | # keras 172 | # tensorflow 173 | mpmath==1.3.0 174 | # via sympy 175 | namex==0.0.8 176 | # via keras 177 | networkx==3.3 178 | # via 179 | # -r requirements.in 180 | # torch 181 | numba==0.60.0 182 | # via -r requirements.in 183 | numpy==1.26.4 184 | # via 185 | # -r requirements.in 186 | # contourpy 187 | # dask 188 | # h5py 189 | # jax 190 | # jaxlib 191 | # keras 192 | # matplotlib 193 | # matplotx 194 | # ml-dtypes 195 | # numba 196 | # openai 197 | # pandas 198 | # pandas-stubs 199 | # perfplot 200 | # scipy 201 | # tensorboard 202 | # tensorflow 203 | # torchvision 204 | nvidia-ml-py==11.495.46 205 | # via gpustat 206 | openai[datalib]==1.35.14 207 | # via -r requirements.in 208 | openpyxl==3.1.5 209 | # via -r requirements.in 210 | opt-einsum==3.4.0 211 | # via 212 | # jax 213 | # tensorflow 214 | optree==0.13.0 215 | # via keras 216 | orjson==3.10.10 217 | # via fastapi 218 | packaging==24.1 219 | # via 220 | # dask 221 | # keras 222 | # matplotlib 223 | # tensorflow 224 | pandarallel==1.6.5 225 | # via -r requirements.in 226 | pandas==2.2.3 227 | # via 228 | # -r requirements.in 229 | # dask 230 | # dask-expr 231 | # openai 232 | # pandarallel 233 | # swifter 234 | pandas-stubs==2.2.3.241009 235 | # via openai 236 | partd==1.4.2 237 | # via dask 238 | perfplot==0.10.2 
239 | # via -r requirements.in 240 | pillow==11.0.0 241 | # via 242 | # matplotlib 243 | # torchvision 244 | protobuf==4.25.5 245 | # via 246 | # tensorboard 247 | # tensorflow 248 | psutil==6.1.0 249 | # via 250 | # gpustat 251 | # pandarallel 252 | # swifter 253 | pyarrow==18.0.0 254 | # via dask-expr 255 | pydantic==2.9.2 256 | # via 257 | # fastapi 258 | # openai 259 | # pydantic-extra-types 260 | # pydantic-settings 261 | pydantic-core==2.23.4 262 | # via pydantic 263 | pydantic-extra-types==2.9.0 264 | # via fastapi 265 | pydantic-settings==2.6.1 266 | # via fastapi 267 | pygments==2.18.0 268 | # via rich 269 | pyparsing==3.2.0 270 | # via matplotlib 271 | pypdf==5.0.1 272 | # via -r requirements.in 273 | python-dateutil==2.9.0.post0 274 | # via 275 | # faker 276 | # matplotlib 277 | # pandas 278 | python-dotenv==1.0.1 279 | # via 280 | # -r requirements.in 281 | # pydantic-settings 282 | # uvicorn 283 | python-multipart==0.0.17 284 | # via fastapi 285 | pytz==2024.2 286 | # via 287 | # datetime 288 | # pandas 289 | pyyaml==6.0.2 290 | # via 291 | # dask 292 | # fastapi 293 | # uvicorn 294 | referencing==0.35.1 295 | # via 296 | # jsonschema 297 | # jsonschema-specifications 298 | requests==2.32.3 299 | # via tensorflow 300 | rich==13.9.3 301 | # via 302 | # keras 303 | # perfplot 304 | # typer 305 | rpds-py==0.20.1 306 | # via 307 | # jsonschema 308 | # referencing 309 | scipy==1.14.1 310 | # via 311 | # jax 312 | # jaxlib 313 | shellingham==1.5.4 314 | # via typer 315 | six==1.16.0 316 | # via 317 | # astunparse 318 | # blessed 319 | # google-pasta 320 | # gpustat 321 | # jax-metal 322 | # python-dateutil 323 | # tensorboard 324 | # tensorflow 325 | # tensorflow-metal 326 | sniffio==1.3.1 327 | # via 328 | # anyio 329 | # httpx 330 | # openai 331 | sqlalchemy==2.0.30 332 | # via -r requirements.in 333 | starlette==0.37.2 334 | # via fastapi 335 | swifter==1.4.0 336 | # via -r requirements.in 337 | sympy==1.13.3 338 | # via torch 339 | tensorboard==2.16.2 340 | # via tensorflow 341 | tensorboard-data-server==0.7.2 342 | # via tensorboard 343 | tensorflow==2.16.1 344 | # via 345 | # -r requirements.in 346 | # tensorflow-macos 347 | tensorflow-io-gcs-filesystem==0.37.1 348 | # via tensorflow 349 | tensorflow-macos==2.16.1 350 | # via -r requirements.in 351 | tensorflow-metal==1.1.0 352 | # via -r requirements.in 353 | termcolor==2.5.0 354 | # via tensorflow 355 | toolz==1.0.0 356 | # via 357 | # dask 358 | # partd 359 | torch==2.4.0 360 | # via 361 | # -r requirements.in 362 | # torchaudio 363 | # torchvision 364 | torchaudio==2.4.0 365 | # via -r requirements.in 366 | torchvision==0.19.0 367 | # via -r requirements.in 368 | tqdm==4.66.6 369 | # via 370 | # openai 371 | # swifter 372 | typer==0.12.5 373 | # via fastapi-cli 374 | types-pytz==2024.2.0.20241003 375 | # via pandas-stubs 376 | typing-extensions==4.12.2 377 | # via 378 | # anyio 379 | # faker 380 | # fastapi 381 | # openai 382 | # optree 383 | # pydantic 384 | # pydantic-core 385 | # pypdf 386 | # rich 387 | # sqlalchemy 388 | # tensorflow 389 | # torch 390 | # typer 391 | # uvicorn 392 | tzdata==2024.2 393 | # via pandas 394 | ujson==5.10.0 395 | # via fastapi 396 | urllib3==2.2.3 397 | # via requests 398 | uvicorn[standard]==0.32.0 399 | # via 400 | # fastapi 401 | # fastapi-cli 402 | uvloop==0.21.0 403 | # via uvicorn 404 | watchfiles==0.24.0 405 | # via uvicorn 406 | wcwidth==0.2.13 407 | # via blessed 408 | websockets==13.1 409 | # via uvicorn 410 | werkzeug==3.1.0 411 | # via tensorboard 412 | wheel==0.44.0 413 | # 
via 414 | # astunparse 415 | # jax-metal 416 | # tensorflow-metal 417 | wrapt==1.16.0 418 | # via tensorflow 419 | xlrd==2.0.1 420 | # via -r requirements.in 421 | zipp==3.20.2 422 | # via importlib-metadata 423 | zope-interface==7.1.1 424 | # via datetime 425 | 426 | # The following packages are considered to be unsafe in a requirements file: 427 | # setuptools 428 | -------------------------------------------------------------------------------- /gpt/nlp_with_gpt_notebook.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"id":"EV0cfuXJeO-8"},"source":["\"Open"]},{"cell_type":"markdown","metadata":{"id":"e3Id4cfBeO-_"},"source":["

Common NLP Tasks with GPT: Sentiment Analysis, Language Translation, and Named-Entity Recognition

\n","\n","

\n","
© Satish Chandra Gupta
\n","LinkedIn: scgupta,\n","Twitter: scgupta\n","
\n","

\n","\n","---\n","\n","## Setup\n","\n","### Install Pip Packages\n","\n","You need Python 3.7 or higher to install [OpenAI Python API library](https://github.com/openai/openai-python)."]},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":770,"status":"ok","timestamp":1701355793267,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"iZkQzDDxeO_A","outputId":"4824a891-78f6-47d5-c78b-5f51c7566483","vscode":{"languageId":"shellscript"}},"outputs":[{"name":"stdout","output_type":"stream","text":["Python 3.10.14\n"]}],"source":["# You should have Python 3.7 or higher\n","\n","!python --version\n"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5781,"status":"ok","timestamp":1701355799502,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"1Y6r5ieIeO_C","outputId":"c6a9c2b8-42a5-4749-d018-df48a804223e","vscode":{"languageId":"shellscript"}},"outputs":[],"source":["!pip install openai python-dotenv &> /dev/null\n"]},{"cell_type":"markdown","metadata":{"id":"cYfbkKGPeO_D"},"source":["### Upload `.env` File with API Keys\n","\n","You can either use GPT directly from OpenAI, or you can use Azure OpenAI from Microsoft. You need to create a `.env` file and add the environment variables needed for OpenAI api.\n","\n","If you are using OpenAI, check your [OpenAI account](https://platform.openai.com/api-keys) for creating API key. Your `.env` file will look like following:\n","\n","```sh\n","$ cat .env\n","OPENAI_API_KEY='sk-YourOpenAiApiKeyHere'\n","```\n","\n","If you are using Microsoft Azure OpenAI:\n","- Go to [Azure Portal](https://portal.azure.com/) > **All Resources**\n","- Filter the list with Type == Azure OpenAI\n","- Select the one you plan to use\n","- If there are none, you can [create and deploy an Azure OpenAI Service resource](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource)\n","- Click on **Keys and Endpoint** on the left menu\n","- Get `AZURE_OPENAI_API_KEY` and `AZURE_OPENAI_ENDPOINT`\n","- Next click **Model deployments** on the left menu, and then click **Manage Deployment** button\n","- Alternatively, you can go to [Azure OpenAI Studio](https://oai.azure.com/), and click **Deployments** on the left menu\n","- Find (the latest) API version for [Azure OpenAI Service](https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#rest-api-versioning)\n","\n","Your `.env` file will look like following:\n","```sh\n","$ cat .env\n","AZURE_OPENAI_API_KEY=yourAzureOpenAiApiKey\n","AZURE_OPENAI_ENDPOINT=https://your-azure-deployment.openai.azure.com/\n","AZURE_OPENAI_DEPLOYMENT_ID=your-deployment-name\n","AZURE_OPENAI_API_VERSION=2023-10-01-preview\n","```\n","\n","Upload `.env` using Upload File button in Google Colab (or Jupyter Notebook). In worst case scenario, uncomment and modify the relevant lines in the following cell to create `.env` file. 
Please note that it is dangerous to share such notebooks or check them into git."]},{"cell_type":"code","execution_count":3,"metadata":{"id":"XKy_X1wdeO_D","vscode":{"languageId":"shellscript"}},"outputs":[],"source":["# Upload or create a .env file with (Azure) OpenAI API creds\n","\n","#!echo \"OPENAI_API_KEY=sk-YourOpenApiKeyHere\" >> .env\n","\n","#!echo \"AZURE_OPENAI_API_KEY=yourAzureOpenAiApiKey\" >> .env\n","#!echo \"AZURE_OPENAI_ENDPOINT=https://your-azure-deployment.openai.azure.com/\" >> .env\n","#!echo \"AZURE_OPENAI_DEPLOYMENT_ID=your-deployment-name\" >> .env\n","#!echo \"AZURE_OPENAI_API_VERSION=2023-10-01-preview\" >> .env\n"]},{"cell_type":"markdown","metadata":{"id":"KcPWLtAWeO_E"},"source":["### Load `.env` File and Specify (Azure) OpenAI GPT Model\n","\n","Load environment variables from `.env` file:"]},{"cell_type":"code","execution_count":4,"metadata":{"id":"tIT35MxGeO_F"},"outputs":[],"source":["from dotenv import load_dotenv, find_dotenv\n","\n","_ = load_dotenv(find_dotenv())\n"]},{"cell_type":"markdown","metadata":{"id":"cVN8Z7wQeO_F"},"source":["Set `IS_AZURE_OPENAI` flag to `True`, if you are using Azure OpenAI:"]},{"cell_type":"code","execution_count":5,"metadata":{"id":"KfXGuwQZeO_G"},"outputs":[],"source":["IS_AZURE_OPENAI: bool = False\n"]},{"cell_type":"markdown","metadata":{"id":"2mY86GV2eO_G"},"source":["Specify model name:"]},{"cell_type":"code","execution_count":6,"metadata":{"id":"GK1r2VFVeO_G"},"outputs":[],"source":["from datetime import datetime\n","\n","GPT35_TURBO: str = \"gpt-3.5-turbo-1106\" if datetime.now() < datetime(2023, 12, 11) else \"gpt-3.5-turbo\"\n"]},{"cell_type":"markdown","metadata":{"id":"87QWD_kLeO_G"},"source":["---\n","\n","## Create an OpenAI Client and Specify GPT Model"]},{"cell_type":"code","execution_count":7,"metadata":{"id":"J3gYa6czeO_H"},"outputs":[],"source":["import os\n","import openai\n"]},{"cell_type":"code","execution_count":8,"metadata":{"id":"yTp0GKx9eO_H"},"outputs":[],"source":["def create_open_ai_client():\n"," if IS_AZURE_OPENAI:\n"," return openai.AzureOpenAI(\n"," api_key=os.getenv(\"AZURE_OPENAI_API_KEY\"),\n"," api_version=os.getenv(\"AZURE_OPENAI_API_VERSION\"),\n"," azure_endpoint=os.getenv(\"AZURE_OPENAI_ENDPOINT\"),\n"," azure_deployment=os.getenv(\"AZURE_OPENAI_DEPLOYMENT_ID\")\n"," )\n"," else:\n"," return openai.OpenAI(\n"," api_key=os.getenv('OPENAI_API_KEY')\n"," )\n"]},{"cell_type":"code","execution_count":9,"metadata":{"id":"Hes3w6mfeO_H"},"outputs":[],"source":["openai_client = create_open_ai_client()\n","openai_model = os.getenv(\"AZURE_OPENAI_DEPLOYMENT_ID\") if IS_AZURE_OPENAI else GPT35_TURBO\n","\n","def get_gpt_response(prompt, model=openai_model, temperature=0):\n"," messages = [{\"role\": \"user\", \"content\": prompt}]\n"," response = openai_client.chat.completions.create(\n"," model=model,\n"," #response_format={\"type\": \"json_object\"}, # Uncomment it if your chosen model supports it\n"," messages=messages,\n"," temperature=temperature,\n"," )\n"," return response.choices[0].message.content\n"]},{"cell_type":"code","execution_count":10,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":831,"status":"ok","timestamp":1701355800331,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"5JrKbCv7eO_H","outputId":"0202742f-4049-4f17-f581-9d2508d021c8"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\n"," \"message\": \"This is a test\"\n","}\n"]}],"source":["print(get_gpt_response(\"Say this is test in 
JSON\"))\n"]},{"cell_type":"markdown","metadata":{"id":"Mu09cyaLeO_I"},"source":["You are all set to use GPT for common NLP tasks such as Sentiment Analysis, Language Translation, Intent/Entity Recognition."]},{"cell_type":"markdown","metadata":{"id":"eOmv1Y7OeO_I"},"source":["---\n","\n","## Sentiment Analysis\n","\n","Let's do sentiment analysis for food reviews. In classical ML, you will need to build a supervised classification model for sentiment analysis. You need to:\n","\n","- Clean and label the data (this takes significant amount of effort)\n","- Divide it into train, validate, and test sets\n","- Preprocessing: remove stop words, stemming, etc.\n","- Train multiple models\n","- Measure inference accuracy\n","- Select a model, and tune its hyper-parameters\n","- Deploy the final model\n","\n","This whole endeavour may take a couple of weeks and sometime months!\n","\n","But Large Language Models (LLMs) like GPT eliminates ML model training or train it with just few examples. It is called [Zero or Few Shot Learning](https://en.wikipedia.org/wiki/Zero-shot_learning). This is because foundational LLM models are capable of doing multiple tasks.\n","\n","This effectively makes many NLP capabilities accessible to developers who may not have data science and machine learning expertise. And, they can do it in few hours or days (instead of weeks and months)!\n","\n","See it yourself. Here is your food review sentiment analyzer with few lines of code."]},{"cell_type":"code","execution_count":11,"metadata":{"id":"CTkZo8K7eO_I"},"outputs":[],"source":["food_reviews = [\n"," \"The food is great, ambience is just right, but service is slow.\",\n"," \"खाना बहुत स्वादिष्ट है, बैंगन भरता और काबुली चिकन कबाब जरूर खाएँ\",\n"," \"starters soggy and लस्सी बिलकुल पानी, बकवास खाना, waste of money\",\n","]\n"]},{"cell_type":"code","execution_count":12,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":971,"status":"ok","timestamp":1701355801300,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"FsiUsSlXeO_I","outputId":"f8eee039-b029-4552-f5df-51604627f1af"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\n"," \"sentiment\": \"mixed\"\n","}\n"]}],"source":["prompt = f\"\"\"\n","What is the sentiment of the following review that is delimited with triple backticks?\n","\n","Format your response in JSON.\n","\n","Review text: ```{food_reviews[0]}```\n","\"\"\"\n","\n","print(get_gpt_response(prompt))\n"]},{"cell_type":"markdown","metadata":{"id":"ClX5SA-reO_J"},"source":["Voilà! It worked like a charm! With just 20-word long prompt! 
Now let's improve the prompt to get the response in a structure that you specify."]},{"cell_type":"code","execution_count":13,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1749,"status":"ok","timestamp":1701355803046,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"n6BXbiIAeO_J","outputId":"d4cdea6f-805f-498e-f438-04e5442a451f"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\n"," \"sentiment\": \"mixed\",\n"," \"stars\": 3,\n"," \"emotions\": [\"positive\", \"neutral\"],\n"," \"summary\": \"Great food and ambience, but slow service.\"\n","}\n"]}],"source":["prompt = f\"\"\"\n","Identify following items from the review text that is delimited with triple backticks:\n","- Sentiment: (positive, mixed, or negative)\n","- Stars: a number rating characterizing overall sentiment, 1 star being the lowest and 5 star being the highest\n","- Emotions: top emotion(s), maximum 3 emotions\n","- Summary: human readable summary of the review and sentiments in less than 255 characters\n","\n","Format your response as JSON with \"sentiment\", \"stars\", \"emotions\", and \"summary\" as the keys.\n","\n","Review text: ```{food_reviews[0]}```\n","\"\"\"\n","\n","print(get_gpt_response(prompt))\n"]},{"cell_type":"markdown","metadata":{"id":"GzjaaFRkeO_J"},"source":["---\n","\n","## Language Translation\n","\n","Now, let's make this sentiment analyzer multi-lingual.\n","\n","GPT has language identification and translation capabilities, and you can invoke them with a simple prompt."]},{"cell_type":"code","execution_count":14,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2852,"status":"ok","timestamp":1701355805897,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"qCMoL5CteO_J","outputId":"b99460b9-a078-482e-82e8-a80a61c3a0ff"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\n"," \"text\": \"खाना बहुत स्वादिष्ट है, बैंगन भरता और काबुली चिकन कबाब जरूर खाएँ\",\n"," \"language\": \"Hindi\",\n"," \"translation\": \"The food is very delicious, be sure to try Baingan Bharta and Kabuli Chicken Kebab\"\n","}\n"]}],"source":["prompt = f\"\"\"\n","Translate the input text into English.\n","\n","Format your response as JSON with values for following keys:\n","- text: input text as is\n","- language: the language of the input text\n","- translation: input text translated in English\n","\n","Input text: ```{food_reviews[1]}```\n","\"\"\"\n","\n","print(get_gpt_response(prompt))\n"]},{"cell_type":"markdown","metadata":{"id":"FljOMrZqeO_J"},"source":["You can see the pattern:\n","- Breakdown the task into smaller steps\n","- Give specific instructions for each step\n","- Include input with clear demarcation\n","- Specify the desired structure of the output\n","\n","It is almost like how you will teach a smart kid to do a specific task.\n","\n","You can change the functionality by changing the prompt. 
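For illustration, that pattern can be wrapped in a small reusable helper (a hedged sketch that reuses the `get_gpt_response` function defined above; `run_nlp_task` and its arguments are hypothetical, not part of the original notebook):\n","\n","```python\n","def run_nlp_task(task_instructions: str, text: str) -> str:\n","    # Hypothetical helper: specific step-wise instructions, clearly demarcated\n","    # input, and an explicit output structure, combined into one prompt\n","    prompt = f\"\"\"\n","    {task_instructions}\n","\n","    Format your response as JSON.\n","\n","    Input text: '''{text}'''\n","    \"\"\"\n","    return get_gpt_response(prompt)\n","```\n","\n","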
You can experiment and craft an effective prompt for your NLP task.\n","\n","Now let's put together sentiment analysis and translation."]},{"cell_type":"code","execution_count":15,"metadata":{"id":"bCCDancUeO_J"},"outputs":[],"source":["def infer_sentiment(text):\n"," prompt = f\"\"\"\n"," Identify following items from the review text:\n"," - Language: language of the review text\n"," - Translation: review text translated in English\n"," - Sentiment: (positive, mixed, or negative)\n"," - Stars: a number rating characterizing overall sentiment, 1 star being the lowest and 5 star being the highest\n"," - Emotions: top emotion(s), maximum 3 emotions\n"," - Summary: human readable summary of the review and sentiments in less than 255 characters\n","\n"," Format your response as JSON with \"language\", \"translation\", \"sentiment\", \"stars\", \"emotions\", and summary as the keys.\n","\n"," Review text: '''{text}'''\n"," \"\"\"\n","\n"," return get_gpt_response(prompt)\n"]},{"cell_type":"code","execution_count":16,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":8417,"status":"ok","timestamp":1701355814312,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"xsrC5eoneO_K","outputId":"344be55d-2164-4353-eace-726ab9459a1f"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\n"," \"language\": \"English\",\n"," \"translation\": \"The food is great, ambience is just right, but service is slow.\",\n"," \"sentiment\": \"mixed\",\n"," \"stars\": 3,\n"," \"emotions\": [\"satisfaction\", \"disappointment\"],\n"," \"summary\": \"Enjoyable experience with great food and ambience, but service could be improved.\"\n","}\n","{\n"," \"language\": \"Hindi\",\n"," \"translation\": \"The food is very delicious, do try the stuffed eggplant and Kabul chicken kebab\",\n"," \"sentiment\": \"positive\",\n"," \"stars\": 5,\n"," \"emotions\": [\"delight\", \"satisfaction\"],\n"," \"summary\": \"Delicious food with must-try stuffed eggplant and Kabul chicken kebab, highly satisfying experience with a 5-star rating\"\n","}\n","{\n"," \"language\": \"English\",\n"," \"translation\": \"starters soggy and lassi completely watery, terrible food, waste of money\",\n"," \"sentiment\": \"negative\",\n"," \"stars\": 1,\n"," \"emotions\": [\"disappointment\", \"disgust\"],\n"," \"summary\": \"Extremely disappointing experience with watery lassi and soggy starters, terrible food quality, complete waste of money.\"\n","}\n"]}],"source":["for t in food_reviews:\n"," print(infer_sentiment(t))\n"]},{"cell_type":"markdown","metadata":{"id":"i7yG6NszeO_K"},"source":["---\n","\n","## Intent/Entity Extraction"]},{"cell_type":"markdown","metadata":{"id":"L2Rv8bKpeO_K"},"source":["[Named-Entity Recognition (NER)](https://en.wikipedia.org/wiki/Named-entity_recognition) is another very common NLP task. 
For example, Chatbots and Voice Assistants have to:\n","\n","- Infer what you want (intent)\n","- Extract the named entities from your sentences that are needed to fulfill your request\n","- Perform that request\n","\n","For example, each of these commands to Alexa have different intent, and entities associated with it:\n","- Play songs by Taylor Swift\n","- Set an alarm for 30 minutes\n","- How is the weather\n","\n","Let's build a multilingual intent/entity extractor for a travel assistant that can enquire, book, and cancel bus, train, and flight tickets."]},{"cell_type":"code","execution_count":17,"metadata":{"id":"7GTL-q4xeO_K"},"outputs":[],"source":["travel_messages = [\n"," \"I want to fly from Bangalore to Delhi\",\n"," \"मुझे कल कानपुर से लखनऊ के लिए बस टिकट बुक करना है\",\n"," \"ನನ್ನ ಬಸ್ ಟಿಕೆಟ್ ರದ್ದು ಮಾಡಿ\",\n","]\n"]},{"cell_type":"code","execution_count":18,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":2483,"status":"ok","timestamp":1701355816794,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"rhQIzyhWeO_K","outputId":"96f23626-b6ad-44f8-d6d8-cadd8c945a9b"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\n"," \"language\": \"Hindi\",\n"," \"translation\": \"I want to book a bus ticket from Kanpur to Lucknow for tomorrow\",\n"," \"intent\": \"book\",\n"," \"mode\": \"bus\",\n"," \"date\": \"2023-10-25\",\n"," \"source\": \"Kanpur\",\n"," \"destination\": \"Lucknow\"\n","}\n"]}],"source":["prompt = f\"\"\"\n","Act as a travel assistant clerk. Your job is to help customers by bus, train, or flight.\n","Identify following items from a customer message:\n","- Language: language of the customer message\n","- Translation: customer message translated in English\n","- Intent: (inquire, book, or cancel)\n","- Mode: (bus, train, or flight)\n","- Date: the travel date in YYYY-MM-DD\n","- Source: starting place of the journey\n","If the information isn't present, use null as the value.\n","\n","Format your response as JSON with \"language\", \"translation\", \"intent\", \"mode\", \"source\", and \"destination\".\n","\n","Review test: '''{travel_messages[1]}'''\n","\"\"\"\n","\n","print(get_gpt_response(prompt))\n"]},{"cell_type":"markdown","metadata":{"id":"sgzgjdvveO_L"},"source":["Well, it almost got everything right, except the date. It inferred \"tomorrow\" incorrectly, maybe because \"today\" for the model is when it was trained or deployed.\n","\n","That is another important lesson: your prompt must have the needed context. Let's tell it what the date today is."]},{"cell_type":"code","execution_count":19,"metadata":{"id":"HqsgF3g3eO_L"},"outputs":[],"source":["def travel_assistant(text):\n"," prompt = f\"\"\"\n"," Act as a travel assistant clerk. 
Your job is to help customers by bus, train, or flight.\n"," Identify following items from a customer message:\n"," - Language: language of the customer message\n"," - Translation: customer message translated in English\n"," - Intent: (inquire, book, or cancel)\n"," - Mode: (bus, train, or flight)\n"," - Date: the travel date in YYYY-MM-DD\n"," - Source: starting place of the journey\n"," If the information isn't present, use null as the value.\n","\n"," The current date and time is {datetime.now().strftime(\"%d %b %Y %I:%M %p\")}\n","\n"," Format your response as JSON with \"language\", \"translation\", \"intent\", \"mode\", \"source\", and \"destination\".\n","\n"," Review test: '''{text}'''\n"," \"\"\"\n","\n"," return get_gpt_response(prompt)\n"]},{"cell_type":"code","execution_count":20,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":6788,"status":"ok","timestamp":1701355823568,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"6UyObQVfeO_L","outputId":"e3815064-3811-48ea-d654-971e55df4ccc"},"outputs":[{"name":"stdout","output_type":"stream","text":["{\n"," \"language\": \"English\",\n"," \"translation\": \"I want to fly from Bangalore to Delhi\",\n"," \"intent\": \"book\",\n"," \"mode\": \"flight\",\n"," \"date\": null,\n"," \"source\": \"Bangalore\",\n"," \"destination\": \"Delhi\"\n","}\n","{\n"," \"language\": \"Hindi\",\n"," \"translation\": \"I want to book a bus ticket from Kanpur to Lucknow tomorrow\",\n"," \"intent\": \"book\",\n"," \"mode\": \"bus\",\n"," \"date\": \"2024-10-25\",\n"," \"source\": \"Kanpur\",\n"," \"destination\": \"Lucknow\"\n","}\n","{\n"," \"language\": \"Kannada\",\n"," \"translation\": \"Cancel my bus ticket\",\n"," \"intent\": \"cancel\",\n"," \"mode\": \"bus\",\n"," \"date\": null,\n"," \"source\": null,\n"," \"destination\": null\n","}\n"]}],"source":["for t in travel_messages:\n"," print(travel_assistant(t))\n"]},{"cell_type":"markdown","metadata":{"id":"f5BQT17FeO_L"},"source":["LLMs are one of the most powerful models, and yet most accessible for developers. It reduces the time to experiment, prototype, and deploy sophisticated NLP-assisted applications."]},{"cell_type":"markdown","metadata":{"id":"-_oyoZAkeO_L"},"source":["---\n","

Copyright © 2023 Satish Chandra Gupta.

\n","

 CC BY-NC-SA 4.0 International License.

"]}],"metadata":{"colab":{"provenance":[{"file_id":"https://github.com/ml4devs/ml4devs-notebooks/blob/master/gpt/nlp_with_gpt_notebook.ipynb","timestamp":1701355922767},{"file_id":"https://github.com/ml4devs/ml4devs-notebooks/blob/master/gpt/nlp_with_gpt_notebook.ipynb","timestamp":1701352821148}]},"kernelspec":{"display_name":"kaggle","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.14"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /gpt/translate_natural_language_query_to_sql_notebook.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","metadata":{"id":"EV0cfuXJeO-8"},"source":["\"Open"]},{"cell_type":"markdown","metadata":{"id":"e3Id4cfBeO-_"},"source":["

Translate Natural Language Queries to SQL with GPT

\n","\n","

\n","
© Satish Chandra Gupta
\n","LinkedIn: scgupta,\n","Twitter: scgupta\n","
\n","

\n","\n","---\n","\n","## Setup Environment\n","\n","### Install Pip Packages\n","\n","You need Python 3.7 or higher to install [OpenAI Python API library](https://github.com/openai/openai-python)."]},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":770,"status":"ok","timestamp":1701355793267,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"iZkQzDDxeO_A","outputId":"4824a891-78f6-47d5-c78b-5f51c7566483","vscode":{"languageId":"shellscript"}},"outputs":[{"name":"stdout","output_type":"stream","text":["Python 3.10.14\n"]}],"source":["# You should have Python 3.7 or higher\n","\n","!python --version\n"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5781,"status":"ok","timestamp":1701355799502,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"1Y6r5ieIeO_C","outputId":"c6a9c2b8-42a5-4749-d018-df48a804223e","vscode":{"languageId":"shellscript"}},"outputs":[],"source":["!pip install openai python-dotenv SQLAlchemy &> /dev/null\n"]},{"cell_type":"markdown","metadata":{"id":"cYfbkKGPeO_D"},"source":["### Upload `.env` File with API Keys\n","\n","You can either use GPT directly from OpenAI, or you can use Azure OpenAI from Microsoft. You need to create a `.env` file and add the environment variables needed for OpenAI api.\n","\n","If you are using OpenAI, check your [OpenAI account](https://platform.openai.com/api-keys) for creating API key. Your `.env` file will look like following:\n","\n","```sh\n","$ cat .env\n","OPENAI_API_KEY='sk-YourOpenAiApiKeyHere'\n","```\n","\n","If you are using Microsoft Azure OpenAI:\n","- Go to [Azure Portal](https://portal.azure.com/) > **All Resources**\n","- Filter the list with Type == Azure OpenAI\n","- Select the one you plan to use\n","- If there are none, you can [create and deploy an Azure OpenAI Service resource](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource)\n","- Click on **Keys and Endpoint** on the left menu\n","- Get `AZURE_OPENAI_API_KEY` and `AZURE_OPENAI_ENDPOINT`\n","- Next click **Model deployments** on the left menu, and then click **Manage Deployment** button\n","- Alternatively, you can go to [Azure OpenAI Studio](https://oai.azure.com/), and click **Deployments** on the left menu\n","- Find (the latest) API version for [Azure OpenAI Service](https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#rest-api-versioning)\n","\n","Your `.env` file will look like following:\n","```sh\n","$ cat .env\n","AZURE_OPENAI_API_KEY=yourAzureOpenAiApiKey\n","AZURE_OPENAI_ENDPOINT=https://your-azure-deployment.openai.azure.com/\n","AZURE_OPENAI_DEPLOYMENT_ID=your-deployment-name\n","AZURE_OPENAI_API_VERSION=2023-10-01-preview\n","```\n","\n","Upload `.env` using Upload File button in Google Colab (or Jupyter Notebook). In worst case scenario, uncomment and modify the relevant lines in the following cell to create `.env` file. 
Please note that it is dangerous to share such notebooks or check them into git."]},{"cell_type":"code","execution_count":3,"metadata":{"id":"XKy_X1wdeO_D","vscode":{"languageId":"shellscript"}},"outputs":[],"source":["# Upload or create a .env file with (Azure) OpenAI API creds\n","\n","#!echo \"OPENAI_API_KEY=sk-YourOpenApiKeyHere\" >> .env\n","\n","#!echo \"AZURE_OPENAI_API_KEY=yourAzureOpenAiApiKey\" >> .env\n","#!echo \"AZURE_OPENAI_ENDPOINT=https://your-azure-deployment.openai.azure.com/\" >> .env\n","#!echo \"AZURE_OPENAI_DEPLOYMENT_ID=your-deployment-name\" >> .env\n","#!echo \"AZURE_OPENAI_API_VERSION=2023-10-01-preview\" >> .env\n"]},{"cell_type":"markdown","metadata":{"id":"KcPWLtAWeO_E"},"source":["### Load `.env` File and Specify (Azure) OpenAI GPT Model\n","\n","Load environment variables from `.env` file:"]},{"cell_type":"code","execution_count":4,"metadata":{"id":"tIT35MxGeO_F"},"outputs":[],"source":["from dotenv import load_dotenv, find_dotenv\n","\n","_ = load_dotenv(find_dotenv())\n"]},{"cell_type":"markdown","metadata":{"id":"cVN8Z7wQeO_F"},"source":["Set `IS_AZURE_OPENAI` flag to `True`, if you are using Azure OpenAI:"]},{"cell_type":"code","execution_count":5,"metadata":{"id":"KfXGuwQZeO_G"},"outputs":[],"source":["IS_AZURE_OPENAI: bool = False\n"]},{"cell_type":"markdown","metadata":{"id":"2mY86GV2eO_G"},"source":["Specify model name:"]},{"cell_type":"code","execution_count":6,"metadata":{"id":"GK1r2VFVeO_G"},"outputs":[],"source":["from datetime import datetime\n","\n","GPT35_TURBO: str = \"gpt-3.5-turbo-1106\" if datetime.now() < datetime(2023, 12, 11) else \"gpt-3.5-turbo\"\n","GPT4: str = \"gpt-4\"\n"]},{"cell_type":"markdown","metadata":{"id":"87QWD_kLeO_G"},"source":["---\n","\n","## Setup OpenAI Client with GPT Model"]},{"cell_type":"code","execution_count":7,"metadata":{"id":"J3gYa6czeO_H"},"outputs":[],"source":["import json\n","import os\n","import openai\n"]},{"cell_type":"code","execution_count":8,"metadata":{"id":"yTp0GKx9eO_H"},"outputs":[],"source":["def create_open_ai_client():\n"," if IS_AZURE_OPENAI:\n"," return openai.AzureOpenAI(\n"," api_key=os.getenv(\"AZURE_OPENAI_API_KEY\"),\n"," api_version=os.getenv(\"AZURE_OPENAI_API_VERSION\"),\n"," azure_endpoint=os.getenv(\"AZURE_OPENAI_ENDPOINT\"),\n"," azure_deployment=os.getenv(\"AZURE_OPENAI_DEPLOYMENT_ID\")\n"," )\n"," else:\n"," return openai.OpenAI(\n"," api_key=os.getenv('OPENAI_API_KEY')\n"," )\n"]},{"cell_type":"code","execution_count":9,"metadata":{"id":"Hes3w6mfeO_H"},"outputs":[],"source":["openai_client = create_open_ai_client()\n","openai_model = os.getenv(\"AZURE_OPENAI_DEPLOYMENT_ID\") if IS_AZURE_OPENAI else GPT4\n","\n","def get_gpt_response(messages, model=openai_model, temperature=0) -> dict:\n"," response = openai_client.chat.completions.create(\n"," model=model,\n"," #response_format={\"type\": \"json_object\"}, # Uncomment it if your chosen model supports it\n"," messages=messages,\n"," temperature=temperature,\n"," )\n"," response_str = response.choices[0].message.content\n","\n"," try:\n"," response_dict = json.loads(response_str)\n"," except json.JSONDecodeError:\n"," print(f\"Failed to decode response: {response_str}\")\n"," raise\n","\n"," return 
response_dict\n"]},{"cell_type":"code","execution_count":10,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":831,"status":"ok","timestamp":1701355800331,"user":{"displayName":"","userId":""},"user_tz":-330},"id":"5JrKbCv7eO_H","outputId":"0202742f-4049-4f17-f581-9d2508d021c8"},"outputs":[{"name":"stdout","output_type":"stream","text":["{'message': 'this is test'}\n"]}],"source":["print(get_gpt_response([\n"," {\"role\": \"user\", \"content\": \"Say this is test. Format response in JSON\"}\n","]))\n"]},{"cell_type":"markdown","metadata":{"id":"Mu09cyaLeO_I"},"source":["You are all set to use GPT for common NLP tasks such as Sentiment Analysis, Language Translation, Intent/Entity Recognition."]},{"cell_type":"markdown","metadata":{"id":"eOmv1Y7OeO_I"},"source":["---\n","\n","## Setup Database\n","\n","You need a dataset that you will query using natural language. You also need a SQL database that will host that dataset."]},{"cell_type":"markdown","metadata":{},"source":["### Example Dataset: DVD Rental\n","\n","Sakila example dataset is commonly used for teaching and testing RDBMS concept. It has data of fictitious DVD Rental Store. We will use [SQLite](https://www.sqlite.org/index.html) as the database. Python has [sqlite3](https://docs.python.org/3/library/sqlite3.html) package, so it does not require anything to installed and deployed locally or on cloud."]},{"cell_type":"markdown","metadata":{},"source":["1. Download the dataset using `curl` or `wget` command from [SQLite Tutorial](https://www.sqlitetutorial.net/sqlite-sample-database/). Alternatively, you can download from [Kaggle](https://www.kaggle.com/datasets/atanaskanev/sqlite-sakila-sample-database/data) too."]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["# Clear previously downloaded and unzipped files\n","\n","!rm -f ./chinook.zip ./chinook.db"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" % Total % Received % Xferd Average Speed Time Time Time Current\n"," Dload Upload Total Spent Left Speed\n","100 298k 100 298k 0 0 273k 0 0:00:01 0:00:01 --:--:-- 273k\n"]}],"source":["!curl -L0 https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip --output ./chinook.zip\n","\n","#!wget https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip\n"]},{"cell_type":"markdown","metadata":{},"source":["2. Unzip the db file"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Archive: chinook.zip\n"," inflating: chinook.db \n"]}],"source":["!unzip chinook.zip\n"]},{"cell_type":"markdown","metadata":{},"source":["3. The db file will be stored at `./chinook.db`. This is the path you will need when using `sqlite3` package."]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["-rw-r--r--@ 1 scgupta staff 884736 Nov 29 2015 ./chinook.db\n"]}],"source":["!ls -l ./chinook.db\n"]},{"cell_type":"markdown","metadata":{},"source":["4. 
Extract DB Metadata"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["import sqlite3\n"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[],"source":["DB_FILE_PATH = \"./chinook.db\"\n"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[],"source":["def extract_sqlite3_db_metadata(sqlite_db_file_path: str):\n"," db_metadata = {}\n","\n"," # Connect to the SQLite database\n"," conn = sqlite3.connect(sqlite_db_file_path)\n"," cursor = conn.cursor()\n","\n"," # Get a list of all tables in the database\n"," cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n"," tables = cursor.fetchall()\n","\n"," # Loop through each table and get its columns\n"," for table in tables:\n"," table_name = table[0]\n"," primary_keys = []\n"," foreign_keys = {}\n"," columns_info = {}\n","\n"," # Get table details\n"," cursor.execute(f\"PRAGMA table_info({table_name});\")\n"," columns = cursor.fetchall()\n","\n"," # Extract info about the columns of the current table\n"," for column in columns:\n"," column_name = column[1]\n"," column_type = column[2]\n"," is_primary = (column[5] == 1)\n","\n"," columns_info[column_name] = {\n"," \"type\": column_type,\n"," \"primary\": is_primary,\n"," \"foreign\": {}\n"," }\n","\n"," # Primary Keys\n"," primary_keys = [\n"," c_name\n"," for c_name, c_attrs in columns_info.items()\n"," if c_attrs[\"primary\"] == True\n"," ]\n","\n"," # Get foreign key details\n"," cursor.execute(f\"PRAGMA foreign_key_list({table_name});\")\n"," fk_constraints = cursor.fetchall()\n","\n"," for fk in fk_constraints:\n"," fk_constraint_id = fk[0]\n"," fk_to_table = fk[2]\n"," fk_from_column = fk[3]\n"," fk_to_column = fk[4]\n","\n"," fk_info = {\n"," \"constraint_id\": fk_constraint_id,\n"," \"to_table\": fk_to_table,\n"," \"to_column\": fk_to_column\n"," }\n"," foreign_keys[fk_from_column] = fk_info\n"," columns_info[fk_from_column][\"foreign\"] = fk_info\n","\n"," db_metadata[table_name] = {\n"," \"columns\": columns_info,\n"," \"primary_keys\": primary_keys,\n"," \"foreign_keys\": foreign_keys\n"," }\n","\n"," # Close the connection\n"," conn.close()\n","\n"," # Remove tables with names staring with \"sqlite\" as those are not part of applications\n"," tables_to_remove = [t for t in db_metadata if t.startswith(\"sqlite\")]\n"," for t in tables_to_remove:\n"," del db_metadata[t]\n","\n"," # Done!\n"," return db_metadata\n"]},{"cell_type":"markdown","metadata":{},"source":["5. 
Check out if the database metadata has been extracted correctly."]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[],"source":["def table_info_str(t_name, t_info) -> str:\n"," column_info_str = \"\\n \".join([\n"," f\"{c_name}: {c_info['type']}\"\n"," for c_name, c_info in t_info[\"columns\"].items()\n"," ])\n","\n"," primary_key_info_str = \"\"\n"," if len(t_info[\"primary_keys\"]) > 0:\n"," primary_key_info_str = f\"Primary Keys: {','.join(t_info['primary_keys'])}\"\n","\n"," foreign_key_info_str = \"\"\n"," if len(t_info[\"foreign_keys\"]) > 0:\n"," foreign_key_info_str = \"\\n Foreign Keys:\\n \" + \"\\n \".join([\n"," f\"{fk_from_col} => {fk_info['to_table']}.{fk_info['to_column']}\"\n"," for fk_from_col, fk_info in t_info[\"foreign_keys\"].items()\n"," ])\n","\n"," return f\"\"\"Table Name: {t_name}\n"," Columns:\n"," {column_info_str}\n"," \"\"\" + primary_key_info_str + foreign_key_info_str\n"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[],"source":["chinook_db_metadata = extract_sqlite3_db_metadata(DB_FILE_PATH)\n"]},{"cell_type":"code","execution_count":20,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Table Name: albums\n"," Columns:\n"," AlbumId: INTEGER\n"," Title: NVARCHAR(160)\n"," ArtistId: INTEGER\n"," Primary Keys: AlbumId\n"," Foreign Keys:\n"," ArtistId => artists.ArtistId\n","\n","Table Name: artists\n"," Columns:\n"," ArtistId: INTEGER\n"," Name: NVARCHAR(120)\n"," Primary Keys: ArtistId\n","\n","Table Name: customers\n"," Columns:\n"," CustomerId: INTEGER\n"," FirstName: NVARCHAR(40)\n"," LastName: NVARCHAR(20)\n"," Company: NVARCHAR(80)\n"," Address: NVARCHAR(70)\n"," City: NVARCHAR(40)\n"," State: NVARCHAR(40)\n"," Country: NVARCHAR(40)\n"," PostalCode: NVARCHAR(10)\n"," Phone: NVARCHAR(24)\n"," Fax: NVARCHAR(24)\n"," Email: NVARCHAR(60)\n"," SupportRepId: INTEGER\n"," Primary Keys: CustomerId\n"," Foreign Keys:\n"," SupportRepId => employees.EmployeeId\n","\n","Table Name: employees\n"," Columns:\n"," EmployeeId: INTEGER\n"," LastName: NVARCHAR(20)\n"," FirstName: NVARCHAR(20)\n"," Title: NVARCHAR(30)\n"," ReportsTo: INTEGER\n"," BirthDate: DATETIME\n"," HireDate: DATETIME\n"," Address: NVARCHAR(70)\n"," City: NVARCHAR(40)\n"," State: NVARCHAR(40)\n"," Country: NVARCHAR(40)\n"," PostalCode: NVARCHAR(10)\n"," Phone: NVARCHAR(24)\n"," Fax: NVARCHAR(24)\n"," Email: NVARCHAR(60)\n"," Primary Keys: EmployeeId\n"," Foreign Keys:\n"," ReportsTo => employees.EmployeeId\n","\n","Table Name: genres\n"," Columns:\n"," GenreId: INTEGER\n"," Name: NVARCHAR(120)\n"," Primary Keys: GenreId\n","\n","Table Name: invoices\n"," Columns:\n"," InvoiceId: INTEGER\n"," CustomerId: INTEGER\n"," InvoiceDate: DATETIME\n"," BillingAddress: NVARCHAR(70)\n"," BillingCity: NVARCHAR(40)\n"," BillingState: NVARCHAR(40)\n"," BillingCountry: NVARCHAR(40)\n"," BillingPostalCode: NVARCHAR(10)\n"," Total: NUMERIC(10,2)\n"," Primary Keys: InvoiceId\n"," Foreign Keys:\n"," CustomerId => customers.CustomerId\n","\n","Table Name: invoice_items\n"," Columns:\n"," InvoiceLineId: INTEGER\n"," InvoiceId: INTEGER\n"," TrackId: INTEGER\n"," UnitPrice: NUMERIC(10,2)\n"," Quantity: INTEGER\n"," Primary Keys: InvoiceLineId\n"," Foreign Keys:\n"," TrackId => tracks.TrackId\n"," InvoiceId => invoices.InvoiceId\n","\n","Table Name: media_types\n"," Columns:\n"," MediaTypeId: INTEGER\n"," Name: NVARCHAR(120)\n"," Primary Keys: MediaTypeId\n","\n","Table Name: playlists\n"," Columns:\n"," PlaylistId: INTEGER\n"," Name: NVARCHAR(120)\n"," Primary 
Keys: PlaylistId\n","\n","Table Name: playlist_track\n"," Columns:\n"," PlaylistId: INTEGER\n"," TrackId: INTEGER\n"," Primary Keys: PlaylistId\n"," Foreign Keys:\n"," TrackId => tracks.TrackId\n"," PlaylistId => playlists.PlaylistId\n","\n","Table Name: tracks\n"," Columns:\n"," TrackId: INTEGER\n"," Name: NVARCHAR(200)\n"," AlbumId: INTEGER\n"," MediaTypeId: INTEGER\n"," GenreId: INTEGER\n"," Composer: NVARCHAR(220)\n"," Milliseconds: INTEGER\n"," Bytes: INTEGER\n"," UnitPrice: NUMERIC(10,2)\n"," Primary Keys: TrackId\n"," Foreign Keys:\n"," MediaTypeId => media_types.MediaTypeId\n"," GenreId => genres.GenreId\n"," AlbumId => albums.AlbumId\n","\n"]}],"source":["for t_name, t_info in chinook_db_metadata.items():\n"," print(table_info_str(t_name, t_info))\n"," print()\n"]},{"cell_type":"markdown","metadata":{},"source":["---\n","\n","## Database Table Schema Documents\n","\n","GPT can create a SQL query only if it understands various tables and their columns. While creating the GPT prompt, you must include this info of relevant tables.\n","\n","The `CREATE TABLE` statement of [SQL DDL](https://en.wikipedia.org/wiki/Data_definition_language) captures all necessary info. Ideally, table description and column descriptions should also be captured as comments to assist document search and GPT.\n","\n","Let's create a mapping of table name and their `CREATE TABLE` statements."]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[],"source":["def create_table_ddl_stmt_str(t_name, t_info) -> str:\n"," column_defs = \",\\n \".join([\n"," f\"{c_name} \\t{c_info['type']}\"\n"," for c_name, c_info in t_info[\"columns\"].items()\n"," ])\n","\n"," primary_key_def = \"\"\n"," if len(t_info[\"primary_keys\"]) > 0:\n"," primary_key_def = f\",\\n\\n PRIMARY KEY ({', '.join(t_info['primary_keys'])})\"\n","\n"," foreign_key_def =\"\"\n"," if len(t_info[\"foreign_keys\"]) > 0:\n"," fk_stmts = \",\\n\".join([\n"," f\" FOREIGN KEY({fk_from_col}) REFERENCES {fk_info['to_table']}({fk_info['to_column']})\"\n"," for fk_from_col, fk_info in t_info[\"foreign_keys\"].items()\n"," ])\n"," foreign_key_def = f\",\\n\\n{fk_stmts}\"\n","\n"," return f\"\"\"CREATE TABLE {t_name} (\n"," {column_defs}{primary_key_def}{foreign_key_def}\n",");\"\"\"\n"]},{"cell_type":"markdown","metadata":{},"source":["Let's sequence the tables so that the definition of every table referred to in a `FOREIGN KEY` constraint comes before the constraint. 
While one can write code to analyze foreign key constraint graph and perform a topological sort to get a partial order, I decided to just hand code it as it does not have relevance for this tutorial.\n","\n","You can check the [Entity Relation Model](https://en.wikipedia.org/wiki/Entity%E2%80%93relationship_model) for all tables drawn using Crow's Foot notation:\n","\n","![](https://www.sqlitetutorial.net/wp-content/uploads/2015/11/sqlite-sample-database-color.jpg)"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[],"source":["# Table list in Topological Order for foreign key constraints\n","\n","chinook_db_table_names = [\n"," \"artists\", \"albums\",\n"," \"media_types\", \"genres\", \"tracks\",\n"," \"playlists\", \"playlist_track\",\n"," \"employees\",\n"," \"customers\", \"invoices\", \"invoice_items\"\n","]\n"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[],"source":["all_chinook_db_table_documents: dict[str, str] = {\n"," t_name: create_table_ddl_stmt_str(t_name, chinook_db_metadata[t_name])\n"," for t_name in chinook_db_table_names\n","}\n"]},{"cell_type":"code","execution_count":24,"metadata":{},"outputs":[],"source":["#for t_name in chinook_db_table_names:\n","# print(all_chinook_db_table_documents[t_name])\n","# print()\n"]},{"cell_type":"markdown","metadata":{},"source":["---\n","\n","## Natural Language Query to SQL\n","\n","General flow of building applications using Large Language Models (LLMs) and Retrieval Augmented Generation (RAG) has three parts:\n","\n","- **Embeddings**: Data Preprocessing\n"," - Break private data or documents into chunks\n"," - Convert chunks to vectors using an embedding model\n"," - Store vectors in a Vector DB\n","- **Retrieval**: Prompt Construction\n"," - Convert user query into a vector using the same embedding model\n"," - Search the Vector DB for chunk with similar embeddings and rank them\n"," - Craft a prompt using the user query and the document chunks found in the search\n","- **Inference**: Prompt Execution\n"," - Submit the prompt to a LLM\n"," - Post-process (check, augment) the LLM response\n"," - Send the response to the user\n","\n","For converting a natural language query to SQL, RAG pattern will translate to:\n","\n","- Embeddings:\n"," - Consider a `CREATE TABLE` statement for a table as one document chunk\n"," - Convert each `CREATE TABLE` statement to a vector embedding\n"," - Save (embedding, table name) mapping in a Vector DB\n","- Retrieval:\n"," - Convert incoming user query to a vector embedding\n"," - Search Vector DB and find tables with top similarity score\n"," - Craft a prompt using the user query and `CREATE TABLE` statements of all top-matching tables\n","- Inference:\n"," - Submit prompt to GPT to get the equivalent SQL\n"," - Execute the returned SQL on the database\n"," - Present the results to the user\n"]},{"cell_type":"markdown","metadata":{},"source":["### RAG: Vector DB Document Search\n","\n","For sake of simplicity, we will skip the embedding and Vector DB search. 
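For a larger schema, the skipped Embeddings/Retrieval steps could look roughly like the sketch below. It is illustrative only: the embedding model name and `top_k` are assumptions, the OpenAI client is created afresh here (assuming `OPENAI_API_KEY` is set, as for the chat calls), and a plain in-memory matrix stands in for a real Vector DB.

```python
import numpy as np
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment
EMBEDDING_MODEL = "text-embedding-3-small"  # illustrative model choice

def embed(texts: list[str]) -> np.ndarray:
    resp = client.embeddings.create(model=EMBEDDING_MODEL, input=texts)
    return np.array([d.embedding for d in resp.data])

# "Vector DB": one embedding per CREATE TABLE document, kept in memory.
doc_names = list(all_chinook_db_table_documents.keys())
doc_vectors = embed(list(all_chinook_db_table_documents.values()))

def find_tables_by_similarity(nl_query: str, top_k: int = 4) -> dict[str, str]:
    q = embed([nl_query])[0]
    # Cosine similarity between the query vector and every document vector.
    scores = (doc_vectors @ q) / (np.linalg.norm(doc_vectors, axis=1) * np.linalg.norm(q))
    top = np.argsort(scores)[::-1][:top_k]
    return {doc_names[i]: all_chinook_db_table_documents[doc_names[i]] for i in top}
```

A function like this could stand in for the pass-through `find_tables` defined below.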
Since there are only 11 tables, with not too many columns, we can send DDL for all tables in the prompt."]},{"cell_type":"code","execution_count":25,"metadata":{},"outputs":[],"source":["def find_tables(nl_query: str) -> dict[str, str]:\n"," # Bypassing\n"," # - Convert nl_query => embeddings\n"," # - Search Vector DB for documents (table's CREATE TABLE statement) with similar embeddings\n"," # - Return {table_name: document} mapping for all matching tables\n"," #\n"," # Instead return all documents\n","\n"," return all_chinook_db_table_documents\n"]},{"cell_type":"markdown","metadata":{},"source":["### Prompt Construction\n","\n","Craft a prompt using the user query and the documents returned from Vector DB search"]},{"cell_type":"code","execution_count":26,"metadata":{},"outputs":[],"source":["def nl2sql_system_prompt(documents: dict[str, str], sql_flavor: str = \"Python sqlite3\") -> str:\n"," metadata = \"\\n\".join([\n"," f\"# SQL DDL Schema for `{table_name}` table:```sql\\n{table_schema}```\\n\"\n"," for table_name, table_schema in documents.items()\n"," ])\n","\n"," system_prompt = f\"\"\"\n"," You are a data analyst and data engineer. You are an expert in writing SQL queries\n"," for {sql_flavor} database.\n","\n"," You have following tables in the database. The table name is in single backquote, and\n"," the DDL code to create that table with schema and metadata details are in triple backquote.\n","\n"," ### Database Table Schemas:\n"," \\n{metadata}\n"," ###\n","\n"," User ask you queries in natural language, and you job is to write equivalent\n"," SQL queries in following steps:\n"," 1. Identify the tables that have data relevant for the query\n"," 2. Identify relevant columns in those tables\n"," 3. Craft a SQL query that selects, filters, groups, joins in an optimal order\n"," that is equivalent to the user's natural language query.\n","\n"," Format your response as a JSON dictionary with following key, value:\n"," - tables: a dictionary with the name of relevant tables as keys, and the\n"," list of relevant columns in that as value.\n"," - sql: the sql query that you crafted.\n"," \"\"\"\n","\n"," return system_prompt\n"]},{"cell_type":"code","execution_count":27,"metadata":{},"outputs":[],"source":["def nl2sql_user_prompt(nl_query: str):\n"," return f\"Write a SQL that computes natural language query in triple backquotes: ```{nl_query}```\"\n"]},{"cell_type":"markdown","metadata":{},"source":["### Prompt Execution"]},{"cell_type":"code","execution_count":28,"metadata":{},"outputs":[],"source":["def write_sql_query(nl_query: str) -> dict:\n"," # Vectorize nl_query and find matching documents (tables and their DDL)\n"," documents = find_tables(nl_query)\n"," # Craft prompt using the natural language queries and matching documents\n"," system_prompt = nl2sql_system_prompt(documents)\n"," user_prompt = nl2sql_user_prompt(nl_query)\n","\n"," messages = [\n"," {\"role\": \"system\", \"content\": system_prompt},\n"," {\"role\": \"user\", \"content\": user_prompt}\n"," ]\n"," response_dict = get_gpt_response(messages)\n","\n"," return response_dict\n"]},{"cell_type":"markdown","metadata":{},"source":["### Post-processing: Execute SQL"]},{"cell_type":"code","execution_count":29,"metadata":{},"outputs":[],"source":["def execute_sql_query_on_sqlite3(sql_query: str):\n"," conn = sqlite3.connect(DB_FILE_PATH)\n"," cursor = conn.cursor()\n"," result = cursor.execute(sql_query)\n"," rows = result.fetchall()\n"," conn.close()\n","\n"," return 
rows\n"]},{"cell_type":"code","execution_count":30,"metadata":{},"outputs":[],"source":["import sqlalchemy\n"]},{"cell_type":"code","execution_count":31,"metadata":{},"outputs":[],"source":["sql_engine = sqlalchemy.create_engine(\n"," f\"sqlite:///{os.path.abspath(os.path.join(os.getcwd(), DB_FILE_PATH))}\",\n"," echo=True\n",")\n"]},{"cell_type":"code","execution_count":32,"metadata":{},"outputs":[],"source":["def execute_sql_query(connection, query):\n"," result_obj = connection.execute(sqlalchemy.text(query))\n"," return result_obj.fetchall()\n"]},{"cell_type":"code","execution_count":33,"metadata":{},"outputs":[],"source":["def execute_nl_query(nl_query: str):\n"," response = write_sql_query(nl_query)\n","\n"," #response[\"rows\"] = execute_sql_query_on_sqlite3(response[\"sql\"])\n"," with sql_engine.connect() as conn:\n"," response[\"rows\"] = execute_sql_query(conn, response[\"sql\"])\n","\n"," return response\n"]},{"cell_type":"markdown","metadata":{},"source":["### Try"]},{"cell_type":"code","execution_count":34,"metadata":{},"outputs":[],"source":["test_nl_queries = [\n"," \"Who is the artist with the most albums?\",\n"," \"List the top 3 tracks with maximum sale.\",\n"," \"Name the employee who supports maximum number of customers.\"\n","] \n"]},{"cell_type":"code","execution_count":35,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["2024-10-24 11:49:14,428 INFO sqlalchemy.engine.Engine BEGIN (implicit)\n","2024-10-24 11:49:14,430 INFO sqlalchemy.engine.Engine SELECT artists.Name, COUNT(albums.AlbumId) as AlbumCount FROM artists JOIN albums ON artists.ArtistId = albums.ArtistId GROUP BY artists.ArtistId ORDER BY AlbumCount DESC LIMIT 1;\n","2024-10-24 11:49:14,431 INFO sqlalchemy.engine.Engine [generated in 0.00248s] ()\n","2024-10-24 11:49:14,433 INFO sqlalchemy.engine.Engine ROLLBACK\n","2024-10-24 11:49:17,800 INFO sqlalchemy.engine.Engine BEGIN (implicit)\n","2024-10-24 11:49:17,802 INFO sqlalchemy.engine.Engine SELECT t.Name, SUM(ii.Quantity) as Total_Sales FROM invoice_items ii JOIN tracks t ON ii.TrackId = t.TrackId GROUP BY ii.TrackId ORDER BY Total_Sales DESC LIMIT 3\n","2024-10-24 11:49:17,802 INFO sqlalchemy.engine.Engine [generated in 0.00232s] ()\n","2024-10-24 11:49:17,807 INFO sqlalchemy.engine.Engine ROLLBACK\n","2024-10-24 11:49:20,924 INFO sqlalchemy.engine.Engine BEGIN (implicit)\n","2024-10-24 11:49:20,925 INFO sqlalchemy.engine.Engine SELECT e.FirstName, e.LastName FROM employees e WHERE e.EmployeeId = (SELECT c.SupportRepId FROM customers c GROUP BY c.SupportRepId ORDER BY COUNT(*) DESC LIMIT 1)\n","2024-10-24 11:49:20,925 INFO sqlalchemy.engine.Engine [generated in 0.00116s] ()\n","2024-10-24 11:49:20,926 INFO sqlalchemy.engine.Engine ROLLBACK\n"]}],"source":["results = []\n","for nl_q in test_nl_queries:\n"," response = execute_nl_query(nl_q)\n"," response[\"query\"] = nl_q\n"," results.append(response)\n"]},{"cell_type":"code","execution_count":36,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["User Query: Who is the artist with the most albums?\n"," Tables:\n"," {\"artists\": [\"ArtistId\", \"Name\"], \"albums\": [\"ArtistId\"]}\n"," SQL:\n"," SELECT artists.Name, COUNT(albums.AlbumId) as AlbumCount FROM artists JOIN albums ON artists.ArtistId = albums.ArtistId GROUP BY artists.ArtistId ORDER BY AlbumCount DESC LIMIT 1;\n"," Rows:\n"," [('Iron Maiden', 21)]\n","\n"," \n","User Query: List the top 3 tracks with maximum sale.\n"," Tables:\n"," {\"invoice_items\": [\"TrackId\", \"Quantity\"], \"tracks\": 
[\"TrackId\", \"Name\"]}\n"," SQL:\n"," SELECT t.Name, SUM(ii.Quantity) as Total_Sales FROM invoice_items ii JOIN tracks t ON ii.TrackId = t.TrackId GROUP BY ii.TrackId ORDER BY Total_Sales DESC LIMIT 3\n"," Rows:\n"," [('Balls to the Wall', 2), ('Inject The Venom', 2), ('Snowballed', 2)]\n","\n"," \n","User Query: Name the employee who supports maximum number of customers.\n"," Tables:\n"," {\"employees\": [\"EmployeeId\", \"FirstName\", \"LastName\"], \"customers\": [\"SupportRepId\"]}\n"," SQL:\n"," SELECT e.FirstName, e.LastName FROM employees e WHERE e.EmployeeId = (SELECT c.SupportRepId FROM customers c GROUP BY c.SupportRepId ORDER BY COUNT(*) DESC LIMIT 1)\n"," Rows:\n"," [('Jane', 'Peacock')]\n","\n"," \n"]}],"source":["for t in results:\n"," print(\n"," f\"\"\"User Query: {t['query']}\n"," Tables:\n"," {json.dumps(t['tables'])}\n"," SQL:\n"," {t['sql']}\n"," Rows:\n"," {str(t['rows'])}\\n\n"," \"\"\")\n"]},{"cell_type":"markdown","metadata":{},"source":["## Cleanup"]},{"cell_type":"code","execution_count":37,"metadata":{},"outputs":[],"source":["sql_engine.dispose()\n"]},{"cell_type":"markdown","metadata":{"id":"-_oyoZAkeO_L"},"source":["---\n","

Copyright © 2023 Satish Chandra Gupta.\n"," CC BY-NC-SA 4.0 International License.
"]}],"metadata":{"colab":{"provenance":[{"file_id":"https://github.com/ml4devs/ml4devs-notebooks/blob/master/gpt/nlp_with_gpt_notebook.ipynb","timestamp":1701355922767},{"file_id":"https://github.com/ml4devs/ml4devs-notebooks/blob/master/gpt/nlp_with_gpt_notebook.ipynb","timestamp":1701352821148}]},"kernelspec":{"display_name":"kaggle","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.14"}},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /speech/asr/deepspeech/mozilla_deepspeech_api_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "utrU3Ul7KK0G" 17 | }, 18 | "source": [ 19 | "

Python Speech-to-Text Transcriber with Mozilla Deepspeech\n", 20 | "\n", 21 | "\n", 22 | "© Satish Chandra Gupta\n", 23 | "LinkedIn: scgupta,\n", 24 | "Twitter: scgupta\n", 25 | "\n", 26 | "
\n", 27 | "\n", 28 | "---\n", 29 | "\n", 30 | "Blog post: [How to Build Python Transcriber Using Mozilla Deepspeech](https://www.ml4devs.com/articles/how-to-build-python-transcriber-using-mozilla-deepspeech/)\n", 31 | "\n", 32 | "Update: [Mozilla DeepSpeech](https://github.com/mozilla/DeepSpeech) is no longer maintaned, and its new home is [Coqui STT](https://github.com/coqui-ai/STT), which has same [APIs in C, Java, .NET, Python, and JavaScript](https://stt.readthedocs.io/) (and also appears that the team has moved too). This notebook is tested with the [Coqui STT 1.4.0](https://github.com/coqui-ai/STT/releases/tag/v1.4.0).\n", 33 | "\n", 34 | "From Colab menu, select: **Runtime** > **Change runtime type**, and verify that it is set to Python3, and select GPU if you want to try out GPU version.\n", 35 | "\n", 36 | "You can [pip-install Coqui STT](https://pypi.org/project/stt/):" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "metadata": { 43 | "colab": { 44 | "base_uri": "https://localhost:8080/" 45 | }, 46 | "id": "iemeuv-jKR3P", 47 | "outputId": "ae86dd81-81c2-4417-e76f-178b3e1150ad" 48 | }, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "Python 3.7.15\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "!python --version\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": { 66 | "colab": { 67 | "base_uri": "https://localhost:8080/" 68 | }, 69 | "id": "zABV65yhNJ0M", 70 | "outputId": "8023c889-f59a-44a9-99e0-702e94416ebf" 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 78 | "Requirement already satisfied: stt==1.4.0 in /usr/local/lib/python3.7/dist-packages (1.4.0)\n", 79 | "Requirement already satisfied: numpy>=1.14.5 in /usr/local/lib/python3.7/dist-packages (from stt==1.4.0) (1.21.6)\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "!pip install stt==1.4.0\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "id": "lbWIPOUwNVyI" 91 | }, 92 | "source": [ 93 | "## Download Models and Audio Files\n", 94 | "\n", 95 | "Mozilla has released models for US English, we will use those in this code lab.\n", 96 | "\n", 97 | "1. **Download the models:**\n", 98 | "Models can be downloaded from [Coqui Model repository](https://coqui.ai/models), for example, [English STT v1.0.0 (Large Vocabulary)](https://coqui.ai/english/coqui/v1.0.0-large-vocab) that is used here." 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 3, 104 | "metadata": { 105 | "id": "-Z0dSoLJPKKY" 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "!mkdir coqui-stt-1.0.0-models\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": { 116 | "colab": { 117 | "base_uri": "https://localhost:8080/" 118 | }, 119 | "id": "cF1uB0zSNk-O", 120 | "outputId": "11c223bb-f1f3-43d5-f064-475b254f8bfc" 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "--2022-11-01 08:48:10-- https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/model.tflite\n", 128 | "Resolving coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)... 54.70.21.136, 35.155.221.103\n", 129 | "Connecting to coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)|54.70.21.136|:443... connected.\n", 130 | "HTTP request sent, awaiting response... 
302 Found\n", 131 | "Location: https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/model.tflite [following]\n", 132 | "--2022-11-01 08:48:10-- https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/model.tflite\n", 133 | "Resolving github.com (github.com)... 140.82.113.3\n", 134 | "Connecting to github.com (github.com)|140.82.113.3|:443... connected.\n", 135 | "HTTP request sent, awaiting response... 302 Found\n", 136 | "Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/e6d0f95f-97dc-43ac-ac08-38660209ebbc?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084810Z&X-Amz-Expires=300&X-Amz-Signature=d7092fcaf803d854ad039a5acba46750ea0aee7c807a1673d6583d9ab4debe8f&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3Dmodel.tflite&response-content-type=application%2Foctet-stream [following]\n", 137 | "--2022-11-01 08:48:10-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/e6d0f95f-97dc-43ac-ac08-38660209ebbc?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084810Z&X-Amz-Expires=300&X-Amz-Signature=d7092fcaf803d854ad039a5acba46750ea0aee7c807a1673d6583d9ab4debe8f&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3Dmodel.tflite&response-content-type=application%2Foctet-stream\n", 138 | "Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...\n", 139 | "Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.109.133|:443... connected.\n", 140 | "HTTP request sent, awaiting response... 200 OK\n", 141 | "Length: 47332120 (45M) [application/octet-stream]\n", 142 | "Saving to: ‘model.tflite’\n", 143 | "\n", 144 | "model.tflite 100%[===================>] 45.14M 87.1MB/s in 0.5s \n", 145 | "\n", 146 | "2022-11-01 08:48:11 (87.1 MB/s) - ‘model.tflite’ saved [47332120/47332120]\n", 147 | "\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "!wget https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/model.tflite\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 5, 158 | "metadata": { 159 | "colab": { 160 | "base_uri": "https://localhost:8080/" 161 | }, 162 | "id": "mwqvk3jUFblh", 163 | "outputId": "b82099d7-9a54-4064-efeb-f868acc6f7ad" 164 | }, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "--2022-11-01 08:48:11-- https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/large_vocabulary.scorer\n", 171 | "Resolving coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)... 54.70.21.136, 35.155.221.103\n", 172 | "Connecting to coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)|54.70.21.136|:443... connected.\n", 173 | "HTTP request sent, awaiting response... 302 Found\n", 174 | "Location: https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/large_vocabulary.scorer [following]\n", 175 | "--2022-11-01 08:48:11-- https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/large_vocabulary.scorer\n", 176 | "Resolving github.com (github.com)... 
140.82.112.3\n", 177 | "Connecting to github.com (github.com)|140.82.112.3|:443... connected.\n", 178 | "HTTP request sent, awaiting response... 302 Found\n", 179 | "Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/1df256c5-336b-424b-b7b9-a33d8262eb24?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084812Z&X-Amz-Expires=300&X-Amz-Signature=8c3f53e036ebc62959f82f7ed932926af817fe3648c7ee8a9c25d52e50365cf3&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3Dlarge_vocabulary.scorer&response-content-type=application%2Foctet-stream [following]\n", 180 | "--2022-11-01 08:48:12-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/1df256c5-336b-424b-b7b9-a33d8262eb24?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084812Z&X-Amz-Expires=300&X-Amz-Signature=8c3f53e036ebc62959f82f7ed932926af817fe3648c7ee8a9c25d52e50365cf3&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3Dlarge_vocabulary.scorer&response-content-type=application%2Foctet-stream\n", 181 | "Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", 182 | "Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443... connected.\n", 183 | "HTTP request sent, awaiting response... 200 OK\n", 184 | "Length: 132644544 (126M) [application/octet-stream]\n", 185 | "Saving to: ‘large_vocabulary.scorer’\n", 186 | "\n", 187 | "large_vocabulary.sc 100%[===================>] 126.50M 94.8MB/s in 1.3s \n", 188 | "\n", 189 | "2022-11-01 08:48:13 (94.8 MB/s) - ‘large_vocabulary.scorer’ saved [132644544/132644544]\n", 190 | "\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "!wget https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/large_vocabulary.scorer\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 6, 201 | "metadata": { 202 | "colab": { 203 | "base_uri": "https://localhost:8080/" 204 | }, 205 | "id": "55gHX_5zP9JY", 206 | "outputId": "582181eb-373a-4f28-ba71-db0d2a508228" 207 | }, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "--2022-11-01 08:48:13-- https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/alphabet.txt\n", 214 | "Resolving coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)... 54.70.21.136, 35.155.221.103\n", 215 | "Connecting to coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)|54.70.21.136|:443... connected.\n", 216 | "HTTP request sent, awaiting response... 302 Found\n", 217 | "Location: https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/alphabet.txt [following]\n", 218 | "--2022-11-01 08:48:13-- https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/alphabet.txt\n", 219 | "Resolving github.com (github.com)... 140.82.112.3\n", 220 | "Connecting to github.com (github.com)|140.82.112.3|:443... connected.\n", 221 | "HTTP request sent, awaiting response... 
302 Found\n", 222 | "Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/17a8ffed-fd5a-4225-bb12-884c66c87c62?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084814Z&X-Amz-Expires=300&X-Amz-Signature=8774cf570daf9851962725458248903ce58515741f34f5e61ac9f5292599a002&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3Dalphabet.txt&response-content-type=application%2Foctet-stream [following]\n", 223 | "--2022-11-01 08:48:14-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/17a8ffed-fd5a-4225-bb12-884c66c87c62?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084814Z&X-Amz-Expires=300&X-Amz-Signature=8774cf570daf9851962725458248903ce58515741f34f5e61ac9f5292599a002&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3Dalphabet.txt&response-content-type=application%2Foctet-stream\n", 224 | "Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", 225 | "Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.111.133|:443... connected.\n", 226 | "HTTP request sent, awaiting response... 200 OK\n", 227 | "Length: 329 [application/octet-stream]\n", 228 | "Saving to: ‘alphabet.txt’\n", 229 | "\n", 230 | "alphabet.txt 100%[===================>] 329 --.-KB/s in 0s \n", 231 | "\n", 232 | "2022-11-01 08:48:14 (7.53 MB/s) - ‘alphabet.txt’ saved [329/329]\n", 233 | "\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "!wget https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/alphabet.txt\n" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 7, 244 | "metadata": { 245 | "colab": { 246 | "base_uri": "https://localhost:8080/" 247 | }, 248 | "id": "6RLXmm1FQABI", 249 | "outputId": "b0d0ba6d-cba3-46b1-ae0b-018db6b9ba3f" 250 | }, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "--2022-11-01 08:48:14-- https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/MODEL_CARD\n", 257 | "Resolving coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)... 54.70.21.136, 35.155.221.103\n", 258 | "Connecting to coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)|54.70.21.136|:443... connected.\n", 259 | "HTTP request sent, awaiting response... 302 Found\n", 260 | "Location: https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/MODEL_CARD [following]\n", 261 | "--2022-11-01 08:48:14-- https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/MODEL_CARD\n", 262 | "Resolving github.com (github.com)... 140.82.112.3\n", 263 | "Connecting to github.com (github.com)|140.82.112.3|:443... connected.\n", 264 | "HTTP request sent, awaiting response... 
302 Found\n", 265 | "Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/b03c95a9-30e2-420d-b07e-413b44525bf0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084814Z&X-Amz-Expires=300&X-Amz-Signature=0db67a4c622257f9e0519088f224f6b6ad96376357365549458c93ef7a804a15&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3DMODEL_CARD&response-content-type=application%2Foctet-stream [following]\n", 266 | "--2022-11-01 08:48:14-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/b03c95a9-30e2-420d-b07e-413b44525bf0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084814Z&X-Amz-Expires=300&X-Amz-Signature=0db67a4c622257f9e0519088f224f6b6ad96376357365549458c93ef7a804a15&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3DMODEL_CARD&response-content-type=application%2Foctet-stream\n", 267 | "Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", 268 | "Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.111.133|:443... connected.\n", 269 | "HTTP request sent, awaiting response... 200 OK\n", 270 | "Length: 4244 (4.1K) [application/octet-stream]\n", 271 | "Saving to: ‘MODEL_CARD’\n", 272 | "\n", 273 | "MODEL_CARD 100%[===================>] 4.14K --.-KB/s in 0s \n", 274 | "\n", 275 | "2022-11-01 08:48:14 (39.3 MB/s) - ‘MODEL_CARD’ saved [4244/4244]\n", 276 | "\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 | "!wget https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/MODEL_CARD\n" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 8, 287 | "metadata": { 288 | "colab": { 289 | "base_uri": "https://localhost:8080/" 290 | }, 291 | "id": "FOv1BfpaQB6S", 292 | "outputId": "a42f3fd0-c5ca-4e81-af6e-dd83aaf3eb47" 293 | }, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "--2022-11-01 08:48:15-- https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/LOG_TESTING\n", 300 | "Resolving coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)... 54.70.21.136, 35.155.221.103\n", 301 | "Connecting to coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)|54.70.21.136|:443... connected.\n", 302 | "HTTP request sent, awaiting response... 302 Found\n", 303 | "Location: https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/LOG_TESTING [following]\n", 304 | "--2022-11-01 08:48:15-- https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/LOG_TESTING\n", 305 | "Resolving github.com (github.com)... 140.82.112.3\n", 306 | "Connecting to github.com (github.com)|140.82.112.3|:443... connected.\n", 307 | "HTTP request sent, awaiting response... 
302 Found\n", 308 | "Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/f33b2c5a-c27e-47b1-9870-4f2a190a4a83?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084815Z&X-Amz-Expires=300&X-Amz-Signature=98154462951673f376188791445b1886a32b40f6a686e289c8c124fad0afa4f2&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3DLOG_TESTING&response-content-type=application%2Foctet-stream [following]\n", 309 | "--2022-11-01 08:48:15-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/f33b2c5a-c27e-47b1-9870-4f2a190a4a83?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084815Z&X-Amz-Expires=300&X-Amz-Signature=98154462951673f376188791445b1886a32b40f6a686e289c8c124fad0afa4f2&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3DLOG_TESTING&response-content-type=application%2Foctet-stream\n", 310 | "Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", 311 | "Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.111.133|:443... connected.\n", 312 | "HTTP request sent, awaiting response... 200 OK\n", 313 | "Length: 25391 (25K) [application/octet-stream]\n", 314 | "Saving to: ‘LOG_TESTING’\n", 315 | "\n", 316 | "LOG_TESTING 100%[===================>] 24.80K --.-KB/s in 0.001s \n", 317 | "\n", 318 | "2022-11-01 08:48:15 (19.6 MB/s) - ‘LOG_TESTING’ saved [25391/25391]\n", 319 | "\n" 320 | ] 321 | } 322 | ], 323 | "source": [ 324 | "!wget https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/LOG_TESTING\n" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 9, 330 | "metadata": { 331 | "colab": { 332 | "base_uri": "https://localhost:8080/" 333 | }, 334 | "id": "WPBYunv5QC_A", 335 | "outputId": "412e8f27-3fa0-4f29-d494-97d55cffc166" 336 | }, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "--2022-11-01 08:48:15-- https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/LICENSE\n", 343 | "Resolving coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)... 54.70.21.136, 35.155.221.103\n", 344 | "Connecting to coqui.gateway.scarf.sh (coqui.gateway.scarf.sh)|54.70.21.136|:443... connected.\n", 345 | "HTTP request sent, awaiting response... 302 Found\n", 346 | "Location: https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/LICENSE [following]\n", 347 | "--2022-11-01 08:48:16-- https://github.com/coqui-ai/STT-models/releases/download/english/coqui/v1.0.0-large-vocab/LICENSE\n", 348 | "Resolving github.com (github.com)... 140.82.112.3\n", 349 | "Connecting to github.com (github.com)|140.82.112.3|:443... connected.\n", 350 | "HTTP request sent, awaiting response... 
302 Found\n", 351 | "Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/dc69c571-83ca-48c1-9b31-408e9be73bc1?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084816Z&X-Amz-Expires=300&X-Amz-Signature=ded931fa9d8b592ee69283a4664a5c4a88b447e2dd65f3446426f5eff806d1b7&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3DLICENSE&response-content-type=application%2Foctet-stream [following]\n", 352 | "--2022-11-01 08:48:16-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/351871871/dc69c571-83ca-48c1-9b31-408e9be73bc1?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20221101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20221101T084816Z&X-Amz-Expires=300&X-Amz-Signature=ded931fa9d8b592ee69283a4664a5c4a88b447e2dd65f3446426f5eff806d1b7&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=351871871&response-content-disposition=attachment%3B%20filename%3DLICENSE&response-content-type=application%2Foctet-stream\n", 353 | "Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...\n", 354 | "Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.111.133|:443... connected.\n", 355 | "HTTP request sent, awaiting response... 200 OK\n", 356 | "Length: 11358 (11K) [application/octet-stream]\n", 357 | "Saving to: ‘LICENSE’\n", 358 | "\n", 359 | "LICENSE 100%[===================>] 11.09K --.-KB/s in 0s \n", 360 | "\n", 361 | "2022-11-01 08:48:16 (77.9 MB/s) - ‘LICENSE’ saved [11358/11358]\n", 362 | "\n" 363 | ] 364 | } 365 | ], 366 | "source": [ 367 | "!wget https://coqui.gateway.scarf.sh/english/coqui/v1.0.0-large-vocab/LICENSE\n" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 10, 373 | "metadata": { 374 | "id": "dmxn7NKYGqRR" 375 | }, 376 | "outputs": [], 377 | "source": [ 378 | "!mv model.tflite large_vocabulary.scorer alphabet.txt MODEL_CARD LOG_TESTING LICENSE coqui-stt-1.0.0-models/\n" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 11, 384 | "metadata": { 385 | "colab": { 386 | "base_uri": "https://localhost:8080/" 387 | }, 388 | "id": "flAYDslHHVZK", 389 | "outputId": "d88df0bd-8021-404b-822e-bf39b9e5b002" 390 | }, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "total 175816\n", 397 | "-rw-r--r-- 1 root root 329 Dec 7 2021 alphabet.txt\n", 398 | "-rw-r--r-- 1 root root 132644544 Dec 7 2021 large_vocabulary.scorer\n", 399 | "-rw-r--r-- 1 root root 11358 Dec 7 2021 LICENSE\n", 400 | "-rw-r--r-- 1 root root 25391 Dec 7 2021 LOG_TESTING\n", 401 | "-rw-r--r-- 1 root root 4244 Dec 7 2021 MODEL_CARD\n", 402 | "-rw-r--r-- 1 root root 47332120 Dec 7 2021 model.tflite\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "!ls -l coqui-stt-1.0.0-models\n" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": { 413 | "id": "_g5hZVWXO1wl" 414 | }, 415 | "source": [ 416 | "2. 
**Download audio data files**" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 12, 422 | "metadata": { 423 | "colab": { 424 | "base_uri": "https://localhost:8080/" 425 | }, 426 | "id": "GdOxbBycM-Hf", 427 | "outputId": "d213f2ce-013b-4746-a59b-9e67fd0a3c57" 428 | }, 429 | "outputs": [ 430 | { 431 | "name": "stdout", 432 | "output_type": "stream", 433 | "text": [ 434 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 435 | " Dload Upload Total Spent Left Speed\n", 436 | " 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\n", 437 | "100 193k 100 193k 0 0 609k 0 --:--:-- --:--:-- --:--:-- 609k\n" 438 | ] 439 | } 440 | ], 441 | "source": [ 442 | "!curl -LO https://github.com/coqui-ai/STT/releases/download/v1.4.0/audio-1.4.0.tar.gz\n" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": { 448 | "id": "m4yZaht2PH_5" 449 | }, 450 | "source": [ 451 | "4. **Unzip audio files**" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 13, 457 | "metadata": { 458 | "colab": { 459 | "base_uri": "https://localhost:8080/" 460 | }, 461 | "id": "82DLg4JpPOVX", 462 | "outputId": "84c10407-6370-47c7-d721-ba73faba1537" 463 | }, 464 | "outputs": [ 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "._audio\n", 470 | "tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'\n", 471 | "audio/\n", 472 | "audio/._2830-3980-0043.wav\n", 473 | "tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'\n", 474 | "audio/2830-3980-0043.wav\n", 475 | "audio/._Attribution.txt\n", 476 | "tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'\n", 477 | "audio/Attribution.txt\n", 478 | "audio/._4507-16021-0012.wav\n", 479 | "tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'\n", 480 | "audio/4507-16021-0012.wav\n", 481 | "audio/._8455-210777-0068.wav\n", 482 | "tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'\n", 483 | "audio/8455-210777-0068.wav\n", 484 | "audio/._License.txt\n", 485 | "tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'\n", 486 | "audio/License.txt\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "!tar -xvzf audio-1.4.0.tar.gz\n" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 14, 497 | "metadata": { 498 | "colab": { 499 | "base_uri": "https://localhost:8080/" 500 | }, 501 | "id": "zExydudVPU4r", 502 | "outputId": "19830ed2-cfd2-4eff-8f6a-c2a2338ec33c" 503 | }, 504 | "outputs": [ 505 | { 506 | "name": "stdout", 507 | "output_type": "stream", 508 | "text": [ 509 | "total 260\n", 510 | "-rw-r--r-- 1 501 staff 63244 Nov 18 2017 2830-3980-0043.wav\n", 511 | "-rw-r--r-- 1 501 staff 87564 Nov 18 2017 4507-16021-0012.wav\n", 512 | "-rw-r--r-- 1 501 staff 82924 Nov 18 2017 8455-210777-0068.wav\n", 513 | "-rw-r--r-- 1 501 staff 340 May 14 2018 Attribution.txt\n", 514 | "-rw-r--r-- 1 501 staff 18652 May 12 2018 License.txt\n" 515 | ] 516 | } 517 | ], 518 | "source": [ 519 | "!ls -l ./audio/\n" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "id": "dIiwclaXPzfm" 526 | }, 527 | "source": [ 528 | "5. 
**Test that it all works**" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 15, 534 | "metadata": { 535 | "colab": { 536 | "base_uri": "https://localhost:8080/" 537 | }, 538 | "id": "ZJlSpiHSPebH", 539 | "outputId": "b7cd1790-a7c3-4fc9-df9f-50abbd09e1a3" 540 | }, 541 | "outputs": [ 542 | { 543 | "name": "stdout", 544 | "output_type": "stream", 545 | "text": [ 546 | "usage: stt [-h] --model MODEL [--scorer SCORER] --audio AUDIO\n", 547 | " [--beam_width BEAM_WIDTH] [--lm_alpha LM_ALPHA] [--lm_beta LM_BETA]\n", 548 | " [--version] [--extended] [--json]\n", 549 | " [--candidate_transcripts CANDIDATE_TRANSCRIPTS]\n", 550 | " [--hot_words HOT_WORDS]\n", 551 | "\n", 552 | "Running Coqui STT inference.\n", 553 | "\n", 554 | "optional arguments:\n", 555 | " -h, --help show this help message and exit\n", 556 | " --model MODEL Path to the model (protocol buffer binary file)\n", 557 | " --scorer SCORER Path to the external scorer file\n", 558 | " --audio AUDIO Path to the audio file to run (WAV format)\n", 559 | " --beam_width BEAM_WIDTH\n", 560 | " Beam width for the CTC decoder\n", 561 | " --lm_alpha LM_ALPHA Language model weight (lm_alpha). If not specified,\n", 562 | " use default from the scorer package.\n", 563 | " --lm_beta LM_BETA Word insertion bonus (lm_beta). If not specified, use\n", 564 | " default from the scorer package.\n", 565 | " --version Print version and exits\n", 566 | " --extended Output string from extended metadata\n", 567 | " --json Output json from metadata with timestamp of each word\n", 568 | " --candidate_transcripts CANDIDATE_TRANSCRIPTS\n", 569 | " Number of candidate transcripts to include in JSON\n", 570 | " output\n", 571 | " --hot_words HOT_WORDS\n", 572 | " Hot-words and their boosts.\n" 573 | ] 574 | } 575 | ], 576 | "source": [ 577 | "!stt --help\n" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 16, 583 | "metadata": { 584 | "colab": { 585 | "base_uri": "https://localhost:8080/" 586 | }, 587 | "id": "wCn27zQOOM3Y", 588 | "outputId": "6d5fe457-cf6a-4009-9b3a-d255571264a4" 589 | }, 590 | "outputs": [ 591 | { 592 | "name": "stdout", 593 | "output_type": "stream", 594 | "text": [ 595 | "Loading model from file coqui-stt-1.0.0-models/model.tflite\n", 596 | "TensorFlow: v2.9.1-11-gf8242ebc005\n", 597 | " Coqui STT: v1.4.0-0-gfcec06bd\n", 598 | "INFO: Created TensorFlow Lite XNNPACK delegate for CPU.\n", 599 | "Loaded model in 0.00167s.\n", 600 | "Loading scorer from files coqui-stt-1.0.0-models/large_vocabulary.scorer\n", 601 | "Loaded scorer in 0.000265s.\n", 602 | "Running inference.\n", 603 | "experience proves this\n", 604 | "Inference took 0.795s for 1.975s audio file.\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "!stt --model coqui-stt-1.0.0-models/model.tflite --scorer coqui-stt-1.0.0-models/large_vocabulary.scorer --audio ./audio/2830-3980-0043.wav\n" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 17, 615 | "metadata": { 616 | "colab": { 617 | "base_uri": "https://localhost:8080/" 618 | }, 619 | "id": "V-kHdWHZPHyG", 620 | "outputId": "6193a383-fbbd-4280-92ef-854098810687" 621 | }, 622 | "outputs": [ 623 | { 624 | "name": "stdout", 625 | "output_type": "stream", 626 | "text": [ 627 | "Loading model from file coqui-stt-1.0.0-models/model.tflite\n", 628 | "TensorFlow: v2.9.1-11-gf8242ebc005\n", 629 | " Coqui STT: v1.4.0-0-gfcec06bd\n", 630 | "INFO: Created TensorFlow Lite XNNPACK delegate for CPU.\n", 631 | "Loaded model in 0.00149s.\n", 632 | "Loading scorer from 
files coqui-stt-1.0.0-models/large_vocabulary.scorer\n", 633 | "Loaded scorer in 0.000223s.\n", 634 | "Running inference.\n", 635 | "why should one halt on the way\n", 636 | "Inference took 0.910s for 2.735s audio file.\n" 637 | ] 638 | } 639 | ], 640 | "source": [ 641 | "!stt --model coqui-stt-1.0.0-models/model.tflite --scorer coqui-stt-1.0.0-models/large_vocabulary.scorer --audio ./audio/4507-16021-0012.wav\n" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 18, 647 | "metadata": { 648 | "colab": { 649 | "base_uri": "https://localhost:8080/" 650 | }, 651 | "id": "2lZBwBRDPHtu", 652 | "outputId": "347ed959-e590-4de3-e763-f920221095fd" 653 | }, 654 | "outputs": [ 655 | { 656 | "name": "stdout", 657 | "output_type": "stream", 658 | "text": [ 659 | "Loading model from file coqui-stt-1.0.0-models/model.tflite\n", 660 | "TensorFlow: v2.9.1-11-gf8242ebc005\n", 661 | " Coqui STT: v1.4.0-0-gfcec06bd\n", 662 | "INFO: Created TensorFlow Lite XNNPACK delegate for CPU.\n", 663 | "Loaded model in 0.00148s.\n", 664 | "Loading scorer from files coqui-stt-1.0.0-models/large_vocabulary.scorer\n", 665 | "Loaded scorer in 0.000231s.\n", 666 | "Running inference.\n", 667 | "your power is sufficient i said\n", 668 | "Inference took 0.889s for 2.590s audio file.\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "!stt --model coqui-stt-1.0.0-models/model.tflite --scorer coqui-stt-1.0.0-models/large_vocabulary.scorer --audio ./audio/8455-210777-0068.wav\n" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": { 679 | "id": "_4m0A8-VTPQG" 680 | }, 681 | "source": [ 682 | "Examine the output of the last three commands, and you will see results “experience proof this, “why should one halt on the way”, and “your power is sufficient i said” respectively. 
You are all set.\n", 683 | "\n", 684 | "If you want the breakup and timestamp, you can use `--json` flag:" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 19, 690 | "metadata": { 691 | "colab": { 692 | "base_uri": "https://localhost:8080/" 693 | }, 694 | "id": "DohT6qE5SvXF", 695 | "outputId": "0721c87b-2d9b-4e91-ee0a-d0ceb41509b6" 696 | }, 697 | "outputs": [ 698 | { 699 | "name": "stdout", 700 | "output_type": "stream", 701 | "text": [ 702 | "Loading model from file coqui-stt-1.0.0-models/model.tflite\n", 703 | "TensorFlow: v2.9.1-11-gf8242ebc005\n", 704 | " Coqui STT: v1.4.0-0-gfcec06bd\n", 705 | "INFO: Created TensorFlow Lite XNNPACK delegate for CPU.\n", 706 | "Loaded model in 0.00152s.\n", 707 | "Loading scorer from files coqui-stt-1.0.0-models/large_vocabulary.scorer\n", 708 | "Loaded scorer in 0.000266s.\n", 709 | "Running inference.\n", 710 | "{\n", 711 | " \"transcripts\": [\n", 712 | " {\n", 713 | " \"confidence\": -31.462177276611328,\n", 714 | " \"words\": [\n", 715 | " {\n", 716 | " \"word\": \"your\",\n", 717 | " \"start_time\": 0.72,\n", 718 | " \"duration\": 0.2\n", 719 | " },\n", 720 | " {\n", 721 | " \"word\": \"power\",\n", 722 | " \"start_time\": 0.98,\n", 723 | " \"duration\": 0.2\n", 724 | " },\n", 725 | " {\n", 726 | " \"word\": \"is\",\n", 727 | " \"start_time\": 1.28,\n", 728 | " \"duration\": 0.1\n", 729 | " },\n", 730 | " {\n", 731 | " \"word\": \"sufficient\",\n", 732 | " \"start_time\": 1.44,\n", 733 | " \"duration\": 0.36\n", 734 | " },\n", 735 | " {\n", 736 | " \"word\": \"i\",\n", 737 | " \"start_time\": 1.92,\n", 738 | " \"duration\": 0.12\n", 739 | " },\n", 740 | " {\n", 741 | " \"word\": \"said\",\n", 742 | " \"start_time\": 2.1,\n", 743 | " \"duration\": 0.08\n", 744 | " }\n", 745 | " ]\n", 746 | " },\n", 747 | " {\n", 748 | " \"confidence\": -36.81807327270508,\n", 749 | " \"words\": [\n", 750 | " {\n", 751 | " \"word\": \"our\",\n", 752 | " \"start_time\": 0.76,\n", 753 | " \"duration\": 0.16\n", 754 | " },\n", 755 | " {\n", 756 | " \"word\": \"power\",\n", 757 | " \"start_time\": 0.98,\n", 758 | " \"duration\": 0.2\n", 759 | " },\n", 760 | " {\n", 761 | " \"word\": \"is\",\n", 762 | " \"start_time\": 1.28,\n", 763 | " \"duration\": 0.1\n", 764 | " },\n", 765 | " {\n", 766 | " \"word\": \"sufficient\",\n", 767 | " \"start_time\": 1.44,\n", 768 | " \"duration\": 0.36\n", 769 | " },\n", 770 | " {\n", 771 | " \"word\": \"i\",\n", 772 | " \"start_time\": 1.92,\n", 773 | " \"duration\": 0.12\n", 774 | " },\n", 775 | " {\n", 776 | " \"word\": \"said\",\n", 777 | " \"start_time\": 2.1,\n", 778 | " \"duration\": 0.08\n", 779 | " }\n", 780 | " ]\n", 781 | " },\n", 782 | " {\n", 783 | " \"confidence\": -37.49082565307617,\n", 784 | " \"words\": [\n", 785 | " {\n", 786 | " \"word\": \"your\",\n", 787 | " \"start_time\": 0.72,\n", 788 | " \"duration\": 0.2\n", 789 | " },\n", 790 | " {\n", 791 | " \"word\": \"power\",\n", 792 | " \"start_time\": 0.98,\n", 793 | " \"duration\": 0.2\n", 794 | " },\n", 795 | " {\n", 796 | " \"word\": \"is\",\n", 797 | " \"start_time\": 1.28,\n", 798 | " \"duration\": 0.1\n", 799 | " },\n", 800 | " {\n", 801 | " \"word\": \"sufficient\",\n", 802 | " \"start_time\": 1.44,\n", 803 | " \"duration\": 0.36\n", 804 | " },\n", 805 | " {\n", 806 | " \"word\": \"i\",\n", 807 | " \"start_time\": 1.92,\n", 808 | " \"duration\": 0.12\n", 809 | " },\n", 810 | " {\n", 811 | " \"word\": \"said\",\n", 812 | " \"start_time\": 2.1,\n", 813 | " \"duration\": 0.1\n", 814 | " }\n", 815 | " ]\n", 816 | " }\n", 817 | " ]\n", 818 
| "}\n", 819 | "Inference took 1.324s for 2.590s audio file.\n" 820 | ] 821 | } 822 | ], 823 | "source": [ 824 | "!stt --json --model coqui-stt-1.0.0-models/model.tflite --scorer coqui-stt-1.0.0-models/large_vocabulary.scorer --audio ./audio/8455-210777-0068.wav \n" 825 | ] 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": { 830 | "id": "p61UrYSvQrOd" 831 | }, 832 | "source": [ 833 | "# DeepSpeech API\n", 834 | "\n", 835 | "1. **Import deepspeech**" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": 20, 841 | "metadata": { 842 | "id": "LKwSvpvaRFIe" 843 | }, 844 | "outputs": [], 845 | "source": [ 846 | "import stt\n" 847 | ] 848 | }, 849 | { 850 | "cell_type": "markdown", 851 | "metadata": { 852 | "id": "xqd6bQ_gRPOB" 853 | }, 854 | "source": [ 855 | "2. **Create a model**" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 21, 861 | "metadata": { 862 | "id": "jKDVOmbFRR1A" 863 | }, 864 | "outputs": [], 865 | "source": [ 866 | "model_file_path = 'coqui-stt-1.0.0-models/model.tflite'\n", 867 | "model = stt.Model(model_file_path)\n" 868 | ] 869 | }, 870 | { 871 | "cell_type": "markdown", 872 | "metadata": { 873 | "id": "VNRxsb2zRgeJ" 874 | }, 875 | "source": [ 876 | "3. **Add scorer and other parameters**" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": 22, 882 | "metadata": { 883 | "colab": { 884 | "base_uri": "https://localhost:8080/" 885 | }, 886 | "id": "6FRX1EvDRnLH", 887 | "outputId": "7be32767-0dc6-45cc-f4c0-80dac13b7e2a" 888 | }, 889 | "outputs": [ 890 | { 891 | "data": { 892 | "text/plain": [ 893 | "0" 894 | ] 895 | }, 896 | "execution_count": 22, 897 | "metadata": {}, 898 | "output_type": "execute_result" 899 | } 900 | ], 901 | "source": [ 902 | "scorer_file_path = 'coqui-stt-1.0.0-models/large_vocabulary.scorer'\n", 903 | "model.enableExternalScorer(scorer_file_path)\n", 904 | "\n", 905 | "lm_alpha = 0.75\n", 906 | "lm_beta = 1.85\n", 907 | "model.setScorerAlphaBeta(lm_alpha, lm_beta)\n", 908 | "\n", 909 | "beam_width = 500\n", 910 | "model.setBeamWidth(beam_width)\n" 911 | ] 912 | }, 913 | { 914 | "cell_type": "markdown", 915 | "metadata": { 916 | "id": "tWbHnlwCRuDo" 917 | }, 918 | "source": [ 919 | "## Batch API\n", 920 | "\n", 921 | "1. 
**Read an input wav file**\n" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": 23, 927 | "metadata": { 928 | "id": "PRshwTMoSFEL" 929 | }, 930 | "outputs": [], 931 | "source": [ 932 | "import wave\n", 933 | "filename = 'audio/8455-210777-0068.wav'\n", 934 | "w = wave.open(filename, 'r')\n", 935 | "rate = w.getframerate()\n", 936 | "frames = w.getnframes()\n", 937 | "buffer = w.readframes(frames)\n" 938 | ] 939 | }, 940 | { 941 | "cell_type": "markdown", 942 | "metadata": { 943 | "id": "cAowLS39SNC_" 944 | }, 945 | "source": [ 946 | "Checkout sample rate and buffer type" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": 24, 952 | "metadata": { 953 | "colab": { 954 | "base_uri": "https://localhost:8080/" 955 | }, 956 | "id": "NHvatdmxSYGu", 957 | "outputId": "ad0916d8-26bc-4d6c-eaef-428759cc9164" 958 | }, 959 | "outputs": [ 960 | { 961 | "name": "stdout", 962 | "output_type": "stream", 963 | "text": [ 964 | "16000\n", 965 | "16000\n", 966 | "\n" 967 | ] 968 | } 969 | ], 970 | "source": [ 971 | "print(rate)\n", 972 | "print(model.sampleRate())\n", 973 | "print(str(type(buffer)))\n" 974 | ] 975 | }, 976 | { 977 | "cell_type": "markdown", 978 | "metadata": { 979 | "id": "uOhbO3iTS3ft" 980 | }, 981 | "source": [ 982 | "As you can see that the speech sample rate of the wav file is 16000hz, same as the model’s sample rate. But the buffer is a byte array, whereas DeepSpeech model expects 16-bit int array.\n", 983 | "\n", 984 | "2. **Convert byte array buffer to int16 array**" 985 | ] 986 | }, 987 | { 988 | "cell_type": "code", 989 | "execution_count": 25, 990 | "metadata": { 991 | "colab": { 992 | "base_uri": "https://localhost:8080/" 993 | }, 994 | "id": "XYXF6AU2S8m2", 995 | "outputId": "4a504ca4-4ee7-4b08-f240-03676261881c" 996 | }, 997 | "outputs": [ 998 | { 999 | "name": "stdout", 1000 | "output_type": "stream", 1001 | "text": [ 1002 | "\n" 1003 | ] 1004 | } 1005 | ], 1006 | "source": [ 1007 | "import numpy as np\n", 1008 | "data16 = np.frombuffer(buffer, dtype=np.int16)\n", 1009 | "print(str(type(data16)))\n" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "markdown", 1014 | "metadata": { 1015 | "id": "yyIxzx1zTVFp" 1016 | }, 1017 | "source": [ 1018 | "3. **Run speech-to-text in batch mode to get the text**" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": 26, 1024 | "metadata": { 1025 | "colab": { 1026 | "base_uri": "https://localhost:8080/" 1027 | }, 1028 | "id": "XdzZteC7TZDP", 1029 | "outputId": "32a71587-11f0-43b0-f14e-eb18090521df" 1030 | }, 1031 | "outputs": [ 1032 | { 1033 | "name": "stdout", 1034 | "output_type": "stream", 1035 | "text": [ 1036 | "your power is sufficient i said\n" 1037 | ] 1038 | } 1039 | ], 1040 | "source": [ 1041 | "text = model.stt(data16)\n", 1042 | "print(text)\n" 1043 | ] 1044 | }, 1045 | { 1046 | "cell_type": "markdown", 1047 | "metadata": { 1048 | "id": "VUCXp-5uTh0L" 1049 | }, 1050 | "source": [ 1051 | "## Streaming API\n", 1052 | "\n", 1053 | "Now let’s accomplish the same using streaming API. It consists of 3 steps: open session, feed data, close session.\n", 1054 | "\n", 1055 | "1. 
**Open a streaming session**" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": 27, 1061 | "metadata": { 1062 | "id": "uMSQ2VYCTyao" 1063 | }, 1064 | "outputs": [], 1065 | "source": [ 1066 | "stt_stream = model.createStream()\n" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "metadata": { 1072 | "id": "YK4QDAZtT3QZ" 1073 | }, 1074 | "source": [ 1075 | "2. **Repeatedly feed chunks of speech buffer, and get interim results if desired**" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": 28, 1081 | "metadata": { 1082 | "colab": { 1083 | "base_uri": "https://localhost:8080/" 1084 | }, 1085 | "id": "ScS6c2QQT72-", 1086 | "outputId": "c7ea4b0c-9df2-491d-8d67-3705ce1f4ed1" 1087 | }, 1088 | "outputs": [ 1089 | { 1090 | "name": "stdout", 1091 | "output_type": "stream", 1092 | "text": [ 1093 | "\n", 1094 | "\n", 1095 | "your power \n", 1096 | "your power is suff\n", 1097 | "your power is sufficient i said\n", 1098 | "your power is sufficient i said\n" 1099 | ] 1100 | } 1101 | ], 1102 | "source": [ 1103 | "buffer_len = len(buffer)\n", 1104 | "offset = 0\n", 1105 | "batch_size = 16384\n", 1106 | "text = ''\n", 1107 | "while offset < buffer_len:\n", 1108 | " end_offset = offset + batch_size\n", 1109 | " chunk = buffer[offset:end_offset]\n", 1110 | " data16 = np.frombuffer(chunk, dtype=np.int16)\n", 1111 | " stt_stream.feedAudioContent(data16)\n", 1112 | " text = stt_stream.intermediateDecode()\n", 1113 | " print(text)\n", 1114 | " offset = end_offset\n" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "markdown", 1119 | "metadata": { 1120 | "id": "zeV7x1NgUK-p" 1121 | }, 1122 | "source": [ 1123 | "3. **Close stream and get the final result**" 1124 | ] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": 29, 1129 | "metadata": { 1130 | "colab": { 1131 | "base_uri": "https://localhost:8080/" 1132 | }, 1133 | "id": "aS0WtnF5UM4n", 1134 | "outputId": "153dc742-c1f1-4d00-f59e-f1800d3d9fac" 1135 | }, 1136 | "outputs": [ 1137 | { 1138 | "name": "stdout", 1139 | "output_type": "stream", 1140 | "text": [ 1141 | "your power is sufficient i said\n" 1142 | ] 1143 | } 1144 | ], 1145 | "source": [ 1146 | "text = stt_stream.finishStream()\n", 1147 | "print(text)\n" 1148 | ] 1149 | }, 1150 | { 1151 | "cell_type": "markdown", 1152 | "metadata": { 1153 | "id": "Q-vbd5CmUmsY" 1154 | }, 1155 | "source": [ 1156 | "Verify that the output is same as as the batch API output: \"your power is sufficient i said.\"" 1157 | ] 1158 | }, 1159 | { 1160 | "cell_type": "markdown", 1161 | "metadata": { 1162 | "id": "wVNGdkq0fV-n" 1163 | }, 1164 | "source": [ 1165 | "# Recap\n", 1166 | "\n", 1167 | "DeepSpeech has two modes: batch and streaming. First step is to create a model object, and then either call `stt()` or `feedAudioContnet()` to transcribe audio to text." 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "markdown", 1172 | "metadata": { 1173 | "id": "YJp2Lkt8WNoN" 1174 | }, 1175 | "source": [ 1176 | "---\n", 1177 | "
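To tie the recap together, the batch-mode calls demonstrated above can be rolled into a single helper. This is a sketch assembled from the notebook's own API calls; the helper name and the default paths and tuning values are simply the ones used in this walkthrough, not an official utility.

```python
import wave
import numpy as np
import stt

def transcribe_wav(wav_path: str,
                   model_path: str = "coqui-stt-1.0.0-models/model.tflite",
                   scorer_path: str = "coqui-stt-1.0.0-models/large_vocabulary.scorer") -> str:
    model = stt.Model(model_path)
    model.enableExternalScorer(scorer_path)
    model.setScorerAlphaBeta(0.75, 1.85)   # lm_alpha, lm_beta values used above
    model.setBeamWidth(500)

    with wave.open(wav_path, "r") as w:
        # The model expects 16 kHz, 16-bit PCM; resample first if the file differs.
        assert w.getframerate() == model.sampleRate(), "resample the audio to the model's rate first"
        audio = np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16)

    return model.stt(audio)

# transcribe_wav("audio/8455-210777-0068.wav")  # -> "your power is sufficient i said"
```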

Copyright © 2020 - 2022 Satish Chandra Gupta.\n", 1178 | " CC BY-NC-SA 4.0 International License.
" 1179 | ] 1180 | } 1181 | ], 1182 | "metadata": { 1183 | "colab": { 1184 | "collapsed_sections": [], 1185 | "include_colab_link": true, 1186 | "name": "mozilla_deepspeech_api_notebook.ipynb", 1187 | "provenance": [] 1188 | }, 1189 | "gpuClass": "standard", 1190 | "kernelspec": { 1191 | "display_name": "Python 3", 1192 | "name": "python3" 1193 | }, 1194 | "language_info": { 1195 | "name": "python", 1196 | "version": "3.7.12" 1197 | } 1198 | }, 1199 | "nbformat": 4, 1200 | "nbformat_minor": 0 1201 | } 1202 | -------------------------------------------------------------------------------- /speech/asr/python_speech_recognition_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "dhBo4UlPaq4O" 17 | }, 18 | "source": [ 19 | "

Speech Recognition with Python

\n", 20 | "\n", 21 | "

\n", 22 | "
© Satish Chandra Gupta
\n", 23 | "LinkedIn: scgupta,\n", 24 | "Twitter: scgupta\n", 25 | "
\n", 26 | "

\n", 27 | "\n", 28 | "---\n", 29 | "\n", 30 | "# Introduction\n", 31 | "\n", 32 | "Blog Post: [Speech Recognition With Python](https://www.ml4devs.com/articles/speech-recognition-with-python/)\n", 33 | "\n", 34 | "There are several Automated Speech Recognition (ASR) alternatives, and most of them have bindings for Python. There are two kinds of solutions:\n", 35 | "\n", 36 | "- **Service:** These run on the cloud, and are accessed either through REST endpoints or Python library. Examples are cloud speech services from Google, Amazon, Microsoft.\n", 37 | "- **Software:** These run locally on the machine (not requiring network connection). Examples are CMU Sphinx and Mozilla DeepSpeech.\n", 38 | "\n", 39 | "Speech Recognition APIs are of two types:\n", 40 | "- **Batch:** The full audio file is passed as parameter, and speech-to-text transcribing is done in one shot.\n", 41 | "- **Streaming:** The chunks of audio buffer are repeatedly passed on, and intermediate results are accessible.\n", 42 | "\n", 43 | "All packages support batch mode, and some support streaming mode too.\n", 44 | "\n", 45 | "One common use case is to collect audio from microphone and passes on the buffer to the speech recognition API. Invariably, in such transcribers, microphone is accessed though [PyAudio](https://people.csail.mit.edu/hubert/pyaudio/), which is implemented over [PortAudio](http://www.portaudio.com/).\n", 46 | "\n", 47 | "From Colab menu, select: **Runtime** > **Change runtime type**, and verify that it is set to Python3, and select GPU if you want to try out GPU version." 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "id": "7H6HFpi_LCBt" 54 | }, 55 | "source": [ 56 | "## Common Setup\n", 57 | "\n", 58 | "1. **Install google cloud speech package**\n", 59 | "\n", 60 | "You may have to restart the runtime after this." 
61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "colab": { 68 | "base_uri": "https://localhost:8080/", 69 | "height": 492 70 | }, 71 | "id": "RhIMRXQPLaaA", 72 | "outputId": "ee11bcc4-44ab-4f96-ec5e-84d5b2f630b3" 73 | }, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Collecting google-cloud-speech\n", 80 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0c/81/c59a373c7668beb9de922b9c4419b793898d46c6d4a44f4fe28098e77623/google_cloud_speech-1.3.1-py2.py3-none-any.whl (88kB)\n", 81 | "\r\u001b[K |███▊ | 10kB 33.9MB/s eta 0:00:01\r\u001b[K |███████▍ | 20kB 2.0MB/s eta 0:00:01\r\u001b[K |███████████▏ | 30kB 3.0MB/s eta 0:00:01\r\u001b[K |██████████████▉ | 40kB 2.0MB/s eta 0:00:01\r\u001b[K |██████████████████▋ | 51kB 2.5MB/s eta 0:00:01\r\u001b[K |██████████████████████▎ | 61kB 2.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 71kB 3.4MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▊ | 81kB 3.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 92kB 3.4MB/s \n", 82 | "\u001b[?25hRequirement already satisfied: google-api-core[grpc]<2.0.0dev,>=1.14.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-speech) (1.15.0)\n", 83 | "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (1.6.0)\n", 84 | "Requirement already satisfied: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (3.10.0)\n", 85 | "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (2.21.0)\n", 86 | "Requirement already satisfied: setuptools>=34.0.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (42.0.2)\n", 87 | "Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (2018.9)\n", 88 | "Requirement already satisfied: google-auth<2.0dev,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (1.4.2)\n", 89 | "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (1.12.0)\n", 90 | "Requirement already satisfied: grpcio<2.0dev,>=1.8.2; extra == \"grpc\" in /usr/local/lib/python3.6/dist-packages (from google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (1.15.0)\n", 91 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (3.0.4)\n", 92 | "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (2.8)\n", 93 | "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (1.24.3)\n", 94 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from 
requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (2019.11.28)\n", 95 | "Requirement already satisfied: rsa>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth<2.0dev,>=0.4.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (4.0)\n", 96 | "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2.0dev,>=0.4.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (0.2.7)\n", 97 | "Requirement already satisfied: cachetools>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2.0dev,>=0.4.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (4.0.0)\n", 98 | "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa>=3.1.4->google-auth<2.0dev,>=0.4.0->google-api-core[grpc]<2.0.0dev,>=1.14.0->google-cloud-speech) (0.4.8)\n", 99 | "Installing collected packages: google-cloud-speech\n", 100 | "Successfully installed google-cloud-speech-1.3.1\n" 101 | ] 102 | }, 103 | { 104 | "data": { 105 | "application/vnd.colab-display-data+json": { 106 | "pip_warning": { 107 | "packages": [ 108 | "google" 109 | ] 110 | } 111 | } 112 | }, 113 | "metadata": { 114 | "tags": [] 115 | }, 116 | "output_type": "display_data" 117 | } 118 | ], 119 | "source": [ 120 | "!pip3 install google-cloud-speech" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "QUjuKePpRJt5" 127 | }, 128 | "source": [ 129 | "2. **Download audio files for testing**\n", 130 | "\n", 131 | "Following files will be used as test cases for all speech recognition alternatives covered in this notebook." 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "colab": { 139 | "base_uri": "https://localhost:8080/", 140 | "height": 286 141 | }, 142 | "id": "t3_2z6qMRXcY", 143 | "outputId": "c155e814-439b-4fb0-d26f-307516b3ab97" 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 151 | " Dload Upload Total Spent Left Speed\n", 152 | "100 608 0 608 0 0 2632 0 --:--:-- --:--:-- --:--:-- 2620\n", 153 | "100 192k 100 192k 0 0 310k 0 --:--:-- --:--:-- --:--:-- 310k\n", 154 | "audio/\n", 155 | "audio/2830-3980-0043.wav\n", 156 | "audio/Attribution.txt\n", 157 | "audio/4507-16021-0012.wav\n", 158 | "audio/8455-210777-0068.wav\n", 159 | "audio/License.txt\n", 160 | "total 260\n", 161 | "-rw-r--r-- 1 501 staff 63244 Nov 18 2017 2830-3980-0043.wav\n", 162 | "-rw-r--r-- 1 501 staff 87564 Nov 18 2017 4507-16021-0012.wav\n", 163 | "-rw-r--r-- 1 501 staff 82924 Nov 18 2017 8455-210777-0068.wav\n", 164 | "-rw-r--r-- 1 501 staff 340 May 14 2018 Attribution.txt\n", 165 | "-rw-r--r-- 1 501 staff 18652 May 12 2018 License.txt\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.6.0/audio-0.6.0.tar.gz\n", 171 | "!tar -xvzf audio-0.6.0.tar.gz\n", 172 | "!ls -l ./audio/" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "wXzH4pu9Kxr4" 179 | }, 180 | "source": [ 181 | "3. 
**Define test cases**" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "id": "nbzYo01kRi8P" 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "TESTCASES = [\n", 193 | " {\n", 194 | " 'filename': 'audio/2830-3980-0043.wav',\n", 195 | " 'text': 'experience proves this',\n", 196 | " 'encoding': 'LINEAR16',\n", 197 | " 'lang': 'en-US'\n", 198 | " },\n", 199 | " {\n", 200 | " 'filename': 'audio/4507-16021-0012.wav',\n", 201 | " 'text': 'why should one halt on the way',\n", 202 | " 'encoding': 'LINEAR16',\n", 203 | " 'lang': 'en-US'\n", 204 | " },\n", 205 | " {\n", 206 | " 'filename': 'audio/8455-210777-0068.wav',\n", 207 | " 'text': 'your power is sufficient i said',\n", 208 | " 'encoding': 'LINEAR16',\n", 209 | " 'lang': 'en-US'\n", 210 | " }\n", 211 | "]" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "id": "f-23iCNLvBIx" 218 | }, 219 | "source": [ 220 | "4. **Utility Functions**" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "id": "Ujeuvj35Ksv8" 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "from typing import Tuple\n", 232 | "import wave\n", 233 | "\n", 234 | "def read_wav_file(filename) -> Tuple[bytes, int]:\n", 235 | " with wave.open(filename, 'rb') as w:\n", 236 | " rate = w.getframerate()\n", 237 | " frames = w.getnframes()\n", 238 | " buffer = w.readframes(frames)\n", 239 | "\n", 240 | " return buffer, rate\n", 241 | "\n", 242 | "def simulate_stream(buffer: bytes, batch_size: int = 4096):\n", 243 | " buffer_len = len(buffer)\n", 244 | " offset = 0\n", 245 | " while offset < buffer_len:\n", 246 | " end_offset = offset + batch_size\n", 247 | " buf = buffer[offset:end_offset]\n", 248 | " yield buf\n", 249 | " offset = end_offset" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": { 255 | "id": "9wFdQoEUQH-3" 256 | }, 257 | "source": [ 258 | "\n", 259 | "---\n", 260 | "\n", 261 | "# Google\n", 262 | "\n", 263 | "Google has [speech-to-text](https://cloud.google.com/speech-to-text/docs) as one of the Google Cloud services. It has [libraries](https://cloud.google.com/speech-to-text/docs/reference/libraries) in C#, Go, Java, JavaScript, PHP, Python, and Ruby. It supports both batch and stream modes." 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "id": "GjcnbQvvY3Xu" 270 | }, 271 | "source": [ 272 | "## Setup\n", 273 | "\n", 274 | "1. **Upload Google Cloud Cred file**\n", 275 | "\n", 276 | "Have Google Cloud creds stored in a file named **`gc-creds.json`**, and upload it by running following code cell. See https://developers.google.com/accounts/docs/application-default-credentials for more details.\n", 277 | "\n", 278 | "This may reqire enabling **third-party cookies**. Check out https://colab.research.google.com/notebooks/io.ipynb for other alternatives." 
279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "colab": { 286 | "base_uri": "https://localhost:8080/", 287 | "height": 87, 288 | "resources": { 289 | "http://localhost:8080/nbextensions/google.colab/files.js": { 290 | "data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7Ci8vIE1heCBhbW91bnQgb2YgdGltZSB0byBibG9jayB3YWl0aW5nIGZvciB0aGUgdXNlci4KY29uc3QgRklMRV9DSEFOR0VfVElNRU9VVF9NUyA9IDMwICogMTAwMDsKCmZ1bmN0aW9uIF91cGxvYWRGaWxlcyhpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IHN0ZXBzID0gdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKTsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIC8vIENhY2hlIHN0ZXBzIG9uIHRoZSBvdXRwdXRFbGVtZW50IHRvIG1ha2UgaXQgYXZhaWxhYmxlIGZvciB0aGUgbmV4dCBjYWxsCiAgLy8gdG8gdXBsb2FkRmlsZXNDb250aW51ZSBmcm9tIFB5dGhvbi4KICBvdXRwdXRFbGVtZW50LnN0ZXBzID0gc3RlcHM7CgogIHJldHVybiBfdXBsb2FkRmlsZXNDb250aW51ZShvdXRwdXRJZCk7Cn0KCi8vIFRoaXMgaXMgcm91Z2hseSBhbiBhc3luYyBnZW5lcmF0b3IgKG5vdCBzdXBwb3J0ZWQgaW4gdGhlIGJyb3dzZXIgeWV0KSwKLy8gd2hlcmUgdGhlcmUgYXJlIG11bHRpcGxlIGFzeW5jaHJvbm91cyBzdGVwcyBhbmQgdGhlIFB5dGhvbiBzaWRlIGlzIGdvaW5nCi8vIHRvIHBvbGwgZm9yIGNvbXBsZXRpb24gb2YgZWFjaCBzdGVwLgovLyBUaGlzIHVzZXMgYSBQcm9taXNlIHRvIGJsb2NrIHRoZSBweXRob24gc2lkZSBvbiBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcCwKLy8gdGhlbiBwYXNzZXMgdGhlIHJlc3VsdCBvZiB0aGUgcHJldmlvdXMgc3RlcCBhcyB0aGUgaW5wdXQgdG8gdGhlIG5leHQgc3RlcC4KZnVuY3Rpb24gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpIHsKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIGNvbnN0IHN0ZXBzID0gb3V0cHV0RWxlbWVudC5zdGVwczsKCiAgY29uc3QgbmV4dCA9IHN0ZXBzLm5leHQob3V0cHV0RWxlbWVudC5sYXN0UHJvbWlzZVZhbHVlKTsKICByZXR1cm4gUHJvbWlzZS5yZXNvbHZlKG5leHQudmFsdWUucHJvbWlzZSkudGhlbigodmFsdWUpID0+IHsKICAgIC8vIENhY2hlIHRoZSBsYXN0IHByb21pc2UgdmFsdWUgdG8gbWFrZSBpdCBhdmFpbGFibGUgdG8gdGhlIG5leHQKICAgIC8vIHN0ZXAgb2YgdGhlIGdlbmVyYXRvci4KICAgIG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSA9IHZhbHVlOwogICAgcmV0dXJuIG5leHQudmFsdWUucmVzcG9uc2U7CiAgfSk7Cn0KCi8qKgogKiBHZW5lcmF0b3IgZnVuY3Rpb24gd2hpY2ggaXMgY2FsbGVkIGJldHdlZW4gZWFjaCBhc3luYyBzdGVwIG9mIHRoZSB1cGxvYWQKICogcHJvY2Vzcy4KICogQHBhcmFtIHtzdHJpbmd9IGlucHV0SWQgRWxlbWVudCBJRCBvZiB0aGUgaW5wdXQgZmlsZSBwaWNrZXIgZWxlbWVudC4KICogQHBhcmFtIHtzdHJpbmd9IG91dHB1dElkIEVsZW1lbnQgSU
Qgb2YgdGhlIG91dHB1dCBkaXNwbGF5LgogKiBAcmV0dXJuIHshSXRlcmFibGU8IU9iamVjdD59IEl0ZXJhYmxlIG9mIG5leHQgc3RlcHMuCiAqLwpmdW5jdGlvbiogdXBsb2FkRmlsZXNTdGVwKGlucHV0SWQsIG91dHB1dElkKSB7CiAgY29uc3QgaW5wdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoaW5wdXRJZCk7CiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gZmFsc2U7CgogIGNvbnN0IG91dHB1dEVsZW1lbnQgPSBkb2N1bWVudC5nZXRFbGVtZW50QnlJZChvdXRwdXRJZCk7CiAgb3V0cHV0RWxlbWVudC5pbm5lckhUTUwgPSAnJzsKCiAgY29uc3QgcGlja2VkUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBpbnB1dEVsZW1lbnQuYWRkRXZlbnRMaXN0ZW5lcignY2hhbmdlJywgKGUpID0+IHsKICAgICAgcmVzb2x2ZShlLnRhcmdldC5maWxlcyk7CiAgICB9KTsKICB9KTsKCiAgY29uc3QgY2FuY2VsID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnYnV0dG9uJyk7CiAgaW5wdXRFbGVtZW50LnBhcmVudEVsZW1lbnQuYXBwZW5kQ2hpbGQoY2FuY2VsKTsKICBjYW5jZWwudGV4dENvbnRlbnQgPSAnQ2FuY2VsIHVwbG9hZCc7CiAgY29uc3QgY2FuY2VsUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICBjYW5jZWwub25jbGljayA9ICgpID0+IHsKICAgICAgcmVzb2x2ZShudWxsKTsKICAgIH07CiAgfSk7CgogIC8vIENhbmNlbCB1cGxvYWQgaWYgdXNlciBoYXNuJ3QgcGlja2VkIGFueXRoaW5nIGluIHRpbWVvdXQuCiAgY29uc3QgdGltZW91dFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgc2V0VGltZW91dCgoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9LCBGSUxFX0NIQU5HRV9USU1FT1VUX01TKTsKICB9KTsKCiAgLy8gV2FpdCBmb3IgdGhlIHVzZXIgdG8gcGljayB0aGUgZmlsZXMuCiAgY29uc3QgZmlsZXMgPSB5aWVsZCB7CiAgICBwcm9taXNlOiBQcm9taXNlLnJhY2UoW3BpY2tlZFByb21pc2UsIHRpbWVvdXRQcm9taXNlLCBjYW5jZWxQcm9taXNlXSksCiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdzdGFydGluZycsCiAgICB9CiAgfTsKCiAgaWYgKCFmaWxlcykgewogICAgcmV0dXJuIHsKICAgICAgcmVzcG9uc2U6IHsKICAgICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICAgIH0KICAgIH07CiAgfQoKICBjYW5jZWwucmVtb3ZlKCk7CgogIC8vIERpc2FibGUgdGhlIGlucHV0IGVsZW1lbnQgc2luY2UgZnVydGhlciBwaWNrcyBhcmUgbm90IGFsbG93ZWQuCiAgaW5wdXRFbGVtZW50LmRpc2FibGVkID0gdHJ1ZTsKCiAgZm9yIChjb25zdCBmaWxlIG9mIGZpbGVzKSB7CiAgICBjb25zdCBsaSA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2xpJyk7CiAgICBsaS5hcHBlbmQoc3BhbihmaWxlLm5hbWUsIHtmb250V2VpZ2h0OiAnYm9sZCd9KSk7CiAgICBsaS5hcHBlbmQoc3BhbigKICAgICAgICBgKCR7ZmlsZS50eXBlIHx8ICduL2EnfSkgLSAke2ZpbGUuc2l6ZX0gYnl0ZXMsIGAgKwogICAgICAgIGBsYXN0IG1vZGlmaWVkOiAkewogICAgICAgICAgICBmaWxlLmxhc3RNb2RpZmllZERhdGUgPyBmaWxlLmxhc3RNb2RpZmllZERhdGUudG9Mb2NhbGVEYXRlU3RyaW5nKCkgOgogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnbi9hJ30gLSBgKSk7CiAgICBjb25zdCBwZXJjZW50ID0gc3BhbignMCUgZG9uZScpOwogICAgbGkuYXBwZW5kQ2hpbGQocGVyY2VudCk7CgogICAgb3V0cHV0RWxlbWVudC5hcHBlbmRDaGlsZChsaSk7CgogICAgY29uc3QgZmlsZURhdGFQcm9taXNlID0gbmV3IFByb21pc2UoKHJlc29sdmUpID0+IHsKICAgICAgY29uc3QgcmVhZGVyID0gbmV3IEZpbGVSZWFkZXIoKTsKICAgICAgcmVhZGVyLm9ubG9hZCA9IChlKSA9PiB7CiAgICAgICAgcmVzb2x2ZShlLnRhcmdldC5yZXN1bHQpOwogICAgICB9OwogICAgICByZWFkZXIucmVhZEFzQXJyYXlCdWZmZXIoZmlsZSk7CiAgICB9KTsKICAgIC8vIFdhaXQgZm9yIHRoZSBkYXRhIHRvIGJlIHJlYWR5LgogICAgbGV0IGZpbGVEYXRhID0geWllbGQgewogICAgICBwcm9taXNlOiBmaWxlRGF0YVByb21pc2UsCiAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgYWN0aW9uOiAnY29udGludWUnLAogICAgICB9CiAgICB9OwoKICAgIC8vIFVzZSBhIGNodW5rZWQgc2VuZGluZyB0byBhdm9pZCBtZXNzYWdlIHNpemUgbGltaXRzLiBTZWUgYi82MjExNTY2MC4KICAgIGxldCBwb3NpdGlvbiA9IDA7CiAgICB3aGlsZSAocG9zaXRpb24gPCBmaWxlRGF0YS5ieXRlTGVuZ3RoKSB7CiAgICAgIGNvbnN0IGxlbmd0aCA9IE1hdGgubWluKGZpbGVEYXRhLmJ5dGVMZW5ndGggLSBwb3NpdGlvbiwgTUFYX1BBWUxPQURfU0laRSk7CiAgICAgIGNvbnN0IGNodW5rID0gbmV3IFVpbnQ4QXJyYXkoZmlsZURhdGEsIHBvc2l0aW9uLCBsZW5ndGgpOwogICAgICBwb3NpdGlvbiArPSBsZW5ndGg7CgogICAgICBjb25zdCBiYXNlNjQgPSBidG9hKFN0cmluZy5mcm9tQ2hhckNvZGUuYXBwbHkobnVsbCwgY2h1bmspKTsKICAgICAgeWllbGQgewogICAgICAgIHJlc3BvbnNlOiB7CiAgICAgICAgICBhY3Rpb246ICdhcHBlbmQnLAogICAgICAgICAgZmlsZTogZmlsZS5uYW1lLAogICAgICAgICAgZGF0YTogYmFzZTY0LAogI
CAgICAgIH0sCiAgICAgIH07CiAgICAgIHBlcmNlbnQudGV4dENvbnRlbnQgPQogICAgICAgICAgYCR7TWF0aC5yb3VuZCgocG9zaXRpb24gLyBmaWxlRGF0YS5ieXRlTGVuZ3RoKSAqIDEwMCl9JSBkb25lYDsKICAgIH0KICB9CgogIC8vIEFsbCBkb25lLgogIHlpZWxkIHsKICAgIHJlc3BvbnNlOiB7CiAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgIH0KICB9Owp9CgpzY29wZS5nb29nbGUgPSBzY29wZS5nb29nbGUgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYiA9IHNjb3BlLmdvb2dsZS5jb2xhYiB8fCB7fTsKc2NvcGUuZ29vZ2xlLmNvbGFiLl9maWxlcyA9IHsKICBfdXBsb2FkRmlsZXMsCiAgX3VwbG9hZEZpbGVzQ29udGludWUsCn07Cn0pKHNlbGYpOwo=", 291 | "headers": [ 292 | [ 293 | "content-type", 294 | "application/javascript" 295 | ] 296 | ], 297 | "ok": true, 298 | "status": 200, 299 | "status_text": "" 300 | } 301 | } 302 | }, 303 | "id": "qXquL3Y7bLQ6", 304 | "outputId": "5a46b013-8b39-4227-ba5b-28677885d763" 305 | }, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/html": [ 310 | "\n", 311 | " \n", 312 | " \n", 313 | " Upload widget is only available when the cell has been executed in the\n", 314 | " current browser session. Please rerun this cell to enable.\n", 315 | " \n", 316 | " " 317 | ], 318 | "text/plain": [ 319 | "" 320 | ] 321 | }, 322 | "metadata": { 323 | "tags": [] 324 | }, 325 | "output_type": "display_data" 326 | }, 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "Saving gc-creds.json to gc-creds.json\n", 332 | "User uploaded file \"gc-creds.json\" with length 2314 bytes\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "from google.colab import files\n", 338 | "\n", 339 | "uploaded = files.upload()\n", 340 | "\n", 341 | "for fn in uploaded.keys():\n", 342 | " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", 343 | " name=fn, length=len(uploaded[fn])))" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "colab": { 351 | "base_uri": "https://localhost:8080/", 352 | "height": 50 353 | }, 354 | "id": "-emXbdQ1bTDg", 355 | "outputId": "2a343cc2-8a58-476c-ee89-8717954a08dd" 356 | }, 357 | "outputs": [ 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "/content\n", 363 | "-rw-r--r-- 1 root root 2314 Jan 30 00:20 ./gc-creds.json\n" 364 | ] 365 | } 366 | ], 367 | "source": [ 368 | "!pwd\n", 369 | "!ls -l ./gc-creds.json" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": { 375 | "id": "o-80JnSyMWEV" 376 | }, 377 | "source": [ 378 | "2. 
**Set environment variable**" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "colab": { 386 | "base_uri": "https://localhost:8080/", 387 | "height": 34 388 | }, 389 | "id": "msFTMyUWgtEv", 390 | "outputId": "ead40fcc-e41f-4ff6-e79d-418aad093c66" 391 | }, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | "-rw-r--r-- 1 root root 2314 Jan 30 00:20 /content/gc-creds.json\n" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "import os\n", 403 | "os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/gc-creds.json'\n", 404 | "\n", 405 | "!ls -l $GOOGLE_APPLICATION_CREDENTIALS" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": { 411 | "id": "fljBsBFHWCMi" 412 | }, 413 | "source": [ 414 | "## Batch API" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": { 421 | "colab": { 422 | "base_uri": "https://localhost:8080/", 423 | "height": 168 424 | }, 425 | "id": "7dlm4CWyQPeR", 426 | "outputId": "d5bfd6f5-e16a-4944-ad36-4325f9021935" 427 | }, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "\n", 434 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 435 | "google-cloud-batch-stt: \"experience proves this\"\n", 436 | "\n", 437 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 438 | "google-cloud-batch-stt: \"why should one halt on the way\"\n", 439 | "\n", 440 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 441 | "google-cloud-batch-stt: \"your power is sufficient I said\"\n" 442 | ] 443 | } 444 | ], 445 | "source": [ 446 | "from google.cloud import speech_v1\n", 447 | "from google.cloud.speech_v1 import enums\n", 448 | "\n", 449 | "def google_batch_stt(filename: str, lang: str, encoding: str) -> str:\n", 450 | " buffer, rate = read_wav_file(filename)\n", 451 | " client = speech_v1.SpeechClient()\n", 452 | "\n", 453 | " config = {\n", 454 | " 'language_code': lang,\n", 455 | " 'sample_rate_hertz': rate,\n", 456 | " 'encoding': enums.RecognitionConfig.AudioEncoding[encoding]\n", 457 | " }\n", 458 | "\n", 459 | " audio = {\n", 460 | " 'content': buffer\n", 461 | " }\n", 462 | "\n", 463 | " response = client.recognize(config, audio)\n", 464 | " # For bigger audio file, the previous line can be replaced with following:\n", 465 | " # operation = client.long_running_recognize(config, audio)\n", 466 | " # response = operation.result()\n", 467 | "\n", 468 | " for result in response.results:\n", 469 | " # First alternative is the most probable result\n", 470 | " alternative = result.alternatives[0]\n", 471 | " return alternative.transcript\n", 472 | "\n", 473 | "# Run tests\n", 474 | "for t in TESTCASES:\n", 475 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 476 | " print('google-cloud-batch-stt: \"{}\"'.format(\n", 477 | " google_batch_stt(t['filename'], t['lang'], t['encoding'])\n", 478 | " ))" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": { 484 | "id": "pGhaFWC7rN9b" 485 | }, 486 | "source": [ 487 | "## Streaming API" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": { 494 | "colab": { 495 | "base_uri": "https://localhost:8080/", 496 | "height": 991 497 | }, 498 | "id": "h9wMydkzrdX-", 499 | "outputId": 
"53c3427b-6a1e-4520-939b-a9bc34515760" 500 | }, 501 | "outputs": [ 502 | { 503 | "name": "stdout", 504 | "output_type": "stream", 505 | "text": [ 506 | "\n", 507 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 508 | "interim results: \n", 509 | "not final: next\n", 510 | "not final: iSpy\n", 511 | "not final: Aspira\n", 512 | "not final: Xperia\n", 513 | "not final: Experian\n", 514 | "not final: experience\n", 515 | "not final: experience proved\n", 516 | "not final: experience proves\n", 517 | "not final: experience proves the\n", 518 | "not final: experience proves that\n", 519 | "not final: experience\n", 520 | "final: experience proves this\n", 521 | "google-cloud-streaming-stt: \"experience proves this\"\n", 522 | "\n", 523 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 524 | "interim results: \n", 525 | "not final: what\n", 526 | "not final: watch\n", 527 | "not final: why should\n", 528 | "not final: why should we\n", 529 | "not final: why should one\n", 530 | "not final: why should one who\n", 531 | "not final: why should one have\n", 532 | "not final: why should\n", 533 | "not final: why should\n", 534 | "not final: why should\n", 535 | "not final: why should\n", 536 | "not final: why should one\n", 537 | "not final: why should one\n", 538 | "not final: why should one\n", 539 | "not final: why should one\n", 540 | "not final: why should one halt\n", 541 | "not final: why should one halt on\n", 542 | "not final: why should one halt on the\n", 543 | "final: why should one halt on the way\n", 544 | "google-cloud-streaming-stt: \"why should one halt on the way\"\n", 545 | "\n", 546 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 547 | "interim results: \n", 548 | "not final: you're\n", 549 | "not final: your pie\n", 550 | "not final: your power\n", 551 | "not final: your power is\n", 552 | "not final: your power is so\n", 553 | "not final: your power is a\n", 554 | "not final: your\n", 555 | "not final: your power\n", 556 | "not final: your power\n", 557 | "not final: your power is\n", 558 | "not final: your power is\n", 559 | "not final: your power is\n", 560 | "not final: your power is\n", 561 | "not final: your power is sufficient\n", 562 | "final: your power is sufficient I said\n", 563 | "google-cloud-streaming-stt: \"your power is sufficient I said\"\n" 564 | ] 565 | } 566 | ], 567 | "source": [ 568 | "from google.cloud import speech\n", 569 | "from google.cloud.speech import enums\n", 570 | "from google.cloud.speech import types\n", 571 | "\n", 572 | "def response_stream_processor(responses):\n", 573 | " print('interim results: ')\n", 574 | "\n", 575 | " transcript = ''\n", 576 | " num_chars_printed = 0\n", 577 | " for response in responses:\n", 578 | " if not response.results:\n", 579 | " continue\n", 580 | "\n", 581 | " result = response.results[0]\n", 582 | " if not result.alternatives:\n", 583 | " continue\n", 584 | "\n", 585 | " transcript = result.alternatives[0].transcript\n", 586 | " print('{0}final: {1}'.format(\n", 587 | " '' if result.is_final else 'not ',\n", 588 | " transcript\n", 589 | " ))\n", 590 | "\n", 591 | " return transcript\n", 592 | "\n", 593 | "def google_streaming_stt(filename: str, lang: str, encoding: str) -> str:\n", 594 | " buffer, rate = read_wav_file(filename)\n", 595 | "\n", 596 | " client = speech.SpeechClient()\n", 597 | "\n", 598 | " config = types.RecognitionConfig(\n", 599 | " 
encoding=enums.RecognitionConfig.AudioEncoding[encoding],\n", 600 | " sample_rate_hertz=rate,\n", 601 | " language_code=lang\n", 602 | " )\n", 603 | "\n", 604 | " streaming_config = types.StreamingRecognitionConfig(\n", 605 | " config=config,\n", 606 | " interim_results=True\n", 607 | " )\n", 608 | "\n", 609 | " audio_generator = simulate_stream(buffer) # buffer chunk generator\n", 610 | " requests = (types.StreamingRecognizeRequest(audio_content=chunk) for chunk in audio_generator)\n", 611 | " responses = client.streaming_recognize(streaming_config, requests)\n", 612 | " # Now, put the transcription responses to use.\n", 613 | " return response_stream_processor(responses)\n", 614 | "\n", 615 | "# Run tests\n", 616 | "for t in TESTCASES:\n", 617 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 618 | " print('google-cloud-streaming-stt: \"{}\"'.format(\n", 619 | " google_streaming_stt(t['filename'], t['lang'], t['encoding'])\n", 620 | " ))" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": { 626 | "id": "S8Fg2BE75Qoo" 627 | }, 628 | "source": [ 629 | "\n", 630 | "---\n", 631 | "\n", 632 | "# Microsoft Azure\n", 633 | "\n", 634 | "Microsoft Azure [Speech Services](https://azure.microsoft.com/en-in/services/cognitive-services/speech-services/) have [Speech to Text](https://azure.microsoft.com/en-in/services/cognitive-services/speech-to-text/) service." 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": { 640 | "id": "mk8NgzQIlwwX" 641 | }, 642 | "source": [ 643 | "## Setup\n", 644 | "\n", 645 | "1. **Install azure speech package**" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": { 652 | "colab": { 653 | "base_uri": "https://localhost:8080/", 654 | "height": 121 655 | }, 656 | "id": "2A5YJHlswQSs", 657 | "outputId": "4f2e7f65-dfcc-450e-cb26-c9ca780957c7" 658 | }, 659 | "outputs": [ 660 | { 661 | "name": "stdout", 662 | "output_type": "stream", 663 | "text": [ 664 | "Collecting azure-cognitiveservices-speech\n", 665 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a0/d8/690896a3543b7bed058029b1b3450f4ce2e952d19347663fe570e6dec72c/azure_cognitiveservices_speech-1.9.0-cp36-cp36m-manylinux1_x86_64.whl (3.9MB)\n", 666 | "\u001b[K |████████████████████████████████| 3.9MB 3.4MB/s \n", 667 | "\u001b[?25hInstalling collected packages: azure-cognitiveservices-speech\n", 668 | "Successfully installed azure-cognitiveservices-speech-1.9.0\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "!pip3 install azure-cognitiveservices-speech" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": { 679 | "id": "K4ME2jnAimEQ" 680 | }, 681 | "source": [ 682 | "2. **Set service credentials**\n", 683 | "\n", 684 | "You can enable Speech service and find credentials for your account at [Microsoft Azure portal](https://portal.azure.com/). You can open a free account [here](https://azure.microsoft.com/en-in/free/ai/)." 
685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": null, 690 | "metadata": { 691 | "id": "rSqzFx-lwyz7" 692 | }, 693 | "outputs": [], 694 | "source": [ 695 | "AZURE_SPEECH_KEY = 'YOUR AZURE SPEECH KEY'\n", 696 | "AZURE_SERVICE_REGION = 'YOUR AZURE SERVICE REGION'" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": { 702 | "id": "ZVvMt_qylUjF" 703 | }, 704 | "source": [ 705 | "## Batch API" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": null, 711 | "metadata": { 712 | "colab": { 713 | "base_uri": "https://localhost:8080/", 714 | "height": 168 715 | }, 716 | "id": "rRMjNB68wYYN", 717 | "outputId": "7539fb6e-1625-4931-f981-96874628c934" 718 | }, 719 | "outputs": [ 720 | { 721 | "name": "stdout", 722 | "output_type": "stream", 723 | "text": [ 724 | "\n", 725 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 726 | "azure-batch-stt: \"Experience proves this.\"\n", 727 | "\n", 728 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 729 | "azure-batch-stt: \"Whi should one halt on the way.\"\n", 730 | "\n", 731 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 732 | "azure-batch-stt: \"Your power is sufficient I said.\"\n" 733 | ] 734 | } 735 | ], 736 | "source": [ 737 | "import azure.cognitiveservices.speech as speechsdk\n", 738 | "\n", 739 | "def azure_batch_stt(filename: str, lang: str, encoding: str) -> str:\n", 740 | " speech_config = speechsdk.SpeechConfig(\n", 741 | " subscription=AZURE_SPEECH_KEY,\n", 742 | " region=AZURE_SERVICE_REGION\n", 743 | " )\n", 744 | " audio_input = speechsdk.AudioConfig(filename=filename)\n", 745 | " speech_recognizer = speechsdk.SpeechRecognizer(\n", 746 | " speech_config=speech_config,\n", 747 | " audio_config=audio_input\n", 748 | " )\n", 749 | " result = speech_recognizer.recognize_once()\n", 750 | "\n", 751 | " return result.text if result.reason == speechsdk.ResultReason.RecognizedSpeech else None\n", 752 | "\n", 753 | "# Run tests\n", 754 | "for t in TESTCASES:\n", 755 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 756 | " print('azure-batch-stt: \"{}\"'.format(\n", 757 | " azure_batch_stt(t['filename'], t['lang'], t['encoding'])\n", 758 | " ))" 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": { 764 | "id": "aXd7OC7plbAu" 765 | }, 766 | "source": [ 767 | "## Streaming API" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": { 774 | "colab": { 775 | "base_uri": "https://localhost:8080/", 776 | "height": 521 777 | }, 778 | "id": "IzfBW4kczY9l", 779 | "outputId": "8694efbf-5886-4359-ec6c-5b6f3c970372" 780 | }, 781 | "outputs": [ 782 | { 783 | "name": "stdout", 784 | "output_type": "stream", 785 | "text": [ 786 | "\n", 787 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 788 | "interim text: \"experience\"\n", 789 | "interim text: \"experienced\"\n", 790 | "interim text: \"experience\"\n", 791 | "interim text: \"experience proves\"\n", 792 | "interim text: \"experience proves this\"\n", 793 | "azure-streaming-stt: \"Experience proves this.\"\n", 794 | "\n", 795 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 796 | "interim text: \"huaisheng\"\n", 797 | "interim text: \"white\"\n", 798 | "interim text: \"whi should\"\n", 799 | "interim text: \"whi 
should one\"\n", 800 | "interim text: \"whi should one halt\"\n", 801 | "interim text: \"whi should one halt on\"\n", 802 | "interim text: \"whi should one halt on the\"\n", 803 | "interim text: \"whi should one halt on the way\"\n", 804 | "azure-streaming-stt: \"Whi should one halt on the way.\"\n", 805 | "\n", 806 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 807 | "interim text: \"you're\"\n", 808 | "interim text: \"your\"\n", 809 | "interim text: \"your power\"\n", 810 | "interim text: \"your\"\n", 811 | "interim text: \"your power is\"\n", 812 | "interim text: \"your power is sufficient\"\n", 813 | "interim text: \"your power is sufficient i\"\n", 814 | "interim text: \"your power is sufficient i said\"\n", 815 | "azure-streaming-stt: \"Your power is sufficient I said.\"\n" 816 | ] 817 | } 818 | ], 819 | "source": [ 820 | "import time\n", 821 | "import azure.cognitiveservices.speech as speechsdk\n", 822 | "\n", 823 | "def azure_streaming_stt(filename: str, lang: str, encoding: str) -> str:\n", 824 | " speech_config = speechsdk.SpeechConfig(\n", 825 | " subscription=AZURE_SPEECH_KEY,\n", 826 | " region=AZURE_SERVICE_REGION\n", 827 | " )\n", 828 | " stream = speechsdk.audio.PushAudioInputStream()\n", 829 | " audio_config = speechsdk.audio.AudioConfig(stream=stream)\n", 830 | " speech_recognizer = speechsdk.SpeechRecognizer(\n", 831 | " speech_config=speech_config,\n", 832 | " audio_config=audio_config\n", 833 | " )\n", 834 | "\n", 835 | " # Connect callbacks to the events fired by the speech recognizer\n", 836 | " speech_recognizer.recognizing.connect(lambda evt: print('interim text: \"{}\"'.format(evt.result.text)))\n", 837 | " speech_recognizer.recognized.connect(lambda evt: print('azure-streaming-stt: \"{}\"'.format(evt.result.text)))\n", 838 | "\n", 839 | " # start continuous speech recognition\n", 840 | " speech_recognizer.start_continuous_recognition()\n", 841 | "\n", 842 | " # push buffer chunks to stream\n", 843 | " buffer, rate = read_wav_file(filename)\n", 844 | " audio_generator = simulate_stream(buffer)\n", 845 | " for chunk in audio_generator:\n", 846 | " stream.write(chunk)\n", 847 | " time.sleep(0.1) # to give callback a chance against this fast loop\n", 848 | "\n", 849 | " # stop continuous speech recognition\n", 850 | " stream.close()\n", 851 | " time.sleep(0.5) # give chance to VAD to kick in\n", 852 | " speech_recognizer.stop_continuous_recognition()\n", 853 | " time.sleep(0.5) # Let all callback run\n", 854 | "\n", 855 | "# Run tests\n", 856 | "for t in TESTCASES:\n", 857 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 858 | " azure_streaming_stt(t['filename'], t['lang'], t['encoding'])" 859 | ] 860 | }, 861 | { 862 | "cell_type": "markdown", 863 | "metadata": { 864 | "id": "5ASpAymRMzOz" 865 | }, 866 | "source": [ 867 | "---\n", 868 | "\n", 869 | "# IBM Watson\n", 870 | "\n", 871 | "For IBM [Watson Speech to Text](https://www.ibm.com/in-en/cloud/watson-speech-to-text) is ASR service with .NET, Go, JavaScript, [Python](https://cloud.ibm.com/apidocs/speech-to-text/speech-to-text?code=python), Ruby, Swift and Unity API libraries, as well as REST endpoints.\n" 872 | ] 873 | }, 874 | { 875 | "cell_type": "markdown", 876 | "metadata": { 877 | "id": "atuGghM2RxWd" 878 | }, 879 | "source": [ 880 | "## Setup\n", 881 | "\n", 882 | "1. 
**Install IBM Watson package**" 883 | ] 884 | }, 885 | { 886 | "cell_type": "code", 887 | "execution_count": null, 888 | "metadata": { 889 | "colab": { 890 | "base_uri": "https://localhost:8080/", 891 | "height": 490 892 | }, 893 | "id": "nG5jW68yRWGk", 894 | "outputId": "b3beae11-4775-430c-8f5b-57ad5809bd93" 895 | }, 896 | "outputs": [ 897 | { 898 | "name": "stdout", 899 | "output_type": "stream", 900 | "text": [ 901 | "Collecting ibm-watson\n", 902 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/da/f4/7e256026ee22c75a630c6de53eb45b6fef4840ac6728b80a92dd2e523a1a/ibm-watson-4.2.1.tar.gz (348kB)\n", 903 | "\u001b[K |████████████████████████████████| 358kB 3.4MB/s \n", 904 | "\u001b[?25hRequirement already satisfied: requests<3.0,>=2.0 in /usr/local/lib/python3.6/dist-packages (from ibm-watson) (2.21.0)\n", 905 | "Requirement already satisfied: python_dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from ibm-watson) (2.6.1)\n", 906 | "Collecting websocket-client==0.48.0\n", 907 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/8a/a1/72ef9aa26cfe1a75cee09fc1957e4723add9de098c15719416a1ee89386b/websocket_client-0.48.0-py2.py3-none-any.whl (198kB)\n", 908 | "\u001b[K |████████████████████████████████| 204kB 43.6MB/s \n", 909 | "\u001b[?25hCollecting ibm_cloud_sdk_core==1.5.1\n", 910 | " Downloading https://files.pythonhosted.org/packages/b7/f6/10d5271c807d73d236e6ae07b68035fed78b28b5ab836704d34097af3986/ibm-cloud-sdk-core-1.5.1.tar.gz\n", 911 | "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0,>=2.0->ibm-watson) (2.8)\n", 912 | "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0,>=2.0->ibm-watson) (1.24.3)\n", 913 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0,>=2.0->ibm-watson) (3.0.4)\n", 914 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0,>=2.0->ibm-watson) (2019.11.28)\n", 915 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python_dateutil>=2.5.3->ibm-watson) (1.12.0)\n", 916 | "Collecting PyJWT>=1.7.1\n", 917 | " Downloading https://files.pythonhosted.org/packages/87/8b/6a9f14b5f781697e51259d81657e6048fd31a113229cf346880bb7545565/PyJWT-1.7.1-py2.py3-none-any.whl\n", 918 | "Building wheels for collected packages: ibm-watson, ibm-cloud-sdk-core\n", 919 | " Building wheel for ibm-watson (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 920 | " Created wheel for ibm-watson: filename=ibm_watson-4.2.1-cp36-none-any.whl size=343298 sha256=3fcdea1185ceb522ed5f080ad4d66048d9286cd28e8d9bc86094b08a84cb6211\n", 921 | " Stored in directory: /root/.cache/pip/wheels/ce/4d/6e/ae352b7c7acdddf073aeb06617fbfeefaea9fcb6d7ae98800b\n", 922 | " Building wheel for ibm-cloud-sdk-core (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 923 | " Created wheel for ibm-cloud-sdk-core: filename=ibm_cloud_sdk_core-1.5.1-cp36-none-any.whl size=44492 sha256=8fbd5fdfa4ca15217877ee44671387c23ce61f390cd15d8006200d502d56dc63\n", 924 | " Stored in directory: /root/.cache/pip/wheels/6a/42/50/f96888116b329578304f9dda4693cef6f3e76e18272d22cb6c\n", 925 | "Successfully built ibm-watson ibm-cloud-sdk-core\n", 926 | "Installing collected packages: websocket-client, PyJWT, ibm-cloud-sdk-core, ibm-watson\n", 927 | "Successfully installed PyJWT-1.7.1 ibm-cloud-sdk-core-1.5.1 ibm-watson-4.2.1 websocket-client-0.48.0\n" 928 | ] 929 | } 930 | ], 931 | "source": [ 932 | "!pip install ibm-watson" 933 | ] 934 | }, 935 | { 936 | "cell_type": "markdown", 937 | "metadata": { 938 | "id": "bntnwqJ3Q99Z" 939 | }, 940 | "source": [ 941 | "2. **Set service credentials**\n", 942 | "\n", 943 | "You will need to [sign up/in](https://cloud.ibm.com/docs/services/text-to-speech?topic=text-to-speech-gettingStarted), and get API key credential and service URL, and fill it below." 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": null, 949 | "metadata": { 950 | "id": "cdl6Y7MJPtoT" 951 | }, 952 | "outputs": [], 953 | "source": [ 954 | "WATSON_API_KEY = 'YOUR WATSON API KEY'\n", 955 | "WATSON_STT_URL = 'YOUR WATSON SERVICE URL'" 956 | ] 957 | }, 958 | { 959 | "cell_type": "markdown", 960 | "metadata": { 961 | "id": "8jqxI2XrRmKz" 962 | }, 963 | "source": [ 964 | "## Batch API" 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": null, 970 | "metadata": { 971 | "colab": { 972 | "base_uri": "https://localhost:8080/", 973 | "height": 168 974 | }, 975 | "id": "QFWX40PYRogi", 976 | "outputId": "6feb0b03-8ca4-485e-bf25-5042340f8ed7" 977 | }, 978 | "outputs": [ 979 | { 980 | "name": "stdout", 981 | "output_type": "stream", 982 | "text": [ 983 | "\n", 984 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 985 | "watson-batch-stt: \"experience proves this \"\n", 986 | "\n", 987 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 988 | "watson-batch-stt: \"why should one hold on the way \"\n", 989 | "\n", 990 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 991 | "watson-batch-stt: \"your power is sufficient I set \"\n" 992 | ] 993 | } 994 | ], 995 | "source": [ 996 | "import os\n", 997 | "\n", 998 | "from ibm_watson import SpeechToTextV1\n", 999 | "from ibm_cloud_sdk_core.authenticators import IAMAuthenticator\n", 1000 | "\n", 1001 | "def watson_batch_stt(filename: str, lang: str, encoding: str) -> str:\n", 1002 | " authenticator = IAMAuthenticator(WATSON_API_KEY)\n", 1003 | " speech_to_text = SpeechToTextV1(authenticator=authenticator)\n", 1004 | " speech_to_text.set_service_url(WATSON_STT_URL)\n", 1005 | "\n", 1006 | " with open(filename, 'rb') as audio_file:\n", 1007 | " response = speech_to_text.recognize(\n", 1008 | " audio=audio_file,\n", 1009 | " content_type='audio/{}'.format(os.path.splitext(filename)[1][1:]),\n", 1010 | " model=lang + '_BroadbandModel',\n", 1011 | " max_alternatives=3,\n", 1012 | " ).get_result()\n", 1013 | "\n", 1014 | " return response['results'][0]['alternatives'][0]['transcript']\n", 1015 | "\n", 1016 | "# Run tests\n", 1017 | "for t in TESTCASES:\n", 1018 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 1019 | " print('watson-batch-stt: \"{}\"'.format(\n", 1020 | " 
watson_batch_stt(t['filename'], t['lang'], t['encoding'])\n", 1021 | " ))" 1022 | ] 1023 | }, 1024 | { 1025 | "cell_type": "markdown", 1026 | "metadata": { 1027 | "id": "vOsBdku-RpB-" 1028 | }, 1029 | "source": [ 1030 | "## Streaming API\n", 1031 | "\n", 1032 | "Streaming API works over websocket." 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": null, 1038 | "metadata": { 1039 | "colab": { 1040 | "base_uri": "https://localhost:8080/", 1041 | "height": 672 1042 | }, 1043 | "id": "Pwb0uZjPR0rX", 1044 | "outputId": "97e95e3f-053e-46b7-c459-13697a6eb872" 1045 | }, 1046 | "outputs": [ 1047 | { 1048 | "name": "stdout", 1049 | "output_type": "stream", 1050 | "text": [ 1051 | "\n", 1052 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 1053 | "not final: X. \n", 1054 | "not final: experts \n", 1055 | "not final: experience \n", 1056 | "not final: experienced \n", 1057 | "not final: experience prove \n", 1058 | "not final: experience proves \n", 1059 | "not final: experience proves that \n", 1060 | "not final: experience proves this \n", 1061 | "final: experience proves this \n", 1062 | "watson-cloud-streaming-stt: \"experience proves this \"\n", 1063 | "\n", 1064 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 1065 | "not final: why \n", 1066 | "not final: what \n", 1067 | "not final: why should \n", 1068 | "not final: why should we \n", 1069 | "not final: why should one \n", 1070 | "not final: why should one whole \n", 1071 | "not final: why should one hold \n", 1072 | "not final: why should one hold on \n", 1073 | "not final: why should one hold on the \n", 1074 | "not final: why should one hold on the way \n", 1075 | "final: why should one hold on the way \n", 1076 | "watson-cloud-streaming-stt: \"why should one hold on the way \"\n", 1077 | "\n", 1078 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 1079 | "not final: your \n", 1080 | "not final: your power \n", 1081 | "not final: your power is \n", 1082 | "not final: your power is the \n", 1083 | "not final: your power is sufficient \n", 1084 | "not final: your power is sufficient I \n", 1085 | "not final: your power is sufficient I saw \n", 1086 | "not final: your power is sufficient I said \n", 1087 | "not final: your power is sufficient I set \n", 1088 | "final: your power is sufficient I set \n", 1089 | "watson-cloud-streaming-stt: \"your power is sufficient I set \"\n" 1090 | ] 1091 | } 1092 | ], 1093 | "source": [ 1094 | "import json\n", 1095 | "import logging\n", 1096 | "import os\n", 1097 | "from queue import Queue\n", 1098 | "from threading import Thread\n", 1099 | "import time\n", 1100 | "\n", 1101 | "from ibm_watson import SpeechToTextV1\n", 1102 | "from ibm_watson.websocket import RecognizeCallback, AudioSource\n", 1103 | "from ibm_cloud_sdk_core.authenticators import IAMAuthenticator\n", 1104 | "\n", 1105 | "# Watson websocket prints justs too many debug logs, so disable it\n", 1106 | "logging.disable(logging.CRITICAL)\n", 1107 | "\n", 1108 | "# Chunk and buffer size\n", 1109 | "CHUNK_SIZE = 4096\n", 1110 | "BUFFER_MAX_ELEMENT = 10\n", 1111 | "\n", 1112 | "# A callback class to process various streaming STT events\n", 1113 | "class MyRecognizeCallback(RecognizeCallback):\n", 1114 | " def __init__(self):\n", 1115 | " RecognizeCallback.__init__(self)\n", 1116 | " self.transcript = None\n", 1117 | "\n", 1118 | " def on_transcription(self, transcript):\n", 1119 | " 
# print('transcript: {}'.format(transcript))\n", 1120 | " pass\n", 1121 | "\n", 1122 | " def on_connected(self):\n", 1123 | " # print('Connection was successful')\n", 1124 | " pass\n", 1125 | "\n", 1126 | " def on_error(self, error):\n", 1127 | " # print('Error received: {}'.format(error))\n", 1128 | " pass\n", 1129 | "\n", 1130 | " def on_inactivity_timeout(self, error):\n", 1131 | " # print('Inactivity timeout: {}'.format(error))\n", 1132 | " pass\n", 1133 | "\n", 1134 | " def on_listening(self):\n", 1135 | " # print('Service is listening')\n", 1136 | " pass\n", 1137 | "\n", 1138 | " def on_hypothesis(self, hypothesis):\n", 1139 | " # print('hypothesis: {}'.format(hypothesis))\n", 1140 | " pass\n", 1141 | "\n", 1142 | " def on_data(self, data):\n", 1143 | " self.transcript = data['results'][0]['alternatives'][0]['transcript']\n", 1144 | " print('{0}final: {1}'.format(\n", 1145 | " '' if data['results'][0]['final'] else 'not ',\n", 1146 | " self.transcript\n", 1147 | " ))\n", 1148 | "\n", 1149 | " def on_close(self):\n", 1150 | " # print(\"Connection closed\")\n", 1151 | " pass\n", 1152 | "\n", 1153 | "def watson_streaming_stt(filename: str, lang: str, encoding: str) -> str:\n", 1154 | " authenticator = IAMAuthenticator(WATSON_API_KEY)\n", 1155 | " speech_to_text = SpeechToTextV1(authenticator=authenticator)\n", 1156 | " speech_to_text.set_service_url(WATSON_STT_URL)\n", 1157 | "\n", 1158 | " # Make watson audio source fed by a buffer queue\n", 1159 | " buffer_queue = Queue(maxsize=BUFFER_MAX_ELEMENT)\n", 1160 | " audio_source = AudioSource(buffer_queue, True, True)\n", 1161 | "\n", 1162 | " # Callback object\n", 1163 | " mycallback = MyRecognizeCallback()\n", 1164 | "\n", 1165 | " # Read the file\n", 1166 | " buffer, rate = read_wav_file(filename)\n", 1167 | "\n", 1168 | " # Start Speech-to-Text recognition thread\n", 1169 | " stt_stream_thread = Thread(\n", 1170 | " target=speech_to_text.recognize_using_websocket,\n", 1171 | " kwargs={\n", 1172 | " 'audio': audio_source,\n", 1173 | " 'content_type': 'audio/l16; rate={}'.format(rate),\n", 1174 | " 'recognize_callback': mycallback,\n", 1175 | " 'interim_results': True\n", 1176 | " }\n", 1177 | " )\n", 1178 | " stt_stream_thread.start()\n", 1179 | "\n", 1180 | " # Simulation audio stream by breaking file into chunks and filling buffer queue\n", 1181 | " audio_generator = simulate_stream(buffer, CHUNK_SIZE)\n", 1182 | " for chunk in audio_generator:\n", 1183 | " buffer_queue.put(chunk)\n", 1184 | " time.sleep(0.5) # give a chance to callback\n", 1185 | "\n", 1186 | " # Close the audio feed and wait for STTT thread to complete\n", 1187 | " audio_source.completed_recording()\n", 1188 | " stt_stream_thread.join()\n", 1189 | "\n", 1190 | " # send final result\n", 1191 | " return mycallback.transcript\n", 1192 | "\n", 1193 | "# Run tests\n", 1194 | "for t in TESTCASES:\n", 1195 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 1196 | " print('watson-cloud-streaming-stt: \"{}\"'.format(\n", 1197 | " watson_streaming_stt(t['filename'], t['lang'], t['encoding'])\n", 1198 | " ))" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "markdown", 1203 | "metadata": { 1204 | "id": "M7W8nsP45IUx" 1205 | }, 1206 | "source": [ 1207 | "\n", 1208 | "---\n", 1209 | "\n", 1210 | "# CMU Sphinx\n", 1211 | "\n", 1212 | "[CMUSphinx](https://cmusphinx.github.io/) is has been around for quite some time, and has been adapting to advancements in ASR technologies. 
[PocketSphinx](https://github.com/cmusphinx/pocketsphinx-python) is speech-to-text decoder software package." 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "markdown", 1217 | "metadata": { 1218 | "id": "eSdcTyoTl3XL" 1219 | }, 1220 | "source": [ 1221 | "## Setup\n", 1222 | "\n", 1223 | "1. **Install swig**\n", 1224 | "\n", 1225 | "For macOS:" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "code", 1230 | "execution_count": null, 1231 | "metadata": { 1232 | "id": "nMWqogsSSk2H" 1233 | }, 1234 | "outputs": [], 1235 | "source": [ 1236 | "!brew install swig\n", 1237 | "!swig -version" 1238 | ] 1239 | }, 1240 | { 1241 | "cell_type": "markdown", 1242 | "metadata": { 1243 | "id": "tn_LqRxjSoMT" 1244 | }, 1245 | "source": [ 1246 | "For Linux:" 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": null, 1252 | "metadata": { 1253 | "colab": { 1254 | "base_uri": "https://localhost:8080/", 1255 | "height": 806 1256 | }, 1257 | "id": "ID2AUZX4SqkU", 1258 | "outputId": "01770ecb-4b8d-4047-c1a2-2f1490c2f74b" 1259 | }, 1260 | "outputs": [ 1261 | { 1262 | "name": "stdout", 1263 | "output_type": "stream", 1264 | "text": [ 1265 | "Reading package lists... Done\n", 1266 | "Building dependency tree \n", 1267 | "Reading state information... Done\n", 1268 | "The following package was automatically installed and is no longer required:\n", 1269 | " libnvidia-common-430\n", 1270 | "Use 'apt autoremove' to remove it.\n", 1271 | "The following additional packages will be installed:\n", 1272 | " libpulse-mainloop-glib0 swig3.0\n", 1273 | "Suggested packages:\n", 1274 | " swig-doc swig-examples swig3.0-examples swig3.0-doc\n", 1275 | "The following NEW packages will be installed:\n", 1276 | " libpulse-dev libpulse-mainloop-glib0 swig swig3.0\n", 1277 | "0 upgraded, 4 newly installed, 0 to remove and 7 not upgraded.\n", 1278 | "Need to get 1,204 kB of archives.\n", 1279 | "After this operation, 6,538 kB of additional disk space will be used.\n", 1280 | "Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libpulse-mainloop-glib0 amd64 1:11.1-1ubuntu7.4 [22.1 kB]\n", 1281 | "Get:2 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libpulse-dev amd64 1:11.1-1ubuntu7.4 [81.5 kB]\n", 1282 | "Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]\n", 1283 | "Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,460 B]\n", 1284 | "Fetched 1,204 kB in 1s (1,336 kB/s)\n", 1285 | "Selecting previously unselected package libpulse-mainloop-glib0:amd64.\n", 1286 | "(Reading database ... 
145674 files and directories currently installed.)\n", 1287 | "Preparing to unpack .../libpulse-mainloop-glib0_1%3a11.1-1ubuntu7.4_amd64.deb ...\n", 1288 | "Unpacking libpulse-mainloop-glib0:amd64 (1:11.1-1ubuntu7.4) ...\n", 1289 | "Selecting previously unselected package libpulse-dev:amd64.\n", 1290 | "Preparing to unpack .../libpulse-dev_1%3a11.1-1ubuntu7.4_amd64.deb ...\n", 1291 | "Unpacking libpulse-dev:amd64 (1:11.1-1ubuntu7.4) ...\n", 1292 | "Selecting previously unselected package swig3.0.\n", 1293 | "Preparing to unpack .../swig3.0_3.0.12-1_amd64.deb ...\n", 1294 | "Unpacking swig3.0 (3.0.12-1) ...\n", 1295 | "Selecting previously unselected package swig.\n", 1296 | "Preparing to unpack .../swig_3.0.12-1_amd64.deb ...\n", 1297 | "Unpacking swig (3.0.12-1) ...\n", 1298 | "Setting up libpulse-mainloop-glib0:amd64 (1:11.1-1ubuntu7.4) ...\n", 1299 | "Setting up libpulse-dev:amd64 (1:11.1-1ubuntu7.4) ...\n", 1300 | "Setting up swig3.0 (3.0.12-1) ...\n", 1301 | "Setting up swig (3.0.12-1) ...\n", 1302 | "Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", 1303 | "Processing triggers for libc-bin (2.27-3ubuntu1) ...\n", 1304 | "\n", 1305 | "SWIG Version 3.0.12\n", 1306 | "\n", 1307 | "Compiled with g++ [x86_64-pc-linux-gnu]\n", 1308 | "\n", 1309 | "Configured options: +pcre\n", 1310 | "\n", 1311 | "Please see http://www.swig.org for reporting bugs and further information\n" 1312 | ] 1313 | } 1314 | ], 1315 | "source": [ 1316 | "!apt-get install -y swig libpulse-dev\n", 1317 | "!swig -version" 1318 | ] 1319 | }, 1320 | { 1321 | "cell_type": "markdown", 1322 | "metadata": { 1323 | "id": "yZlvnjsfSu-3" 1324 | }, 1325 | "source": [ 1326 | "2. **Install poocketsphinx using pip**" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "code", 1331 | "execution_count": null, 1332 | "metadata": { 1333 | "colab": { 1334 | "base_uri": "https://localhost:8080/", 1335 | "height": 222 1336 | }, 1337 | "id": "XzOkKfgKS789", 1338 | "outputId": "116c7a60-cf24-4ba9-f3ac-ab09a5da0145" 1339 | }, 1340 | "outputs": [ 1341 | { 1342 | "name": "stdout", 1343 | "output_type": "stream", 1344 | "text": [ 1345 | "Collecting pocketsphinx\n", 1346 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/cd/4a/adea55f189a81aed88efa0b0e1d25628e5ed22622ab9174bf696dd4f9474/pocketsphinx-0.1.15.tar.gz (29.1MB)\n", 1347 | "\u001b[K |████████████████████████████████| 29.1MB 102kB/s \n", 1348 | "\u001b[?25hBuilding wheels for collected packages: pocketsphinx\n", 1349 | " Building wheel for pocketsphinx (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 1350 | " Created wheel for pocketsphinx: filename=pocketsphinx-0.1.15-cp36-cp36m-linux_x86_64.whl size=30126870 sha256=d111bc1a768251e9b8b4bea71f05b498955eda209f5d5650f7e68cc336bb5075\n", 1351 | " Stored in directory: /root/.cache/pip/wheels/52/fd/52/2f62c9a0036940cc0c89e58ee0b9d00fcf78243aeaf416265f\n", 1352 | "Successfully built pocketsphinx\n", 1353 | "Installing collected packages: pocketsphinx\n", 1354 | "Successfully installed pocketsphinx-0.1.15\n", 1355 | "pocketsphinx 0.1.15 \n" 1356 | ] 1357 | } 1358 | ], 1359 | "source": [ 1360 | "!pip3 install pocketsphinx\n", 1361 | "!pip3 list | grep pocketsphinx" 1362 | ] 1363 | }, 1364 | { 1365 | "cell_type": "markdown", 1366 | "metadata": { 1367 | "id": "weYD8oA-S-vu" 1368 | }, 1369 | "source": [ 1370 | "## Create Decoder object" 1371 | ] 1372 | }, 1373 | { 1374 | "cell_type": "code", 1375 | "execution_count": null, 1376 | "metadata": { 1377 | "id": "TEpNoVUiTK4k" 1378 | }, 1379 | "outputs": [], 1380 | "source": [ 1381 | "import pocketsphinx\n", 1382 | "import os\n", 1383 | "\n", 1384 | "MODELDIR = os.path.join(os.path.dirname(pocketsphinx.__file__), 'model')\n", 1385 | "\n", 1386 | "config = pocketsphinx.Decoder.default_config()\n", 1387 | "config.set_string('-hmm', os.path.join(MODELDIR, 'en-us'))\n", 1388 | "config.set_string('-lm', os.path.join(MODELDIR, 'en-us.lm.bin'))\n", 1389 | "config.set_string('-dict', os.path.join(MODELDIR, 'cmudict-en-us.dict'))\n", 1390 | "\n", 1391 | "decoder = pocketsphinx.Decoder(config)" 1392 | ] 1393 | }, 1394 | { 1395 | "cell_type": "markdown", 1396 | "metadata": { 1397 | "id": "a6s9CCA9WvIZ" 1398 | }, 1399 | "source": [ 1400 | "## Batch API" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "metadata": { 1407 | "colab": { 1408 | "base_uri": "https://localhost:8080/", 1409 | "height": 168 1410 | }, 1411 | "id": "klStePTBWxO7", 1412 | "outputId": "c06996f3-5da9-4b1a-a5bc-a18c584da3e8" 1413 | }, 1414 | "outputs": [ 1415 | { 1416 | "name": "stdout", 1417 | "output_type": "stream", 1418 | "text": [ 1419 | "\n", 1420 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 1421 | "sphinx-batch-stt: \"experience proves this\"\n", 1422 | "\n", 1423 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 1424 | "sphinx-batch-stt: \"why should one hold on the way\"\n", 1425 | "\n", 1426 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 1427 | "sphinx-batch-stt: \"your paris sufficient i said\"\n" 1428 | ] 1429 | } 1430 | ], 1431 | "source": [ 1432 | "def sphinx_batch_stt(filename: str, lang: str, encoding: str) -> str:\n", 1433 | " buffer, rate = read_wav_file(filename)\n", 1434 | " decoder.start_utt()\n", 1435 | " decoder.process_raw(buffer, False, False)\n", 1436 | " decoder.end_utt()\n", 1437 | " hypothesis = decoder.hyp()\n", 1438 | " return hypothesis.hypstr\n", 1439 | "\n", 1440 | "# Run tests\n", 1441 | "for t in TESTCASES:\n", 1442 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 1443 | " print('sphinx-batch-stt: \"{}\"'.format(\n", 1444 | " sphinx_batch_stt(t['filename'], t['lang'], t['encoding'])\n", 1445 | " ))" 1446 | ] 1447 | }, 1448 | { 1449 | "cell_type": "markdown", 1450 | "metadata": { 1451 | "id": "gQJ82tkTTyX3" 1452 | }, 1453 | "source": [ 1454 | "## Streaming API" 1455 | ] 1456 | }, 1457 | { 1458 | "cell_type": "code", 1459 | "execution_count": null, 1460 
| "metadata": { 1461 | "colab": { 1462 | "base_uri": "https://localhost:8080/", 1463 | "height": 168 1464 | }, 1465 | "id": "iGfmRd6qTzq9", 1466 | "outputId": "65acfc30-d65e-432b-8abe-ed101ae4ee00" 1467 | }, 1468 | "outputs": [ 1469 | { 1470 | "name": "stdout", 1471 | "output_type": "stream", 1472 | "text": [ 1473 | "\n", 1474 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 1475 | "sphinx-streaming-stt: \"experience proves this\"\n", 1476 | "\n", 1477 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 1478 | "sphinx-streaming-stt: \"why should one hold on the way\"\n", 1479 | "\n", 1480 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 1481 | "sphinx-streaming-stt: \"your paris sufficient i said\"\n" 1482 | ] 1483 | } 1484 | ], 1485 | "source": [ 1486 | "def sphinx_streaming_stt(filename: str, lang: str, encoding: str) -> str:\n", 1487 | " buffer, rate = read_wav_file(filename)\n", 1488 | " audio_generator = simulate_stream(buffer)\n", 1489 | "\n", 1490 | " decoder.start_utt()\n", 1491 | " for chunk in audio_generator:\n", 1492 | " decoder.process_raw(chunk, False, False)\n", 1493 | " decoder.end_utt()\n", 1494 | "\n", 1495 | " hypothesis = decoder.hyp()\n", 1496 | " return hypothesis.hypstr\n", 1497 | "\n", 1498 | "# Run tests\n", 1499 | "for t in TESTCASES:\n", 1500 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 1501 | " print('sphinx-streaming-stt: \"{}\"'.format(\n", 1502 | " sphinx_streaming_stt(t['filename'], t['lang'], t['encoding'])\n", 1503 | " ))" 1504 | ] 1505 | }, 1506 | { 1507 | "cell_type": "markdown", 1508 | "metadata": { 1509 | "id": "awZEgZKG5cWg" 1510 | }, 1511 | "source": [ 1512 | "\n", 1513 | "---\n", 1514 | "\n", 1515 | "# Mozilla DeepSpeech\n", 1516 | "\n", 1517 | "Mozilla released [DeepSpeech 0.6](https://hacks.mozilla.org/2019/12/deepspeech-0-6-mozillas-speech-to-text-engine/) software package in December 2019 with [APIs](https://github.com/mozilla/DeepSpeech/releases/tag/v0.6.0) in C, Java, .NET, [Python](https://deepspeech.readthedocs.io/en/v0.6.0/Python-API.html), and JavaScript, including support for TensorFlow Lite models for use on edge devices." 1518 | ] 1519 | }, 1520 | { 1521 | "cell_type": "markdown", 1522 | "metadata": { 1523 | "id": "Ilmp9i-ql7V1" 1524 | }, 1525 | "source": [ 1526 | "## Setup\n", 1527 | "\n", 1528 | "1. **Install DeepSpeech**\n", 1529 | "\n", 1530 | "You can install DeepSpeech with pip (make it deepspeech-gpu==0.6.0 if you are using GPU in colab runtime)." 
1531 | ] 1532 | }, 1533 | { 1534 | "cell_type": "code", 1535 | "execution_count": null, 1536 | "metadata": { 1537 | "colab": { 1538 | "base_uri": "https://localhost:8080/", 1539 | "height": 138 1540 | }, 1541 | "id": "gbWPbs_27f3Y", 1542 | "outputId": "583f2b3c-cdea-4027-b859-13118fc4b538" 1543 | }, 1544 | "outputs": [ 1545 | { 1546 | "name": "stdout", 1547 | "output_type": "stream", 1548 | "text": [ 1549 | "Collecting deepspeech==0.6.0\n", 1550 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/26/f4/1ef0397097e8a8bbb7e24caabecbdb226b4e027e5018e9353ef65af14672/deepspeech-0.6.0-cp36-cp36m-manylinux1_x86_64.whl (9.6MB)\n", 1551 | "\u001b[K |████████████████████████████████| 9.6MB 3.0MB/s \n", 1552 | "\u001b[?25hRequirement already satisfied: numpy>=1.7.0 in /usr/local/lib/python3.6/dist-packages (from deepspeech==0.6.0) (1.17.5)\n", 1553 | "Installing collected packages: deepspeech\n", 1554 | "Successfully installed deepspeech-0.6.0\n" 1555 | ] 1556 | } 1557 | ], 1558 | "source": [ 1559 | "!pip install deepspeech==0.6.0" 1560 | ] 1561 | }, 1562 | { 1563 | "cell_type": "markdown", 1564 | "metadata": { 1565 | "id": "fIe7haLO7yo4" 1566 | }, 1567 | "source": [ 1568 | "2. **Download and unzip models**" 1569 | ] 1570 | }, 1571 | { 1572 | "cell_type": "code", 1573 | "execution_count": null, 1574 | "metadata": { 1575 | "colab": { 1576 | "base_uri": "https://localhost:8080/", 1577 | "height": 286 1578 | }, 1579 | "id": "eT-n1jLj8Ff4", 1580 | "outputId": "eb58aab5-aafe-4d3c-97dc-58ad4fd7e6b9" 1581 | }, 1582 | "outputs": [ 1583 | { 1584 | "name": "stdout", 1585 | "output_type": "stream", 1586 | "text": [ 1587 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 1588 | " Dload Upload Total Spent Left Speed\n", 1589 | "100 620 0 620 0 0 2857 0 --:--:-- --:--:-- --:--:-- 2857\n", 1590 | "100 1172M 100 1172M 0 0 48.9M 0 0:00:23 0:00:23 --:--:-- 56.8M\n", 1591 | "deepspeech-0.6.0-models/\n", 1592 | "deepspeech-0.6.0-models/lm.binary\n", 1593 | "deepspeech-0.6.0-models/output_graph.pbmm\n", 1594 | "deepspeech-0.6.0-models/output_graph.pb\n", 1595 | "deepspeech-0.6.0-models/trie\n", 1596 | "deepspeech-0.6.0-models/output_graph.tflite\n", 1597 | "total 1350664\n", 1598 | "-rw-r--r-- 1 501 staff 945699324 Dec 3 06:51 lm.binary\n", 1599 | "-rw-r--r-- 1 501 staff 188914896 Dec 3 09:03 output_graph.pb\n", 1600 | "-rw-r--r-- 1 501 staff 188915850 Dec 3 09:49 output_graph.pbmm\n", 1601 | "-rw-r--r-- 1 501 staff 47335752 Dec 3 09:05 output_graph.tflite\n", 1602 | "-rw-r--r-- 1 501 staff 12200736 Dec 3 06:51 trie\n" 1603 | ] 1604 | } 1605 | ], 1606 | "source": [ 1607 | "!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.6.0/deepspeech-0.6.0-models.tar.gz\n", 1608 | "!tar -xvzf deepspeech-0.6.0-models.tar.gz\n", 1609 | "!ls -l ./deepspeech-0.6.0-models/" 1610 | ] 1611 | }, 1612 | { 1613 | "cell_type": "markdown", 1614 | "metadata": { 1615 | "id": "uGGaM4wp8Ykp" 1616 | }, 1617 | "source": [ 1618 | "3. **Test that it all works**\n", 1619 | "\n", 1620 | "Examine the output of the last three commands, and you will see results *“experience proof less”*, *“why should one halt on the way”*, and *“your power is sufficient i said”* respectively. You are all set." 
1621 | ] 1622 | }, 1623 | { 1624 | "cell_type": "code", 1625 | "execution_count": null, 1626 | "metadata": { 1627 | "colab": { 1628 | "base_uri": "https://localhost:8080/", 1629 | "height": 222 1630 | }, 1631 | "id": "3pPnZssj8fPY", 1632 | "outputId": "5ebaeec2-f484-4047-9766-026a3f53d730" 1633 | }, 1634 | "outputs": [ 1635 | { 1636 | "name": "stdout", 1637 | "output_type": "stream", 1638 | "text": [ 1639 | "Loading model from file deepspeech-0.6.0-models/output_graph.pb\n", 1640 | "TensorFlow: v1.14.0-21-ge77504a\n", 1641 | "DeepSpeech: v0.6.0-0-g6d43e21\n", 1642 | "Warning: reading entire model file into memory. Transform model file into an mmapped graph to reduce heap usage.\n", 1643 | "2020-01-30 00:27:46.675441: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", 1644 | "Loaded model in 0.13s.\n", 1645 | "Loading language model from files deepspeech-0.6.0-models/lm.binary ./deepspeech-0.6.0-models/trie\n", 1646 | "Loaded language model in 0.000221s.\n", 1647 | "Running inference.\n", 1648 | "experience proof less\n", 1649 | "Inference took 2.418s for 1.975s audio file.\n" 1650 | ] 1651 | } 1652 | ], 1653 | "source": [ 1654 | "!deepspeech --model deepspeech-0.6.0-models/output_graph.pb --lm deepspeech-0.6.0-models/lm.binary --trie ./deepspeech-0.6.0-models/trie --audio ./audio/2830-3980-0043.wav" 1655 | ] 1656 | }, 1657 | { 1658 | "cell_type": "code", 1659 | "execution_count": null, 1660 | "metadata": { 1661 | "colab": { 1662 | "base_uri": "https://localhost:8080/", 1663 | "height": 222 1664 | }, 1665 | "id": "gvxm5RE68zu4", 1666 | "outputId": "84c877c7-d1fd-4bd9-ae96-56f63bf37dba" 1667 | }, 1668 | "outputs": [ 1669 | { 1670 | "name": "stdout", 1671 | "output_type": "stream", 1672 | "text": [ 1673 | "Loading model from file deepspeech-0.6.0-models/output_graph.pb\n", 1674 | "TensorFlow: v1.14.0-21-ge77504a\n", 1675 | "DeepSpeech: v0.6.0-0-g6d43e21\n", 1676 | "Warning: reading entire model file into memory. Transform model file into an mmapped graph to reduce heap usage.\n", 1677 | "2020-01-30 00:27:53.427469: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", 1678 | "Loaded model in 0.131s.\n", 1679 | "Loading language model from files deepspeech-0.6.0-models/lm.binary ./deepspeech-0.6.0-models/trie\n", 1680 | "Loaded language model in 0.000188s.\n", 1681 | "Running inference.\n", 1682 | "why should one halt on the way\n", 1683 | "Inference took 2.941s for 2.735s audio file.\n" 1684 | ] 1685 | } 1686 | ], 1687 | "source": [ 1688 | "!deepspeech --model deepspeech-0.6.0-models/output_graph.pb --lm deepspeech-0.6.0-models/lm.binary --trie ./deepspeech-0.6.0-models/trie --audio ./audio/4507-16021-0012.wav" 1689 | ] 1690 | }, 1691 | { 1692 | "cell_type": "code", 1693 | "execution_count": null, 1694 | "metadata": { 1695 | "colab": { 1696 | "base_uri": "https://localhost:8080/", 1697 | "height": 222 1698 | }, 1699 | "id": "1Hq_tEFQ8254", 1700 | "outputId": "7f4a4720-72da-442a-ea4d-d7f08a66ec0f" 1701 | }, 1702 | "outputs": [ 1703 | { 1704 | "name": "stdout", 1705 | "output_type": "stream", 1706 | "text": [ 1707 | "Loading model from file deepspeech-0.6.0-models/output_graph.pb\n", 1708 | "TensorFlow: v1.14.0-21-ge77504a\n", 1709 | "DeepSpeech: v0.6.0-0-g6d43e21\n", 1710 | "Warning: reading entire model file into memory. 
Transform model file into an mmapped graph to reduce heap usage.\n", 1711 | "2020-01-30 00:28:00.365841: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", 1712 | "Loaded model in 0.129s.\n", 1713 | "Loading language model from files deepspeech-0.6.0-models/lm.binary ./deepspeech-0.6.0-models/trie\n", 1714 | "Loaded language model in 0.000228s.\n", 1715 | "Running inference.\n", 1716 | "your power is sufficient i said\n", 1717 | "Inference took 2.839s for 2.590s audio file.\n" 1718 | ] 1719 | } 1720 | ], 1721 | "source": [ 1722 | "!deepspeech --model deepspeech-0.6.0-models/output_graph.pb --lm deepspeech-0.6.0-models/lm.binary --trie ./deepspeech-0.6.0-models/trie --audio ./audio/8455-210777-0068.wav" 1723 | ] 1724 | }, 1725 | { 1726 | "cell_type": "markdown", 1727 | "metadata": { 1728 | "id": "PTcABJ2c9CRa" 1729 | }, 1730 | "source": [ 1731 | "## Create model object" 1732 | ] 1733 | }, 1734 | { 1735 | "cell_type": "code", 1736 | "execution_count": null, 1737 | "metadata": { 1738 | "colab": { 1739 | "base_uri": "https://localhost:8080/", 1740 | "height": 34 1741 | }, 1742 | "id": "tU41WTEr9G-X", 1743 | "outputId": "8c4f73ad-f61f-4467-a3fa-23ef5375de74" 1744 | }, 1745 | "outputs": [ 1746 | { 1747 | "data": { 1748 | "text/plain": [ 1749 | "0" 1750 | ] 1751 | }, 1752 | "execution_count": 27, 1753 | "metadata": { 1754 | "tags": [] 1755 | }, 1756 | "output_type": "execute_result" 1757 | } 1758 | ], 1759 | "source": [ 1760 | "import deepspeech\n", 1761 | "\n", 1762 | "model_file_path = 'deepspeech-0.6.0-models/output_graph.pbmm'\n", 1763 | "beam_width = 500\n", 1764 | "model = deepspeech.Model(model_file_path, beam_width)\n", 1765 | "\n", 1766 | "# Add language model for better accuracy\n", 1767 | "lm_file_path = 'deepspeech-0.6.0-models/lm.binary'\n", 1768 | "trie_file_path = 'deepspeech-0.6.0-models/trie'\n", 1769 | "lm_alpha = 0.75\n", 1770 | "lm_beta = 1.85\n", 1771 | "model.enableDecoderWithLM(lm_file_path, trie_file_path, lm_alpha, lm_beta)" 1772 | ] 1773 | }, 1774 | { 1775 | "cell_type": "markdown", 1776 | "metadata": { 1777 | "id": "gB4wl_9P9ilW" 1778 | }, 1779 | "source": [ 1780 | "## Batch API" 1781 | ] 1782 | }, 1783 | { 1784 | "cell_type": "code", 1785 | "execution_count": null, 1786 | "metadata": { 1787 | "colab": { 1788 | "base_uri": "https://localhost:8080/", 1789 | "height": 168 1790 | }, 1791 | "id": "MTaKt_rm9wY_", 1792 | "outputId": "8bc1dc02-3c8b-4a66-ddb4-b61f362167e0" 1793 | }, 1794 | "outputs": [ 1795 | { 1796 | "name": "stdout", 1797 | "output_type": "stream", 1798 | "text": [ 1799 | "\n", 1800 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 1801 | "deepspeech-batch-stt: \"experience proof less\"\n", 1802 | "\n", 1803 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 1804 | "deepspeech-batch-stt: \"why should one halt on the way\"\n", 1805 | "\n", 1806 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 1807 | "deepspeech-batch-stt: \"your power is sufficient i said\"\n" 1808 | ] 1809 | } 1810 | ], 1811 | "source": [ 1812 | "import numpy as np\n", 1813 | "\n", 1814 | "def deepspeech_batch_stt(filename: str, lang: str, encoding: str) -> str:\n", 1815 | " buffer, rate = read_wav_file(filename)\n", 1816 | " data16 = np.frombuffer(buffer, dtype=np.int16)\n", 1817 | " return model.stt(data16)\n", 1818 | "\n", 1819 | "# Run tests\n", 1820 | "for t 
in TESTCASES:\n", 1821 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 1822 | " print('deepspeech-batch-stt: \"{}\"'.format(\n", 1823 | " deepspeech_batch_stt(t['filename'], t['lang'], t['encoding'])\n", 1824 | " ))" 1825 | ] 1826 | }, 1827 | { 1828 | "cell_type": "markdown", 1829 | "metadata": { 1830 | "id": "9v3jT8NR-qGb" 1831 | }, 1832 | "source": [ 1833 | "## Streaming API" 1834 | ] 1835 | }, 1836 | { 1837 | "cell_type": "code", 1838 | "execution_count": null, 1839 | "metadata": { 1840 | "colab": { 1841 | "base_uri": "https://localhost:8080/", 1842 | "height": 454 1843 | }, 1844 | "id": "EU7lHQ2A-svH", 1845 | "outputId": "8fc02288-a1a9-4709-ef25-bd42c4c99bf8" 1846 | }, 1847 | "outputs": [ 1848 | { 1849 | "name": "stdout", 1850 | "output_type": "stream", 1851 | "text": [ 1852 | "\n", 1853 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 1854 | "interim text: i\n", 1855 | "interim text: e\n", 1856 | "interim text: experi en\n", 1857 | "interim text: experience pro\n", 1858 | "interim text: experience proof les\n", 1859 | "deepspeech-streaming-stt: \"experience proof less\"\n", 1860 | "\n", 1861 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 1862 | "interim text: i\n", 1863 | "interim text: why shou\n", 1864 | "interim text: why should one\n", 1865 | "interim text: why should one haul\n", 1866 | "interim text: why should one halt \n", 1867 | "interim text: why should one halt on the \n", 1868 | "deepspeech-streaming-stt: \"why should one halt on the way\"\n", 1869 | "\n", 1870 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 1871 | "interim text: i\n", 1872 | "interim text: your p\n", 1873 | "interim text: your power is\n", 1874 | "interim text: your power is suffi\n", 1875 | "interim text: your power is sufficient i\n", 1876 | "interim text: your power is sufficient i said\n", 1877 | "deepspeech-streaming-stt: \"your power is sufficient i said\"\n" 1878 | ] 1879 | } 1880 | ], 1881 | "source": [ 1882 | "def deepspeech_streaming_stt(filename: str, lang: str, encoding: str) -> str:\n", 1883 | " buffer, rate = read_wav_file(filename)\n", 1884 | " audio_generator = simulate_stream(buffer)\n", 1885 | "\n", 1886 | " # Create stream\n", 1887 | " context = model.createStream()\n", 1888 | "\n", 1889 | " text = ''\n", 1890 | " for chunk in audio_generator:\n", 1891 | " data16 = np.frombuffer(chunk, dtype=np.int16)\n", 1892 | " # feed stream of chunks\n", 1893 | " model.feedAudioContent(context, data16)\n", 1894 | " interim_text = model.intermediateDecode(context)\n", 1895 | " if interim_text != text:\n", 1896 | " text = interim_text\n", 1897 | " print('interim text: {}'.format(text))\n", 1898 | "\n", 1899 | " # get final result and close stream\n", 1900 | " text = model.finishStream(context)\n", 1901 | " return text\n", 1902 | "\n", 1903 | "# Run tests\n", 1904 | "for t in TESTCASES:\n", 1905 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(t['filename'], t['text']))\n", 1906 | " print('deepspeech-streaming-stt: \"{}\"'.format(\n", 1907 | " deepspeech_streaming_stt(t['filename'], t['lang'], t['encoding'])\n", 1908 | " ))" 1909 | ] 1910 | }, 1911 | { 1912 | "cell_type": "markdown", 1913 | "metadata": { 1914 | "id": "3aqlb4wEcdOx" 1915 | }, 1916 | "source": [ 1917 | "\n", 1918 | "---\n", 1919 | "\n", 1920 | "# SpeechRecognition Package\n", 1921 | "\n", 1922 | "The 
[SpeechRecognition](https://pypi.org/project/SpeechRecognition/) package provides a nice abstraction over several solutions. In this notebook, we explore CMU Sphinx (i.e., a model running locally on the machine) and Google (i.e., a service accessed over the network/cloud), both through the SpeechRecognition package APIs." 1923 | ] 1924 | }, 1925 | { 1926 | "cell_type": "markdown", 1927 | "metadata": { 1928 | "id": "QpxAVH5OmPtn" 1929 | }, 1930 | "source": [ 1931 | "## Setup\n", 1932 | "\n", 1933 | "We need to install the SpeechRecognition and pocketsphinx Python packages, and download some files to test these APIs.\n", 1934 | "\n", 1935 | "1. **Install the SpeechRecognition Python package**" 1936 | ] 1937 | }, 1938 | { 1939 | "cell_type": "code", 1940 | "execution_count": null, 1941 | "metadata": { 1942 | "colab": { 1943 | "base_uri": "https://localhost:8080/", 1944 | "height": 121 1945 | }, 1946 | "id": "kJ0rokUuby2i", 1947 | "outputId": "c0d99348-92e9-49f7-edf0-20493983a1e8" 1948 | }, 1949 | "outputs": [ 1950 | { 1951 | "name": "stdout", 1952 | "output_type": "stream", 1953 | "text": [ 1954 | "Collecting SpeechRecognition\n", 1955 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/26/e1/7f5678cd94ec1234269d23756dbdaa4c8cfaed973412f88ae8adf7893a50/SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8MB)\n", 1956 | "\u001b[K |████████████████████████████████| 32.8MB 92kB/s \n", 1957 | "\u001b[?25hInstalling collected packages: SpeechRecognition\n", 1958 | "Successfully installed SpeechRecognition-3.8.1\n" 1959 | ] 1960 | } 1961 | ], 1962 | "source": [ 1963 | "!pip3 install SpeechRecognition" 1964 | ] 1965 | }, 1966 | { 1967 | "cell_type": "markdown", 1968 | "metadata": { 1969 | "id": "TBjNf3GoTU1l" 1970 | }, 1971 | "source": [ 1972 | "Pocketsphinx has already been installed in an earlier section." 1973 | ] 1974 | }, 1975 | { 1976 | "cell_type": "markdown", 1977 | "metadata": { 1978 | "id": "piIB_P7CXey4" 1979 | }, 1980 | "source": [ 1981 | "## Batch API\n", 1982 | "\n", 1983 | "SpeechRecognition has only a batch API. The first step is to create an audio record, either from a file or from a mic, and the second step is to call a `recognize_` function. It currently has APIs for [CMU Sphinx, Google, Microsoft, IBM, Houndify, and Wit](https://github.com/Uberi/speech_recognition)." 
1984 | ] 1985 | }, 1986 | { 1987 | "cell_type": "code", 1988 | "execution_count": null, 1989 | "metadata": { 1990 | "colab": { 1991 | "base_uri": "https://localhost:8080/", 1992 | "height": 218 1993 | }, 1994 | "id": "0aia5lFgb-vV", 1995 | "outputId": "bdf84ea8-98f0-43b9-e5f5-305c9745795e" 1996 | }, 1997 | "outputs": [ 1998 | { 1999 | "name": "stdout", 2000 | "output_type": "stream", 2001 | "text": [ 2002 | "\n", 2003 | "audio file=\"audio/2830-3980-0043.wav\" expected text=\"experience proves this\"\n", 2004 | "sphinx: \"experience proves that\"\n", 2005 | "google: \"experience proves this\"\n", 2006 | "\n", 2007 | "audio file=\"audio/4507-16021-0012.wav\" expected text=\"why should one halt on the way\"\n", 2008 | "sphinx: \"why should one hold on the way\"\n", 2009 | "google: \"why should one halt on the way\"\n", 2010 | "\n", 2011 | "audio file=\"audio/8455-210777-0068.wav\" expected text=\"your power is sufficient i said\"\n", 2012 | "sphinx: \"your paris official said\"\n", 2013 | "google: \"your power is sufficient I said\"\n" 2014 | ] 2015 | } 2016 | ], 2017 | "source": [ 2018 | "import speech_recognition as sr\n", 2019 | "from enum import Enum, unique\n", 2020 | "\n", 2021 | "@unique\n", 2022 | "class ASREngine(Enum):\n", 2023 | " sphinx = 0\n", 2024 | " google = 1\n", 2025 | "\n", 2026 | "def speech_to_text(filename: str, engine: ASREngine, language: str, show_all: bool = False) -> str:\n", 2027 | " r = sr.Recognizer()\n", 2028 | "\n", 2029 | " with sr.AudioFile(filename) as source:\n", 2030 | " audio = r.record(source)\n", 2031 | "\n", 2032 | " asr_functions = {\n", 2033 | " ASREngine.sphinx: r.recognize_sphinx,\n", 2034 | " ASREngine.google: r.recognize_google,\n", 2035 | " }\n", 2036 | "\n", 2037 | " response = asr_functions[engine](audio, language=language, show_all=show_all)\n", 2038 | " return response\n", 2039 | "\n", 2040 | "# Run tests\n", 2041 | "for t in TESTCASES:\n", 2042 | " filename = t['filename']\n", 2043 | " text = t['text']\n", 2044 | " lang = t['lang']\n", 2045 | "\n", 2046 | " print('\\naudio file=\"{0}\" expected text=\"{1}\"'.format(filename, text))\n", 2047 | " for asr_engine in ASREngine:\n", 2048 | " try:\n", 2049 | " response = speech_to_text(filename, asr_engine, language=lang)\n", 2050 | " print('{0}: \"{1}\"'.format(asr_engine.name, response))\n", 2051 | " except sr.UnknownValueError:\n", 2052 | " print('{0} could not understand audio'.format(asr_engine.name))\n", 2053 | " except sr.RequestError as e:\n", 2054 | " print('{0} error: {1}'.format(asr_engine.name, e))" 2055 | ] 2056 | }, 2057 | { 2058 | "cell_type": "markdown", 2059 | "metadata": { 2060 | "id": "66lLoLCaL_nE" 2061 | }, 2062 | "source": [ 2063 | "### API for other providers\n", 2064 | "\n", 2065 | "For other speech recognition providers, you will need to create API credentials and pass them to the corresponding `recognize_` function; check out [this example](https://github.com/Uberi/speech_recognition/blob/master/examples/audio_transcribe.py).\n", 2066 | "\n", 2067 | "It also has a nice abstraction for the Microphone, implemented over PyAudio/PortAudio. There are examples of capturing input from the mic in [batch](https://github.com/Uberi/speech_recognition/blob/master/examples/microphone_recognition.py) and continuously in the [background](https://github.com/Uberi/speech_recognition/blob/master/examples/background_listening.py)." 
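To make these two points concrete, here is a minimal sketch (not a cell from this notebook) that captures one utterance from the microphone and sends it both to the free Google Web Speech recognizer used above and to a credentialed provider; `WIT_AI_KEY` is a placeholder you would obtain from Wit.ai, and PyAudio must be installed for `sr.Microphone` to work:

```python
import speech_recognition as sr

r = sr.Recognizer()

# Capture a single utterance from the default microphone (requires PyAudio)
with sr.Microphone() as source:
    r.adjust_for_ambient_noise(source)  # calibrate for background noise
    audio = r.listen(source)            # blocks until a phrase is heard

# Free Google Web Speech API, same engine as used in the cell above
print('google:', r.recognize_google(audio))

# A provider that needs credentials, e.g. Wit.ai (WIT_AI_KEY is a placeholder)
WIT_AI_KEY = 'your-wit-ai-key'
print('wit:', r.recognize_wit(audio, key=WIT_AI_KEY))
```

For continuous transcription, `Recognizer.listen_in_background()` registers a callback and keeps listening, as the background-listening example linked above demonstrates.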
2068 | ] 2069 | }, 2070 | { 2071 | "cell_type": "markdown", 2072 | "metadata": { 2073 | "id": "cTfKgcgF0uzz" 2074 | }, 2075 | "source": [ 2076 | "---\n", 2077 | "\n", 2078 | "# Summary\n", 2079 | "\n", 2080 | "This notebook covers several available speech recognition options:\n", 2081 | "\n", 2082 | "- services: Google, Azure, Watson\n", 2083 | "- software: CMU Sphinx, Mozilla DeepSpeech\n", 2084 | "\n", 2085 | "All of these have two kinds of Speech-to-Text APIs:\n", 2086 | "\n", 2087 | "- batch: the audio data is fed in one go\n", 2088 | "- streaming: the audio data is fed in chunks (very useful for transcribing microphone input)\n", 2089 | "\n", 2090 | "The Python SpeechRecognition package provides an abstraction over several speech recognition services and software packages.\n", 2091 | "\n", 2092 | "I hope to include the following in the future:\n", 2093 | "\n", 2094 | "- services: [Amazon Transcribe](https://aws.amazon.com/transcribe/), and [Nuance](https://nuancedev.github.io/samples/http/python/)\n", 2095 | "- software: [Kaldi](https://pykaldi.github.io/), and [Facebook wav2letter](https://ai.facebook.com/blog/online-speech-recognition-with-wav2letteranywhere/)\n", 2096 | "
\n", 2098 | "\n", 2099 | "---\n", 2100 | "

Copyright © 2020 Satish Chandra Gupta.

\n", 2101 | "

 CC BY-NC-SA 4.0 International License.

" 2102 | ] 2103 | } 2104 | ], 2105 | "metadata": { 2106 | "accelerator": "GPU", 2107 | "colab": { 2108 | "authorship_tag": "ABX9TyOugDdFcqekLlzo1yrW03ry", 2109 | "collapsed_sections": [], 2110 | "include_colab_link": true, 2111 | "name": "python_speech_recognition_notebook.ipynb", 2112 | "provenance": [], 2113 | "toc_visible": true 2114 | }, 2115 | "kernelspec": { 2116 | "display_name": "Python 3.7.12 64-bit ('kaggle')", 2117 | "language": "python", 2118 | "name": "python3" 2119 | }, 2120 | "language_info": { 2121 | "name": "python", 2122 | "version": "3.7.12" 2123 | }, 2124 | "vscode": { 2125 | "interpreter": { 2126 | "hash": "180997e3444791da3f2b3061c2f2c1b4404ea4925238c499191fa40f3054d99b" 2127 | } 2128 | } 2129 | }, 2130 | "nbformat": 4, 2131 | "nbformat_minor": 0 2132 | } 2133 | --------------------------------------------------------------------------------