├── .gitignore ├── README.md ├── asr ├── __init__.py ├── language_model │ ├── __init__.py │ └── language_model.py ├── utils │ ├── __init__.py │ ├── beam_search_decoder.py │ └── utils.py └── wav2vec2 │ ├── __init__.py │ ├── decoder │ ├── __init__.py │ └── ctc_decoder.py │ ├── inference.py │ └── vocab.py ├── asr_inference_live.py ├── asr_inference_offline.py ├── asr_inference_recording.py ├── data ├── lm_training_corpus │ └── corpus.txt ├── models │ └── lm │ │ ├── twitter │ │ ├── bigram.pkl │ │ └── unigram.pkl │ │ └── wikipedia │ │ ├── bigram.pkl │ │ └── unigram.pkl └── samples │ ├── Achievements_of_the_Democratic_Party_(Homer_S._Cummings).ogg │ ├── rec.wav │ ├── rec2.wav │ └── shortened.wav ├── notebooks ├── Training_Simple_Lanugage_Model.ipynb ├── wav2vec2_asr_pretrained_inference.ipynb ├── wav2vec2_experiment_language_model.ipynb ├── wav2vec2_finetuning_version_1.ipynb ├── wav2vec2_finetuning_version_2_with_data_augmentations.ipynb └── wav2vec2large_experiment_language_model.ipynb ├── requirements.txt └── train_language_model.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.tar 2 | *.zip 3 | /data/models/asr/* 4 | /output/* 5 | /.vscode/* 6 | # Created by https://www.gitignore.io/api/python 7 | # Edit at https://www.gitignore.io/?templates=python 8 | 9 | ### Python ### 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 98 | # install all needed dependencies. 
99 | #Pipfile.lock
100 | 
101 | # celery beat schedule file
102 | celerybeat-schedule
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 
134 | # End of https://www.gitignore.io/api/python
135 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Automatic Speech Recognition using Wav2Vec2
2 | 
3 | This repository uses the wav2vec2 model from Hugging Face Transformers to build an ASR system that takes a speech signal as input and outputs transcriptions asynchronously.
4 | 
5 | I have also written a [post](https://www.tarunbisht.com/deep%20learning/2021/06/17/speech-recognition-using-wav2vec-model/) explaining wav2vec2 in some detail, with some further learning directions.
6 | 
7 | ## Installation
8 | 
9 | ### Installing via pip
10 | - Download and install Python
11 | - Create a virtual environment using `python -m venv env_name`
12 | - Activate the created environment: `env_path\Scripts\activate`
13 | - Install PyTorch: `pip install torch==1.8.0+cu102 torchaudio===0.8.0 -f https://download.pytorch.org/whl/torch_stable.html`
14 | - Install required dependencies: `pip install -r requirements.txt`
15 | 
16 | ### Installing via conda
17 | - Download and install Miniconda
18 | - Create a new virtual environment using `conda create --name env_name python==3.8`
19 | - Activate the created environment: `conda activate env_name`
20 | - Install PyTorch: `conda install pytorch torchaudio cudatoolkit=11.1 -c pytorch`
21 | - Install required dependencies: `pip install -r requirements.txt`
22 | 
23 | ## Inferencing
24 | ### Transcribing an audio file
25 | - run `python asr_inference_offline.py` with parameters (`--recording` or `-rec`, the path to the audio file, is required):
26 |   - `--model` or `-m` : path to a locally saved wav2vec2 model; if not passed, it will be downloaded (Defaults to None)
27 |   - `--processor` or `-t` : path to a locally saved wav2vec2 processor; if not passed, it will be downloaded (Defaults to None)
28 |   - `--output` or `-out` : path to an output file to save transcriptions (optional)
29 |   - `--device` or `-d` : device to use for inference (choices=["cpu", "cuda"], Defaults to cpu)
30 |   - `--lm` or `-l` : path to the folder in which a trained language model is saved (unigram and bigram files). This language model is used by the beam search decoder to weight beam scores (Defaults to None)
31 |   - `--beam_width` or `-bw` : beam width for the beam search decoder (Defaults to 1). If `beam_width <= 1`, max (greedy) decoding is used to decode the CTC outputs, otherwise beam search decoding is used (a minimal sketch of greedy decoding follows this parameter list).
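For intuition, here is a minimal, illustrative sketch of what max (greedy) CTC decoding does. It is not the repository's exact code path (`CTCDecoder._greedy_path` only takes the per-frame argmax and leaves the final collapse to the processor's `decode`), but the idea is the same:

```python
import numpy as np

def greedy_ctc_decode(logits: np.ndarray, vocab: list, blank_idx: int = 0) -> str:
    """Collapse per-frame argmax predictions into text (illustrative sketch)."""
    best = logits.argmax(axis=-1)              # most likely vocabulary entry at every frame
    chars, prev = [], None
    for idx in best:
        if idx != blank_idx and idx != prev:   # drop CTC blanks and repeated frames
            chars.append(vocab[idx])
        prev = idx
    return "".join(chars).replace("|", " ")    # "|" is the word delimiter in the wav2vec2 vocab
```

With `beam_width > 1`, `asr/utils/beam_search_decoder.py` instead keeps several candidate labelings per frame and, when `--lm` is given, re-weights them with the character-level language model.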
32 | - example
33 |   - `python asr_inference_offline.py --recording data/samples/rec.wav -out output/transcription.txt`
34 |   - `python asr_inference_offline.py --recording data/samples/rec.wav --device cuda`
35 | ### Transcribing streaming audio
36 | - run `python asr_inference_recording.py` with parameters:
37 |   - `--recording` or `-rec` : path to the audio recording
38 |   - `--model` or `-m` : path to a locally saved wav2vec2 model; if not passed, it will be downloaded (Defaults to None)
39 |   - `--processor` or `-t` : path to a locally saved wav2vec2 processor; if not passed, it will be downloaded (Defaults to None)
40 |   - `--blocksize` or `-bs` : size of each audio block to be passed to the model (Defaults to 16000)
41 |   - `--overlap` or `-ov` : overlap between consecutive loaded blocks (Defaults to 0)
42 |   - `--output` or `-out` : path to an output file to save transcriptions (optional)
43 |   - `--device` or `-d` : device to use for inference (choices=["cpu", "cuda"], Defaults to cpu)
44 |   - `--lm` or `-l` : path to the folder in which a trained language model is saved (unigram and bigram files). This language model is used by the beam search decoder to weight beam scores (Defaults to None)
45 |   - `--beam_width` or `-bw` : beam width for the beam search decoder (Defaults to 1). If `beam_width <= 1`, max (greedy) decoding is used to decode the CTC outputs, otherwise beam search decoding is used.
46 | - example
47 |   - `python asr_inference_recording.py --recording data/samples/rec.wav -bs 16000 -out output/transcription.txt`
48 |   - `python asr_inference_recording.py --recording data/samples/rec.wav -bs 16000 -ov 1600 -out output/transcription.txt`
49 |   - `python asr_inference_recording.py --recording data/samples/rec.wav -bs 16000 -ov 1600 -out output/transcription.txt --device cuda`
50 | 
51 | ### Live recording and transcribing
52 | - run `python asr_inference_live.py` with parameters:
53 |   - `--model` or `-m` : path to a locally saved wav2vec2 model; if not passed, it will be downloaded (Defaults to None)
54 |   - `--processor` or `-t` : path to a locally saved wav2vec2 processor; if not passed, it will be downloaded (Defaults to None)
55 |   - `--blocksize` or `-bs` : size of each audio block to be passed to the model (Defaults to 16000)
56 |   - `--output` or `-out` : path to an output file to save transcriptions (optional)
57 |   - `--device` or `-d` : device to use for inference (choices=["cpu", "cuda"], Defaults to cpu)
58 |   - `--lm` or `-l` : path to the folder in which a trained language model is saved (unigram and bigram files). This language model is used by the beam search decoder to weight beam scores (Defaults to None)
59 |   - `--beam_width` or `-bw` : beam width for the beam search decoder (Defaults to 1). If `beam_width <= 1`, max (greedy) decoding is used to decode the CTC outputs, otherwise beam search decoding is used.
60 | - example
61 |   - `python asr_inference_live.py -bs 16000 -out output/transcription.txt`
62 |   - `python asr_inference_live.py`
63 |   - `python asr_inference_live.py --device cuda`
64 | 
65 | ## Training Language Model
66 | - run `python train_language_model.py` with parameters:
67 |   - `--corpus` or `-c` : path to the corpus text file.
68 |   - `--save` or `-s` : folder path in which to save the model files.
69 | 
70 | ## Notebooks
71 | All notebooks reside in the `notebooks` folder; they are handy when using Google Colab or similar platforms. All of these notebooks have been tested in Google Colab.
72 | - `wav2vec2_asr_pretrained_inference` : Basic inference notebook 73 | - `wav2vec2_experiment_language_model` : kenlm language model with beam search 74 | - `wav2vec2large_experiment_language_model` : kenlm language model with beam search for larger model 75 | - `wav2vec2_finetuning_version_1` : finetuning notebook without augmentation 76 | - `wav2vec2_finetuning_version_2_with_data_augmentations` : finetuning notebook with augmentation 77 | - `Training_Simple_Lanugage_Model` : training language model notebook version with wikipedia data 78 | 79 | ## Comparisions 80 | ### GPU inference vs CPU inference 81 | For 4min 10sec recorder audio total time taken 82 | 1. GPU (Nvidia GeForce 940MX) : 18.29sec 83 | 2. CPU : 116.85sec 84 | 85 | ## To do list 86 | - Environment Setup ✔ 87 | - Inferencing with CPU ✔ 88 | - Inferencing with GPU ✔ 89 | - Asyncio Compatible ✔ 90 | - Training and Finetuning Notebooks ✔ 91 | - Training and Finetuning Scripts 92 | - Converting model to TensorFlow with ONNX for inference using TensorFlow 93 | 94 | ## Tested Platforms 95 | - native windows 10 ✔ 96 | - windows-10 wsl2 cpu ✔ 97 | - windows-10 wsl2 gpu ✔ 98 | - Linux ✔ 99 | 100 | ## References 101 | - [Hugging Face Wav2Vec2](https://huggingface.co/transformers/master/model_doc/wav2vec2.html) 102 | - [CTC decoder adapted from githubharald/CTCDecoder](https://github.com/githubharald/CTCDecoder) 103 | -------------------------------------------------------------------------------- /asr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/asr/__init__.py -------------------------------------------------------------------------------- /asr/language_model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model import LanguageModel -------------------------------------------------------------------------------- /asr/language_model/language_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | 5 | class LanguageModel: 6 | """Simple character-level language model""" 7 | 8 | def __init__(self, chars: list) -> None: 9 | self._unigram = {c: 0 for c in chars} 10 | self._bigram = {c: {d: 0 for d in chars} for c in chars} 11 | self.chars = chars 12 | 13 | def get_char_unigram(self, c: str) -> float: 14 | """Probability of character c.""" 15 | return self._unigram[c] 16 | 17 | def get_char_bigram(self, c: str, d: str) -> float: 18 | """Probability that character c is followed by character d.""" 19 | return self._bigram[c][d] 20 | 21 | def train(self, txt: str, normalize=False): 22 | """Create language model from text corpus.""" 23 | # compute unigrams 24 | for c in txt: 25 | # ignore unknown chars 26 | if c not in self._unigram: 27 | continue 28 | self._unigram[c] += 1 29 | 30 | # compute bigrams 31 | for i in range(len(txt) - 1): 32 | c = txt[i] 33 | d = txt[i + 1] 34 | 35 | # ignore unknown chars 36 | if c not in self._bigram or d not in self._bigram[c]: 37 | continue 38 | 39 | self._bigram[c][d] += 1 40 | if normalize: 41 | self.normalize() 42 | 43 | def normalize(self): 44 | # normalize 45 | sum_unigram = sum(self._unigram.values()) 46 | for c in self.chars: 47 | self._unigram[c] /= sum_unigram 48 | 49 | for c in self.chars: 50 | sum_bigram = sum(self._bigram[c].values()) 51 | if sum_bigram == 0: 52 | continue 53 | for d in self.chars: 54 | 
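# dividing by the row total turns the raw count for the pair (c, d) into the conditional probability P(d | c)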
self._bigram[c][d] /= sum_bigram 55 | 56 | def save(self, path): 57 | with open(os.path.join(path, "unigram.pkl"), 'wb') as pkl: 58 | pickle.dump(self._unigram, pkl) 59 | with open(os.path.join(path, "bigram.pkl"), 'wb') as pkl: 60 | pickle.dump(self._bigram, pkl) 61 | 62 | def load(self, path): 63 | with open(os.path.join(path, "unigram.pkl"), 'rb') as pkl: 64 | self._unigram = pickle.load(pkl) 65 | with open(os.path.join(path, "bigram.pkl"), 'rb') as pkl: 66 | self._bigram = pickle.load(pkl) -------------------------------------------------------------------------------- /asr/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import (MicrophoneStreaming, AudioStreaming, 2 | AudioReader, MicrophoneCaptureFailed) 3 | from .beam_search_decoder import BeamSearchDecoder -------------------------------------------------------------------------------- /asr/utils/beam_search_decoder.py: -------------------------------------------------------------------------------- 1 | '''adapted from https://github.com/githubharald/CTCDecoder''' 2 | from collections import defaultdict 3 | from dataclasses import dataclass 4 | from typing import Optional, List, Tuple 5 | import numpy as np 6 | from asr.language_model import LanguageModel 7 | 8 | 9 | def log(x: float) -> float: 10 | with np.errstate(divide='ignore'): 11 | return np.log(x) 12 | 13 | 14 | @dataclass 15 | class BeamEntry: 16 | """Information about one single beam at specific time-step.""" 17 | pr_total: float = log(0) # blank and non-blank 18 | pr_non_blank: float = log(0) # non-blank 19 | pr_blank: float = log(0) # blank 20 | pr_text: float = log(1) # LM score 21 | lm_applied: bool = False # flag if LM was already applied to this beam 22 | labeling: tuple = () # beam-labeling 23 | 24 | 25 | class BeamList: 26 | """Information about all beams at specific time-step.""" 27 | 28 | def __init__(self) -> None: 29 | self.entries = defaultdict(BeamEntry) 30 | 31 | def normalize(self) -> None: 32 | """Length-normalise LM score.""" 33 | for k in self.entries.keys(): 34 | labeling_len = len(self.entries[k].labeling) 35 | self.entries[k].pr_text = (1.0 / (labeling_len if labeling_len else 1.0)) * self.entries[k].pr_text 36 | 37 | def sort_labelings(self) -> List[Tuple[int]]: 38 | """Return beam-labelings, sorted by probability.""" 39 | beams = self.entries.values() 40 | sorted_beams = sorted(beams, reverse=True, key=lambda x: x.pr_total + x.pr_text) 41 | return [x.labeling for x in sorted_beams] 42 | 43 | 44 | class BeamSearchDecoder: 45 | def __init__(self, vocab: list, blank_idx: int, beam_width: int = 5, num_sentences: int = 1, lm: Optional[LanguageModel] = None): 46 | self.vocab = vocab 47 | self.blank_idx = blank_idx 48 | self.beam_width = beam_width 49 | self.num_sentences = num_sentences 50 | self.lm = lm 51 | 52 | def __apply_lm(self, parent_beam: BeamEntry, child_beam: BeamEntry) -> None: 53 | """Calculate LM score of child beam by taking score from 54 | parent beam and bigram probability of last two chars.""" 55 | if not self.lm or child_beam.lm_applied: 56 | return 57 | # take bigram if beam length at least 2 58 | if len(child_beam.labeling) > 1: 59 | c = self.vocab[child_beam.labeling[-2]] 60 | d = self.vocab[child_beam.labeling[-1]] 61 | ngram_prob = self.lm.get_char_bigram(c, d) 62 | # otherwise take unigram 63 | else: 64 | c = self.vocab[child_beam.labeling[-1]] 65 | ngram_prob = self.lm.get_char_unigram(c) 66 | 67 | lm_factor = 0.01 # influence of language model 68 | # 
probability of char sequence 69 | child_beam.pr_text = parent_beam.pr_text + lm_factor * log(ngram_prob) 70 | child_beam.lm_applied = True # only apply LM once per beam entry 71 | 72 | def __call__(self, logits: np.array) -> list: 73 | """Beam search decoder. 74 | 75 | See the paper of Hwang et al. and the paper of Graves et al. 76 | 77 | Args: 78 | logits: Output of neural network of shape TxC. 79 | 80 | Returns: 81 | The decoded text. 82 | """ 83 | 84 | max_T, max_C = logits.shape 85 | 86 | # initialise beam state 87 | last = BeamList() 88 | labeling = () 89 | last.entries[labeling] = BeamEntry() 90 | last.entries[labeling].pr_blank = log(1) 91 | last.entries[labeling].pr_total = log(1) 92 | 93 | # go over all time-steps 94 | for t in range(max_T): 95 | curr = BeamList() 96 | 97 | # get beam-labelings of best beams 98 | best_labelings = last.sort_labelings()[:self.beam_width] 99 | 100 | # go over best beams 101 | for labeling in best_labelings: 102 | # probability of paths ending with a non-blank 103 | pr_non_blank = log(0) 104 | # in case of non-empty beam 105 | if labeling: 106 | # probability of paths with repeated last char at the end 107 | pr_non_blank = last.entries[labeling].pr_non_blank + log(logits[t, labeling[-1]]) 108 | 109 | # probability of paths ending with a blank 110 | pr_blank = last.entries[labeling].pr_total + log(logits[t, self.blank_idx]) 111 | 112 | # fill in data for current beam 113 | curr.entries[labeling].labeling = labeling 114 | curr.entries[labeling].pr_non_blank = np.logaddexp(curr.entries[labeling].pr_non_blank, pr_non_blank) 115 | curr.entries[labeling].pr_blank = np.logaddexp(curr.entries[labeling].pr_blank, pr_blank) 116 | curr.entries[labeling].pr_total = np.logaddexp(curr.entries[labeling].pr_total, 117 | np.logaddexp(pr_blank, pr_non_blank)) 118 | curr.entries[labeling].pr_text = last.entries[labeling].pr_text 119 | curr.entries[labeling].lm_applied = True # LM already applied at previous time-step for this beam-labeling 120 | 121 | # extend current beam-labeling 122 | for c in range(max_C - 1): 123 | # add new char to current beam-labeling 124 | new_labeling = labeling + (c,) 125 | 126 | # if new labeling contains duplicate char at the end, only consider paths ending with a blank 127 | if labeling and labeling[-1] == c: 128 | pr_non_blank = last.entries[labeling].pr_blank + log(logits[t, c]) 129 | else: 130 | pr_non_blank = last.entries[labeling].pr_total + log(logits[t, c]) 131 | 132 | # fill in data 133 | curr.entries[new_labeling].labeling = new_labeling 134 | curr.entries[new_labeling].pr_non_blank = np.logaddexp(curr.entries[new_labeling].pr_non_blank, 135 | pr_non_blank) 136 | curr.entries[new_labeling].pr_total = np.logaddexp(curr.entries[new_labeling].pr_total, pr_non_blank) 137 | 138 | # apply LM 139 | self.__apply_lm(curr.entries[labeling], curr.entries[new_labeling]) 140 | 141 | # set new beam state 142 | last = curr 143 | 144 | # normalise LM scores according to beam-labeling-length 145 | last.normalize() 146 | 147 | # sort by probability and get most probable labelings 148 | best_labeling = last.sort_labelings()[:self.num_sentences] 149 | return best_labeling 150 | -------------------------------------------------------------------------------- /asr/utils/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import asyncio 3 | import sounddevice as sd 4 | import numpy as np 5 | import soundfile as sf 6 | from scipy.signal import resample 7 | 8 | import torch 9 | from 
torchaudio.transforms import Resample 10 | 11 | 12 | class MicrophoneCaptureFailed(Exception): 13 | pass 14 | 15 | 16 | class MicrophoneStreaming: 17 | def __init__(self, sr=16000, blocksize=1024, channels=1, device=None, loop=None, dtype="float32"): 18 | self._sr = sr 19 | self._channels = channels 20 | self._device = device 21 | self._buffer = asyncio.Queue() 22 | self._buffersize = blocksize 23 | self._dtype = dtype 24 | self._loop = loop 25 | 26 | def __callback(self, indata, frame_count, time_info, status): 27 | self._loop.call_soon_threadsafe(self._buffer.put_nowait, (indata.copy(), status)) 28 | 29 | async def record_to_file(self, filename, duration=None): 30 | with sf.SoundFile(filename, mode='x', samplerate=self._sr, channels=self._channels) as f: 31 | t = time.time() 32 | rec = duration if duration is not None else 10 33 | async for block, status in self.generator(): 34 | f.write(block) 35 | rec = duration+0 if duration is not None else duration+1 36 | if(time.time() - t) > rec: 37 | break 38 | 39 | async def generator(self, future: asyncio.Future = None): 40 | if self._loop is None: 41 | self._loop = asyncio.get_running_loop() 42 | stream = sd.InputStream( 43 | samplerate=self._sr, 44 | device=self._device, 45 | channels=self._channels, 46 | callback=self.__callback, 47 | dtype=self._dtype, 48 | blocksize=self._buffersize) 49 | with stream: 50 | if not stream.active: 51 | # if it was not called start() or exception was raised 52 | # in the audio callback 53 | if future: 54 | # if the future is waiting for the start or any failure 55 | # set the exception 56 | future.set_exception(f"Could not open the {self._device} capture device") 57 | 58 | # coroutine also will be notified 59 | raise MicrophoneCaptureFailed 60 | else: 61 | if future: 62 | # if the future is waiting for the start or any failure 63 | # set True meaning that the microphone was successfully opened 64 | future.set_result(True) 65 | 66 | while stream.active: 67 | indata, status = await self._buffer.get() 68 | yield indata.squeeze(), status 69 | 70 | 71 | class AudioStreaming: 72 | def __init__(self, audio_path, blocksize, sr=16000, overlap=0, padding=None, dtype="float32"): 73 | assert blocksize >= 0, "blocksize cannot be 0 or negative" 74 | self._sr = sr 75 | self._orig_sr = sf.info(audio_path).samplerate 76 | self._sf_blocks = sf.blocks(audio_path, 77 | blocksize=blocksize, 78 | overlap=overlap, 79 | fill_value=padding, 80 | dtype=dtype) 81 | 82 | async def generator(self, future: asyncio.Future=None): 83 | for block in self._sf_blocks: 84 | chunk = await self.__resample_file(block, self._orig_sr, self._sr) 85 | yield chunk, self._orig_sr 86 | 87 | async def __resample_file(self, array, original_sr, target_sr): 88 | resampling_transform = Resample(orig_freq=original_sr, 89 | new_freq=target_sr) 90 | 91 | sample = resampling_transform(torch.Tensor([array])).squeeze() 92 | return sample 93 | 94 | 95 | class AudioReader: 96 | def __init__(self, audio_path, sr=16000, dtype="float32"): 97 | self._sr = sr 98 | self._dtype = dtype 99 | self._audio_path = audio_path 100 | 101 | def read(self): 102 | data, sr = sf.read(self._audio_path, dtype=self._dtype) 103 | data = self.__resample_file(data, sr, self._sr) 104 | return data, sr 105 | 106 | def __resample_file(self, array, original_sr, target_sr): 107 | return resample(array, num=int(len(array)*target_sr/original_sr)) -------------------------------------------------------------------------------- /asr/wav2vec2/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .inference import Wav2Vec2ASR -------------------------------------------------------------------------------- /asr/wav2vec2/decoder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/asr/wav2vec2/decoder/__init__.py -------------------------------------------------------------------------------- /asr/wav2vec2/decoder/ctc_decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from asr.language_model import LanguageModel 4 | from asr.utils import BeamSearchDecoder 5 | from asr.wav2vec2.vocab import vocab_list 6 | 7 | 8 | class CTCDecoder: 9 | def __init__(self, blank_idx: int = 0, beam_width: int = 100, lm_path: str = None): 10 | """constructor 11 | 12 | Args: 13 | blank_idx (int, optional): index of ctc blank token. Defaults to 0. 14 | beam_width (int, optional): beam width to search larget the value gives more accurate decoding costing computation. Defaults to 100. 15 | lm_path (str, optional): path to langugage model folder with unigram and bigrams. Defaults to None. 16 | """ 17 | lm = None 18 | if beam_width <= 1: 19 | self.mode = "greedy" 20 | else: 21 | self.mode = "beam" 22 | if lm_path is not None: 23 | self.mode = "beam_lm" 24 | lm = LanguageModel(chars=vocab_list[1:]) 25 | lm.load(lm_path) 26 | self._beam_search = BeamSearchDecoder(vocab_list[1:], 27 | blank_idx, 28 | beam_width, 29 | lm=lm) 30 | 31 | def __call__(self, logits: torch.tensor): 32 | return self.decode(logits) 33 | 34 | def decode(self, logits: torch.tensor): 35 | """decode logits using greedy method or beam search if beam width <= 1 then greedy else beam search. 
36 | 37 | Args: 38 | logits (torch.tensor): logits from model outputs 39 | 40 | Returns: 41 | np.array: ctc decoded output 42 | """ 43 | out_proba = torch.nn.functional.softmax(logits, dim=-1)[0] 44 | if self.mode == "greedy": 45 | out = self._greedy_path(out_proba).cpu().numpy() 46 | elif self.mode == "beam": 47 | out = self._beam_search(out_proba.cpu().numpy())[0] 48 | elif self.mode == "beam_lm": 49 | out = self._beam_search(out_proba.cpu().numpy())[0] 50 | else: 51 | out = None 52 | raise ValueError( 53 | "Mode not defined mode choices [greedy, beam and beam_lm]") 54 | return out 55 | 56 | def _greedy_path(self, probs: torch.tensor) -> torch.tensor: 57 | """max decoding ctc output by taking maximum probabilities from each timestep 58 | 59 | Args: 60 | probs (torch.tensor): softmax logits from model 61 | 62 | Returns: 63 | torch.tensor: max decoded outputs 64 | """ 65 | return torch.argmax(probs, axis=1) 66 | -------------------------------------------------------------------------------- /asr/wav2vec2/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import asyncio 3 | import functools 4 | import transformers 5 | import numpy as np 6 | from asr.wav2vec2.decoder.ctc_decoder import CTCDecoder 7 | 8 | 9 | class Wav2Vec2ASR: 10 | """ 11 | Wav2Vec2 class wrapper for speech recognition 12 | """ 13 | 14 | def __init__(self, sr: int = 16000, device: str = "cpu", 15 | processor_path: str = None, model_path: str = None, 16 | pretrained_model_name: str = "facebook/wav2vec2-base-960h", 17 | beam_width: int = 5, lm_path: str = None): 18 | """Wave2Vec2 class constructor 19 | 20 | Args: 21 | sr (int, optional): sample rate of audio passing as input 22 | device (str, optional): device to load model and inputs choices are 'cpu' and 'cuda. Defaults to "cpu". 23 | processor_path (str, optional): path to saved local processor files. Defaults to None. 24 | model_path (str, optional): path to saved local model. Defaults to None 25 | pretrained_model_name (str, optional): pretrained model name as per hugging face pretrained models to load. Defaults to "facebook/wav2vec2-base-960h". 26 | beam_width (int, optional): width of beam search more the number better the results but increase computation. Defaults to 5. 27 | lm_path (str, optional): path to saved language model. Defaults to None. 
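            Example (illustrative sketch, assuming the default pretrained checkpoint can be downloaded):
                asr = Wav2Vec2ASR(device="cpu", beam_width=1)
                asr.load()
                # transcription = await asr.transcribe(speech_tensor)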
28 | """ 29 | self.sr = sr 30 | self.device = torch.device(device) 31 | self.processor_path = processor_path 32 | self.model_path = model_path 33 | self.pretrained_model_name = pretrained_model_name 34 | self.decoder = CTCDecoder(blank_idx=0, 35 | beam_width=beam_width, 36 | lm_path=lm_path) 37 | 38 | def load(self): 39 | """load models and processors 40 | """ 41 | processor = (transformers.Wav2Vec2Processor.from_pretrained(self.pretrained_model_name) 42 | if self.processor_path is None else torch.load(self.processor_path)) 43 | model = (transformers.Wav2Vec2ForCTC.from_pretrained(self.pretrained_model_name) 44 | if self.model_path is None else torch.load(self.model_path)) 45 | model.eval() 46 | model.to(self.device) 47 | self.model = model 48 | self.processor = processor 49 | 50 | def _transcribe(self, inputs: torch.tensor) -> str: 51 | """transcribe input speech and return resulting transcription 52 | 53 | Args: 54 | inputs (torch.tensor): single raw speech torch tensor (timestep,1) 55 | 56 | Returns: 57 | str: transcription of raw speech signal 58 | """ 59 | inputs = self.processor(inputs, sampling_rate=self.sr, 60 | padding="longest", 61 | return_tensors='pt').input_values.to(self.device) 62 | with torch.no_grad(): 63 | logits = self.model(inputs).logits 64 | outs = self.decoder(logits) 65 | return self.processor.decode(outs) 66 | 67 | async def capture_and_transcribe(self, 68 | stream_obj, 69 | started_future: asyncio.Future = None, 70 | loop=None): 71 | """capture streaming audio and transcribe 72 | 73 | Args: 74 | stream_obj (asr.utils.MicrophoneStreaming or asr.utils.AudioStreaming): streaming object with generator that yields audio blocks 75 | started_future (asyncio.Future, optional): asyncio future. Defaults to None. 76 | loop (optional): asyncio event loop which we can get using asyncio.get_running_loop(). Defaults to None. 77 | 78 | Yields: 79 | [generator object]: returns generator that yield outputs from streaming audio 80 | """ 81 | if loop is None: 82 | loop = asyncio.get_running_loop() 83 | async for block, status in stream_obj.generator(started_future): 84 | process_func = functools.partial(self._transcribe, inputs=block) 85 | transcriptions = await loop.run_in_executor(None, process_func) 86 | yield transcriptions 87 | 88 | async def transcribe(self, inputs: torch.tensor, loop=None): 89 | """transcribe and audio signal use for offline audio transcription 90 | 91 | Args: 92 | inputs (torch.tensor): raw speech signal as pytorch tensor (timestep,1) 93 | loop (optional): asyncio event loop which we can get using asyncio.get_running_loop(). Defaults to None. 
94 | 95 | Returns: 96 | [corountine object]: coroutine object which we get await and get results asynchronously 97 | """ 98 | if loop is None: 99 | loop = asyncio.get_running_loop() 100 | process_func = functools.partial(self._transcribe, inputs=inputs) 101 | return await loop.run_in_executor(None, process_func) 102 | -------------------------------------------------------------------------------- /asr/wav2vec2/vocab.py: -------------------------------------------------------------------------------- 1 | vocab_dict = {"": 0, 2 | "": 1, 3 | "": 2, 4 | "": 3, 5 | "|": 4, 6 | "E": 5, 7 | "T": 6, 8 | "A": 7, 9 | "O": 8, 10 | "N": 9, 11 | "I": 10, 12 | "H": 11, 13 | "S": 12, 14 | "R": 13, 15 | "D": 14, 16 | "L": 15, 17 | "U": 16, 18 | "M": 17, 19 | "W": 18, 20 | "C": 19, 21 | "F": 20, 22 | "G": 21, 23 | "Y": 22, 24 | "P": 23, 25 | "B": 24, 26 | "V": 25, 27 | "K": 26, 28 | "'": 27, 29 | "X": 28, 30 | "J": 29, 31 | "Q": 30, 32 | "Z": 31 33 | } 34 | 35 | vocab_list = [key for key, value in vocab_dict.items()] 36 | -------------------------------------------------------------------------------- /asr_inference_live.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import functools 4 | import sys 5 | from asr.utils import MicrophoneStreaming 6 | from asr.wav2vec2 import Wav2Vec2ASR 7 | 8 | parser = argparse.ArgumentParser(description="ASR with live audio") 9 | parser.add_argument("--model", "-m", default=None, required=False, 10 | help="Trained Model local path") 11 | parser.add_argument("--processor", "-t", default=None, required=False, 12 | help="Local asr processor path") 13 | parser.add_argument("--blocksize", "-bs", default=16000, type=int, required=False, 14 | help="Size of each audio block to be passed to model") 15 | parser.add_argument("--output", "-out", required=False, 16 | help="path to save resultant transcriptions") 17 | parser.add_argument("--device", "-d", default='cpu', nargs='?', choices=['cuda', 'cpu'], required=False, 18 | help="device to use for inferencing") 19 | parser.add_argument("--beam_width", "-bw", default=1, type=int, required=False, 20 | help="beam width to use for beam search decoder during inferencing") 21 | parser.add_argument("--lm", "-l", default=None, required=False, 22 | help="Trained lm folder path with unigram and bigram files") 23 | parser.add_argument("--pretrained_model_name", "-pwmn", default="facebook/wav2vec2-base-960h", 24 | type=str, required=False, help="Pretrained wav2vec2 model name") 25 | 26 | args = parser.parse_args() 27 | 28 | asr = Wav2Vec2ASR(device=args.device, 29 | processor_path=args.processor, 30 | model_path=args.model, 31 | pretrained_model_name=args.pretrained_model_name, 32 | beam_width=args.beam_width, 33 | lm_path=args.lm) 34 | 35 | print("Loading Models ...") 36 | asr.load() 37 | print("Models Loaded ...") 38 | 39 | 40 | def write_to_file(output_file, transcriptions): 41 | output_file.write(transcriptions) 42 | 43 | 44 | def print_transcription(transcription): 45 | print(transcription, end=" ") 46 | sys.stdout.flush() 47 | 48 | 49 | async def main(output_file=None): 50 | loop = asyncio.get_running_loop() 51 | stream = MicrophoneStreaming(blocksize=args.blocksize, loop=loop) 52 | async for transcription in asr.capture_and_transcribe(stream, loop=loop): 53 | if not transcription == "": 54 | print_func = functools.partial( 55 | print_transcription, transcription=transcription) 56 | await loop.run_in_executor(None, print_func) 57 | if output_file is not None: 
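# run the blocking file write in the default executor as well, so it does not stall the capture-and-transcribe loop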
58 | write_func = functools.partial(write_to_file, output_file=output_file, 59 | transcriptions=transcriptions) 60 | await loop.run_in_executor(None, write_func) 61 | 62 | if __name__ == "__main__": 63 | print("Start Transcribing...") 64 | try: 65 | if args.output: 66 | with open(args.output, "w") as f: 67 | asyncio.run(main(f)) 68 | else: 69 | asyncio.run(main()) 70 | except KeyboardInterrupt: 71 | print("Exited") 72 | -------------------------------------------------------------------------------- /asr_inference_offline.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import functools 4 | from asr.utils import AudioReader 5 | from asr.wav2vec2 import Wav2Vec2ASR 6 | 7 | parser = argparse.ArgumentParser( 8 | description="ASR with recorded audio (offline)") 9 | parser.add_argument("--recording", "-rec", required=True, 10 | help="path to recording file") 11 | parser.add_argument("--model", "-m", default=None, required=False, 12 | help="path to local saved model") 13 | parser.add_argument("--processor", "-t", default=None, required=False, 14 | help="path to local saved processor") 15 | parser.add_argument("--output", "-out", required=False, 16 | help="path to save resultant transcriptions") 17 | parser.add_argument("--lm", "-l", default=None, required=False, 18 | help="Trained lm folder path with unigram and bigram files") 19 | parser.add_argument("--device", "-d", default='cpu', nargs='?', choices=['cuda', 'cpu'], required=False, 20 | help="device to use for inferencing") 21 | parser.add_argument("--beam_width", "-bw", default=1, type=int, required=False, 22 | help="beam width to use for beam search decoder during inferencing") 23 | parser.add_argument("--pretrained_model_name", "-pwmn", default="facebook/wav2vec2-base-960h", 24 | type=str, required=False, help="Pretrained wav2vec2 model name") 25 | 26 | args = parser.parse_args() 27 | 28 | asr = Wav2Vec2ASR(device=args.device, 29 | processor_path=args.processor, 30 | model_path=args.model, 31 | pretrained_model_name=args.pretrained_model_name, 32 | beam_width=args.beam_width, 33 | lm_path=args.lm) 34 | 35 | print("Loading Models ...") 36 | asr.load() 37 | print("Models Loaded ...") 38 | 39 | 40 | async def main(): 41 | loop = asyncio.get_running_loop() 42 | reader = AudioReader(audio_path=args.recording, 43 | sr=16000, 44 | dtype="float32") 45 | inputs, sr = reader.read() 46 | transcriptions = await asr.transcribe(inputs, loop=loop) 47 | print(transcriptions) 48 | if args.output: 49 | with open(args.output, "w") as f: 50 | f.write(transcriptions) 51 | 52 | 53 | if __name__ == "__main__": 54 | print("Start Transcribing...") 55 | try: 56 | asyncio.run(main()) 57 | except KeyboardInterrupt: 58 | print("Exited") 59 | -------------------------------------------------------------------------------- /asr_inference_recording.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | import functools 4 | import sys 5 | from asr.utils import AudioStreaming 6 | from asr.wav2vec2 import Wav2Vec2ASR 7 | 8 | parser = argparse.ArgumentParser(description="ASR with live audio") 9 | parser.add_argument("--recording", "-rec", required=True, 10 | help="path to recording file") 11 | parser.add_argument("--model", "-m", default=None, required=False, 12 | help="Trained Model local path") 13 | parser.add_argument("--processor", "-t", default=None, required=False, 14 | help="Local asr processor path") 15 | 
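# streaming-specific options: --blocksize sets how many samples are fed to the model per chunk, --overlap how many samples consecutive chunks share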
parser.add_argument("--blocksize", "-bs", default=16000, type=int, required=False, 16 | help="Size of each audio block to be passed to model") 17 | parser.add_argument("--overlap", "-ov", default=0, type=int, required=False, 18 | help="Overlapping amount in audio blocks") 19 | parser.add_argument("--output", "-out", required=False, 20 | help="path to save resultant transcriptions") 21 | parser.add_argument("--device", "-d", default='cpu', nargs='?', choices=['cuda', 'cpu'], required=False, 22 | help="device to use for inferencing") 23 | parser.add_argument("--beam_width", "-bw", default=1, type=int, required=False, 24 | help="beam width to use for beam search decoder during inferencing") 25 | parser.add_argument("--lm", "-l", default=None, required=False, 26 | help="Trained lm folder path with unigram and bigram files") 27 | parser.add_argument("--pretrained_model_name", "-pwmn", default="facebook/wav2vec2-base-960h", 28 | type=str, required=False, help="Pretrained wav2vec2 model name") 29 | 30 | args = parser.parse_args() 31 | 32 | asr = Wav2Vec2ASR(device=args.device, 33 | processor_path=args.processor, 34 | model_path=args.model, 35 | pretrained_model_name=args.pretrained_model_name, 36 | beam_width=args.beam_width, 37 | lm_path=args.lm) 38 | 39 | print("Loading Models ...") 40 | asr.load() 41 | print("Models Loaded ...") 42 | 43 | 44 | def write_to_file(output_file, transcriptions): 45 | output_file.write(transcriptions) 46 | 47 | 48 | def print_transcription(transcription): 49 | print(transcription, end=" ") 50 | sys.stdout.flush() 51 | 52 | 53 | async def main(output_file=None): 54 | loop = asyncio.get_running_loop() 55 | stream = AudioStreaming(audio_path=args.recording, blocksize=args.blocksize, 56 | overlap=args.overlap) 57 | async for transcription in asr.capture_and_transcribe(stream, loop=loop): 58 | if not transcription == "": 59 | print_func = functools.partial( 60 | print_transcription, transcription=transcription) 61 | await loop.run_in_executor(None, print_func) 62 | if output_file is not None: 63 | write_func = functools.partial(write_to_file, output_file=output_file, 64 | transcriptions=transcriptions) 65 | await loop.run_in_executor(None, write_func) 66 | 67 | if __name__ == "__main__": 68 | print("Start Transcribing...") 69 | try: 70 | if args.output: 71 | with open(args.output, "w") as f: 72 | asyncio.run(main(f)) 73 | else: 74 | asyncio.run(main()) 75 | except KeyboardInterrupt: 76 | print("Exited") 77 | -------------------------------------------------------------------------------- /data/models/lm/twitter/bigram.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/data/models/lm/twitter/bigram.pkl -------------------------------------------------------------------------------- /data/models/lm/twitter/unigram.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/data/models/lm/twitter/unigram.pkl -------------------------------------------------------------------------------- /data/models/lm/wikipedia/bigram.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/data/models/lm/wikipedia/bigram.pkl -------------------------------------------------------------------------------- 
/data/models/lm/wikipedia/unigram.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/data/models/lm/wikipedia/unigram.pkl -------------------------------------------------------------------------------- /data/samples/Achievements_of_the_Democratic_Party_(Homer_S._Cummings).ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/data/samples/Achievements_of_the_Democratic_Party_(Homer_S._Cummings).ogg -------------------------------------------------------------------------------- /data/samples/rec.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/data/samples/rec.wav -------------------------------------------------------------------------------- /data/samples/rec2.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/data/samples/rec2.wav -------------------------------------------------------------------------------- /data/samples/shortened.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tarun-bisht/wav2vec2-asr/17308ac128f9762b30220f1e580f699bc98be2e7/data/samples/shortened.wav -------------------------------------------------------------------------------- /notebooks/Training_Simple_Lanugage_Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Training Simple Lanugage Model.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "widgets": { 18 | "application/vnd.jupyter.widget-state+json": { 19 | "0677f310a8104790b2f4545d1bece7b0": { 20 | "model_module": "@jupyter-widgets/controls", 21 | "model_name": "HBoxModel", 22 | "model_module_version": "1.5.0", 23 | "state": { 24 | "_view_name": "HBoxView", 25 | "_dom_classes": [], 26 | "_model_name": "HBoxModel", 27 | "_view_module": "@jupyter-widgets/controls", 28 | "_model_module_version": "1.5.0", 29 | "_view_count": null, 30 | "_view_module_version": "1.5.0", 31 | "box_style": "", 32 | "layout": "IPY_MODEL_5cb0a81006e14c4eba6da4cb4b9bf14b", 33 | "_model_module": "@jupyter-widgets/controls", 34 | "children": [ 35 | "IPY_MODEL_0cd3f576c0c2445f9817c5b56a87b900", 36 | "IPY_MODEL_4f066cb8912e40108de0c8d0d5667b09", 37 | "IPY_MODEL_35e18b46304a40e696ea98c7e05d6c94" 38 | ] 39 | } 40 | }, 41 | "5cb0a81006e14c4eba6da4cb4b9bf14b": { 42 | "model_module": "@jupyter-widgets/base", 43 | "model_name": "LayoutModel", 44 | "model_module_version": "1.2.0", 45 | "state": { 46 | "_view_name": "LayoutView", 47 | "grid_template_rows": null, 48 | "right": null, 49 | "justify_content": null, 50 | "_view_module": "@jupyter-widgets/base", 51 | "overflow": null, 52 | "_model_module_version": "1.2.0", 53 | "_view_count": null, 54 | "flex_flow": null, 55 | "width": null, 56 | "min_width": null, 57 | "border": null, 58 | "align_items": null, 59 | "bottom": null, 60 | "_model_module": 
"@jupyter-widgets/base", 61 | "top": null, 62 | "grid_column": null, 63 | "overflow_y": null, 64 | "overflow_x": null, 65 | "grid_auto_flow": null, 66 | "grid_area": null, 67 | "grid_template_columns": null, 68 | "flex": null, 69 | "_model_name": "LayoutModel", 70 | "justify_items": null, 71 | "grid_row": null, 72 | "max_height": null, 73 | "align_content": null, 74 | "visibility": null, 75 | "align_self": null, 76 | "height": null, 77 | "min_height": null, 78 | "padding": null, 79 | "grid_auto_rows": null, 80 | "grid_gap": null, 81 | "max_width": null, 82 | "order": null, 83 | "_view_module_version": "1.2.0", 84 | "grid_template_areas": null, 85 | "object_position": null, 86 | "object_fit": null, 87 | "grid_auto_columns": null, 88 | "margin": null, 89 | "display": null, 90 | "left": null 91 | } 92 | }, 93 | "0cd3f576c0c2445f9817c5b56a87b900": { 94 | "model_module": "@jupyter-widgets/controls", 95 | "model_name": "HTMLModel", 96 | "model_module_version": "1.5.0", 97 | "state": { 98 | "_view_name": "HTMLView", 99 | "style": "IPY_MODEL_54b399d1ad7747088cc4a73837b87c60", 100 | "_dom_classes": [], 101 | "description": "", 102 | "_model_name": "HTMLModel", 103 | "placeholder": "​", 104 | "_view_module": "@jupyter-widgets/controls", 105 | "_model_module_version": "1.5.0", 106 | "value": " 0%", 107 | "_view_count": null, 108 | "_view_module_version": "1.5.0", 109 | "description_tooltip": null, 110 | "_model_module": "@jupyter-widgets/controls", 111 | "layout": "IPY_MODEL_e2d00fc27634452383943b40e131f36d" 112 | } 113 | }, 114 | "4f066cb8912e40108de0c8d0d5667b09": { 115 | "model_module": "@jupyter-widgets/controls", 116 | "model_name": "FloatProgressModel", 117 | "model_module_version": "1.5.0", 118 | "state": { 119 | "_view_name": "ProgressView", 120 | "style": "IPY_MODEL_b8da8be70fb342b0ae900853af0bb7d1", 121 | "_dom_classes": [], 122 | "description": "", 123 | "_model_name": "FloatProgressModel", 124 | "bar_style": "danger", 125 | "max": 5824596, 126 | "_view_module": "@jupyter-widgets/controls", 127 | "_model_module_version": "1.5.0", 128 | "value": 999, 129 | "_view_count": null, 130 | "_view_module_version": "1.5.0", 131 | "orientation": "horizontal", 132 | "min": 0, 133 | "description_tooltip": null, 134 | "_model_module": "@jupyter-widgets/controls", 135 | "layout": "IPY_MODEL_3321512e63f9446f8d2d8d57d7586607" 136 | } 137 | }, 138 | "35e18b46304a40e696ea98c7e05d6c94": { 139 | "model_module": "@jupyter-widgets/controls", 140 | "model_name": "HTMLModel", 141 | "model_module_version": "1.5.0", 142 | "state": { 143 | "_view_name": "HTMLView", 144 | "style": "IPY_MODEL_ec1edc03d12241ba96e467f120e46ac0", 145 | "_dom_classes": [], 146 | "description": "", 147 | "_model_name": "HTMLModel", 148 | "placeholder": "​", 149 | "_view_module": "@jupyter-widgets/controls", 150 | "_model_module_version": "1.5.0", 151 | "value": " 999/5824596 [00:01<36:25, 2665.15it/s]", 152 | "_view_count": null, 153 | "_view_module_version": "1.5.0", 154 | "description_tooltip": null, 155 | "_model_module": "@jupyter-widgets/controls", 156 | "layout": "IPY_MODEL_030880aa5a054d788231490e0f4a7e78" 157 | } 158 | }, 159 | "54b399d1ad7747088cc4a73837b87c60": { 160 | "model_module": "@jupyter-widgets/controls", 161 | "model_name": "DescriptionStyleModel", 162 | "model_module_version": "1.5.0", 163 | "state": { 164 | "_view_name": "StyleView", 165 | "_model_name": "DescriptionStyleModel", 166 | "description_width": "", 167 | "_view_module": "@jupyter-widgets/base", 168 | "_model_module_version": "1.5.0", 169 | "_view_count": 
null, 170 | "_view_module_version": "1.2.0", 171 | "_model_module": "@jupyter-widgets/controls" 172 | } 173 | }, 174 | "e2d00fc27634452383943b40e131f36d": { 175 | "model_module": "@jupyter-widgets/base", 176 | "model_name": "LayoutModel", 177 | "model_module_version": "1.2.0", 178 | "state": { 179 | "_view_name": "LayoutView", 180 | "grid_template_rows": null, 181 | "right": null, 182 | "justify_content": null, 183 | "_view_module": "@jupyter-widgets/base", 184 | "overflow": null, 185 | "_model_module_version": "1.2.0", 186 | "_view_count": null, 187 | "flex_flow": null, 188 | "width": null, 189 | "min_width": null, 190 | "border": null, 191 | "align_items": null, 192 | "bottom": null, 193 | "_model_module": "@jupyter-widgets/base", 194 | "top": null, 195 | "grid_column": null, 196 | "overflow_y": null, 197 | "overflow_x": null, 198 | "grid_auto_flow": null, 199 | "grid_area": null, 200 | "grid_template_columns": null, 201 | "flex": null, 202 | "_model_name": "LayoutModel", 203 | "justify_items": null, 204 | "grid_row": null, 205 | "max_height": null, 206 | "align_content": null, 207 | "visibility": null, 208 | "align_self": null, 209 | "height": null, 210 | "min_height": null, 211 | "padding": null, 212 | "grid_auto_rows": null, 213 | "grid_gap": null, 214 | "max_width": null, 215 | "order": null, 216 | "_view_module_version": "1.2.0", 217 | "grid_template_areas": null, 218 | "object_position": null, 219 | "object_fit": null, 220 | "grid_auto_columns": null, 221 | "margin": null, 222 | "display": null, 223 | "left": null 224 | } 225 | }, 226 | "b8da8be70fb342b0ae900853af0bb7d1": { 227 | "model_module": "@jupyter-widgets/controls", 228 | "model_name": "ProgressStyleModel", 229 | "model_module_version": "1.5.0", 230 | "state": { 231 | "_view_name": "StyleView", 232 | "_model_name": "ProgressStyleModel", 233 | "description_width": "", 234 | "_view_module": "@jupyter-widgets/base", 235 | "_model_module_version": "1.5.0", 236 | "_view_count": null, 237 | "_view_module_version": "1.2.0", 238 | "bar_color": null, 239 | "_model_module": "@jupyter-widgets/controls" 240 | } 241 | }, 242 | "3321512e63f9446f8d2d8d57d7586607": { 243 | "model_module": "@jupyter-widgets/base", 244 | "model_name": "LayoutModel", 245 | "model_module_version": "1.2.0", 246 | "state": { 247 | "_view_name": "LayoutView", 248 | "grid_template_rows": null, 249 | "right": null, 250 | "justify_content": null, 251 | "_view_module": "@jupyter-widgets/base", 252 | "overflow": null, 253 | "_model_module_version": "1.2.0", 254 | "_view_count": null, 255 | "flex_flow": null, 256 | "width": null, 257 | "min_width": null, 258 | "border": null, 259 | "align_items": null, 260 | "bottom": null, 261 | "_model_module": "@jupyter-widgets/base", 262 | "top": null, 263 | "grid_column": null, 264 | "overflow_y": null, 265 | "overflow_x": null, 266 | "grid_auto_flow": null, 267 | "grid_area": null, 268 | "grid_template_columns": null, 269 | "flex": null, 270 | "_model_name": "LayoutModel", 271 | "justify_items": null, 272 | "grid_row": null, 273 | "max_height": null, 274 | "align_content": null, 275 | "visibility": null, 276 | "align_self": null, 277 | "height": null, 278 | "min_height": null, 279 | "padding": null, 280 | "grid_auto_rows": null, 281 | "grid_gap": null, 282 | "max_width": null, 283 | "order": null, 284 | "_view_module_version": "1.2.0", 285 | "grid_template_areas": null, 286 | "object_position": null, 287 | "object_fit": null, 288 | "grid_auto_columns": null, 289 | "margin": null, 290 | "display": null, 291 | "left": null 292 | } 
293 | }, 294 | "ec1edc03d12241ba96e467f120e46ac0": { 295 | "model_module": "@jupyter-widgets/controls", 296 | "model_name": "DescriptionStyleModel", 297 | "model_module_version": "1.5.0", 298 | "state": { 299 | "_view_name": "StyleView", 300 | "_model_name": "DescriptionStyleModel", 301 | "description_width": "", 302 | "_view_module": "@jupyter-widgets/base", 303 | "_model_module_version": "1.5.0", 304 | "_view_count": null, 305 | "_view_module_version": "1.2.0", 306 | "_model_module": "@jupyter-widgets/controls" 307 | } 308 | }, 309 | "030880aa5a054d788231490e0f4a7e78": { 310 | "model_module": "@jupyter-widgets/base", 311 | "model_name": "LayoutModel", 312 | "model_module_version": "1.2.0", 313 | "state": { 314 | "_view_name": "LayoutView", 315 | "grid_template_rows": null, 316 | "right": null, 317 | "justify_content": null, 318 | "_view_module": "@jupyter-widgets/base", 319 | "overflow": null, 320 | "_model_module_version": "1.2.0", 321 | "_view_count": null, 322 | "flex_flow": null, 323 | "width": null, 324 | "min_width": null, 325 | "border": null, 326 | "align_items": null, 327 | "bottom": null, 328 | "_model_module": "@jupyter-widgets/base", 329 | "top": null, 330 | "grid_column": null, 331 | "overflow_y": null, 332 | "overflow_x": null, 333 | "grid_auto_flow": null, 334 | "grid_area": null, 335 | "grid_template_columns": null, 336 | "flex": null, 337 | "_model_name": "LayoutModel", 338 | "justify_items": null, 339 | "grid_row": null, 340 | "max_height": null, 341 | "align_content": null, 342 | "visibility": null, 343 | "align_self": null, 344 | "height": null, 345 | "min_height": null, 346 | "padding": null, 347 | "grid_auto_rows": null, 348 | "grid_gap": null, 349 | "max_width": null, 350 | "order": null, 351 | "_view_module_version": "1.2.0", 352 | "grid_template_areas": null, 353 | "object_position": null, 354 | "object_fit": null, 355 | "grid_auto_columns": null, 356 | "margin": null, 357 | "display": null, 358 | "left": null 359 | } 360 | } 361 | } 362 | } 363 | }, 364 | "cells": [ 365 | { 366 | "cell_type": "markdown", 367 | "metadata": { 368 | "id": "F3MEfwY-PRn1" 369 | }, 370 | "source": [ 371 | "## Simple Language Model Training with wikipedia data" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "metadata": { 377 | "id": "FkkJxWk89_kM" 378 | }, 379 | "source": [ 380 | "import os\n", 381 | "import pickle\n", 382 | "import tensorflow as tf\n", 383 | "import tensorflow_datasets as tfds\n", 384 | "import numpy as np\n", 385 | "from tqdm.auto import tqdm" 386 | ], 387 | "execution_count": 21, 388 | "outputs": [] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "metadata": { 393 | "id": "wJmMm4doOjxM" 394 | }, 395 | "source": [ 396 | "class LanguageModel:\n", 397 | " \"\"\"Simple character-level language model\"\"\"\n", 398 | "\n", 399 | " def __init__(self, chars: list) -> None:\n", 400 | " self._unigram = {c: 0 for c in chars}\n", 401 | " self._bigram = {c: {d: 0 for d in chars} for c in chars}\n", 402 | " self.chars = chars\n", 403 | "\n", 404 | " def get_char_unigram(self, c: str) -> float:\n", 405 | " \"\"\"Probability of character c.\"\"\"\n", 406 | " return self._unigram[c]\n", 407 | "\n", 408 | " def get_char_bigram(self, c: str, d: str) -> float:\n", 409 | " \"\"\"Probability that character c is followed by character d.\"\"\"\n", 410 | " return self._bigram[c][d]\n", 411 | "\n", 412 | " def train(self, txt: str):\n", 413 | " \"\"\"Create language model from text corpus.\"\"\"\n", 414 | " # compute unigrams\n", 415 | " for c in txt:\n", 416 | " # ignore 
unknown chars\n", 417 | " if c not in self._unigram:\n", 418 | " continue\n", 419 | " self._unigram[c] += 1\n", 420 | "\n", 421 | " # compute bigrams\n", 422 | " for i in range(len(txt) - 1):\n", 423 | " c = txt[i]\n", 424 | " d = txt[i + 1]\n", 425 | "\n", 426 | " # ignore unknown chars\n", 427 | " if c not in self._bigram or d not in self._bigram[c]:\n", 428 | " continue\n", 429 | "\n", 430 | " self._bigram[c][d] += 1\n", 431 | "\n", 432 | " def normalize(self):\n", 433 | " # normalize\n", 434 | " sum_unigram = sum(self._unigram.values())\n", 435 | " for c in self.chars:\n", 436 | " self._unigram[c] /= sum_unigram\n", 437 | "\n", 438 | " for c in self.chars:\n", 439 | " sum_bigram = sum(self._bigram[c].values())\n", 440 | " if sum_bigram == 0:\n", 441 | " continue\n", 442 | " for d in self.chars:\n", 443 | " self._bigram[c][d] /= sum_bigram\n", 444 | "\n", 445 | " def save(self, path):\n", 446 | " with open(os.path.join(path, \"unigram.pkl\"), 'wb') as pkl:\n", 447 | " pickle.dump(self._unigram, pkl)\n", 448 | " with open(os.path.join(path, \"bigram.pkl\"), 'wb') as pkl:\n", 449 | " pickle.dump(self._bigram, pkl)\n", 450 | "\n", 451 | " def load(self, path):\n", 452 | " with open(os.path.join(path, \"unigram.pkl\"), 'rb') as pkl:\n", 453 | " self._unigram = pickle.load(pkl)\n", 454 | " with open(os.path.join(path, \"bigram.pkl\"), 'rb') as pkl:\n", 455 | " self._bigram = pickle.load(pkl)" 456 | ], 457 | "execution_count": 22, 458 | "outputs": [] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "metadata": { 463 | "id": "BdAZjTyu2EgL" 464 | }, 465 | "source": [ 466 | "# Loading the wikipedia dataset.\n", 467 | "DATASET_NAME = 'wikipedia/20190301.en'\n", 468 | "# DATASET_NAME = 'wikipedia/20190301.uk'\n", 469 | "\n", 470 | "dataset, dataset_info = tfds.load(\n", 471 | " name=DATASET_NAME,\n", 472 | " data_dir='tmp',\n", 473 | " with_info=True,\n", 474 | " split=tfds.Split.TRAIN)" 475 | ], 476 | "execution_count": 23, 477 | "outputs": [] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "metadata": { 482 | "colab": { 483 | "base_uri": "https://localhost:8080/" 484 | }, 485 | "id": "bMhatKag2Nw9", 486 | "outputId": "412f4dce-a3cf-4779-a0aa-e3963c625cba" 487 | }, 488 | "source": [ 489 | "print(dataset)" 490 | ], 491 | "execution_count": 24, 492 | "outputs": [ 493 | { 494 | "output_type": "stream", 495 | "name": "stdout", 496 | "text": [ 497 | "\n" 498 | ] 499 | } 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "metadata": { 505 | "colab": { 506 | "base_uri": "https://localhost:8080/" 507 | }, 508 | "id": "CUoeROKZ6bw7", 509 | "outputId": "f57b710d-f87a-4b51-9346-5a76453a8646" 510 | }, 511 | "source": [ 512 | "TRAIN_NUM_EXAMPLES = dataset_info.splits['train'].num_examples\n", 513 | "print('Total number of articles: ', TRAIN_NUM_EXAMPLES)" 514 | ], 515 | "execution_count": 25, 516 | "outputs": [ 517 | { 518 | "output_type": "stream", 519 | "name": "stdout", 520 | "text": [ 521 | "Total number of articles: 5824596\n" 522 | ] 523 | } 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "metadata": { 529 | "id": "Vrs6RqLE_of_" 530 | }, 531 | "source": [ 532 | "vocab_dict = {\"\": 0,\n", 533 | " \"\": 1,\n", 534 | " \"\": 2,\n", 535 | " \"\": 3,\n", 536 | " \"|\": 4,\n", 537 | " \"E\": 5,\n", 538 | " \"T\": 6,\n", 539 | " \"A\": 7,\n", 540 | " \"O\": 8,\n", 541 | " \"N\": 9,\n", 542 | " \"I\": 10,\n", 543 | " \"H\": 11,\n", 544 | " \"S\": 12,\n", 545 | " \"R\": 13,\n", 546 | " \"D\": 14,\n", 547 | " \"L\": 15,\n", 548 | " \"U\": 16,\n", 549 | " \"M\": 17,\n", 550 | " \"W\": 18,\n", 
551 | " \"C\": 19,\n", 552 | " \"F\": 20,\n", 553 | " \"G\": 21,\n", 554 | " \"Y\": 22,\n", 555 | " \"P\": 23,\n", 556 | " \"B\": 24,\n", 557 | " \"V\": 25,\n", 558 | " \"K\": 26,\n", 559 | " \"'\": 27,\n", 560 | " \"X\": 28,\n", 561 | " \"J\": 29,\n", 562 | " \"Q\": 30,\n", 563 | " \"Z\": 31\n", 564 | " }\n", 565 | "\n", 566 | "vocab_list = [key for key, value in vocab_dict.items()]" 567 | ], 568 | "execution_count": 26, 569 | "outputs": [] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "metadata": { 574 | "id": "Bi3qjUKjAYR-" 575 | }, 576 | "source": [ 577 | "def change_digit_to_word(x):\n", 578 | " x = x.replace(\"0\", \"zero \")\n", 579 | " x = x.replace(\"1\", \"one \")\n", 580 | " x = x.replace(\"2\", \"two \")\n", 581 | " x = x.replace(\"3\", \"three \")\n", 582 | " x = x.replace(\"4\", \"four \")\n", 583 | " x = x.replace(\"5\", \"five \")\n", 584 | " x = x.replace(\"6\", \"six \")\n", 585 | " x = x.replace(\"7\", \"seven \")\n", 586 | " x = x.replace(\"8\", \"eight \")\n", 587 | " x = x.replace(\"9\", \"nine \")\n", 588 | " x = x.replace(\" \", \" \")\n", 589 | " x = x.strip()\n", 590 | " return x" 591 | ], 592 | "execution_count": 27, 593 | "outputs": [] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "metadata": { 598 | "id": "mE4QEWz8_ORR" 599 | }, 600 | "source": [ 601 | "lm = LanguageModel(chars=vocab_list[1:])" 602 | ], 603 | "execution_count": 31, 604 | "outputs": [] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "metadata": { 609 | "colab": { 610 | "base_uri": "https://localhost:8080/", 611 | "height": 49, 612 | "referenced_widgets": [ 613 | "0677f310a8104790b2f4545d1bece7b0", 614 | "5cb0a81006e14c4eba6da4cb4b9bf14b", 615 | "0cd3f576c0c2445f9817c5b56a87b900", 616 | "4f066cb8912e40108de0c8d0d5667b09", 617 | "35e18b46304a40e696ea98c7e05d6c94", 618 | "54b399d1ad7747088cc4a73837b87c60", 619 | "e2d00fc27634452383943b40e131f36d", 620 | "b8da8be70fb342b0ae900853af0bb7d1", 621 | "3321512e63f9446f8d2d8d57d7586607", 622 | "ec1edc03d12241ba96e467f120e46ac0", 623 | "030880aa5a054d788231490e0f4a7e78" 624 | ] 625 | }, 626 | "id": "ap0wYRTa4w6L", 627 | "outputId": "1ba7cd8d-d70b-489b-9dc7-a143668c1fe0" 628 | }, 629 | "source": [ 630 | "sample_per_corpus = 1000\n", 631 | "corpus = \"\"\n", 632 | "step = 0\n", 633 | "for example in tqdm(dataset):\n", 634 | " corpus += example['title'].numpy().decode('utf-8')\n", 635 | " corpus += \" \"\n", 636 | " corpus += example['text'].numpy().decode('utf-8')\n", 637 | " step += 1\n", 638 | " if step == sample_per_corpus:\n", 639 | " lm.train(corpus)\n", 640 | " step = 0\n", 641 | " corpus = \"\"\n", 642 | "lm.normalize()" 643 | ], 644 | "execution_count": 32, 645 | "outputs": [ 646 | { 647 | "output_type": "display_data", 648 | "data": { 649 | "application/vnd.jupyter.widget-view+json": { 650 | "model_id": "0677f310a8104790b2f4545d1bece7b0", 651 | "version_minor": 0, 652 | "version_major": 2 653 | }, 654 | "text/plain": [ 655 | " 0%| | 0/5824596 [00:00\n", 1556 | "var my_div = document.createElement(\"DIV\");\n", 1557 | "var my_p = document.createElement(\"P\");\n", 1558 | "var my_btn = document.createElement(\"BUTTON\");\n", 1559 | "var t = document.createTextNode(\"Press to start recording\");\n", 1560 | "\n", 1561 | "my_btn.appendChild(t);\n", 1562 | "//my_p.appendChild(my_btn);\n", 1563 | "my_div.appendChild(my_btn);\n", 1564 | "document.body.appendChild(my_div);\n", 1565 | "\n", 1566 | "var base64data = 0;\n", 1567 | "var reader;\n", 1568 | "var recorder, gumStream;\n", 1569 | "var recordButton = my_btn;\n", 1570 | "\n", 1571 | "var 
handleSuccess = function(stream) {\n", 1572 | " gumStream = stream;\n", 1573 | " var options = {\n", 1574 | " //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n", 1575 | " mimeType : 'audio/webm;codecs=opus'\n", 1576 | " //mimeType : 'audio/webm;codecs=pcm'\n", 1577 | " }; \n", 1578 | " //recorder = new MediaRecorder(stream, options);\n", 1579 | " recorder = new MediaRecorder(stream);\n", 1580 | " recorder.ondataavailable = function(e) { \n", 1581 | " var url = URL.createObjectURL(e.data);\n", 1582 | " var preview = document.createElement('audio');\n", 1583 | " preview.controls = true;\n", 1584 | " preview.src = url;\n", 1585 | " document.body.appendChild(preview);\n", 1586 | "\n", 1587 | " reader = new FileReader();\n", 1588 | " reader.readAsDataURL(e.data); \n", 1589 | " reader.onloadend = function() {\n", 1590 | " base64data = reader.result;\n", 1591 | " //console.log(\"Inside FileReader:\" + base64data);\n", 1592 | " }\n", 1593 | " };\n", 1594 | " recorder.start();\n", 1595 | " };\n", 1596 | "\n", 1597 | "recordButton.innerText = \"Recording... press to stop\";\n", 1598 | "\n", 1599 | "navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);\n", 1600 | "\n", 1601 | "\n", 1602 | "function toggleRecording() {\n", 1603 | " if (recorder && recorder.state == \"recording\") {\n", 1604 | " recorder.stop();\n", 1605 | " gumStream.getAudioTracks()[0].stop();\n", 1606 | " recordButton.innerText = \"Saving the recording... pls wait!\"\n", 1607 | " }\n", 1608 | "}\n", 1609 | "\n", 1610 | "// https://stackoverflow.com/a/951057\n", 1611 | "function sleep(ms) {\n", 1612 | " return new Promise(resolve => setTimeout(resolve, ms));\n", 1613 | "}\n", 1614 | "\n", 1615 | "var data = new Promise(resolve=>{\n", 1616 | "//recordButton.addEventListener(\"click\", toggleRecording);\n", 1617 | "recordButton.onclick = ()=>{\n", 1618 | "toggleRecording()\n", 1619 | "\n", 1620 | "sleep(2000).then(() => {\n", 1621 | " // wait 2000ms for the data to be available...\n", 1622 | " // ideally this should use something like await...\n", 1623 | " //console.log(\"Inside data:\" + base64data)\n", 1624 | " resolve(base64data.toString())\n", 1625 | "\n", 1626 | "});\n", 1627 | "\n", 1628 | "}\n", 1629 | "});\n", 1630 | " \n", 1631 | "\n", 1632 | "\"\"\"\n", 1633 | "\n", 1634 | "def get_audio(sr):\n", 1635 | " display(HTML(AUDIO_HTML))\n", 1636 | " data = eval_js(\"data\")\n", 1637 | " binary = b64decode(data.split(',')[1])\n", 1638 | " \n", 1639 | " process = (ffmpeg\n", 1640 | " .input('pipe:0')\n", 1641 | " .output('pipe:1', format='wav')\n", 1642 | " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n", 1643 | " )\n", 1644 | " output, err = process.communicate(input=binary)\n", 1645 | " \n", 1646 | " riff_chunk_size = len(output) - 8\n", 1647 | " # Break up the chunk size into four bytes, held in b.\n", 1648 | " q = riff_chunk_size\n", 1649 | " b = []\n", 1650 | " for i in range(4):\n", 1651 | " q, r = divmod(q, 256)\n", 1652 | " b.append(r)\n", 1653 | "\n", 1654 | " # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.\n", 1655 | " riff = output[:4] + bytes(b) + output[8:]\n", 1656 | "\n", 1657 | " speech, rate = librosa.load(io.BytesIO(riff),sr=16000)\n", 1658 | " return speech, sr" 1659 | ], 1660 | "execution_count": null, 1661 | "outputs": [] 1662 | }, 1663 | { 1664 | "cell_type": "markdown", 1665 | "metadata": { 1666 | "id": "rLr68Rei5p8g" 1667 | }, 1668 | "source": [ 1669 | "# Recording and loading audio" 1670 | ] 1671 | 
}, 1672 | { 1673 | "cell_type": "code", 1674 | "metadata": { 1675 | "colab": { 1676 | "base_uri": "https://localhost:8080/", 1677 | "height": 96 1678 | }, 1679 | "id": "KBIeAAWAwB7A", 1680 | "outputId": "bf5c3fe4-b62c-4ab1-d8b7-8d6fb83c95f3" 1681 | }, 1682 | "source": [ 1683 | "#load any audio file of your choice\n", 1684 | "speech, rate = get_audio(sr=16000)" 1685 | ], 1686 | "execution_count": null, 1687 | "outputs": [ 1688 | { 1689 | "output_type": "display_data", 1690 | "data": { 1691 | "text/html": [ 1692 | "\n", 1693 | "\n" 1770 | ], 1771 | "text/plain": [ 1772 | "" 1773 | ] 1774 | }, 1775 | "metadata": { 1776 | "tags": [] 1777 | } 1778 | } 1779 | ] 1780 | }, 1781 | { 1782 | "cell_type": "markdown", 1783 | "metadata": { 1784 | "id": "0Zya_f855yY_" 1785 | }, 1786 | "source": [ 1787 | "# Inferencing\n", 1788 | "- tokenizing(encoding) speech data and return pytorch tensor\n", 1789 | "- pass encodings to model" 1790 | ] 1791 | }, 1792 | { 1793 | "cell_type": "code", 1794 | "metadata": { 1795 | "id": "tA8ODZzNzp1_" 1796 | }, 1797 | "source": [ 1798 | "input_values = tokenizer(speech, return_tensors = 'pt').input_values\n", 1799 | "#logits (non-normalized predictions)\n", 1800 | "logits = model(input_values).logits" 1801 | ], 1802 | "execution_count": null, 1803 | "outputs": [] 1804 | }, 1805 | { 1806 | "cell_type": "markdown", 1807 | "metadata": { 1808 | "id": "Bw7t8DuE7iNg" 1809 | }, 1810 | "source": [ 1811 | "decoding transcript" 1812 | ] 1813 | }, 1814 | { 1815 | "cell_type": "code", 1816 | "metadata": { 1817 | "colab": { 1818 | "base_uri": "https://localhost:8080/" 1819 | }, 1820 | "id": "E_5sijeizdPy", 1821 | "outputId": "33064179-153f-4c76-fd0f-684ffac66d6b" 1822 | }, 1823 | "source": [ 1824 | "predicted_ids = torch.argmax(logits, dim =-1)\n", 1825 | "#decode the audio to generate text\n", 1826 | "transcriptions = tokenizer.decode(predicted_ids[0])\n", 1827 | "print(transcriptions)" 1828 | ], 1829 | "execution_count": null, 1830 | "outputs": [ 1831 | { 1832 | "output_type": "stream", 1833 | "text": [ 1834 | "A WO\n" 1835 | ], 1836 | "name": "stdout" 1837 | } 1838 | ] 1839 | }, 1840 | { 1841 | "cell_type": "code", 1842 | "metadata": { 1843 | "id": "soSVdBHtB1Ab" 1844 | }, 1845 | "source": [ 1846 | "" 1847 | ], 1848 | "execution_count": null, 1849 | "outputs": [] 1850 | } 1851 | ] 1852 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sounddevice==0.4.2 2 | SoundFile==0.10.3.post1 3 | transformers==4.9.2 -------------------------------------------------------------------------------- /train_language_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from asr.language_model import LanguageModel 4 | from asr.wav2vec2.vocab import vocab_list 5 | from tqdm import tqdm 6 | 7 | corpus_path = os.path.join("data", "lm_training_corpus", "corpus.txt") 8 | save_path = os.path.join("data", "models", "lm") 9 | parser = argparse.ArgumentParser(description="Train character language model from text corpus") 10 | parser.add_argument("--corpus", "-c", default=corpus_path, type=str, help="path to text corpus for training") 11 | parser.add_argument("--save", "-s", default=save_path, type=str, help="path to save trained model") 12 | args = parser.parse_args() 13 | 14 | 15 | def change_digit_to_word(x): 16 | x = x.replace("0", "zero ") 17 | x = x.replace("1", "one ") 18 | x = x.replace("2", 
"two ") 19 | x = x.replace("3", "three ") 20 | x = x.replace("4", "four ") 21 | x = x.replace("5", "five ") 22 | x = x.replace("6", "six ") 23 | x = x.replace("7", "seven ") 24 | x = x.replace("8", "eight ") 25 | x = x.replace("9", "nine ") 26 | x = x.replace(" ", " ") 27 | x = x.strip() 28 | return x 29 | 30 | 31 | # excluding pad token for language model 32 | lm = LanguageModel(chars=vocab_list[1:]) 33 | 34 | lines = sum(1 for i in open(args.corpus, "r")) 35 | with open(args.corpus, "r") as txt: 36 | for line in tqdm(txt, total=lines): 37 | line = change_digit_to_word(line) 38 | lm.train(line) 39 | lm.normalize() 40 | lm.save(args.save) --------------------------------------------------------------------------------