├── synthetic-voices ├── README.md └── run-news.sh ├── README.md ├── timezones.tsv ├── sound_converter.py ├── currency.tsv ├── sound_resampler.py ├── LICENSE ├── voice_duration.py ├── news_splitter.py ├── measurements.tsv ├── .gitignore ├── transcription_writer.ipynb ├── text_processor.py ├── XLSR_Wav2Vec2_for_Indonesian_Evaluation-Javanese.ipynb ├── XLSR_Wav2Vec2_for_Indonesian_Evaluation-Sundanese.ipynb ├── XLSR_Wav2Vec2_for_Indonesian_Evaluation.ipynb └── XLSR_Wav2Vec2_for_Indonesian_Usage.ipynb /synthetic-voices/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Indonesian Speech Recognition 2 | Automatic Speech Recognition for Indonesian 3 | -------------------------------------------------------------------------------- /timezones.tsv: -------------------------------------------------------------------------------- 1 | WITA Waktu Indonesia Tengah 2 | WIB Waktu Indonesia Barat 3 | WIT Waktu Indonesia Timur 4 | GMT Greenwich Mean Time 5 | -------------------------------------------------------------------------------- /sound_converter.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import librosa 3 | import soundfile as sf 4 | 5 | """ 6 | This script converts ogg files from the Opus subtype to ogg with the Vorbis subtype. 7 | """ 8 | 9 | root_dir = Path("/mnt/mldata/data/ASR/news/test") 10 | src_dir = root_dir/"src" 11 | dst_dir = root_dir/"dst" 12 | 13 | for path in Path(src_dir).rglob('*.ogg'): 14 | print(path.name) 15 | data, sample_rate = librosa.load(path, sr=16000) 16 | dst_path = Path(str(path).replace(str(src_dir), str(dst_dir))) 17 | dst_path.parent.mkdir(parents=True, exist_ok=True) 18 | sf.write(dst_path, data, sample_rate) -------------------------------------------------------------------------------- /currency.tsv: -------------------------------------------------------------------------------- 1 | US$ dollar amerika serikat 2 | nzd dollar new zealand 3 | rs rupee 4 | chf franc swiss 5 | dkk kroner denmark 6 | fim markka finland 7 | aed dirham arab 8 | czk koruna ceko 9 | mro ouguiya mauritania 10 | pkr rupee pakistan 11 | crc colon costa rica 12 | hk$ dollar hong kong 13 | npr rupee nepal 14 | awg florin aruban 15 | nok kroner norwegia 16 | tzs shilling tanzania 17 | sek kronor swedish 18 | cyp pounds cypriot 19 | sar riyal saudi 20 | cve escudo cape verde 21 | rsd dinar serbia 22 | dm mark jerman 23 | shp pounds saint helena 24 | php peso philipina 25 | cad dollar canada 26 | ssp pounds sudan selatan 27 | scr rupee seychell 28 | mvr rufiyaa maldivia 29 | Rp rupiah 30 | r real 31 | $ dollar 32 | € euro 33 | £ pounds 34 | ₩ won 35 | ¥ yen -------------------------------------------------------------------------------- /sound_resampler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | import torchaudio 4 | 5 | """ 6 | This script resamples sound files to 16 kHz and saves them as ogg files.
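Example invocation (the two arguments are inferred from the sys.argv handling below; the paths are placeholders): python sound_resampler.py /path/to/source_dir /path/to/destination_dir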
7 | 8 | """ 9 | 10 | if len(sys.argv) != 3: 11 | print(sys.argv[0], "<source_dir> <destination_dir>") 12 | exit(1) 13 | 14 | src_dir = Path(sys.argv[1]) 15 | dst_dir = Path(sys.argv[2]) 16 | dst_sample_rate = 16_000 17 | 18 | for path in Path(src_dir).rglob('*'): 19 | if path.suffix in [".mp3", ".ogg", ".wav"]: 20 | print(path.name) 21 | data, src_sample_rate = torchaudio.load(path) 22 | resampler = torchaudio.transforms.Resample(src_sample_rate, dst_sample_rate) 23 | data = resampler(data) 24 | dst_path = Path(str(path.parent).replace(str(src_dir), str(dst_dir))) 25 | dst_path.mkdir(parents=True, exist_ok=True) 26 | dst_path = dst_path/f'{path.stem}.ogg' 27 | dst_path.parent.mkdir(parents=True, exist_ok=True) 28 | torchaudio.save(str(dst_path), data, dst_sample_rate) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Cahya Wirawan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /voice_duration.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import librosa 3 | import datetime 4 | from pydub import AudioSegment 5 | 6 | data_dir = "/mnt/mldata/data/ASR/news/id-Wavenet" 7 | 8 | 9 | def duration(dir, ext): 10 | path = Path(dir).glob(f'**/*.{ext}') 11 | duration = 0.0 12 | counter = 0 13 | for filename in path: 14 | if counter % 100 == 0: 15 | print(counter, datetime.timedelta(seconds=duration), filename) 16 | counter += 1 17 | if filename.is_file(): 18 | duration += librosa.get_duration(filename=str(filename)) 19 | # print(f'{duration}s, {duration/60}m, {duration/3600}h') 20 | return duration 21 | 22 | 23 | def pduration(dir, ext): 24 | path = Path(dir).glob(f'**/*.{ext}') 25 | duration = 0.0 26 | counter = 0 27 | for filename in path: 28 | if counter % 100 == 0: 29 | print(counter, datetime.timedelta(seconds=duration/1000), filename) 30 | counter += 1 31 | if filename.is_file(): 32 | sound = AudioSegment.from_file(filename, format=ext) 33 | duration += len(sound) 34 | # print(f'{duration}s, {duration/60}m, {duration/3600}h') 35 | return duration/1000 36 | 37 | 38 | sound_length = duration(data_dir, "ogg") 39 | print(str(datetime.timedelta(seconds=sound_length))) -------------------------------------------------------------------------------- /synthetic-voices/run-news.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## This script uses https://github.com/cahya-wirawan/artificial-commonvoice to 4 | ## generate synthetic voices. It reads SOURCE_FILE to retrieve the sound file names and their sentences. 5 | ## It generates 4 voice types for each sentence: id-ID-Wavenet-A, id-ID-Wavenet-B, id-ID-Wavenet-C, id-ID-Wavenet-D. 6 | ## It starts from the line number START+1 to the line number END of SOURCE_FILE. 7 | ## The result will be stored in DESTINATION_DIR. 8 | 9 | ## Please update the following SOURCE_FILE and DESTINATION_DIR accordingly 10 | SOURCE_FILE="/mnt/mldata/data/ASR/news/id-newspapers-small.tsv" 11 | DESTINATION_DIR="/mnt/mldata/data/ASR/news/id-newspapers" 12 | 13 | ## Please uncomment the START line assigned to your name 14 | 15 | ## Cahya 16 | #START=0 17 | 18 | ## Galuh 19 | #START=100000 20 | 21 | ## Akmal 22 | #START=200000 23 | 24 | ## Yasir 25 | #START=300000 26 | 27 | ## Agung 28 | #START=400000 29 | 30 | ## Samsul 31 | #START=500000 32 | 33 | ## LENGTH is ten thousand lines; since we use 4 voice types, it will generate 4*10000 = 40000 synthetic sound files.
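## A worked example of the slicing below (the values are illustrative, not an assignment): with
## START=100000 and LENGTH=10000, END becomes 110000, so commonvoice.py reads lines 100001..110000
## of SOURCE_FILE and writes 4 x 10000 = 40000 synthetic sound files under DESTINATION_DIR.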
34 | LENGTH=10000 35 | END=$((START+LENGTH)) 36 | 37 | if [ -z ${START} ] 38 | then 39 | echo "Please set the env variable START properly" 40 | else 41 | python commonvoice.py --debug -s -t 0.15 -c "${SOURCE_FILE}" -v id-ID-Wavenet-A id-ID-Wavenet-B id-ID-Wavenet-C id-ID-Wavenet-D --start "${START}" --end "${END}" -o "${DESTINATION_DIR}" 42 | fi -------------------------------------------------------------------------------- /news_splitter.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from langdetect import detect 3 | from text_normalizer import TextProcessor 4 | 5 | news_path = Path("/mnt/mldata/data/newspapers/newspapers-clean.txt") 6 | voices_path = Path("/mnt/mldata/data/ASR/news/newspapers.tsv") 7 | min_text_length = 15 8 | 9 | tp = TextProcessor() 10 | 11 | with open(news_path, "r") as news: 12 | i = 1 13 | with open(voices_path, "w") as voices: 14 | voices.write(f'path\tsentence\n') 15 | for line in news: 16 | if line != "\n" and len(line) > min_text_length: 17 | line = line.strip() 18 | try: 19 | lang = detect(line) 20 | if lang == "id": 21 | sentences = line.split(". ") 22 | sentence_merged = "" 23 | for sentence in sentences: 24 | if len(sentence) > min_text_length: 25 | sentence = tp.normalize(sentence) 26 | if len(sentence) + len(sentence_merged) < min_text_length*4: 27 | if sentence_merged == "": 28 | sentence_merged = sentence 29 | else: 30 | sentence_merged = sentence_merged + f'. {sentence}' 31 | continue 32 | if sentence_merged == "": 33 | sentence_merged = sentence 34 | else: 35 | sentence_merged = sentence_merged + f'. {sentence}' 36 | if (i-1) % 10000 == 0: 37 | print(f'newspapers_{i:09}.ogg\t{sentence_merged}') 38 | voices.write(f'newspapers_{i:09}.ogg\t{sentence_merged}.\n') 39 | sentence_merged = "" 40 | i += 1 41 | except Exception as e: 42 | print(sentence) 43 | print(e) -------------------------------------------------------------------------------- /measurements.tsv: -------------------------------------------------------------------------------- 1 | sq mi mil kuadrat 2 | sq ft kaki kuadrat 3 | kbps kilobit per detik 4 | mbps megabit per detik 5 | kcal kilo kalori 6 | ghz gigahertz 7 | khz kilohertz 8 | mhz megahertz 9 | lbs pound 10 | rpm revolution per menit 11 | kwh kilo watt jam 12 | min menit 13 | mph mil per jam 14 | mol mol 15 | gpa giga pascal 16 | km² kilometer kuadrat 17 | km2 kilometer kuadrat 18 | rad radian 19 | kgf kilogram force 20 | mm² millimeter kuadrat 21 | mm2 millimeter kuadrat 22 | cm² centimeter kuadrat 23 | cm2 centimeter kuadrat 24 | dm³ desimeter kubik 25 | dm3 desimeter kubik 26 | amu atomic mass unit 27 | gwh giga watt jam 28 | kpa kilopascal 29 | cwt hundredweight 30 | atm atmosphere 31 | bar bar 32 | km kilometer 33 | cm centimeter 34 | mm millimeter 35 | ha hectare 36 | mi mil 37 | m² meter kuadrat 38 | m2 meter kuadrat 39 | ft kaki 40 | hz hertz 41 | kw kilowatt 42 | hp tenaga kuda 43 | mg milligram 44 | kg kilogram 45 | lb pound 46 | mc mega coulomb 47 | nm nanometer 48 | mA milli ampere 49 | m³ meter kubik 50 | m3 meter kubik 51 | tw tera watt 52 | mv milli volt 53 | mw megawatt 54 | μm mikrometer 55 | " inch 56 | tb terabyte 57 | cc c c 58 | da dalton 59 | db desibel 60 | ps peta detik 61 | oz ounce 62 | hl hecto liter 63 | μg mikrogram 64 | pg petagram 65 | gb gigabyte 66 | kb kilobit 67 | ev electron volt 68 | mb megabyte 69 | kb kilobyte 70 | kl kilo liter 71 | tj tera joule 72 | kv kilo volt 73 | mv mega volt 74 | kn kilonewton 75 | mm megameter 76 | au 
astronomical unit 77 | yd yard 78 | lm lumen 79 | hs hecto detik 80 | ml milliliter 81 | gw gigawatt 82 | ma mega ampere 83 | kt knot 84 | ng nano gram 85 | ns nano detik 86 | ms mega siemens 87 | gl giga liter 88 | μs mikro detik 89 | da desi ampere 90 | pa pascal 91 | ds desi detik 92 | ms milli detik 93 | dm desimeter 94 | mb megabit 95 | mf mega farad 96 | bq becquerel 97 | pb petabit 98 | cd candela 99 | tl tera liter 100 | ms mega detik 101 | mpa megapascal 102 | pb peta byte 103 | gy gray 104 | sv sievert 105 | cc c c 106 | f fahrenheit 107 | c celsius 108 | m meter 109 | % percent 110 | v volt 111 | h jam 112 | g gram 113 | s detik 114 | ω ohm -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /transcription_writer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pathlib import Path" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 9, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "sentence_file = \"/mnt/mldata/data/ASR/news/id-newspapers-small.tsv\"\n", 19 | "news_dir = \"/mnt/mldata/data/ASR/news\"\n", 20 | "sound_dir = \"/mnt/mldata/data/ASR/synthetic-voice/id-newspapers/\"\n", 21 | "transcription_file = f'{sound_dir}/transcription.tsv'\n", 22 | "cv_dir = \"/mnt/mldata/data/ASR/commonvoice\"\n", 23 | "mix_dir = \"/mnt/mldata/data/ASR/synthetic_voice\"" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "def read_transcript_news(voice_dir):\n", 33 | " transcript_file = \"id-newspapers-small.tsv\"\n", 34 | " transcripts = {}\n", 35 | " with open(Path(voice_dir)/transcript_file, \"r\") as sf:\n", 36 | " sf.readline()\n", 37 | " counter = 0\n", 38 | " for line in sf:\n", 39 | " #if counter >= 5:\n", 40 | " # break\n", 41 | " counter += 1\n", 42 | " line = line.strip()\n", 43 | " path, sentence = line.split('\\t')\n", 44 | " transcripts[path] = sentence \n", 45 | " return transcripts\n", 46 | " " 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "def read_transcript_cv(voice_dir):\n", 56 | " transcript_files = ['validated_notest.tsv', 'dev.tsv', 'test.tsv'] \n", 57 | " transcripts = {}\n", 58 | " for file in transcript_files:\n", 59 | " with open(Path(voice_dir)/file, \"r\") as tf:\n", 60 | " tf.readline()\n", 61 | " counter = 0\n", 62 | " for line in tf:\n", 63 | " #if counter >= 5:\n", 64 | " # break\n", 65 | " counter += 1\n", 66 | " line = line.strip()\n", 67 | " row = line.split('\\t')\n", 68 | " transcripts[row[1]] = row[2]\n", 69 | " return transcripts" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "608695" 81 | ] 82 | }, 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "transcripts = {**read_transcript_news(news_dir), **read_transcript_cv(cv_dir)}\n", 90 | "len(transcripts)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 10, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "/mnt/mldata/data/ASR/synthetic-voice/id-newspapers/\n", 103 | "**************************************************\n", 104 | 
"**************************************************\n", 105 | "********************" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "counter = 0\n", 111 | "with open(transcription_file, \"w\") as psf:\n", 112 | " print(sound_dir)\n", 113 | " psf.write(f'path\\tsentence\\n')\n", 114 | " for path in Path(sound_dir).glob('**/*'):\n", 115 | " if path.suffix not in ['.mp3', '.ogg']:\n", 116 | " continue\n", 117 | " counter += 1\n", 118 | " sound_file = path.name\n", 119 | " path = str(path).replace(sound_dir+'/', \"\")\n", 120 | " if counter%1000==0:\n", 121 | " print(\"*\", end=\"\")\n", 122 | " if counter%50000==0:\n", 123 | " print()\n", 124 | " psf.write(f'{path}\\t{transcripts[sound_file]}\\n')\n", 125 | " " 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 78, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "20" 137 | ] 138 | }, 139 | "execution_count": 78, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 69, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "c = {**a, **b}" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 71, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "20" 164 | ] 165 | }, 166 | "execution_count": 71, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "len(c)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 30, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "!cat $path_sentence_file" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Sentence" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 6, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "599999" 200 | ] 201 | }, 202 | "execution_count": 6, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "len(sentences)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 7, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "'Meskipun begitu, ia menyatakan kemungkinan besar wilayah Jakarta tetap akan merasakan gempa dengan kekuatan yang cukup besar seperti yang terjadi saat gempa di dekat Lebak, Banten pada dua puluh tiga Januari lalu.'" 220 | ] 221 | }, 222 | "execution_count": 7, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "sentences['newspapers_000000010.ogg']" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 3", 242 | "language": "python", 243 | "name": "python3" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 3 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython3", 255 | "version": "3.7.7" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 4 260 | } 261 | -------------------------------------------------------------------------------- /text_processor.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | from num2words import num2words 3 | 4 | 5 | class TextProcessor: 6 | thousands = ["ratus", "ribu", "juta", "miliar", "milyar", "triliun"] 7 | months = ["Januari", "Februari", "Maret", "April", 8 | "Mei", "Juni", "Juli", "Agustus", 9 | "September", "Oktober", "November", "Desember"] 10 | measurements_path = "measurements.tsv" 11 | currencies_path = "currency.tsv" 12 | timezones_path = "timezones.tsv" 13 | 14 | def __init__(self): 15 | self.measurements = {} 16 | with open(TextProcessor.measurements_path, "r") as file: 17 | for line in file: 18 | line = line.strip().split("\t") 19 | self.measurements[line[0]] = line[1] 20 | 21 | self.currencies = {} 22 | with open(TextProcessor.currencies_path, "r") as file: 23 | for line in file: 24 | line = line.strip().split("\t") 25 | self.currencies[line[0]] = line[1] 26 | 27 | self.timezones = {} 28 | with open(TextProcessor.timezones_path, "r") as file: 29 | for line in file: 30 | line = line.strip().split("\t") 31 | self.timezones[line[0]] = line[1] 32 | 33 | self.re_thousands = '|'.join([t for t in TextProcessor.thousands]) 34 | self.re_currencies = r'\b' + re.sub(r'\|([^|$£€¥₩]+)', r'|\\b\1', '|'.join([c for c in self.currencies])) 35 | self.re_currencies = re.sub(r'([$£€¥₩])', r'\\\1', self.re_currencies) 36 | self.re_moneys = r'(({}) ?([\d\.\,]+)( ({})?(an)?)?)'.format(self.re_currencies, self.re_thousands) 37 | self.re_measurements = '|'.join([t for t in self.measurements]) 38 | self.re_measurements = r'(\b([\d\.\,]+) ?({})\b)'.format(self.re_measurements) 39 | self.re_timezones = '|'.join([c for c in self.timezones]) 40 | self.re_timezones = r'((\d{1,2})[\.:](\d{1,2}) ' + r'\b({})\b)'.format(self.re_timezones) 41 | self.re_http = r'(https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&//=]*)' 42 | 43 | @staticmethod 44 | def is_integer(number): 45 | try: 46 | int(number) 47 | return True 48 | except ValueError: 49 | return False 50 | 51 | @staticmethod 52 | def is_float(number): 53 | try: 54 | float(number) 55 | return True 56 | except ValueError: 57 | return False 58 | 59 | def normalize(self, text): 60 | found_errors = False 61 | # Remove URL 62 | urls = re.findall(self.re_http, text) 63 | for url in urls: 64 | text = text.replace(url[0], "") 65 | 66 | # Currency 67 | moneys = re.findall(self.re_moneys, text) 68 | for money in moneys: 69 | number = re.sub(',', '.', re.sub(r'\.', '', money[2].strip(" ,."))) 70 | try: 71 | if number == "": 72 | continue 73 | if self.is_integer(number): 74 | number = int(number) 75 | elif self.is_float(number): 76 | number = float(number) 77 | else: 78 | number = re.sub(r'[.,]', "", number) 79 | number = int(number) 80 | number = num2words(number, to='cardinal', lang='id') 81 | text = text.replace(money[0].strip(" ,."), f'{number} {money[3]} {self.currencies[money[1]]}') 82 | except Exception as error: 83 | found_errors = True 84 | print(error) 85 | print(f'Problem with money: <{text}>: {number}') 86 | 87 | # Measurements 88 | units = re.findall(self.re_measurements, text) 89 | for unit in units: 90 | number = re.sub(',', '.', re.sub(r'\.', '', unit[1].strip(" ,."))) 91 | try: 92 | if number == "": 93 | continue 94 | if re.search(r'\.', number): 95 | number = float(number) 96 | else: 97 | number = int(number) 98 | number = num2words(number, to='cardinal', lang='id') 99 | text = text.replace(unit[0].strip(" ,."), f'{number} {self.measurements[unit[2]]}') 100 | except Exception 
as error: 101 | found_errors = True 102 | print(error) 103 | print(f'Problem with measurements: <{text}>: {number}') 104 | 105 | # Date 106 | dates = re.findall(r'(\((\d{1,2})/(\d{1,2})(/(\d+))?\))', text) 107 | for date in dates: 108 | try: 109 | day = num2words(int(date[1]), to='cardinal', lang='id') 110 | month = int(date[2]) - 1 111 | if month >= 12: 112 | month = 0 113 | month = self.months[month] 114 | if date[4] != "": 115 | year = num2words(int(date[4]), to='cardinal', lang='id') 116 | date_string = f'{day} {month} {year}' 117 | else: 118 | date_string = f'{day} {month}' 119 | text = text.replace(date[0], f' {date_string} ') 120 | except Exception as error: 121 | found_errors = True 122 | print(error) 123 | print(f'Problem with dates: <{text}>: {date}') 124 | 125 | # Timezones 126 | timezones = re.findall(self.re_timezones, text) 127 | for timezone in timezones: 128 | try: 129 | hour = num2words(int(timezone[1]), to='cardinal', lang='id') 130 | minute = num2words(int(timezone[2]), to='cardinal', lang='id') 131 | zone = self.timezones[timezone[3]] 132 | if minute == "nol": 133 | time_string = f'{hour} {zone}' 134 | else: 135 | time_string = f'{hour} lewat {minute} menit {zone}' 136 | text = text.replace(timezone[0], f'{time_string}') 137 | except Exception as error: 138 | found_errors = True 139 | print(error) 140 | print(f'Problem with timezones: <{text}>: {timezone}') 141 | 142 | # Any number 143 | re_numbers = [r'([\d.,]+)', r'\d+'] 144 | for re_number in re_numbers: 145 | number_len = 0 146 | for i in re.finditer(re_number, text): 147 | start = i.start() + number_len 148 | end = i.end() + number_len 149 | number = text[start:end] 150 | number = re.sub(',', '.', re.sub(r'\.', '', number.strip(" ,."))) 151 | if number == "": 152 | continue 153 | if self.is_integer(number) or self.is_float(number): 154 | try: 155 | if self.is_integer(number): 156 | number = int(number) 157 | else: 158 | number = float(number) 159 | number = num2words(number, to='cardinal', lang="id") 160 | text = text[:start] + number + text[end:] 161 | number_len += len(number) - (end - start) 162 | except Exception as error: 163 | found_errors = True 164 | print(error) 165 | print(f'Problem with number: <{text}>: {number}') 166 | 167 | text = re.sub(r"\s+", " ", text) 168 | if found_errors: 169 | print(f'>>> {text}') 170 | return text 171 | -------------------------------------------------------------------------------- /XLSR_Wav2Vec2_for_Indonesian_Evaluation-Javanese.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "supreme-command", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "/sysadmin/wirawan/miniconda3/envs/bert2bert/lib/python3.7/site-packages/torchaudio/backend/utils.py:54: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", 14 | " '\"sox\" backend is being deprecated. 
'\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import torch\n", 20 | "import torchaudio\n", 21 | "import datasets\n", 22 | "from datasets import load_dataset, load_metric, Dataset\n", 23 | "from datasets.utils.download_manager import DownloadManager\n", 24 | "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n", 25 | "import re\n", 26 | "from pathlib import Path\n", 27 | "import pandas as pd" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "id": "protecting-submission", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Preprocessing the datasets.\n", 38 | "# We need to read the audio files as arrays\n", 39 | "def speech_file_to_array_fn(batch):\n", 40 | " batch[\"sentence\"] = re.sub(chars_to_ignore_regex, '', batch[\"sentence\"]).lower()\n", 41 | " batch[\"sentence\"] = batch[\"sentence\"].replace('! ', '')\n", 42 | " batch[\"sentence\"] = batch[\"sentence\"].replace(',', '')\n", 43 | " batch[\"sentence\"] = batch[\"sentence\"].replace('é', 'e')\n", 44 | " speech_array, sampling_rate = torchaudio.load(batch[\"path\"])\n", 45 | " batch[\"speech\"] = resampler(speech_array).squeeze().numpy()\n", 46 | " return batch" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "id": "lined-marriage", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# Preprocessing the datasets.\n", 57 | "# We need to read the aduio files as arrays\n", 58 | "def evaluate(batch):\n", 59 | " inputs = processor(batch[\"speech\"], sampling_rate=16_000, return_tensors=\"pt\", padding=True)\n", 60 | "\n", 61 | " with torch.no_grad():\n", 62 | " logits = model(inputs.input_values.to(\"cuda\"), attention_mask=inputs.attention_mask.to(\"cuda\")).logits\n", 63 | "\n", 64 | " pred_ids = torch.argmax(logits, dim=-1)\n", 65 | " batch[\"pred_strings\"] = processor.batch_decode(pred_ids)\n", 66 | " return batch" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "binary-network", 72 | "metadata": {}, 73 | "source": [ 74 | "### Load the dataset" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "id": "arabic-cherry", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "def load_dataset_javanese():\n", 85 | " urls = [\n", 86 | " \"https://www.openslr.org/resources/41/jv_id_female.zip\",\n", 87 | " \"https://www.openslr.org/resources/41/jv_id_male.zip\"\n", 88 | " ]\n", 89 | " dm = DownloadManager()\n", 90 | " download_dirs = dm.download_and_extract(urls)\n", 91 | " data_dirs = [ \n", 92 | " Path(download_dirs[0])/\"jv_id_female/wavs\",\n", 93 | " Path(download_dirs[1])/\"jv_id_male/wavs\",\n", 94 | " ]\n", 95 | " filenames = [ \n", 96 | " Path(download_dirs[0])/\"jv_id_female/line_index.tsv\",\n", 97 | " Path(download_dirs[1])/\"jv_id_male/line_index.tsv\",\n", 98 | " ]\n", 99 | " \n", 100 | " dfs = []\n", 101 | " dfs.append(pd.read_csv(filenames[0], sep='\\t', names=[\"path\", \"sentence\"]))\n", 102 | " dfs.append(pd.read_csv(filenames[1], sep='\\t', names=[\"path\", \"client_id\", \"sentence\"]))\n", 103 | " dfs[1] = dfs[1].drop([\"client_id\"], axis=1)\n", 104 | " \n", 105 | " for i, dir in enumerate(data_dirs):\n", 106 | " dfs[i][\"path\"] = dfs[i].apply(lambda row: str(data_dirs[i]) + \"/\" + row + \".wav\", axis=1)\n", 107 | " df = pd.concat(dfs)\n", 108 | " # df = df.sample(frac=1, random_state=1).reset_index(drop=True)\n", 109 | " dataset = Dataset.from_pandas(df)\n", 110 | " dataset = dataset.remove_columns('__index_level_0__')\n", 111 | " \n", 112 | " return 
dataset.train_test_split(test_size=0.1, seed=1)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "id": "medieval-arrival", 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "DatasetDict({\n", 125 | " train: Dataset({\n", 126 | " features: ['path', 'sentence'],\n", 127 | " num_rows: 5239\n", 128 | " })\n", 129 | " test: Dataset({\n", 130 | " features: ['path', 'sentence'],\n", 131 | " num_rows: 583\n", 132 | " })\n", 133 | "})" 134 | ] 135 | }, 136 | "execution_count": 5, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "dataset = load_dataset_javanese()\n", 143 | "# We can also just load it from the disk created during training\n", 144 | "# dataset = datasets.load_from_disk(\"dataset_javanese\")\n", 145 | "dataset" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 6, 151 | "id": "skilled-drove", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "test_dataset = dataset['test']\n", 156 | "wer = load_metric(\"wer\")" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 7, 162 | "id": "hindu-flour", 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "%%capture\n", 175 | "\n", 176 | "model_name = \"cahya/wav2vec2-large-xlsr-javanese\"\n", 177 | "\n", 178 | "processor = Wav2Vec2Processor.from_pretrained(model_name)\n", 179 | "model = Wav2Vec2ForCTC.from_pretrained(model_name) \n", 180 | "model.to(\"cuda\")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 8, 186 | "id": "requested-alexander", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "application/vnd.jupyter.widget-view+json": { 192 | "model_id": "1283cb4f6b9e4ccdbbc428eb82bb7234", 193 | "version_major": 2, 194 | "version_minor": 0 195 | }, 196 | "text/plain": [ 197 | "HBox(children=(FloatProgress(value=0.0, max=583.0), HTML(value='')))" 198 | ] 199 | }, 200 | "metadata": {}, 201 | "output_type": "display_data" 202 | }, 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\\"\\“\\%\\‘\\'\\”_\\�]'\n", 213 | "resampler = torchaudio.transforms.Resample(48_000, 16_000)\n", 214 | "test_dataset = test_dataset.map(speech_file_to_array_fn)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 9, 220 | "id": "endless-circuit", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "application/vnd.jupyter.widget-view+json": { 226 | "model_id": "abd9d20cfe854eae91f3611e53ad521e", 227 | "version_major": 2, 228 | "version_minor": 0 229 | }, 230 | "text/plain": [ 231 | "HBox(children=(FloatProgress(value=0.0, max=73.0), HTML(value='')))" 232 | ] 233 | }, 234 | "metadata": {}, 235 | "output_type": "display_data" 236 | }, 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "\n", 242 | "WER: 17.609414\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "result = test_dataset.map(evaluate, batched=True, batch_size=8)\n", 248 | "\n", 249 | "print(\"WER: {:2f}\".format(100 * wer.compute(predictions=result[\"pred_strings\"], references=result[\"sentence\"])))" 250 | ] 
251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "id": "hollywood-commerce", 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [] 259 | } 260 | ], 261 | "metadata": { 262 | "kernelspec": { 263 | "display_name": "Python 3", 264 | "language": "python", 265 | "name": "python3" 266 | }, 267 | "language_info": { 268 | "codemirror_mode": { 269 | "name": "ipython", 270 | "version": 3 271 | }, 272 | "file_extension": ".py", 273 | "mimetype": "text/x-python", 274 | "name": "python", 275 | "nbconvert_exporter": "python", 276 | "pygments_lexer": "ipython3", 277 | "version": "3.7.9" 278 | } 279 | }, 280 | "nbformat": 4, 281 | "nbformat_minor": 5 282 | } 283 | -------------------------------------------------------------------------------- /XLSR_Wav2Vec2_for_Indonesian_Evaluation-Sundanese.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "hungry-pickup", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "/sysadmin/wirawan/miniconda3/envs/bert2bert/lib/python3.7/site-packages/torchaudio/backend/utils.py:54: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", 14 | " '\"sox\" backend is being deprecated. '\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import torch\n", 20 | "import torchaudio\n", 21 | "import datasets\n", 22 | "from datasets import load_dataset, load_metric, Dataset\n", 23 | "from datasets.utils.download_manager import DownloadManager\n", 24 | "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n", 25 | "import re\n", 26 | "from pathlib import Path\n", 27 | "import pandas as pd" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "id": "opponent-animal", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Preprocessing the datasets.\n", 38 | "# We need to read the audio files as arrays\n", 39 | "def speech_file_to_array_fn(batch):\n", 40 | " batch[\"sentence\"] = re.sub(chars_to_ignore_regex, '', batch[\"sentence\"]).lower()\n", 41 | " batch[\"sentence\"] = batch[\"sentence\"].replace('! 
', '')\n", 42 | " batch[\"sentence\"] = batch[\"sentence\"].replace(',', '')\n", 43 | " batch[\"sentence\"] = batch[\"sentence\"].replace('é', 'e')\n", 44 | " speech_array, sampling_rate = torchaudio.load(batch[\"path\"])\n", 45 | " batch[\"speech\"] = resampler(speech_array).squeeze().numpy()\n", 46 | " return batch" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "id": "incoming-scheduling", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# Preprocessing the datasets.\n", 57 | "# We need to read the aduio files as arrays\n", 58 | "def evaluate(batch):\n", 59 | " inputs = processor(batch[\"speech\"], sampling_rate=16_000, return_tensors=\"pt\", padding=True)\n", 60 | "\n", 61 | " with torch.no_grad():\n", 62 | " logits = model(inputs.input_values.to(\"cuda\"), attention_mask=inputs.attention_mask.to(\"cuda\")).logits\n", 63 | "\n", 64 | " pred_ids = torch.argmax(logits, dim=-1)\n", 65 | " batch[\"pred_strings\"] = processor.batch_decode(pred_ids)\n", 66 | " return batch" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "certified-moisture", 72 | "metadata": {}, 73 | "source": [ 74 | "### Load the dataset" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "latin-queue", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "def load_dataset_sundanese(): \n", 85 | " urls = [\n", 86 | " \"https://www.openslr.org/resources/44/su_id_female.zip\",\n", 87 | " \"https://www.openslr.org/resources/44/su_id_male.zip\"\n", 88 | " ]\n", 89 | " dm = DownloadManager()\n", 90 | " download_dirs = dm.download_and_extract(urls)\n", 91 | " data_dirs = [ \n", 92 | " Path(download_dirs[0])/\"su_id_female/wavs\",\n", 93 | " Path(download_dirs[1])/\"su_id_male/wavs\",\n", 94 | " ]\n", 95 | " filenames = [ \n", 96 | " Path(download_dirs[0])/\"su_id_female/line_index.tsv\",\n", 97 | " Path(download_dirs[1])/\"su_id_male/line_index.tsv\",\n", 98 | " ]\n", 99 | " \n", 100 | " dfs = [] \n", 101 | " dfs.append(pd.read_csv(filenames[0], sep='\\t4?\\t', names=[\"path\", \"sentence\"]))\n", 102 | " dfs.append(pd.read_csv(filenames[1], sep='\\t\\t', names=[\"path\", \"sentence\"]))\n", 103 | " \n", 104 | " for i, dir in enumerate(data_dirs):\n", 105 | " dfs[i][\"path\"] = dfs[i].apply(lambda row: str(data_dirs[i]) + \"/\" + row + \".wav\", axis=1)\n", 106 | " df = pd.concat(dfs)\n", 107 | " # df = df.sample(frac=1, random_state=1).reset_index(drop=True)\n", 108 | " dataset = Dataset.from_pandas(df)\n", 109 | " dataset = dataset.remove_columns('__index_level_0__')\n", 110 | " \n", 111 | " return dataset.train_test_split(test_size=0.1, seed=1)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "id": "blessed-document", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "/sysadmin/wirawan/miniconda3/envs/bert2bert/lib/python3.7/site-packages/ipykernel_launcher.py:19: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 125 | "/sysadmin/wirawan/miniconda3/envs/bert2bert/lib/python3.7/site-packages/ipykernel_launcher.py:20: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this 
warning by specifying engine='python'.\n" 126 | ] 127 | }, 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "DatasetDict({\n", 132 | " train: Dataset({\n", 133 | " features: ['path', 'sentence'],\n", 134 | " num_rows: 3791\n", 135 | " })\n", 136 | " test: Dataset({\n", 137 | " features: ['path', 'sentence'],\n", 138 | " num_rows: 422\n", 139 | " })\n", 140 | "})" 141 | ] 142 | }, 143 | "execution_count": 5, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "dataset = load_dataset_sundanese()\n", 150 | "# We can also just load it from the disk created during training\n", 151 | "# dataset = datasets.load_from_disk(\"dataset_sundanese\")\n", 152 | "dataset" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 6, 158 | "id": "vocational-breath", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "test_dataset = dataset['test']\n", 163 | "wer = load_metric(\"wer\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 7, 169 | "id": "occasional-palace", 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stderr", 174 | "output_type": "stream", 175 | "text": [ 176 | "Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "%%capture\n", 182 | "\n", 183 | "model_name = \"cahya/wav2vec2-large-xlsr-sundanese\"\n", 184 | "\n", 185 | "processor = Wav2Vec2Processor.from_pretrained(model_name)\n", 186 | "model = Wav2Vec2ForCTC.from_pretrained(model_name) \n", 187 | "model.to(\"cuda\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 8, 193 | "id": "stretch-programmer", 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "application/vnd.jupyter.widget-view+json": { 199 | "model_id": "5c0ca027431548599d62b10b5f1af2c4", 200 | "version_major": 2, 201 | "version_minor": 0 202 | }, 203 | "text/plain": [ 204 | "HBox(children=(FloatProgress(value=0.0, max=422.0), HTML(value='')))" 205 | ] 206 | }, 207 | "metadata": {}, 208 | "output_type": "display_data" 209 | }, 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "#chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\\"\\“\\%\\‘\\'\\”]'\n", 220 | "chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\\"\\“\\%\\‘\\'\\”_\\�]'\n", 221 | "resampler = torchaudio.transforms.Resample(48_000, 16_000)\n", 222 | "test_dataset = test_dataset.map(speech_file_to_array_fn)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 9, 228 | "id": "three-aurora", 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "application/vnd.jupyter.widget-view+json": { 234 | "model_id": "2599826d20b64969a7ecf7382ac37a1a", 235 | "version_major": 2, 236 | "version_minor": 0 237 | }, 238 | "text/plain": [ 239 | "HBox(children=(FloatProgress(value=0.0, max=53.0), HTML(value='')))" 240 | ] 241 | }, 242 | "metadata": {}, 243 | "output_type": "display_data" 244 | }, 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "\n", 250 | "WER: 6.190727\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "result = test_dataset.map(evaluate, batched=True, batch_size=8)\n", 256 | "\n", 257 | "print(\"WER: {:2f}\".format(100 * wer.compute(predictions=result[\"pred_strings\"], references=result[\"sentence\"])))" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | 
"execution_count": null, 263 | "id": "first-schedule", 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.7.9" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 5 290 | } 291 | -------------------------------------------------------------------------------- /XLSR_Wav2Vec2_for_Indonesian_Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "obvious-dominican", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "/sysadmin/wirawan/miniconda3/envs/bert2bert/lib/python3.7/site-packages/torchaudio/backend/utils.py:54: UserWarning: \"sox\" backend is being deprecated. The default backend will be changed to \"sox_io\" backend in 0.8.0 and \"sox\" backend will be removed in 0.9.0. Please migrate to \"sox_io\" backend. Please refer to https://github.com/pytorch/audio/issues/903 for the detail.\n", 14 | " '\"sox\" backend is being deprecated. '\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import torch\n", 20 | "import torchaudio\n", 21 | "import datasets\n", 22 | "from datasets import load_dataset, load_metric\n", 23 | "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n", 24 | "import re\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "id": "assumed-charge", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Preprocessing the datasets.\n", 35 | "# We need to read the audio files as arrays\n", 36 | "def speech_file_to_array_fn(batch):\n", 37 | " batch[\"sentence\"] = re.sub(chars_to_ignore_regex, '', batch[\"sentence\"]).lower()\n", 38 | " batch[\"sentence\"] = batch[\"sentence\"].replace('! 
', '')\n", 39 | " batch[\"sentence\"] = batch[\"sentence\"].replace(',', '')\n", 40 | " batch[\"sentence\"] = batch[\"sentence\"].replace('é', 'e') \n", 41 | " speech_array, sampling_rate = torchaudio.load(batch[\"path\"])\n", 42 | " resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)\n", 43 | " batch[\"speech\"] = resampler(speech_array).squeeze().numpy()\n", 44 | " return batch" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "id": "contemporary-richardson", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Preprocessing the datasets.\n", 55 | "# We need to read the aduio files as arrays\n", 56 | "def evaluate(batch):\n", 57 | " inputs = processor(batch[\"speech\"], sampling_rate=16_000, return_tensors=\"pt\", padding=True)\n", 58 | "\n", 59 | " with torch.no_grad():\n", 60 | " logits = model(inputs.input_values.to(\"cuda\"), attention_mask=inputs.attention_mask.to(\"cuda\")).logits\n", 61 | "\n", 62 | " pred_ids = torch.argmax(logits, dim=-1)\n", 63 | " batch[\"pred_strings\"] = processor.batch_decode(pred_ids)\n", 64 | " return batch" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "id": "directed-quebec", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "#dataset = datasets.load_from_disk(\"sundanese_train_dataset\")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "id": "banned-jonathan", 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stderr", 85 | "output_type": "stream", 86 | "text": [ 87 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 88 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 89 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "test_dataset = load_dataset(\"common_voice\", \"id\", split=\"test\")\n", 95 | "wer = load_metric(\"wer\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "id": "strange-federal", 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stderr", 106 | "output_type": "stream", 107 | "text": [ 108 | "Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "%%capture\n", 114 | "\n", 115 | "model_name = \"cahya/wav2vec2-large-xlsr-indonesian-mix\"\n", 116 | "#model_name = \"./wav2vec2-large-xlsr-indonesian-articial-CV\"\n", 117 | "#model_name = \"cahya/wav2vec2-large-xlsr-indonesian\"\n", 118 | "#model_name = \"ayameRushia/wav2vec2-large-xlsr-indonesia-demo\"\n", 119 | "#model_name = \"Galuh/wav2vec2-large-xlsr-indonesian-demo\"\n", 120 | "\n", 121 | "processor = Wav2Vec2Processor.from_pretrained(model_name)\n", 122 | "model = Wav2Vec2ForCTC.from_pretrained(model_name) \n", 123 | "model.to(\"cuda\")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "id": "appropriate-balance", 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stderr", 134 | "output_type": "stream", 135 | "text": [ 136 | "Loading cached processed dataset at 
/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f/cache-6273e8490b649de6.arrow\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "chars_to_ignore_regex = '[\\,\\?\\.\\!\\-\\;\\:\\\"\\“\\%\\‘\\'\\”]'\n", 142 | "test_dataset = test_dataset.map(speech_file_to_array_fn)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 8, 148 | "id": "honey-grammar", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "application/vnd.jupyter.widget-view+json": { 154 | "model_id": "2858cf1691f64a589fca0ee97f2eb39b", 155 | "version_major": 2, 156 | "version_minor": 0 157 | }, 158 | "text/plain": [ 159 | "HBox(children=(FloatProgress(value=0.0, max=231.0), HTML(value='')))" 160 | ] 161 | }, 162 | "metadata": {}, 163 | "output_type": "display_data" 164 | }, 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "\n", 170 | "WER: 19.373485\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "result = test_dataset.map(evaluate, batched=True, batch_size=8)\n", 176 | "\n", 177 | "print(\"WER: {:2f}\".format(100 * wer.compute(predictions=result[\"pred_strings\"], references=result[\"sentence\"])))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 9, 183 | "id": "unlike-dispatch", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "application/vnd.jupyter.widget-view+json": { 189 | "model_id": "a648f87eb0ef498e961770342c848363", 190 | "version_major": 2, 191 | "version_minor": 0 192 | }, 193 | "text/plain": [ 194 | "HBox(children=(FloatProgress(value=0.0, max=29.0), HTML(value='')))" 195 | ] 196 | }, 197 | "metadata": {}, 198 | "output_type": "display_data" 199 | }, 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "\n", 205 | "WER: 19.671825\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "result = test_dataset.map(evaluate, batched=True, batch_size=64)\n", 211 | "\n", 212 | "print(\"WER: {:2f}\".format(100 * wer.compute(predictions=result[\"pred_strings\"], references=result[\"sentence\"])))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "id": "partial-smith", 218 | "metadata": {}, 219 | "source": [ 220 | "### Check the sampling rate of the sound files" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 28, 226 | "id": "nearby-national", 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stderr", 231 | "output_type": "stream", 232 | "text": [ 233 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 234 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 235 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n", 236 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 237 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 238 | "Reusing dataset common_voice 
(/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "train_dataset = load_dataset(\"common_voice\", \"id\", split=\"train+validation\")\n", 244 | "test_dataset = load_dataset(\"common_voice\", \"id\", split=\"test\")" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 17, 250 | "id": "according-concept", 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "Dataset({\n", 257 | " features: ['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n", 258 | " num_rows: 1844\n", 259 | "})" 260 | ] 261 | }, 262 | "execution_count": 17, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "test_dataset" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 18, 274 | "id": "verbal-keeping", 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "{'accent': '',\n", 281 | " 'age': '',\n", 282 | " 'client_id': '057bf45c0c338db897f5717f744bcac8a2ac2eee990a4294ce406f1f79c3326aff181d59e49e7e82813e532ed160c027a80eb4c7e36d8899d3fc96e48d102de5',\n", 283 | " 'down_votes': 0,\n", 284 | " 'gender': '',\n", 285 | " 'locale': 'id',\n", 286 | " 'path': '/root/.cache/huggingface/datasets/downloads/extracted/fd8a16a97efd77adba3c26c54d0cfae6c9d9494c1017f8070f3f79db72c4b57c/cv-corpus-6.1-2020-12-11/id/clips/common_voice_id_22888800.mp3',\n", 287 | " 'segment': \"''\",\n", 288 | " 'sentence': 'Minggu depan kakak perempuan saya menikah.',\n", 289 | " 'up_votes': 2}" 290 | ] 291 | }, 292 | "execution_count": 18, 293 | "metadata": {}, 294 | "output_type": "execute_result" 295 | } 296 | ], 297 | "source": [ 298 | "test_dataset[0]" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 35, 304 | "id": "collaborative-tracker", 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "sampling_rates = {}\n", 309 | "for i, data in enumerate(test_dataset): \n", 310 | " speech_array, sampling_rate = torchaudio.load(data[\"path\"])\n", 311 | " if sampling_rate not in sampling_rates:\n", 312 | " sampling_rates[sampling_rate] = 1\n", 313 | " else:\n", 314 | " sampling_rates[sampling_rate] += 1" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 36, 320 | "id": "civil-deployment", 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "Rate 48000 1682 91.21475054229936\n", 328 | "Rate 32000 115 6.236442516268981\n", 329 | "Rate 44100 44 2.386117136659436\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "for rate in sampling_rates:\n", 335 | " print(\"Rate\", rate, sampling_rates[rate], sampling_rates[rate]/len(test_dataset)*100.0)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 33, 341 | "id": "impressive-storage", 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "sampling_rates = {}\n", 346 | "for i, data in enumerate(train_dataset): \n", 347 | " speech_array, sampling_rate = torchaudio.load(data[\"path\"])\n", 348 | " if sampling_rate not in sampling_rates:\n", 349 | " sampling_rates[sampling_rate] = 1\n", 350 | " else:\n", 351 | " sampling_rates[sampling_rate] += 1" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 34, 357 | "id": "bored-softball", 358 | "metadata": {}, 359 | 
"outputs": [ 360 | { 361 | "name": "stdout", 362 | "output_type": "stream", 363 | "text": [ 364 | "Rate 48000 3965 100.0\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "for rate in sampling_rates:\n", 370 | " print(\"Rate\", rate, sampling_rates[rate], sampling_rates[rate]/len(train_dataset)*100.0)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 31, 376 | "id": "italian-idaho", 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "data": { 381 | "text/plain": [ 382 | "3965" 383 | ] 384 | }, 385 | "execution_count": 31, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "len(train_dataset)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 32, 397 | "id": "large-fountain", 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "data": { 402 | "text/plain": [ 403 | "{48000: 3964}" 404 | ] 405 | }, 406 | "execution_count": 32, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "sampling_rates" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "id": "unavailable-madness", 418 | "metadata": {}, 419 | "source": [ 420 | "### Turkish" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 37, 426 | "id": "willing-tokyo", 427 | "metadata": {}, 428 | "outputs": [ 429 | { 430 | "name": "stderr", 431 | "output_type": "stream", 432 | "text": [ 433 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 434 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n" 435 | ] 436 | }, 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "Downloading and preparing dataset common_voice/tr (download: 592.09 MiB, generated: 2.89 MiB, post-processed: Unknown size, total: 594.98 MiB) to /root/.cache/huggingface/datasets/common_voice/tr/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f...\n" 442 | ] 443 | }, 444 | { 445 | "data": { 446 | "application/vnd.jupyter.widget-view+json": { 447 | "model_id": "c9c50ca6fa5946ca833815652d5c6d4d", 448 | "version_major": 2, 449 | "version_minor": 0 450 | }, 451 | "text/plain": [ 452 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=620848700.0, style=ProgressStyle(descri…" 453 | ] 454 | }, 455 | "metadata": {}, 456 | "output_type": "display_data" 457 | }, 458 | { 459 | "name": "stdout", 460 | "output_type": "stream", 461 | "text": [ 462 | "\n" 463 | ] 464 | }, 465 | { 466 | "data": { 467 | "application/vnd.jupyter.widget-view+json": { 468 | "model_id": "", 469 | "version_major": 2, 470 | "version_minor": 0 471 | }, 472 | "text/plain": [ 473 | "HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…" 474 | ] 475 | }, 476 | "metadata": {}, 477 | "output_type": "display_data" 478 | }, 479 | { 480 | "data": { 481 | "application/vnd.jupyter.widget-view+json": { 482 | "model_id": "", 483 | "version_major": 2, 484 | "version_minor": 0 485 | }, 486 | "text/plain": [ 487 | "HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…" 488 | ] 489 | }, 490 | "metadata": {}, 491 | "output_type": "display_data" 492 | }, 493 | { 494 | "data": { 495 | 
"application/vnd.jupyter.widget-view+json": { 496 | "model_id": "", 497 | "version_major": 2, 498 | "version_minor": 0 499 | }, 500 | "text/plain": [ 501 | "HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…" 502 | ] 503 | }, 504 | "metadata": {}, 505 | "output_type": "display_data" 506 | }, 507 | { 508 | "data": { 509 | "application/vnd.jupyter.widget-view+json": { 510 | "model_id": "", 511 | "version_major": 2, 512 | "version_minor": 0 513 | }, 514 | "text/plain": [ 515 | "HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…" 516 | ] 517 | }, 518 | "metadata": {}, 519 | "output_type": "display_data" 520 | }, 521 | { 522 | "data": { 523 | "application/vnd.jupyter.widget-view+json": { 524 | "model_id": "", 525 | "version_major": 2, 526 | "version_minor": 0 527 | }, 528 | "text/plain": [ 529 | "HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…" 530 | ] 531 | }, 532 | "metadata": {}, 533 | "output_type": "display_data" 534 | }, 535 | { 536 | "name": "stdout", 537 | "output_type": "stream", 538 | "text": [ 539 | "Dataset common_voice downloaded and prepared to /root/.cache/huggingface/datasets/common_voice/tr/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f. Subsequent calls will reuse this data.\n" 540 | ] 541 | }, 542 | { 543 | "name": "stderr", 544 | "output_type": "stream", 545 | "text": [ 546 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 547 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 548 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/tr/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n" 549 | ] 550 | } 551 | ], 552 | "source": [ 553 | "train_dataset = load_dataset(\"common_voice\", \"tr\", split=\"train+validation\")\n", 554 | "test_dataset = load_dataset(\"common_voice\", \"tr\", split=\"test\")" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 38, 560 | "id": "numeric-chamber", 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "sampling_rates = {}\n", 565 | "for i, data in enumerate(test_dataset): \n", 566 | " speech_array, sampling_rate = torchaudio.load(data[\"path\"])\n", 567 | " if sampling_rate not in sampling_rates:\n", 568 | " sampling_rates[sampling_rate] = 1\n", 569 | " else:\n", 570 | " sampling_rates[sampling_rate] += 1" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 39, 576 | "id": "duplicate-imagination", 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "name": "stdout", 581 | "output_type": "stream", 582 | "text": [ 583 | "Rate 48000 1643 47.23979298447383\n", 584 | "Rate 44100 4 0.11500862564692352\n" 585 | ] 586 | } 587 | ], 588 | "source": [ 589 | "for rate in sampling_rates:\n", 590 | " print(\"Rate\", rate, sampling_rates[rate], sampling_rates[rate]/len(train_dataset)*100.0)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 40, 596 | "id": "detailed-sullivan", 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "sampling_rates = {}\n", 601 | "for i, data in enumerate(train_dataset): \n", 602 | " speech_array, sampling_rate = 
torchaudio.load(data[\"path\"])\n", 603 | " if sampling_rate not in sampling_rates:\n", 604 | " sampling_rates[sampling_rate] = 1\n", 605 | " else:\n", 606 | " sampling_rates[sampling_rate] += 1" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 41, 612 | "id": "boxed-specific", 613 | "metadata": {}, 614 | "outputs": [ 615 | { 616 | "name": "stdout", 617 | "output_type": "stream", 618 | "text": [ 619 | "Rate 48000 3462 99.53996549741231\n", 620 | "Rate 44100 16 0.46003450258769407\n" 621 | ] 622 | } 623 | ], 624 | "source": [ 625 | "for rate in sampling_rates:\n", 626 | " print(\"Rate\", rate, sampling_rates[rate], sampling_rates[rate]/len(train_dataset)*100.0)" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "id": "systematic-leader", 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [ 636 | " " 637 | ] 638 | } 639 | ], 640 | "metadata": { 641 | "kernelspec": { 642 | "display_name": "Python 3", 643 | "language": "python", 644 | "name": "python3" 645 | }, 646 | "language_info": { 647 | "codemirror_mode": { 648 | "name": "ipython", 649 | "version": 3 650 | }, 651 | "file_extension": ".py", 652 | "mimetype": "text/x-python", 653 | "name": "python", 654 | "nbconvert_exporter": "python", 655 | "pygments_lexer": "ipython3", 656 | "version": "3.7.9" 657 | } 658 | }, 659 | "nbformat": 4, 660 | "nbformat_minor": 5 661 | } 662 | -------------------------------------------------------------------------------- /XLSR_Wav2Vec2_for_Indonesian_Usage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "continent-compilation", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import torch\n", 11 | "import torchaudio\n", 12 | "from datasets import load_dataset\n", 13 | "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "id": "freelance-ozone", 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "model_name = \"indonesian-nlp/wav2vec2-large-xlsr-indonesian\"\n", 32 | "#model_name = \"ayameRushia/wav2vec2-large-xlsr-indonesia-demo\"\n", 33 | "#model_name = \"Galuh/wav2vec2-large-xlsr-indonesian-demo\"\n", 34 | "\n", 35 | "processor = Wav2Vec2Processor.from_pretrained(model_name)\n", 36 | "model = Wav2Vec2ForCTC.from_pretrained(model_name) \n", 37 | "# model.to(\"cuda\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 100, 43 | "id": "brilliant-serbia", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Preprocessing the datasets.\n", 48 | "# We need to read the aduio files as arrays\n", 49 | "def speech_file_to_array_fn(batch):\n", 50 | " speech_array, sampling_rate = torchaudio.load(batch[\"path\"])\n", 51 | " resampler = torchaudio.transforms.Resample(sampling_rate/1.0, 16_000)\n", 52 | " volume = torchaudio.transforms.Vol(1.0)\n", 53 | " batch[\"speech\"] = resampler(volume(speech_array)).squeeze().numpy()\n", 54 | " return batch" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 13, 60 | "id": "czech-endorsement", 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stderr", 65 | "output_type": "stream", 66 | 
"text": [ 67 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 68 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 69 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n", 70 | "Loading cached processed dataset at /root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f/cache-e1af9a99f71daf8f.arrow\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "test_dataset = load_dataset(\"common_voice\", \"id\", split=\"test[:2%]\")\n", 76 | "test_dataset = test_dataset.map(speech_file_to_array_fn)\n", 77 | "inputs = processor(test_dataset[\"speech\"][:5], sampling_rate=16_000, return_tensors=\"pt\", padding=True)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 17, 83 | "id": "loving-medium", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "tensor([[ 0.0011, 0.0011, 0.0011, ..., 0.0000, 0.0000, 0.0000],\n", 90 | " [-0.0002, -0.0002, -0.0002, ..., -0.0093, -0.0029, -0.0019],\n", 91 | " [-0.0014, -0.0014, -0.0014, ..., 0.0000, 0.0000, 0.0000],\n", 92 | " [ 0.0013, 0.0013, 0.0013, ..., 0.0000, 0.0000, 0.0000],\n", 93 | " [-0.0007, -0.0007, -0.0007, ..., 0.0000, 0.0000, 0.0000]])" 94 | ] 95 | }, 96 | "execution_count": 17, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "inputs['input_values']" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 5, 108 | "id": "fourth-prompt", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "with torch.no_grad():\n", 113 | " logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits\n", 114 | "\n", 115 | "predicted_ids = torch.argmax(logits, dim=-1)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "id": "alternative-launch", 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "Prediction: ['minggu depan kakak perempuan saya menikah', 'berbagai bahasa daerah dan bahasa asing menjadu bahasa serapaan klim dan kemudian menjadi bahasa indonesia', 'apa yang bisa saya berikan kepadamu', 'inilah dunia kecil', 'nol']\n", 129 | "Reference: ['Minggu depan kakak perempuan saya menikah.', 'Berbagai bahasa daerah dan bahasa asing menjadi bahasa serapan dan kemudian menjadi bahasa Indonesia.', 'apa yang bisa saya berikan kepadamu?', 'Inilah dunia kecil.', 'nol']\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "print(\"Prediction:\", processor.batch_decode(predicted_ids))\n", 135 | "print(\"Reference:\", test_dataset[\"sentence\"][:5])" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 55, 141 | "id": "worst-primary", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stderr", 146 | "output_type": "stream", 147 | "text": [ 148 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 149 | "The file was picked from the master branch on github instead at 
https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 150 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "test_dataset = load_dataset(\"common_voice\", \"id\", split=\"test\")" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 56, 161 | "id": "blind-release", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "'/root/.cache/huggingface/datasets/downloads/extracted/fd8a16a97efd77adba3c26c54d0cfae6c9d9494c1017f8070f3f79db72c4b57c/cv-corpus-6.1-2020-12-11/id/clips/common_voice_id_22888800.mp3'" 168 | ] 169 | }, 170 | "execution_count": 56, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "test_dataset[0][\"path\"]" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 64, 182 | "id": "worse-lincoln", 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "import matplotlib.pyplot as plt" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 61, 192 | "id": "alpha-preliminary", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "def show_wave(filename):\n", 197 | " index = songs[filename]\n", 198 | " print(test_dataset[index][\"path\"], \"\\n\", test_dataset[index][\"sentence\"], )" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 62, 204 | "id": "mechanical-reconstruction", 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "/root/.cache/huggingface/datasets/downloads/extracted/fd8a16a97efd77adba3c26c54d0cfae6c9d9494c1017f8070f3f79db72c4b57c/cv-corpus-6.1-2020-12-11/id/clips/common_voice_id_19773609.mp3 \n", 212 | " Saya pikir itu ide yang bagus.\n" 213 | ] 214 | } 215 | ], 216 | "source": [] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 77, 221 | "id": "mineral-christianity", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "# We need to read the aduio files as arrays\n", 226 | "def show_wave(filename, gain=1.0):\n", 227 | " index = songs[filename]\n", 228 | " speech_array, sampling_rate = torchaudio.load(test_dataset[index][\"path\"])\n", 229 | " resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)\n", 230 | " volume = torchaudio.transforms.Vol(gain)\n", 231 | " audio = resampler(volume(speech_array)).squeeze().numpy()\n", 232 | " plt.figure()\n", 233 | " plt.ylim([-1.0,1.0])\n", 234 | " plt.plot(audio)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 78, 240 | "id": "incorporated-greenhouse", 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAY0AAAD8CAYAAACLrvgBAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAd50lEQVR4nO3de5xVdb3/8dd7ZhhAQBguwsgdRUHFECcTTS3FvJ3UzAqzRLPoonU8PfKIP885lidPVufx0C6elGOUlkcttaQ0yWtqeWFQ5KJyFWUQYbgqcp2Zz++PvcA941wW7L3nAu/n47Efs9b3+11rfb/MYt57XfbaigjMzMzSKGrrDpiZWcfh0DAzs9QcGmZmlppDw8zMUnNomJlZag4NMzNLLS+hIWmapNWS5jVRL0k/lbRY0hxJ47LqJklalLwm5aM/ZmZWGPk60vg1cHoz9WcAI5PXZOAXAJJ6A9cCHwGOAa6VVJanPpmZWZ7lJTQi4ilgXTNNzgHuiIzngF6SyoHTgEciYl1ErAceofnwMTOzNlTSStsZCCzPmq9Kypoq/wBJk8kcpdCtW7ejR40aVZiempntpWbNmrUmIvrlso7WCo2cRcRUYCpARUVFVFZWtnGPzMw6Fklv5LqO1rp7agUwOGt+UFLWVLmZmbVDrRUa04GLkruojgU2RsRKYAbwCUllyQXwTyRlZmbWDuXl9JSku4CPAX0lVZG5I6oTQETcAjwEnAksBjYDlyR16yT9JzAzWdV1EdHcBXUzM2tDeQmNiLighfoALmuibhowLR/9MDOzwvInws3MLDWHhpmZpebQMDOz1BwaZmaWmkPDzMxSc2iYmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpebQMDOz1BwaZmaWmkPDzMxSc2iYmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpZaX0JB0uqQFkhZLmtJI/Y2SZievhZI2ZNXVZtVNz0d/zMysMHL+jnBJxcDNwKlAFTBT0vSIeGVnm4j4l6z23wSOylrFlogYm2s/zMys8PJxpHEMsDgilkbEduBu4Jxm2l8A3JWH7ZqZWSvLR2gMBJZnzVclZR8gaSgwHHg8q7iLpEpJz0k6Nw/9MTOzAsn59NRumgjcGxG1WWVDI2KFpBHA45LmRsSShgtKmgxMBhgyZEjr9NbMzOrJx5HGCmBw1vygpKwxE2lwaioiViQ/lwJPUv96R3a7qRFREREV/fr1y7XPZma2B/IRGjOBkZKGSyolEwwfuAtK0iigDHg2q6xMUudkui9wPPBKw2XNzKx9yPn0VETUSLocmAEUA9MiYr6k64DKiNgZIBOBuyMishYfDdwqqY5MgN2QfdeVmZm1L6r/N7xjqKioiMrKyrbuhplZhyJpVkRU5LIOfyLczMxSc2iYmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpebQMDOz1BwaZmaWmkPDzMxSc2iYmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpebQMDOz1BwaZmaWmkPDzMxSc2iYmVlqDg0zM0stL6Eh6XRJCyQtljSlkfqLJVVLmp28vpxVN0nSouQ1KR/9MTOzwijJdQWSioGbgVOBKmCmpOkR8UqDpvdExOUNlu0NXAtUAAHMSpZdn2u/zMws//JxpHEMsDgilkbEduBu4JyUy54GPBIR65KgeAQ4PQ99MjOzAshHaAwElmfNVyVlDX1a0hxJ90oavJvLImmypEpJldXV1XnotpmZ7a7WuhD+J2BYRBxJ5mji9t1dQURMjYiKiKjo169f3jtoZmYty0dorAAGZ80PSsp2iYi1EbEtmb0NODrtsmZm1n7kIzRmAiMlDZdUCkwEpmc3kFSeNXs28GoyPQP4hKQySWXAJ5IyMzNrh3K+eyoiaiRdTuaPfTEwLSLmS7oOqIyI6cC3JJ0N1ADrgIuTZddJ+k8ywQNwXUSsy7VPZmZWGIqItu7DbquoqIjKysq27oaZWYciaVZEVOSyDn8i3MzMUnNomJlZag4NMzNLzaFhZmapOTTMzCw1h4aZmaXm0DAzs9QcGmZmlppDw8zMUnNomJlZag4NMzNLzaFhZmapOTTMzCw1h4aZmaXm0DAzs9QcGmZmlppDw8zMUnNomJlZankJDUmnS1ogabGkKY3Uf1vSK5LmSHpM0tCsulpJs5PX9Hz0x8zMCqMk1xVIKgZuBk4FqoCZkqZHxCtZzV4CKiJis6SvAz8CPpfUbYmIsbn2w8zMCi8fRxrHAIsjYmlEbAfuBs7JbhART0TE5mT2OWBQHrZrZmatLB+hMRBYnjVflZQ15VLgL1nzXSRVSnpO0rlNLSRpctKusrq6OqcOm5nZnsn59NTukPQFoAI4Kat4aESskDQCeFzS3IhY0nDZiJgKTAWoqKiIVumwmZnVk48jjRXA4Kz5QUlZPZImANcAZ0fEtp3lEbEi+bkUeBI4Kg99MjOzAshHaMwERkoaLqkUmAjUuwtK0lHArWQCY3VWeZmkzsl0X+B4IPsCupmZtSM5n56KiBpJlwMzgGJgWkTMl3QdUBkR04EfA92B30sCeDMizgZGA7dKqiMTYDc0uOvKzMzaEUV0vMsDFRUVUVlZ2dbdMDPrUCTNioiKXNbhT4SbmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpebQMDOz1BwaZmaWmkPDzMxSc2iYmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpebQMDOz1BwaZmaWmkPDzMxSc2iYWYexdtM2nl5U3dbd2Kc5NPYBEUFdXW7f0Pjy8g3U5riOQoiIdtmvtrRx8w6eWbSGYVMe5PU179Wre3vj1tTr2bqjlr8tbNs/0BHBsCkPMmzKg0QEF/zvc3zxly/kvD/bnsv5O8IBJJ0O/ITMd4TfFhE3NKjvDNwBHA2sBT4XEcuSuquBS4Fa4FsRMSMffcqHuVUb+dOct5j61FKuPO1QDj6gO6cdPqCtu9WsrTtq2V5bx5Hf/Wuj9d87+3CWrX2Pc8cOpKYuOHpoWYvrrFy2jvNveZavnDCca846rNm2EcEbazczrG83amrrqAsoLSnce5Mp983lnsrl/PmbH6VP91IG7N8l6QcUFQmAurpAql+W7b1tNax+dxsf/+8nW9zeuCG9GDekjEnHDePlqg0cd1BfOhWLxas3MaBnF+6ZuZyBvboyp2ojI/p146wjy9m2o46BvbqytaaW/Urf/y9XVxfURiBAEsWN9A2gav1m1mzazn899Cpbd9Qyp2ojt3xhHKcfUb6rTW1d7Fr+Q9e9/7v/+H8/yX1fP46jh5Zx76wqvvP7l7n/G8cxbkjLv/dR//4wANd+8jAuOX54i+0bExFI74+rri4a/R005dkla3dN3/6PZSxctQmArFVy2o1PsWDVu3z1xBGcMaacsYN77VFfLZ2cvyNcUjGwEDgVqAJmAhdExCtZbb4BHBkRX5M0EfhURHxO0mHAXcAxwIHAo8AhEVHb3Dbz+R3hzy5Zy42PLOSFZet2/UecNO2FJt9hDSrrylWnj6Jrp2JOGX0AkogIIt7fkWvrgperNr
B/l068vuY9JozuT20EL76xnv27dqJIYltNLb997g2+dcpI9u/aif27dNq1jU/+7BnGDOrJP58ykgN6dCYCttXUUVQEf3p5JWeOGUDnkmIO+n8PAVAkePqqk1myehMXTXtht/8Nfnz+kVx57xx+c+kxFEsM7r0f+5UWM/6Gx7n0o8P5xZNLmlx2wugDePTV1dzxpWMI4Ou/ncXm7c3++gD4/dfGI+D8W57lw8PKmLlsfaPtTh51AGeOKWfLjloO7d+DP738Fr957o3dHmNjvnjs0LytK62SInH2hw7k/pdWNFr/qaMG8vSiaob03o8vHDuU88YNYtiUB5tc30cP7sszi9e0uN2vnjSCW/+2tNG6E0b25V9OPYRf/X0ZJUXiD4307awx5dx84Tg2b6/hW3fN5sU319OnWylXTDiEO59/g38sWcsh/bvzicMGcOfzb7B+844m+/Kl44fTa79OvLVhCyXF4rfPvVmv/gfnjWHB2+9yztgDeXDOSm575vUPrOOlfz+Vsm6lAM3++2T7+KH9eGJB5v/1TyaOZfyIPmzeXkvV+i3c/MRiLjx2CE8uqOZbJ4/kpeXreXPtZo4Y2JMg6NGlE7+vXM6YgT0ZO7iM4iIxsKwrS6s30a9HZ3p06cTcqo1061zMA7PfYnR5Dw4r70lZt04M2L8L22vr2Lajjk4lRWzaWkPf7qWs27ydA3p0ISLYuqOOLp2KqN60jR6dO1FcJJ5aWM2xB/Wh+t1t9OvRmS4lRc2+uUgjH98Rno/QGA98NyJOS+avBoiIH2S1mZG0eVZSCfA20A+Ykt02u11z29zT0Pj544uYvXwjW3bU8PfFa1tewMzarV9f8mFOOqQfw69+qK270qoevuIERg3Yf4+WzUdo5OO8wUBgedZ8VVLWaJuIqAE2An1SLguApMmSKiVVVlfv2XnWlRu3smLDFp5fum6PljeztrHo+jM+UHbxr2buc4EBMKT3fm26/bxc02gNETEVmAqZI409Wcf1nxoDwKZtNRxx7fuXTr539uHMqdrIfS9W7Srr0aWEM44YwI/O/9CussP/42HeS069DOzVlRUbttRb/yH9u9O7WynPZYXSiYf046mF1Rx/cJ9dRzc/OG8Mf53/NhccM4SZy9bxv0+/Xm99o8v359pPHkZxkfjMLc/yy0kVPL1oDaUlRYzo240p98/lqCG9eOnNDQzu3ZWRB/Tg8ddW78k/yS4H9uzCW8lF0t7dSunZtRM/mTiWac+8Tm3An15+q9HlfnHhOHp27cTPn1jMS29uYMuOlk9N5cu5Yw/kj7Mb79febsLo/jz66qpd8z84bwxX3z+31bb/zZMP5mePL855PSeM7Mvwvt2449n6pwl3/r/ZqVNx8+9vJ584gismjGT1O9vYr3Mx/3rvHC4aP5StO+o4edQBdOlUzNYdtfzXQ6+y7r3tbNyyg4P6defX/1jGzZ8fR10E81ZsZHT5/vyucjm99uuEJB6cs5IrTzuUvy2s5sKPDKFq/RYenLOSz314MNdOn89XThjOwlWbGDWgBxd+ZCiPvrqKo4b0ovrdbXTvXEKf7p057aanmHZxBXc+9ybnjRtE9y4lTJr2And95Vi27KjhyQXVjBnYkyvvncNPLziKWcvWEcCnxw3i70vWcNaYclZs2MKzS9ZySP8e9a6LtYV96vRUGq+9/Q4r1m/hlNH9C7L+QnrklVXMXLaOd7fWcNxBffjHkrUcf3Af/vjSW2zatoMjDuzJus3buf/FzDnrfztrNOeNG0Tv5NxwPtz29FK+/+Cr/PGy4+lcUsTo8sxh9JT75nD3zPcPKu/6yrGMP6gPb23YwlfuqOQP3zie0pIidtTWsa2mjmcWVbOtpo4eXUo4eVR/IoKq9VsY3MS7rIjg4Gv+wpGDetKlpJhbvnA0JcViwap3GdSrKwckF8h32l5Tx6p3ttK3e2d21DV948DumPjhwXzntEPZsr2WVe9spahIvLu1hnFDevHG2s0AdC0tZsPm7RzSvwc9sq5jZV8w3rqjFgkO/beH663/rCPLufnz45rtw8PzVgJi07YaVr2zlR/PWPCBNmeOGcD/XHg0AC++uZ5NW2u4aNoLzPveaXTvXLKrDw/Pe5szx5RTWxd0LS3ede1g54X1hnb+Lckex+p3tjGkz/u/s9q64K0NTf8em9PctYtF15/RYrBY+7mmUULmQvgpwAoyF8I/HxHzs9pcBozJuhB+XkR8VtLhwP/x/oXwx4CRrXkhfF+0dUctr739bkHuMokI1m/e0WgQrX9vO/dULuerJ46od0dNe3DTowt5YPZbvL7mPT51VOYM6UXjh9K5pJg/zl7BgT278Jd5b9OpuIiDD+hO9aZtfP2kg3h60Rq+dlLhxvOjh1/jf55cwrljD+SmiUft1rIRwfbaOjqXFANw7QPzuP3ZN3K6G6otrd20jaO//ygApx3enxnzM0daT3znYwzv260tu9Zh5CM0cj7OiYgaSZcDM8jccjstIuZLug6ojIjpwC+B30haDKwDJibLzpf0O+AVoAa4rKXAsNx16VRcsNsSJTV55FLWrZSvnXRQQbabqysmHMIVEw5ptO6wAzNHSxc38of2iIE9C9qvocm79D15Fy1pV2AADOmT+cM6uKxtz4nvqT7dO++6dfpnF4wr6K3c1rS8nByLiIeAhxqU/UfW9FbgM00sez1wfT76Yba3OWFkPwC+OH5ozuu65LhhjC7vwXEH9c15XW1l54mRHO46tRx1mAvhZvuiA3t1ZdkNZ+VlXUVF6tCBka29nd7cl/j4zsw6HB9ptB2Hhpl1OD7SaDsODTMzS82hYWZmqTk0zMwsNYeGmZml5tAwsw7j0+MGtXUX9nk5P0akLfgxImb7prq6oKYu/GnwPdQuHiNiZtZaiopEqT+k0aYc12ZmlppDw8zMUnNomJlZag4NMzNLzaFhZmapOTTMzCw1h4aZmaXm0DAzs9RyCg1JvSU9ImlR8rOskTZjJT0rab6kOZI+l1X3a0mvS5qdvMbm0h8zMyusXI80pgCPRcRI4LFkvqHNwEURcThwOnCTpF5Z9VdGxNjkNTvH/piZWQHlGhrnALcn07cD5zZsEBELI2JRMv0WsBrol+N2zcysDeQaGv0jYmUy/TbQv7nGko4BSoElWcXXJ6etbpTUuZllJ0uqlFRZXV2dY7fNzGxPtBgakh6VNK+R1znZ7SLzuNwmH5krqRz4DXBJRNQlxVcDo4APA72Bq5paPiKmRkRFRFT06+cDFTOzttDiU24jYkJTdZJWSSqPiJVJKKxuot3+wIPANRHxXNa6dx6lbJP0K+A7u9V7MzNrVbmenpoOTEqmJwEPNGwgqRT4A3BHRNzboK48+Sky10Pm5dgfMzMroFxD4wbgVEmLgAnJPJIqJN2WtPkscCJwcSO31t4paS4wF+gLfD/H/piZWQH5m/vMzPYR+fjmPn8i3MzMUnNomJlZag4NMzNLzaFhZmapOTTMzCw1h4aZmaXm0DAzs9QcGmZmlppDw8zMUnNomJlZag4NMzNLzaFhZmapOTTMzCw1h4aZmaXm0
DAzs9QcGmZmlppDw8zMUnNomJlZajmFhqTekh6RtCj5WdZEu9qs7wefnlU+XNLzkhZLukdSaS79MTOzwsr1SGMK8FhEjAQeS+YbsyUixiavs7PKfwjcGBEHA+uBS3Psj5mZFVCuoXEOcHsyfTtwbtoFJQk4Gbh3T5Y3M7PWl2to9I+Ilcn020D/Jtp1kVQp6TlJ5yZlfYANEVGTzFcBA5vakKTJyToqq6urc+y2mZntiZKWGkh6FBjQSNU12TMREZKiidUMjYgVkkYAj0uaC2zcnY5GxFRgKkBFRUVT2zEzswJqMTQiYkJTdZJWSSqPiJWSyoHVTaxjRfJzqaQngaOA+4BekkqSo41BwIo9GIOZmbWSXE9PTQcmJdOTgAcaNpBUJqlzMt0XOB54JSICeAI4v7nlzcys/cg1NG4ATpW0CJiQzCOpQtJtSZvRQKWkl8mExA0R8UpSdxXwbUmLyVzj+GWO/TEzswJS5g1/x1JRURGVlZVt3Q0zsw5F0qyIqMhlHf5EuJmZpebQMDOz1BwaZmaWmkPDzMxSc2iYmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpebQMDOz1BwaZmaWmkPDzMxSc2iYmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpebQMDOz1HIKDUm9JT0iaVHys6yRNh+XNDvrtVXSuUndryW9nlU3Npf+mJlZYeV6pDEFeCwiRgKPJfP1RMQTETE2IsYCJwObgb9mNblyZ31EzM6xP2ZmVkC5hsY5wO3J9O3AuS20Px/4S0RsznG7ZmbWBnINjf4RsTKZfhvo30L7icBdDcqulzRH0o2SOufYHzMzK6CSlhpIehQY0EjVNdkzERGSopn1lANjgBlZxVeTCZtSYCpwFXBdE8tPBiYDDBkypKVum5lZAbQYGhExoak6SasklUfEyiQUVjezqs8Cf4iIHVnr3nmUsk3Sr4DvNNOPqWSChYqKiibDyczMCifX01PTgUnJ9CTggWbaXkCDU1NJ0CBJZK6HzMuxP2ZmVkC5hsYNwKmSFgETknkkVUi6bWcjScOAwcDfGix/p6S5wFygL/D9HPtjZmYF1OLpqeZExFrglEbKK4EvZ80vAwY20u7kXLZvZmaty58INzOz1BwaZmaWmkPDzMxSc2iYmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpebQMDOz1BwaZmaWmkPDzMxSc2iYmVlqDg0zM0vNoWFmZqk5NMzMLDWHhpmZpebQMDOz1BwaZmaWmkPDzMxSyyk0JH1G0nxJdZIqmml3uqQFkhZLmpJVPlzS80n5PZJKc+mPmZkVVq5HGvOA84CnmmogqRi4GTgDOAy4QNJhSfUPgRsj4mBgPXBpjv0xM7MCyik0IuLViFjQQrNjgMURsTQitgN3A+dIEnAycG/S7nbg3Fz6Y2ZmhVXSCtsYCCzPmq8CPgL0ATZERE1W+cCmViJpMjA5md0kqaWwakpfYM0eLtte7W1j2tvGA3vfmPa28cDeN6bGxjM015W2GBqSHgUGNFJ1TUQ8kGsH0oqIqcDUXNcjqTIimrz+0hHtbWPa28YDe9+Y9rbxwN43pkKNp8XQiIgJOW5jBTA4a35QUrYW6CWpJDna2FluZmbtVGvccjsTGJncKVUKTASmR0QATwDnJ+0mAa125GJmZrsv11tuPyWpChgPPChpRlJ+oKSHAJKjiMuBGcCrwO8iYn6yiquAb0taTOYaxy9z6U9KOZ/iaof2tjHtbeOBvW9Me9t4YO8bU0HGo8wbfjMzs5b5E+FmZpaaQ8PMzFLbp0KjqceZtAeSpklaLWleVllvSY9IWpT8LEvKJemnyTjmSBqXtcykpP0iSZOyyo+WNDdZ5qfJhysLOZ7Bkp6Q9EryqJl/3gvG1EXSC5JeTsb0vaS80cfhSOqczC9O6odlrevqpHyBpNOyylt9H5VULOklSX/eS8azLNkvZkuqTMo68n7XS9K9kl6T9Kqk8W06nojYJ15AMbAEGAGUAi8Dh7V1v7L6dyIwDpiXVfYjYEoyPQX4YTJ9JvAXQMCxwPNJeW9gafKzLJkuS+peSNoqWfaMAo+nHBiXTPcAFpJ5jExHHpOA7sl0J+D5ZPu/AyYm5bcAX0+mvwHckkxPBO5Jpg9L9r/OwPBkvyxuq30U+Dbwf8Cfk/mOPp5lQN8GZR15v7sd+HIyXQr0asvxFPSX155eZO7wmpE1fzVwdVv3q0Efh1E/NBYA5cl0ObAgmb4VuKBhO+AC4Nas8luTsnLgtazyeu1aaWwPAKfuLWMC9gNeJPN0gzVAScP9jMwdg+OT6ZKknRruezvbtcU+SubzUY+ReaTPn5P+ddjxJNtZxgdDo0Pud0BP4HWSm5baw3j2pdNTjT3OpMnHlrQT/SNiZTL9NtA/mW5qLM2VVzVS3iqS0xhHkXln3qHHlJzKmQ2sBh4h8056QzT+OJxdfU/qN5K5tXx3x1pINwH/CtQl88093qcjjAcggL9KmqXM44eg4+53w4Fq4FfJKcTbJHWjDcezL4VGhxaZtwEd7v5oSd2B+4ArIuKd7LqOOKaIqI2IsWTeoR8DjGrbHu05Sf8ErI6IWW3dlzz7aESMI/Nk7csknZhd2cH2uxIyp61/ERFHAe+ROR21S2uPZ18KjaYeZ9KerZJUDpD8XJ2UNzWW5soHNVJeUJI6kQmMOyPi/qS4Q49pp4jYQOaJBuNJHofTSD929T2p70nm8Tm7O9ZCOR44W9IyMk+fPhn4CR13PABExIrk52rgD2TCvaPud1VAVUQ8n8zfSyZE2m48hT6/2F5eZBJ7KZnDvZ0X5Q5v63416OMw6l/T+DH1L3b9KJk+i/oXu15IynuTOf9ZlrxeB3ondQ0vdp1Z4LEIuAO4qUF5Rx5TP6BXMt0VeBr4J+D31L9w/I1k+jLqXzj+XTJ9OPUvHC8lc9G4zfZR4GO8fyG8w44H6Ab0yJr+B3B6B9/vngYOTaa/m4ylzcZT8J2xPb3I3FmwkMx56Gvauj8N+nYXsBLYQebdxaVkzhc/BiwCHs36JYvMF1stAeYCFVnr+RKwOHldklVeQeZLs5YAP6fBhbUCjOejZA6Z5wCzk9eZHXxMRwIvJWOaB/xHUj4i+Y+3mMwf3M5JeZdkfnFSPyJrXdck/V5A1t0qbbWPUj80Oux4kr6/nLzm79xmB9/vxgKVyX73RzJ/9NtsPH6MiJmZpbYvXdMwM7McOTTMzCw1h4aZmaXm0DAzs9QcGmZmlppDw8zMUnNomJlZav8f7lVbGkY2FpQAAAAASUVORK5CYII=\n", 246 | "text/plain": [ 247 | "
" 248 | ] 249 | }, 250 | "metadata": { 251 | "needs_background": "light" 252 | }, 253 | "output_type": "display_data" 254 | } 255 | ], 256 | "source": [ 257 | "show_wave(\"common_voice_id_19773609.mp3\", 1.0)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 79, 263 | "id": "electoral-malta", 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAD8CAYAAACLrvgBAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAA23ElEQVR4nO3deXwU9fnA8c+TQAj3GQ65wq2gAhIRLxREAW1FrVptbfEqraX1qrZRarVepdbW1v6sR62KVkXrBRUUAbUeKBAk3CCRG7nkljvk+/tjJ8nsZjc7uzO7s8fzfr3yyuzsHN9JdueZ7y3GGJRSSikncvxOgFJKqfShQUMppZRjGjSUUko5pkFDKaWUYxo0lFJKOaZBQymllGOeBA0ReUZEtorI4gjvi4g8KiJlIrJQRE6yvTdaRFZaP6O9SI9SSqnE8Cqn8Rwwopb3RwI9rJ8xwOMAItICuBs4BRgI3C0izT1Kk1JKKY95EjSMMR8BO2rZZBTwvAn4HGgmIu2A4cB0Y8wOY8xOYDq1Bx+llFI+qpOk87QH1tteb7DWRVpfg4iMIZBLoWHDhgOOPfbYxKRUKaUy1Lx5874xxhS4OUaygoZrxpingKcAioqKTElJic8pUkqp9CIia90eI1mtpzYCHW2vO1jrIq1XSimVgpIVNCYDP7ZaUQ0CdhtjNgHTgPNEpLlVAX6etU4ppVQK8qR4SkReBs4GWonIBgItouoCGGOeAKYC5wNlwH7gGuu9HSJyHzDXOtS9xpjaKtSVUkr5yJOgYYy5Msr7Bhgb4b1ngGe8SIdSSqnE0h7hSimlHNOgoZRSyjENGkoppRzToKGUUsoxDRpKKaUc06ChlFLKMQ0aSimlHNOgoZRSyjENGkoppRzToKGUUsoxDRpKKaUc06ChlFLKMQ0aSimlHNOgoZRSyjENGkoppRzToKGUUsoxDRpKKaUc06ChlFLKMU+ChoiMEJEVIlImIsVh3n9EREqtny9FZJftvaO29yZ7kR6llFKJ4XqOcBHJBR4DzgU2AHNFZLIxZmnlNsaYW2zb/xLobzvEAWNMP7fpUEoplXhe5DQGAmXGmFXGmMPARGBULdtfCbzswXmVUkolmRdBoz2w3vZ6g7WuBhHpDHQB3retzheREhH5XEQu8iA9SimlEsR18VSMrgBeM8Ycta3rbIzZKCJdgfdFZJEx5qvQHUVkDDAGoFOnTslJrVJKqSBe5DQ2Ah1trztY68K5gpCiKWPMRuv3KuBDgus77Ns9ZYwpMsYUFRQUuE2zUkqpOHgRNOYCPUSki4jkEQgMNVpBicixQHPgM9u65iJSz1puBZwOLA3dVymlVGpwXTxljCkXkV8A04Bc4BljzBIRuRcoMcZUBpArgInGGGPb/TjgSRGpIBDAxttbXSmllEotEnwPTw9FRUWmpKTE72QopZJsUulG/v5+GdNvGYyI+J2ctCMi84wxRW6OkeyKcKWUitvNr5RiDBgDGjP8ocOIKKWUckyDhlJKKcc0aCil0kYaVsFmHA0aSqm0o/UZ/tGgoZRSyjENGkoppRzToKGUUsoxDRpKKaUc06ChlEpp89bu4JlPVvudDGXRHuFKqZT2vccDY5xee0YXn1OiQHMaSimlYqBBQymllGMaNJRSSjmmQUOpDLRt7yEKi6fwn5L1fidFZRgNGkploLXb9wEwca4GDeUtDRpKZaDKCYqOVqT3CH/rd+z3OwkqhAYNpTLQwg27ADh45Ki/CXGp3Bb00nGW0UykQUOpDPT7/y4F4Mste31OiTv2QJHmmaaM4UnQEJERIrJCRMpEpDjM+1eLyDYRKbV+rre9N1pEVlo/o71Ij1IqICfNxxA/crQ6Uhwqr8417T1U7kdyFB70CBeRXOAx4FxgAzBXRCYbY5aGbPqKMeYXIfu2AO4GigADzLP23ek2XUqp9Lf7wJGq5R37Dlcta0mVf7zIaQwEyowxq4wxh4GJwCiH+w4HphtjdliBYjowwoM0KaVI/8mKOrVoULUcVKmvQcM3XgSN9oC9Xd8Ga12o74nIQhF5TUQ6xrgvIjJGREpEpGTbtm0eJFs59fWuAxQWT+GTld/4nRQVIyH+qLFg/S4Ki6ew0sd6kaO2LEXdXK2CTQXJ+i/8Fyg0xpxIIDcxIdYDGGOeMsYUGWOKCgoKPE+gimze2kBp4ctz1/mcEhUzFzmNtxd+DcAHK7Z6lJjYVdhyF/Xr5lYtG81q+MaLoLER6Gh73cFaV8UYs90Yc8h6+TQwwOm+SqnYvDl/Q9Wym9KpbXsDX1k/6w8qglpPaaBIBV4EjblADxHpIiJ5wBXAZPsGItLO9vJCYJm1PA04T0Sai0hz4DxrnVIqTre8sqBqObROY6etMjmat0oDOQ0/b9WHyiuqlrXJbWpwHTSMMeXALwjc7JcBrxpjlojIvSJyobXZjSKyREQWADcCV1v77gDuIxB45gL3WuuUh3buO8yu/c5vFulk+7eHeHfxZr+TkbLsdRqTSjfS/77pLFi/K6Zj+PmAP2XhJls67B39/EiNAo8mYTLGTAWmhqz7nW35DuCOCPs+AzzjRTpUeP3vmw7AmvEX+JwS7107oYQF63ex4Hfn0bRBXb+Tk3LsOY1ZZdsBWLZpD307NvMnQTE6oX3TqmXNaaQGbY6g0toGa2yiIxUVUbaMzw/++TkTZq1h6dd7EnJ8P8R67/Wz0rlt0/yqZXudhsYP/+h0rxns1ZL1/Pq1hX4nIykSVVwx66vtzPoq8ISejjk1e5VGvH02YqkHSSStCE8NmtPIYNkQMJLZec3vAfOOVpigJqhOhBtGJNbL8LNYqCJCPYbf/4tspkEjAxlj+OuML/1ORlK5LUJZu30fhcVT+NDWJ+FweXCR14+fmePqHG51u3MqFz8+i/nrdtZIW0S2mBFvgPXzCd8esDSnkRo0aGSgVd/s468zVvqdjCTxJqsxfekWAK5+dm7Vuuc/WxO0zccp0CN+wfpdXPyPWTw4dVn0jQn/14k1wPp5r7bnKNJ9bpBMoUEjA0XKut/6ainLNmVOhW4Ql/cT+1Ns39+/B8D+w6k7F8WSr3c72i4nJ6hWI65zvb/cxx7hJvyyhg//aNCIYs/BI0yYtSbNylDD3xze+GIjY1/6IslpSSyv6jTs/97dB47wxbqd
zFi2pcZ2lZMb+W3X/iPRNyJCTiPGj/Keg87OlQjBfTPCJ/y7f/+EwuIpyUpS1tOgEcXYF7/g7slLqlrQZLUUjpteJ+2Sf8xi4YaaT/MX/t+nHp8pPtu+PRR9I6qnfQ0sx3eufT7OXWHPXazZXj31qz1+LNroLNelvKFBI4rKcuwfPj3b55Q4V+vNIY67ayoPr+1V0ry8xnXb97NhZ2LntnaV04jxXPaJkJLNnrvYc8C/HI+qpkEjy1QYw9Y9B/1Ohufclh56Wfo4+E8fcMYfP/DugC7Yg2EKx/6I7DmNHm0a1bptehUhpy8NGhmotpvDmu37GfjgzIQ/CSfLAY8qq71omHPkaAWltnGd3pq/kf2H/Z2WVFI5m+hApDnCw7UA08ZVyaFBI4qzegbm7rhxaHefU+KtLXuclYmnusq5ot320/BiqIyHp63goseq6zxufqWUuyctcX3ccJo7HGcrx4uacB/ZUxqtya02yU0ODRpR5FrfukffL/M5Jc45ebp8d/GmoHkXapNG95i4xXKNt75aSvnRmp3rloQZn2qzz0WB9lFu0zHTYW8KHTRzZJj/15rt+zh4JHWbSWcKDRpRbLe1UvnRv9KnMjyaf368OmjeBScWbtyVmMSkgFjKw9/4YiM/fWEeW0ICQqJuylc/OyfuJqXh0pROzwAVMbSSOu+Rjxj7YmY1KU9FGjSi6H1M9dDMqdAj2IlEPVCu33EgQUd2z21uKNaSjZnLt3LKgzODRr9NVP3Bhyu2xb3vpt0HueHf8yg/WuFqvnC/2IP5lQOrJ/mM9O+a6WNHxGyhQaMWizbsZmaYDl4qsgOHj/Lcp6tjHljPb/EGndL1u9h3qJzC4il89GX8N/dYxZLcdxZvjtjHIdUFD1LoXzpUNQ0atfju/33C1r3pV2HsZ9n1+HeWcc9/lzI9ycF25N8+5qcvlMS1rzGGo3Heke58cxGn/mFmxPeXfr2HSaUbPWvlVclpP41Kxpi0r9MoT7MHkUyl82lkmEmlG/lq67e+nX+ndTM7eOQo5UcrqDCQVyfxzya7Dxxh2pL4AlWPce+4uiHtORi5We32fYe5aWIpF/dvz43n9KBLq4Zxn8cN+9WlU38Ge1LtraPS6BIyjiffZhEZISIrRKRMRIrDvH+riCwVkYUiMlNEOtveOyoipdbPZC/S44V07QB308RSX1t62b/LF/7fp/T87TtJPf/6HfvZczAwdlRQuozhSJgWT5CcJ9g3529kyMMfBuU43l28mb/PXMm/P1/Lu4s31bJ3TY3qBT/vRet3kyNUXf+BI4mZ5TAR7DmNbXurv5OR/pcq8VznNEQkF3gMOBfYAMwVkcnGmKW2zeYDRcaY/SJyA/AQ8H3rvQPGmH5u0+G1Q07nK0hzq7Z9S9eC2nvaxmtpHCPq7jtUzpCHP+TBi09ABM45rk1M+5/5UHVP7BX3j6BuTg53TVrM17sO8MGKbdw+vBedWzbgOyceE3PavHD4aAX1yQXgZ/+eF/ReLDMD2m+mb87fwC2vLODlnwziuHaNI+whvDxnPQBPfvQVN5zdLbaEe+CZT1Zz79tLWXH/COrVyXW0jz2e3/Pf6luK5jT840VOYyBQZoxZZYw5DEwERtk3MMZ8YIypfBT6HOjgwXl9kU5ZeyeG/vl/fOBhi5NIf5+Nuw5QWDyF95dvYXEtTSeXfL2HrXsPcf3zJVw3oYSyrXtdpAXW7tjPi7PX8YHVAulP01bwi5fmx31M1zz6+Ow/fJTZqwKDaFY2nR731iJemrMu7Pb2+ozdDsZw8moI/VGPfcoJd08D4LEPAjngPQdi6SWfWd+3TOBF0GgPrLe93mCti+Q6wF5mkS8iJSLyuYhcFGknERljbVeybVviW6lE6l2aiZVxX26p/cYcTwVqaPPTt+ZvBODa50r4zt8/ibjfmu37gl7vjukGU9MDU5ZG3yiJvOh5Xun7T30e9HrVtn0Rn8Dt/41ozz1jni9h5N8+dpc4y4L1u9hrtS7bbs01HkvP7UibRvs7zl+3k1dL1te6jYpPUivCReQqoAg4y7a6szFmo4h0Bd4XkUXGmK9C9zXGPAU8BVBUVJTwO3ek4LBz32FaN8lP9OmTKtw80vGq/Kvd+HLw0/yfpq1wtP/2bw97lhaAMh8bBYRjDGzbe4g/vONs5r3Yjx/+cxvL//i9pe5bvr0ydx2/eX1R+LTE8Kga7xSvF/9jFgCXF3WMsqWKlRc5jY2A/T/TwVoXRESGAeOAC40xVe1YjTEbrd+rgA+B/h6kybVIT0N1cjOvlXIsMeOmifMTOr9CZRFGNIfLK6Jue+xd7wb1Twh1qLy6iCdZDHD9hLm88UWNr4gnIj2Zh/6PfzdpcULODzBz2ZaIAQNiq4/IsNLgjODFHXAu0ENEuohIHnAFENQKSkT6A08SCBhbbeubi0g9a7kVcDqQEuUJ5RWRWtpkRwV5JJNKv+Y/tWX7w40J9M2+misj+NZhQPrtW4sc514iue/tpTWKeBLNGJPQvj+RnsxDe4M//9laXpq9jncXb3Z9zmlLNnPyAzMA2LT7ANdNqL2/zP9i6AQZ6XpqCyZPf7yqavn3/13CrK/SYySHdOE6aBhjyoFfANOAZcCrxpglInKviFxobfYnoBHwn5CmtccBJSKyAPgAGB/S6soX89bu4LMIM/XpSJqh805Hd/bDH9b6/kuz1/FwxAAQ/u/9aomzwRZrs2Jz/JXs8TJEvuEVFk/hnx+tYsbSLdzxxqLag3Ok4zvMaUCgY2JoCy6nCoun0MtqTv3TF+axbe8hdu0/zH8XfB113/y6zlpOQXw5jfunVBf9PfvpGn7wz8wZMy4VeFKnYYyZCkwNWfc72/KwCPvNAk7wIg1e+t7jn0V8r9zHWcwSJdqYSfsPBfdm3rw7ch+WKYti629wuLyCO98MFGXcNryXo32OVhjy6uRw2GWzaL/GYqqtEveBqdU3vJfnrOOyKGXyh8qr/zc5ktzWfaHN0rfsOUSP1pGa/FYrbNmgann60i385PkSzujein2Hy2nVqB4/HdyVosIWQOS/VaxXWVg8hU+Lh9K+Wf0Y91ShMq+APsEysfVUtFvnxl3BAxXu2HfYk3GWpi7aFFfnv253TnUdMPxijLeTBW2zFXX1bNM4YRMR7TkYvZluydodPDtrTdTt7N+hnzwfKMr6pOwb5q/bxfSlW7j0ieqHNi9Lg8/584feHSyLZX3Q2HPwCMs27aGweApnPvR+1O3DzaOQ7qJVhIe2vJk4dz0/fmZO1fwG8Q5O+MCU4BZE4eZC+N+X3wTPo+CRSaUbOVie/LkXHpy6LOhG74XK3uHLN+/l/xw2JIjVBY9Gb4I77s3Fjh4mYmtyG9jWi86IB9OoJ3wqy/qgceI971W1SXcy9PeGXakzPPinZd9QsmaH6+PsP3yUT8si35gjVWFs+/Ygs8q+oeudU1lgm+bUiVXbvq3RqODYu96tsd2jM1dyVQLmMblpYikLN9Q+P0MivDnf21ZT9/53KW2bRm8CPj/G/0+o2r4bTfJjK+WOpYi3csuCRvWC18dRDFcnxro4FV7
WB41YXfPsXL+TUOWHT88OysrH60/TVvDDp2fXKIaqFKnie8nGPTzz6WoAZkVoOBDJvLU7Pe0fkq3eW7rFUV+U0H4zXor19h1LTqMyONTNdf9ZKa8wFL++0PVxsp0GDZtmDudddjIMgx/cVoLuCXNd7y3ZzLOfrgm7/dOfrGbGskAL6sc/jK1YxBhvOxUqb8TTOjDWj509h3lcuyYRt5uxdAtvLww0rMgN6REY7yd94tzwLdKun1DCpNLE9J3JNBo0bJzOUXC5B0/3EPkmX1g8Ja7pPd3PXlfzAGNemMc330Yvgx/Vr7aRY2o6UlERU89ggA9XbI172lPlTLc7p0bfKESsvbb3HTrKjn2HKSyeEnGMq4lz1nH98yVVs2WWrt8Zdrt4XPPsHGaFFMfOWLaFmyaWBo2eu2XPwap+QxUVgVGSl2/ew4593o5akG40aMRhRZSxmpy48qnP6XLH1Lgrke1PhIXFUxj74heuRzVyE3TqxThnhjGxTx97dQoVDWaSwuIpjHm+hGlLnHX0czsr49iXvuCk+6bXuk3xG8E9yqd7MLRJpQ9WbOMHT1fXk9mHq+8x7h0WbtgFwCkPzuS71jhpXe+cSo9x7zDirx9XrQvngSlLM/7BRidhsomltOSeyUv47QXHxTSsyPvLt9CzTWM6NG/AZ9bwFfsOl9M431mxmN3ekCaQUxZtokXDvJiPY/fm/I10atmAJvl1eejd5fzjwxpDgHnmpdnhR2ONJNO/iH57b+kWR2NOGWOSPisjVE/u5bVxby7ilZAiq5smljKkV2sAVocZzSBS3d+C9bv458eBOr7fvrWIU7u24oIT23mcYv9pTsOmZcN60TeyPDdrTa2THc1ZvYOyrXuDxmm69rkSRvw1uOli6AB90Z7iXp27nu3fHgpb9vzC52udJD2if32ymhPveQ8g5oCx73A589Y6b8kVz1wbyn+/eX0hP32huhf5vLU74x5UMBW8OHtdjb5Xq7/ZV9XAA6gaIiVU2da9TLU6s744ey2jHvu06r1/f76OsS99EXe6jDE88b+vUnIyOA0aNrGOKxVaLmp3+ZOfMewvH9Hn7mlBY998e6g8qD/C17uDn1rsc1WH1nls3n2QX7++kAH3z+BginVue3nO+lp70qvMEDp8y/cen5WU/g/PXzsw6HUy41SkfjXD/vIRP38xEBhWbQs/vtpr8zbE1RH1yy3fMv6d5Qx8MPL8837RoGHjtCK8ktPB9X7wz9lBAcDeH+EH/5zNLa+UVr225yCWfL0naD97sElkJ8PLnpiVsGMrFY9WjZyXAiSDvbj0rfkbIw7Kedt/FtDzt+9QWDyFL7fsZeOuA6zdvi9o+JdKR45WVE1QZr+3JHsk5mi0TsOFM7q3crztkVo6NL05fyOPfL8fEDzEwnf+/gmDexbw/LUDa5Tpe92r2G7uGu9aqijlhTo1+mm4z2p4VU92s+2hrzbnPfJR0Ou83BwW3nMe9erkMGHWGt5buoVZX23nb1f0Y+nX1cW39pGYY5kSOFE0aLjw9Cer+e13erN+x37q5Artmtan+PWFYVuhOB1j6WhIcPnoy228Nq/miK5edOpTKhZvfOF+ZOF4hfbmzoQh4A4freDhaSsYeUK7oPnPb5pY6l+iHNCg4YEzH/oAgEv6t+cNl8NEhKtXue0/C1wdU6WvWOaeSLRbX/Xvc1gnpFPPrLJv6Nkm+oi6qe7pT1bz9Cero2+YQrROw0NuAwbofB0q2Ohn5vidhJSQG1I8dTgDBw5NF1kbNA6VH3U8tWhtvljnbfl/Jg69rpRbocVTz0UY2ibTpcJDZdYGjTHPz3M9XSjAJf/wpqXRdc/NZdGG3TU6GimlIDckaHxdy0Rgmax0/U7fp2fI2jqNVCorBpi5fCszl2+NvqFSWUiHNQ/43uOfMePWwXR3MENioniS0xCRESKyQkTKRKQ4zPv1ROQV6/3ZIlJoe+8Oa/0KERnuRXqiCTfZT6XQYDL1xjMTnRylVBShOY1sFjrib7K5PruI5AKPASOB3sCVItI7ZLPrgJ3GmO7AI8AfrX17A1cAfYARwD+s4yXUhFqmpAyteOx9TBPKHhjJ+EtSbipzpbJGaOupSsmcEz1VbNi539fzexGyBgJlxphVxpjDwERgVMg2o4AJ1vJrwDkiItb6icaYQ8aY1UCZdbyEeOyDMn7yfAl/eGd5TPvVyc1hWO82CUqVUiqaSDmNVKgYTra8GAZJTQQvzt4esNfebrDWhd3GGFMO7AZaOtwXABEZIyIlIlKybVt89RGbdh9g/Y74onR+3YRngJRSEUSq08jG1oYndmjm6/nTpvWUMeYpY0yRMaaooKAgrmPcf9EJvHvzYP41uijmfWOdL0Ip5Z1w0xYs/XpPWo+wGy8vpr51w4s74Uago+11B2td2G1EpA7QFNjucF/PNakf+/wV2npDKf+ICKFfwUse/zQri6dimcMnEbw4+1ygh4h0EZE8AhXbk0O2mQyMtpYvBd43gRqsycAVVuuqLkAPICW7wIrOZ62Ur0LnlD94pIIYZzNQHnDdT8MYUy4ivwCmAbnAM8aYJSJyL1BijJkM/At4QUTKgB0EAgvWdq8CS4FyYKwxJnJ7WKVU1goEjeCcxdEsLJ7ymyed+4wxU4GpIet+Z1s+CFwWYd8HgAe8SIdTmmdQKv3k5AAhj5SxTpym3MvKHuFa0qRU+gktngIcFU/97Yp+dG7ZkOlLN/PYB4mb9z5bZGmTII0aSqWb3DBBw148dXH/sK31GdWvPf06NuP24ccmLG3ZJCuDhuY0lEo/4b63FVbrqT9deiJ/ubyvDjeSBNkZNPxOgAPzfjvM7yQ4kgrTT6rsEC4gVDa5rZMriAhfPXh+spOVMKd3b8lndwz1Oxk1ZGmdRuqHjeYN8vxOglJhNcmvw8AuLZixLLmjMoer06jsEW5/r22TfDbv8W/o9GevPpnj2jVh0B9mxn2MlQ+MpE5OIBB+eNvZ7DpwhIse+9TDVMZPcxo++tGgzhHfywnzVPXQ907kjpHB5bLL7xvBjUO7e542pSI5VF7BDWd3S/p5w30nKnuE23Mhk395Oo//8KSkpctu7JBunN2rgLZN810dp25uTtXDbWGrhvTr2MyD1HkjS3MafqcAJlw7kLN6FjD6tM4M+8tHYbe5cWh3Hn2/jGevPpkhx7auWv+DUzpxwj3vAYExsRI5VPKqB8+n651To2+ossah8oqwT/2JFq66orJ4yl5J3rpxPiNPaMeMWwcnJB23D+9VNYFb/bq5vHfLYN6cv5HRpxXS1DbaRM82jbhqUGeG9GpNQeN67D98lNL1O2nXtD4T56zjh4M6M3vVdu6atKRqn4LG9RjV95iw533m6iJ6+DiPRqXsDBo+5zW6tGrIWT0D42fVNpnKref14pKTOlDYqmHQ+sb5dVl27wj2HDwCwNEEtVWvVycn7NNdqCd/NIC/zljJsk17EpIOlXoiDVWeSOECVWXQCPc5DffdevrHRSzdtIe/TP8y4nmuGtSJf3++rsb6J380gFaN6jGgc3P6dmjGVf+azazioTRvmM
eN5/Sosf17t5wV9Dq/bi5Djw2Mlv37UccD0LNNY+6atISBhS1o0TCPhy/vS6N64W/Llfv6LTuLp0I+XzN/dRaXDugQdb+fDu5K99aNXJ27qHNzHr7sxKB159qGXT+vdxvKHhhZ9To0YFSqn5dLmyaBLPCj77uf6zzUtJsHs+iewJxYc8fVXik/vE9b3rnJn8mqWjeu58t5U80J7Zsm9Xx+zAMUtp+GVTzldGy4Yb3bBH3fAEYe35bbh/eqen3/RdVz55Q9MJLl943gjZ+fxvA+bRnQuTkAZ/RoxZrxF9C8ofu6xy/vH8nEMYN44kcDIgaMVJKVQWPl1r1Br1s0yOOt+dHHSbzj/OPCPlHUpmurhrzx89OqXr92w2kM6NwiaJt//rh61N0rT+nk64BkH/96CAvvOY9ebRuTZ43sW+Dwxrzi/hGJTJqK4PM7zuHNn5/GnDvPSdo57TmNW8/t6fp4D1/WN+o24UrEymvJaUTSqlHw5/nWc3tW9fH4c0g66uTmkF83l5M6NXd8/FjlOczRp4qsDBpfbd1XtfzOTWfSvGGe43H560b4564ZfwFN8oOfEi4v6sBTPy6K6QO3/5C/Q291aF6fJvmxjwIMUK9Ocuccufq0wqSeL5UMLKx+8GjbND+pDxo/OKUT9tPF+iAV6swerRzl9I8crVkMWxGmTiOagsb1+OKuc7lqUCcAurduxDHN6rNm/AV8z0E6sl3q54USwP75Oq5dEwAa5OWy/3D0G/bZvVrTtVVDVn2zr8Z7ew6WB71+6NLoT0+VLjihHVMWbaJxvj//kgZ5uUy4dmBKNkd+7WencsOLX7Bt76GqdZX9Q6Yu2uRXsnz16s9O9e3cORK+qCgex7Vrwj+slk6X9G/P1MWbOHgkfB3dgTDfz8c/DAwLEmunvhYN87j/ohOCiqLslt83IiuHXXciK3MaPdsEKsg6tWhQte6283pF2jxI/bxc3r/t7LDvvfyTQfRu14Qxg7tyU8jT12s/O5UXrz8l4nEfvPgEbh/eizO6t3KUDruxQ9w3f3zshydxcmGLiO/PuPUs3zocHt++acQc3kOXnhh2fW0GdY18neksWbe4HBHPKsLfuelMGls52798vx9v/zK4bqxurnCM1Xw1XDCZuXxrVZq8lF83l4ZpUL/gh6z8q1xwQjs27T7AD0+p7ifR10U76Mo266d2a8nUCBXCRbXckAGaNqjL2CHx9be4ffix/OrcXnE3jf3o9iF0atmg1m3CNQCYM85ZGfrYId1cDxS3/0j4XODZvVqzZvwFFBZPcXysJ68qou+977lKTypK1ijhOSIJqwjv3roRa8ZfwO79R1i9fV9Q/4TDYYqnKtXxeTa7bJKVQSMnRxgzOPjpPLTzzBNXDaBji/pRjzV33DDHFcWJ5KYiLVrACGd4nza0bhy9A1NenZyITZzP7d2G6Uu3RD1GjoijokPHMvT+kqxpQEWiN7m1B/KfnNmFS07qQMtGeQx8wFkv6aYN6tKvQTPHafKj30i2ysriqXBCy0RHHN+WPsdEbsa48J7zmPmrs1IiYCTTsntHcMfIY3n0yv6Otq+sMwrH3mqsNiJwuLz2vigdmtce4D+6fUjVctM4pvtNBK97+bZslJzPYo5ITHNz/+ysbhzXrknQQ8YNZ3fj9Ru8q5fRgQqTR4NGnJrk16Vbgbs+G+mofl4uPz2rm+OWUl58lZ0c4zcjah/2ulPLBvTt2IwWHrSr98pbY0/3OwlxmbJwEy0bOf87hruhjz61sEbTczdiaT2l3NGgYZPsDlLZILQjVTyctOhy0rlr0tjT+eKuc12nx0ud4ygaTKTVfzifm4fV3oR2856D1I2hUiNc0anXdSJ+dDbMVq7+1CLSQkSmi8hK63eNDgki0k9EPhORJSKyUES+b3vvORFZLSKl1k8/N+lx601bJzzljeMdBuLj2jXh7V+eEfa90FvOgt+dV3ObNH3SfP9XZzPu/OMcb/9p8VCW3Zu4TpQiws3DonfWy8kRJo09nYX3BP4XlU3F/3J5XyaF5KDC/We8rrT3Y1iTbOX2L10MzDTG9ABmWq9D7Qd+bIzpA4wA/ioizWzv326M6Wf9lLpMjyt+9sT2S37dxF6z4GyAyL4dmkYMMPb9G+bl0rRBzTqJNI0Z5OYII45vC8CjV/ZnYJfai2zaN6tP/Tz3nSiHHts6aOiMaNaMv6BGbq5vx2ZVHUEX3TOcNeMv4JKTOtRoiRguoDfw4BrssvCr6xu3f+pRwARreQJwUegGxpgvjTErreWvga1AgcvzqgyVF+bbLyL0OSZQoX5bhBtdmsYMADq2aMCa8RdwYd9jePWnyem0d3JhC354SqeY9nE6akKo/Do1/6de5wy09VTyuP3PtTHGVHbJ3QzUWoAtIgOBPMDeaP8Bq9jqERGJ2PxDRMaISImIlGzbts1lslWyiMBp3Zx3WPw8wvhJk8aezp8v68voUwsjnCe2m0bXgsBAkENtQ86nilUPnu9JXVBtcnOotcdzZYOBIb0KeNphK7dQ024ezP0XHR+Ug7/vosDorvXCBBI3tPVU8kT9z4nIDBFZHOZnlH07Y4yhlk6pItIOeAG4xhhT2X7yDuBY4GSgBfCbSPsbY54yxhQZY4oKCjSj4pVkDBNfVBh97K3KMu4WDfM4Ocz2dXJz+N6ADhH7o4Rbu+iemnUflV7/2WlMGns6z1x9MveN6hM1fcmUkyNh547w9BwiHK2lYuHYtoFRE64/syvD4gxgvdo25qqQicZ+NKgza8Zf4PkAfZrTSJ6onfuMMRHHjhCRLSLSzhizyQoKYed/FJEmwBRgnDHmc9uxK3Mph0TkWeC2mFKvUp7Tprm5to5pZ/UsYO6anTGdJ9w9o3EtAy82b5jnybDWTlxxckfq5ubwwudrgUBgvMQaVTWSRAfzhvXq1DoMd2UuJF3uxdojPHnc9gifDIwGxlu/J4VuICJ5wJvA88aY10Leqww4QqA+ZLHL9KgU0a2gIVcO7MTJhc1rLQsf0Lk589buZGiv6mKiY5pF74kfKlVubmd0b8WfL+/LKQ8Gej5XDqwIcFH/9nRq0cBRh9BENgbq2aYRlw3oQJ3cHJbfN4Jj73q3xjaVmZB06f+QLunMBG4/muOBc0VkJTDMeo2IFInI09Y2lwODgavDNK19UUQWAYuAVsD9LtPj2qziocy49azoG2aIRH3Xmtavy/Vndo1a11A5nLx9s4v7t+e5a06O6XyhT+Z3f7e34329bP3ZqF4d2jTJZ9hxNetKBnRu7ngEgUTmNH5yZteqeob8uuFzgrcN70XbJvn0SZO+S+k0H0W6c5XTMMZsB2rUXBpjSoDrreV/A/+OsP9QN+dPhHiectNZnscVkpWcVkxXbmcvXhcRzu4VYwV1yOkqK3Kf/NEAukaY/TARKotJnh4dW9AL1ad9E6b4OOz7wC4tIjZKSEWa00gebd2cwZ6/dmDUbV7zcV4G8K6pbJuQwRMrK0aH92lLjzaR52H3mlczvF17ehdPjpMtNKeRPBo0MtTx7ZswuGcB3+17TND6Xw6tHn69b8dmdG+dm
Btq8wbVlcxOWraEKyIqbNmAXg5v+L2PqR4Y8ZrTC6s6zCXT+786i2tOL/TkWG5bA8377TB6toltbLQfn9o5+kYpyukc4co9DRoZ6A+XnMAzVvHIny/ry+d3VBcz/Mo22VQimpqeY/V7uHRAdeug3BwJmifdrvLeaMI0//zw9iFMu2VwzGm4+7t9qBtDF2EnQ7w70bWgkWfDmbi9B8Yy4m1lx8nLBnR0d1IfaT+N5MnK+TQy3ZUDq3v65tXJoW3TfOaMO6dGL9zamqTGq7KYIPTmGVpsUz0+kf9f9uF9EtuRLh6JGEurUb06fHuovEau7s7zj+PWV0vp1jp5dT9e034ayaNBI0uEe5oO93TvVuUho32FQ8cn8iol0ebVCCcVBzuM98F5wrUDI+47vE9bXv9iQ431p3dvxew7/ZnK1yua00geDRoZ5ONfD2H3gSM+p6KyU5izL/HJhc2ZsWwL7T1otfbxr4fQJEUmWHIr3kB2Vs/q0RISNf1r8cja5y7xg8aM5NGgkUE6tmhALKXSqfCE/ZMzu3Jen7Z08aBZbMcWqTU3Rarx6t8drTe7H1Lhs5wtNGhksUQWTzmVkyOeBIxMlyPwL5d9P6rE+W9fft8INu8+SOsm3jQcUOlJW0+phNDnPm+d3r0VQ1yOyOv2f5JfN5dCDfBZT4NGFvMrS18/wtAVKjItflGpQoNGFktE8ZQTlXMqZBKnnRDj5aaid9rNg4MGTjSejrblj3DD56vk0KCRhQpbJq7C2MntqG6GDWPdunE9no1xgEUnPrp9SNWyF/0QMimzEuvQ+co7GjSyWCKLPDLpBhXNtWd0SchAl51swd1NTiOb/hcq8TRoKOVSMu7J8QT4hy49kdO7t6SwpVZeK+9o0FCeOs+aGrRbQWyD5aWzZDzJx3OK/p2a8+L1gxI2/L2ftDGFfzLv06R89f2TO7Ls3hG1Ns30qf49renYSsFuOLub30nIWho0lKdEhPp5tT8FZkLrHbtEz+cNiZ3+NR3Z63hineVRuaMfxSxUOWy2X3MQpEtO44azu3FKlxZRt0tK8ZSHJ0mXv3+oOeOqh/i3/z1inuVRueIqaIhICxGZLiIrrd9hG0+LyFHb/OCTbeu7iMhsESkTkVdEJC/c/spbT1w1gD9cckLSx2q6OAXHLKrNmT1a0d/BTHxn9yqIuo1bnjS5TfN++vaRmnfuO+xjSrKb25xGMTDTGNMDmGm9DueAMaaf9XOhbf0fgUeMMd2BncB1LtOjHChoXC9ozo1kuayoAwAnF0Z/ek8H3y/qyHu3DE7Y7Id26X27997Tn6z2OwlZy23QGAVMsJYnABc53VEC+cuhwGvx7K/Sz2ndWrFm/AVpMxptvTqR62bq1cnhzvOPo2cS5x9XKhW4DRptjDGbrOXNQKQp0PJFpEREPheRi6x1LYFdxphy6/UGIGL5hYiMsY5Rsm3bNpfJViq6kzo1i1hfseL+kTRtkBlzdygVi6hDo4vIDKBtmLfG2V8YY4yIRKpi62yM2SgiXYH3RWQRsDuWhBpjngKeAigqKkrTqjyVLr6461xEhLYpMgy4trhVqSJqTsMYM8wYc3yYn0nAFhFpB2D93hrhGBut36uAD4H+wHagmYhUBq4OwEbXV6SUB1o0DLTJ+NGgzjxx1Um+peOu7/T27dxKheN2EqbJwGhgvPV7UugGVouq/caYQyLSCjgdeMjKmXwAXApMjLS/Un7KyRFGHN+OiWMG8dlX2zm2bWOaN0xeI7+WCTiXZtOVG26DxnjgVRG5DlgLXA4gIkXAz4wx1wPHAU+KSAWBnM14Y8xSa//fABNF5H5gPvAvl+lRKi7/u/1sVm75luufL6Fd05pFUoO6tmRQ15ZJT5eXHSG1iEt5wVXQMMZsB84Js74EuN5angWcEGH/VcBAN2lQygudWzakbm7q9nXV+71KFan7LVFKKZVyNGgoZcm3Rk7t3a6Jzymplq5DfiRLQeN6fich62jQUMrSomEer/3sVB69sr/fSalB5wgP9usRvQB48fpTfE5J9nFbEa5URinKkCFOMt01p3XhypM7JbUlmwrQnIZSWSYTirxE0IDhEw0aSqUwL2/wWsKlvKBBQ6k0oPd7lSo0aCillHJMg4ZSKSwDqh9UhtGgoVQ60PIplSI0aCil0o5W6vtHg4ZSWcbLQRD9UtusiiqxNGgolTX08Vy5p0FDKaWUYxo0lFJKOaZBQ6kUZjJhzA+VUTRoKJUGROsjVIrQoKGUUsoxV0FDRFqIyHQRWWn9bh5mmyEiUmr7OSgiF1nvPSciq23v9XOTHqUyTSIKp7TES7nhNqdRDMw0xvQAZlqvgxhjPjDG9DPG9AOGAvuB92yb3F75vjGm1GV6lMpIXnRm0w5xygtug8YoYIK1PAG4KMr2lwLvGGP2uzyvUkopH7gNGm2MMZus5c1AmyjbXwG8HLLuARFZKCKPiIhO+KuUnRYlqRQTdbpXEZkBtA3z1jj7C2OMEZGIH3ERaQecAEyzrb6DQLDJA54CfgPcG2H/McAYgE6dOkVLtlIZRUuWVKqIGjSMMcMivSciW0SknTFmkxUUttZyqMuBN40xR2zHrsylHBKRZ4HbaknHUwQCC0VFRfr8pZRSPnBbPDUZGG0tjwYm1bLtlYQUTVmBBhERAvUhi12mRymlVAK5DRrjgXNFZCUwzHqNiBSJyNOVG4lIIdAR+F/I/i+KyCJgEdAKuN9lepTKKF6OSNu0fl0AGuTpCLEqflGLp2pjjNkOnBNmfQlwve31GqB9mO2Gujm/UtnCi+ayN53Tg4JG9RjVr8ZXUSnHXAUNpVT6yK+by7VndPE7GSrN6TAiSqUw7b2tUo0GDaXSgA5YqFKFBg2llFKOadBQKoVp6ZRKNRo0lEoDOtigShUaNJRSSjmmQUMppZRjGjSUSmHa5FalGg0aSqUBrdNQqUKDhlJKKcc0aCiVwrwcsFApL2jQUCotaPmUSg0aNJRSSjmmQUMppZRjGjSUSmHf7XsMZ/ZoxS+Hdvc7KUoBOp+GUimtSX5dXrjuFL+ToVQVzWkopZRyTIOGUkopx1wFDRG5TESWiEiFiBTVst0IEVkhImUiUmxb30VEZlvrXxGRPDfpUUoplVhucxqLgUuAjyJtICK5wGPASKA3cKWI9Lbe/iPwiDGmO7ATuM5lepRSSiWQq6BhjFlmjFkRZbOBQJkxZpUx5jAwERglIgIMBV6ztpsAXOQmPUoppRIrGa2n2gPrba83AKcALYFdxphy2/r2kQ4iImOAMdbLb0UkWrCKpBXwTZz7pqpMu6ZMux7IvGvy9Xrkjwk5bDb8jzq7PWjUoCEiM4C2Yd4aZ4yZ5DYBThljngKecnscESkxxkSsf0lHmXZNmXY9kHnXlGnXA5l3TYm6nqhBwxgzzOU5NgIdba87WOu2A81EpI6V26hcr5RSKkUlo8ntXKCH1VIqD7gCmGyMMcAHwKXWdqOBpOVclFJKxc5tk9uLRWQDcCowRUSmWeuPEZGpAFYu4hfANGAZ8KoxZol1iN8At4pIGYE6jn+5SY9Drou4UlCmXVOm
XQ9k3jVl2vVA5l1TQq5HjM4nqZRSyiHtEa6UUsoxDRpKKaUcy6qgEWk4k1QgIs+IyFYRWWxb10JEpovISut3c2u9iMij1nUsFJGTbPuMtrZfKSKjbesHiMgia59Hrc6VibyejiLygYgstYaauSkDrilfROaIyALrmn5vrQ87HI6I1LNel1nvF9qOdYe1foWIDLetT/pnVERyRWS+iLydIdezxvpclIpIibUunT93zUTkNRFZLiLLRORUX6/HGJMVP0Au8BXQFcgDFgC9/U6XLX2DgZOAxbZ1DwHF1nIx8Edr+XzgHQJzgA4CZlvrWwCrrN/NreXm1ntzrG3F2ndkgq+nHXCStdwY+JLAMDLpfE0CNLKW6wKzrfO/ClxhrX8CuMFa/jnwhLV8BfCKtdzb+vzVA7pYn8tcvz6jwK3AS8Db1ut0v541QKuQden8uZsAXG8t5wHN/LyehP7zUumHQAuvabbXdwB3+J2ukDQWEhw0VgDtrOV2wApr+UngytDtgCuBJ23rn7TWtQOW29YHbZeka5sEnJsp1wQ0AL4gMLrBN0Cd0M8ZgRaDp1rLdaztJPSzV7mdH59RAv2jZhIY0udtK31pez3WedZQM2ik5ecOaAqsxmq0lArXk03FU+GGM4k4bEmKaGOM2WQtbwbaWMuRrqW29RvCrE8KqxijP4En87S+JqsopxTYCkwn8CS9y4QfDqcq7db7uwk0LY/1WhPpr8CvgQrrdW3D+6TD9QAY4D0RmSeB4YcgfT93XYBtwLNWEeLTItIQH68nm4JGWjOBx4C0ax8tIo2A14GbjTF77O+l4zUZY44aY/oReEIfCBzrb4riJyLfAbYaY+b5nRaPnWGMOYnAyNpjRWSw/c00+9zVIVBs/bgxpj+wj0BxVJVkX082BY1Iw5mksi0i0g7A+r3VWh/pWmpb3yHM+oQSkboEAsaLxpg3rNVpfU2VjDG7CIxocCrWcDhh0lGVduv9pgSGz4n1WhPldOBCEVlDYPTpocDfSN/rAcAYs9H6vRV4k0BwT9fP3QZggzFmtvX6NQJBxL/rSXT5Yqr8EIjYqwhk9yor5fr4na6QNBYSXKfxJ4Irux6yli8guLJrjrW+BYHyz+bWz2qghfVeaGXX+Qm+FgGeB/4asj6dr6kAaGYt1wc+Br4D/IfgiuOfW8tjCa44ftVa7kNwxfEqApXGvn1GgbOprghP2+sBGgKNbcuzgBFp/rn7GOhlLd9jXYtv15PwD2Mq/RBoWfAlgXLocX6nJyRtLwObgCMEni6uI1BePBNYCcyw/ZOFwMRWXwGLgCLbca4Fyqyfa2zriwhMmvUV8H+EVKwl4HrOIJBlXgiUWj/np/k1nQjMt65pMfA7a31X64tXRuCGW89an2+9LrPe72o71jgr3SuwtVbx6zNKcNBI2+ux0r7A+llSec40/9z1A0qsz91bBG76vl2PDiOilFLKsWyq01BKKeWSBg2llFKOadBQSinlmAYNpZRSjmnQUEop5ZgGDaWUUo5p0FBKKeXY/wMf1BMwpyWZHwAAAABJRU5ErkJggg==\n", 269 | "text/plain": [ 270 | "
" 271 | ] 272 | }, 273 | "metadata": { 274 | "needs_background": "light" 275 | }, 276 | "output_type": "display_data" 277 | } 278 | ], 279 | "source": [ 280 | "show_wave(\"common_voice_id_19773609.mp3\", 20.0)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 6, 286 | "id": "complex-endorsement", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "from pathlib import Path" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 7, 296 | "id": "greenhouse-wesley", 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "name": "stderr", 301 | "output_type": "stream", 302 | "text": [ 303 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 304 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 305 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "test_dataset = load_dataset(\"common_voice\", \"id\", split='test')\n", 311 | "songs = {Path(row['path']).name: i for i, row in enumerate(test_dataset)}" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 8, 317 | "id": "understood-september", 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "5" 324 | ] 325 | }, 326 | "execution_count": 8, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "songs['common_voice_id_24041300.mp3']" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 82, 338 | "id": "increasing-graham", 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "def transcript(filename):\n", 343 | " index = songs[filename]\n", 344 | " test_dataset = load_dataset(\"common_voice\", \"id\", split=f'test[{index}:{index+1}]')\n", 345 | " test_dataset = test_dataset.map(speech_file_to_array_fn)\n", 346 | " inputs = processor(test_dataset[\"speech\"][:5], sampling_rate=16_000, return_tensors=\"pt\", padding=True)\n", 347 | " \n", 348 | " with torch.no_grad():\n", 349 | " logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits\n", 350 | "\n", 351 | " predicted_ids = torch.argmax(logits, dim=-1)\n", 352 | " print(\"Prediction:\", processor.batch_decode(predicted_ids))\n", 353 | " print(\"Reference:\", test_dataset[\"sentence\"][0])" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 92, 359 | "id": "distant-bottom", 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "name": "stderr", 364 | "output_type": "stream", 365 | "text": [ 366 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 367 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 368 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n" 369 | ] 370 | }, 371 | { 372 | "data": { 373 | "application/vnd.jupyter.widget-view+json": { 374 | "model_id": 
"bf799fb437374e4eafde38fd19b26a14", 375 | "version_major": 2, 376 | "version_minor": 0 377 | }, 378 | "text/plain": [ 379 | "HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))" 380 | ] 381 | }, 382 | "metadata": {}, 383 | "output_type": "display_data" 384 | }, 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | "\n", 390 | "Prediction: ['sit menahkanka']\n", 391 | "Reference: Saya pemimpin regu.\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "transcript(\"common_voice_id_19685341.mp3\")" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 101, 402 | "id": "muslim-bradley", 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stderr", 407 | "output_type": "stream", 408 | "text": [ 409 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 410 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 411 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n" 412 | ] 413 | }, 414 | { 415 | "data": { 416 | "application/vnd.jupyter.widget-view+json": { 417 | "model_id": "883d498fc3fd4d53b99dd57a0b4610cf", 418 | "version_major": 2, 419 | "version_minor": 0 420 | }, 421 | "text/plain": [ 422 | "HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))" 423 | ] 424 | }, 425 | "metadata": {}, 426 | "output_type": "display_data" 427 | }, 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "\n", 433 | "Prediction: ['penemuan ini akan tercata tam sujar']\n", 434 | "Reference: Penemuan ini akan tercatat dalam sejarah.\n" 435 | ] 436 | } 437 | ], 438 | "source": [ 439 | "transcript(\"common_voice_id_20406088.mp3\")" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 18, 445 | "id": "recreational-indie", 446 | "metadata": {}, 447 | "outputs": [ 448 | { 449 | "name": "stderr", 450 | "output_type": "stream", 451 | "text": [ 452 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 453 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 454 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n" 455 | ] 456 | } 457 | ], 458 | "source": [ 459 | "test_dataset = load_dataset(\"common_voice\", \"id\", split=\"test[:2%]\")" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 21, 465 | "id": "wireless-craft", 466 | "metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "{'accent': '',\n", 472 | " 'age': '',\n", 473 | " 'client_id': '4a502bde4cffafbdf5f8b332a6bf9edc33b1421f51d066cf4218d5da72f69ffca7fdf51482c2143a27ee0b19dd65e3a95c31df41848df9dca4af00caf881189e',\n", 474 | " 'down_votes': 0,\n", 475 | " 'gender': '',\n", 476 | " 'locale': 'id',\n", 477 | " 'path': 
'/root/.cache/huggingface/datasets/downloads/extracted/fd8a16a97efd77adba3c26c54d0cfae6c9d9494c1017f8070f3f79db72c4b57c/cv-corpus-6.1-2020-12-11/id/clips/common_voice_id_24041300.mp3',\n", 478 | " 'segment': \"''\",\n", 479 | " 'sentence': 'Coba lakukan dengan cara berbeda.',\n", 480 | " 'up_votes': 2}" 481 | ] 482 | }, 483 | "execution_count": 21, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "test_dataset[5]" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 68, 495 | "id": "functioning-despite", 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stderr", 500 | "output_type": "stream", 501 | "text": [ 502 | "Couldn't find file locally at common_voice/common_voice.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.4.1/datasets/common_voice/common_voice.py.\n", 503 | "The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/common_voice/common_voice.py.\n", 504 | "Reusing dataset common_voice (/root/.cache/huggingface/datasets/common_voice/id/6.1.0/0041e06ab061b91d0a23234a2221e87970a19cf3a81b20901474cffffeb7869f)\n" 505 | ] 506 | } 507 | ], 508 | "source": [ 509 | "test_dataset = load_dataset(\"common_voice\", \"id\", split=\"test[5:6]\")" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 28, 515 | "id": "answering-colonial", 516 | "metadata": {}, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "{'accent': '',\n", 522 | " 'age': '',\n", 523 | " 'client_id': '4a502bde4cffafbdf5f8b332a6bf9edc33b1421f51d066cf4218d5da72f69ffca7fdf51482c2143a27ee0b19dd65e3a95c31df41848df9dca4af00caf881189e',\n", 524 | " 'down_votes': 0,\n", 525 | " 'gender': '',\n", 526 | " 'locale': 'id',\n", 527 | " 'path': '/root/.cache/huggingface/datasets/downloads/extracted/fd8a16a97efd77adba3c26c54d0cfae6c9d9494c1017f8070f3f79db72c4b57c/cv-corpus-6.1-2020-12-11/id/clips/common_voice_id_24041300.mp3',\n", 528 | " 'segment': \"''\",\n", 529 | " 'sentence': 'Coba lakukan dengan cara berbeda.',\n", 530 | " 'up_votes': 2}" 531 | ] 532 | }, 533 | "execution_count": 28, 534 | "metadata": {}, 535 | "output_type": "execute_result" 536 | } 537 | ], 538 | "source": [ 539 | "test_dataset[0]" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "id": "determined-observation", 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [] 549 | } 550 | ], 551 | "metadata": { 552 | "kernelspec": { 553 | "display_name": "Python 3", 554 | "language": "python", 555 | "name": "python3" 556 | }, 557 | "language_info": { 558 | "codemirror_mode": { 559 | "name": "ipython", 560 | "version": 3 561 | }, 562 | "file_extension": ".py", 563 | "mimetype": "text/x-python", 564 | "name": "python", 565 | "nbconvert_exporter": "python", 566 | "pygments_lexer": "ipython3", 567 | "version": "3.7.9" 568 | } 569 | }, 570 | "nbformat": 4, 571 | "nbformat_minor": 5 572 | } 573 | --------------------------------------------------------------------------------