├── .gitignore ├── ADD_NEW_LANGUAGE.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── languages ├── __init__.py ├── default │ ├── README.md │ ├── __init__.py │ ├── normalizer.py │ └── requirements.txt ├── en │ ├── README.md │ ├── __init__.py │ ├── normalizer.py │ └── requirements.txt ├── et │ ├── README.md │ ├── __init__.py │ ├── normalizer.py │ └── requirements.txt ├── fa │ ├── README.md │ ├── __init__.py │ ├── normalizer.py │ └── requirements.txt ├── ga │ ├── README.md │ ├── __init__.py │ ├── normalizer.py │ └── requirements.txt ├── ka │ ├── README.md │ ├── __init__.py │ ├── normalizer.py │ └── requirements.txt ├── lt │ ├── README.md │ ├── __init__.py │ ├── normalizer.py │ └── requirements.txt └── tr │ ├── README.md │ ├── __init__.py │ ├── normalizer.py │ └── requirements.txt ├── notebooks └── .gitkeep ├── scripts └── hyperparam_search.py ├── setup.cfg ├── setup.py ├── src └── wav2vec_toolkit │ ├── __init__.py │ ├── data │ ├── collator.py │ └── dataset.py │ ├── eval.py │ ├── finetune.py │ ├── metrics │ ├── cer.py │ └── chunked_wer.py │ ├── text_preprocessing │ ├── __init__.py │ ├── default.py │ └── normalizers.py │ └── utils.py └── templates └── language ├── README.md ├── __init__.py ├── normalizer.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # TEMP 132 | *.tmp.py -------------------------------------------------------------------------------- /ADD_NEW_LANGUAGE.md: -------------------------------------------------------------------------------- 1 | # Add new language 2 | 3 | If you want to add a new language, you just need to create a folder with the iso code of your language (for example, `ru`), and the folder must consist of the following files: 4 | 5 | ```bash 6 | languages 7 | ├── ru 8 | │   ├── README.md 9 | │   ├── __init__.py 10 | │   ├── normalizer.py 11 | │   └── requirements.txt 12 | ``` 13 | 14 | Or you can just use our template as below: 15 | ```bash 16 | mkdir languages/{YOUR_ISO_CODE_LANGUAGE} 17 | cp templates/language/* languages/{YOUR_ISO_CODE_LANGUAGE} 18 | ``` 19 | 20 | The `__init__.py` have to import your normalizer as below: 21 | 22 | ```python 23 | from .normalizer import Normalizer 24 | ``` 25 | 26 | The `normalizer.py` consists of the normalization procedure related to your specific language. 27 | 28 | - `_whitelist`: The acceptable characters related to your language. 29 | - `_dictionary`: A dictionary of words, characters, or phrases that you want to find and replace before whitelisting. 30 | - `_do_lowercase`: Whether to do lowercase or not. 31 | - `_text_key_name`: The key name of text in the batch input related to `load_dataset` architecture of your audio dataset. 
32 | - `text_level_normalizer()`: A method to add some extra normalization operations at the text level of your language. For example, a spelling correction. 33 | 34 | ```python 35 | from spellchecker import SpellChecker 36 | from typing import Any 37 | from wav2vec_toolkit.text_preprocessing.normalizers import NormalizerOperation 38 | 39 | 40 | spell = SpellChecker(language='ru') 41 | 42 | class Normalizer(NormalizerOperation): 43 | _whitelist = r"[0-9шиюынжсяплзухтвкйеобмцьёгдщэарчфъ\-]+" 44 | _dictionary = {} 45 | _do_lowercase = True 46 | _text_key_name = "sentence" 47 | def text_level_normalizer(self, sentence: str, *args: Any, **kwargs: Any) -> str: 48 | text = super(Normalizer, self).text_level_normalizer(sentence, *args, **kwargs) 49 | 50 | # OTHER OPERATIONS SPECIFIC TO YOUR LANGUAGE COME HERE 51 | words = text.split() 52 | new_text = [] 53 | for word in words: 54 | misspelled = spell.unknown([word]) 55 | if misspelled != set(): 56 | new_text.append(spell.correction(word)) 57 | else: 58 | new_text.append(word) 59 | 60 | text = " ".join(new_text) 61 | return text 62 | ``` 63 | 64 | If your normalization needs extra libraries or packages, you must list them in the `requirements.txt`. For instance: 65 | 66 | ```text 67 | pyspellchecker==0.6.2 68 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License.
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include languages * -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # wav2vec-toolkit 2 | A collection of scripts to preprocess ASR datasets and finetune language-specific Wav2Vec2 XLSR models 3 | 4 | This repository accompanies the 🤗 HuggingFace Community Paper on finetuning Wav2Vec2 XLSR for 5 | low-resource languages **[link]** 6 | 7 | # How to contribute 8 | (Mostly identical to the [huggingface/datasets contributing guide](https://raw.githubusercontent.com/huggingface/datasets/master/CONTRIBUTING.md)) 9 | 10 | 1. Fork the [repository](https://github.com/anton-l/wav2vec-toolkit) by clicking on the 'Fork' button on the repository's page. This creates a copy of the code under your GitHub user account. 11 | 12 | 2. 
Clone your fork to your local disk, and add the base repository as a remote: 13 | 14 | ```bash 15 | git clone git@github.com:{YOUR_GITHUB_HANDLE}/wav2vec-toolkit.git 16 | cd wav2vec-toolkit 17 | git remote add upstream https://github.com/anton-l/wav2vec-toolkit.git 18 | ``` 19 | 20 | 3. Set up a development environment by running the following commands in a virtual environment: 21 | 22 | ```bash 23 | conda create -n env python=3.7 --y 24 | conda activate env 25 | pip install -e ".[dev]" 26 | pip install -r languages/{YOUR_SPECIFIC_LANGUAGE}/requirements.txt 27 | ``` 28 | 29 | (If wav2vec-toolkit was already installed in the virtual environment, remove 30 | it with `pip uninstall wav2vec_toolkit` before reinstalling it in editable 31 | mode with the `-e` flag.) 32 | 33 | 4. Create a new branch to hold your development changes: 34 | 35 | ```bash 36 | git checkout -b a-descriptive-name-for-my-changes 37 | ``` 38 | 39 | **do not** work on the `master` branch. 40 | 41 | 5. Develop the features on your branch. 42 | 1. Adding a new language [here](ADD_NEW_LANGUAGE.md) 43 | 44 | 6. Format your code. Run black and isort so that your newly added files look nice with the following commands: 45 | 46 | ```bash 47 | black --line-length 119 --target-version py36 src scripts languages 48 | isort src scripts languages 49 | ``` 50 | 51 | 7. Once you're happy with your implementation, add your changes and make a commit to record your changes locally: 52 | 53 | ```bash 54 | git add . 55 | git commit 56 | ``` 57 | 58 | It is a good idea to sync your copy of the code with the original 59 | repository regularly. This way you can quickly account for changes: 60 | 61 | ```bash 62 | git fetch upstream 63 | git rebase upstream/main 64 | ``` 65 | 66 | Push the changes to your account using: 67 | 68 | ```bash 69 | git push -u origin a-descriptive-name-for-my-changes 70 | ``` 71 | 72 | 8. Once you are satisfied, go to the webpage of your fork on GitHub.
Click on "Pull request" to send your changes to the project maintainers for review. 73 | -------------------------------------------------------------------------------- /languages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/languages/__init__.py -------------------------------------------------------------------------------- /languages/default/README.md: -------------------------------------------------------------------------------- 1 | # Language Card for [default-Default] 2 | 3 | 4 | ## Language Description 5 | 6 | SHORT DESCRIPTION COMES HERE 7 | 8 | - **Whitelist:** `r"[0-9\w]+"` 9 | - **Characters:** `0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ` 10 | 11 | 12 | ## Extra Description 13 | 14 | DESCRIPTION COMES HERE -------------------------------------------------------------------------------- /languages/default/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalizer import Normalizer 2 | -------------------------------------------------------------------------------- /languages/default/normalizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from wav2vec_toolkit.text_preprocessing.normalizers import NormalizerOperation 4 | 5 | 6 | class Normalizer(NormalizerOperation): 7 | _whitelist = r"[0-9\w]+" 8 | _dictionary = {} 9 | _do_lowercase = True 10 | _text_key_name = "sentence" 11 | -------------------------------------------------------------------------------- /languages/default/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/languages/default/requirements.txt
-------------------------------------------------------------------------------- /languages/en/README.md: -------------------------------------------------------------------------------- 1 | # Language Card for [en-English] 2 | 3 | 4 | ## Language Description 5 | 6 | SHORT DESCRIPTION COMES HERE 7 | 8 | - **Whitelist:** `r"[0-9a-z\-]+"` 9 | - **Characters:** `0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-` 10 | 11 | 12 | ## Extra Description 13 | 14 | DESCRIPTION COMES HERE -------------------------------------------------------------------------------- /languages/en/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalizer import Normalizer 2 | -------------------------------------------------------------------------------- /languages/en/normalizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from wav2vec_toolkit.text_preprocessing.normalizers import NormalizerOperation 4 | 5 | 6 | class Normalizer(NormalizerOperation): 7 | _whitelist = r"[0-9a-z\-]+" 8 | _dictionary = {} 9 | _do_lowercase = True 10 | _text_key_name = "sentence" 11 | -------------------------------------------------------------------------------- /languages/en/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/languages/en/requirements.txt -------------------------------------------------------------------------------- /languages/et/README.md: -------------------------------------------------------------------------------- 1 | # Language Card for [et-Estonian] 2 | 3 | 4 | ## Language Description 5 | 6 | SHORT DESCRIPTION COMES HERE 7 | 8 | - **Whitelist:** `r"[0-9a-zäöõüšž\-]+"` 9 | - **Characters:** `0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZäöõüšžÄÖÕÜŠŽ-` 10 | 11 | 12 | ## Extra
Description 13 | 14 | DESCRIPTION COMES HERE -------------------------------------------------------------------------------- /languages/et/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalizer import Normalizer 2 | -------------------------------------------------------------------------------- /languages/et/normalizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from wav2vec_toolkit.text_preprocessing.normalizers import NormalizerOperation 4 | 5 | 6 | class Normalizer(NormalizerOperation): 7 | _whitelist = r"[0-9a-zäöõüšž\-]+" 8 | _dictionary = {} 9 | _do_lowercase = True 10 | _text_key_name = "sentence" 11 | -------------------------------------------------------------------------------- /languages/et/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/languages/et/requirements.txt -------------------------------------------------------------------------------- /languages/fa/README.md: -------------------------------------------------------------------------------- 1 | # Language Card for [fa-Persian] 2 | 3 | 4 | ## Language Description 5 | 6 | SHORT DESCRIPTION COMES HERE 7 | 8 | - **Whitelist:** `r"[0-9a-z۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\-\u200c]+"` 9 | - **Characters:** `0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\-\u200c` 10 | 11 | 12 | ## Extra Description 13 | 14 | DESCRIPTION COMES HERE -------------------------------------------------------------------------------- /languages/fa/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalizer import Normalizer 2 | --------------------------------------------------------------------------------
/languages/fa/normalizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import hazm 4 | 5 | from wav2vec_toolkit.text_preprocessing.normalizers import NormalizerOperation 6 | 7 | 8 | normalizer = hazm.Normalizer() 9 | 10 | 11 | class Normalizer(NormalizerOperation): 12 | _whitelist = r"[0-9a-z۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\-\u200c]+" 13 | _dictionary = { 14 | "ك": "ک", 15 | "دِ": "د", 16 | "بِ": "ب", 17 | "زِ": "ز", 18 | "ذِ": "ذ", 19 | "شِ": "ش", 20 | "سِ": "س", 21 | "ى": "ی", 22 | "ي": "ی", 23 | "أ": "ا", 24 | "ؤ": "و", 25 | "ے": "ی", 26 | "ۀ": "ه", 27 | "ﭘ": "پ", 28 | "ﮐ": "ک", 29 | "ﯽ": "ی", 30 | "ﺎ": "ا", 31 | "ﺑ": "ب", 32 | "ﺘ": "ت", 33 | "ﺧ": "خ", 34 | "ﺩ": "د", 35 | "ﺱ": "س", 36 | "ﻀ": "ض", 37 | "ﻌ": "ع", 38 | "ﻟ": "ل", 39 | "ﻡ": "م", 40 | "ﻢ": "م", 41 | "ﻪ": "ه", 42 | "ﻮ": "و", 43 | "ﺍ": "ا", 44 | "ة": "ه", 45 | "ﯾ": "ی", 46 | "ﯿ": "ی", 47 | "ﺒ": "ب", 48 | "ﺖ": "ت", 49 | "ﺪ": "د", 50 | "ﺮ": "ر", 51 | "ﺴ": "س", 52 | "ﺷ": "ش", 53 | "ﺸ": "ش", 54 | "ﻋ": "ع", 55 | "ﻤ": "م", 56 | "ﻥ": "ن", 57 | "ﻧ": "ن", 58 | "ﻭ": "و", 59 | "ﺭ": "ر", 60 | "ﮔ": "گ", 61 | "a": " ای ", 62 | "b": " بی ", 63 | "c": " سی ", 64 | "d": " دی ", 65 | "e": " ایی ", 66 | "f": " اف ", 67 | "g": " جی ", 68 | "h": " اچ ", 69 | "i": " آی ", 70 | "j": " جی ", 71 | "k": " کی ", 72 | "l": " ال ", 73 | "m": " ام ", 74 | "n": " ان ", 75 | "o": " او ", 76 | "p": " پی ", 77 | "q": " کیو ", 78 | "r": " آر ", 79 | "s": " اس ", 80 | "t": " تی ", 81 | "u": " یو ", 82 | "v": " وی ", 83 | "w": " دبلیو ", 84 | "x": " اکس ", 85 | "y": " وای ", 86 | "z": " زد ", 87 | "\u200d": " ", 88 | "\u200e": " ", 89 | "\u200f": " ", 90 | "\ufeff": " ", 91 | } 92 | _do_lowercase = True 93 | _text_key_name = "sentence" 94 | 95 | def text_level_normalizer(self, sentence: str, *args: Any, **kwargs: Any) -> str: 96 | text = super(Normalizer, self).text_level_normalizer(sentence, *args, **kwargs) 97 | text = normalizer.normalize(text) 98 | 
return text 99 | -------------------------------------------------------------------------------- /languages/fa/requirements.txt: -------------------------------------------------------------------------------- 1 | hazm==0.7.0 2 | # sentence-transformers -------------------------------------------------------------------------------- /languages/ga/README.md: -------------------------------------------------------------------------------- 1 | # Language Card for [ga-Irish] 2 | 3 | 4 | ## Language Description 5 | 6 | SHORT DESCRIPTION COMES HERE 7 | 8 | - **Whitelist:** `r"[0-9a-záéíóú\-]+"` 9 | - **Characters:** `0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZáéíóúÁÉÍÓÚ-` 10 | 11 | 12 | ## Extra Description 13 | 14 | DESCRIPTION COMES HERE -------------------------------------------------------------------------------- /languages/ga/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalizer import Normalizer 2 | -------------------------------------------------------------------------------- /languages/ga/normalizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from wav2vec_toolkit.text_preprocessing.normalizers import NormalizerOperation 4 | 5 | 6 | def is_upper_vowel(letter): 7 | return letter in ["A", "E", "I", "O", "U", "Á", "É", "Í", "Ó", "Ú"] 8 | 9 | 10 | def irish_lower_word(word): 11 | if len(word) > 1 and word[0] in ["n", "t"] and is_upper_vowel(word[1]): 12 | return word[0] + "-" + word[1:].lower() 13 | else: 14 | return word.lower() 15 | 16 | 17 | class Normalizer(NormalizerOperation): 18 | _whitelist = r"[0-9a-záéíóú\-]+" 19 | _dictionary = {} 20 | _do_lowercase = False 21 | _text_key_name = "sentence" 22 | 23 | def text_level_normalizer(self, sentence: str, *args: Any, **kwargs: Any) -> str: 24 | text = super(Normalizer, self).text_level_normalizer(sentence, *args, **kwargs) 25 | text = "
".join([irish_lower_word(w) for w in text.split(" ")]) 26 | return text 27 | -------------------------------------------------------------------------------- /languages/ga/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/languages/ga/requirements.txt -------------------------------------------------------------------------------- /languages/ka/README.md: -------------------------------------------------------------------------------- 1 | # Language Card for [ka-Georgian] 2 | 3 | 4 | ## Language Description 5 | 6 | SHORT DESCRIPTION COMES HERE 7 | 8 | - **Whitelist:** `r"[0-9აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ\-]+"` 9 | - **Characters:** `0123456789აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ-` 10 | 11 | 12 | ## Extra Description 13 | 14 | DESCRIPTION COMES HERE -------------------------------------------------------------------------------- /languages/ka/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalizer import Normalizer 2 | -------------------------------------------------------------------------------- /languages/ka/normalizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from wav2vec_toolkit.text_preprocessing.normalizers import NormalizerOperation 4 | 5 | 6 | class Normalizer(NormalizerOperation): 7 | _whitelist = r"[0-9აბგდევზთიკლმნოპჟრსტუფქღყშჩცძწჭხჯჰ\-]+" 8 | _dictionary = {} 9 | _do_lowercase = True 10 | _text_key_name = "sentence" 11 | -------------------------------------------------------------------------------- /languages/ka/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/languages/ka/requirements.txt
-------------------------------------------------------------------------------- /languages/lt/README.md: -------------------------------------------------------------------------------- 1 | # Language Card for [lt-Lithuanian] 2 | 3 | 4 | ## Language Description 5 | 6 | SHORT DESCRIPTION COMES HERE 7 | 8 | - **Whitelist:** `r"[0-9a-ząčęėįšųūž\-]+"` 9 | - **Characters:** `0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZąčęėįšųūžĄČĘĖĮŠŲŪŽ-` 10 | 11 | 12 | ## Extra Description 13 | 14 | DESCRIPTION COMES HERE -------------------------------------------------------------------------------- /languages/lt/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalizer import Normalizer 2 | -------------------------------------------------------------------------------- /languages/lt/normalizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from wav2vec_toolkit.text_preprocessing.normalizers import NormalizerOperation 4 | 5 | 6 | class Normalizer(NormalizerOperation): 7 | _whitelist = r"[0-9a-ząčęėįšųūž\-]+" 8 | _dictionary = {} 9 | _do_lowercase = True 10 | _text_key_name = "sentence" 11 | -------------------------------------------------------------------------------- /languages/lt/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/languages/lt/requirements.txt -------------------------------------------------------------------------------- /languages/tr/README.md: -------------------------------------------------------------------------------- 1 | # Language Card for [tr-Turkish] 2 | 3 | 4 | ## Language Description 5 | 6 | SHORT DESCRIPTION COMES HERE 7 | 8 | - **Whitelist:** `r"[0-9a-zçğüöşı\-]+"` 9 | - **Characters:**
`0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZçğüöşıÇĞÜÖŞI-` 10 | 11 | 12 | ## Extra Description 13 | 14 | DESCRIPTION COMES HERE -------------------------------------------------------------------------------- /languages/tr/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalizer import Normalizer 2 | -------------------------------------------------------------------------------- /languages/tr/normalizer.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from wav2vec_toolkit.text_preprocessing.normalizers import NormalizerOperation 4 | 5 | 6 | class Normalizer(NormalizerOperation): 7 | _whitelist = r"[0-9a-zçğüöşı\-]+" 8 | _dictionary = {} 9 | _do_lowercase = True 10 | _text_key_name = "sentence" 11 | -------------------------------------------------------------------------------- /languages/tr/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/languages/tr/requirements.txt -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/notebooks/.gitkeep -------------------------------------------------------------------------------- /scripts/hyperparam_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/scripts/hyperparam_search.py -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | 
license_file = LICENSE 3 | 4 | [isort] 5 | ensure_newline_before_comments = True 6 | force_grid_wrap = 0 7 | include_trailing_comma = True 8 | known_first_party = wav2vec_toolkit 9 | known_third_party = 10 | datasets 11 | fairseq 12 | torchaudio 13 | transformers 14 | tokenizers 15 | 16 | line_length = 119 17 | lines_after_imports = 2 18 | multi_line_output = 3 19 | use_parentheses = True 20 | 21 | [flake8] 22 | ignore = E203, E501, W503 23 | max-line-length = 119 24 | exclude = 25 | notebooks -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import find_packages, setup 3 | 4 | 5 | INSTALL_REQ = [ 6 | "datasets>=1.5.0", 7 | "transformers>=4.4.2", 8 | "torchaudio", 9 | "soundfile", 10 | "audiomentations", 11 | 12 | # language-specific packages 13 | # "hazm", # Farsi 14 | ] 15 | 16 | 17 | EXTRAS_REQ = { 18 | "dev": [ 19 | "black", 20 | "isort", 21 | "flake8==3.7.9", 22 | ], 23 | } 24 | 25 | languages_packages = [ 26 | "wav2vec_toolkit/{}".format(p).replace("/", ".") 27 | for p 28 | in pathlib.Path("languages").glob("**") 29 | ] 30 | 31 | setup( 32 | name="wav2vec_toolkit", 33 | version="0.0.1", 34 | package_dir={ 35 | "wav2vec_toolkit": "src/wav2vec_toolkit", 36 | "wav2vec_toolkit.languages": "languages" 37 | }, 38 | packages=find_packages(where="src") + languages_packages, 39 | include_package_data=True, 40 | url="https://github.com/anton-l/wav2vec-toolkit", 41 | license="Apache 2.0", 42 | author="The HuggingFace community", 43 | author_email="", 44 | description="A collection of scripts to preprocess ASR datasets and finetune language-specific Wav2Vec2 XLSR models", 45 | install_requires=INSTALL_REQ, 46 | extras_require=EXTRAS_REQ, 47 | ) 48 | -------------------------------------------------------------------------------- /src/wav2vec_toolkit/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/src/wav2vec_toolkit/__init__.py -------------------------------------------------------------------------------- /src/wav2vec_toolkit/data/collator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/src/wav2vec_toolkit/data/collator.py -------------------------------------------------------------------------------- /src/wav2vec_toolkit/data/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anton-l/wav2vec-toolkit/4212ea38f3f3e06074b64b484c4b5df606d302a5/src/wav2vec_toolkit/data/dataset.py -------------------------------------------------------------------------------- /src/wav2vec_toolkit/eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | This could be a unified WER eval function that takes a target language as an argument 3 | 4 | E.g. 5 | 6 | from wav2vec_toolkit.finetune import finetune 7 | from wav2vec_toolkit.text_preprocessing.lang import fa 8 | 9 | finetune(model="username/wav2vec-xlsr-fa", 10 | dataset="common_voice", 11 | split="test", 12 | language="fa", 13 | text_preprocessing=fa.normalize) 14 | """ 15 | -------------------------------------------------------------------------------- /src/wav2vec_toolkit/finetune.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main entrypoint for finetuning 3 | 4 | It could be a function that wraps the huggingface Trainer and picks appropriate finetuning 5 | parameters depending on the language 6 | 7 | E.g. 
# ---- src/wav2vec_toolkit/text_preprocessing/default.py ----
# Default text preprocessing for languages not defined in text_processing.lang


def normalize(text, keep_apostrophes=False):
    """Lowercase *text*, blank out non-letters and collapse whitespace.

    Args:
        text (str): A piece of text.
        keep_apostrophes (bool, optional): When True, the typographic
            apostrophe (U+2019) is folded into the ASCII ``'`` and ASCII
            apostrophes are kept instead of being replaced by a space.
            Defaults to False.

    Returns:
        str: The lowercased text containing only alphabetic characters
        (plus ``'`` when requested), with tokens separated by single spaces.
    """
    text = text.lower()

    keep_chars = set()

    if keep_apostrophes:
        # Fold the curly apostrophe into ASCII *before* filtering. The result
        # must be written back to `text`, because `text` is what the filter
        # below iterates over; otherwise U+2019 would be turned into a space.
        text = text.replace("’", "'")
        keep_chars.add("'")

    # replace every non-alphabetic, non-kept character with a space
    filtered = "".join(ch if ch.isalpha() or ch in keep_chars else " " for ch in text)

    # split()/join collapses runs of whitespace and trims both ends
    return " ".join(filtered.split())
# The definitions below come from three files of the dump that share physical
# lines and therefore have to be replaced together:
#   * src/wav2vec_toolkit/text_preprocessing/normalizers.py
#   * src/wav2vec_toolkit/utils.py
#   * templates/language/normalizer.py
# In the repository, normalizers.py imports `load_module_from_lang` from
# `wav2vec_toolkit.utils`, and the template imports `NormalizerOperation` from
# `wav2vec_toolkit.text_preprocessing.normalizers`.
import os
import re
import sys
import textwrap
from typing import Any, Dict, Optional


# ---- src/wav2vec_toolkit/text_preprocessing/normalizers.py ----


class NormalizerOperation:
    """A general normalizer for every language.

    Language packages subclass this and override the class attributes:

    * ``_whitelist``: regex; only its matches survive ``chars_to_preserve``.
    * ``_dictionary``: literal ``source -> replacement`` substitutions.
    * ``_text_key_name``: default key of the text inside a batch dict.
    * ``_do_lowercase``: whether ``__call__`` lowercases the text first.
    """

    _whitelist = r"\w+"
    _dictionary = {}
    _text_key_name: str = "sentence"
    _do_lowercase: bool = True

    def __init__(
        self,
        whitelist: Optional[str] = None,
        dictionary: Optional[Dict[str, str]] = None,
    ) -> None:
        """Build a normalizer, optionally overriding the class-level settings.

        Args:
            whitelist (str, optional): Regex of what to keep. Anything that is
                not a non-empty ``str`` falls back to ``_whitelist``.
            dictionary (Dict[str, str], optional): Replacement map. Anything
                that is not a non-empty ``dict`` falls back to ``_dictionary``.
        """
        self.text_key_name = self._text_key_name
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        self.dictionary = dictionary if dictionary and isinstance(dictionary, dict) else self._dictionary
        self.do_lowercase = self._do_lowercase

    def chars_to_map(self, sentence: str) -> str:
        """Maps every character, words, and phrase into a proper one.

        Args:
            sentence (str): A piece of text.

        Returns:
            str: ``sentence`` with every dictionary key replaced by its value;
            returned untouched when the dictionary is empty.
        """
        if not self.dictionary:
            return sentence

        # Regex alternation picks the first alternative that matches, so a
        # short key that is a prefix of a longer one ("a" vs "ab") would shadow
        # it. Trying longer keys first makes the longest key win.
        keys = sorted(self.dictionary, key=len, reverse=True)
        pattern = "|".join(map(re.escape, keys))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
        self,
        sentence: str,
    ) -> str:
        """Keeps specified characters from sentence

        Args:
            sentence (str): A piece of text.

        Returns:
            str: The whitelist matches (case-insensitive) joined by one space.

        Raises:
            Exception: Whatever ``re.findall`` / ``str.join`` raised for a bad
                whitelist; a hint is printed first and the error re-raised.
        """
        try:
            tokenized = re.findall(self.whitelist, sentence, re.IGNORECASE)
            return " ".join(tokenized)
        except Exception as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

    def text_level_normalizer(self, sentence: str, *args: Any, **kwargs: Any) -> str:
        """A text level of normalization.
        It is handy for some languages that need to add a hierarchy of
        normalization and filtering at the text level.

        The base implementation returns ``sentence`` unchanged; subclasses
        override it.

        Args:
            sentence (str): A piece of text.
        """
        text = sentence
        return text

    def __call__(
        self,
        batch: Dict,
        return_dict: bool = True,
        do_lastspace_removing: bool = True,
        text_key_name: Optional[str] = None,
        do_lowercase: Optional[bool] = None,
        *args: Any,
        **kwargs: Any,
    ) -> Any:
        """Normalization caller

        Pipeline: strip -> optional lowercase -> ``chars_to_map`` ->
        ``chars_to_preserve`` -> ``text_level_normalizer`` -> strip.

        Args:
            batch (Dict): A batch of input. It is updated in place when
                ``return_dict`` is True.
            return_dict (bool, optional): Whether to return dictionary of batch or not just the text. Defaults to True.
            do_lastspace_removing (bool, optional): When True the result has no
                trailing space; when False a single space is appended to the
                normalized text. Defaults to True.
            text_key_name (str, optional): The key name of text in the batch input.
                Falls back to the instance's ``text_key_name``.
            do_lowercase (bool, optional): Whether to do lowercase or not.
                Any non-bool value (including None) falls back to the
                instance's ``do_lowercase``. Defaults to None.
            *args, **kwargs: Forwarded to ``text_level_normalizer``.

        Returns:
            The updated ``batch`` dict, or the normalized ``str`` when
            ``return_dict`` is False.

        Raises:
            KeyError: If the text key is missing from ``batch``.
        """

        text_key_name = text_key_name if text_key_name else self.text_key_name
        do_lowercase = do_lowercase if isinstance(do_lowercase, bool) else self.do_lowercase

        if text_key_name not in batch:
            raise KeyError(
                textwrap.dedent(
                    f"""
                    keyname {text_key_name} not existed in the batch dictionary,
                    the batch dictionary consists of the following keys {list(batch.keys())},
                    you can easily add a new keyname by passing the `text_key_name` into Normalizer.
                    """
                )
            )

        text = batch[text_key_name].strip()

        if do_lowercase:
            text = text.lower()

        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text, *args, **kwargs)

        text = text.strip()
        if not do_lastspace_removing:
            text = text + " "

        if not return_dict:
            return text

        batch[text_key_name] = text
        return batch


def normalizers(lang: str) -> NormalizerOperation:
    """Instantiate the ``Normalizer`` of the language package ``lang``.

    Raises:
        AttributeError: If the language module exposes no ``Normalizer``.
    """
    normalizer_cls = load_module_from_lang(lang)
    if normalizer_cls is None:
        # Fail with a clear message instead of the opaque
        # "'NoneType' object is not callable".
        raise AttributeError(f"language module `{lang}` does not define a `Normalizer` class")
    return normalizer_cls()


# ---- src/wav2vec_toolkit/utils.py ----

BASE_PATH = "wav2vec_toolkit"
LANG_PATH = "languages"
LANG_MODULE_REQUIREMENTS = ["normalizer.py", "README.md", "requirements.txt"]


def get_file_path(name: str):
    """Return the absolute path of ``name`` relative to this module's directory."""
    return os.path.abspath(os.path.join(os.path.dirname(__file__), name))


def parse_requirements(filename: str):
    """Return the non-empty, non-comment lines of a requirements file.

    Lines are stripped; blank lines and lines starting with ``#`` are dropped.
    """
    # The context manager closes the handle; the previous bare open() leaked it.
    with open(filename) as handle:
        stripped = [line.strip() for line in handle]
    return [line for line in stripped if line and not line.startswith("#")]


def load_module_from_lang(lang: str):
    """Validate and import the language package ``lang``.

    Steps: check that every file in ``LANG_MODULE_REQUIREMENTS`` exists, check
    the package's ``requirements.txt`` against the installed distributions,
    then import ``wav2vec_toolkit.languages.<lang>``.

    Returns:
        The module's ``Normalizer`` attribute, or None when it is absent.

    Raises:
        FileNotFoundError: A required file is missing from the language dir.
        pkg_resources.VersionConflict / pkg_resources.DistributionNotFound:
            A requirement is not satisfied (a hint is printed first).
        ModuleNotFoundError: The language module cannot be imported.
    """
    # Imported here so that merely importing this module does not require
    # setuptools; it is only needed for the requirement check below.
    import pkg_resources

    lang_mod_path = f"{BASE_PATH}/{LANG_PATH}/{lang}"
    # Path relative to this module's directory: "languages/<lang>".
    lang_path = "/".join(lang_mod_path.split("/")[-2:])
    for path in LANG_MODULE_REQUIREMENTS:
        _path = get_file_path(os.path.join(lang_path, path))
        if not os.path.exists(_path):
            raise FileNotFoundError(
                textwrap.dedent(
                    f"""
                    The filename {path} not existed in `{lang}` directory {_path},
                    you can easily add a new language by instructions mentioned at repo.
                    https://github.com/anton-l/wav2vec-toolkit/tree/master#adding-new-languages
                    """
                )
            )

    requirements_txt = get_file_path(os.path.join(lang_path, "requirements.txt"))
    dependencies = parse_requirements(requirements_txt)
    try:
        pkg_resources.require(dependencies)
    except pkg_resources.VersionConflict as error:
        print(
            textwrap.dedent(
                f"""
                {error.dist} is installed but {error.req} is required,
                fastest solution `pip install -r lang/{lang}/requirements.txt`,
                you can easily add a new language by instructions mentioned at repo.
                https://github.com/anton-l/wav2vec-toolkit/tree/master#adding-new-languages
                """
            )
        )
        raise

    except pkg_resources.DistributionNotFound as error:
        print(
            textwrap.dedent(
                f"""
                The '{error.req}' distribution was not found and is required by {error.requirers_str},
                fastest solution `pip install -r lang/{lang}/requirements.txt`,
                you can easily add a new language by instructions mentioned at repo.
                https://github.com/anton-l/wav2vec-toolkit/tree/master#adding-new-languages
                """
            )
        )
        raise

    try:
        module = __import__(lang_mod_path.replace("/", "."), fromlist=["Normalizer"])
    except ModuleNotFoundError:
        print(
            textwrap.dedent(
                f"""
                something wrong happened with your language {lang},
                you can easily add a new language by instructions mentioned at repo.
                https://github.com/anton-l/wav2vec-toolkit/tree/master#adding-new-languages
                """
            )
        )
        raise

    # A default is required here: two-argument getattr raises AttributeError
    # for a missing attribute, which made the intended None fallback
    # unreachable.
    normalizer = getattr(module, "Normalizer", None)
    return normalizer if normalizer else None


# ---- templates/language/normalizer.py ----


class Normalizer(NormalizerOperation):
    """Template normalizer to copy when adding a new language."""

    _whitelist = r"[0-9\w]+"
    _dictionary = {}
    _do_lowercase = True
    _text_key_name = "sentence"

    def text_level_normalizer(self, sentence: str, *args: Any, **kwargs: Any) -> str:
        """Language-specific text-level hook; the template only delegates to the base class."""
        text = super().text_level_normalizer(sentence, *args, **kwargs)

        # DO OTHER OPERATIONS REGARDING YOURS, COMES HERE
        # text = ...

        return text