├── .github └── workflows │ ├── docs.yaml │ └── pypi.yaml ├── .gitignore ├── README.md ├── devrequirements.txt ├── docs ├── css │ └── custom.css ├── examples.md ├── faqs.md ├── getting-started.md ├── img │ └── favicon.ico ├── index.md └── models.md ├── docs_theme └── main.html ├── mkdocs.yml ├── requirements.txt ├── setup.py └── speechtoolkit ├── __init__.py ├── asr ├── __init__.py ├── distilwhisper_lib.py └── whisper_lib.py ├── classification ├── __init__.py └── languageclassification.py ├── data ├── __init__.py └── languages.py ├── tts ├── __init__.py └── styletts2_lib.py ├── utils ├── __init__.py └── device.py └── vc ├── __init__.py ├── lvc_lib.py └── ns3vc_lib.py /.github/workflows/docs.yaml: -------------------------------------------------------------------------------- 1 | name: Publish docs via GitHub Pages 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | build: 9 | name: Deploy docs 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | steps: 14 | - name: Checkout main 15 | uses: actions/checkout@v2 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install build 20 | - name: Deploy docs 21 | uses: mhausenblas/mkdocs-deploy-gh-pages@nomaterial 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | CONFIG_FILE: mkdocs.yml 25 | REQUIREMENTS: devrequirements.txt -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SpeechToolkit 2 | 3 | **NOTE: This project is still in an early alpha stage and is not ready for production yet.** 4 | 5 | A unified framework for text-to-speech, voice conversion, automatic speech recognition, audio classification, and more! 6 | 7 | Please note that this toolkit is currently in an early alpha and not all features have been implemented. 8 | 9 | If you prefer not to use SpeechToolkit but would like to interact with models individually and separately, please check out the [ML for Speech](https://github.com/ml-for-speech) page. 10 | 11 | ## Implemented Features 12 | 13 | - [x] Text-to-speech 14 | - [x] StyleTTS 2 15 | - [ ] MetaVoice 16 | - [ ] Parler TTS 17 | - [ ] XTTS 18 | - [x] Voice conversion 19 | - [x] LVC-VC 20 | - [x] NaturalSpeech3 Voice Conversion 21 | - [ ] StyleTTS2-VC 22 | - [x] Automatic speech recognition 23 | - [x] Whisper 24 | - [x] Distil-Whisper 25 | - [ ] Canary 26 | - [x] Audio classification 27 | - [x] Language detection 28 | 29 | ## Installation & Usage 30 | 31 | Documentation is available [online](https://ml-for-speech.github.io/speechtoolkit). A minimal usage example is also included at the end of this README. 32 | 33 | ## Disclaimer 34 | 35 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 36 | 37 | Provided models may make mistakes. 38 | 39 | THE MODEL IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS MODEL INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS MODEL.
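## Quick Example

A minimal speech-recognition example, mirroring the [documentation examples](https://ml-for-speech.github.io/speechtoolkit). It assumes the Whisper dependency is installed (e.g. via `pip install speechtoolkit[all]`) and that `audio.wav` is a placeholder for your own audio file:

```python
from speechtoolkit.asr import WhisperModel

# Load the default (base) Whisper model; device='auto' picks CUDA if available, otherwise CPU.
model = WhisperModel(device='auto')

# Transcribe a local audio file and print the resulting text.
print(model.infer_file('audio.wav'))
```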
40 | -------------------------------------------------------------------------------- /devrequirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs -------------------------------------------------------------------------------- /docs/css/custom.css: -------------------------------------------------------------------------------- 1 | div.autodoc-docstring { 2 | padding-left: 20px; 3 | margin-bottom: 30px; 4 | border-left: 5px solid rgb(230, 230, 230); 5 | } 6 | 7 | div.autodoc-members { 8 | padding-left: 20px; 9 | margin-bottom: 15px; 10 | } -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## Text-to-Speech 4 | 5 | ```python 6 | from speechtoolkit.tts import SingleSpeakerStyleTTS2Model 7 | 8 | model = SingleSpeakerStyleTTS2Model() 9 | 10 | model.infer_to_file('Hello, this is a test', 'out.wav') 11 | ``` 12 | 13 | **Multi-speaker StyleTTS 2 with zero-shot voice cloning:** 14 | 15 | ```python 16 | from speechtoolkit.tts import MultiSpeakerStyleTTS2Model 17 | 18 | model = MultiSpeakerStyleTTS2Model() 19 | 20 | model.infer_to_file('Hello, this is a test', 'sample.wav', 'out.wav') 21 | ``` 22 | 23 | ## Automatic Speech Recognition 24 | 25 | ```python 26 | from speechtoolkit.asr import WhisperModel 27 | 28 | model = WhisperModel() 29 | 30 | model.infer_file('audio.wav') 31 | ``` 32 | 33 | **With a larger model:** 34 | 35 | ```python 36 | from speechtoolkit.asr import WhisperModel 37 | 38 | model = WhisperModel('medium') 39 | 40 | model.infer_file('audio.wav') 41 | ``` 42 | 43 | **With DistilWhisper:** 44 | 45 | ```python 46 | from speechtoolkit.asr import DistilWhisperModel 47 | 48 | model = DistilWhisperModel() 49 | 50 | model.infer_file('audio.wav') 51 | ``` 52 | 53 | ## Voice Conversion 54 | 55 | ```python 56 | from speechtoolkit.vc import LVCModel 57 | 58 | vc = LVCModel(device='auto') 59 | 60 | vc.infer_file( 61 | 'original.wav', 62 | 'sample.wav', 63 | 'out.wav' 64 | ) 65 | ``` 66 | 67 | ## Language Classification 68 | 69 | ```python 70 | from speechtoolkit.classification.languageclassification import WhisperLanguageClassifierModel 71 | 72 | lc = WhisperLanguageClassifierModel() 73 | 74 | lc.infer_file('audio.wav') # 'en' 75 | ``` 76 | 77 | ## Accent Classification 78 | 79 | ```python 80 | from speechtoolkit.classification.accentclassification import EdAccAccentClassifierModel 81 | 82 | ac = EdAccAccentClassifierModel() 83 | 84 | ac.infer_file('audio.wav') # 'Mainstream US English' 85 | ``` 86 | -------------------------------------------------------------------------------- /docs/faqs.md: -------------------------------------------------------------------------------- 1 | # FAQs 2 | 3 | ## What is SpeechToolkit? 4 | 5 | Please refer to the [introduction](index.md) for more details. 6 | 7 | ## Are all models trained by SpeechToolkit? 8 | 9 | No, SpeechToolkit is primarily composed of third-party models. SpeechToolkit provides a unified, simple Python API to access these models. However, SpeechToolkit does provide some trained models. 10 | 11 | ## Is Apple Silicon (MPS) supported? 12 | 13 | Unfortunately, PyTorch does not yet support all operations on MPS. For simplicity, MPS support is currently disabled across all models; however, it may be supported on a case-by-case basis in the future.
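## How do I choose which device a model runs on?

Most model classes accept a `device` argument. The default, `'auto'`, resolves to CUDA when a GPU is available and falls back to CPU otherwise (see `speechtoolkit/utils/device.py`). A minimal sketch using the Whisper ASR wrapper as an example:

```python
from speechtoolkit.asr import WhisperModel

# 'auto' (the default) selects CUDA when torch.cuda.is_available(), otherwise CPU.
model = WhisperModel(device='auto')

# You can also pin a model to a specific device explicitly.
cpu_model = WhisperModel(device='cpu')
```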
-------------------------------------------------------------------------------- /docs/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | ## Installation 4 | 5 | Installing SpeechToolkit is quite easy! 6 | 7 | ### Basic Installation 8 | 9 | Use this for testing, development, etc. Note that it will download the packages for *all* models (text-to-speech, voice cloning, etc.), so it will be much slower and larger than a task-specific installation. 10 | 11 | If you're unsure which option to choose, use the basic installation. 12 | 13 | ``` 14 | pip install speechtoolkit[all] 15 | ``` 16 | 17 | ### Advanced Installation 18 | 19 | Recommended for production deployment. 20 | 21 | Install the SpeechToolkit core without any extras, then add only the task-specific extras you need (`asr`, `tts`, `vc`, and `fa2`, as defined in `setup.py`): 22 | 23 | ``` 24 | pip install speechtoolkit 25 | ``` 26 | 27 | ## Basic Usage 28 | 29 | Now that you've successfully installed SpeechToolkit, it's time to run some models! Head over to the [examples](examples.md) page to see some basic examples. -------------------------------------------------------------------------------- /docs/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/docs/img/favicon.ico -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | SpeechToolkit is an all-in-one, end-to-end toolkit for ML in speech. It aims to simplify the usage of text-to-speech, automatic speech recognition, and voice conversion models. 4 | 5 | ## Why SpeechToolkit? 6 | 7 | Almost every model uses a different Python API. If you wanted to integrate them into your project, you'd need to write custom code for each model. Switching to a different model would require significant changes. 8 | 9 | SpeechToolkit aims to solve this by providing a centralized, unified, easy-to-use Python API for speech models. Instead of having to rewrite your program to support a new model, you can simply change a couple of lines of code with SpeechToolkit. 10 | 11 | In addition, SpeechToolkit packages these models into a simple, PyPI-installable package. This not only makes code management easier, but also can help mitigate potential licensing issues. 12 | 13 | ## Packages 14 | 15 | SpeechToolkit supports many different third-party open-access models, as well as some models developed by ML for Speech. While these models are mostly available through the SpeechToolkit package, we've packaged many of these models individually if you don't want to use the SpeechToolkit library. 16 | 17 | ## Get Started 18 | 19 | Visit the [Getting Started](getting-started.md) page to get started! -------------------------------------------------------------------------------- /docs/models.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | SpeechToolkit supports several different models for various tasks.
4 | 5 | Note that the license "Same" indicates that this is 6 | 7 | ## Text-to-Speech 8 | 9 | Below is a list of models supported for text-to-speech: 10 | 11 | | Name | License | Link | 12 | | ------- | ------- | ----------------------------------------------- | 13 | | StyleTTS 2 | MIT | [Repository](https://github.com/yl4579/StyleTTS2) | 14 | 15 | Note: StyleTTS 2 by default uses a GPL-licensed phonemizer but we've replaced it with the BSD-licensed [OpenPhonemizer](https://github.com/NeuralVox/OpenPhonemizer). 16 | 17 | ## Automatic Speech Recognition 18 | 19 | Below is a list of supported models for automatic speech recognition. 20 | 21 | | Name | License | Link | 22 | | ------- | ------- | ----------------------------------------------- | 23 | | Whisper | MIT | [Repository](https://github.com/openai/whisper) | 24 | 25 | ## Speech Classification 26 | 27 | **NOTE: Classification models are not very accurate yet.** 28 | 29 | SpeechToolkit supports several different types of speech classification. These models are trained by ML for Speech. 30 | 31 | | Version | Task | Link | 32 | | ------- | ----------------------- | --------------------------------------------------------------------- | 33 | | V1 | Language Classification | [Model](https://huggingface.co/ml-for-speech/language-classification) | 34 | 35 | ## Voice Conversion 36 | 37 | Below is a list of supported models for voice conversion. 38 | 39 | | Name | License | Link | 40 | | ------ | ------- | ---------------------------------------------------- | 41 | | LVC-VC | MIT | [Repository](https://github.com/wonjune-kang/lvc-vc) | 42 | | NS3VC | MIT | [Repository](https://github.com/open-mmlab/Amphion) | 43 | 44 | ## A Short Guide to Licenses 45 | 46 | Note that this is not legal advice. 47 | 48 | Please note that models may have a different license than SpeechToolkit. If this is the case, you must comply with both SpeechToolkit *and* the license of the model. 49 | 50 | If you're wondering whether or not you can use a model commercially, you should check both the model's license and the pretrained weights' license. The MIT, Apache 2.0, and BSD licenses typically allow commercial use, unless otherwise specified by the authors. However, the BSD-4-Clause license requires you to provide attribution to the author in certain marketing materials (read the full license for details). If the license name includes "NC," it is likely a non-commercial license, which means you cannot use it commercially. Also note that some models may be trained on copyrighted content, which, depending on your jurisdiction, may influence the ability for you to use the models. 51 | 52 | Before using models, you should carefully read their licenses. 53 | 54 | ## Disclaimer 55 | 56 | Disclaimer for models trained by SpeechToolkit: 57 | 58 | THE MODEL IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS MODEL INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS MODEL. 59 | -------------------------------------------------------------------------------- /docs_theme/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {%- block footer %} 4 |
5 | {%- if config.copyright %} 6 | {{ config.copyright }}
7 | {%- endif %} 8 | 9 | {%- endblock %} -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: SpeechToolkit 2 | copyright: '© 2024 ML for Speech. All docs are on our GitHub Repository.' 3 | # repo_url: https://github.com/ml-for-speech/speechtoolkit 4 | 5 | theme: 6 | name: mkdocs 7 | highlightjs: true 8 | hljs_languages: 9 | - python 10 | - yaml 11 | - json 12 | shortcuts: 13 | help: 191 # ? 14 | next: 78 # n 15 | previous: 80 # p 16 | search: 191 # / 17 | custom_dir: docs_theme 18 | markdown_extensions: 19 | - admonition 20 | - codehilite 21 | extra_css: 22 | - css/custom.css -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e . -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | extras = { 4 | "vc": [ 5 | "lvc", 6 | "ns3vc", 7 | ], 8 | "fa2": [ 9 | "flash-attn", 10 | ], 11 | "asr": [ 12 | "openai-whisper", 13 | ], 14 | "tts": [ 15 | "mfs-styletts2", 16 | ], 17 | "dev": [ 18 | "mkdocs", 19 | "mkautodoc", 20 | ], 21 | } 22 | 23 | extra_pkgs = extras 24 | final = [] 25 | 26 | for k in extras: 27 | if not k == "dev": 28 | final += extras[k] 29 | 30 | extra_pkgs["all"] = final 31 | 32 | with open("README.md", "r") as f: 33 | longdesc = f.read() 34 | setup( 35 | name="speechtoolkit", 36 | version="0.0.5", 37 | author="ml-for-speech", 38 | description="ML for Speech presents SpeechToolkit, a unified, all-in-one toolkit for TTS, ASR, VC, & other models.", 39 | long_description=longdesc, 40 | long_description_content_type="text/markdown", 41 | url="https://github.com/ml-for-speech/speechtoolkit", 42 | packages=find_packages(), 43 | install_requires=[ 44 | "soundfile", 45 | "librosa", 46 | "transformers", 47 | "torch", 48 | "optimum", 49 | "txtsplit", 50 | ], 51 | extras_require=extra_pkgs, 52 | ) 53 | -------------------------------------------------------------------------------- /speechtoolkit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/speechtoolkit/__init__.py -------------------------------------------------------------------------------- /speechtoolkit/asr/__init__.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.asr.whisper_lib import WhisperModel 2 | from speechtoolkit.asr.distilwhisper_lib import DistilWhisperModel 3 | -------------------------------------------------------------------------------- /speechtoolkit/asr/distilwhisper_lib.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.utils.device import device_map 2 | 3 | 4 | class DistilWhisperModel: 5 | """ 6 | Use DistilWhisper for automatic speech recognition. Supports significant speedups. 7 | 8 | Supports several speedups (Flash Attention 2 & BetterTransformer), borrowed from [insanely-fast-whisper](https://github.com/Vaibhavs10/insanely-fast-whisper). 9 | 10 | **Args** 11 | 12 | model (str): Which Whisper model to use on the Hugging Face Hub 13 | device (str): The device to use. 
Defaults to 'auto' 14 | use_fa2 (bool): Use Flash Attention 2 (significant speedup). Incompatible with BetterTransformer. Only works on CUDA GPUs. 15 | use_bettertransformer (bool): Use BetterTransformer (speedup). Incompatible with Flash Attention 2. If available, use Flash Attention 2 instead. 16 | **kwargs: Additional arguments to pass to Whisper package 17 | """ 18 | 19 | def __init__( 20 | self, 21 | model="distil-whisper/distil-large-v3", 22 | use_fa2=False, 23 | use_bettertransformer=False, 24 | device="auto", 25 | **kwargs, 26 | ): 27 | """ 28 | Initialize model. 29 | 30 | **Args** 31 | 32 | model (str): Which Whisper model to use on the Hugging Face Hub 33 | device (str): The device to use. Defaults to 'auto' 34 | use_fa2 (bool): Use Flash Attention 2 (significant speedup). Incompatible with BetterTransformer. Only works on CUDA GPUs. 35 | use_bettertransformer (bool): Use BetterTransformer (speedup). Incompatible with Flash Attention 2. If available, use Flash Attention 2 instead. 36 | **kwargs: Additional arguments to pass to Whisper package 37 | """ 38 | if use_bettertransformer and use_fa2: 39 | raise ValueError( 40 | "You cannot use both BetterTransformer and Flash Attention 2 at the same time. Typically, Flash Attention 2 provides a better speedup." 41 | ) 42 | from transformers import pipeline 43 | 44 | model_kwargs = {} 45 | if use_fa2: 46 | model_kwargs = {"attn_implementation": "flash_attention_2"} 47 | self.model = pipeline( 48 | "automatic-speech-recognition", 49 | model, 50 | device=device_map(device), 51 | model_kwargs=model_kwargs, 52 | **kwargs, 53 | ) 54 | if use_bettertransformer: 55 | self.model.model.to_bettertransformer() 56 | 57 | def infer_file(self, audio_path, **kwargs): 58 | """ 59 | Run inference on a single file. 60 | 61 | **Args** 62 | 63 | audio_path (str): The path of the original audio. 64 | **kwargs: Additional arguments to pass to Whisper package 65 | 66 | **Returns** 67 | 68 | str: The transcript of the audio file. 69 | """ 70 | return self.model(audio_path, **kwargs)["text"].strip() 71 | -------------------------------------------------------------------------------- /speechtoolkit/asr/whisper_lib.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.utils.device import device_map 2 | 3 | 4 | class WhisperModel: 5 | """ 6 | Use OpenAI Whisper for automatic speech recognition. 7 | 8 | **Args** 9 | 10 | model (str): Which Whisper model to use 11 | device (str): The device to use. Defaults to 'auto' 12 | **kwargs: Additional arguments to pass to Whisper package 13 | """ 14 | 15 | def __init__( 16 | self, 17 | model="base", 18 | device="auto", 19 | **kwargs, 20 | ): 21 | """ 22 | Initialize model. 23 | 24 | **Args** 25 | 26 | model (str): Which Whisper model to use 27 | device (str): The device to use. Defaults to 'auto' 28 | **kwargs: Additional arguments to pass to Whisper package 29 | """ 30 | import whisper 31 | 32 | self.model = whisper.load_model(model, **kwargs).to(device_map(device)) 33 | 34 | def infer_file(self, audio_path, **kwargs): 35 | """ 36 | Run inference on a single file. 37 | 38 | **Args** 39 | 40 | audio_path (str): The path of the original audio. 41 | **kwargs: Additional arguments to pass to Whisper package 42 | 43 | **Returns** 44 | 45 | str: The transcript of the audio file. 
46 | """ 47 | return self.model.transcribe(audio_path, **kwargs)["text"].strip() 48 | -------------------------------------------------------------------------------- /speechtoolkit/classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/speechtoolkit/classification/__init__.py -------------------------------------------------------------------------------- /speechtoolkit/classification/languageclassification.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | from speechtoolkit.data.languages import language_codes 3 | from speechtoolkit.utils.device import device_map 4 | 5 | 6 | class WhisperLanguageClassifierModel: 7 | """ 8 | Use a Whisper-based language classification model. 9 | 10 | **Args** 11 | 12 | device (str): The device to use. Defaults to 'auto' 13 | model (str): The model ID to use on the Hugging Face Hub. 14 | **kwargs: Additional arguments to pass to package 15 | """ 16 | 17 | def __init__( 18 | self, 19 | device="auto", 20 | model="ml-for-speech/language-classification", 21 | **kwargs, 22 | ): 23 | """ 24 | Initialize model. 25 | 26 | **Args** 27 | 28 | device (str): The device to use. Defaults to 'auto' 29 | model (str): The model ID to use on the Hugging Face Hub. 30 | **kwargs: Additional arguments to pass to package 31 | """ 32 | self.pipe = pipeline("audio-classification", model, device=device_map(device)) 33 | 34 | def infer_file(self, file_path): 35 | """ 36 | Run inference on a single file. 37 | 38 | **Args** 39 | 40 | file_path (str): The path of the audio to classify. 41 | 42 | **Returns** 43 | 44 | str: The language ISO language code of the detected language. 
45 | """ 46 | result = self.pipe(file_path)[0]["label"] 47 | if result in language_codes.keys(): 48 | return language_codes[result] 49 | else: 50 | return result 51 | -------------------------------------------------------------------------------- /speechtoolkit/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/speechtoolkit/data/__init__.py -------------------------------------------------------------------------------- /speechtoolkit/data/languages.py: -------------------------------------------------------------------------------- 1 | language_codes = { 2 | "Arabic": "ar", 3 | "Basque": "eu", 4 | "Breton": "br", 5 | "Catalan": "ca", 6 | "Chinese_China": "zh-cn", 7 | "Chinese_Hongkong": "zh-hk", 8 | "Chinese_Taiwan": "zh-tw", 9 | "Chuvash": "cv", 10 | "Czech": "cs", 11 | "Dhivehi": "dv", 12 | "Dutch": "nl", 13 | "English": "en", 14 | "Esperanto": "eo", 15 | "Estonian": "et", 16 | "French": "fr", 17 | "Frisian": "fy", 18 | "Georgian": "ka", 19 | "German": "de", 20 | "Greek": "el", 21 | "Hakha_Chin": "cnh", 22 | "Indonesian": "id", 23 | "Interlingua": "ia", 24 | "Italian": "it", 25 | "Japanese": "ja", 26 | "Kabyle": "kab", 27 | "Kinyarwanda": "rw", 28 | "Kyrgyz": "ky", 29 | "Latvian": "lv", 30 | "Maltese": "mt", 31 | "Mangolian": "mn", 32 | "Persian": "fa", 33 | "Polish": "pl", 34 | "Portuguese": "pt", 35 | "Romanian": "ro", 36 | "Romansh_Sursilvan": "rm", 37 | "Russian": "ru", 38 | "Sakha": "sah", 39 | "Slovenian": "sl", 40 | "Spanish": "es", 41 | "Swedish": "sv", 42 | "Tamil": "ta", 43 | "Tatar": "tt", 44 | "Turkish": "tr", 45 | "Ukranian": "uk", 46 | "Welsh": "cy", 47 | } 48 | -------------------------------------------------------------------------------- /speechtoolkit/tts/__init__.py: -------------------------------------------------------------------------------- 1 | from .styletts2_lib import MultiSpeakerStyleTTS2Model, SingleSpeakerStyleTTS2Model -------------------------------------------------------------------------------- /speechtoolkit/tts/styletts2_lib.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.utils.device import device_map 2 | from txtsplit import txtsplit 3 | 4 | 5 | class MultiSpeakerStyleTTS2Model: 6 | """ 7 | Text to Speech with StyleTTS 2 8 | 9 | **Args** 10 | 11 | device (str): The device to use. Defaults to 'auto' 12 | **kwargs: Additional arguments to pass to Whisper package 13 | """ 14 | 15 | def __init__( 16 | self, 17 | device='auto', 18 | **kwargs, 19 | ): 20 | """ 21 | Initialize model. 
22 | 23 | **Args** 24 | 25 | **kwargs: Additional arguments to pass to Whisper package 26 | """ 27 | from mfs_styletts2.zeroshot import LFinference, compute_style 28 | from openphonemizer import OpenPhonemizer 29 | import numpy as np 30 | from scipy.io.wavfile import write 31 | 32 | self.np = np 33 | self.write = write 34 | self.LFinference = LFinference 35 | self.compute_style = compute_style 36 | self.phonemizer = OpenPhonemizer() 37 | 38 | def infer_to_file( 39 | self, 40 | text, 41 | sample, 42 | output, 43 | ): 44 | s_ref = self.compute_style(sample) 45 | sentences = txtsplit(self.phonemizer(text)) 46 | wavs = [] 47 | s_prev = None 48 | for text in sentences: 49 | if text.strip() == "": 50 | continue 51 | wav, s_prev = self.LFinference( 52 | text, 53 | s_prev, 54 | s_ref, 55 | alpha=0.3, 56 | beta=0.9, 57 | t=0.7, 58 | diffusion_steps=10, 59 | embedding_scale=1.1, 60 | phonemize=False 61 | ) 62 | wavs.append(wav) 63 | self.write(output, 24000, self.np.concatenate(wavs)) 64 | 65 | class SingleSpeakerStyleTTS2Model: 66 | """ 67 | Text to Speech with StyleTTS 2 68 | 69 | **Args** 70 | 71 | **kwargs: Additional arguments to pass to Whisper package 72 | """ 73 | 74 | def __init__( 75 | self, 76 | **kwargs, 77 | ): 78 | """ 79 | Initialize model. 80 | 81 | **Args** 82 | 83 | device (str): The device to use. Defaults to 'auto' 84 | **kwargs: Additional arguments to pass to Whisper package 85 | """ 86 | from mfs_styletts2.lj import LFinference, compute_style 87 | from openphonemizer import OpenPhonemizer 88 | import numpy as np 89 | from scipy.io.wavfile import write 90 | import torch 91 | 92 | self.torch = torch 93 | self.np = np 94 | self.write = write 95 | self.LFinference = LFinference 96 | self.compute_style = compute_style 97 | self.phonemizer = OpenPhonemizer() 98 | 99 | def infer_to_file( 100 | self, 101 | text, 102 | output, 103 | ): 104 | sentences = txtsplit(self.phonemizer(text)) 105 | wavs = [] 106 | s_prev = None 107 | for text in sentences: 108 | if text.strip() == "": 109 | continue 110 | noise = self.torch.randn(1, 1, 256).to('cuda' if self.torch.cuda.is_available() else 'cpu') 111 | wav, s_prev = self.LFinference( 112 | text, 113 | s_prev, 114 | noise, 115 | alpha=0.3, 116 | diffusion_steps=10, 117 | embedding_scale=1.1, 118 | phonemize=False 119 | ) 120 | wavs.append(wav) 121 | self.write(output, 24000, self.np.concatenate(wavs)) 122 | -------------------------------------------------------------------------------- /speechtoolkit/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/speechtoolkit/utils/__init__.py -------------------------------------------------------------------------------- /speechtoolkit/utils/device.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def device_map(device): 5 | if device == "auto": 6 | device = "cuda" if torch.cuda.is_available() else "cpu" 7 | if type(device) == str: 8 | device = torch.device(device) 9 | return device 10 | -------------------------------------------------------------------------------- /speechtoolkit/vc/__init__.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.vc.lvc_lib import LVCModel 2 | from speechtoolkit.vc.ns3vc_lib import NS3VCModel 3 | -------------------------------------------------------------------------------- /speechtoolkit/vc/lvc_lib.py: 
-------------------------------------------------------------------------------- 1 | class LVCModel: 2 | """ 3 | Use LVC-VC (End-to-End Zero-Shot Voice Conversion with Location-Variable Convolutions) for zero-shot voice conversion. 4 | 5 | **Args** 6 | 7 | device (str): The device to use. Defaults to 'auto' 8 | use_xl_model (bool): Use the XL model vs. the smaller model. Defaults to 'true' 9 | **kwargs: Additional arguments to pass to package 10 | """ 11 | 12 | def __init__( 13 | self, 14 | device="auto", 15 | use_xl_model=True, 16 | **kwargs, 17 | ): 18 | """ 19 | Initialize model. 20 | 21 | **Args** 22 | 23 | device (str): The device to use. Defaults to 'auto' 24 | use_xl_model (bool): Use the XL model vs. the smaller model. Defaults to 'true' 25 | **kwargs: Additional arguments to pass to package 26 | """ 27 | from lvc import LVC 28 | 29 | self.model = LVC(device=device, use_xl_model=use_xl_model, **kwargs) 30 | 31 | def infer_file( 32 | self, original_audio_path, sample_audio_path, output_audio_path, **kwargs 33 | ): 34 | """ 35 | Run inference on a single file. 36 | 37 | **Args** 38 | 39 | original_audio_path (str): The path of the original audio. 40 | sample_audio_path (str): The path of the speaker sample whose voice you want to clone. 41 | output_audio_path (str): The path to save your audio 42 | """ 43 | self.model.infer_file( 44 | original_audio_path, sample_audio_path, output_audio_path, **kwargs 45 | ) 46 | -------------------------------------------------------------------------------- /speechtoolkit/vc/ns3vc_lib.py: -------------------------------------------------------------------------------- 1 | class NS3VCModel: 2 | """ 3 | Use Amphion's NaturalSpeech3 for zero-shot voice conversion 4 | 5 | **Args** 6 | 7 | device (str): The device to use. Defaults to 'auto' 8 | **kwargs: Additional arguments to pass to package 9 | """ 10 | 11 | def __init__( 12 | self, 13 | device="auto", 14 | **kwargs, 15 | ): 16 | """ 17 | Initialize model. 18 | 19 | **Args** 20 | 21 | device (str): The device to use. Defaults to 'auto' 22 | **kwargs: Additional arguments to pass to package 23 | """ 24 | from ns3vc import NS3VC 25 | 26 | self.model = NS3VC(device=device, **kwargs) 27 | 28 | def infer_file( 29 | self, original_audio_path, sample_audio_path, output_audio_path, **kwargs 30 | ): 31 | """ 32 | Run inference on a single file. 33 | 34 | **Args** 35 | 36 | original_audio_path (str): The path of the original audio. 37 | sample_audio_path (str): The path of the speaker sample whose voice you want to clone. 38 | output_audio_path (str): The path to save your audio 39 | """ 40 | self.model.infer_file( 41 | original_audio_path, sample_audio_path, output_audio_path, **kwargs 42 | ) 43 | --------------------------------------------------------------------------------