├── .github └── workflows │ ├── docs.yaml │ └── pypi.yaml ├── .gitignore ├── README.md ├── devrequirements.txt ├── docs ├── css │ └── custom.css ├── examples.md ├── faqs.md ├── getting-started.md ├── img │ └── favicon.ico ├── index.md └── models.md ├── docs_theme └── main.html ├── mkdocs.yml ├── requirements.txt ├── setup.py └── speechtoolkit ├── __init__.py ├── asr ├── __init__.py ├── distilwhisper_lib.py └── whisper_lib.py ├── classification ├── __init__.py └── languageclassification.py ├── data ├── __init__.py └── languages.py ├── tts ├── __init__.py └── styletts2_lib.py ├── utils ├── __init__.py └── device.py └── vc ├── __init__.py ├── lvc_lib.py └── ns3vc_lib.py /.github/workflows/docs.yaml: -------------------------------------------------------------------------------- 1 | name: Publish docs via GitHub Pages 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | build: 9 | name: Deploy docs 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | steps: 14 | - name: Checkout main 15 | uses: actions/checkout@v2 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install build 20 | - name: Deploy docs 21 | uses: mhausenblas/mkdocs-deploy-gh-pages@nomaterial 22 | env: 23 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 24 | CONFIG_FILE: mkdocs.yml 25 | REQUIREMENTS: devrequirements.txt -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SpeechToolkit 2 | 3 | **NOTE: This project is still in an early alpha stage and is not ready for production yet.** 4 | 5 | A unified framework for text-to-speech, voice conversion, automatic speech recognition, audio classification, and more! 6 | 7 | Please note that this toolkit is currently in an early alpha and not all features have been implemented. 8 | 9 | If you prefer not to use SpeechToolkit but would like to interact with models individually and separately, please check out the [ML for Speech](https://github.com/ml-for-speech) page. 10 | 11 | ## Implemented Features 12 | 13 | - [x] Text-to-speech 14 | - [x] StyleTTS 2 15 | - [ ] MetaVoice 16 | - [ ] Parler TTS 17 | - [ ] XTTS 18 | - [x] Voice conversion 19 | - [x] LVC-VC 20 | - [x] NaturalSpeech3 Voice Conversion 21 | - [ ] StyleTTS2-VC 22 | - [x] Automatic speech recognition 23 | - [x] Whisper 24 | - [x] Distil-Whisper 25 | - [ ] Canary 26 | - [x] Audio classification 27 | - [x] Language detection 28 | 29 | ## Installation & Usage 30 | 31 | Documentation is available [online](https://ml-for-speech.github.io/speechtoolkit). A minimal usage example is also included at the end of this README. 32 | 33 | ## Disclaimer 34 | 35 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 36 | 37 | Provided models may make mistakes. 38 | 39 | THE MODEL IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS MODEL INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS MODEL.
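## Quick Example

A minimal speech-recognition example, mirroring the [documentation examples](https://ml-for-speech.github.io/speechtoolkit). It assumes the Whisper dependency is installed (e.g. via `pip install speechtoolkit[all]`) and that `audio.wav` is a placeholder for your own audio file:

```python
from speechtoolkit.asr import WhisperModel

# Load the default (base) Whisper model; device='auto' picks CUDA if available, otherwise CPU.
model = WhisperModel(device='auto')

# Transcribe a local audio file and print the resulting text.
print(model.infer_file('audio.wav'))
```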
40 | -------------------------------------------------------------------------------- /devrequirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs -------------------------------------------------------------------------------- /docs/css/custom.css: -------------------------------------------------------------------------------- 1 | div.autodoc-docstring { 2 | padding-left: 20px; 3 | margin-bottom: 30px; 4 | border-left: 5px solid rgb(230, 230, 230); 5 | } 6 | 7 | div.autodoc-members { 8 | padding-left: 20px; 9 | margin-bottom: 15px; 10 | } -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## Text-to-Speech 4 | 5 | ```python 6 | from speechtoolkit.tts import SingleSpeakerStyleTTS2Model 7 | 8 | model = SingleSpeakerStyleTTS2Model() 9 | 10 | model.infer_to_file('Hello, this is a test', 'out.wav') 11 | ``` 12 | 13 | **Multi-speaker StyleTTS 2 with zero-shot voice cloning:** 14 | 15 | ```python 16 | from speechtoolkit.tts import MultiSpeakerStyleTTS2Model 17 | 18 | model = MultiSpeakerStyleTTS2Model() 19 | 20 | model.infer_to_file('Hello, this is a test', 'sample.wav', 'out.wav') 21 | ``` 22 | 23 | ## Automatic Speech Recognition 24 | 25 | ```python 26 | from speechtoolkit.asr import WhisperModel 27 | 28 | model = WhisperModel() 29 | 30 | model.infer_file('audio.wav') 31 | ``` 32 | 33 | **With a larger model:** 34 | 35 | ```python 36 | from speechtoolkit.asr import WhisperModel 37 | 38 | model = WhisperModel('medium') 39 | 40 | model.infer_file('audio.wav') 41 | ``` 42 | 43 | **With DistilWhisper:** 44 | 45 | ```python 46 | from speechtoolkit.asr import DistilWhisperModel 47 | 48 | model = DistilWhisperModel() 49 | 50 | model.infer_file('audio.wav') 51 | ``` 52 | 53 | ## Voice Conversion 54 | 55 | ```python 56 | from speechtoolkit.vc import LVCModel 57 | 58 | vc = LVCModel(device='auto') 59 | 60 | vc.infer_file( 61 | 'original.wav', 62 | 'sample.wav', 63 | 'out.wav' 64 | ) 65 | ``` 66 | 67 | ## Language Classification 68 | 69 | ```python 70 | from speechtoolkit.classification.languageclassification import WhisperLanguageClassifierModel 71 | 72 | lc = WhisperLanguageClassifierModel() 73 | 74 | lc.infer_file('audio.wav') # 'en' 75 | ``` 76 | 77 | ## Accent Classification 78 | 79 | ```python 80 | from speechtoolkit.classification.accentclassification import EdAccAccentClassifierModel 81 | 82 | ac = EdAccAccentClassifierModel() 83 | 84 | ac.infer_file('audio.wav') # 'Mainstream US English' 85 | ``` 86 | -------------------------------------------------------------------------------- /docs/faqs.md: -------------------------------------------------------------------------------- 1 | # FAQs 2 | 3 | ## What is SpeechToolkit? 4 | 5 | Please refer to the [introduction](index.md) for more details. 6 | 7 | ## Are all models trained by SpeechToolkit? 8 | 9 | No, SpeechToolkit is primarily composed of third-party models. SpeechToolkit provides a unified, simple Python API to access these models. However, SpeechToolkit does provide some trained models. 10 | 11 | ## Is Apple Silicon (MPS) supported? 12 | 13 | Unfortunately, PyTorch does not yet support all operations on MPS. For simplicity, MPS support is currently disabled across all models; however, it may be supported on a case-by-case basis in the future.
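## How do I choose which device a model runs on?

Most model classes accept a `device` argument. The default, `'auto'`, resolves to CUDA when a GPU is available and falls back to CPU otherwise (see `speechtoolkit/utils/device.py`). A minimal sketch using the Whisper ASR wrapper as an example:

```python
from speechtoolkit.asr import WhisperModel

# 'auto' (the default) selects CUDA when torch.cuda.is_available(), otherwise CPU.
model = WhisperModel(device='auto')

# You can also pin a model to a specific device explicitly.
cpu_model = WhisperModel(device='cpu')
```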
-------------------------------------------------------------------------------- /docs/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | 3 | ## Installation 4 | 5 | Installing SpeechToolkit is quite easy! 6 | 7 | ### Basic Installation 8 | 9 | Use this for testing, development, etc. Note that it will download the packages for *all* models (text-to-speech, voice cloning, etc.), so it will be much slower and larger than a task-specific installation. 10 | 11 | If you're unsure which option to choose, use the basic installation. 12 | 13 | ``` 14 | pip install speechtoolkit[all] 15 | ``` 16 | 17 | ### Advanced Installation 18 | 19 | Recommended for production deployment. 20 | 21 | Install the SpeechToolkit core without any extras, then add only the task-specific extras you need (`asr`, `tts`, `vc`, and `fa2`, as defined in `setup.py`): 22 | 23 | ``` 24 | pip install speechtoolkit 25 | ``` 26 | 27 | ## Basic Usage 28 | 29 | Now that you've successfully installed SpeechToolkit, it's time to run some models! Head over to the [examples](examples.md) page to see some basic examples. -------------------------------------------------------------------------------- /docs/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/docs/img/favicon.ico -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | SpeechToolkit is an all-in-one, end-to-end toolkit for ML in speech. It aims to simplify the usage of text-to-speech, automatic speech recognition, and voice conversion models. 4 | 5 | ## Why SpeechToolkit? 6 | 7 | Almost every model uses a different Python API. If you wanted to integrate them into your project, you'd need to write custom code for each model. Switching to a different model would require significant changes. 8 | 9 | SpeechToolkit aims to solve this by providing a centralized, unified, easy-to-use Python API for speech models. Instead of having to rewrite your program to support a new model, you can simply change a couple of lines of code with SpeechToolkit. 10 | 11 | In addition, SpeechToolkit packages these models into a simple, PyPI-installable package. This not only makes code management easier, but also can help mitigate potential licensing issues. 12 | 13 | ## Packages 14 | 15 | SpeechToolkit supports many different third-party open-access models, as well as some models developed by ML for Speech. While these models are mostly available through the SpeechToolkit package, we've packaged many of these models individually if you don't want to use the SpeechToolkit library. 16 | 17 | ## Get Started 18 | 19 | Visit the [Getting Started](getting-started.md) page to get started! -------------------------------------------------------------------------------- /docs/models.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | SpeechToolkit supports several different models for various tasks.
4 | 5 | Note that the license "Same" indicates that this is 6 | 7 | ## Text-to-Speech 8 | 9 | Below is a list of models supported for text-to-speech: 10 | 11 | | Name | License | Link | 12 | | ------- | ------- | ----------------------------------------------- | 13 | | StyleTTS 2 | MIT | [Repository](https://github.com/yl4579/StyleTTS2) | 14 | 15 | Note: StyleTTS 2 by default uses a GPL-licensed phonemizer but we've replaced it with the BSD-licensed [OpenPhonemizer](https://github.com/NeuralVox/OpenPhonemizer). 16 | 17 | ## Automatic Speech Recognition 18 | 19 | Below is a list of supported models for automatic speech recognition. 20 | 21 | | Name | License | Link | 22 | | ------- | ------- | ----------------------------------------------- | 23 | | Whisper | MIT | [Repository](https://github.com/openai/whisper) | 24 | 25 | ## Speech Classification 26 | 27 | **NOTE: Classification models are not very accurate yet.** 28 | 29 | SpeechToolkit supports several different types of speech classification. These models are trained by ML for Speech. 30 | 31 | | Version | Task | Link | 32 | | ------- | ----------------------- | --------------------------------------------------------------------- | 33 | | V1 | Language Classification | [Model](https://huggingface.co/ml-for-speech/language-classification) | 34 | 35 | ## Voice Conversion 36 | 37 | Below is a list of supported models for voice conversion. 38 | 39 | | Name | License | Link | 40 | | ------ | ------- | ---------------------------------------------------- | 41 | | LVC-VC | MIT | [Repository](https://github.com/wonjune-kang/lvc-vc) | 42 | | NS3VC | MIT | [Repository](https://github.com/open-mmlab/Amphion) | 43 | 44 | ## A Short Guide to Licenses 45 | 46 | Note that this is not legal advice. 47 | 48 | Please note that models may have a different license than SpeechToolkit. If this is the case, you must comply with both SpeechToolkit *and* the license of the model. 49 | 50 | If you're wondering whether or not you can use a model commercially, you should check both the model's license and the pretrained weights' license. The MIT, Apache 2.0, and BSD licenses typically allow commercial use, unless otherwise specified by the authors. However, the BSD-4-Clause license requires you to provide attribution to the author in certain marketing materials (read the full license for details). If the license name includes "NC," it is likely a non-commercial license, which means you cannot use it commercially. Also note that some models may be trained on copyrighted content, which, depending on your jurisdiction, may influence the ability for you to use the models. 51 | 52 | Before using models, you should carefully read their licenses. 53 | 54 | ## Disclaimer 55 | 56 | Disclaimer for models trained by SpeechToolkit: 57 | 58 | THE MODEL IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS MODEL INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS MODEL. 59 | -------------------------------------------------------------------------------- /docs_theme/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {%- block footer %} 4 |
5 | {%- if config.copyright %} 6 | {{ config.copyright }}
7 | {%- endif %} 8 | 9 | {%- endblock %} -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: SpeechToolkit 2 | copyright: '© 2024 ML for Speech. All docs are on our GitHub Repository.' 3 | # repo_url: https://github.com/ml-for-speech/speechtoolkit 4 | 5 | theme: 6 | name: mkdocs 7 | highlightjs: true 8 | hljs_languages: 9 | - python 10 | - yaml 11 | - json 12 | shortcuts: 13 | help: 191 # ? 14 | next: 78 # n 15 | previous: 80 # p 16 | search: 191 # / 17 | custom_dir: docs_theme 18 | markdown_extensions: 19 | - admonition 20 | - codehilite 21 | extra_css: 22 | - css/custom.css -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e . -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | extras = { 4 | "vc": [ 5 | "lvc", 6 | "ns3vc", 7 | ], 8 | "fa2": [ 9 | "flash-attn", 10 | ], 11 | "asr": [ 12 | "openai-whisper", 13 | ], 14 | "tts": [ 15 | "mfs-styletts2", 16 | ], 17 | "dev": [ 18 | "mkdocs", 19 | "mkautodoc", 20 | ], 21 | } 22 | 23 | extra_pkgs = extras 24 | final = [] 25 | 26 | for k in extras: 27 | if not k == "dev": 28 | final += extras[k] 29 | 30 | extra_pkgs["all"] = final 31 | 32 | with open("README.md", "r") as f: 33 | longdesc = f.read() 34 | setup( 35 | name="speechtoolkit", 36 | version="0.0.5", 37 | author="ml-for-speech", 38 | description="ML for Speech presents SpeechToolkit, a unified, all-in-one toolkit for TTS, ASR, VC, & other models.", 39 | long_description=longdesc, 40 | long_description_content_type="text/markdown", 41 | url="https://github.com/ml-for-speech/speechtoolkit", 42 | packages=find_packages(), 43 | install_requires=[ 44 | "soundfile", 45 | "librosa", 46 | "transformers", 47 | "torch", 48 | "optimum", 49 | "txtsplit", 50 | ], 51 | extras_require=extra_pkgs, 52 | ) 53 | -------------------------------------------------------------------------------- /speechtoolkit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/speechtoolkit/__init__.py -------------------------------------------------------------------------------- /speechtoolkit/asr/__init__.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.asr.whisper_lib import WhisperModel 2 | from speechtoolkit.asr.distilwhisper_lib import DistilWhisperModel 3 | -------------------------------------------------------------------------------- /speechtoolkit/asr/distilwhisper_lib.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.utils.device import device_map 2 | 3 | 4 | class DistilWhisperModel: 5 | """ 6 | Use DistilWhisper for automatic speech recognition. Supports significant speedups. 7 | 8 | Supports several speedups (Flash Attention 2 & BetterTransformer), borrowed from [insanely-fast-whisper](https://github.com/Vaibhavs10/insanely-fast-whisper). 9 | 10 | **Args** 11 | 12 | model (str): Which Whisper model to use on the Hugging Face Hub 13 | device (str): The device to use. 
Defaults to 'auto' 14 | use_fa2 (bool): Use Flash Attention 2 (significant speedup). Incompatible with BetterTransformer. Only works on CUDA GPUs. 15 | use_bettertransformer (bool): Use BetterTransformer (speedup). Incompatible with Flash Attention 2. If available, use Flash Attention 2 instead. 16 | **kwargs: Additional arguments to pass to Whisper package 17 | """ 18 | 19 | def __init__( 20 | self, 21 | model="distil-whisper/distil-large-v3", 22 | use_fa2=False, 23 | use_bettertransformer=False, 24 | device="auto", 25 | **kwargs, 26 | ): 27 | """ 28 | Initialize model. 29 | 30 | **Args** 31 | 32 | model (str): Which Whisper model to use on the Hugging Face Hub 33 | device (str): The device to use. Defaults to 'auto' 34 | use_fa2 (bool): Use Flash Attention 2 (significant speedup). Incompatible with BetterTransformer. Only works on CUDA GPUs. 35 | use_bettertransformer (bool): Use BetterTransformer (speedup). Incompatible with Flash Attention 2. If available, use Flash Attention 2 instead. 36 | **kwargs: Additional arguments to pass to Whisper package 37 | """ 38 | if use_bettertransformer and use_fa2: 39 | raise ValueError( 40 | "You cannot use both BetterTransformer and Flash Attention 2 at the same time. Typically, Flash Attention 2 provides a better speedup." 41 | ) 42 | from transformers import pipeline 43 | 44 | model_kwargs = {} 45 | if use_fa2: 46 | model_kwargs = {"attn_implementation": "flash_attention_2"} 47 | self.model = pipeline( 48 | "automatic-speech-recognition", 49 | model, 50 | device=device_map(device), 51 | model_kwargs=model_kwargs, 52 | **kwargs, 53 | ) 54 | if use_bettertransformer: 55 | self.model.model.to_bettertransformer() 56 | 57 | def infer_file(self, audio_path, **kwargs): 58 | """ 59 | Run inference on a single file. 60 | 61 | **Args** 62 | 63 | audio_path (str): The path of the original audio. 64 | **kwargs: Additional arguments to pass to Whisper package 65 | 66 | **Returns** 67 | 68 | str: The transcript of the audio file. 69 | """ 70 | return self.model(audio_path, **kwargs)["text"].strip() 71 | -------------------------------------------------------------------------------- /speechtoolkit/asr/whisper_lib.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.utils.device import device_map 2 | 3 | 4 | class WhisperModel: 5 | """ 6 | Use OpenAI Whisper for automatic speech recognition. 7 | 8 | **Args** 9 | 10 | model (str): Which Whisper model to use 11 | device (str): The device to use. Defaults to 'auto' 12 | **kwargs: Additional arguments to pass to Whisper package 13 | """ 14 | 15 | def __init__( 16 | self, 17 | model="base", 18 | device="auto", 19 | **kwargs, 20 | ): 21 | """ 22 | Initialize model. 23 | 24 | **Args** 25 | 26 | model (str): Which Whisper model to use 27 | device (str): The device to use. Defaults to 'auto' 28 | **kwargs: Additional arguments to pass to Whisper package 29 | """ 30 | import whisper 31 | 32 | self.model = whisper.load_model(model, **kwargs).to(device_map(device)) 33 | 34 | def infer_file(self, audio_path, **kwargs): 35 | """ 36 | Run inference on a single file. 37 | 38 | **Args** 39 | 40 | audio_path (str): The path of the original audio. 41 | **kwargs: Additional arguments to pass to Whisper package 42 | 43 | **Returns** 44 | 45 | str: The transcript of the audio file. 
46 | """ 47 | return self.model.transcribe(audio_path, **kwargs)["text"].strip() 48 | -------------------------------------------------------------------------------- /speechtoolkit/classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/speechtoolkit/classification/__init__.py -------------------------------------------------------------------------------- /speechtoolkit/classification/languageclassification.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | from speechtoolkit.data.languages import language_codes 3 | from speechtoolkit.utils.device import device_map 4 | 5 | 6 | class WhisperLanguageClassifierModel: 7 | """ 8 | Use a Whisper-based language classification model. 9 | 10 | **Args** 11 | 12 | device (str): The device to use. Defaults to 'auto' 13 | model (str): The model ID to use on the Hugging Face Hub. 14 | **kwargs: Additional arguments to pass to package 15 | """ 16 | 17 | def __init__( 18 | self, 19 | device="auto", 20 | model="ml-for-speech/language-classification", 21 | **kwargs, 22 | ): 23 | """ 24 | Initialize model. 25 | 26 | **Args** 27 | 28 | device (str): The device to use. Defaults to 'auto' 29 | model (str): The model ID to use on the Hugging Face Hub. 30 | **kwargs: Additional arguments to pass to package 31 | """ 32 | self.pipe = pipeline("audio-classification", model, device=device_map(device)) 33 | 34 | def infer_file(self, file_path): 35 | """ 36 | Run inference on a single file. 37 | 38 | **Args** 39 | 40 | file_path (str): The path of the audio to classify. 41 | 42 | **Returns** 43 | 44 | str: The language ISO language code of the detected language. 
45 | """ 46 | result = self.pipe(file_path)[0]["label"] 47 | if result in language_codes.keys(): 48 | return language_codes[result] 49 | else: 50 | return result 51 | -------------------------------------------------------------------------------- /speechtoolkit/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/speechtoolkit/data/__init__.py -------------------------------------------------------------------------------- /speechtoolkit/data/languages.py: -------------------------------------------------------------------------------- 1 | language_codes = { 2 | "Arabic": "ar", 3 | "Basque": "eu", 4 | "Breton": "br", 5 | "Catalan": "ca", 6 | "Chinese_China": "zh-cn", 7 | "Chinese_Hongkong": "zh-hk", 8 | "Chinese_Taiwan": "zh-tw", 9 | "Chuvash": "cv", 10 | "Czech": "cs", 11 | "Dhivehi": "dv", 12 | "Dutch": "nl", 13 | "English": "en", 14 | "Esperanto": "eo", 15 | "Estonian": "et", 16 | "French": "fr", 17 | "Frisian": "fy", 18 | "Georgian": "ka", 19 | "German": "de", 20 | "Greek": "el", 21 | "Hakha_Chin": "cnh", 22 | "Indonesian": "id", 23 | "Interlingua": "ia", 24 | "Italian": "it", 25 | "Japanese": "ja", 26 | "Kabyle": "kab", 27 | "Kinyarwanda": "rw", 28 | "Kyrgyz": "ky", 29 | "Latvian": "lv", 30 | "Maltese": "mt", 31 | "Mangolian": "mn", 32 | "Persian": "fa", 33 | "Polish": "pl", 34 | "Portuguese": "pt", 35 | "Romanian": "ro", 36 | "Romansh_Sursilvan": "rm", 37 | "Russian": "ru", 38 | "Sakha": "sah", 39 | "Slovenian": "sl", 40 | "Spanish": "es", 41 | "Swedish": "sv", 42 | "Tamil": "ta", 43 | "Tatar": "tt", 44 | "Turkish": "tr", 45 | "Ukranian": "uk", 46 | "Welsh": "cy", 47 | } 48 | -------------------------------------------------------------------------------- /speechtoolkit/tts/__init__.py: -------------------------------------------------------------------------------- 1 | from .styletts2_lib import MultiSpeakerStyleTTS2Model, SingleSpeakerStyleTTS2Model -------------------------------------------------------------------------------- /speechtoolkit/tts/styletts2_lib.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.utils.device import device_map 2 | from txtsplit import txtsplit 3 | 4 | 5 | class MultiSpeakerStyleTTS2Model: 6 | """ 7 | Text to Speech with StyleTTS 2 8 | 9 | **Args** 10 | 11 | device (str): The device to use. Defaults to 'auto' 12 | **kwargs: Additional arguments to pass to Whisper package 13 | """ 14 | 15 | def __init__( 16 | self, 17 | device='auto', 18 | **kwargs, 19 | ): 20 | """ 21 | Initialize model. 
22 | 23 | **Args** 24 | 25 | **kwargs: Additional arguments to pass to Whisper package 26 | """ 27 | from mfs_styletts2.zeroshot import LFinference, compute_style 28 | from openphonemizer import OpenPhonemizer 29 | import numpy as np 30 | from scipy.io.wavfile import write 31 | 32 | self.np = np 33 | self.write = write 34 | self.LFinference = LFinference 35 | self.compute_style = compute_style 36 | self.phonemizer = OpenPhonemizer() 37 | 38 | def infer_to_file( 39 | self, 40 | text, 41 | sample, 42 | output, 43 | ): 44 | s_ref = self.compute_style(sample) 45 | sentences = txtsplit(self.phonemizer(text)) 46 | wavs = [] 47 | s_prev = None 48 | for text in sentences: 49 | if text.strip() == "": 50 | continue 51 | wav, s_prev = self.LFinference( 52 | text, 53 | s_prev, 54 | s_ref, 55 | alpha=0.3, 56 | beta=0.9, 57 | t=0.7, 58 | diffusion_steps=10, 59 | embedding_scale=1.1, 60 | phonemize=False 61 | ) 62 | wavs.append(wav) 63 | self.write(output, 24000, self.np.concatenate(wavs)) 64 | 65 | class SingleSpeakerStyleTTS2Model: 66 | """ 67 | Text to Speech with StyleTTS 2 68 | 69 | **Args** 70 | 71 | **kwargs: Additional arguments to pass to Whisper package 72 | """ 73 | 74 | def __init__( 75 | self, 76 | **kwargs, 77 | ): 78 | """ 79 | Initialize model. 80 | 81 | **Args** 82 | 83 | device (str): The device to use. Defaults to 'auto' 84 | **kwargs: Additional arguments to pass to Whisper package 85 | """ 86 | from mfs_styletts2.lj import LFinference, compute_style 87 | from openphonemizer import OpenPhonemizer 88 | import numpy as np 89 | from scipy.io.wavfile import write 90 | import torch 91 | 92 | self.torch = torch 93 | self.np = np 94 | self.write = write 95 | self.LFinference = LFinference 96 | self.compute_style = compute_style 97 | self.phonemizer = OpenPhonemizer() 98 | 99 | def infer_to_file( 100 | self, 101 | text, 102 | output, 103 | ): 104 | sentences = txtsplit(self.phonemizer(text)) 105 | wavs = [] 106 | s_prev = None 107 | for text in sentences: 108 | if text.strip() == "": 109 | continue 110 | noise = self.torch.randn(1, 1, 256).to('cuda' if self.torch.cuda.is_available() else 'cpu') 111 | wav, s_prev = self.LFinference( 112 | text, 113 | s_prev, 114 | noise, 115 | alpha=0.3, 116 | diffusion_steps=10, 117 | embedding_scale=1.1, 118 | phonemize=False 119 | ) 120 | wavs.append(wav) 121 | self.write(output, 24000, self.np.concatenate(wavs)) 122 | -------------------------------------------------------------------------------- /speechtoolkit/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ml-for-speech/speechtoolkit/037c2c7c92521505f7bc7bc61a1ebb33783a48e3/speechtoolkit/utils/__init__.py -------------------------------------------------------------------------------- /speechtoolkit/utils/device.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def device_map(device): 5 | if device == "auto": 6 | device = "cuda" if torch.cuda.is_available() else "cpu" 7 | if type(device) == str: 8 | device = torch.device(device) 9 | return device 10 | -------------------------------------------------------------------------------- /speechtoolkit/vc/__init__.py: -------------------------------------------------------------------------------- 1 | from speechtoolkit.vc.lvc_lib import LVCModel 2 | from speechtoolkit.vc.ns3vc_lib import NS3VCModel 3 | -------------------------------------------------------------------------------- /speechtoolkit/vc/lvc_lib.py: 
-------------------------------------------------------------------------------- 1 | class LVCModel: 2 | """ 3 | Use LVC-VC (End-to-End Zero-Shot Voice Conversion with Location-Variable Convolutions) for zero-shot voice conversion. 4 | 5 | **Args** 6 | 7 | device (str): The device to use. Defaults to 'auto' 8 | use_xl_model (bool): Use the XL model vs. the smaller model. Defaults to 'true' 9 | **kwargs: Additional arguments to pass to package 10 | """ 11 | 12 | def __init__( 13 | self, 14 | device="auto", 15 | use_xl_model=True, 16 | **kwargs, 17 | ): 18 | """ 19 | Initialize model. 20 | 21 | **Args** 22 | 23 | device (str): The device to use. Defaults to 'auto' 24 | use_xl_model (bool): Use the XL model vs. the smaller model. Defaults to 'true' 25 | **kwargs: Additional arguments to pass to package 26 | """ 27 | from lvc import LVC 28 | 29 | self.model = LVC(device=device, use_xl_model=use_xl_model, **kwargs) 30 | 31 | def infer_file( 32 | self, original_audio_path, sample_audio_path, output_audio_path, **kwargs 33 | ): 34 | """ 35 | Run inference on a single file. 36 | 37 | **Args** 38 | 39 | original_audio_path (str): The path of the original audio. 40 | sample_audio_path (str): The path of the speaker sample whose voice you want to clone. 41 | output_audio_path (str): The path to save your audio 42 | """ 43 | self.model.infer_file( 44 | original_audio_path, sample_audio_path, output_audio_path, **kwargs 45 | ) 46 | -------------------------------------------------------------------------------- /speechtoolkit/vc/ns3vc_lib.py: -------------------------------------------------------------------------------- 1 | class NS3VCModel: 2 | """ 3 | Use Amphion's NaturalSpeech3 for zero-shot voice conversion 4 | 5 | **Args** 6 | 7 | device (str): The device to use. Defaults to 'auto' 8 | **kwargs: Additional arguments to pass to package 9 | """ 10 | 11 | def __init__( 12 | self, 13 | device="auto", 14 | **kwargs, 15 | ): 16 | """ 17 | Initialize model. 18 | 19 | **Args** 20 | 21 | device (str): The device to use. Defaults to 'auto' 22 | **kwargs: Additional arguments to pass to package 23 | """ 24 | from ns3vc import NS3VC 25 | 26 | self.model = NS3VC(device=device, **kwargs) 27 | 28 | def infer_file( 29 | self, original_audio_path, sample_audio_path, output_audio_path, **kwargs 30 | ): 31 | """ 32 | Run inference on a single file. 33 | 34 | **Args** 35 | 36 | original_audio_path (str): The path of the original audio. 37 | sample_audio_path (str): The path of the speaker sample whose voice you want to clone. 38 | output_audio_path (str): The path to save your audio 39 | """ 40 | self.model.infer_file( 41 | original_audio_path, sample_audio_path, output_audio_path, **kwargs 42 | ) 43 | --------------------------------------------------------------------------------