├── .dockerignore ├── .env.example ├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTION.md ├── Dockerfile ├── LICENSE ├── README.md ├── assets └── example_audio_1.mp3 ├── entrypoint.sh ├── requirements.txt ├── setup.py ├── speech_dataset_generator ├── __init__.py ├── audio_manager │ ├── __init__.py │ └── audio_manager.py ├── audio_processor │ ├── __init__.py │ └── audio_processor.py ├── dataset_generator │ ├── __init__.py │ └── dataset_generator.py ├── main.py ├── speech_rate │ ├── __init__.py │ └── speech_rate.py └── utils │ ├── __init__.py │ └── utils.py └── speech_dataset_generator_example.ipynb /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .dockerignore 3 | .docker 4 | .git 5 | venv 6 | .github 7 | .gitignore 8 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | HF_TOKEN= 2 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.10' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # .gitignore 2 | 3 | # Python 4 | __pycache__/ 5 | *.pyc 6 | *.pyo 7 | *.pyd 8 | 9 | # .env file 10 | .env 11 | 12 | # Compiled source 13 | *.com 14 | *.class 15 | *.dll 16 | *.exe 17 | *.o 18 | *.so 19 | 20 | # Packages 21 | *.egg 22 | *.egg-info/ 23 | dist/ 24 | build/ 25 | bin/ 26 | parts/ 27 | var/ 28 | sdist/ 29 | develop-eggs/ 30 | 31 | # Installer logs 32 | pip-log.txt 33 | 34 | # Unit test / coverage reports 35 | .coverage 36 | .tox/ 37 | 38 | # Jupyter Notebook 39 | .ipynb_checkpoints 40 | 41 | # PyCharm 42 | .idea/ 43 | 44 | # Visual Studio Code 45 | .vscode/ 46 | 47 | # Sublime Text 48 | .sublime-project 49 | .sublime-workspace 50 | 51 | # virtualenv 52 | venv/ 53 | env/ 54 | ENV/ 55 | 56 | # DS_Store files 57 | .DS_Store 58 | 59 | # macOS 60 | .AppleDouble 61 | .LSOverride 62 | 63 | # Thumbnails 64 | ._* 65 | 66 | # Files that might appear on external disk 67 | .Spotlight-V100 68 | .Trashes 69 | 70 | # Directories potentially created on remote AFP 
share 71 | .AppleDB 72 | .AppleDesktop 73 | Network Trash Folder 74 | Temporary Items 75 | .apdisk 76 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We, as contributors and maintainers, pledge to make participation in our project and community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | - Being respectful and inclusive of differing viewpoints and experiences. 12 | - Gracefully accepting constructive criticism. 13 | - Focusing on what is best for the community. 14 | - Showing empathy towards other community members. 15 | 16 | Examples of unacceptable behavior include: 17 | 18 | - The use of sexualized language or imagery and unwelcome attention or advances. 19 | - Trolling, insulting/derogatory comments, and personal or political attacks. 20 | - Public or private harassment. 21 | - Publishing others' private information, such as a physical or electronic address, without explicit permission. 22 | - Contributing unnecessary or low-quality code with the sole purpose of gaining contributor status. 23 | - Other conduct that could reasonably be considered inappropriate in a professional setting. 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Enforcement 32 | 33 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at [contact@davidmartinrius.com](mailto:contact@davidmartinrius.com). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 34 | 35 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 36 | 37 | ## Attribution 38 | 39 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html). 40 | 41 | For answers to common questions about this code of conduct, see [FAQs](https://www.contributor-covenant.org/faq). 
42 | -------------------------------------------------------------------------------- /CONTRIBUTION.md: -------------------------------------------------------------------------------- 1 | # Contribution Guidelines 2 | 3 | Thank you for considering contributing to our project! We appreciate your efforts to make the Speech Dataset Generator better. 4 | 5 | ## How to Contribute 6 | 7 | 1. **Fork the Repository:** Fork the repository to your GitHub account. 8 | 9 | 2. **Clone the Repository:** Clone the forked repository to your local machine: 10 | 11 | ```bash 12 | git clone https://github.com/your-username/speech-dataset-generator.git 13 | ``` 14 | 15 | 3. **Create a New Branch:** Create a new branch for your changes: 16 | 17 | ```bash 18 | git checkout -b feature/your-feature 19 | ``` 20 | 21 | 4. **Make Changes:** Make your changes and ensure that the code follows our coding standards. 22 | 23 | 5. **Test Thoroughly:** Test your changes thoroughly. 24 | 25 | 6. **Commit Changes:** Commit your changes with a descriptive commit message: 26 | 27 | ```bash 28 | git commit -m "Add your descriptive message here" 29 | ``` 30 | 31 | 7. **Push Changes:** Push your changes to your forked repository: 32 | 33 | ```bash 34 | git push origin feature/your-feature 35 | ``` 36 | 37 | 8. **Open a Pull Request:** Open a pull request on our `main` branch. Provide a clear title and description, including any relevant information. 38 | 39 | ## Coding Standards 40 | 41 | - **Follow Coding Style:** Follow the coding style used in the project. 42 | - **Write Clear Code:** Write clear and concise code. 43 | - **Comments:** Add comments when necessary to explain complex logic or functionality. 44 | 45 | ## Testing 46 | 47 | - **Include Tests:** Include tests for your changes. 48 | - **Ensure Passing Tests:** Ensure that existing tests pass. 49 | 50 | ## Reporting Issues 51 | 52 | If you find any issues or bugs, please open a GitHub issue with a clear description and, if possible, steps to reproduce the problem. 53 | 54 | ## Pull Requests 55 | 56 | - **Substantial Changes:** Ensure that your pull requests involve substantial changes that contribute meaningfully to the project. Pull requests with minor or trivial changes (e.g., single-letter modifications) may not be accepted. 57 | 58 | - **Quality Contributions:** Prioritize quality over quantity. Pull requests should improve the project, fix a bug, or add a valuable feature. Contributions solely aimed at gaining collaborator status without substantial improvements will be reviewed critically. 59 | 60 | ## Code of Conduct 61 | 62 | Please adhere to our [Code of Conduct](link-to-code-of-conduct) in all interactions and contributions. 63 | 64 | ## License 65 | 66 | By contributing, you agree that your contributions will be licensed under the project's license. 
67 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official CUDA-enabled base image for building 2 | FROM nvidia/cuda:12.3.2-base-ubuntu22.04 AS builder 3 | 4 | # Set the working directory 5 | WORKDIR /app 6 | 7 | # Install CUDA-related packages 8 | RUN apt-get update && \ 9 | apt-get install -y --no-install-recommends \ 10 | libcudnn8 \ 11 | libcudnn8-dev \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | # Switch to the Python 3.10 image 15 | FROM python:3.10 16 | 17 | # Set the working directory 18 | WORKDIR /app 19 | 20 | # Copy CUDA-related files from the builder stage 21 | COPY --from=builder /app /app 22 | 23 | # Install additional packages 24 | RUN apt-get update && \ 25 | apt-get install -y --no-install-recommends \ 26 | libsndfile1 \ 27 | ffmpeg \ 28 | python3-pip \ 29 | git \ 30 | && rm -rf /var/lib/apt/lists/* 31 | 32 | # Copy your application code 33 | COPY . /app 34 | 35 | # Create directories for the files 36 | #RUN mkdir -p /app/model_repo/enhancer_stage2/ds/G/default 37 | # Download files using wget 38 | #RUN wget -O /app/model_repo/enhancer_stage2/hparams.yaml "https://huggingface.co/ResembleAI/resemble-enhance/resolve/main/enhancer_stage2/#hparams.yaml?download=true" && \ 39 | # wget -O /app/model_repo/enhancer_stage2/ds/G/latest "https://huggingface.co/ResembleAI/resemble-enhance/resolve/main/enhancer_stage2/ds/G/latest?download=true" && \ 40 | # wget -O /app/model_repo/enhancer_stage2/ds/G/default/mp_rank_00_model_states.pt "https://huggingface.co/ResembleAI/resemble-enhance/resolve/main/enhancer_stage2/ds/G/default/mp_rank_00_model_states.pt?download=true" 41 | 42 | # Install any needed packages specified in requirements.txt 43 | RUN pip install --no-cache-dir -r requirements.txt 44 | RUN pip install --force-reinstall soundfile 45 | RUN pip install --force-reinstall tensorflow[and-cuda] 46 | # Set up your entrypoint and expose necessary ports 47 | EXPOSE 80 48 | COPY entrypoint.sh /app/entrypoint.sh 49 | RUN chmod +x /app/entrypoint.sh 50 | ENTRYPOINT ["/app/entrypoint.sh"] 51 | 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 David Martin Rius 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Speech Dataset Generator by [David Martin Rius](https://github.com/davidmartinrius/speech-dataset-generator) 2 | 3 | [![00019-2780374442](https://github.com/davidmartinrius/speech-dataset-generator/assets/16558194/8091ba96-6017-4645-b001-a9e3310982e8)](https://github.com/davidmartinrius/speech-dataset-generator) 4 | 5 | This repository is dedicated to creating datasets suitable for training text-to-speech or speech-to-text models. The primary functionality involves transcribing audio files, enhancing audio quality when necessary, and generating datasets. 6 | 7 | 8 | ## Here are the key functionalities of the project: 9 | 10 | 1. **Dataset Generation:** Creation of multilingual datasets with Mean Opinion Score (MOS). 11 | 12 | 2. **Silence Removal:** It includes a feature to remove silences from audio files, enhancing the overall quality. 13 | 14 | 3. **Sound Quality Improvement:** It improves the quality of the audio when needed. 15 | 16 | 4. **Audio Segmentation:** It can segment audio files within specified second ranges. 17 | 18 | 5. **Transcription:** The project transcribes the segmented audio, providing a textual representation. 19 | 20 | 6. **Gender Identification:** It identifies the gender of each speaker in the audio. 21 | 22 | 7. **Pyannote Embeddings:** Utilizes pyannote embeddings for speaker detection across multiple audio files. 23 | 24 | 8. **Automatic Speaker Naming:** Automatically assigns names to speakers detected in multiple audios. 25 | 26 | 9. **Multiple Speaker Detection:** Capable of detecting multiple speakers within each audio file. 27 | 28 | 10. **Store speaker embeddings:** The speakers are detected and stored in a Chroma database, so you do not need to assign a speaker name. 29 | 30 | 11. **Syllabic and words-per-minute metrics** 31 | 32 | 12. **Multiple input sources:** You can either use your own files or download content by pasting URLs from sources such as **YouTube**, **LibriVox** and **TED Talks**. 33 | 34 | ### Example of the output folder: 35 | ```plaintext 36 | outputs 37 | |-- main_data.csv 38 | | 39 | |-- chroma_database 40 | | 41 | |-- enhanced_audios 42 | | 43 | |-- ljspeech 44 | | |-- wavs 45 | | | |-- 1272-128104-0000.wav 46 | | | |-- 1272-128104-0001.wav 47 | | | |-- ... 48 | | | |-- 1272-128104-0225.wav 49 | | |-- metadata.csv 50 | | 51 | |-- librispeech 52 | | |-- speaker_id1 53 | | | |-- book_id1 54 | | | | |-- transcription.txt 55 | | | | |-- file1.wav 56 | | | | |-- file2.wav 57 | | | | |-- ... 58 | | |-- speaker_id2 59 | | | |-- book_id1 60 | | | | |-- transcription.txt 61 | | | | |-- file1.wav 62 | | | | |-- file2.wav 63 | | | | |-- ... 64 | ``` 65 | 66 | ### Example of the main_data.csv content: 67 | 68 | Consider that the values provided are purely fictitious and intended solely for illustrative purposes in this example. 69 | 70 | ```plaintext 71 | 72 | | text | audio_filename | speaker_id | gender | duration | language | words_per_minute | syllables_per_minute | 73 | |-------------------------|------------------------------|----------------|------------|-------------|-------------|--------------------|----------------------| 74 | | Hello, how are you? | wavs/1272-128104-0000.wav | Speaker12 | male | 4.5 | en | 22.22 | 1.11 | 75 | | Hola, ¿cómo estás? | wavs/1272-128104-0001.wav | Speaker45 | female | 6.2 | es | 20.97 | 0.81 | 76 | | This is a test. 
| wavs/1272-128104-0002.wav | Speaker23 | male | 3.8 | en | 26.32 | 1.32 | 77 | | ¡Adiós! | wavs/1272-128104-0003.wav | Speaker67 | female | 7.0 | es | 16.43 | 0.57 | 78 | | ... | ... | ... | ... | ... | ... | ... | ... | 79 | | Goodbye! | wavs/1272-128104-0225.wav | Speaker78 | male | 5.1 | en | 1.41 | 1.18 | 80 | 81 | ``` 82 | ## Installation 83 | 84 | Please note that this project has been tested and verified to work on Ubuntu 22. Although it has not been tested on macOS and Windows nor on other unix distributions. 85 | 86 | ```bash 87 | 88 | python3.10 -m venv venv 89 | 90 | source venv/bin/activate 91 | 92 | pip install -r requirements.txt 93 | 94 | or 95 | 96 | pip install -e . 97 | 98 | #If you are going to use this program outside of this project folder do this: 99 | export PYTHONPATH=/path/to/your/speech-dataset-generator:$PYTHONPATH 100 | ``` 101 | 102 | 103 | ### Needed agreement to run the code 104 | 105 | **Important**: Make sure to agree to share your contact information to access the [pyannote embedding model](https://huggingface.co/pyannote/embedding). Similarly, access to the [pyannote speaker diarization model](https://huggingface.co/pyannote/speaker-diarization) may require similar agreement. 106 | 107 | ### Huggingface 108 | You need to provide a HuggingFace token in a .env file 109 | 110 | ``` 111 | HF_TOKEN=yourtoken 112 | ``` 113 | 114 | 115 | ## Usage 116 | 117 | The main script `speech_dataset_generator/main.py` accepts command-line arguments for specifying the input file, output directory, time range, and types of enhancers. You can process a single file or an entire folder of audio files. 118 | Also you can use a youtube video or a youtube playlist as input. 119 | 120 | ```bash 121 | 122 | python speech_dataset_generator/main.py --input_file_path --output_directory --range_times --enhancers 123 | 124 | ``` 125 | - `--input_file_path`: (source) Path to the input audio file. Cannot be used with input folder. 126 | 127 | - `--input_folder`: (source) Path to the input folder containing audio files. Cannot be used with input_file_path 128 | 129 | - `--youtube_download`: (source) Link or links separated by space of youtube videos or playlists. 130 | 131 | - `--librivox_download`: (source) Link or links separated by space of LibriVox audiobooks. 132 | 133 | - `--tedtalks_download`: (source) Aggregate Ted Talks audio or video links by separating them with spaces. Copy these links directly from the Share button URL, in the "Download" section, where is MP4 and Audio. 134 | 135 | - `--output_directory`: Output directory for audio files. 136 | 137 | - `--range_times`: Specify a range of two integers in the format "start-end". Default is 4-10. Clarification: in the first instance, the ranges are determined by WhisperX. Therefore you cannot modify them, but when using this parameter you can narrow and filter the ranges. 138 | 139 | - `--enhancers`: You can use audio enhancers: --enhancers deepfilternet resembleai mayavoz. Will be executed in the order you write it. By default no enhancer is set. By now deepfilternet gives the best results when enhancing and denoising an audio. 140 | 141 | - `--datasets`: there are available extra dataset types: metavoice and librispeech. librispeech is in beta version. 
--datasets metavoice librispeech 142 | 143 | ### Examples: 144 | 145 | #### Input from a file: 146 | ```bash 147 | #No enhancer is used 148 | python speech_dataset_generator/main.py --input_file_path /path/to/audio/file.mp3 --output_directory /output/directory --range_times 5-10 --datasets metavoice 149 | 150 | #Using deepfilternet enhancer 151 | python speech_dataset_generator/main.py --input_file_path /path/to/audio/file.mp3 --output_directory /output/directory --range_times 4-10 --enhancers deepfilternet 152 | 153 | #Using resembleai enhancer 154 | python speech_dataset_generator/main.py --input_file_path /path/to/audio/file.mp3 --output_directory /output/directory --range_times 4-10 --enhancers resembleai 155 | 156 | # Combining enhancers 157 | python speech_dataset_generator/main.py --input_file_path /path/to/audio/file.mp3 --output_directory /output/directory --range_times 4-10 --enhancers deepfilternet resembleai 158 | ``` 159 | 160 | #### Input from a folder: 161 | 162 | ```bash 163 | python speech_dataset_generator/main.py --input_folder /path/to/folder/of/audios --output_directory /output/directory --range_times 4-10 --enhancers deepfilternet 164 | ``` 165 | 166 | #### Input from youtube (single video or playlists): 167 | ```bash 168 | # Youtube single video 169 | python speech_dataset_generator/main.py --youtube_download https://www.youtube.com/watch\?v\=ID --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 170 | 171 | #Combining a youtube video + input file 172 | python speech_dataset_generator/main.py --youtube_download https://www.youtube.com/watch\?v\=ID --input_file_path /path/to/audio/file.mp3 --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 173 | 174 | #Combining youtube video + input folder 175 | python speech_dataset_generator/main.py --youtube_download https://www.youtube.com/watch\?v\=ID --input_folder /path/to/folder/of/audios --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 176 | ``` 177 | 178 | #### Input from LibriVox (one or multiple audiobooks): 179 | ```bash 180 | # LibriVox single audiobook 181 | python speech_dataset_generator/main.py --librivox_download https://librivox.org/audio-book-url/ --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 182 | 183 | #Multiple LibriVox audiobooks at a time, in this example there are just 2, but you can pass n urls 184 | python speech_dataset_generator/main.py --librivox_download https://librivox.org/audio-book-url/ https://librivox.org/another-audio-book-url/ --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 185 | 186 | #Combining a LibriVox audiobook + input file 187 | python speech_dataset_generator/main.py --librivox_download https://librivox.org/audio-book-url/ --input_file_path /path/to/audio/file.mp3 --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 188 | 189 | #Combining LibriVox audiobook + input folder 190 | python speech_dataset_generator/main.py --librivox_download https://librivox.org/audio-book-url/ --input_folder /path/to/folder/of/audios --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 191 | 192 | #Also you can download Youtube audios combined with LibriVox 193 | python speech_dataset_generator/main.py --librivox_download https://librivox.org/audio-book-url/ --youtube_download https://www.youtube.com/watch\?v\=ID 
--output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 194 | ``` 195 | 196 | #### Input from Ted Talks (one or multiple Ted Talks): 197 | ```bash 198 | # Ted Talks single video 199 | python speech_dataset_generator/main.py --tedtalks_download https://download.ted.com/talks/video-talk.mp3 --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 200 | 201 | #Multiple Ted Talks videos at a time, in this example there are just 2, but you can pass n urls 202 | python speech_dataset_generator/main.py --tedtalks_download https://download.ted.com/talks/video-talk.mp3 https://download.ted.com/talks/another-video-talk.mp3 --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 203 | 204 | #Combining a Ted Talks video + input file 205 | python speech_dataset_generator/main.py --tedtalks_download https://download.ted.com/talks/video-talk.mp3 --input_file_path /path/to/audio/file.mp3 --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 206 | 207 | #Combining Ted Talks video + input folder 208 | python speech_dataset_generator/main.py --tedtalks_download https://download.ted.com/talks/video-talk.mp3 --input_folder /path/to/folder/of/audios --output_directory /output/directory --range_times 5-15 --enhancers deepfilternet resembleai 209 | ``` 210 | 211 | ## Notes 212 | 213 | ### Multilingual: 214 | 215 | This project uses Whisper, making it multilingual. [Here](https://github.com/openai/whisper?tab=readme-ov-file#available-models-and-languages) you can see the currently supported languages. 216 | 217 | ### Audio enhancer argument 218 | You can combine --enhancers. The available enhancers are "deepfilternet", "resembleai" and "mayavoz". 219 | 220 | If you pass several enhancers, they are executed in the order they are given. 221 | 222 | By default, no enhancer is used. 223 | 224 | You can combine all of the enhancers on a single input. 225 | 226 | #### Deepfilternet 227 | 228 | I suggest using deepfilternet for noisy audios. It gives the best results when denoising. 229 | 230 | #### Resembleai 231 | The output of resembleai can sometimes be a little distorted, so it is not always a good choice. 232 | It can both denoise and enhance. If you are combining deepfilternet and resembleai, you can disable resembleai's denoising. 233 | 234 | In the case of resembleai, you can play with its parameters in [audio_manager.py](https://github.com/davidmartinrius/speech-dataset-generator/blob/02bfbf7d2ed675472ff8ed15cafeea6188e22bac/speech_dataset_generator/audio_manager/audio_manager.py#L117) 235 | 236 | solver = "midpoint" # options are "rk4", "euler" and "midpoint"; "midpoint" is the default 237 | 238 | denoising = True 239 | 240 | nfe = 128 # range from 1 to 128; if the output sounds like a cassette you can reduce this value 241 | 242 | tau = 0 # range from 0 to 1, better if disabled 243 | 244 | #### Mayavoz 245 | The pretrained mayavoz model only works at a 16000 Hz sampling rate, so it is only recommended if the input source is also at 16000 Hz. 246 | 247 | ### The audio is not always 100% split into sub-files 248 | 249 | An input audio may not be used completely. Here are some reasons: 250 | - The range_times do not fit any transcribed segment. 251 | - The segment has music or not enough quality (MOS under 3), even when enhanced. 252 | 253 | If you are not using enhancers and segments are being discarded because of bad quality, you can try the --enhancers argument with deepfilternet, resembleai, mayavoz, or a combination of them. See the [examples section](#examples) to learn how to use it.
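For reference, the quality gate that discards those segments is the MOSNet check in `speech_dataset_generator/audio_manager/audio_manager.py` (`has_speech_quality`). A condensed sketch of that check, assuming the same 30-second scoring window used there (the helper name `passes_mos_gate` is only illustrative):

```python
import numpy as np
import speechmetrics  # https://github.com/aliutkus/speechmetrics

def passes_mos_gate(path_to_audio_file: str, threshold: float = 3.0) -> bool:
    # MOSNet is evaluated over 30-second windows, as in AudioManager.has_speech_quality
    metrics = speechmetrics.load('absolute', 30)
    scores = metrics(path_to_audio_file)
    # scores['mosnet'] holds one score per window; the project averages them
    return float(np.mean(scores['mosnet'])) >= threshold
```

Segments whose average MOS stays below 3 are discarded even after enhancement.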
254 | 255 | ### Gender detection 256 | 257 | You can use an input audio with multiple speakers and multiple genders. Each speaker is separated into its own fragments, and the gender is identified from those fragments. 258 | 259 | There is an example audio for this case in this project, at ./assets/example_audio_1.mp3 260 | You can try it without writing any code in speech_dataset_generator_example.ipynb 261 | 262 | # Next Steps 263 | 264 | ## External input sources 265 | 266 | - [X] **Youtube** 267 | - [X] **Librivox** 268 | - [X] **Ted talks** 269 | 270 | ## Vector database 271 | 272 | - [X] **Store speaker embeddings in Chroma vector database** 273 | 274 | ## Refactor code 275 | 276 | - [X] Everything was inside main.py; the code needed to be reorganized. 277 | 278 | ## Speech rate 279 | 280 | - [X] Detect the speech rate for each sentence and add it to the csv output file. The metrics are words per minute (wpm) and syllables per minute (spm). 281 | 282 | ## Audio enhancers 283 | 284 | - [X] **deepfilternet** 285 | 286 | - [X] **resembleai** 287 | 288 | - [X] **mayavoz** 289 | 290 | - [ ] **[espnet](https://github.com/espnet/espnet/tree/master?tab=readme-ov-file#se-speech-enhancement-and-separation) speech enhancement** 291 | 292 | ## Docker image 293 | 294 | - [ ] **Create a docker image for ease of use.** 295 | 296 | ### Example of docker usage (image not available yet) 297 | ```bash 298 | docker run -p 4000:80 -e HF_TOKEN=your_hf_token \ 299 | -v /your/local/output/folder:/app/output \ 300 | --gpus all \ 301 | davidmartinrius/speech-dataset-generator \ 302 | --input_file /app/assets/example_audio_1.wav \ 303 | --output_directory /app/output \ 304 | --range_times 4-10 \ 305 | --enhancers deepfilternet resembleai 306 | ``` 307 | ```bash 308 | docker run -p 4000:80 -e HF_TOKEN=your_hf_token \ 309 | -v /your/local/output/folder:/app/output \ 310 | -v /your/audio/file.mp3:/app/file.wav \ 311 | --gpus all \ 312 | davidmartinrius/speech-dataset-generator \ 313 | --input_file /app/file.wav \ 314 | --output_directory /app/output \ 315 | --range_times 4-10 \ 316 | --enhancers deepfilternet resembleai 317 | ``` 318 | 319 | ## Google colab 320 | 321 | - [ ] Add a speech_dataset_generator_example.ipynb file with all available options applied to some noisy audios and good quality audios. 322 | 323 | ## Add age classification and new gender classification 324 | 325 | - [X] https://github.com/Anvarjon/Age-Gender-Classification Finally, this will not be integrated, as it is too imprecise and only works well on its training dataset, not on unseen samples. 326 | 327 | ## Emotion recognition 328 | 329 | - [ ] https://github.com/ddlBoJack/emotion2vec 330 | 331 | ## Upload to PyPi 332 | 333 | - [ ] Still pending. There is an uploaded PyPI package, but it does not work yet. There are some issues with setup.py because some of the required packages are not available on PyPI, and I am still looking for a way to install them. For now, install the package from requirements.txt or setup.py. 334 | 335 | ## Support multiple datasets 336 | 337 | Generator of multiple types of datasets: 338 | 339 | - [X] **LJSpeech** This is the default one. When you generate a new dataset, the LJSpeech format is produced.
It still does not split by train/dev/test, but creates a metadata.csv 340 | 341 | - [X] **[Metavoice-src](https://github.com/metavoiceio/metavoice-src)** Example of the dataset: https://github.com/metavoiceio/metavoice-src/blob/main/datasets/sample_dataset.csv 342 | 343 | - [ ] **LibriSpeech** Currently in development. Work in progress 344 | 345 | 346 | - [ ] **Common Voice 11** 347 | 348 | - [ ] **VoxPopuli** 349 | 350 | - [ ] **TED-LIUM** 351 | 352 | - [ ] **GigaSpeech** 353 | 354 | - [ ] **SPGISpeech** 355 | 356 | - [ ] **Earnings-22** 357 | 358 | - [ ] **AMI** 359 | 360 | - [ ] **VCTK** 361 | 362 | ## Dataset converter. 363 | 364 | For example, from LibriSpeech to Common Voice and vice versa, etc. 365 | 366 | I have to look for a way to extract all the needed features for each dataset type. Also find the best way to divide the dataset into train, dev and test taking into account the input data provided by the user. 367 | 368 | ## Gradio interface 369 | 370 | - [ ] **Generate datasets** 371 | 372 | - [ ] **Dataset converter** 373 | 374 | ## Runpod serverless instance 375 | 376 | In case you do not have a gpu or you want to distribute this as a service. 377 | 378 | runpod is a cloud GPU on demand. It has a good integration with python and docker. Also it has an affordable pricing. 379 | 380 | - [ ] Explain how to create a storage in runpod 381 | - [ ] Create a base install to the storage with a Pod 382 | - [ ] Launch a serverless instance with a Docker instance of this project 383 | - [ ] Call the serverless custom API endpoints to upload files, download generated datasets, convert datasets to other types of datasets, etc 384 | 385 | ## Used packages in this project 386 | This project uses several open-source libraries and tools for audio processing. Special thanks to the contributors of these projects. 387 | 388 | - Python 3.10 389 | 390 | - [whisperx](https://github.com/m-bain/whisperX?tab=readme-ov-file) (v3.1.1) 391 | 392 | - [faster-whisper](https://github.com/SYSTRAN/faster-whisper) (1.0.0) 393 | 394 | - [pydub](https://github.com/jiaaro/pydub) (v0.25.1) 395 | 396 | - [python-dotenv](https://github.com/theskumar/python-dotenv) (v1.0.1) 397 | 398 | - [inaSpeechSegmenter](https://github.com/ina-foss/inaSpeechSegmenter) (v0.7.7) 399 | 400 | - [unsilence](https://github.com/lagmoellertim/unsilence) (v1.0.9) 401 | 402 | - [deepfilternet](https://github.com/Rikorose/DeepFilterNet) 403 | 404 | - [resemble-enhance](https://github.com/resemble-ai/resemble-enhance) (v0.0.1) 405 | 406 | - [speechmetrics](https://github.com/aliutkus/speechmetrics) 407 | 408 | - [pyannote](https://huggingface.co/pyannote) (embedding model and speaker diarization model) 409 | 410 | - [yt-dlp](https://github.com/yt-dlp/yt-dlp) 411 | 412 | - [Chroma](https://github.com/chroma-core/chroma) 413 | 414 | - [mayavoz](https://github.com/shahules786/mayavoz) 415 | 416 | ## License 417 | 418 | If you plan to use this project in yours: [whisperX](https://github.com/m-bain/whisperX?tab=BSD-4-Clause-1-ov-file) is currently under the BSD-4-Clause license, yt-dlp has no license and all others are under the MIT license or Apache 2.0 license. 419 | 420 | This project is licensed under the [MIT License](LICENSE). 421 | 422 | 423 | ## Give it a star ⭐️ 424 | 425 | Did you find this project useful? If so please, consider giving it a star! Your support is greatly appreciated and helps to increase the visibility of the project. Thank you! 
😊 426 | 427 | [![Star History Chart](https://api.star-history.com/svg?repos=davidmartinrius/speech-dataset-generator&type=Date)](https://star-history.com/#davidmartinrius/speech-dataset-generator&Date) 428 | -------------------------------------------------------------------------------- /assets/example_audio_1.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmartinrius/speech-dataset-generator/5142e9779dc06f0ad1540bc151979d92a5627739/assets/example_audio_1.mp3 -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m speech_dataset_generator.main "$@" 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools==65 2 | whisperx==3.1.1 3 | faster-whisper==0.10.1 4 | pydub==0.25.1 5 | python-dotenv==1.0.1 6 | inaSpeechSegmenter==0.7.8 7 | git+https://github.com/davidmartinrius/unsilence.git 8 | deepfilternet==0.5.6 9 | git+https://github.com/davidmartinrius/resemble-enhance.git 10 | git+https://github.com/davidmartinrius/speechmetrics#egg=speechmetrics 11 | yt-dlp 12 | chromadb==0.4.23 13 | pyphen==0.14.0 14 | pypinyin==0.50.0 15 | konlpy==0.6.0 16 | speechbrain==0.5.16 17 | git+https://github.com/davidmartinrius/mayavoz.git 18 | bs4==0.0.2 19 | numpy==1.24 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='speech-dataset-generator', 5 | version='1.0.0', 6 | author='David Martin Rius', 7 | url='https://github.com/davidmartinrius/speech-dataset-generator', 8 | author_email='0991592@gmail.com', # Add your email address 9 | description='🔊 Create labeled datasets, enhance audio quality, identify speakers, support diverse dataset types. 
🎧👥📊 Advanced audio processing.', 10 | long_description=open('README.md').read(), # Add a README.md file for a detailed description 11 | long_description_content_type='text/markdown', 12 | packages=find_packages(), 13 | install_requires=[ 14 | 'whisperx==3.1.1', 15 | 'faster-whisper==0.10.1', 16 | 'pydub==0.25.1', 17 | 'python-dotenv==1.0.1', 18 | 'inaSpeechSegmenter==0.7.8', 19 | 'unsilence @ git+https://github.com/davidmartinrius/unsilence.git', 20 | 'deepfilternet', 21 | 'resemble-enhance @ git+https://github.com/davidmartinrius/resemble-enhance.git', 22 | 'speechmetrics @ git+https://github.com/davidmartinrius/speechmetrics#egg=speechmetrics', 23 | 'yt-dlp', 24 | 'chromadb==0.4.23', 25 | 'pyphen==0.14.0', 26 | 'pypinyin==0.50.0', 27 | 'konlpy==0.6.0', 28 | 'speechbrain==0.5.16', 29 | 'mayavoz @ git+https://github.com/davidmartinrius/mayavoz.git', 30 | 'bs4==0.0.2', 31 | 'numpy==1.24', 32 | ], 33 | entry_points={ 34 | 'console_scripts': [ 35 | 'speech-dataset-generator = speech_dataset_generator.main:main', 36 | ], 37 | }, 38 | python_requires='>=3.10', 39 | project_urls={ 40 | 'Source': 'https://github.com/davidmartinrius/speech-dataset-generator', 41 | 'Issues': 'https://github.com/davidmartinrius/speech-dataset-generator/issues', 42 | }, 43 | ) 44 | -------------------------------------------------------------------------------- /speech_dataset_generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmartinrius/speech-dataset-generator/5142e9779dc06f0ad1540bc151979d92a5627739/speech_dataset_generator/__init__.py -------------------------------------------------------------------------------- /speech_dataset_generator/audio_manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmartinrius/speech-dataset-generator/5142e9779dc06f0ad1540bc151979d92a5627739/speech_dataset_generator/audio_manager/__init__.py -------------------------------------------------------------------------------- /speech_dataset_generator/audio_manager/audio_manager.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import gc 4 | import os 5 | 6 | import speechmetrics #https://github.com/aliutkus/speechmetrics 7 | from unsilence import Unsilence 8 | 9 | #https://github.com/ina-foss/inaSpeechSegmenter # sudo apt-get install ffmpeg 10 | from df.enhance import enhance, init_df, load_audio, save_audio 11 | 12 | import torchaudio 13 | from resemble_enhance.enhancer.inference import denoise, enhance as resemble_enhancer 14 | from scipy.io.wavfile import write 15 | from mayavoz.models import Mayamodel 16 | 17 | from speech_dataset_generator.utils.utils import get_device 18 | 19 | class AudioManager: 20 | 21 | def process(self, input_audio, output_directory, enhancers): 22 | 23 | output_audio_file = self.get_output_file_name(input_audio, output_directory) 24 | self.enhance_audio(input_audio, output_audio_file, enhancers) 25 | 26 | if not self.has_speech_quality(output_audio_file): 27 | return None 28 | 29 | return output_audio_file 30 | 31 | def get_output_file_name(self, input_audio, output_directory): 32 | # Extract the input file name without extension 33 | file_name_without_extension, extension = os.path.splitext(os.path.basename(input_audio)) 34 | 35 | # Create the output file name by adding "_enhanced" suffix 36 | output_file_name = 
f"{file_name_without_extension}_enhanced{extension}" 37 | 38 | enhanced_directory = os.path.join(output_directory, "enhanced_audios") 39 | 40 | if not os.path.exists(enhanced_directory): 41 | os.makedirs(enhanced_directory) 42 | 43 | return os.path.join(enhanced_directory, output_file_name) 44 | 45 | #https://github.com/Rikorose/DeepFilterNet 46 | #alternatives to deepfilternet 47 | #https://github.com/resemble-ai/resemble-enhance 48 | #https://github.com/shahules786/mayavoz 49 | def enhance_audio(self, noisy_audio, output_audio_file, enhancers=["deepfilternet"]): 50 | 51 | temp_output = noisy_audio # Initial audio loading 52 | 53 | for enhancement_type in enhancers: 54 | print(f"enhancing audio with {enhancement_type}") 55 | 56 | if enhancement_type == "deepfilternet": 57 | temp_output = self.enhance_audio_deepfilternet(temp_output, output_audio_file) 58 | elif enhancement_type == "resembleai": 59 | temp_output = self.enhance_audio_resembleai(temp_output, output_audio_file) 60 | elif enhancement_type == "mayavoz": 61 | temp_output = self.enhance_audio_mayavoz(temp_output, output_audio_file) 62 | 63 | self.remove_sliences(temp_output, output_audio_file) 64 | 65 | return temp_output 66 | 67 | def enhance_audio_deepfilternet(self, noisy_audio, output_audio_file): 68 | 69 | model, df_state, _ = init_df() # Load default model 70 | 71 | audio, info = load_audio(noisy_audio, sr=df_state.sr()) 72 | 73 | minutes = 1 # it is easy to edit time in minutes than seconds 74 | seconds = minutes * 60 75 | 76 | # Split audio into 5min chunks 77 | audio_chunks = [audio[:, i:i + seconds * info.sample_rate] 78 | for i in range(0, audio.shape[1], seconds * info.sample_rate)] 79 | 80 | enhanced_chunks = [] 81 | for ac in audio_chunks: 82 | enhanced_chunks.append(enhance(model, df_state, ac)) 83 | 84 | enhanced = torch.cat(enhanced_chunks, dim=1) 85 | 86 | assert enhanced.shape == audio.shape, 'Enhanced audio shape does not match original audio shape.' 87 | 88 | save_audio(output_audio_file, enhanced, sr=df_state.sr()) 89 | 90 | # Free memory after inference 91 | del model, df_state, audio, info, enhanced_chunks, enhanced 92 | torch.cuda.empty_cache() 93 | gc.collect() 94 | 95 | return output_audio_file 96 | 97 | def split_audio_into_chunks(self, audio, chunk_size): 98 | """ 99 | Split audio into chunks of specified size. 100 | 101 | Parameters: 102 | audio (np.ndarray): The audio data. 103 | chunk_size (int): Size of each chunk in samples. 104 | 105 | Returns: 106 | List[np.ndarray]: List of audio chunks. 
107 | """ 108 | num_samples = len(audio) 109 | num_chunks = num_samples // chunk_size 110 | chunks = [audio[i * chunk_size: (i + 1) * chunk_size] for i in range(num_chunks)] 111 | return chunks 112 | 113 | def enhance_audio_resembleai(self, noisy_audio, output_audio_file): 114 | 115 | solver = "midpoint" #rk4, euler, midpoint 116 | denoising = True 117 | nfe = 128 118 | tau = 0 119 | chunk_duration = 20 # 1 minute in seconds 120 | 121 | if noisy_audio is None: 122 | return None, None 123 | 124 | solver = solver.lower() 125 | nfe = int(nfe) 126 | lambd = 0.9 if denoising else 0.1 127 | 128 | # Load the entire audio file 129 | dwav, sr = torchaudio.load(noisy_audio) 130 | dwav = dwav.mean(dim=0) 131 | 132 | device = get_device() 133 | 134 | # Calculate chunk size based on duration 135 | chunk_size = int(sr * chunk_duration) 136 | 137 | # Split the audio into chunks 138 | audio_chunks = self.split_audio_into_chunks(dwav.cpu().numpy(), chunk_size) 139 | 140 | enhanced_chunks = [] 141 | 142 | # For Docker 143 | run_dir = "/app/model_repo/enhancer_stage2/" 144 | 145 | if not os.path.exists(run_dir): 146 | run_dir = None 147 | 148 | for chunk in audio_chunks: 149 | 150 | chunk_tensor = torch.tensor(chunk) 151 | 152 | # Apply enhancement to each chunk 153 | wav2_chunk, new_sr = resemble_enhancer( 154 | chunk_tensor, 155 | sr, 156 | device, 157 | nfe=nfe, 158 | solver=solver, 159 | lambd=lambd, 160 | tau=tau, 161 | #run_dir=run_dir 162 | ) 163 | 164 | # Save the enhanced chunk to the list 165 | enhanced_chunks.append(wav2_chunk) 166 | 167 | # Concatenate all enhanced chunks 168 | enhanced_audio = np.concatenate(enhanced_chunks) 169 | 170 | # Write the concatenated enhanced audio to the output file 171 | write(output_audio_file, new_sr, enhanced_audio) 172 | 173 | return output_audio_file 174 | 175 | def enhance_audio_mayavoz(self, noisy_audio, output_audio_file): 176 | 177 | model = Mayamodel.from_pretrained("shahules786/mayavoz-waveunet-valentini-28spk") 178 | waveform = model.enhance(noisy_audio) 179 | 180 | # this model only works with this sampling rate 181 | sr = 16000 182 | 183 | write( 184 | output_audio_file, rate=sr, data=waveform.detach().cpu().numpy() 185 | ) 186 | 187 | # Free memory after inference 188 | del model 189 | torch.cuda.empty_cache() 190 | gc.collect() 191 | 192 | return output_audio_file 193 | 194 | # https://github.com/lagmoellertim/unsilence 195 | def remove_sliences(self, path_to_audio_file, output_audio_file): 196 | 197 | print("removing silences") 198 | u = Unsilence(path_to_audio_file) 199 | 200 | u.detect_silence() 201 | 202 | #rewrite the file with no silences 203 | u.render_media(output_audio_file, audio_only=True) # Audio only specified 204 | 205 | # Free memory after inference 206 | del u 207 | torch.cuda.empty_cache() 208 | gc.collect() 209 | 210 | #https://github.com/aliutkus/speechmetrics 211 | def has_speech_quality(self, path_to_audio_file): 212 | 213 | window_length = 30 # seconds 214 | metrics = speechmetrics.load('absolute', window_length) 215 | 216 | scores = metrics(path_to_audio_file) 217 | 218 | average_scores = {} 219 | 220 | for metric_name, scores_array in scores.items(): 221 | # Calculate the average of the array/list 222 | average_score = np.mean(scores_array) 223 | 224 | # Print the result 225 | #print(f"Average {metric_name} score: {average_score}") 226 | 227 | average_scores[metric_name] = average_score 228 | 229 | mos_value = average_scores['mosnet'] 230 | if mos_value >= 3: 231 | return True 232 | 233 | print(f"Discarding audio 
{path_to_audio_file}. Not enough quality. MOS {mos_value} < 3") 234 | 235 | # Free memory after inference 236 | del metrics 237 | torch.cuda.empty_cache() 238 | gc.collect() 239 | 240 | return False 241 | 242 | def has_music(self, segmentation): 243 | 244 | labels = [item[0] for item in segmentation if item[0] in ('music')] 245 | 246 | if 'music' in labels: 247 | return True 248 | 249 | return False 250 | -------------------------------------------------------------------------------- /speech_dataset_generator/audio_processor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmartinrius/speech-dataset-generator/5142e9779dc06f0ad1540bc151979d92a5627739/speech_dataset_generator/audio_processor/__init__.py -------------------------------------------------------------------------------- /speech_dataset_generator/audio_processor/audio_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yt_dlp 3 | import chromadb 4 | import os 5 | import requests 6 | from bs4 import BeautifulSoup 7 | from urllib.parse import urlparse, unquote 8 | import secrets 9 | import string 10 | 11 | from speech_dataset_generator.dataset_generator.dataset_generator import DatasetGenerator 12 | 13 | def generate_random_string(length): 14 | alphabet = string.ascii_letters + string.digits 15 | return ''.join(secrets.choice(alphabet) for _ in range(length)) 16 | 17 | def get_local_audio_files(input_folder): 18 | all_files = os.listdir(input_folder) 19 | return [os.path.join(input_folder, file) for file in all_files if file.lower().endswith(('.mp3', '.wav', '.flac', '.ogg', '.aac', '.wma'))] 20 | 21 | def get_youtube_audio_files(urls, output_directory): 22 | 23 | downloaded_files = [] 24 | if not urls: 25 | return downloaded_files 26 | 27 | youtube_files_output_directory = os.path.join(output_directory, "youtube_files") 28 | 29 | if not os.path.exists(youtube_files_output_directory): 30 | os.makedirs(youtube_files_output_directory) 31 | 32 | for url in urls: 33 | audio_file_name = generate_random_string(24) + '.wav' 34 | 35 | output_template = os.path.join(youtube_files_output_directory, audio_file_name) 36 | 37 | ydl_opts = { 38 | 'format': 'bestaudio/best', 39 | 'extractaudio': True, 40 | 'audioformat': 'wav', 41 | 'outtmpl': output_template, 42 | } 43 | 44 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 45 | ydl.download([url]) 46 | 47 | downloaded_files.append(output_template) 48 | 49 | return downloaded_files 50 | 51 | def get_librivox_audio_files(urls, output_directory): 52 | 53 | downloaded_files = [] 54 | if not urls: 55 | return downloaded_files 56 | 57 | librivox_files_output_directory = os.path.join(output_directory, "librivox") 58 | 59 | headers = { 60 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 61 | 'sec-ch-ua': '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"', 62 | 'sec-ch-ua-mobile': '?0', 63 | 'sec-ch-ua-platform': '"Linux"' 64 | } 65 | 66 | if not os.path.exists(librivox_files_output_directory): 67 | os.makedirs(librivox_files_output_directory) 68 | 69 | for url in urls: 70 | 71 | parsed_url = urlparse(url) 72 | headers['GET'] = f'{parsed_url.path} HTTP/1.1' 73 | 74 | page = requests.get(url) 75 | soup = BeautifulSoup(page.content, 'html.parser') 76 | results = soup.find_all(class_='chapter-name') 77 | 78 | hrefs = [c['href'] for c in results] 79 | 80 | print('found {} 
chapters to download'.format(len(hrefs))) 81 | 82 | for audio in hrefs: 83 | 84 | audio_file_name = generate_random_string(24) + '.wav' 85 | 86 | audio_path = os.path.join(librivox_files_output_directory, audio_file_name) 87 | 88 | if os.path.exists(audio_path): 89 | print(f"Already exists: {audio_file_name}") 90 | continue 91 | 92 | print('Downloading {} to:'.format(audio),audio_path) 93 | file = requests.get(audio, headers=headers) 94 | with open(audio_path, 'wb') as f: 95 | f.write(file.content) 96 | 97 | downloaded_files.append(audio_path) 98 | 99 | return downloaded_files 100 | 101 | def get_tedtalks_audio_files(urls, output_directory): 102 | 103 | downloaded_files = [] 104 | if not urls: 105 | return downloaded_files 106 | 107 | tedtalks_files_output_directory = os.path.join(output_directory, "tedtalks") 108 | 109 | if not os.path.exists(tedtalks_files_output_directory): 110 | os.makedirs(tedtalks_files_output_directory) 111 | 112 | for url in urls: 113 | 114 | print("processing ted talk", url) 115 | 116 | filename = generate_random_string(24) + ".wav" 117 | 118 | response = requests.get(url) 119 | 120 | if response.status_code == 200: 121 | 122 | audio_path = os.path.join(tedtalks_files_output_directory, filename) 123 | 124 | with open(audio_path, 'wb') as audio_file: 125 | audio_file.write(response.content) 126 | 127 | print(f"Downloaded '{url}' to '{audio_path}'") 128 | else: 129 | print(f"Failed to download the audio file from url {url}.") 130 | 131 | downloaded_files.append(audio_path) 132 | 133 | return downloaded_files 134 | 135 | def process_audio_files(audio_files, output_directory, start, end, enhancers, datasets): 136 | 137 | dataset_generator = DatasetGenerator() 138 | 139 | client = chromadb.PersistentClient(path=os.path.join(output_directory, "chroma_database")) 140 | collection = client.get_or_create_collection(name="speakers") 141 | 142 | for audio_file in audio_files: 143 | print("Processing:", audio_file) 144 | dataset_generator.process(audio_file, output_directory, start, end, enhancers, collection, datasets) -------------------------------------------------------------------------------- /speech_dataset_generator/dataset_generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmartinrius/speech-dataset-generator/5142e9779dc06f0ad1540bc151979d92a5627739/speech_dataset_generator/dataset_generator/__init__.py -------------------------------------------------------------------------------- /speech_dataset_generator/dataset_generator/dataset_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import torch 4 | import gc 5 | import time 6 | import os 7 | import random 8 | from dotenv import load_dotenv 9 | from collections import Counter 10 | 11 | import whisperx 12 | from pydub import AudioSegment 13 | from pyannote.audio import Model 14 | from scipy.spatial.distance import cdist 15 | from pyannote.audio import Inference 16 | 17 | #https://github.com/ina-foss/inaSpeechSegmenter # sudo apt-get install ffmpeg 18 | from inaSpeechSegmenter import Segmenter 19 | 20 | from scipy.spatial.distance import cdist 21 | 22 | from speech_dataset_generator.audio_manager.audio_manager import AudioManager 23 | from speech_dataset_generator.utils.utils import get_device 24 | from speech_dataset_generator.speech_rate.speech_rate import SpeechRate 25 | import shutil 26 | import csv 27 | 28 | load_dotenv() 29 | 30 | # 
Access the HF_TOKEN environment variable 31 | HF_TOKEN = os.environ.get("HF_TOKEN") 32 | 33 | # If HF_TOKEN is not set, try getting it from the OS environment 34 | if HF_TOKEN is None: 35 | HF_TOKEN = os.getenv("HF_TOKEN") 36 | 37 | # Check if HF_TOKEN is set 38 | if not HF_TOKEN: 39 | raise ValueError("HF_TOKEN is not set. Please set the environment variable.") 40 | 41 | class DatasetGenerator: 42 | 43 | def create_audio_segment(self, start, end, audio_file, wavs_directory): 44 | 45 | ts = str(int(time.time())) 46 | 47 | #file_name = os.path.join(path_to_store_audio, ts_encoded + str(random.getrandbits(128)) + ".wav") 48 | file_name = os.path.join(wavs_directory, ts + "_" + self.generate_random_number_as_string(24) + ".wav") 49 | 50 | t1 = start * 1000 51 | t2 = end * 1000 52 | 53 | extension = audio_file[-3:] 54 | 55 | if extension == "mp3": 56 | newAudio = AudioSegment.from_mp3(audio_file) 57 | elif extension == "m4a": 58 | newAudio = AudioSegment.from_file(audio_file) 59 | else: 60 | newAudio = AudioSegment.from_wav(audio_file) 61 | 62 | newAudio = newAudio[t1:t2] 63 | 64 | newAudio = newAudio.set_frame_rate(22050) 65 | newAudio = newAudio.set_channels(1) 66 | newAudio.export(file_name, format="wav") 67 | 68 | return file_name 69 | 70 | def write_main_data_to_csv(self, transcription, csv_file_name, language): 71 | 72 | header = ['text', 'audio_file', 'speaker_id', 'gender', 'duration', 'language', 'syllables_per_minute', 'words_per_minute'] 73 | with open(csv_file_name, 'w', encoding='utf-8', newline='') as csvFile: 74 | csv_writer = csv.DictWriter(csvFile, fieldnames=header) 75 | csv_writer.writeheader() 76 | 77 | for segment in transcription["segments"]: 78 | newData = { 79 | 'text': segment["text"], 80 | 'audio_file': segment["audio_file"], 81 | 'speaker_id': segment["generated_speaker_name"], 82 | 'gender': segment["dominant_gender"], 83 | 'duration': segment["duration"], 84 | 'language': language, 85 | 'syllables_per_minute': segment["syllables_per_minute"], 86 | 'words_per_minute': segment["words_per_minute"], 87 | } 88 | 89 | with open(csv_file_name, 'a', encoding='utf-8', newline='') as csvFile: 90 | 91 | csv_writer = csv.DictWriter(csvFile, fieldnames=header) 92 | 93 | csv_writer.writerow(newData) 94 | 95 | def write_data_to_csv_ljspeech(self, transcription, csv_file_name): 96 | 97 | with open(csv_file_name, 'w', encoding='utf-8', newline='') as csvFile: 98 | 99 | csv_writer = csv.writer(csvFile, delimiter='|') 100 | 101 | for segment in transcription["segments"]: 102 | 103 | audio_file_name, _ = os.path.splitext(os.path.basename(segment["audio_file"])) 104 | 105 | text = segment["text"] 106 | 107 | csv_writer.writerow([audio_file_name, text, text]) 108 | 109 | def generate_random_number_as_string(self, digits): 110 | finalNumber = "" 111 | for i in range(digits // 16): 112 | finalNumber = finalNumber + str(math.floor(random.random() * 10000000000000000)) 113 | finalNumber = finalNumber + str(math.floor(random.random() * (10 ** (digits % 16)))) 114 | return str(finalNumber) 115 | 116 | def get_speaker_info(self, collection, audio_embeddings_infencer, file_name): 117 | 118 | current_speaker_embedding = audio_embeddings_infencer(file_name) 119 | 120 | # Normalize embeddings 121 | current_speaker_embedding = (current_speaker_embedding / np.linalg.norm(current_speaker_embedding)).tolist() 122 | 123 | results = collection.query( 124 | query_embeddings=[current_speaker_embedding], 125 | n_results=1, 126 | include=["metadatas", "distances", "embeddings"] 127 | ) 128 | 129 | if not 
results["distances"][0]: 130 | 131 | speaker_name = self.generate_random_number_as_string(24) 132 | 133 | collection.add( 134 | embeddings=np.array([current_speaker_embedding]), 135 | metadatas=[{"speaker_name": speaker_name }], 136 | ids=[speaker_name] 137 | ) 138 | 139 | return speaker_name 140 | 141 | distance = cdist([current_speaker_embedding], [results["embeddings"][0][0]], metric="cosine")[0,0] 142 | 143 | if distance < 0.15: 144 | 145 | speaker_name = results["metadatas"][0][0]["speaker_name"] 146 | 147 | return speaker_name 148 | else: 149 | 150 | speaker_name = self.generate_random_number_as_string(24) 151 | 152 | collection.add( 153 | embeddings=np.array([current_speaker_embedding]), 154 | metadatas=[{"speaker_name": speaker_name }], 155 | ids=[speaker_name] 156 | ) 157 | 158 | return speaker_name 159 | 160 | def get_gender(self, segmentation): 161 | 162 | labels = [item[0] for item in segmentation if item[0] in ('male', 'female')] 163 | 164 | if 'male' in labels: 165 | return 'male' 166 | elif 'female' in labels: 167 | return 'female' 168 | else: 169 | return 'no_gender' 170 | 171 | def get_transcription(self, enhanced_audio_file_path): 172 | device = get_device() 173 | batch_size = 8 # reduce if low on GPU mem 174 | #compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy) 175 | compute_type="int8" 176 | 177 | # 1. Transcribe with original whisper (batched) 178 | model = whisperx.load_model("large-v3", device, compute_type=compute_type) 179 | 180 | audio = whisperx.load_audio(enhanced_audio_file_path) 181 | result = model.transcribe(audio, batch_size=batch_size) 182 | 183 | language = result["language"] 184 | 185 | model_a, metadata = whisperx.load_align_model(language_code=language, device=device) 186 | result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) 187 | 188 | diarize_model = whisperx.DiarizationPipeline(model_name='pyannote/speaker-diarization@2.1', use_auth_token=HF_TOKEN, device=device) 189 | diarize_segments = diarize_model(audio) 190 | 191 | result = whisperx.assign_word_speakers(diarize_segments, result) 192 | 193 | # this is needed to fix some items that only have word, but don't have start, end, score and speaker. 
194 | for i, segment in enumerate(result["segments"]): 195 | wordlevel_info = [] 196 | for iw, word in enumerate(segment["words"]): 197 | 198 | if any(key not in word for key in ["start", "end", "speaker"]): 199 | 200 | if iw-1 >= 0: # Check if iw-1 is a valid index 201 | word["start"] = round(segment["words"][iw-1]["end"] + 0.001, 3) 202 | word['score'] = segment["words"][iw-1]["score"] 203 | word['speaker'] = segment["words"][iw-1]["speaker"] 204 | elif i-1 >= 0: 205 | # Use the last word of the previous segment 206 | word["start"] = round(result["segments"][i-1]["words"][-1]["end"] + 0.001, 3) 207 | word["score"] = result["segments"][i-1]["words"][-1]["score"] 208 | word["speaker"] = result["segments"][i-1]["words"][-1]["speaker"] 209 | else: 210 | word["start"] = 0.001 211 | word["score"] = 1 212 | word["speaker"] = segment["speaker"] 213 | 214 | if iw+1 < len(segment["words"]) and 'start' in segment["words"][iw+1]: # Check if iw+1 is a valid index 215 | word["end"] = round(segment["words"][iw+1]["start"] - 0.001, 3) 216 | word["score"] = segment["words"][iw+1]["score"] 217 | word["speaker"] = segment["words"][iw+1]["speaker"] 218 | elif i+1 < len(result["segments"]) and 'start' in result["segments"][i+1]["words"][0]: 219 | # Use the first word of the next segment 220 | word["end"] = round(result["segments"][i+1]["words"][0]["start"] - 0.001, 3) 221 | word["score"] = result["segments"][i+1]["words"][0]["score"] 222 | word["speaker"] = result["segments"][i+1]["words"][0]["speaker"] 223 | else: 224 | word["end"] = 0.001 225 | word["score"] = 1 226 | word["speaker"] = segment["speaker"] 227 | 228 | if "speaker" not in word: 229 | word["speaker"] = segment["speaker"] 230 | 231 | wordlevel_info.append({ 232 | 'word': word["word"], 233 | 'start': word["start"], 234 | 'end': word["end"], 235 | 'speaker': word['speaker'], 236 | 'score': word['score'] 237 | }) 238 | 239 | segment["words"] = wordlevel_info 240 | 241 | fixed_segments = [] 242 | for segment in result["segments"]: 243 | 244 | current_speaker = None 245 | current_words = [] 246 | 247 | # Iterate over each word in the segment 248 | for word in segment["words"]: 249 | speaker = word["speaker"] 250 | 251 | if not current_speaker or current_speaker == speaker: 252 | current_words.append(word) 253 | else: 254 | fixed_segments.append({ 255 | "speaker": current_speaker, 256 | "start": current_words[0]["start"] if current_words else None, 257 | "end": current_words[-1]["end"] if current_words else None, 258 | "text": " ".join(w["word"] for w in current_words), 259 | "words": current_words, 260 | }) 261 | 262 | # Start a new list for the current speaker 263 | current_words = [word] 264 | 265 | current_speaker = speaker 266 | 267 | # Save the words for the last speaker in the segment 268 | if current_speaker is not None: 269 | fixed_segments.append({ 270 | "speaker": current_speaker, 271 | "start": current_words[0]["start"] if current_words else None, 272 | "end": current_words[-1]["end"] if current_words else None, 273 | "text": " ".join(w["word"] for w in current_words), 274 | "words": current_words, 275 | }) 276 | 277 | result["segments"] = fixed_segments 278 | 279 | del model; gc.collect(); torch.cuda.empty_cache() 280 | del diarize_segments 281 | 282 | return result, language 283 | 284 | def add_wpm_spm_to_each_segment(self, transcription, language): 285 | 286 | speech_rate_instance = SpeechRate() 287 | for segment in transcription["segments"]: 288 | 289 | duration = segment['duration'] 290 | 291 | word_list = [word_info['word'] for 
word_info in segment['words']] 292 | 293 | syllables_per_minute = speech_rate_instance.get_syllables_per_minute(word_list, language, duration) 294 | words_per_minute = speech_rate_instance.get_words_per_minute(word_list, duration) 295 | 296 | segment['syllables_per_minute'] = syllables_per_minute 297 | segment['words_per_minute'] = words_per_minute 298 | 299 | return transcription 300 | 301 | def get_existing_speakers(self, transcription, collection): 302 | 303 | existing_speakers = {} 304 | 305 | for segment in transcription["segments"]: 306 | 307 | speaker = segment["speaker"] 308 | duration = segment["end"] - segment["start"] 309 | 310 | if speaker not in existing_speakers or duration > existing_speakers[speaker]["end"] - existing_speakers[speaker]["start"]: 311 | existing_speakers[speaker] = { 312 | "speaker": speaker, 313 | "audio_file": segment["audio_file"], 314 | "start": segment["start"], 315 | "end": segment["end"], 316 | "gender": segment["gender"] 317 | } 318 | 319 | model = Model.from_pretrained("pyannote/embedding", use_auth_token=HF_TOKEN) 320 | audio_embeddings_infencer = Inference(model, window="whole") 321 | 322 | #for existing_speaker in existing_speakers: 323 | for speaker, existing_speaker in existing_speakers.items(): 324 | 325 | generated_speaker_name = self.get_speaker_info(collection, audio_embeddings_infencer, existing_speaker["audio_file"]) 326 | 327 | existing_speaker["generated_speaker_name"] = generated_speaker_name 328 | 329 | return existing_speakers 330 | 331 | # The dominant gender is used because gender inference sometimes fails for individual segments, 332 | # so aggregating the predictions across all of a speaker's segments corrects those errors. 333 | # The speaker name is given by the chroma database in case the speaker already exists 334 | def assign_name_and_dominant_gender_to_each_speaker(self, transcription, existing_speakers): 335 | 336 | # Create a dictionary to store gender counts for each speaker 337 | speaker_gender_counts = {} 338 | 339 | for segment in transcription["segments"]: 340 | speaker_id = segment["speaker"] 341 | gender = segment["gender"] 342 | 343 | # Initialize the gender counts for the speaker if not already present 344 | if speaker_id not in speaker_gender_counts: 345 | speaker_gender_counts[speaker_id] = Counter() 346 | 347 | # Increment the gender count for the speaker 348 | speaker_gender_counts[speaker_id][gender] += 1 349 | 350 | # Assign the generated speaker name and dominant gender to each segment 351 | for segment in transcription["segments"]: 352 | speaker_id = segment["speaker"] 353 | gender_counts = speaker_gender_counts.get(speaker_id, Counter()) 354 | 355 | # Get the gender with the highest count 356 | dominant_gender = max(gender_counts, key=gender_counts.get) 357 | 358 | # Assign the generated speaker name and gender to the segment 359 | segment["generated_speaker_name"] = existing_speakers[speaker_id]["generated_speaker_name"] 360 | segment["dominant_gender"] = dominant_gender 361 | 362 | return transcription 363 | 364 | # This method adds new values such as audio_file, gender and duration to each segment
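# Segments are filtered before being exported: segments whose duration falls outside the requested range,
# segments that contain music, and segments that fail the speech-quality check are discarded; the remaining
# ones are exported as individual wav files and annotated with audio_file, gender and duration.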
365 | def filter_transcription_segments_and_assign_values(self, transcription, range_start, range_end, enhanced_audio_file_path, wavs_directory): 366 | 367 | seg = Segmenter() 368 | valid_segments = [] 369 | for segment in transcription["segments"]: 370 | start = segment["start"] 371 | end = segment["end"] 372 | 373 | duration = segment["end"] - segment["start"] 374 | 375 | if duration < range_start or duration > range_end: 376 | print(f"Audio duration outside the allowed range. Segment from {start} to {end}. Duration {duration}. Audio file: {enhanced_audio_file_path}") 377 | continue 378 | 379 | file_name = self.create_audio_segment(start, end, enhanced_audio_file_path, wavs_directory) 380 | 381 | segmentation = seg(file_name) 382 | has_music = self.audio_manager_instance.has_music(segmentation) 383 | 384 | if has_music: 385 | print(f"Audio has music. Discarded from {start} to {end} of {enhanced_audio_file_path}") 386 | os.remove(file_name) 387 | continue 388 | 389 | # Verify the quality of the audio here 390 | has_quality = self.audio_manager_instance.has_speech_quality(file_name) 391 | 392 | if not has_quality: 393 | print(f"Audio does not have enough quality. Discarded from {start} to {end} of {enhanced_audio_file_path}") 394 | os.remove(file_name) 395 | continue 396 | 397 | gender = self.get_gender(segmentation) 398 | 399 | segment["audio_file"] = file_name 400 | segment["gender"] = gender 401 | segment["duration"] = round(duration, 3) 402 | 403 | valid_segments.append(segment) 404 | 405 | transcription["segments"] = valid_segments 406 | 407 | return transcription 408 | 409 | def process(self, path_to_audio_file, output_directory, range_start, range_end, enhancers, collection, datasets): 410 | 411 | # STEPS 412 | # check the audio quality of the whole file 413 | # transcribe 414 | # check where the speech is 415 | # check the quality of each individual audio file 416 | # get the number of speakers (only one can be speaking at a time) and identify each one with chromadb embeddings 417 | # cluster voices (an improvement, TODO in the near future) 418 | # discard audios that are only music or have too poor quality 419 | # discard the parts of the audio that are music if there is speech too 420 | 421 | ljspeech_directory = os.path.join(output_directory, "ljspeech") 422 | 423 | self.audio_manager_instance = AudioManager() 424 | 425 | wavs_directory = os.path.join(ljspeech_directory, 'wavs') 426 | if not os.path.exists(wavs_directory): 427 | os.makedirs(wavs_directory, exist_ok=True) 428 | 429 | if not os.path.exists(path_to_audio_file): 430 | raise Exception(f"File {path_to_audio_file} does not exist") 431 | 432 | enhanced_audio_file_path = self.audio_manager_instance.process(path_to_audio_file, output_directory, enhancers) 433 | 434 | if not enhanced_audio_file_path: 435 | return 436 | 437 | transcription, language = self.get_transcription(enhanced_audio_file_path) 438 | 439 | transcription = self.filter_transcription_segments_and_assign_values(transcription, range_start, range_end, enhanced_audio_file_path, wavs_directory) 440 | 441 | # words_per_minute and syllables_per_minute 442 | transcription = self.add_wpm_spm_to_each_segment(transcription, language) 443 | 444 | existing_speakers = self.get_existing_speakers(transcription, collection) 445 | 446 | transcription = self.assign_name_and_dominant_gender_to_each_speaker(transcription, existing_speakers) 447 | 448 | csv_file_name = os.path.join(output_directory, "main_data.csv") 449 | self.write_main_data_to_csv(transcription, csv_file_name, language) 450 | 451 | csv_file_name = os.path.join(ljspeech_directory, "metadata.csv") 452 | self.write_data_to_csv_ljspeech(transcription, csv_file_name) 453 | 454 | self.iterate_datasets(datasets, transcription, output_directory, path_to_audio_file, existing_speakers) 455 | 456 | def iterate_datasets(self, datasets, transcription, output_directory, path_to_audio_file, existing_speakers): 457 | 458 | for dataset in datasets: 459 | # Dynamically
call the function based on the dataset name 460 | function_name = f"{dataset}_dataset_generator" 461 | 462 | if hasattr(self, function_name): 463 | function_to_call = getattr(self, function_name) 464 | function_to_call(transcription, output_directory, path_to_audio_file, existing_speakers) 465 | else: 466 | print(f"No matching function found for dataset: {dataset}") 467 | 468 | def metavoice_dataset_generator(self, transcription, output_directory, path_to_audio_file, existing_speakers): 469 | 470 | metavoice_directory = os.path.join(output_directory, 'metavoice') 471 | 472 | metavoice_data_directory = os.path.join(output_directory, 'metavoice', 'data') 473 | 474 | os.makedirs(metavoice_data_directory, exist_ok=True) 475 | 476 | # CSV file path 477 | csv_file_path = os.path.join(metavoice_directory, 'metavoice_dataset.csv') 478 | 479 | # Check if the CSV file exists 480 | csv_exists = os.path.exists(csv_file_path) 481 | 482 | with open(csv_file_path, 'a', newline='') as csvfile: 483 | fieldnames = ['audio_files', 'captions'] 484 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='|') 485 | 486 | # Write headers only if CSV file doesn't exist 487 | if not csv_exists: 488 | writer.writeheader() 489 | 490 | for segment in transcription["segments"]: 491 | 492 | shutil.copy(segment["audio_file"], metavoice_data_directory) 493 | 494 | basename_without_extension = os.path.splitext(os.path.basename(segment["audio_file"]))[0] 495 | 496 | transcription_file_path = os.path.join(metavoice_data_directory, f'{basename_without_extension}.txt') 497 | 498 | with open(transcription_file_path, 'w', encoding='utf-8') as transcription_file: 499 | transcription_file.write(segment["text"]) 500 | 501 | # Write data to CSV file 502 | writer.writerow({'audio_files': segment["audio_file"], 'captions': transcription_file_path}) 503 | 504 | #Work in progress 505 | def librispeech_dataset_generator(self, transcription, output_directory, path_to_audio_file, existing_speakers): 506 | 507 | librispeech_directory = os.path.join(output_directory, 'librispeech') 508 | 509 | # Path is librispeech_directory/generated_speaker_name/audio_id/ 510 | # Inside audio_id there is the transcription and the audio files 511 | # The transcription file has n lines with: 512 | # generated_speaker_name-audio_id-number_of_audio transcription 513 | # The transcription file name is speaker_id-book_id.trans.txt 514 | 515 | filename = os.path.basename(path_to_audio_file) 516 | 517 | # Strip the file extension 518 | filename_without_extension, _ = os.path.splitext(filename) 519 | 520 | # Make it lowercase and remove non-alphabetic characters 521 | cleaned_folder_name = ''.join(char.lower() for char in filename_without_extension if char.isalpha()) 522 | 523 | for existing_speaker in existing_speakers.values(): 524 | current_speaker_audio_directory = os.path.join(librispeech_directory, existing_speaker["generated_speaker_name"], cleaned_folder_name) 525 | 526 | os.makedirs(current_speaker_audio_directory, exist_ok=True) 527 | 528 | for segment in transcription["segments"]: 529 | 530 | current_speaker_audio_directory = os.path.join(librispeech_directory, segment["generated_speaker_name"], cleaned_folder_name) 531 | 532 | speaker_id = segment["generated_speaker_name"] 533 | book_id = cleaned_folder_name 534 | 535 | file_extension = os.path.splitext(segment["audio_file"])[1] 536 | 537 | max_number = -1 538 | for filename in os.listdir(current_speaker_audio_directory): 539 | if filename.startswith(f"{speaker_id}-{book_id}-") and not 
filename.lower().endswith('.txt'): 540 | 541 | current_number = int(filename.rsplit('-', 1)[1].rsplit('.', 1)[0]) 542 | max_number = max(max_number, current_number) 543 | 544 | new_number = max_number + 1 545 | 546 | new_filename = f"{speaker_id}-{book_id}-{new_number}{file_extension}" 547 | 548 | new_full_path = os.path.join(current_speaker_audio_directory, new_filename) 549 | 550 | shutil.copy(segment["audio_file"], new_full_path) 551 | 552 | new_file_data = f"{speaker_id}-{book_id}-{new_number}{file_extension}" 553 | 554 | transcription_file_path = os.path.join(current_speaker_audio_directory, f"{speaker_id}-{book_id}.trans.txt") 555 | with open(transcription_file_path, 'a', encoding='utf-8') as transcription_file: 556 | transcription_file.write(f"{new_file_data} {segment['text']}\n") 557 | -------------------------------------------------------------------------------- /speech_dataset_generator/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from speech_dataset_generator.audio_processor.audio_processor import process_audio_files, get_local_audio_files, get_youtube_audio_files, get_librivox_audio_files, get_tedtalks_audio_files 4 | 5 | def parse_range(value): 6 | try: 7 | start, end = map(int, value.split('-')) 8 | return start, end 9 | except ValueError: 10 | raise argparse.ArgumentTypeError("Invalid range format. Please use 'start-end'.") 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description='') 14 | 15 | group = parser.add_mutually_exclusive_group() 16 | group.add_argument('--input_file_path', type=str, help='Path to the input audio file.') 17 | group.add_argument('--input_folder', type=str, help='Path to the input folder containing audio files.') 18 | 19 | parser.add_argument("--youtube_download", nargs="*", help="YouTube playlist or video URLs") 20 | parser.add_argument("--librivox_download", nargs="*", help="Librivox audiobook URLs") 21 | parser.add_argument("--tedtalks_download", nargs="*", help="Ted Talks audio URLs") 22 | 23 | parser.add_argument("--output_directory", type=str, help="Output directory for audio files", default=".") 24 | parser.add_argument('--range_times', nargs='?', type=parse_range, default=(4, 10), help='Specify a range of two integers in the format "start-end". Default is 4-10.') 25 | parser.add_argument('--enhancers', nargs='+', default=[], help='You can also combine enhancers: --enhancers deepfilternet resembleai. They will be executed in the order you write them. By default no enhancer is enabled.') 26 | 27 | parser.add_argument('--datasets', nargs='+', type=str, choices=['librispeech','metavoice'], help='Specify the dataset type. LJSpeech is always generated. You can also generate LibriSpeech and MetaVoice.')
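# Example invocation (illustrative paths and options; see speech_dataset_generator_example.ipynb for more):
#   python speech_dataset_generator/main.py --input_file_path ./assets/example_audio_1.mp3 \
#       --output_directory ./outputs --range_times 4-10 --enhancers deepfilternet --datasets librispeech metavoice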
28 | 29 | args = parser.parse_args() 30 | 31 | input_file_path = args.input_file_path 32 | input_folder = args.input_folder 33 | youtube_download = args.youtube_download 34 | librivox_download = args.librivox_download 35 | tedtalks_download = args.tedtalks_download 36 | output_directory = args.output_directory 37 | start, end = args.range_times 38 | enhancers = args.enhancers 39 | datasets = args.datasets 40 | 41 | if not any([input_file_path, input_folder, youtube_download, librivox_download, tedtalks_download]): 42 | raise Exception("At least one input is needed: --input_file_path or --input_folder or --youtube_download or --librivox_download or --tedtalks_download") 43 | 44 | youtube_audio_files = get_youtube_audio_files(youtube_download, output_directory) 45 | librivox_audio_files = get_librivox_audio_files(librivox_download, output_directory) 46 | tedtalks_audio_files = get_tedtalks_audio_files(tedtalks_download, output_directory) 47 | 48 | audio_files = [] 49 | 50 | if input_folder: 51 | local_audio_files = get_local_audio_files(input_folder) 52 | audio_files.extend(local_audio_files) 53 | 54 | if input_file_path: 55 | audio_files.append(input_file_path) 56 | 57 | audio_files.extend(youtube_audio_files + librivox_audio_files + tedtalks_audio_files) 58 | 59 | process_audio_files(audio_files, output_directory, start, end, enhancers, datasets) 60 | -------------------------------------------------------------------------------- /speech_dataset_generator/speech_rate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmartinrius/speech-dataset-generator/5142e9779dc06f0ad1540bc151979d92a5627739/speech_dataset_generator/speech_rate/__init__.py -------------------------------------------------------------------------------- /speech_dataset_generator/speech_rate/speech_rate.py: -------------------------------------------------------------------------------- 1 | import pyphen 2 | 3 | from pypinyin import pinyin, Style 4 | from konlpy.tag import Okt 5 | 6 | class SpeechRate: 7 | 8 | def check_language_availability(self, language): 9 | language_codes = list(set(code.split('_')[0] for code in pyphen.LANGUAGES.keys())) 10 | 11 | language_codes.extend(['zh','ko']) 12 | 13 | if language not in language_codes: 14 | raise Exception(f"Language '{language}' is not available. Available language codes: {language_codes}") 15 | 16 | def count_syllables_in_pinyin(self, pinyin_text): 17 | # Convert Pinyin to numbered Pinyin (with tone numbers) 18 | pinyin_with_tone_numbers = pinyin(pinyin_text, style=Style.TONE3) 19 | 20 | # Count the number of syllables 21 | syllable_count = sum([1 for s in pinyin_with_tone_numbers if s[0][-1].isdigit()]) 22 | 23 | return syllable_count 24 | 25 | def get_total_syllables_per_word(self, word, language): 26 | 27 | self.check_language_availability(language) 28 | 29 | if 'zh' == language: 30 | 31 | pinyin_with_tone_numbers = pinyin(word, style=Style.TONE3) 32 | 33 | # Count the number of syllables 34 | total_syllables = sum([1 for s in pinyin_with_tone_numbers if s[0][-1].isdigit()]) 35 | 36 | elif 'ko' == language: 37 | 38 | okt = Okt() 39 | morphemes = okt.morphs(word) 40 | total_syllables = len(morphemes) 41 | else: 42 | 43 | dic = pyphen.Pyphen(lang=language) 44 | total_syllables = len(dic.inserted(word).split('-')) 45 | 46 | return total_syllables 47 | 48 | def get_syllables_per_minute(self, words, language, duration_in_seconds): 49 | 50 | total_syllables = sum(self.get_total_syllables_per_word(word,
language) for word in words) 51 | 52 | spm = (total_syllables / duration_in_seconds) * 60 53 | 54 | return round(spm, 3) 55 | 56 | def get_words_per_minute(self, words, duration_in_seconds): 57 | 58 | wpm = (len(words) / duration_in_seconds) * 60 59 | 60 | return round(wpm, 3) -------------------------------------------------------------------------------- /speech_dataset_generator/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidmartinrius/speech-dataset-generator/5142e9779dc06f0ad1540bc151979d92a5627739/speech_dataset_generator/utils/__init__.py -------------------------------------------------------------------------------- /speech_dataset_generator/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def get_device(): 4 | if torch.cuda.is_available(): 5 | device = "cuda" 6 | print("CUDA is available. Using GPU.") 7 | else: 8 | device = "cpu" 9 | print("CUDA is not available. Using CPU.") 10 | 11 | return device 12 | -------------------------------------------------------------------------------- /speech_dataset_generator_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Speech Dataset Generator - Usage Guide\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Prerequisites" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Pyannote Agreement\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "Before running the code, ensure that you have agreed to share your contact information to access the pyannote embedding model. A similar agreement may be required for the pyannote speaker diarization model.\n", 29 | "\n", 30 | "1. https://huggingface.co/pyannote/embedding\n", 31 | "\n", 32 | "2. https://huggingface.co/pyannote/speaker-diarization" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Huggingface token" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "You need to generate a token at https://huggingface.co/settings/tokens" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Installation\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# 1. Clone the Repository\n", 63 | "!git clone https://github.com/davidmartinrius/speech-dataset-generator.git\n", 64 | "%cd speech-dataset-generator" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# 2. Set Up Environment\n", 74 | "!python3.10 -m venv venv\n", 75 | "!source venv/bin/activate\n", 76 | "!pip install -r requirements.txt" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# 3. HuggingFace Token\n", 86 | "!echo \"HF_TOKEN=yourtoken\" > .env\n", 87 | "# Make sure to replace 'yourtoken' with your actual HuggingFace token." 
88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "id": "f3dc9c4e", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# 4. Set up path\n", 98 | "import os\n", 99 | "os.environ['PYTHONPATH'] += \":/content/speech-dataset-generator\"" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Usage\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "import os\n", 116 | "from IPython.display import Audio, display\n", 117 | "\n", 118 | "def display_the_list_of_files(output_directory):\n", 119 | "\n", 120 | " # List all files in the output directory\n", 121 | " file_list = [f for f in os.listdir(output_directory) if f.endswith('.wav')]\n", 122 | "\n", 123 | " # Display the list of files\n", 124 | " print(\"List of generated .wav files:\")\n", 125 | " for i, file_name in enumerate(file_list[:10]):\n", 126 | " print(f\"{file_name}\\n\")\n", 127 | " \n", 128 | "# Function to play audio\n", 129 | "def play_audio(wavs_directory):\n", 130 | "\n", 131 | " # Let the user choose a file to play\n", 132 | " selected_file = input(\"Enter the filename to play (e.g., example_file.wav): \")\n", 133 | " file_path = os.path.join(wavs_directory, selected_file)\n", 134 | " print(file_path)\n", 135 | "\n", 136 | " # Check if the selected file exists\n", 137 | " if os.path.exists(file_path):\n", 138 | " print(f\"Playing: {selected_file}\")\n", 139 | " display(Audio(filename=file_path))\n", 140 | " else:\n", 141 | " print(f\"File '{selected_file}' not found in the output directory.\")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "### Basic" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "In the next audio there are:\n", 156 | "- 2 speakers\n", 157 | "- 2 genders\n", 158 | "- Background noise\n", 159 | "- A length of 2:14 minutes" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "display(Audio(filename=\"./assets/example_audio_1.mp3\"))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "In this audio I am going to apply the following filters:\n", 176 | "1. deepfilternet to reduce the noise\n", 177 | "2. resembleai to enhance the audio quality\n", 178 | "3. Silence removal" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "output_directory = \"./outputs/output_combining_enhancers\"\n", 188 | "\n", 189 | "# Combining the deepfilternet and resembleai enhancers\n", 190 | "!python speech_dataset_generator/main.py --input_file_path ./assets/example_audio_1.mp3 --output_directory {output_directory} --range_times 3-15 --enhancers deepfilternet resembleai" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "After processing the audio you get:\n", 198 | "1- Enhanced audios\n", 199 | "2- Segmented audios in the range you specified. 
In this case from 3 to 15 seconds for each speaker\n", 200 | "3- chroma_database, where the speakers are persisted, so you can reuse this database to process other files and the speaker labels will stay the same\n", 201 | "4- A metadata.csv + wavs folder, which is the LJSpeech dataset standard" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "Inside the enhanced folder you can listen to the improved audio without silences: the original was 2:14 minutes and now it has been reduced to 1:44 minutes." 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "display(Audio(filename=os.path.join(output_directory, \"enhanced\", \"example_audio_1_enhanced.mp3\")))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "Let's see what is inside the wavs folder:" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "wavs_directory = os.path.join(output_directory, \"wavs\")\n", 234 | "display_the_list_of_files(wavs_directory)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "# Use one of the file names. Example of the output:\n", 244 | "# List of generated .wav files:\n", 245 | "# 1709255795_1479612617475313631572.wav\n", 246 | "\n", 247 | "# When executing this, a prompt will ask for a file name:\n", 248 | "\n", 249 | "play_audio(wavs_directory)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "### Advanced (still in progress)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "#### Example: Input from a File\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "##### Generate with no enhancer. 
The base audio must be of very good quality, or it will be discarded" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "output_directory = \"./outputs/output_no_enhancer\"\n", 280 | "\n", 281 | "# No enhancer is used\n", 282 | "!python speech_dataset_generator/main.py --input_file_path ./assets/example_audio_1.mp3 --output_directory {output_directory} --range_times 5-10" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "wavs_directory = os.path.join(output_directory, \"wavs\")\n", 292 | "display_the_list_of_files(wavs_directory)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "play_audio(wavs_directory)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "##### Using deepfilternet enhancer" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "!python speech_dataset_generator/main.py --input_file_path ./assets/example_audio_1.mp3 --output_directory ./outputs/output_deepfilternet --range_times 4-10 --enhancers deepfilternet" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "#### Using resembleai enhancer" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "!python speech_dataset_generator/main.py --input_file_path ./assets/example_audio_1.mp3 --output_directory ./outputs/output_resembleai --range_times 4-10 --enhancers resembleai" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "#### Combining enhancers" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "!python speech_dataset_generator/main.py --input_file_path ./assets/example_audio_1.mp3 --output_directory ./outputs/output_combining_enhancers --range_times 4-10 --enhancers deepfilternet resembleai" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "#### Example: Input from a Folder" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "!python speech_dataset_generator/main.py --input_folder ./assets --output_directory ./outputs/output_folder --range_times 4-10 --enhancers deepfilternet" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "#### Example: Input from YouTube (Single Video or Playlists)\n" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "#### Youtube Single Video" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "# Youtube Single Video\n", 389 | "!python speech_dataset_generator/main.py --youtube_download https://www.youtube.com/watch?v=ExJZAegsOis --output_directory ./outputs/output_youtube --range_times 5-15 --enhancers deepfilternet resembleai" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | 
"source": [ 396 | "#### Combining a YouTube video + Input File" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "!python speech_dataset_generator/main.py --youtube_download https://www.youtube.com/watch?v=ExJZAegsOis --input_file_path ./assets/example_audio_1.mp3 --output_directory ./outputs/output_youtube_and_file --range_times 5-15 --enhancers deepfilternet resembleai" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "#### Combining YouTube video + Input Folder" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "!python speech_dataset_generator/main.py --youtube_download https://www.youtube.com/watch?v=ExJZAegsOis --input_folder ./assets --output_directory ./outputs/output_youtube_and_folder --range_times 5-15 --enhancers deepfilternet resembleai" 422 | ] 423 | } 424 | ], 425 | "metadata": { 426 | "kernelspec": { 427 | "display_name": "Python 3", 428 | "language": "python", 429 | "name": "python3" 430 | }, 431 | "language_info": { 432 | "codemirror_mode": { 433 | "name": "ipython", 434 | "version": 3 435 | }, 436 | "file_extension": ".py", 437 | "mimetype": "text/x-python", 438 | "name": "python", 439 | "nbconvert_exporter": "python", 440 | "pygments_lexer": "ipython3", 441 | "version": "3.10.12" 442 | } 443 | }, 444 | "nbformat": 4, 445 | "nbformat_minor": 5 446 | } 447 | --------------------------------------------------------------------------------