├── .gitignore ├── LICENSE ├── README.md ├── README_zh.md ├── images ├── delete.gif ├── index.gif ├── merge.gif ├── split.gif └── text.gif ├── requirements.txt ├── setup.py ├── subfix ├── __init__.py ├── cli.py ├── format │ ├── FormatBertvits2.py │ ├── FormatJson.py │ └── __init__.py ├── models │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── asr │ │ │ ├── __init__.py │ │ │ ├── openai_whisper.py │ │ │ ├── speech_paraformer_large_vad_punc_asr_zh.py │ │ │ └── speech_uniasr_asr_multilang.py │ │ ├── punctuation │ │ │ ├── __init__.py │ │ │ └── punctuation_funasr.py │ │ ├── speaker_diarization │ │ │ ├── Speech_Campplus_Speaker_Diarization.py │ │ │ └── __init__.py │ │ ├── vad │ │ │ ├── __init__.py │ │ │ └── speech_fsmn_vad_zh.py │ │ └── verification │ │ │ └── __init__.py │ └── nlp │ │ ├── __init__.py │ │ └── correction │ │ └── __init__.py ├── solution │ ├── __init__.py │ ├── diarization.py │ ├── modelscope_multi_lang.py │ ├── punctuation_multi_lang.py │ └── whisper_multi_lang.py ├── utils │ ├── __init__.py │ ├── convert.py │ ├── ext_files.py │ └── misc.py └── webui │ ├── __init__.py │ ├── language.py │ └── webui.py └── subfix_webui.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 cronrpc/SubFix 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SubFix 2 | `SubFix` is a web tool designed for easily editing and modifying audio subtitles. Users can see changes in real-time and conveniently **merge, split, delete, and edit subtitles** of audios. 3 | 4 | `SubFix` also supports automated voice annotation, utilizing `modelscope` and `whisper` for multilingual text annotation. 
Currently, `modelscope` provides automated annotations in languages including Chinese, English, Japanese, Korean, German, and Russian. `whisper` supports almost all languages. 5 | 6 | [中文版本](README_zh.md) 7 | 8 | A standalone `.py` file version is available for access at [subfix_webui.py](https://github.com/cronrpc/SubFix/blob/main/subfix_webui.py). This version allows language selection through command-line parameters, supporting both English and Chinese. Additionally, users can choose whether to synchronize the deletion of audio files on the hard drive during editing. 9 | 10 | Usage instructions for the standalone Python file version can be found at: [subfix_webui.py Help](#subfix_webuipy) 11 | 12 | ## Installation 13 | 14 | Follow these steps for a quick and easy installation. It's recommended to use a `Linux` environment. If using `Windows`, you will need to manually configure the `ffmpeg` environment variable, and installing `modelscope` might be more complex. 15 | 16 | ### Installing Dependencies 17 | 18 | Ensure the installed version of `Python` is above `3.9`, then execute the following command. If you do not need to use automatic labeling of audio, there is no need to install the `Modelscope` module. 19 | 20 | Using Conda: 21 | ```bash 22 | conda create -n modelscope python=3.9 23 | conda activate modelscope 24 | ``` 25 | 26 | Installing Dependencies 27 | 28 | #### In a Linux environment 29 | 30 | ```bash 31 | sudo apt install build-essential 32 | sudo apt install ffmpeg 33 | sudo apt install libsox-dev 34 | 35 | git clone https://github.com/cronrpc/SubFix.git 36 | cd SubFix 37 | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 38 | pip install -e . 39 | ``` 40 | 41 | #### Updating modelscope and FunASR to the Latest Versions 42 | 43 | Due to recent changes in the modelscope API, the code installed via pip may not be the latest version, potentially causing issues with automatic annotation. 44 | 45 | To ensure compatibility, it is recommended to install the latest version directly from the GitHub repository: 46 | 47 | ```bash 48 | # Install FunASR 49 | git clone https://github.com/alibaba/FunASR.git && cd FunASR 50 | pip3 install -e ./ 51 | 52 | # Install modelscope 53 | git clone https://github.com/modelscope/modelscope.git 54 | cd modelscope 55 | pip install -e . 56 | pip install -e .[audio_asr] 57 | ``` 58 | 59 | #### In a Windows environment 60 | 61 | If you have a GPU, you need to install the `cuda` version of `pytorch` beforehand and configure environment variables such as `ffmpeg`. Then execute the following commands: 62 | 63 | ```bash 64 | git clone https://github.com/cronrpc/SubFix.git 65 | cd SubFix 66 | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 67 | pip install -e . 68 | ``` 69 | 70 | For information on installing pytorch, please visit https://pytorch.org/get-started/locally/ 71 | 72 | ## Usage Guide 73 | 74 | After installing with `pip install -e .`, you can start the tool from any directory in the `shell` using the following command. All parameters have default values, so you don't need to input any `--option` if the default is used.
```bash 76 | subfix -h 77 | 78 | # webui 79 | subfix webui -h 80 | subfix webui --load_list demo.list --webui_language zh --force_delete True 81 | # create dataset 82 | subfix create modelscope -h 83 | # English 84 | subfix create modelscope --source_dir origin --language EN 85 | # Chinese 86 | subfix create modelscope --source_dir origin --language ZH 87 | # Japanese 88 | subfix create modelscope --source_dir origin --language JA 89 | # OpenAI Whisper Annotation (Supports Almost All Languages) 90 | subfix create whisper --source_dir origin --language ZH 91 | subfix create whisper --source_dir origin --language JA 92 | # diarization (speaker segmentation) 93 | subfix diarization -h 94 | subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0 95 | ``` 96 | 97 | Before using automated annotation, it's recommended to clear the `cache/subfix/` folder. 98 | ```bash 99 | rm -rf cache/subfix 100 | ``` 101 | 102 | ## Starting SubFix to View Dataset 103 | 104 | `SubFix` supports two formats: `.json` and `.list`. 105 | 106 | In the `.list` format, each line is similar to `"{wav_path}|{speaker_name}|{language}|{text}"`. 107 | 108 | For example, if you already have a `demo.list` file and its corresponding audio files are in the correct path, you can use the following commands to start the `SubFix` UI interface: 109 | 110 | ```bash 111 | subfix webui --load_list demo.list 112 | # or 113 | subfix webui --load_json demo.json 114 | ``` 115 | 116 | Viewing Help: 117 | ```bash 118 | subfix --help 119 | subfix webui --help 120 | ``` 121 | 122 | ### Quick Viewing and Listening to Audio 123 | 124 | You can click the `Previous Index` and `Next Index` buttons to switch lists, or drag the `slider` and click `Change Index` for quick positioning in the list. 125 | 126 | ![change index gif](images/index.gif) 127 | 128 | ### Modifying Text 129 | 130 | You can directly modify the text and click the `Submit Text` button to save the changes. 131 | 132 | ![change text gif](images/text.gif) 133 | 134 | ### Merging 135 | 136 | Select the audios you want to merge, set the `merge interval`, and then click the `merge` button to merge the audio. 137 | 138 | ![merge audio gif](images/merge.gif) 139 | 140 | ### Splitting Audio 141 | 142 | Select the audio to be split, set the `split point`, and then click the `split` button to proceed. Note that only one audio can be split at a time, and the text needs to be adjusted again after splitting. 143 | 144 | ![split audio gif](images/split.gif) 145 | 146 | ### Deleting 147 | 148 | Select the audio to be deleted and click the `button` to delete. The delete operation will be temporarily stored in memory. To save it to a file, click the save button or execute another command. 149 | 150 | ![delete audio gif](images/delete.gif) 151 | 152 | ### Automated Audio Annotation and Dataset Creation 153 | 154 | By default, place the audio files in the `origin` folder. For an audio file `abc.wav` by a speaker `sam`, its file path could be structured like `./origin/sam/abc.wav`. Then execute the following command: 155 | 156 | ```bash 157 | # rm -rf cache/subfix 158 | subfix create modelscope --source_dir origin --output demo.list 159 | ``` 160 | 161 | This command will create a `dataset` directory and store the paths and subtitles of all transcribed audio files in the `demo.list` file.
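As a quick reference, here is a minimal sketch of what the two dataset formats look like. The paths, speaker name, and text below are made-up placeholders; the real file names depend on how your audio was sliced. Each `.list` line follows the `{wav_path}|{speaker_name}|{language}|{text}` pattern, and the `.json` format stores the same four fields as one JSON object per line.

```
# demo.list — one "wav|speaker|language|text" entry per line (illustrative values)
dataset/sam/sam_0001.wav|sam|ZH|今天天气真好。
dataset/sam/sam_0002.wav|sam|ZH|我们出去走走吧。

# demo.json — the same data after `subfix format_convert --source demo.list --target demo.json`
{"wav_path": "dataset/sam/sam_0001.wav", "speaker_name": "sam", "language": "ZH", "text": "今天天气真好。"}
{"wav_path": "dataset/sam/sam_0002.wav", "speaker_name": "sam", "language": "ZH", "text": "我们出去走走吧。"}
```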
162 | 163 | ### Add Punctuation to List Files 164 | 165 | If you want to use punctuation, use the following command to automatically add punctuation to the text in the list file: 166 | 167 | ```bash 168 | subfix punctuation --load_list demo.list 169 | ``` 170 | 171 | ### Speaker Recognition and Clustering 172 | 173 | In some cases, long audio files contain background music, so vocals or noise from the backing track are transcribed and multiple speakers end up being detected in the same file. In other cases, the speech is so dense that the recognized segments become excessively long. 174 | 175 | This feature extracts the `n` most frequent speakers from each file, concatenating each speaker's utterances with an interval of `interval` seconds between them. The result is saved in the `diarization` folder, making it easier to extract audio later. 176 | 177 | ```bash 178 | subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0 --interval 10 --top_of_number 1 179 | subfix create modelscope --source_dir diarization --language ZH 180 | ``` 181 | 182 | ## Format Conversion 183 | 184 | The two formats, `.list` and `.json`, can be converted into each other. Use the following commands to convert files: 185 | 186 | ```bash 187 | subfix format_convert --source demo.list --target demo.json 188 | subfix format_convert --source demo.json --target demo.list 189 | ``` 190 | 191 | ## subfix_webui.py 192 | 193 | 194 | Usage of the standalone Python file version: 195 | 196 | View Help 197 | 198 | ```bash 199 | python subfix_webui.py -h 200 | ``` 201 | 202 | Launch in Chinese 203 | 204 | ```bash 205 | python subfix_webui.py --webui_language zh --load_list demo.list 206 | ``` 207 | 208 | Specify a `.list` File 209 | 210 | ```bash 211 | python subfix_webui.py --load_list demo.list 212 | ``` 213 | 214 | Synchronize deletion of files on disk; the default value is `True`.
215 | 216 | ```bash 217 | python subfix_webui.py --force_delete True 218 | # or 219 | python subfix_webui.py --force_delete False 220 | ``` 221 | 222 | Launch with a specified port 223 | 224 | ```bash 225 | python subfix_webui.py --server_port 1234 226 | ``` 227 | 228 | ## References 229 | 230 | - [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS) 231 | - [fishaudio/Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) 232 | - [openai/whisper](https://github.com/openai/whisper) -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | # SubFix 2 | `SubFix`是一个用于轻松地编辑修改音频字幕的网页工具。能够实时地看到改动,方便地对音频进行**合并、分割、删除、编辑字幕**。 3 | 4 | `SubFix`同时也支持自动化语音标注,使用`modelscope`和`whisper`对文本进行多语言标注。目前`modelscope`支持中文、英语、日语、德语、德语、俄语的自动化标注。`whisper`支持几乎所有语言。 5 | 6 | [English Version](README.md) 7 | 8 | 独立的`.py`文件版本,可以通过[subfix_webui.py](https://github.com/cronrpc/SubFix/blob/main/subfix_webui.py)获取。该版本可以通过命令行参数来选择语言,支持英文和中文。同时可以选择在编辑时是否同步删除硬盘上的音频文件。 9 | 10 | 单独Python文件版本的使用方法,可访问:[subfix_webui.py 帮助](#subfix_webuipy) 11 | 12 | ## 安装 13 | 14 | 进行如下安装步骤可以快速而轻松的安装。建议使用`Linux`环境。如果是`Windows`环境,需要您手动配置`ffmpeg`环境变量,并且`modelscope`的安装可能比较复杂。 15 | 16 | ### 安装依赖 17 | 18 | 确认安装的`Python`版本最好大于`3.9`,然后执行如下命令。如果您不需要使用音频的自动标注,那么不需要安装`Modelscope`模块。 19 | 20 | 使用conda 21 | ```bash 22 | conda create -n modelscope python=3.9 23 | conda activate modelscope 24 | ``` 25 | 26 | 安装依赖 27 | 28 | #### 在`Linux`环境 29 | 30 | ```bash 31 | sudo apt install build-essential 32 | sudo apt install ffmpeg 33 | sudo apt install libsox-dev 34 | 35 | git clone https://github.com/cronrpc/SubFix.git 36 | cd SubFix 37 | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 38 | pip install -e . 39 | ``` 40 | 41 | #### 更新modelscope和FunASR到最新版本 42 | 43 | 由于modelscope的API发生了改动,通过pip安装的代码未必是最新版本,可能存在无法运行自动标注的情况。 44 | 45 | 这里建议是,可以直接安装最新Github仓库的modelscope和funASR到最新版本。 46 | 47 | ``` 48 | # 安装FunASR 49 | git clone https://github.com/alibaba/FunASR.git && cd FunASR 50 | pip3 install -e ./ 51 | 52 | # 安装modelscope 53 | git clone https://github.com/modelscope/modelscope.git 54 | cd modelscope 55 | pip install -e . 56 | pip install -e .[audio_asr] 57 | ``` 58 | 59 | #### 在`Windows`环境 60 | 61 | 如果有gpu,需要提前安装`pytorch`的`cuda`版本,配置`ffmpeg`等环境变量,之后执行下列命令 62 | 63 | ```bash 64 | git clone https://github.com/cronrpc/SubFix.git 65 | cd SubFix 66 | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 67 | pip install -e . 
68 | ``` 69 | 70 | 关于`pytorch`安装,请访问(https://pytorch.org/get-started/locally/) 71 | 72 | ## 使用指南 73 | 74 | 当你使用`pip install -e .`安装后,在`shell`下可以通过该命令在任意目录下启动本工具。所有参数都有默认值,如果是默认值的话,不需要输入任何`--option`。 75 | ```bash 76 | subfix -h 77 | 78 | # webui 79 | subfix webui -h 80 | subfix webui --load_list demo.list --webui_language zh --force_delete True 81 | # create dataset 82 | subfix create modelscope -h 83 | # 英语 84 | subfix create modelscope --source_dir origin --language EN 85 | # 中文 86 | subfix create modelscope --source_dir origin --language ZH 87 | # 日语 88 | subfix create modelscope --source_dir origin --language JA 89 | # Openai-Whisper标注 (几乎支持所有语言) 90 | subfix create whisper --source_dir origin --language ZH 91 | subfix create whisper --source_dir origin --language JA 92 | # 说话人确认 (分离不同说话人) 93 | subfix diarization -h 94 | subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0 95 | ``` 96 | 97 | 每次使用自动标注前,建议清空一下`cache/subfix/`文件夹 98 | ```bash 99 | rm -rf cache/subfix 100 | ``` 101 | 102 | ## 启动SubFix查看数据集 103 | 104 | `SubFix`支持2种格式,分别是`.json`和`.list`格式。 105 | 106 | `.list`的格式中,每行数据类似于`"{wav_path}|{speaker_name}|{language}|{text}"`。 107 | 108 | 例如,如果你已经有了一个`demo.list`文件,和它对应的音频已经放到了正确的路径,那么可以执行如下命令来启动`SubFix`的UI界面: 109 | 110 | ```bash 111 | subfix webui --load_list demo.list 112 | # or 113 | subfix webui --load_json demo.json 114 | ``` 115 | 116 | 查看帮助 117 | ```bash 118 | subfix --help 119 | subfix webui --help 120 | ``` 121 | 122 | ### 快速查看和听取音频 123 | 124 | 可以点击`Previous Index`、`Next Index`按钮来切换列表,同时可以拖动`slider`并点击`Change Index`来快速定位列表。 125 | 126 | ![change index gif](images/index.gif) 127 | 128 | ### 修改文本 129 | 130 | 可以直接修改文本,并点击`Submit Text`按钮来保存修改。 131 | 132 | ![change text gif](images/text.gif) 133 | 134 | ### 合并 135 | 136 | 选择需要合并的音频,设置`合并间隔`,然后点击`合并`按钮来合并音频。 137 | 138 | ![merge audio gif](images/merge.gif) 139 | 140 | ### 分割音频 141 | 142 | 选择需要分割的音频,设置`分割点`,然后点击`分割`按钮来进行分割。注意,一次只能分割一个音频,分割后需要重新调整下文本。 143 | 144 | ![split audio gif](images/split.gif) 145 | 146 | ### 删除 147 | 148 | 选择需要删除的音频,点击`按钮`进行删除。删除操作将暂存到内存之中,如果需要保存到文件中,需要点击保存按钮,或者执行一次其他命令来保存。 149 | 150 | ![delete audio gif](images/delete.gif) 151 | 152 | ### 自动标注音频和创建数据集 153 | 154 | 默认情况下,将音频文件放入`origin`文件夹下,对于一个`sam`音频文件`abc.wav`,其所在的文件路径可以是`./origin/sam/abc.wav`这样的结构,之后执行下面的命令: 155 | 156 | ```bash 157 | # rm -rf cache/subfix 158 | subfix create --source_dir origin --output demo.list 159 | ``` 160 | 161 | 该命令将创建一个`dataset`目录,同时将所有文件转录的音频的路径和字幕存储到了`demo.list`文件中。 162 | 163 | ### 给list文件添加标点符号 164 | 165 | 如果要用标点符号,使用下面的命令,自动对list文件中的文本添加标点符号。 166 | 167 | ``` 168 | subfix punctuation --load_list demo.list 169 | ``` 170 | 171 | ### 说话人识别、聚类 172 | 173 | 在某些情况下,大段落音频中由于存在背景音乐,会将背景歌曲的人声或噪音识别,造成同一文件中的多人说话。 174 | 175 | 又或者,说话太过密集,导致识别出来的音频过长。 176 | 177 | 该功能将提取出每个文件中出现次数最多的`n`个说话人,说话人的每句话之间间隔`interval`秒,保存在`diarization`文件夹中,便于后续提取音频。 178 | 179 | ```bash 180 | subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0 --interval 10 --top_of_number 1 181 | subfix create modelscope --source_dir diarization --language ZH 182 | ``` 183 | 184 | ## 格式转换 185 | 186 | 两种格式`.list`和`.json`可以互相转换,使用如下命令对文件进行转换: 187 | 188 | ```bash 189 | subfix format_convert --source demo.list --target demo.json 190 | subfix format_convert --source demo.json --target demo.list 191 | ``` 192 | 193 | ## subfix_webui.py 194 | 195 | 单独Python文件版本的使用方法: 196 | 197 | 查看帮助 198 | 199 | ```bash 200 | python subfix_webui.py -h 201 | ``` 202 | 203 | 中文启动 204 | 205 | ```bash 206 | python subfix_webui.py --webui_language zh 
--load_list demo.list 207 | ``` 208 | 209 | 指定`.list`文件 210 | 211 | ```bash 212 | python subfix_webui.py --load_list demo.list 213 | ``` 214 | 215 | 同步删除磁盘文件,默认值是True。 216 | 217 | ```bash 218 | python subfix_webui.py --force_delete True 219 | # or 220 | python subfix_webui.py --force_delete False 221 | ``` 222 | 223 | 指定端口启动 224 | 225 | ```bash 226 | python subfix_webui.py --server_port 1234 227 | ``` 228 | 229 | ## References 230 | 231 | - [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS) 232 | - [fishaudio/Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) 233 | - [openai/whisper](https://github.com/openai/whisper) -------------------------------------------------------------------------------- /images/delete.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/delete.gif -------------------------------------------------------------------------------- /images/index.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/index.gif -------------------------------------------------------------------------------- /images/merge.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/merge.gif -------------------------------------------------------------------------------- /images/split.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/split.gif -------------------------------------------------------------------------------- /images/text.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/text.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | gradio>=3.50.2, <4.0.0 3 | numpy 4 | soundfile 5 | torchaudio 6 | transformers 7 | openai-whisper 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import io 5 | import os 6 | import sys 7 | import pkg_resources 8 | from shutil import rmtree 9 | from setuptools import find_packages, setup, Command 10 | 11 | NAME = 'SubFix' 12 | DESCRIPTION = 'A tool to read and create datasets for TTS training.'
13 | URL = 'https://github.com/cronrpc/SubFix' 14 | EMAIL = 'cronrpc' 15 | AUTHOR = 'cronrpc' 16 | REQUIRES_PYTHON = '>=3.8.0' 17 | VERSION = '0.1.2' 18 | 19 | REQUIRED = [ 20 | ] 21 | 22 | EXTRAS = { 23 | } 24 | 25 | here = os.path.abspath(os.path.dirname(__file__)) 26 | long_description = DESCRIPTION 27 | 28 | 29 | setup( 30 | name=NAME, 31 | version=VERSION, 32 | description=DESCRIPTION, 33 | long_description=long_description, 34 | long_description_content_type='text/markdown', 35 | author=AUTHOR, 36 | author_email=EMAIL, 37 | python_requires=REQUIRES_PYTHON, 38 | url=URL, 39 | packages=find_packages(), 40 | install_requires=REQUIRED 41 | + [ 42 | str(r) 43 | for r in pkg_resources.parse_requirements( 44 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 45 | ) 46 | ], 47 | entry_points={ 48 | "console_scripts": ["subfix=subfix.cli:cli"], 49 | }, 50 | extras_require=EXTRAS, 51 | include_package_data=True, 52 | license='Apache 2.0', 53 | classifiers=[ 54 | 'License :: OSI Approved :: Apache Software License', 55 | 'Programming Language :: Python', 56 | 'Programming Language :: Python :: 3', 57 | 'Programming Language :: Python :: 3.8', 58 | 'Programming Language :: Python :: Implementation :: CPython', 59 | 'Programming Language :: Python :: Implementation :: PyPy' 60 | ], 61 | ) 62 | -------------------------------------------------------------------------------- /subfix/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/__init__.py -------------------------------------------------------------------------------- /subfix/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | 5 | def handle_diarization(args): 6 | print(f"handle_diarization from {args.source_dir} to {args.target_dir}") 7 | assert(os.path.exists(args.source_dir)) 8 | from subfix.solution.diarization import diarization_dir 9 | diarization_dir(args) 10 | pass 11 | 12 | 13 | def handle_punctuation(args): 14 | assert(os.path.exists(args.load_list)) 15 | from subfix.solution.punctuation_multi_lang import punctuation_multi_lang_process 16 | punctuation_multi_lang_process(args) 17 | pass 18 | 19 | 20 | def handle_format_convert(args): 21 | from .format import FormatBertvits2, FormatJson 22 | print(os.path.splitext(args.source)[1]) 23 | if os.path.splitext(args.source)[1] == '.list': 24 | source_format = FormatBertvits2() 25 | else: 26 | source_format = FormatJson() 27 | 28 | if os.path.splitext(args.target)[1] == '.list': 29 | target_format = FormatBertvits2() 30 | else: 31 | target_format = FormatJson() 32 | 33 | data = source_format.load(args.source) 34 | target_format.save(args.target, data) 35 | 36 | 37 | def handle_webui(args): 38 | from .webui import startwebui 39 | args.force_delete = (args.force_delete.upper() == "TRUE") 40 | startwebui(args) 41 | 42 | 43 | def handle_create(args): 44 | print(f"Create command with args: {args}") 45 | if args.solution == "modelscope": 46 | from .solution.modelscope_multi_lang import run_task 47 | run_task(args) 48 | elif args.solution == "whisper": 49 | from .solution.whisper_multi_lang import run_whisper_task 50 | run_whisper_task(args) 51 | 52 | 53 | def cli(): 54 | parser = argparse.ArgumentParser(description="a tool to check or create TTS dataset") 55 | subparsers = parser.add_subparsers(dest='command') 56 | 57 | # webui 58 | parser_webui = subparsers.add_parser('webui', 59
| help='webui to modify audios') 60 | parser_webui.add_argument('--load_json', default="None", help='source file, like demo.json') 61 | parser_webui.add_argument('--load_list', default="None", help='source file, like demo.list') 62 | parser_webui.add_argument('--json_key_text', default="text", type=str, help='the text key name in json, Default: text') 63 | parser_webui.add_argument('--json_key_path', default="wav_path", type=str, help='the path key name in json, Default: wav_path') 64 | parser_webui.add_argument('--g_batch', default=10, type=int, help='max number g_batch wav to display, Default: 10') 65 | parser_webui.add_argument('--webui_language', default="en", type=str, help='webui language: en or zh, Default: en') 66 | parser_webui.add_argument('--force_delete', default="True", type=str, help='delete file in disk while delete items, True or False, Default: True') 67 | parser_webui.set_defaults(func=handle_webui) 68 | 69 | 70 | # create 71 | parser_create = subparsers.add_parser('create', 72 | help='create dataset from origin audio directory: subfix create [modelscope|whisper]') 73 | create_subparsers = parser_create.add_subparsers(dest='solution', 74 | help='auto asr solution, modelscope or whisper') 75 | 76 | # create modelscope 77 | modelscope_subparsers = create_subparsers.add_parser('modelscope', 78 | help='modelscope models') 79 | modelscope_subparsers.add_argument("--source_dir", type=str, default="origin", help="Source directory path, Default: origin") 80 | modelscope_subparsers.add_argument("--target_dir", type=str, default="dataset", help="Target directory path, Default: dataset") 81 | modelscope_subparsers.add_argument("--cache_dir", type=str, default="cache", help="cache directory path, Default: cache") 82 | modelscope_subparsers.add_argument("--sample_rate", type=int, default=44100, help="Sample rate, Default: 44100") 83 | modelscope_subparsers.add_argument("--language", type=str, default="ZH", help="Language: ZH|JA|KO|EN|DE|RU, Default: ZH") 84 | modelscope_subparsers.add_argument("--output", type=str, default="demo.list", help="List file, Default: demo.list") 85 | modelscope_subparsers.add_argument("--max_seconds", type=int, default=15, help="Max sliced voice length(seconds), Default: 15") 86 | modelscope_subparsers.add_argument("--absolute_path", default="False", type=str, help='absolute_path True or False, Default: False') 87 | modelscope_subparsers.set_defaults(func=handle_create) 88 | 89 | # create whisper 90 | whisper_subparsers = create_subparsers.add_parser('whisper', 91 | help='whisper models') 92 | whisper_subparsers.add_argument("--source_dir", type=str, default="origin", help="Source directory path, Default: origin") 93 | whisper_subparsers.add_argument("--target_dir", type=str, default="dataset", help="Target directory path, Default: dataset") 94 | whisper_subparsers.add_argument("--cache_dir", type=str, default="cache", help="cache directory path, Default: cache") 95 | whisper_subparsers.add_argument("--model", type=str, default="large-v3", help="whisper model small/medium/large-v3, Default: large-v3") 96 | whisper_subparsers.add_argument("--sample_rate", type=int, default=44100, help="Sample rate, Default: 44100") 97 | whisper_subparsers.add_argument("--language", type=str, default="ZH", help="Any language whisper supports, Default: ZH") 98 | whisper_subparsers.add_argument("--output", type=str, default="demo.list", help="List file, Default: demo.list") 99 | whisper_subparsers.add_argument("--max_seconds", type=int, default=15, help="Max sliced voice length(seconds),
Default: 15") 100 | whisper_subparsers.add_argument("--absolute_path", default="False", type=str, help='absolute_path True or False, Default: False') 101 | whisper_subparsers.set_defaults(func=handle_create) 102 | 103 | # format_convert 104 | parser_format_convert = subparsers.add_parser('format_convert', 105 | help='format_convert: format_convert --source demo.json --target demo.list') 106 | parser_format_convert.add_argument('--source', default="demo.list", help='source file, like demo.json/list') 107 | parser_format_convert.add_argument('--target', default="demo.json", help='target file, like demo.list/json') 108 | parser_format_convert.set_defaults(func=handle_format_convert) 109 | 110 | # diarization 111 | parser_diarization = subparsers.add_parser('diarization', 112 | help='diarization: diarization -h') 113 | parser_diarization.add_argument('--source_dir', default="origin", help='source dir, Default: origin') 114 | parser_diarization.add_argument('--target_dir', default="diarization", help='target dir, Default: diarization') 115 | parser_diarization.add_argument('--cache_dir', default="cache", help='cache dir, Default: cache') 116 | parser_diarization.add_argument('--min_seconds', default=3.0, type=float, help='slice must bigger than min_seconds, Default: 3.0') 117 | parser_diarization.add_argument('--top_of_number', default=1, type=int, help='The n items with the highest frequency of occurrence. Default: 1') 118 | parser_diarization.add_argument('--interval', default=1.0, type=float, help='The interval between two slice audio. Default: 1.0') 119 | parser_diarization.add_argument("--sample_rate", type=int, default=44100, help="Sample rate, Default: 44100") 120 | parser_diarization.add_argument("--oracle_num", type=int, default=0, help="oracle number, the person number you think maybe in audio, Default: 0") 121 | parser_diarization.set_defaults(func=handle_diarization) 122 | 123 | # punctuation 124 | parser_punctuation = subparsers.add_parser('punctuation', 125 | help='punctuation: punctuation -h') 126 | parser_punctuation.add_argument('--load_list', default="demo.list", type=str, help='source file, like demo.list') 127 | parser_punctuation.set_defaults(func=handle_punctuation) 128 | 129 | # run 130 | args = parser.parse_args() 131 | 132 | if hasattr(args, 'func'): 133 | args.func(args) 134 | else: 135 | parser.print_help() 136 | 137 | -------------------------------------------------------------------------------- /subfix/format/FormatBertvits2.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | 5 | class FormatBertvits2(): 6 | 7 | def __init__(self) -> None: 8 | pass 9 | 10 | def load(self, path : str) -> List[dict]: 11 | # this format : {wav_path}|{speaker_name}|{language}|{text}" 12 | data = [] 13 | with open(path, 'r', encoding="utf-8") as source: 14 | read_list = source.readlines() 15 | for _ in read_list: 16 | items = _.split('|') 17 | if (len(items) == 4): 18 | wav_path, speaker_name, language, text= items 19 | data.append( 20 | { 21 | 'wav_path':wav_path, 22 | 'speaker_name':speaker_name, 23 | 'language':language, 24 | 'text':text.strip() 25 | } 26 | ) 27 | print(f"data has been load from {path}") 28 | return data 29 | 30 | def save(self, path : str, data : List[dict]): 31 | with open(path, 'w', encoding="utf-8") as target: 32 | for _ in data: 33 | wav_path = _['wav_path'] 34 | speaker_name = _['speaker_name'] 35 | language = _['language'] 36 | text = _['text'] 37 | 
target.write(f"{wav_path}|{speaker_name}|{language}|{text}\n") 38 | print(f"data has been save at {path}") -------------------------------------------------------------------------------- /subfix/format/FormatJson.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | 5 | class FormatJson(): 6 | 7 | def __init__(self) -> None: 8 | pass 9 | 10 | def load(self, path : str): 11 | with open(path, 'r', encoding="utf-8") as source: 12 | data_lines = source.readlines() 13 | data = [json.loads(line) for line in data_lines] 14 | print(f"data has been load from {path}") 15 | return data 16 | 17 | def save(self, path : str, data : List[dict]): 18 | with open(path, 'w', encoding="utf-8") as target: 19 | for item in data: 20 | line = json.dumps(item, ensure_ascii=False) 21 | target.write(line + '\n') 22 | print(f"data has been save at {path}") -------------------------------------------------------------------------------- /subfix/format/__init__.py: -------------------------------------------------------------------------------- 1 | from .FormatBertvits2 import FormatBertvits2 2 | from .FormatJson import FormatJson -------------------------------------------------------------------------------- /subfix/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/__init__.py -------------------------------------------------------------------------------- /subfix/models/audio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/audio/__init__.py -------------------------------------------------------------------------------- /subfix/models/audio/asr/__init__.py: -------------------------------------------------------------------------------- 1 | from .speech_paraformer_large_vad_punc_asr_zh import Speech_Paraformer_Large_Vad_Punc_Asr_zh 2 | from .speech_uniasr_asr_multilang import Speech_UniASR_Asr_MultiLang 3 | from .openai_whisper import Openai_Whisper -------------------------------------------------------------------------------- /subfix/models/audio/asr/openai_whisper.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Any 3 | import librosa 4 | 5 | class Openai_Whisper(): 6 | def __init__(self, language : str, model_name : str = "large-v3") -> None: 7 | import whisper 8 | self.whisper_model = whisper.load_model(model_name, download_root = None) 9 | self.language = language 10 | 11 | def infer(self, audio_in) -> None: 12 | print("start asr:", audio_in) 13 | segments = self.whisper_model.transcribe(audio_in, word_timestamps=True, language = self.language)['segments'] 14 | data_list = [] 15 | for _ in segments: 16 | item = {} 17 | item['start'] = _['start'] 18 | item['end'] = _['end'] 19 | item['text'] = _['text'].strip() 20 | data_list.append(item) 21 | return data_list 22 | 23 | def __call__(self, *args: Any, **kwds: Any) -> Any: 24 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/asr/speech_paraformer_large_vad_punc_asr_zh.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | 4 | class 
Speech_Paraformer_Large_Vad_Punc_Asr_zh(): 5 | def __init__(self, language : str = "ZH") -> None: 6 | from funasr import AutoModel 7 | 8 | self._model = AutoModel(model="paraformer-zh", model_revision="v2.0.4", 9 | vad_model="fsmn-vad", vad_model_revision="v2.0.4", 10 | punc_model="ct-punc-c", punc_model_revision="v2.0.4", 11 | spk_model="cam++", spk_model_revision="v2.0.2", 12 | ) 13 | 14 | def infer(self, audio_in) -> None: 15 | rec_result = self._model.generate(input=audio_in, 16 | batch_size_s=300, 17 | hotword='') # dict_keys(['text', 'start', 'end', 'timestamp', 'spk']) 18 | data_list = [] 19 | for sentence in rec_result[0]['sentence_info']: 20 | if sentence['text'].strip() == "": 21 | continue 22 | item = {} 23 | item['start'] = sentence['timestamp'][0][0] / 1000.0 24 | item['end'] = sentence['end'] / 1000.0 25 | item['text'] = sentence['text'].strip() 26 | data_list.append(item) 27 | return data_list 28 | 29 | def __call__(self, *args: Any, **kwds: Any) -> Any: 30 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/asr/speech_uniasr_asr_multilang.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | import librosa 3 | 4 | class Speech_UniASR_Asr_MultiLang(): 5 | def __init__(self, language : str, max_seconds : float) -> None: 6 | self.set_asr_model_by_language(language) 7 | self.set_vad_model_by_language(language, max_seconds) 8 | 9 | def set_asr_model_by_language(self, language): 10 | 11 | model_config = { 12 | "KO" : 'damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline', 13 | "JA" : 'damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline', 14 | "EN" : 'damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline', 15 | "DE" : 'damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online', 16 | "RU" : 'damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline', 17 | "ZH" : 'iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', 18 | } 19 | 20 | model_config_revision = { 21 | "DE" : 'v1.0.1', 22 | "ZH" : 'v2.0.4' 23 | } 24 | 25 | assert( language in model_config.keys() ) 26 | 27 | from modelscope.pipelines import pipeline 28 | from modelscope.utils.constant import Tasks 29 | 30 | revision = model_config_revision[language] if (language in model_config_revision.keys()) else None 31 | 32 | self._asr_model = pipeline( task = Tasks.auto_speech_recognition, 33 | model = model_config[language], 34 | model_revision = revision ) 35 | 36 | def set_vad_model_by_language(self, language, max_seconds = 60.0): 37 | from subfix.models.audio.vad.speech_fsmn_vad_zh import Speech_Fsmn_Vad_Zh_16k_Common 38 | self._vad_model = Speech_Fsmn_Vad_Zh_16k_Common(max_seconds=max_seconds) 39 | 40 | def infer(self, audio_in) -> None: 41 | print("start asr:", audio_in) 42 | vad_list = self._vad_model(audio_in = audio_in) 43 | data_list = [] 44 | waveform, sample_rate = librosa.load(audio_in, sr=16000, mono=True) 45 | for _ in vad_list: 46 | start_time, end_time = _['start'], _['end'] 47 | start = int(start_time * sample_rate) 48 | end = int(end_time * sample_rate) 49 | slice_waveform = waveform[start: end] 50 | ret_asrmodl = self._asr_model(input = slice_waveform) 51 | if (len(ret_asrmodl) > 0): 52 | text = ret_asrmodl[0]['text'] 53 | print(text) 54 | if text.strip() == "": 55 | continue 56 | item = {} 57 | item['start'] = start_time 58 | item['end'] = end_time 59 | 
item['text'] = text.strip() 60 | data_list.append(item) 61 | return data_list 62 | 63 | def __call__(self, *args: Any, **kwds: Any) -> Any: 64 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/punctuation/__init__.py: -------------------------------------------------------------------------------- 1 | from .punctuation_funasr import Punctuation_FunASR -------------------------------------------------------------------------------- /subfix/models/audio/punctuation/punctuation_funasr.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | class Punctuation_FunASR(): 4 | def __init__(self) -> None: 5 | from funasr import AutoModel 6 | self._model = AutoModel(model="ct-punc", model_revision="v2.0.4") 7 | 8 | def infer(self, input): 9 | res = self._model.generate(input=input) 10 | if (len(res) > 0): 11 | text = res[0]['text'] 12 | return text 13 | else: 14 | return "" 15 | 16 | def __call__(self, *args: Any, **kwds: Any) -> Any: 17 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/speaker_diarization/Speech_Campplus_Speaker_Diarization.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Speech_Campplus_Speaker_Diarization(): 4 | def __init__(self) -> None: 5 | from modelscope.pipelines import pipeline 6 | self._pipeline = pipeline( 7 | task='speaker-diarization', 8 | model='damo/speech_campplus_speaker-diarization_common', 9 | model_revision='v1.0.0' 10 | ) 11 | 12 | def infer(self, input, min_seconds = 0, oracle_num = None, **args): 13 | result = self._pipeline(input, oracle_num = oracle_num, **args)['text'] 14 | count_dict = {} 15 | for item in result: 16 | if item[2] in count_dict: 17 | count_dict[item[2]] = count_dict[item[2]] + 1 18 | else: 19 | count_dict[item[2]] = 1 20 | numbers = list(reversed([[k, v] for k, v in sorted(count_dict.items(), key=lambda m : list(m)[1])])) 21 | topn = [i[0] for i in numbers] # person 22 | topn_number = [i[1] for i in numbers] # number 23 | res = [] 24 | for item in result: 25 | if item[1] - item[0] > min_seconds: 26 | res.append(item) 27 | return res, topn, topn_number -------------------------------------------------------------------------------- /subfix/models/audio/speaker_diarization/__init__.py: -------------------------------------------------------------------------------- 1 | from .Speech_Campplus_Speaker_Diarization import Speech_Campplus_Speaker_Diarization -------------------------------------------------------------------------------- /subfix/models/audio/vad/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/audio/vad/__init__.py -------------------------------------------------------------------------------- /subfix/models/audio/vad/speech_fsmn_vad_zh.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | class Speech_Fsmn_Vad_Zh_16k_Common(): 4 | def __init__(self, max_seconds : float = 60.0) -> None: 5 | from modelscope.pipelines import pipeline 6 | from modelscope.utils.constant import Tasks 7 | 8 | self._inference_pipeline = pipeline( 9 | task=Tasks.voice_activity_detection, 10 | model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch', 11 | 
model_revision=None, 12 | ) 13 | self.max_seconds = max_seconds 14 | self._tolerance = 1e-6 15 | 16 | def infer(self, audio_in) -> None: 17 | rec_result = self._inference_pipeline(audio_in)[0] 18 | # return [{start : seconds, end: seconds}] 19 | data = [] 20 | for item in rec_result['value']: 21 | start = item[0] / 1000.0 22 | end = item[1] / 1000.0 23 | duration = end - start 24 | if duration <= self.max_seconds: 25 | data.append({'start': start, 'end': end}) 26 | else: 27 | num_segments = int(duration / self.max_seconds) + (1 if duration % self.max_seconds > self._tolerance else 0) 28 | segment_length = duration / num_segments 29 | for i in range(num_segments): 30 | new_start = start + i * segment_length 31 | new_end = min(new_start + segment_length, end) 32 | data.append({'start': new_start, 'end': new_end}) 33 | return data 34 | 35 | def __call__(self, *args: Any, **kwds: Any) -> Any: 36 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/verification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/audio/verification/__init__.py -------------------------------------------------------------------------------- /subfix/models/nlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/nlp/__init__.py -------------------------------------------------------------------------------- /subfix/models/nlp/correction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/nlp/correction/__init__.py -------------------------------------------------------------------------------- /subfix/solution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/solution/__init__.py -------------------------------------------------------------------------------- /subfix/solution/diarization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from subfix.models.audio.speaker_diarization import Speech_Campplus_Speaker_Diarization 4 | from subfix.utils import convert_files, get_files_by_ext 5 | from subfix.utils.misc import merge_audio_vads 6 | 7 | def diarization_dir(args): 8 | 9 | 10 | source_dir = args.source_dir 11 | target_dir = args.target_dir 12 | cache_dir = args.cache_dir 13 | sample_rate = args.sample_rate 14 | min_seconds = args.min_seconds 15 | top_of_number = args.top_of_number 16 | interval = args.interval 17 | oracle_num = None if int(args.oracle_num) == 0 else int(args.oracle_num) 18 | 19 | dir_16000 = os.path.join(cache_dir,'subfix','origin','16000') 20 | dir_sample_rate = os.path.join(cache_dir,'subfix','origin',str(sample_rate)) 21 | 22 | if os.path.exists(dir_16000): 23 | shutil.rmtree(dir_16000) 24 | if os.path.exists(dir_sample_rate): 25 | shutil.rmtree(dir_sample_rate) 26 | 27 | convert_files(source_dir, dir_sample_rate, sample_rate) 28 | convert_files(dir_sample_rate, dir_16000, 16000) 29 | 30 | files = get_files_by_ext(dir_16000, [".wav"]) 31 | 32 | print("Start 
Speech_Campplus_Speaker_Diarization") 33 | 34 | SCSD = Speech_Campplus_Speaker_Diarization() 35 | 36 | for file_path in files: 37 | f_16000 = os.path.join(dir_16000, file_path) 38 | f_samplerate = os.path.join(dir_sample_rate, file_path) 39 | 40 | result, topn, topn_number = SCSD.infer(f_16000, min_seconds = min_seconds , oracle_num = oracle_num) 41 | topn = topn[:top_of_number] 42 | for person in topn: 43 | vad_list = [] 44 | save_path = os.path.join(target_dir, os.path.splitext(file_path)[0] + f"_{person}" + os.path.splitext(file_path)[1]) 45 | print("save:", save_path) 46 | for item in result: 47 | if item[2] == person: 48 | vad_list.append(item[:2]) 49 | if len(vad_list) > 0: 50 | merge_audio_vads(f_samplerate, save_path, vad_list, interval=interval) 51 | 52 | if os.path.exists(dir_16000): 53 | shutil.rmtree(dir_16000) 54 | if os.path.exists(dir_sample_rate): 55 | shutil.rmtree(dir_sample_rate) -------------------------------------------------------------------------------- /subfix/solution/modelscope_multi_lang.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import subprocess 5 | 6 | import librosa 7 | import numpy as np 8 | import soundfile 9 | 10 | from subfix.models.audio.asr import Speech_Paraformer_Large_Vad_Punc_Asr_zh , Speech_UniASR_Asr_MultiLang 11 | from subfix.utils import convert_files 12 | from subfix.utils.misc import merge_audio_slice, get_sub_dirs 13 | 14 | 15 | def create_dataset(source_dir, target_dir, sample_rate, language, infer_model, max_seconds, absolute_path : bool): 16 | # source_dir, target_dir, sample_rate=44100, language = "ZH", inference_pipeline = None 17 | 18 | roles = get_sub_dirs(source_dir) 19 | count = 0 20 | result = [] 21 | 22 | for speaker_name in roles: 23 | 24 | source_audios = [f for f in os.listdir(os.path.join(source_dir, speaker_name)) if f.endswith(".wav")] 25 | source_audios = [os.path.join(source_dir, speaker_name, filename) for filename in source_audios] 26 | slice_dir = os.path.join(target_dir, speaker_name) 27 | os.makedirs(slice_dir, exist_ok=True) 28 | 29 | for audio_path in sorted(source_audios): 30 | 31 | data_list = infer_model(audio_in=audio_path) 32 | 33 | data, count = merge_audio_slice(audio_path, slice_dir, data_list, count, sample_rate, max_seconds, language, speaker_name) 34 | 35 | for item_audio in data: 36 | if absolute_path: 37 | sliced_audio_path = os.path.abspath(item_audio['sliced_audio_path']) 38 | else: 39 | sliced_audio_path = item_audio['sliced_audio_path'] 40 | speaker_name = item_audio['speaker_name'] 41 | language = item_audio['language'] 42 | text = item_audio['text'] 43 | result.append(f"{sliced_audio_path}|{speaker_name}|{language}|{text}") 44 | 45 | return result 46 | 47 | 48 | def create_list(source_dir, target_dir, cache_dir, sample_rate, language, output_list, max_seconds, absolute_path : bool): 49 | 50 | resample_dir = os.path.join(cache_dir,"subfix","origin",f"{sample_rate}") 51 | 52 | convert_files(source_dir, resample_dir, sample_rate) 53 | 54 | if language == "ZH": 55 | asr_model = Speech_Paraformer_Large_Vad_Punc_Asr_zh() 56 | else: 57 | asr_model = Speech_UniASR_Asr_MultiLang(language=language, max_seconds=max_seconds) 58 | 59 | result = create_dataset(resample_dir, target_dir, sample_rate = sample_rate, language = language, infer_model = asr_model, max_seconds = max_seconds, absolute_path = absolute_path) 60 | 61 | with open(output_list, "w", encoding="utf-8") as file: 62 | for line in result: 63 | try: 64 | 
file.write(line.strip() + '\n') 65 | except UnicodeEncodeError: 66 | print("UnicodeEncodeError: Can't encode to ASCII:", line) 67 | 68 | 69 | def run_task(args): 70 | 71 | args.absolute_path = (args.absolute_path.upper() == "TRUE") 72 | 73 | create_list(args.source_dir, args.target_dir, args.cache_dir, args.sample_rate, args.language, args.output, args.max_seconds, args.absolute_path) 74 | 75 | -------------------------------------------------------------------------------- /subfix/solution/punctuation_multi_lang.py: -------------------------------------------------------------------------------- 1 | from subfix.format import FormatBertvits2 2 | from subfix.models.audio.punctuation import Punctuation_FunASR 3 | 4 | def punctuation_multi_lang_process(args): 5 | input_file = args.load_list 6 | souce_format = FormatBertvits2() 7 | data = souce_format.load(input_file) 8 | punc_fix = Punctuation_FunASR() 9 | for i in range(len(data)): 10 | print(i,'/',len(data),sep="") 11 | data[i]['text'] = punc_fix(data[i]['text']) 12 | data = souce_format.save(input_file, data) 13 | pass -------------------------------------------------------------------------------- /subfix/solution/whisper_multi_lang.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import subprocess 5 | 6 | import librosa 7 | import numpy as np 8 | import soundfile 9 | 10 | from subfix.models.audio.asr import Openai_Whisper 11 | from subfix.utils import convert_files 12 | from subfix.utils.misc import merge_audio_slice, get_sub_dirs 13 | 14 | 15 | def create_whisper_dataset(source_dir, target_dir, sample_rate, language, infer_model, max_seconds, absolute_path : bool): 16 | # source_dir, target_dir, sample_rate=44100, language = "ZH", inference_pipeline = None 17 | 18 | roles = get_sub_dirs(source_dir) 19 | count = 0 20 | result = [] 21 | 22 | for speaker_name in roles: 23 | 24 | source_audios = [f for f in os.listdir(os.path.join(source_dir, speaker_name)) if f.endswith(".wav")] 25 | source_audios = [os.path.join(source_dir, speaker_name, filename) for filename in source_audios] 26 | slice_dir = os.path.join(target_dir, speaker_name) 27 | os.makedirs(slice_dir, exist_ok=True) 28 | 29 | for audio_path in sorted(source_audios): 30 | 31 | data_list = infer_model(audio_in=audio_path) 32 | 33 | data, count = merge_audio_slice(audio_path, slice_dir, data_list, count, sample_rate, max_seconds, language, speaker_name) 34 | 35 | for item_audio in data: 36 | if absolute_path: 37 | sliced_audio_path = os.path.abspath(item_audio['sliced_audio_path']) 38 | else: 39 | sliced_audio_path = item_audio['sliced_audio_path'] 40 | speaker_name = item_audio['speaker_name'] 41 | language = item_audio['language'] 42 | text = item_audio['text'] 43 | result.append(f"{sliced_audio_path}|{speaker_name}|{language}|{text}") 44 | 45 | return result 46 | 47 | 48 | def create_whisper_list(source_dir, target_dir, cache_dir, sample_rate, language, output_list, max_seconds, model_name, absolute_path : bool): 49 | 50 | resample_dir = os.path.join(cache_dir,"subfix","origin",f"{sample_rate}") 51 | 52 | convert_files(source_dir, resample_dir, sample_rate) 53 | 54 | lang_map = { 55 | "ZH" : "Chinese", 56 | "EN" : "English", 57 | "JA" : "Japanese", 58 | "RU" : "ru", 59 | "DE" : "de", 60 | "KO" : "ko" 61 | } 62 | 63 | language_map = lang_map[language] if (language in lang_map.keys()) else language 64 | 65 | asr_model = Openai_Whisper(language = language_map, model_name = model_name) 66 | 67 | 
result = create_whisper_dataset(resample_dir, target_dir, sample_rate = sample_rate, language = language, infer_model = asr_model, max_seconds = max_seconds, absolute_path = absolute_path) 68 | 69 | with open(output_list, "w", encoding="utf-8") as file: 70 | for line in result: 71 | try: 72 | file.write(line.strip() + '\n') 73 | except UnicodeEncodeError: 74 | print("UnicodeEncodeError: Can't encode to ASCII:", line) 75 | 76 | 77 | def run_whisper_task(args): 78 | 79 | args.absolute_path = (args.absolute_path.upper() == "TRUE") 80 | 81 | create_whisper_list(args.source_dir, args.target_dir, args.cache_dir, args.sample_rate, args.language, args.output, args.max_seconds, args.model, args.absolute_path) 82 | 83 | -------------------------------------------------------------------------------- /subfix/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .convert import convert_wav_ffmpeg, convert_wav_librosa, convert_files 2 | from .ext_files import get_files_by_ext -------------------------------------------------------------------------------- /subfix/utils/convert.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import os 3 | import soundfile 4 | import subprocess 5 | from concurrent.futures import ThreadPoolExecutor 6 | from .ext_files import get_files_by_ext 7 | 8 | 9 | 10 | def ffmpeg_installed(): 11 | 12 | try: 13 | subprocess.run(["ffmpeg", "-version"], 14 | capture_output=True, 15 | check=True) 16 | print("find ffmpeg installed, use ffmpeg") 17 | return True 18 | except Exception as e: 19 | print("ffmpeg not found, use librosa") 20 | return False 21 | 22 | 23 | def convert_wav_ffmpeg(source_file : str, 24 | target_file : str, 25 | sample_rate : int, 26 | number : int): 27 | 28 | os.makedirs(os.path.dirname(target_file), exist_ok=True) 29 | 30 | print(f"file {number} start convert") 31 | 32 | cmd = ["ffmpeg", "-y", "-i", source_file, "-ar", f"{sample_rate}", "-ac", "1", "-v", "quiet", target_file] 33 | 34 | subprocess.run(cmd) 35 | 36 | 37 | def convert_wav_librosa(source_file : str, 38 | target_file : str, 39 | sample_rate : int, 40 | number : int): 41 | 42 | os.makedirs(os.path.dirname(target_file), exist_ok=True) 43 | 44 | print(f"file {number} start convert") 45 | 46 | data, sample_rate = librosa.load(source_file, 47 | sr=sample_rate, 48 | mono=True) 49 | 50 | soundfile.write(target_file, data, sample_rate) 51 | 52 | 53 | def convert_files(source_dir : str, 54 | target_dir : str, 55 | sample_rate : int, 56 | max_threads = None, 57 | force_librosa = False): 58 | 59 | if max_threads == None: 60 | max_threads = os.cpu_count() 61 | 62 | ext_files = get_files_by_ext(source_dir, [".mp3","acc","wav"]) 63 | 64 | ffmpeg_installed_flag = (not force_librosa) and ffmpeg_installed() 65 | 66 | os.makedirs(target_dir, exist_ok=True) 67 | 68 | with ThreadPoolExecutor(max_workers=max_threads) as executor: 69 | print(f"files count: {len(ext_files)}") 70 | print(f"max_threads = {max_threads}") 71 | for number, file in enumerate(ext_files, start=1): 72 | source_path = os.path.join(source_dir, file) 73 | target_path = os.path.join(target_dir, os.path.splitext(file)[0] + '.wav') 74 | os.makedirs(os.path.dirname(target_path), exist_ok=True) 75 | 76 | if not os.path.exists(target_path): 77 | if ffmpeg_installed_flag: 78 | executor.submit(convert_wav_ffmpeg, 79 | source_path, 80 | target_path, 81 | sample_rate, 82 | number) 83 | else: 84 | executor.submit(convert_wav_librosa, 85 | source_path, 
86 | target_path, 87 | sample_rate, 88 | number) -------------------------------------------------------------------------------- /subfix/utils/ext_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Union, List 3 | 4 | def get_files_by_ext(directory: str, 5 | media_extensions: Union[str, List[str]] 6 | )-> List[str]: 7 | 8 | if isinstance(media_extensions, str): 9 | media_extensions = [media_extensions] 10 | 11 | relative_paths = [] 12 | 13 | for root, dirs, files in os.walk(directory): 14 | for file in files: 15 | if any(file.endswith(ext) for ext in media_extensions): 16 | relative_path = os.path.relpath(os.path.join(root, file), 17 | directory) 18 | relative_paths.append(relative_path) 19 | relative_paths = sorted(relative_paths) 20 | return relative_paths -------------------------------------------------------------------------------- /subfix/utils/misc.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | from typing import List, Union 5 | import librosa 6 | import soundfile 7 | import numpy as np 8 | 9 | def save_json(path : str, data : Union[List[dict], dict]): 10 | with open(path, 'w', encoding="utf-8") as target: 11 | json.dump(data, path, ensure_ascii=False) 12 | 13 | 14 | def load_json(path : str): 15 | with open(path, 'r', encoding="utf-8") as source: 16 | data = json.load(source) 17 | return data 18 | 19 | 20 | def merge_audio_vads(source_path ,save_path, vad_list : List[List], interval = 1, sample_rate = None): 21 | data, sample_rate = librosa.load(source_path, sr=sample_rate, mono=True) 22 | audio_list = [] 23 | for i, _ in enumerate(vad_list): 24 | time_start = _[0] 25 | time_end = _[1] 26 | start = int((time_start) * sample_rate) 27 | end = int((time_end) * sample_rate) 28 | if (i > 0): 29 | silence = np.zeros(int(sample_rate * interval)) 30 | audio_list.append(silence) 31 | audio_list.append(data[start:end]) 32 | audio_concat = np.concatenate(audio_list) 33 | os.makedirs(os.path.split(save_path)[0], exist_ok=True) 34 | soundfile.write(save_path, audio_concat, sample_rate) 35 | 36 | 37 | def get_sub_dirs(source_dir): 38 | sub_dir = [f for f in os.listdir(source_dir) if not f.startswith('.')] 39 | sub_dir = [f for f in sub_dir if os.path.isdir(os.path.join(source_dir, f))] 40 | return sub_dir 41 | 42 | 43 | def ends_with_ending_sentence(sentence): 44 | if re.search(r'[。?!…]$', sentence): 45 | return True 46 | return False 47 | 48 | 49 | def ends_with_punctuation(sentence): 50 | pattern = r'[.,!?。,!?、・\uff00-\uffef\u3000-\u303f\u3040-\u309f\u30a0-\u30ff]$' 51 | return re.search(pattern, sentence) 52 | 53 | 54 | def merge_audio_slice(source_audio, slice_dir, data_list, start_count, sample_rate, max_seconds, language, speaker_name) -> List: 55 | # input : datalist = [{'start': seconds, 'end': seconds, 'text': text}] 56 | # return : [{'sliced_audio_path', 'speaker_name', 'language', 'text'}] , count_next 57 | sentence_list = [] 58 | audio_list = [] 59 | time_length = 0 60 | count = start_count 61 | result = [] 62 | 63 | data, sample_rate = librosa.load(source_audio, sr=sample_rate, mono=True) 64 | for sentence in data_list: 65 | text = sentence['text'].strip() 66 | if (text == ""): 67 | continue 68 | start = int((sentence['start']) * sample_rate) 69 | end = int((sentence['end']) * sample_rate) 70 | 71 | if time_length > 0 and time_length + (sentence['end'] - sentence['start']) > max_seconds: 72 | sliced_audio_name = 
f"{str(count).zfill(6)}" 73 | sliced_audio_path = os.path.join(slice_dir, sliced_audio_name+".wav") 74 | s_sentence = "".join(sentence_list) 75 | 76 | if language == "ZH" and re.search(r"[,]$", s_sentence): 77 | s_sentence = s_sentence[:-1] + '。' 78 | if language == "ZH" and not ends_with_punctuation(s_sentence): 79 | s_sentence = s_sentence 80 | 81 | audio_concat = np.concatenate(audio_list) 82 | if time_length > max_seconds: 83 | print(f"[too long voice]:{sliced_audio_path}, voice_length:{time_length} seconds") 84 | soundfile.write(sliced_audio_path, audio_concat, sample_rate) 85 | result.append( 86 | { 87 | 'sliced_audio_path' : sliced_audio_path, 88 | 'speaker_name' : speaker_name, 89 | 'language' : language, 90 | 'text' : s_sentence 91 | } 92 | ) 93 | sentence_list = [] 94 | audio_list = [] 95 | time_length = 0 96 | count = count + 1 97 | 98 | sentence_list.append(text) 99 | audio_list.append(data[start:end]) 100 | time_length = time_length + (sentence['end'] - sentence['start']) 101 | 102 | if ( ends_with_ending_sentence(text) ): 103 | sliced_audio_name = f"{str(count).zfill(6)}" 104 | sliced_audio_path = os.path.join(slice_dir, sliced_audio_name+".wav") 105 | s_sentence = "".join(sentence_list) 106 | audio_concat = np.concatenate(audio_list) 107 | soundfile.write(sliced_audio_path, audio_concat, sample_rate) 108 | 109 | result.append( 110 | { 111 | 'sliced_audio_path' : sliced_audio_path, 112 | 'speaker_name' : speaker_name, 113 | 'language' : language, 114 | 'text' : s_sentence 115 | } 116 | ) 117 | sentence_list = [] 118 | audio_list = [] 119 | time_length = 0 120 | count = count + 1 121 | return result, count -------------------------------------------------------------------------------- /subfix/webui/__init__.py: -------------------------------------------------------------------------------- 1 | from .webui import startwebui -------------------------------------------------------------------------------- /subfix/webui/language.py: -------------------------------------------------------------------------------- 1 | 2 | LANG_CONFIG_MAP = { 3 | "zh": { 4 | "Change Index" : "改变索引", 5 | "Submit Text" : "保存文本", 6 | "Merge Audio" : "合并音频", 7 | "Delete Audio" : "删除音频", 8 | "Previous Index" : "前一页", 9 | "Next Index" : "后一页", 10 | "Light Theme" : "亮色模式", 11 | "Dark Theme" : "黑暗模式", 12 | "Choose Audio" : "选择音频", 13 | "Output Audio" : "Output Audio", 14 | "Text" : "文本", 15 | "Invert Selection": "反选", 16 | "Save File" : "保存文件", 17 | "Split Audio" : "分割音频", 18 | "Audio Split Point(s)" : "音频分割点(单位:秒)", 19 | "Index":"索引", 20 | "Interval":"合并间隔(单位:秒)" 21 | }, 22 | } 23 | 24 | 25 | class TextLanguage(): 26 | def __init__(self, language : str = "en") -> None: 27 | if language in LANG_CONFIG_MAP.keys(): 28 | self.language = language 29 | else: 30 | self.language = "en" 31 | pass 32 | 33 | def get_text(self, text : str) -> str: 34 | if self.language == "en": 35 | return text 36 | elif text in LANG_CONFIG_MAP[self.language].keys() : 37 | return LANG_CONFIG_MAP[self.language][text] 38 | else: 39 | return text 40 | 41 | def __call__(self, text : str) -> str: 42 | return self.get_text(text) -------------------------------------------------------------------------------- /subfix/webui/webui.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import json 4 | import os 5 | import uuid 6 | 7 | import librosa 8 | import gradio as gr 9 | import numpy as np 10 | import soundfile 11 | 12 | from .language import TextLanguage 13 | 14 | 
g_json_key_text = "" 15 | g_json_key_path = "" 16 | g_load_file = "" 17 | g_load_format = "" 18 | 19 | g_max_json_index = 0 20 | g_index = 0 21 | g_batch = 10 22 | g_text_list = [] 23 | g_audio_list = [] 24 | g_checkbox_list = [] 25 | g_data_json = [] 26 | g_language = None 27 | 28 | 29 | def reload_data(index, batch): 30 | global g_index 31 | g_index = index 32 | global g_batch 33 | g_batch = batch 34 | datas = g_data_json[index:index+batch] 35 | output = [] 36 | for d in datas: 37 | output.append( 38 | { 39 | g_json_key_text: d[g_json_key_text], 40 | g_json_key_path: d[g_json_key_path] 41 | } 42 | ) 43 | return output 44 | 45 | 46 | def b_change_index(index, batch): 47 | global g_index, g_batch 48 | g_index, g_batch = index, batch 49 | datas = reload_data(index, batch) 50 | output = [] 51 | for i , _ in enumerate(datas): 52 | output.append( 53 | gr.Textbox( 54 | label=f"Text {i+index}", 55 | value=_[g_json_key_text] 56 | ) 57 | ) 58 | for _ in range(g_batch - len(datas)): 59 | output.append( 60 | gr.Textbox( 61 | label=f"Text", 62 | value="" 63 | ) 64 | ) 65 | for _ in datas: 66 | output.append(_[g_json_key_path]) 67 | for _ in range(g_batch - len(datas)): 68 | output.append(None) 69 | for _ in range(g_batch): 70 | output.append(False) 71 | return output 72 | 73 | 74 | def b_next_index(index, batch): 75 | if (index + batch) <= g_max_json_index: 76 | return index + batch , *b_change_index(index + batch, batch) 77 | else: 78 | return index, *b_change_index(index, batch) 79 | 80 | 81 | def b_previous_index(index, batch): 82 | if (index - batch) >= 0: 83 | return index - batch , *b_change_index(index - batch, batch) 84 | else: 85 | return 0, *b_change_index(0, batch) 86 | 87 | 88 | def b_submit_change(*text_list): 89 | global g_data_json 90 | change = False 91 | for i, new_text in enumerate(text_list): 92 | if g_index + i <= g_max_json_index: 93 | new_text = new_text.strip()+' ' 94 | if (g_data_json[g_index + i][g_json_key_text] != new_text): 95 | g_data_json[g_index + i][g_json_key_text] = new_text 96 | change = True 97 | if change: 98 | b_save_file() 99 | return g_index, *b_change_index(g_index, g_batch) 100 | 101 | 102 | def b_delete_audio(*checkbox_list): 103 | global g_data_json, g_index, g_max_json_index 104 | change = False 105 | for i, checkbox in reversed(list(enumerate(checkbox_list))): 106 | if g_index + i < len(g_data_json): 107 | if (checkbox == True): 108 | if g_force_delete: 109 | print("remove",g_data_json[g_index + i][g_json_key_path]) 110 | os.remove(g_data_json[g_index + i][g_json_key_path]) 111 | g_data_json.pop(g_index + i) 112 | change = True 113 | 114 | g_max_json_index = len(g_data_json)-1 115 | if g_index > g_max_json_index: 116 | g_index = g_max_json_index 117 | g_index = g_index if g_index >= 0 else 0 118 | if g_force_delete and change: 119 | b_save_file() 120 | return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch) 121 | 122 | 123 | def b_invert_selection(*checkbox_list): 124 | new_list = [not item if item is True else True for item in checkbox_list] 125 | return new_list 126 | 127 | 128 | def get_next_path(filename): 129 | base_dir = os.path.dirname(filename) 130 | base_name = os.path.splitext(os.path.basename(filename))[0] 131 | for i in range(100): 132 | new_path = os.path.join(base_dir, f"{base_name}_{str(i).zfill(2)}.wav") 133 | if not os.path.exists(new_path) : 134 | return new_path 135 | return os.path.join(base_dir, f'{str(uuid.uuid4())}.wav') 136 | 137 | 138 | def 
b_audio_split(audio_breakpoint, *checkbox_list): 139 | global g_data_json , g_max_json_index 140 | checked_index = [] 141 | for i, checkbox in enumerate(checkbox_list): 142 | if (checkbox == True and g_index+i < len(g_data_json)): 143 | checked_index.append(g_index + i) 144 | if len(checked_index) == 1 : 145 | index = checked_index[0] 146 | audio_json = copy.deepcopy(g_data_json[index]) 147 | path = audio_json[g_json_key_path] 148 | data, sample_rate = librosa.load(path, sr=None, mono=True) 149 | audio_maxframe = len(data) 150 | break_frame = int(audio_breakpoint * sample_rate) 151 | 152 | if (break_frame >= 1 and break_frame < audio_maxframe): 153 | audio_first = data[0:break_frame] 154 | audio_second = data[break_frame:] 155 | nextpath = get_next_path(path) 156 | soundfile.write(nextpath, audio_second, sample_rate) 157 | soundfile.write(path, audio_first, sample_rate) 158 | g_data_json.insert(index + 1, audio_json) 159 | g_data_json[index + 1][g_json_key_path] = nextpath 160 | b_save_file() 161 | 162 | g_max_json_index = len(g_data_json) - 1 163 | return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) 164 | 165 | def b_merge_audio(interval_r, *checkbox_list): 166 | global g_data_json , g_max_json_index 167 | checked_index = [] 168 | audios_path = [] 169 | audios_text = [] 170 | delete_files = [] 171 | for i, checkbox in enumerate(checkbox_list): 172 | if (checkbox == True and g_index+i < len(g_data_json)): 173 | checked_index.append(g_index + i) 174 | 175 | if (len(checked_index)>1): 176 | for i in checked_index: 177 | audios_path.append(g_data_json[i][g_json_key_path]) 178 | audios_text.append(g_data_json[i][g_json_key_text]) 179 | for i in reversed(checked_index[1:]): 180 | delete_files.append(g_data_json[i][g_json_key_path]) 181 | g_data_json.pop(i) 182 | 183 | base_index = checked_index[0] 184 | base_path = audios_path[0] 185 | g_data_json[base_index][g_json_key_text] = "".join(audios_text) 186 | 187 | audio_list = [] 188 | l_sample_rate = None 189 | for i, path in enumerate(audios_path): 190 | data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True) 191 | l_sample_rate = sample_rate 192 | if (i > 0): 193 | silence = np.zeros(int(l_sample_rate * interval_r)) 194 | audio_list.append(silence) 195 | 196 | audio_list.append(data) 197 | 198 | audio_concat = np.concatenate(audio_list) 199 | 200 | for item_file in delete_files: 201 | os.remove(item_file) 202 | 203 | soundfile.write(base_path, audio_concat, l_sample_rate) 204 | 205 | b_save_file() 206 | 207 | g_max_json_index = len(g_data_json) - 1 208 | 209 | return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) 210 | 211 | 212 | def b_save_json(): 213 | with open(g_load_file,'w', encoding="utf-8") as file: 214 | for data in g_data_json: 215 | file.write(f'{json.dumps(data, ensure_ascii = False)}\n') 216 | 217 | 218 | def b_save_list(): 219 | with open(g_load_file,'w', encoding="utf-8") as file: 220 | for data in g_data_json: 221 | wav_path = data["wav_path"] 222 | speaker_name = data["speaker_name"] 223 | language = data["language"] 224 | text = data["text"] 225 | file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip()+'\n') 226 | 227 | 228 | def b_load_json(): 229 | global g_data_json, g_max_json_index 230 | with open(g_load_file, 'r', encoding="utf-8") as file: 231 | g_data_json = file.readlines() 232 | g_data_json = [json.loads(line) for line in g_data_json] 233 | g_max_json_index = len(g_data_json) - 1 234 | 235 | 236 | def b_load_list(): 
237 | global g_data_json, g_max_json_index 238 | with open(g_load_file, 'r', encoding="utf-8") as source: 239 | data_list = source.readlines() 240 | for _ in data_list: 241 | data = _.split('|') 242 | if (len(data) == 4): 243 | wav_path, speaker_name, language, text = data 244 | g_data_json.append( 245 | { 246 | 'wav_path':wav_path, 247 | 'speaker_name':speaker_name, 248 | 'language':language, 249 | 'text':text.strip() 250 | } 251 | ) 252 | else: 253 | print("error line:", data) 254 | g_max_json_index = len(g_data_json) - 1 255 | 256 | 257 | def b_save_file(): 258 | if g_load_format == "json": 259 | b_save_json() 260 | elif g_load_format == "list": 261 | b_save_list() 262 | 263 | 264 | def b_load_file(): 265 | if g_load_format == "json": 266 | b_load_json() 267 | elif g_load_format == "list": 268 | b_load_list() 269 | 270 | 271 | def set_global(load_json, load_list, json_key_text, json_key_path, batch, webui_language, force_delete): 272 | global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch, g_language, g_force_delete 273 | 274 | g_batch = int(batch) 275 | 276 | if (load_json != "None"): 277 | g_load_format = "json" 278 | g_load_file = load_json 279 | elif (load_list != "None"): 280 | g_load_format = "list" 281 | g_load_file = load_list 282 | else: 283 | g_load_format = "list" 284 | g_load_file = "demo.list" 285 | 286 | g_json_key_text = json_key_text 287 | g_json_key_path = json_key_path 288 | g_language = TextLanguage(webui_language) 289 | g_force_delete = force_delete 290 | 291 | b_load_file() 292 | 293 | 294 | def startwebui(args): 295 | 296 | set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch, args.webui_language, args.force_delete) 297 | 298 | with gr.Blocks() as demo: 299 | 300 | with gr.Row(): 301 | btn_change_index = gr.Button(g_language("Change Index")) 302 | btn_submit_change = gr.Button(g_language("Submit Text")) 303 | btn_merge_audio = gr.Button(g_language("Merge Audio")) 304 | btn_delete_audio = gr.Button(g_language("Delete Audio")) 305 | btn_previous_index = gr.Button(g_language("Previous Index")) 306 | btn_next_index = gr.Button(g_language("Next Index")) 307 | 308 | with gr.Row(): 309 | index_slider = gr.Slider( 310 | minimum=0, maximum=g_max_json_index, value=g_index, step=1, label=g_language("Index"), scale=3 311 | ) 312 | splitpoint_slider = gr.Slider( 313 | minimum=0, maximum=120.0, value=0, step=0.1, label=g_language("Audio Split Point(s)"), scale=3 314 | ) 315 | btn_audio_split = gr.Button(g_language("Split Audio"), scale=1) 316 | btn_save_json = gr.Button(g_language("Save File"), visible=True, scale=1) 317 | btn_invert_selection = gr.Button(g_language("Invert Selection"), scale=1) 318 | 319 | with gr.Row(): 320 | with gr.Column(): 321 | for _ in range(0,g_batch): 322 | with gr.Row(): 323 | text = gr.Textbox( 324 | label = "Text", 325 | visible = True, 326 | scale=5 327 | ) 328 | audio_output = gr.Audio( 329 | label= g_language("Output Audio"), 330 | visible = True, 331 | scale=5 332 | ) 333 | audio_check = gr.Checkbox( 334 | label="Yes", 335 | show_label = True, 336 | info = g_language("Choose Audio"), 337 | scale=1 338 | ) 339 | g_text_list.append(text) 340 | g_audio_list.append(audio_output) 341 | g_checkbox_list.append(audio_check) 342 | 343 | 344 | 345 | with gr.Row(): 346 | batchsize_slider = gr.Slider( 347 | minimum=1, maximum=g_batch, value=g_batch, step=1, label=g_language("Batch Size"), scale=3, interactive=False 348 | ) 349 | interval_slider = gr.Slider( 350 | minimum=0, maximum=2, 
value=0, step=0.01, label=g_language("Interval"), scale=3 351 | ) 352 | btn_theme_dark = gr.Button(g_language("Light Theme"), link="?__theme=light", scale=1) 353 | btn_theme_light = gr.Button(g_language("Dark Theme"), link="?__theme=dark", scale=1) 354 | 355 | btn_change_index.click( 356 | b_change_index, 357 | inputs=[ 358 | index_slider, 359 | batchsize_slider, 360 | ], 361 | outputs=[ 362 | *g_text_list, 363 | *g_audio_list, 364 | *g_checkbox_list 365 | ], 366 | ) 367 | 368 | 369 | btn_submit_change.click( 370 | b_submit_change, 371 | inputs=[ 372 | *g_text_list, 373 | ], 374 | outputs=[ 375 | index_slider, 376 | *g_text_list, 377 | *g_audio_list, 378 | *g_checkbox_list 379 | ], 380 | ) 381 | 382 | btn_previous_index.click( 383 | b_previous_index, 384 | inputs=[ 385 | index_slider, 386 | batchsize_slider, 387 | ], 388 | outputs=[ 389 | index_slider, 390 | *g_text_list, 391 | *g_audio_list, 392 | *g_checkbox_list 393 | ], 394 | ) 395 | 396 | btn_next_index.click( 397 | b_next_index, 398 | inputs=[ 399 | index_slider, 400 | batchsize_slider, 401 | ], 402 | outputs=[ 403 | index_slider, 404 | *g_text_list, 405 | *g_audio_list, 406 | *g_checkbox_list 407 | ], 408 | ) 409 | 410 | btn_delete_audio.click( 411 | b_delete_audio, 412 | inputs=[ 413 | *g_checkbox_list 414 | ], 415 | outputs=[ 416 | index_slider, 417 | *g_text_list, 418 | *g_audio_list, 419 | *g_checkbox_list 420 | ] 421 | ) 422 | 423 | btn_merge_audio.click( 424 | b_merge_audio, 425 | inputs=[ 426 | interval_slider, 427 | *g_checkbox_list 428 | ], 429 | outputs=[ 430 | index_slider, 431 | *g_text_list, 432 | *g_audio_list, 433 | *g_checkbox_list 434 | ] 435 | ) 436 | 437 | btn_audio_split.click( 438 | b_audio_split, 439 | inputs=[ 440 | splitpoint_slider, 441 | *g_checkbox_list 442 | ], 443 | outputs=[ 444 | index_slider, 445 | *g_text_list, 446 | *g_audio_list, 447 | *g_checkbox_list 448 | ] 449 | ) 450 | 451 | btn_invert_selection.click( 452 | b_invert_selection, 453 | inputs=[ 454 | *g_checkbox_list 455 | ], 456 | outputs=[ 457 | *g_checkbox_list 458 | ] 459 | ) 460 | 461 | btn_save_json.click( 462 | b_save_file 463 | ) 464 | 465 | demo.load( 466 | b_change_index, 467 | inputs=[ 468 | index_slider, 469 | batchsize_slider, 470 | ], 471 | outputs=[ 472 | *g_text_list, 473 | *g_audio_list, 474 | *g_checkbox_list 475 | ], 476 | ) 477 | 478 | demo.launch() -------------------------------------------------------------------------------- /subfix_webui.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import json 4 | import os 5 | import uuid 6 | 7 | import librosa 8 | import gradio as gr 9 | import numpy as np 10 | import soundfile 11 | 12 | 13 | """ 14 | Apache License 15 | Version 2.0, January 2004 16 | https://www.apache.org/licenses/LICENSE-2.0 17 | 18 | SubFix 19 | cronrpc 20 | https://github.com/cronrpc/SubFix 21 | """ 22 | 23 | 24 | g_json_key_text = "" 25 | g_json_key_path = "" 26 | g_load_file = "" 27 | g_load_format = "" 28 | 29 | g_max_json_index = 0 30 | g_index = 0 31 | g_batch = 10 32 | g_text_list = [] 33 | g_audio_list = [] 34 | g_checkbox_list = [] 35 | g_data_json = [] 36 | g_language = None 37 | 38 | 39 | SUBFIX_LANG_CONFIG_MAP = { 40 | "zh": { 41 | "Change Index" : "改变索引", 42 | "Submit Text" : "保存文本", 43 | "Merge Audio" : "合并音频", 44 | "Delete Audio" : "删除音频", 45 | "Previous Index" : "前一页", 46 | "Next Index" : "后一页", 47 | "Light Theme" : "亮色模式", 48 | "Dark Theme" : "黑暗模式", 49 | "Choose Audio" : "选择音频", 50 | "Output Audio" : "Output Audio", 51 
| "Text" : "文本", 52 | "Invert Selection": "反选", 53 | "Save File" : "保存文件", 54 | "Split Audio" : "分割音频", 55 | "Audio Split Point(s)" : "音频分割点(单位:秒)", 56 | "Index":"索引", 57 | "Interval":"合并间隔(单位:秒)" 58 | }, 59 | } 60 | 61 | 62 | class SUBFIX_TextLanguage(): 63 | def __init__(self, language : str = "en") -> None: 64 | if language in SUBFIX_LANG_CONFIG_MAP.keys(): 65 | self.language = language 66 | else: 67 | self.language = "en" 68 | pass 69 | 70 | def get_text(self, text : str) -> str: 71 | if self.language == "en": 72 | return text 73 | elif text in SUBFIX_LANG_CONFIG_MAP[self.language].keys() : 74 | return SUBFIX_LANG_CONFIG_MAP[self.language][text] 75 | else: 76 | return text 77 | 78 | def __call__(self, text : str) -> str: 79 | return self.get_text(text) 80 | 81 | 82 | def reload_data(index, batch): 83 | global g_index 84 | g_index = index 85 | global g_batch 86 | g_batch = batch 87 | datas = g_data_json[index:index+batch] 88 | output = [] 89 | for d in datas: 90 | output.append( 91 | { 92 | g_json_key_text: d[g_json_key_text], 93 | g_json_key_path: d[g_json_key_path] 94 | } 95 | ) 96 | return output 97 | 98 | 99 | def b_change_index(index, batch): 100 | global g_index, g_batch 101 | g_index, g_batch = index, batch 102 | datas = reload_data(index, batch) 103 | output = [] 104 | for i , _ in enumerate(datas): 105 | output.append( 106 | gr.Textbox( 107 | label=f"Text {i+index}", 108 | value=_[g_json_key_text] 109 | ) 110 | ) 111 | for _ in range(g_batch - len(datas)): 112 | output.append( 113 | gr.Textbox( 114 | label=f"Text", 115 | value="" 116 | ) 117 | ) 118 | for _ in datas: 119 | output.append(_[g_json_key_path]) 120 | for _ in range(g_batch - len(datas)): 121 | output.append(None) 122 | for _ in range(g_batch): 123 | output.append(False) 124 | return output 125 | 126 | 127 | def b_next_index(index, batch): 128 | if (index + batch) <= g_max_json_index: 129 | return index + batch , *b_change_index(index + batch, batch) 130 | else: 131 | return index, *b_change_index(index, batch) 132 | 133 | 134 | def b_previous_index(index, batch): 135 | if (index - batch) >= 0: 136 | return index - batch , *b_change_index(index - batch, batch) 137 | else: 138 | return 0, *b_change_index(0, batch) 139 | 140 | 141 | def b_submit_change(*text_list): 142 | global g_data_json 143 | change = False 144 | for i, new_text in enumerate(text_list): 145 | if g_index + i <= g_max_json_index: 146 | new_text = new_text.strip()+' ' 147 | if (g_data_json[g_index + i][g_json_key_text] != new_text): 148 | g_data_json[g_index + i][g_json_key_text] = new_text 149 | change = True 150 | if change: 151 | b_save_file() 152 | return g_index, *b_change_index(g_index, g_batch) 153 | 154 | 155 | def b_delete_audio(*checkbox_list): 156 | global g_data_json, g_index, g_max_json_index 157 | change = False 158 | for i, checkbox in reversed(list(enumerate(checkbox_list))): 159 | if g_index + i < len(g_data_json): 160 | if (checkbox == True): 161 | if g_force_delete: 162 | print("remove",g_data_json[g_index + i][g_json_key_path]) 163 | os.remove(g_data_json[g_index + i][g_json_key_path]) 164 | g_data_json.pop(g_index + i) 165 | change = True 166 | 167 | g_max_json_index = len(g_data_json)-1 168 | if g_index > g_max_json_index: 169 | g_index = g_max_json_index 170 | g_index = g_index if g_index >= 0 else 0 171 | if g_force_delete and change: 172 | b_save_file() 173 | return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch) 174 | 175 | 176 | def 
b_invert_selection(*checkbox_list): 177 | new_list = [not item if item is True else True for item in checkbox_list] 178 | return new_list 179 | 180 | 181 | def get_next_path(filename): 182 | base_dir = os.path.dirname(filename) 183 | base_name = os.path.splitext(os.path.basename(filename))[0] 184 | for i in range(100): 185 | new_path = os.path.join(base_dir, f"{base_name}_{str(i).zfill(2)}.wav") 186 | if not os.path.exists(new_path) : 187 | return new_path 188 | return os.path.join(base_dir, f'{str(uuid.uuid4())}.wav') 189 | 190 | 191 | def b_audio_split(audio_breakpoint, *checkbox_list): 192 | global g_data_json , g_max_json_index 193 | checked_index = [] 194 | for i, checkbox in enumerate(checkbox_list): 195 | if (checkbox == True and g_index+i < len(g_data_json)): 196 | checked_index.append(g_index + i) 197 | if len(checked_index) == 1 : 198 | index = checked_index[0] 199 | audio_json = copy.deepcopy(g_data_json[index]) 200 | path = audio_json[g_json_key_path] 201 | data, sample_rate = librosa.load(path, sr=None, mono=True) 202 | audio_maxframe = len(data) 203 | break_frame = int(audio_breakpoint * sample_rate) 204 | 205 | if (break_frame >= 1 and break_frame < audio_maxframe): 206 | audio_first = data[0:break_frame] 207 | audio_second = data[break_frame:] 208 | nextpath = get_next_path(path) 209 | soundfile.write(nextpath, audio_second, sample_rate) 210 | soundfile.write(path, audio_first, sample_rate) 211 | g_data_json.insert(index + 1, audio_json) 212 | g_data_json[index + 1][g_json_key_path] = nextpath 213 | b_save_file() 214 | 215 | g_max_json_index = len(g_data_json) - 1 216 | return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) 217 | 218 | def b_merge_audio(interval_r, *checkbox_list): 219 | global g_data_json , g_max_json_index 220 | checked_index = [] 221 | audios_path = [] 222 | audios_text = [] 223 | delete_files = [] 224 | for i, checkbox in enumerate(checkbox_list): 225 | if (checkbox == True and g_index+i < len(g_data_json)): 226 | checked_index.append(g_index + i) 227 | 228 | if (len(checked_index)>1): 229 | for i in checked_index: 230 | audios_path.append(g_data_json[i][g_json_key_path]) 231 | audios_text.append(g_data_json[i][g_json_key_text]) 232 | for i in reversed(checked_index[1:]): 233 | delete_files.append(g_data_json[i][g_json_key_path]) 234 | g_data_json.pop(i) 235 | 236 | base_index = checked_index[0] 237 | base_path = audios_path[0] 238 | g_data_json[base_index][g_json_key_text] = "".join(audios_text) 239 | 240 | audio_list = [] 241 | l_sample_rate = None 242 | for i, path in enumerate(audios_path): 243 | data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True) 244 | l_sample_rate = sample_rate 245 | if (i > 0): 246 | silence = np.zeros(int(l_sample_rate * interval_r)) 247 | audio_list.append(silence) 248 | 249 | audio_list.append(data) 250 | 251 | audio_concat = np.concatenate(audio_list) 252 | 253 | for item_file in delete_files: 254 | os.remove(item_file) 255 | 256 | soundfile.write(base_path, audio_concat, l_sample_rate) 257 | 258 | b_save_file() 259 | 260 | g_max_json_index = len(g_data_json) - 1 261 | 262 | return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) 263 | 264 | 265 | def b_save_json(): 266 | with open(g_load_file,'w', encoding="utf-8") as file: 267 | for data in g_data_json: 268 | file.write(f'{json.dumps(data, ensure_ascii = False)}\n') 269 | 270 | 271 | def b_save_list(): 272 | with open(g_load_file,'w', encoding="utf-8") as file: 273 | for data in 
g_data_json: 274 | wav_path = data["wav_path"] 275 | speaker_name = data["speaker_name"] 276 | language = data["language"] 277 | text = data["text"] 278 | file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip()+'\n') 279 | 280 | 281 | def b_load_json(): 282 | global g_data_json, g_max_json_index 283 | with open(g_load_file, 'r', encoding="utf-8") as file: 284 | g_data_json = file.readlines() 285 | g_data_json = [json.loads(line) for line in g_data_json] 286 | g_max_json_index = len(g_data_json) - 1 287 | 288 | 289 | def b_load_list(): 290 | global g_data_json, g_max_json_index 291 | with open(g_load_file, 'r', encoding="utf-8") as source: 292 | data_list = source.readlines() 293 | for _ in data_list: 294 | data = _.split('|') 295 | if (len(data) == 4): 296 | wav_path, speaker_name, language, text = data 297 | g_data_json.append( 298 | { 299 | 'wav_path':wav_path, 300 | 'speaker_name':speaker_name, 301 | 'language':language, 302 | 'text':text.strip() 303 | } 304 | ) 305 | else: 306 | print("error line:", data) 307 | g_max_json_index = len(g_data_json) - 1 308 | 309 | 310 | def b_save_file(): 311 | if g_load_format == "json": 312 | b_save_json() 313 | elif g_load_format == "list": 314 | b_save_list() 315 | 316 | 317 | def b_load_file(): 318 | if g_load_format == "json": 319 | b_load_json() 320 | elif g_load_format == "list": 321 | b_load_list() 322 | 323 | 324 | def set_global(load_json, load_list, json_key_text, json_key_path, batch, webui_language, force_delete): 325 | global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch, g_language, g_force_delete 326 | 327 | g_batch = int(batch) 328 | 329 | if (load_json != "None"): 330 | g_load_format = "json" 331 | g_load_file = load_json 332 | elif (load_list != "None"): 333 | g_load_format = "list" 334 | g_load_file = load_list 335 | else: 336 | g_load_format = "list" 337 | g_load_file = "demo.list" 338 | 339 | g_json_key_text = json_key_text 340 | g_json_key_path = json_key_path 341 | g_language = SUBFIX_TextLanguage(webui_language) 342 | g_force_delete = force_delete 343 | 344 | b_load_file() 345 | 346 | 347 | def subfix_startwebui(args): 348 | 349 | set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch, args.webui_language, args.force_delete) 350 | 351 | with gr.Blocks() as demo: 352 | 353 | with gr.Row(): 354 | btn_change_index = gr.Button(g_language("Change Index")) 355 | btn_submit_change = gr.Button(g_language("Submit Text")) 356 | btn_merge_audio = gr.Button(g_language("Merge Audio")) 357 | btn_delete_audio = gr.Button(g_language("Delete Audio")) 358 | btn_previous_index = gr.Button(g_language("Previous Index")) 359 | btn_next_index = gr.Button(g_language("Next Index")) 360 | 361 | with gr.Row(): 362 | index_slider = gr.Slider( 363 | minimum=0, maximum=g_max_json_index, value=g_index, step=1, label=g_language("Index"), scale=3 364 | ) 365 | splitpoint_slider = gr.Slider( 366 | minimum=0, maximum=120.0, value=0, step=0.1, label=g_language("Audio Split Point(s)"), scale=3 367 | ) 368 | btn_audio_split = gr.Button(g_language("Split Audio"), scale=1) 369 | btn_save_json = gr.Button(g_language("Save File"), visible=True, scale=1) 370 | btn_invert_selection = gr.Button(g_language("Invert Selection"), scale=1) 371 | 372 | with gr.Row(): 373 | with gr.Column(): 374 | for _ in range(0,g_batch): 375 | with gr.Row(): 376 | text = gr.Textbox( 377 | label = "Text", 378 | visible = True, 379 | scale=5 380 | ) 381 | audio_output = gr.Audio( 382 | label= g_language("Output Audio"), 
383 | visible = True, 384 | scale=5 385 | ) 386 | audio_check = gr.Checkbox( 387 | label="Yes", 388 | show_label = True, 389 | info = g_language("Choose Audio"), 390 | scale=1 391 | ) 392 | g_text_list.append(text) 393 | g_audio_list.append(audio_output) 394 | g_checkbox_list.append(audio_check) 395 | 396 | 397 | 398 | with gr.Row(): 399 | batchsize_slider = gr.Slider( 400 | minimum=1, maximum=g_batch, value=g_batch, step=1, label=g_language("Batch Size"), scale=3, interactive=False 401 | ) 402 | interval_slider = gr.Slider( 403 | minimum=0, maximum=2, value=0, step=0.01, label=g_language("Interval"), scale=3 404 | ) 405 | btn_theme_dark = gr.Button(g_language("Light Theme"), link="?__theme=light", scale=1) 406 | btn_theme_light = gr.Button(g_language("Dark Theme"), link="?__theme=dark", scale=1) 407 | 408 | btn_change_index.click( 409 | b_change_index, 410 | inputs=[ 411 | index_slider, 412 | batchsize_slider, 413 | ], 414 | outputs=[ 415 | *g_text_list, 416 | *g_audio_list, 417 | *g_checkbox_list 418 | ], 419 | ) 420 | 421 | 422 | btn_submit_change.click( 423 | b_submit_change, 424 | inputs=[ 425 | *g_text_list, 426 | ], 427 | outputs=[ 428 | index_slider, 429 | *g_text_list, 430 | *g_audio_list, 431 | *g_checkbox_list 432 | ], 433 | ) 434 | 435 | btn_previous_index.click( 436 | b_previous_index, 437 | inputs=[ 438 | index_slider, 439 | batchsize_slider, 440 | ], 441 | outputs=[ 442 | index_slider, 443 | *g_text_list, 444 | *g_audio_list, 445 | *g_checkbox_list 446 | ], 447 | ) 448 | 449 | btn_next_index.click( 450 | b_next_index, 451 | inputs=[ 452 | index_slider, 453 | batchsize_slider, 454 | ], 455 | outputs=[ 456 | index_slider, 457 | *g_text_list, 458 | *g_audio_list, 459 | *g_checkbox_list 460 | ], 461 | ) 462 | 463 | btn_delete_audio.click( 464 | b_delete_audio, 465 | inputs=[ 466 | *g_checkbox_list 467 | ], 468 | outputs=[ 469 | index_slider, 470 | *g_text_list, 471 | *g_audio_list, 472 | *g_checkbox_list 473 | ] 474 | ) 475 | 476 | btn_merge_audio.click( 477 | b_merge_audio, 478 | inputs=[ 479 | interval_slider, 480 | *g_checkbox_list 481 | ], 482 | outputs=[ 483 | index_slider, 484 | *g_text_list, 485 | *g_audio_list, 486 | *g_checkbox_list 487 | ] 488 | ) 489 | 490 | btn_audio_split.click( 491 | b_audio_split, 492 | inputs=[ 493 | splitpoint_slider, 494 | *g_checkbox_list 495 | ], 496 | outputs=[ 497 | index_slider, 498 | *g_text_list, 499 | *g_audio_list, 500 | *g_checkbox_list 501 | ] 502 | ) 503 | 504 | btn_invert_selection.click( 505 | b_invert_selection, 506 | inputs=[ 507 | *g_checkbox_list 508 | ], 509 | outputs=[ 510 | *g_checkbox_list 511 | ] 512 | ) 513 | 514 | btn_save_json.click( 515 | b_save_file 516 | ) 517 | 518 | demo.load( 519 | b_change_index, 520 | inputs=[ 521 | index_slider, 522 | batchsize_slider, 523 | ], 524 | outputs=[ 525 | *g_text_list, 526 | *g_audio_list, 527 | *g_checkbox_list 528 | ], 529 | ) 530 | 531 | demo.launch(server_port = args.server_port) 532 | 533 | 534 | if __name__ == "__main__": 535 | parser_subfix_webui = argparse.ArgumentParser(description='Process some integers.') 536 | parser_subfix_webui.add_argument('--load_json', default="None", help='source file, like demo.json') 537 | parser_subfix_webui.add_argument('--load_list', default="None", help='source file, like demo.list') 538 | parser_subfix_webui.add_argument('--json_key_text', default="text", help='the text key name in json, Default: text') 539 | parser_subfix_webui.add_argument('--json_key_path', default="wav_path", help='the path key name in json, Default: wav_path') 540 | 
parser_subfix_webui.add_argument('--g_batch', default=10, help='maximum number of audio items to display per page, Default: 10') 541 | parser_subfix_webui.add_argument('--webui_language', default="en", type=str, help='webui language: en or zh, Default: en') 542 | parser_subfix_webui.add_argument('--force_delete', default="True", type=str, help='also delete the audio file on disk when an item is deleted, True or False, Default: True') 543 | parser_subfix_webui.add_argument('--server_port', default=7860, type=int, help='the webui port, Default: 7860') 544 | 545 | parser_subfix = parser_subfix_webui.parse_args() 546 | 547 | parser_subfix.force_delete = (parser_subfix.force_delete.upper() == "TRUE") 548 | 549 | subfix_startwebui(parser_subfix) --------------------------------------------------------------------------------
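
A few usage notes on the code above. The WebUI (both subfix/webui/webui.py and the standalone subfix_webui.py) edits annotations loaded either from a JSON-lines file or from a ".list" file whose rows are wav_path|speaker_name|language|text, which is exactly what b_load_list parses and b_save_list writes back. Below is a minimal sketch of producing such a file by hand; the wav paths and the speaker label are hypothetical examples, and each row must contain exactly four "|"-separated fields or the loader reports it as an error line.

# Minimal sketch: write a SubFix-compatible ".list" annotation file.
# The wav paths and the speaker label are hypothetical; each row needs
# exactly four "|"-separated fields, matching what b_load_list() expects.
rows = [
    ("dataset/alice/000001.wav", "alice", "ZH", "今天天气不错。"),
    ("dataset/alice/000002.wav", "alice", "ZH", "我们出去走走吧。"),
]

with open("demo.list", "w", encoding="utf-8") as f:
    for wav_path, speaker_name, language, text in rows:
        f.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip() + "\n")

The standalone script can then be launched with, for example, python subfix_webui.py --load_list demo.list --webui_language zh --g_batch 10, using the flags defined at the bottom of subfix_webui.py; if neither --load_json nor --load_list is given, set_global falls back to loading demo.list.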
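
The model wrappers under subfix/models/audio share one pattern: the heavyweight pipeline is built in __init__ and __call__ simply forwards to infer, so an instance can be used as a plain callable. The sketch below assumes modelscope and funasr are installed and that the audio path (purely illustrative) points to an existing mono wav; Speech_Fsmn_Vad_Zh_16k_Common returns a list of {'start': seconds, 'end': seconds} segments, splitting any span longer than max_seconds, and Punctuation_FunASR returns the input text with punctuation restored.

from subfix.models.audio.punctuation import Punctuation_FunASR
from subfix.models.audio.vad.speech_fsmn_vad_zh import Speech_Fsmn_Vad_Zh_16k_Common

audio_path = "samples/example_16k.wav"    # hypothetical 16 kHz mono file

vad = Speech_Fsmn_Vad_Zh_16k_Common(max_seconds=60.0)
for seg in vad(audio_path):               # __call__ forwards to infer()
    print(f"speech segment: {seg['start']:.2f}s - {seg['end']:.2f}s")

punc = Punctuation_FunASR()
print(punc("今天天气不错我们出去走走吧"))  # text comes back with punctuation added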
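
The conversion helpers in subfix/utils do the resampling used by every solution script: convert_files walks the source directory with get_files_by_ext, then converts each matching file to mono wav at the requested sample rate, preferring ffmpeg when it is found on PATH and falling back to librosa plus soundfile otherwise; files whose converted counterpart already exists are skipped. A short sketch with illustrative directory names:

from subfix.utils import convert_files, get_files_by_ext

# Hypothetical directories; the target tree is created as needed and the
# conversion of each file runs in a thread pool (one task per file).
convert_files("raw_audio", "cache/subfix/origin/44100", sample_rate=44100)

# The same helper the converter uses internally; paths are returned
# relative to the directory that was walked, sorted alphabetically.
print(get_files_by_ext("cache/subfix/origin/44100", [".wav"]))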
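
create_list in subfix/solution/modelscope_multi_lang.py (and create_whisper_list for the Whisper variant) expects the source directory to hold one sub-directory per speaker containing wav files; it resamples into the cache directory, runs ASR, cuts the audio into sentence-sized slices with merge_audio_slice, and writes sliced_audio_path|speaker_name|language|text rows to the output list. Below is a hedged sketch of calling it directly, with illustrative paths and settings; the ModelScope models are downloaded on first use, so this is a usage outline rather than a ready-to-run command.

from subfix.solution.modelscope_multi_lang import create_list

# Illustrative layout: raw_audio/<speaker_name>/*.wav
create_list(
    source_dir="raw_audio",     # one sub-directory per speaker
    target_dir="dataset",       # slices land in dataset/<speaker_name>/000000.wav, 000001.wav, ...
    cache_dir="cache",          # resampled copies kept under cache/subfix/origin/<sample_rate>
    sample_rate=44100,
    language="ZH",              # "ZH" selects Paraformer; other codes use the UniASR multilingual model
    output_list="demo.list",    # the result can be opened in the WebUI afterwards
    max_seconds=15,
    absolute_path=False,        # True writes absolute paths into the list
)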