├── .gitignore ├── LICENSE ├── README.md ├── README_zh.md ├── images ├── delete.gif ├── index.gif ├── merge.gif ├── split.gif └── text.gif ├── requirements.txt ├── setup.py ├── subfix ├── __init__.py ├── cli.py ├── format │ ├── FormatBertvits2.py │ ├── FormatJson.py │ └── __init__.py ├── models │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── asr │ │ │ ├── __init__.py │ │ │ ├── openai_whisper.py │ │ │ ├── speech_paraformer_large_vad_punc_asr_zh.py │ │ │ └── speech_uniasr_asr_multilang.py │ │ ├── punctuation │ │ │ ├── __init__.py │ │ │ └── punctuation_funasr.py │ │ ├── speaker_diarization │ │ │ ├── Speech_Campplus_Speaker_Diarization.py │ │ │ └── __init__.py │ │ ├── vad │ │ │ ├── __init__.py │ │ │ └── speech_fsmn_vad_zh.py │ │ └── verification │ │ │ └── __init__.py │ └── nlp │ │ ├── __init__.py │ │ └── correction │ │ └── __init__.py ├── solution │ ├── __init__.py │ ├── diarization.py │ ├── modelscope_multi_lang.py │ ├── punctuation_multi_lang.py │ └── whisper_multi_lang.py ├── utils │ ├── __init__.py │ ├── convert.py │ ├── ext_files.py │ └── misc.py └── webui │ ├── __init__.py │ ├── language.py │ └── webui.py └── subfix_webui.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 cronrpc/SubFix 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SubFix 2 | `SubFix` is a web tool designed for easily editing and modifying audio subtitles. Users can see changes in real-time and conveniently **merge, split, delete, and edit subtitles** of audios. 3 | 4 | `SubFix` also supports automated voice annotation, utilizing `modelscope` and `whisper` for multilingual text annotation. 
Currently, `modelscope` provides automated annotations in languages including Chinese, English, Japanese, Korean, German, and Russian. `whisper` supports almost all languages. 5 | 6 | [中文版本](README_zh.md) 7 | 8 | A standalone `.py` file version is available for access at [subfix_webui.py](https://github.com/cronrpc/SubFix/blob/main/subfix_webui.py). This version allows language selection through command-line parameters, supporting both English and Chinese. Additionally, users can choose whether to synchronize the deletion of audio files on the hard drive during editing. 9 | 10 | Usage instructions for the standalone Python file version can be found at: [subfix_webui.py Help](#subfix_webuipy) 11 | 12 | ## Installation 13 | 14 | Follow these steps for a quick and easy installation. It's recommended to use a `Linux` environment. If using `Windows`, you will need to manually configure the `ffmpeg` environment variable, and installing `modelscope` might be more complex. 15 | 16 | ### Installing Dependencies 17 | 18 | Ensure the installed version of `Python` is above `3.9`, then execute the following command. If you do not need to use automatic labeling of audio, there is no need to install the `Modelscope` module. 19 | 20 | Using Conda: 21 | ```bash 22 | conda create -n modelscope python=3.9 23 | conda activate modelscope 24 | ``` 25 | 26 | Installing Dependencies 27 | 28 | #### In a Linux environment 29 | 30 | ```bash 31 | sudo apt install build-essential 32 | sudo apt install ffmpeg 33 | sudo apt install libsox-dev 34 | 35 | git clone https://github.com/cronrpc/SubFix.git 36 | cd SubFix 37 | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 38 | pip install -e . 39 | ``` 40 | 41 | #### Updating modelscope and FunASR to the Latest Versions 42 | 43 | Due to recent changes in the modelscope API, the code installed via pip may not be the latest version, potentially causing issues with automatic annotation. 44 | 45 | To ensure compatibility, it is recommended to install the latest version directly from the GitHub repository: 46 | 47 | ```bash 48 | # Install FunASR 49 | git clone https://github.com/alibaba/FunASR.git && cd FunASR 50 | pip3 install -e ./ 51 | 52 | # Install modelscope 53 | git clone https://github.com/modelscope/modelscope.git 54 | cd modelscope 55 | pip install -e . 56 | pip install -e .[audio_asr] 57 | ``` 58 | 59 | #### In a Windows environment 60 | 61 | If you have a GPU, you need to install the `cuda` version of `pytorch` beforehand and configure environment variables such as `ffmpeg`. Then execute the following commands: 62 | 63 | ```bash 64 | git clone https://github.com/cronrpc/SubFix.git 65 | cd SubFix 66 | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 67 | pip install -e . 68 | ``` 69 | 70 | For information on installing pytorch, please visit https://pytorch.org/get-started/locally/ 71 | 72 | ## Usage Guide 73 | 74 | After installing with `pip install -e .`, you can start the tool from any directory in the `shell` using the following command. All parameters have default values, so you don't need to input any `--option` if the default is used.
```bash 76 | subfix -h 77 | 78 | # webui 79 | subfix webui -h 80 | subfix webui --load_list demo.list --webui_language zh --force_delete True 81 | # create dataset 82 | subfix create modelscope -h 83 | # English 84 | subfix create modelscope --source_dir origin --language EN 85 | # Chinese 86 | subfix create modelscope --source_dir origin --language ZH 87 | # Japanese 88 | subfix create modelscope --source_dir origin --language JA 89 | # OpenAI Whisper Annotation (Supports Almost All Languages) 90 | subfix create whisper --source_dir origin --language ZH 91 | subfix create whisper --source_dir origin --language JA 92 | # diarization (speaker segmentation) 93 | subfix diarization -h 94 | subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0 95 | ``` 96 | 97 | Before using automated annotation, it's recommended to clear the `cache/subfix/` folder. 98 | ```bash 99 | rm -rf cache/subfix 100 | ``` 101 | 102 | ## Starting SubFix to View Dataset 103 | 104 | `SubFix` supports two formats: `.json` and `.list`. 105 | 106 | In the `.list` format, each line is similar to `"{wav_path}|{speaker_name}|{language}|{text}"`. 107 | 108 | For example, if you already have a `demo.list` file and its corresponding audio files are in the correct path, you can use the following commands to start the `SubFix` UI interface: 109 | 110 | ```bash 111 | subfix webui --load_list demo.list 112 | # or 113 | subfix webui --load_json demo.json 114 | ``` 115 | 116 | Viewing Help: 117 | ```bash 118 | subfix --help 119 | subfix webui --help 120 | ``` 121 | 122 | ### Quick Viewing and Listening to Audio 123 | 124 | You can click the `Previous Index` and `Next Index` buttons to switch lists, or drag the `slider` and click `Change Index` for quick positioning in the list. 125 | 126 | ![change index gif](images/index.gif) 127 | 128 | ### Modifying Text 129 | 130 | You can directly modify the text and click the `Submit Text` button to save the changes. 131 | 132 | ![change text gif](images/text.gif) 133 | 134 | ### Merging 135 | 136 | Select the audios you want to merge, set the `merge interval`, and then click the `merge` button to merge the audio. 137 | 138 | ![merge audio gif](images/merge.gif) 139 | 140 | ### Splitting Audio 141 | 142 | Select the audio to be split, set the `split point`, and then click the `split` button to proceed. Note that only one audio can be split at a time, and the text needs to be adjusted again after splitting. 143 | 144 | ![split audio gif](images/split.gif) 145 | 146 | ### Deleting 147 | 148 | Select the audio to be deleted and click the `button` to delete. The delete operation will be temporarily stored in memory. To save it to a file, click the save button or execute another command. 149 | 150 | ![delete audio gif](images/delete.gif) 151 | 152 | ### Automated Audio Annotation and Dataset Creation 153 | 154 | By default, place the audio files in the `origin` folder. For an audio file `abc.wav` by a speaker `sam`, its file path could be structured like `./origin/sam/abc.wav`. Then execute the following command: 155 | 156 | ```bash 157 | # rm -rf cache/subfix 158 | subfix create modelscope --source_dir origin --output demo.list 159 | ``` 160 | 161 | This command will create a `dataset` directory and store the paths and subtitles of all transcribed audio files in the `demo.list` file.
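As a quick reference, here is a minimal sketch of what the two dataset formats look like. The paths, speaker name, and text below are made-up placeholders; the real file names depend on how your audio was sliced. Each `.list` line follows the `{wav_path}|{speaker_name}|{language}|{text}` pattern, and the `.json` format stores the same four fields as one JSON object per line.

```
# demo.list — one "wav|speaker|language|text" entry per line (illustrative values)
dataset/sam/sam_0001.wav|sam|ZH|今天天气真好。
dataset/sam/sam_0002.wav|sam|ZH|我们出去走走吧。

# demo.json — the same data after `subfix format_convert --source demo.list --target demo.json`
{"wav_path": "dataset/sam/sam_0001.wav", "speaker_name": "sam", "language": "ZH", "text": "今天天气真好。"}
{"wav_path": "dataset/sam/sam_0002.wav", "speaker_name": "sam", "language": "ZH", "text": "我们出去走走吧。"}
```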
162 | 163 | ### Add Punctuation to List Files 164 | 165 | If you want to use punctuation, use the following command to automatically add punctuation to the text in the list file: 166 | 167 | ```bash 168 | subfix punctuation --load_list demo.list 169 | ``` 170 | 171 | ### Speaker Recognition and Clustering 172 | 173 | In some cases, long audio files contain background music, so vocals or noise from the backing track are transcribed and multiple speakers end up being detected in the same file. In other cases, the speech is so dense that the recognized segments become excessively long. 174 | 175 | This feature extracts the `n` most frequent speakers from each file, concatenating each speaker's utterances with an interval of `interval` seconds between them. The result is saved in the `diarization` folder, making it easier to extract audio later. 176 | 177 | ```bash 178 | subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0 --interval 10 --top_of_number 1 179 | subfix create modelscope --source_dir diarization --language ZH 180 | ``` 181 | 182 | ## Format Conversion 183 | 184 | The two formats, `.list` and `.json`, can be converted into each other. Use the following commands to convert files: 185 | 186 | ```bash 187 | subfix format_convert --source demo.list --target demo.json 188 | subfix format_convert --source demo.json --target demo.list 189 | ``` 190 | 191 | ## subfix_webui.py 192 | 193 | 194 | Usage of the standalone Python file version: 195 | 196 | View Help 197 | 198 | ```bash 199 | python subfix_webui.py -h 200 | ``` 201 | 202 | Launch in Chinese 203 | 204 | ```bash 205 | python subfix_webui.py --webui_language zh --load_list demo.list 206 | ``` 207 | 208 | Specify a `.list` File 209 | 210 | ```bash 211 | python subfix_webui.py --load_list demo.list 212 | ``` 213 | 214 | Synchronize deletion of files on disk; the default value is `True`.
215 | 216 | ```bash 217 | python subfix_webui.py --force_delete True 218 | # or 219 | python subfix_webui.py --force_delete False 220 | ``` 221 | 222 | Launch with a specified port 223 | 224 | ```bash 225 | python subfix_webui.py --server_port 1234 226 | ``` 227 | 228 | ## References 229 | 230 | - [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS) 231 | - [fishaudio/Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) 232 | - [openai/whisper](https://github.com/openai/whisper) -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | # SubFix 2 | `SubFix`是一个用于轻松地编辑修改音频字幕的网页工具。能够实时地看到改动,方便地对音频进行**合并、分割、删除、编辑字幕**。 3 | 4 | `SubFix`同时也支持自动化语音标注,使用`modelscope`和`whisper`对文本进行多语言标注。目前`modelscope`支持中文、英语、日语、德语、德语、俄语的自动化标注。`whisper`支持几乎所有语言。 5 | 6 | [English Version](README.md) 7 | 8 | 独立的`.py`文件版本,可以通过[subfix_webui.py](https://github.com/cronrpc/SubFix/blob/main/subfix_webui.py)获取。该版本可以通过命令行参数来选择语言,支持英文和中文。同时可以选择在编辑时是否同步删除硬盘上的音频文件。 9 | 10 | 单独Python文件版本的使用方法,可访问:[subfix_webui.py 帮助](#subfix_webuipy) 11 | 12 | ## 安装 13 | 14 | 进行如下安装步骤可以快速而轻松的安装。建议使用`Linux`环境。如果是`Windows`环境,需要您手动配置`ffmpeg`环境变量,并且`modelscope`的安装可能比较复杂。 15 | 16 | ### 安装依赖 17 | 18 | 确认安装的`Python`版本最好大于`3.9`,然后执行如下命令。如果您不需要使用音频的自动标注,那么不需要安装`Modelscope`模块。 19 | 20 | 使用conda 21 | ```bash 22 | conda create -n modelscope python=3.9 23 | conda activate modelscope 24 | ``` 25 | 26 | 安装依赖 27 | 28 | #### 在`Linux`环境 29 | 30 | ```bash 31 | sudo apt install build-essential 32 | sudo apt install ffmpeg 33 | sudo apt install libsox-dev 34 | 35 | git clone https://github.com/cronrpc/SubFix.git 36 | cd SubFix 37 | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 38 | pip install -e . 39 | ``` 40 | 41 | #### 更新modelscope和FunASR到最新版本 42 | 43 | 由于modelscope的API发生了改动,通过pip安装的代码未必是最新版本,可能存在无法运行自动标注的情况。 44 | 45 | 这里建议是,可以直接安装最新Github仓库的modelscope和funASR到最新版本。 46 | 47 | ``` 48 | # 安装FunASR 49 | git clone https://github.com/alibaba/FunASR.git && cd FunASR 50 | pip3 install -e ./ 51 | 52 | # 安装modelscope 53 | git clone https://github.com/modelscope/modelscope.git 54 | cd modelscope 55 | pip install -e . 56 | pip install -e .[audio_asr] 57 | ``` 58 | 59 | #### 在`Windows`环境 60 | 61 | 如果有gpu,需要提前安装`pytorch`的`cuda`版本,配置`ffmpeg`等环境变量,之后执行下列命令 62 | 63 | ```bash 64 | git clone https://github.com/cronrpc/SubFix.git 65 | cd SubFix 66 | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html 67 | pip install -e . 
68 | ``` 69 | 70 | 关于`pytorch`安装,请访问(https://pytorch.org/get-started/locally/) 71 | 72 | ## 使用指南 73 | 74 | 当你使用`pip install -e .`安装后,在`shell`下可以通过该命令在任意目录下启动本工具。所有参数都有默认值,如果是默认值的话,不需要输入任何`--option`。 75 | ```bash 76 | subfix -h 77 | 78 | # webui 79 | subfix webui -h 80 | subfix webui --load_list demo.list --webui_language zh --force_delete True 81 | # create dataset 82 | subfix create modelscope -h 83 | # 英语 84 | subfix create modelscope --source_dir origin --language EN 85 | # 中文 86 | subfix create modelscope --source_dir origin --language ZH 87 | # 日语 88 | subfix create modelscope --source_dir origin --language JA 89 | # Openai-Whisper标注 (几乎支持所有语言) 90 | subfix create whisper --source_dir origin --language ZH 91 | subfix create whisper --source_dir origin --language JA 92 | # 说话人确认 (分离不同说话人) 93 | subfix diarization -h 94 | subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0 95 | ``` 96 | 97 | 每次使用自动标注前,建议清空一下`cache/subfix/`文件夹 98 | ```bash 99 | rm -rf cache/subfix 100 | ``` 101 | 102 | ## 启动SubFix查看数据集 103 | 104 | `SubFix`支持2种格式,分别是`.json`和`.list`格式。 105 | 106 | `.list`的格式中,每行数据类似于`"{wav_path}|{speaker_name}|{language}|{text}"`。 107 | 108 | 例如,如果你已经有了一个`demo.list`文件,和它对应的音频已经放到了正确的路径,那么可以执行如下命令来启动`SubFix`的UI界面: 109 | 110 | ```bash 111 | subfix webui --load_list demo.list 112 | # or 113 | subfix webui --load_json demo.json 114 | ``` 115 | 116 | 查看帮助 117 | ```bash 118 | subfix --help 119 | subfix webui --help 120 | ``` 121 | 122 | ### 快速查看和听取音频 123 | 124 | 可以点击`Previous Index`、`Next Index`按钮来切换列表,同时可以拖动`slider`并点击`Change Index`来快速定位列表。 125 | 126 | ![change index gif](images/index.gif) 127 | 128 | ### 修改文本 129 | 130 | 可以直接修改文本,并点击`Submit Text`按钮来保存修改。 131 | 132 | ![change text gif](images/text.gif) 133 | 134 | ### 合并 135 | 136 | 选择需要合并的音频,设置`合并间隔`,然后点击`合并`按钮来合并音频。 137 | 138 | ![merge audio gif](images/merge.gif) 139 | 140 | ### 分割音频 141 | 142 | 选择需要分割的音频,设置`分割点`,然后点击`分割`按钮来进行分割。注意,一次只能分割一个音频,分割后需要重新调整下文本。 143 | 144 | ![split audio gif](images/split.gif) 145 | 146 | ### 删除 147 | 148 | 选择需要删除的音频,点击`按钮`进行删除。删除操作将暂存到内存之中,如果需要保存到文件中,需要点击保存按钮,或者执行一次其他命令来保存。 149 | 150 | ![delete audio gif](images/delete.gif) 151 | 152 | ### 自动标注音频和创建数据集 153 | 154 | 默认情况下,将音频文件放入`origin`文件夹下,对于一个`sam`音频文件`abc.wav`,其所在的文件路径可以是`./origin/sam/abc.wav`这样的结构,之后执行下面的命令: 155 | 156 | ```bash 157 | # rm -rf cache/subfix 158 | subfix create --source_dir origin --output demo.list 159 | ``` 160 | 161 | 该命令将创建一个`dataset`目录,同时将所有文件转录的音频的路径和字幕存储到了`demo.list`文件中。 162 | 163 | ### 给list文件添加标点符号 164 | 165 | 如果要用标点符号,使用下面的命令,自动对list文件中的文本添加标点符号。 166 | 167 | ``` 168 | subfix punctuation --load_list demo.list 169 | ``` 170 | 171 | ### 说话人识别、聚类 172 | 173 | 在某些情况下,大段落音频中由于存在背景音乐,会将背景歌曲的人声或噪音识别,造成同一文件中的多人说话。 174 | 175 | 又或者,说话太过密集,导致识别出来的音频过长。 176 | 177 | 该功能将提取出每个文件中出现次数最多的`n`个说话人,说话人的每句话之间间隔`interval`秒,保存在`diarization`文件夹中,便于后续提取音频。 178 | 179 | ```bash 180 | subfix diarization --source_dir origin --target_dir diarization --min_seconds 3.0 --interval 10 --top_of_number 1 181 | subfix create modelscope --source_dir diarization --language ZH 182 | ``` 183 | 184 | ## 格式转换 185 | 186 | 两种格式`.list`和`.json`可以互相转换,使用如下命令对文件进行转换: 187 | 188 | ```bash 189 | subfix format_convert --source demo.list --target demo.json 190 | subfix format_convert --source demo.json --target demo.list 191 | ``` 192 | 193 | ## subfix_webui.py 194 | 195 | 单独Python文件版本的使用方法: 196 | 197 | 查看帮助 198 | 199 | ```bash 200 | python subfix_webui.py -h 201 | ``` 202 | 203 | 中文启动 204 | 205 | ```bash 206 | python subfix_webui.py --webui_language zh 
--load_list demo.list 207 | ``` 208 | 209 | 指定`.list`文件 210 | 211 | ```bash 212 | python subfix_webui.py --load_list demo.list 213 | ``` 214 | 215 | 同步删除磁盘文件,默认值是True。 216 | 217 | ```bash 218 | python subfix_webui.py --force_delete True 219 | # or 220 | python subfix_webui.py --force_delete False 221 | ``` 222 | 223 | 指定端口启动 224 | 225 | ```bash 226 | python subfix_webui.py --server_port 1234 227 | ``` 228 | 229 | ## References 230 | 231 | - [anyvoiceai/MassTTS](https://github.com/anyvoiceai/MassTTS) 232 | - [fishaudio/Bert-VITS2](https://github.com/fishaudio/Bert-VITS2) 233 | - [openai/whisper](https://github.com/openai/whisper) -------------------------------------------------------------------------------- /images/delete.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/delete.gif -------------------------------------------------------------------------------- /images/index.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/index.gif -------------------------------------------------------------------------------- /images/merge.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/merge.gif -------------------------------------------------------------------------------- /images/split.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/split.gif -------------------------------------------------------------------------------- /images/text.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/images/text.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | librosa 2 | gradio>=3.50.2, <4.0.0 3 | numpy 4 | soundfile 5 | torchaudio 6 | transformers 7 | openai-whisper 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import io 5 | import os 6 | import sys 7 | import pkg_resources 8 | from shutil import rmtree 9 | from setuptools import find_packages, setup, Command 10 | 11 | NAME = 'SubFix' 12 | DESCRIPTION = 'A tool to read and create datasets for TTS training.'
13 | URL = 'https://github.com/cronrpc/SubFix' 14 | EMAIL = 'cronrpc' 15 | AUTHOR = 'cronrpc' 16 | REQUIRES_PYTHON = '>=3.8.0' 17 | VERSION = '0.1.2' 18 | 19 | REQUIRED = [ 20 | ] 21 | 22 | EXTRAS = { 23 | } 24 | 25 | here = os.path.abspath(os.path.dirname(__file__)) 26 | long_description = DESCRIPTION 27 | 28 | 29 | setup( 30 | name=NAME, 31 | version=VERSION, 32 | description=DESCRIPTION, 33 | long_description=long_description, 34 | long_description_content_type='text/markdown', 35 | author=AUTHOR, 36 | author_email=EMAIL, 37 | python_requires=REQUIRES_PYTHON, 38 | url=URL, 39 | packages=find_packages(), 40 | install_requires=REQUIRED 41 | + [ 42 | str(r) 43 | for r in pkg_resources.parse_requirements( 44 | open(os.path.join(os.path.dirname(__file__), "requirements.txt")) 45 | ) 46 | ], 47 | entry_points={ 48 | "console_scripts": ["subfix=subfix.cli:cli"], 49 | }, 50 | extras_require=EXTRAS, 51 | include_package_data=True, 52 | license='Apache 2.0', 53 | classifiers=[ 54 | 'License :: OSI Approved :: Apache Software License', 55 | 'Programming Language :: Python', 56 | 'Programming Language :: Python :: 3', 57 | 'Programming Language :: Python :: 3.8', 58 | 'Programming Language :: Python :: Implementation :: CPython', 59 | 'Programming Language :: Python :: Implementation :: PyPy' 60 | ], 61 | ) 62 | -------------------------------------------------------------------------------- /subfix/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/__init__.py -------------------------------------------------------------------------------- /subfix/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | 5 | def handle_diarization(args): 6 | print(f"handle_diarization from {args.source_dir} to {args.target_dir}") 7 | assert(os.path.exists(args.source_dir)) 8 | from subfix.solution.diarization import diarization_dir 9 | diarization_dir(args) 10 | pass 11 | 12 | 13 | def handle_punctuation(args): 14 | assert(os.path.exists(args.load_list)) 15 | from subfix.solution.punctuation_multi_lang import punctuation_multi_lang_process 16 | punctuation_multi_lang_process(args) 17 | pass 18 | 19 | 20 | def handle_format_convert(args): 21 | from .format import FormatBertvits2, FormatJson 22 | print(os.path.splitext(args.source)[1]) 23 | if os.path.splitext(args.source)[1] == '.list': 24 | source_format = FormatBertvits2() 25 | else: 26 | source_format = FormatJson() 27 | 28 | if os.path.splitext(args.target)[1] == '.list': 29 | target_format = FormatBertvits2() 30 | else: 31 | target_format = FormatJson() 32 | 33 | data = source_format.load(args.source) 34 | target_format.save(args.target, data) 35 | 36 | 37 | def handle_webui(args): 38 | from .webui import startwebui 39 | args.force_delete = (args.force_delete.upper() == "TRUE") 40 | startwebui(args) 41 | 42 | 43 | def handle_create(args): 44 | print(f"Create command with args: {args}") 45 | if args.solution == "modelscope": 46 | from .solution.modelscope_multi_lang import run_task 47 | run_task(args) 48 | elif args.solution == "whisper": 49 | from .solution.whisper_multi_lang import run_whisper_task 50 | run_whisper_task(args) 51 | 52 | 53 | def cli(): 54 | parser = argparse.ArgumentParser(description="a tool to check or create TTS dataset") 55 | subparsers = parser.add_subparsers(dest='command') 56 | 57 | # webui 58 | parser_webui = subparsers.add_parser('webui', 59
| help='webui to modify audios') 60 | parser_webui.add_argument('--load_json', default="None", help='source file, like demo.json') 61 | parser_webui.add_argument('--load_list', default="None", help='source file, like demo.list') 62 | parser_webui.add_argument('--json_key_text', default="text", type=str, help='the text key name in json, Default: text') 63 | parser_webui.add_argument('--json_key_path', default="wav_path", type=str, help='the path key name in json, Default: wav_path') 64 | parser_webui.add_argument('--g_batch', default=10, type=int, help='max number g_batch wav to display, Default: 10') 65 | parser_webui.add_argument('--webui_language', default="en", type=str, help='webui language: en or zh, Default: en') 66 | parser_webui.add_argument('--force_delete', default="True", type=str, help='delete file in disk while delete items, True or False, Default: True') 67 | parser_webui.set_defaults(func=handle_webui) 68 | 69 | 70 | # create 71 | parser_create = subparsers.add_parser('create', 72 | help='create dataset from origin audio directory: subfix create [modelscope|whisper]') 73 | create_subparsers = parser_create.add_subparsers(dest='solution', 74 | help='auto asr solution, modelscope or whisper') 75 | 76 | # create modelscope 77 | modelscope_subparsers = create_subparsers.add_parser('modelscope', 78 | help='modelscope models') 79 | modelscope_subparsers.add_argument("--source_dir", type=str, default="origin", help="Source directory path, Default: origin") 80 | modelscope_subparsers.add_argument("--target_dir", type=str, default="dataset", help="Target directory path, Default: dataset") 81 | modelscope_subparsers.add_argument("--cache_dir", type=str, default="cache", help="cache directory path, Default: cache") 82 | modelscope_subparsers.add_argument("--sample_rate", type=int, default=44100, help="Sample rate, Default: 44100") 83 | modelscope_subparsers.add_argument("--language", type=str, default="ZH", help="Language: ZH|JA|KO|EN|DE|RU, Default: ZH") 84 | modelscope_subparsers.add_argument("--output", type=str, default="demo.list", help="List file, Default: demo.list") 85 | modelscope_subparsers.add_argument("--max_seconds", type=int, default=15, help="Max sliced voice length(seconds), Default: 15") 86 | modelscope_subparsers.add_argument("--absolute_path", default="False", type=str, help='absolute_path True or False, Default: False') 87 | modelscope_subparsers.set_defaults(func=handle_create) 88 | 89 | # create whisper 90 | whisper_subparsers = create_subparsers.add_parser('whisper', 91 | help='whisper models') 92 | whisper_subparsers.add_argument("--source_dir", type=str, default="origin", help="Source directory path, Default: origin") 93 | whisper_subparsers.add_argument("--target_dir", type=str, default="dataset", help="Target directory path, Default: dataset") 94 | whisper_subparsers.add_argument("--cache_dir", type=str, default="cache", help="cache directory path, Default: cache") 95 | whisper_subparsers.add_argument("--model", type=str, default="large-v3", help="whisper model small/medium/large-v3, Default: large-v3") 96 | whisper_subparsers.add_argument("--sample_rate", type=int, default=44100, help="Sample rate, Default: 44100") 97 | whisper_subparsers.add_argument("--language", type=str, default="ZH", help="Any language whisper supports, Default: ZH") 98 | whisper_subparsers.add_argument("--output", type=str, default="demo.list", help="List file, Default: demo.list") 99 | whisper_subparsers.add_argument("--max_seconds", type=int, default=15, help="Max sliced voice length(seconds),
Default: 15") 100 | whisper_subparsers.add_argument("--absolute_path", default="False", type=str, help='absolute_path True or False, Default: False') 101 | whisper_subparsers.set_defaults(func=handle_create) 102 | 103 | # format_convert 104 | parser_format_convert = subparsers.add_parser('format_convert', 105 | help='format_convert: format_convert --source demo.json --target demo.list') 106 | parser_format_convert.add_argument('--source', default="demo.list", help='source file, like demo.json/list') 107 | parser_format_convert.add_argument('--target', default="demo.json", help='target file, like demo.list/json') 108 | parser_format_convert.set_defaults(func=handle_format_convert) 109 | 110 | # diarization 111 | parser_diarization = subparsers.add_parser('diarization', 112 | help='diarization: diarization -h') 113 | parser_diarization.add_argument('--source_dir', default="origin", help='source dir, Default: origin') 114 | parser_diarization.add_argument('--target_dir', default="diarization", help='target dir, Default: diarization') 115 | parser_diarization.add_argument('--cache_dir', default="cache", help='cache dir, Default: cache') 116 | parser_diarization.add_argument('--min_seconds', default=3.0, type=float, help='slice must bigger than min_seconds, Default: 3.0') 117 | parser_diarization.add_argument('--top_of_number', default=1, type=int, help='The n items with the highest frequency of occurrence. Default: 1') 118 | parser_diarization.add_argument('--interval', default=1.0, type=float, help='The interval between two slice audio. Default: 1.0') 119 | parser_diarization.add_argument("--sample_rate", type=int, default=44100, help="Sample rate, Default: 44100") 120 | parser_diarization.add_argument("--oracle_num", type=int, default=0, help="oracle number, the person number you think maybe in audio, Default: 0") 121 | parser_diarization.set_defaults(func=handle_diarization) 122 | 123 | # punctuation 124 | parser_punctuation = subparsers.add_parser('punctuation', 125 | help='punctuation: punctuation -h') 126 | parser_punctuation.add_argument('--load_list', default="demo.list", type=str, help='source file, like demo.list') 127 | parser_punctuation.set_defaults(func=handle_punctuation) 128 | 129 | # run 130 | args = parser.parse_args() 131 | 132 | if hasattr(args, 'func'): 133 | args.func(args) 134 | else: 135 | parser.print_help() 136 | 137 | -------------------------------------------------------------------------------- /subfix/format/FormatBertvits2.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | 5 | class FormatBertvits2(): 6 | 7 | def __init__(self) -> None: 8 | pass 9 | 10 | def load(self, path : str) -> List[dict]: 11 | # this format : {wav_path}|{speaker_name}|{language}|{text}" 12 | data = [] 13 | with open(path, 'r', encoding="utf-8") as source: 14 | read_list = source.readlines() 15 | for _ in read_list: 16 | items = _.split('|') 17 | if (len(items) == 4): 18 | wav_path, speaker_name, language, text= items 19 | data.append( 20 | { 21 | 'wav_path':wav_path, 22 | 'speaker_name':speaker_name, 23 | 'language':language, 24 | 'text':text.strip() 25 | } 26 | ) 27 | print(f"data has been load from {path}") 28 | return data 29 | 30 | def save(self, path : str, data : List[dict]): 31 | with open(path, 'w', encoding="utf-8") as target: 32 | for _ in data: 33 | wav_path = _['wav_path'] 34 | speaker_name = _['speaker_name'] 35 | language = _['language'] 36 | text = _['text'] 37 | 
target.write(f"{wav_path}|{speaker_name}|{language}|{text}\n") 38 | print(f"data has been save at {path}") -------------------------------------------------------------------------------- /subfix/format/FormatJson.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | 4 | 5 | class FormatJson(): 6 | 7 | def __init__(self) -> None: 8 | pass 9 | 10 | def load(self, path : str): 11 | with open(path, 'r', encoding="utf-8") as source: 12 | data_lines = source.readlines() 13 | data = [json.loads(line) for line in data_lines] 14 | print(f"data has been load from {path}") 15 | return data 16 | 17 | def save(self, path : str, data : List[dict]): 18 | with open(path, 'w', encoding="utf-8") as target: 19 | for item in data: 20 | line = json.dumps(item, ensure_ascii=False) 21 | target.write(line + '\n') 22 | print(f"data has been save at {path}") -------------------------------------------------------------------------------- /subfix/format/__init__.py: -------------------------------------------------------------------------------- 1 | from .FormatBertvits2 import FormatBertvits2 2 | from .FormatJson import FormatJson -------------------------------------------------------------------------------- /subfix/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/__init__.py -------------------------------------------------------------------------------- /subfix/models/audio/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/audio/__init__.py -------------------------------------------------------------------------------- /subfix/models/audio/asr/__init__.py: -------------------------------------------------------------------------------- 1 | from .speech_paraformer_large_vad_punc_asr_zh import Speech_Paraformer_Large_Vad_Punc_Asr_zh 2 | from .speech_uniasr_asr_multilang import Speech_UniASR_Asr_MultiLang 3 | from .openai_whisper import Openai_Whisper -------------------------------------------------------------------------------- /subfix/models/audio/asr/openai_whisper.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Any 3 | import librosa 4 | 5 | class Openai_Whisper(): 6 | def __init__(self, language : str, model_name : str = "large-v3") -> None: 7 | import whisper 8 | self.whisper_model = whisper.load_model(model_name, download_root = None) 9 | self.language = language 10 | 11 | def infer(self, audio_in) -> None: 12 | print("start asr:", audio_in) 13 | segments = self.whisper_model.transcribe(audio_in, word_timestamps=True, language = self.language)['segments'] 14 | data_list = [] 15 | for _ in segments: 16 | item = {} 17 | item['start'] = _['start'] 18 | item['end'] = _['end'] 19 | item['text'] = _['text'].strip() 20 | data_list.append(item) 21 | return data_list 22 | 23 | def __call__(self, *args: Any, **kwds: Any) -> Any: 24 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/asr/speech_paraformer_large_vad_punc_asr_zh.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | 4 | class 
Speech_Paraformer_Large_Vad_Punc_Asr_zh(): 5 | def __init__(self, language : str = "ZH") -> None: 6 | from funasr import AutoModel 7 | 8 | self._model = AutoModel(model="paraformer-zh", model_revision="v2.0.4", 9 | vad_model="fsmn-vad", vad_model_revision="v2.0.4", 10 | punc_model="ct-punc-c", punc_model_revision="v2.0.4", 11 | spk_model="cam++", spk_model_revision="v2.0.2", 12 | ) 13 | 14 | def infer(self, audio_in) -> None: 15 | rec_result = self._model.generate(input=audio_in, 16 | batch_size_s=300, 17 | hotword='') # dict_keys(['text', 'start', 'end', 'timestamp', 'spk']) 18 | data_list = [] 19 | for sentence in rec_result[0]['sentence_info']: 20 | if sentence['text'].strip() == "": 21 | continue 22 | item = {} 23 | item['start'] = sentence['timestamp'][0][0] / 1000.0 24 | item['end'] = sentence['end'] / 1000.0 25 | item['text'] = sentence['text'].strip() 26 | data_list.append(item) 27 | return data_list 28 | 29 | def __call__(self, *args: Any, **kwds: Any) -> Any: 30 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/asr/speech_uniasr_asr_multilang.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | import librosa 3 | 4 | class Speech_UniASR_Asr_MultiLang(): 5 | def __init__(self, language : str, max_seconds : float) -> None: 6 | self.set_asr_model_by_language(language) 7 | self.set_vad_model_by_language(language, max_seconds) 8 | 9 | def set_asr_model_by_language(self, language): 10 | 11 | model_config = { 12 | "KO" : 'damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline', 13 | "JA" : 'damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline', 14 | "EN" : 'damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline', 15 | "DE" : 'damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online', 16 | "RU" : 'damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline', 17 | "ZH" : 'iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', 18 | } 19 | 20 | model_config_revision = { 21 | "DE" : 'v1.0.1', 22 | "ZH" : 'v2.0.4' 23 | } 24 | 25 | assert( language in model_config.keys() ) 26 | 27 | from modelscope.pipelines import pipeline 28 | from modelscope.utils.constant import Tasks 29 | 30 | revision = model_config_revision[language] if (language in model_config_revision.keys()) else None 31 | 32 | self._asr_model = pipeline( task = Tasks.auto_speech_recognition, 33 | model = model_config[language], 34 | model_revision = revision ) 35 | 36 | def set_vad_model_by_language(self, language, max_seconds = 60.0): 37 | from subfix.models.audio.vad.speech_fsmn_vad_zh import Speech_Fsmn_Vad_Zh_16k_Common 38 | self._vad_model = Speech_Fsmn_Vad_Zh_16k_Common(max_seconds=max_seconds) 39 | 40 | def infer(self, audio_in) -> None: 41 | print("start asr:", audio_in) 42 | vad_list = self._vad_model(audio_in = audio_in) 43 | data_list = [] 44 | waveform, sample_rate = librosa.load(audio_in, sr=16000, mono=True) 45 | for _ in vad_list: 46 | start_time, end_time = _['start'], _['end'] 47 | start = int(start_time * sample_rate) 48 | end = int(end_time * sample_rate) 49 | slice_waveform = waveform[start: end] 50 | ret_asrmodl = self._asr_model(input = slice_waveform) 51 | if (len(ret_asrmodl) > 0): 52 | text = ret_asrmodl[0]['text'] 53 | print(text) 54 | if text.strip() == "": 55 | continue 56 | item = {} 57 | item['start'] = start_time 58 | item['end'] = end_time 59 | 
item['text'] = text.strip() 60 | data_list.append(item) 61 | return data_list 62 | 63 | def __call__(self, *args: Any, **kwds: Any) -> Any: 64 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/punctuation/__init__.py: -------------------------------------------------------------------------------- 1 | from .punctuation_funasr import Punctuation_FunASR -------------------------------------------------------------------------------- /subfix/models/audio/punctuation/punctuation_funasr.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | class Punctuation_FunASR(): 4 | def __init__(self) -> None: 5 | from funasr import AutoModel 6 | self._model = AutoModel(model="ct-punc", model_revision="v2.0.4") 7 | 8 | def infer(self, input): 9 | res = self._model.generate(input=input) 10 | if (len(res) > 0): 11 | text = res[0]['text'] 12 | return text 13 | else: 14 | return "" 15 | 16 | def __call__(self, *args: Any, **kwds: Any) -> Any: 17 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/speaker_diarization/Speech_Campplus_Speaker_Diarization.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Speech_Campplus_Speaker_Diarization(): 4 | def __init__(self) -> None: 5 | from modelscope.pipelines import pipeline 6 | self._pipeline = pipeline( 7 | task='speaker-diarization', 8 | model='damo/speech_campplus_speaker-diarization_common', 9 | model_revision='v1.0.0' 10 | ) 11 | 12 | def infer(self, input, min_seconds = 0, oracle_num = None, **args): 13 | result = self._pipeline(input, oracle_num = oracle_num, **args)['text'] 14 | count_dict = {} 15 | for item in result: 16 | if item[2] in count_dict: 17 | count_dict[item[2]] = count_dict[item[2]] + 1 18 | else: 19 | count_dict[item[2]] = 1 20 | numbers = list(reversed([[k, v] for k, v in sorted(count_dict.items(), key=lambda m : list(m)[1])])) 21 | topn = [i[0] for i in numbers] # person 22 | topn_number = [i[1] for i in numbers] # number 23 | res = [] 24 | for item in result: 25 | if item[1] - item[0] > min_seconds: 26 | res.append(item) 27 | return res, topn, topn_number -------------------------------------------------------------------------------- /subfix/models/audio/speaker_diarization/__init__.py: -------------------------------------------------------------------------------- 1 | from .Speech_Campplus_Speaker_Diarization import Speech_Campplus_Speaker_Diarization -------------------------------------------------------------------------------- /subfix/models/audio/vad/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/audio/vad/__init__.py -------------------------------------------------------------------------------- /subfix/models/audio/vad/speech_fsmn_vad_zh.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | class Speech_Fsmn_Vad_Zh_16k_Common(): 4 | def __init__(self, max_seconds : float = 60.0) -> None: 5 | from modelscope.pipelines import pipeline 6 | from modelscope.utils.constant import Tasks 7 | 8 | self._inference_pipeline = pipeline( 9 | task=Tasks.voice_activity_detection, 10 | model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch', 11 | 
model_revision=None, 12 | ) 13 | self.max_seconds = max_seconds 14 | self._tolerance = 1e-6 15 | 16 | def infer(self, audio_in) -> None: 17 | rec_result = self._inference_pipeline(audio_in)[0] 18 | # return [{start : seconds, end: seconds}] 19 | data = [] 20 | for item in rec_result['value']: 21 | start = item[0] / 1000.0 22 | end = item[1] / 1000.0 23 | duration = end - start 24 | if duration <= self.max_seconds: 25 | data.append({'start': start, 'end': end}) 26 | else: 27 | num_segments = int(duration / self.max_seconds) + (1 if duration % self.max_seconds > self._tolerance else 0) 28 | segment_length = duration / num_segments 29 | for i in range(num_segments): 30 | new_start = start + i * segment_length 31 | new_end = min(new_start + segment_length, end) 32 | data.append({'start': new_start, 'end': new_end}) 33 | return data 34 | 35 | def __call__(self, *args: Any, **kwds: Any) -> Any: 36 | return self.infer(*args, **kwds) -------------------------------------------------------------------------------- /subfix/models/audio/verification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/audio/verification/__init__.py -------------------------------------------------------------------------------- /subfix/models/nlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/nlp/__init__.py -------------------------------------------------------------------------------- /subfix/models/nlp/correction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/models/nlp/correction/__init__.py -------------------------------------------------------------------------------- /subfix/solution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cronrpc/SubFix/e4d152dbc7697c392e81226a9723429c412680f6/subfix/solution/__init__.py -------------------------------------------------------------------------------- /subfix/solution/diarization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from subfix.models.audio.speaker_diarization import Speech_Campplus_Speaker_Diarization 4 | from subfix.utils import convert_files, get_files_by_ext 5 | from subfix.utils.misc import merge_audio_vads 6 | 7 | def diarization_dir(args): 8 | 9 | 10 | source_dir = args.source_dir 11 | target_dir = args.target_dir 12 | cache_dir = args.cache_dir 13 | sample_rate = args.sample_rate 14 | min_seconds = args.min_seconds 15 | top_of_number = args.top_of_number 16 | interval = args.interval 17 | oracle_num = None if int(args.oracle_num) == 0 else int(args.oracle_num) 18 | 19 | dir_16000 = os.path.join(cache_dir,'subfix','origin','16000') 20 | dir_sample_rate = os.path.join(cache_dir,'subfix','origin',str(sample_rate)) 21 | 22 | if os.path.exists(dir_16000): 23 | shutil.rmtree(dir_16000) 24 | if os.path.exists(dir_sample_rate): 25 | shutil.rmtree(dir_sample_rate) 26 | 27 | convert_files(source_dir, dir_sample_rate, sample_rate) 28 | convert_files(dir_sample_rate, dir_16000, 16000) 29 | 30 | files = get_files_by_ext(dir_16000, [".wav"]) 31 | 32 | print("Start 
Speech_Campplus_Speaker_Diarization") 33 | 34 | SCSD = Speech_Campplus_Speaker_Diarization() 35 | 36 | for file_path in files: 37 | f_16000 = os.path.join(dir_16000, file_path) 38 | f_samplerate = os.path.join(dir_sample_rate, file_path) 39 | 40 | result, topn, topn_number = SCSD.infer(f_16000, min_seconds = min_seconds , oracle_num = oracle_num) 41 | topn = topn[:top_of_number] 42 | for person in topn: 43 | vad_list = [] 44 | save_path = os.path.join(target_dir, os.path.splitext(file_path)[0] + f"_{person}" + os.path.splitext(file_path)[1]) 45 | print("save:", save_path) 46 | for item in result: 47 | if item[2] == person: 48 | vad_list.append(item[:2]) 49 | if len(vad_list) > 0: 50 | merge_audio_vads(f_samplerate, save_path, vad_list, interval=interval) 51 | 52 | if os.path.exists(dir_16000): 53 | shutil.rmtree(dir_16000) 54 | if os.path.exists(dir_sample_rate): 55 | shutil.rmtree(dir_sample_rate) -------------------------------------------------------------------------------- /subfix/solution/modelscope_multi_lang.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import subprocess 5 | 6 | import librosa 7 | import numpy as np 8 | import soundfile 9 | 10 | from subfix.models.audio.asr import Speech_Paraformer_Large_Vad_Punc_Asr_zh , Speech_UniASR_Asr_MultiLang 11 | from subfix.utils import convert_files 12 | from subfix.utils.misc import merge_audio_slice, get_sub_dirs 13 | 14 | 15 | def create_dataset(source_dir, target_dir, sample_rate, language, infer_model, max_seconds, absolute_path : bool): 16 | # source_dir, target_dir, sample_rate=44100, language = "ZH", inference_pipeline = None 17 | 18 | roles = get_sub_dirs(source_dir) 19 | count = 0 20 | result = [] 21 | 22 | for speaker_name in roles: 23 | 24 | source_audios = [f for f in os.listdir(os.path.join(source_dir, speaker_name)) if f.endswith(".wav")] 25 | source_audios = [os.path.join(source_dir, speaker_name, filename) for filename in source_audios] 26 | slice_dir = os.path.join(target_dir, speaker_name) 27 | os.makedirs(slice_dir, exist_ok=True) 28 | 29 | for audio_path in sorted(source_audios): 30 | 31 | data_list = infer_model(audio_in=audio_path) 32 | 33 | data, count = merge_audio_slice(audio_path, slice_dir, data_list, count, sample_rate, max_seconds, language, speaker_name) 34 | 35 | for item_audio in data: 36 | if absolute_path: 37 | sliced_audio_path = os.path.abspath(item_audio['sliced_audio_path']) 38 | else: 39 | sliced_audio_path = item_audio['sliced_audio_path'] 40 | speaker_name = item_audio['speaker_name'] 41 | language = item_audio['language'] 42 | text = item_audio['text'] 43 | result.append(f"{sliced_audio_path}|{speaker_name}|{language}|{text}") 44 | 45 | return result 46 | 47 | 48 | def create_list(source_dir, target_dir, cache_dir, sample_rate, language, output_list, max_seconds, absolute_path : bool): 49 | 50 | resample_dir = os.path.join(cache_dir,"subfix","origin",f"{sample_rate}") 51 | 52 | convert_files(source_dir, resample_dir, sample_rate) 53 | 54 | if language == "ZH": 55 | asr_model = Speech_Paraformer_Large_Vad_Punc_Asr_zh() 56 | else: 57 | asr_model = Speech_UniASR_Asr_MultiLang(language=language, max_seconds=max_seconds) 58 | 59 | result = create_dataset(resample_dir, target_dir, sample_rate = sample_rate, language = language, infer_model = asr_model, max_seconds = max_seconds, absolute_path = absolute_path) 60 | 61 | with open(output_list, "w", encoding="utf-8") as file: 62 | for line in result: 63 | try: 64 | 
file.write(line.strip() + '\n') 65 | except UnicodeEncodeError: 66 | print("UnicodeEncodeError: Can't encode to ASCII:", line) 67 | 68 | 69 | def run_task(args): 70 | 71 | args.absolute_path = (args.absolute_path.upper() == "TRUE") 72 | 73 | create_list(args.source_dir, args.target_dir, args.cache_dir, args.sample_rate, args.language, args.output, args.max_seconds, args.absolute_path) 74 | 75 | -------------------------------------------------------------------------------- /subfix/solution/punctuation_multi_lang.py: -------------------------------------------------------------------------------- 1 | from subfix.format import FormatBertvits2 2 | from subfix.models.audio.punctuation import Punctuation_FunASR 3 | 4 | def punctuation_multi_lang_process(args): 5 | input_file = args.load_list 6 | souce_format = FormatBertvits2() 7 | data = souce_format.load(input_file) 8 | punc_fix = Punctuation_FunASR() 9 | for i in range(len(data)): 10 | print(i,'/',len(data),sep="") 11 | data[i]['text'] = punc_fix(data[i]['text']) 12 | data = souce_format.save(input_file, data) 13 | pass -------------------------------------------------------------------------------- /subfix/solution/whisper_multi_lang.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import re 4 | import subprocess 5 | 6 | import librosa 7 | import numpy as np 8 | import soundfile 9 | 10 | from subfix.models.audio.asr import Openai_Whisper 11 | from subfix.utils import convert_files 12 | from subfix.utils.misc import merge_audio_slice, get_sub_dirs 13 | 14 | 15 | def create_whisper_dataset(source_dir, target_dir, sample_rate, language, infer_model, max_seconds, absolute_path : bool): 16 | # source_dir, target_dir, sample_rate=44100, language = "ZH", inference_pipeline = None 17 | 18 | roles = get_sub_dirs(source_dir) 19 | count = 0 20 | result = [] 21 | 22 | for speaker_name in roles: 23 | 24 | source_audios = [f for f in os.listdir(os.path.join(source_dir, speaker_name)) if f.endswith(".wav")] 25 | source_audios = [os.path.join(source_dir, speaker_name, filename) for filename in source_audios] 26 | slice_dir = os.path.join(target_dir, speaker_name) 27 | os.makedirs(slice_dir, exist_ok=True) 28 | 29 | for audio_path in sorted(source_audios): 30 | 31 | data_list = infer_model(audio_in=audio_path) 32 | 33 | data, count = merge_audio_slice(audio_path, slice_dir, data_list, count, sample_rate, max_seconds, language, speaker_name) 34 | 35 | for item_audio in data: 36 | if absolute_path: 37 | sliced_audio_path = os.path.abspath(item_audio['sliced_audio_path']) 38 | else: 39 | sliced_audio_path = item_audio['sliced_audio_path'] 40 | speaker_name = item_audio['speaker_name'] 41 | language = item_audio['language'] 42 | text = item_audio['text'] 43 | result.append(f"{sliced_audio_path}|{speaker_name}|{language}|{text}") 44 | 45 | return result 46 | 47 | 48 | def create_whisper_list(source_dir, target_dir, cache_dir, sample_rate, language, output_list, max_seconds, model_name, absolute_path : bool): 49 | 50 | resample_dir = os.path.join(cache_dir,"subfix","origin",f"{sample_rate}") 51 | 52 | convert_files(source_dir, resample_dir, sample_rate) 53 | 54 | lang_map = { 55 | "ZH" : "Chinese", 56 | "EN" : "English", 57 | "JA" : "Japanese", 58 | "RU" : "ru", 59 | "DE" : "de", 60 | "KO" : "ko" 61 | } 62 | 63 | language_map = lang_map[language] if (language in lang_map.keys()) else language 64 | 65 | asr_model = Openai_Whisper(language = language_map, model_name = model_name) 66 | 67 | 
result = create_whisper_dataset(resample_dir, target_dir, sample_rate = sample_rate, language = language, infer_model = asr_model, max_seconds = max_seconds, absolute_path = absolute_path) 68 | 69 | with open(output_list, "w", encoding="utf-8") as file: 70 | for line in result: 71 | try: 72 | file.write(line.strip() + '\n') 73 | except UnicodeEncodeError: 74 | print("UnicodeEncodeError: Can't encode to ASCII:", line) 75 | 76 | 77 | def run_whisper_task(args): 78 | 79 | args.absolute_path = (args.absolute_path.upper() == "TRUE") 80 | 81 | create_whisper_list(args.source_dir, args.target_dir, args.cache_dir, args.sample_rate, args.language, args.output, args.max_seconds, args.model, args.absolute_path) 82 | 83 | -------------------------------------------------------------------------------- /subfix/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .convert import convert_wav_ffmpeg, convert_wav_librosa, convert_files 2 | from .ext_files import get_files_by_ext -------------------------------------------------------------------------------- /subfix/utils/convert.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import os 3 | import soundfile 4 | import subprocess 5 | from concurrent.futures import ThreadPoolExecutor 6 | from .ext_files import get_files_by_ext 7 | 8 | 9 | 10 | def ffmpeg_installed(): 11 | 12 | try: 13 | subprocess.run(["ffmpeg", "-version"], 14 | capture_output=True, 15 | check=True) 16 | print("find ffmpeg installed, use ffmpeg") 17 | return True 18 | except Exception as e: 19 | print("ffmpeg not found, use librosa") 20 | return False 21 | 22 | 23 | def convert_wav_ffmpeg(source_file : str, 24 | target_file : str, 25 | sample_rate : int, 26 | number : int): 27 | 28 | os.makedirs(os.path.dirname(target_file), exist_ok=True) 29 | 30 | print(f"file {number} start convert") 31 | 32 | cmd = ["ffmpeg", "-y", "-i", source_file, "-ar", f"{sample_rate}", "-ac", "1", "-v", "quiet", target_file] 33 | 34 | subprocess.run(cmd) 35 | 36 | 37 | def convert_wav_librosa(source_file : str, 38 | target_file : str, 39 | sample_rate : int, 40 | number : int): 41 | 42 | os.makedirs(os.path.dirname(target_file), exist_ok=True) 43 | 44 | print(f"file {number} start convert") 45 | 46 | data, sample_rate = librosa.load(source_file, 47 | sr=sample_rate, 48 | mono=True) 49 | 50 | soundfile.write(target_file, data, sample_rate) 51 | 52 | 53 | def convert_files(source_dir : str, 54 | target_dir : str, 55 | sample_rate : int, 56 | max_threads = None, 57 | force_librosa = False): 58 | 59 | if max_threads == None: 60 | max_threads = os.cpu_count() 61 | 62 | ext_files = get_files_by_ext(source_dir, [".mp3","acc","wav"]) 63 | 64 | ffmpeg_installed_flag = (not force_librosa) and ffmpeg_installed() 65 | 66 | os.makedirs(target_dir, exist_ok=True) 67 | 68 | with ThreadPoolExecutor(max_workers=max_threads) as executor: 69 | print(f"files count: {len(ext_files)}") 70 | print(f"max_threads = {max_threads}") 71 | for number, file in enumerate(ext_files, start=1): 72 | source_path = os.path.join(source_dir, file) 73 | target_path = os.path.join(target_dir, os.path.splitext(file)[0] + '.wav') 74 | os.makedirs(os.path.dirname(target_path), exist_ok=True) 75 | 76 | if not os.path.exists(target_path): 77 | if ffmpeg_installed_flag: 78 | executor.submit(convert_wav_ffmpeg, 79 | source_path, 80 | target_path, 81 | sample_rate, 82 | number) 83 | else: 84 | executor.submit(convert_wav_librosa, 85 | source_path, 
86 | target_path, 87 | sample_rate, 88 | number) -------------------------------------------------------------------------------- /subfix/utils/ext_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Union, List 3 | 4 | def get_files_by_ext(directory: str, 5 | media_extensions: Union[str, List[str]] 6 | )-> List[str]: 7 | 8 | if isinstance(media_extensions, str): 9 | media_extensions = [media_extensions] 10 | 11 | relative_paths = [] 12 | 13 | for root, dirs, files in os.walk(directory): 14 | for file in files: 15 | if any(file.endswith(ext) for ext in media_extensions): 16 | relative_path = os.path.relpath(os.path.join(root, file), 17 | directory) 18 | relative_paths.append(relative_path) 19 | relative_paths = sorted(relative_paths) 20 | return relative_paths -------------------------------------------------------------------------------- /subfix/utils/misc.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | from typing import List, Union 5 | import librosa 6 | import soundfile 7 | import numpy as np 8 | 9 | def save_json(path : str, data : Union[List[dict], dict]): 10 | with open(path, 'w', encoding="utf-8") as target: 11 | json.dump(data, path, ensure_ascii=False) 12 | 13 | 14 | def load_json(path : str): 15 | with open(path, 'r', encoding="utf-8") as source: 16 | data = json.load(source) 17 | return data 18 | 19 | 20 | def merge_audio_vads(source_path ,save_path, vad_list : List[List], interval = 1, sample_rate = None): 21 | data, sample_rate = librosa.load(source_path, sr=sample_rate, mono=True) 22 | audio_list = [] 23 | for i, _ in enumerate(vad_list): 24 | time_start = _[0] 25 | time_end = _[1] 26 | start = int((time_start) * sample_rate) 27 | end = int((time_end) * sample_rate) 28 | if (i > 0): 29 | silence = np.zeros(int(sample_rate * interval)) 30 | audio_list.append(silence) 31 | audio_list.append(data[start:end]) 32 | audio_concat = np.concatenate(audio_list) 33 | os.makedirs(os.path.split(save_path)[0], exist_ok=True) 34 | soundfile.write(save_path, audio_concat, sample_rate) 35 | 36 | 37 | def get_sub_dirs(source_dir): 38 | sub_dir = [f for f in os.listdir(source_dir) if not f.startswith('.')] 39 | sub_dir = [f for f in sub_dir if os.path.isdir(os.path.join(source_dir, f))] 40 | return sub_dir 41 | 42 | 43 | def ends_with_ending_sentence(sentence): 44 | if re.search(r'[。?!…]$', sentence): 45 | return True 46 | return False 47 | 48 | 49 | def ends_with_punctuation(sentence): 50 | pattern = r'[.,!?。,!?、・\uff00-\uffef\u3000-\u303f\u3040-\u309f\u30a0-\u30ff]$' 51 | return re.search(pattern, sentence) 52 | 53 | 54 | def merge_audio_slice(source_audio, slice_dir, data_list, start_count, sample_rate, max_seconds, language, speaker_name) -> List: 55 | # input : datalist = [{'start': seconds, 'end': seconds, 'text': text}] 56 | # return : [{'sliced_audio_path', 'speaker_name', 'language', 'text'}] , count_next 57 | sentence_list = [] 58 | audio_list = [] 59 | time_length = 0 60 | count = start_count 61 | result = [] 62 | 63 | data, sample_rate = librosa.load(source_audio, sr=sample_rate, mono=True) 64 | for sentence in data_list: 65 | text = sentence['text'].strip() 66 | if (text == ""): 67 | continue 68 | start = int((sentence['start']) * sample_rate) 69 | end = int((sentence['end']) * sample_rate) 70 | 71 | if time_length > 0 and time_length + (sentence['end'] - sentence['start']) > max_seconds: 72 | sliced_audio_name = 
f"{str(count).zfill(6)}" 73 | sliced_audio_path = os.path.join(slice_dir, sliced_audio_name+".wav") 74 | s_sentence = "".join(sentence_list) 75 | 76 | if language == "ZH" and re.search(r"[,]$", s_sentence): 77 | s_sentence = s_sentence[:-1] + '。' 78 | if language == "ZH" and not ends_with_punctuation(s_sentence): 79 | s_sentence = s_sentence 80 | 81 | audio_concat = np.concatenate(audio_list) 82 | if time_length > max_seconds: 83 | print(f"[too long voice]:{sliced_audio_path}, voice_length:{time_length} seconds") 84 | soundfile.write(sliced_audio_path, audio_concat, sample_rate) 85 | result.append( 86 | { 87 | 'sliced_audio_path' : sliced_audio_path, 88 | 'speaker_name' : speaker_name, 89 | 'language' : language, 90 | 'text' : s_sentence 91 | } 92 | ) 93 | sentence_list = [] 94 | audio_list = [] 95 | time_length = 0 96 | count = count + 1 97 | 98 | sentence_list.append(text) 99 | audio_list.append(data[start:end]) 100 | time_length = time_length + (sentence['end'] - sentence['start']) 101 | 102 | if ( ends_with_ending_sentence(text) ): 103 | sliced_audio_name = f"{str(count).zfill(6)}" 104 | sliced_audio_path = os.path.join(slice_dir, sliced_audio_name+".wav") 105 | s_sentence = "".join(sentence_list) 106 | audio_concat = np.concatenate(audio_list) 107 | soundfile.write(sliced_audio_path, audio_concat, sample_rate) 108 | 109 | result.append( 110 | { 111 | 'sliced_audio_path' : sliced_audio_path, 112 | 'speaker_name' : speaker_name, 113 | 'language' : language, 114 | 'text' : s_sentence 115 | } 116 | ) 117 | sentence_list = [] 118 | audio_list = [] 119 | time_length = 0 120 | count = count + 1 121 | return result, count -------------------------------------------------------------------------------- /subfix/webui/__init__.py: -------------------------------------------------------------------------------- 1 | from .webui import startwebui -------------------------------------------------------------------------------- /subfix/webui/language.py: -------------------------------------------------------------------------------- 1 | 2 | LANG_CONFIG_MAP = { 3 | "zh": { 4 | "Change Index" : "改变索引", 5 | "Submit Text" : "保存文本", 6 | "Merge Audio" : "合并音频", 7 | "Delete Audio" : "删除音频", 8 | "Previous Index" : "前一页", 9 | "Next Index" : "后一页", 10 | "Light Theme" : "亮色模式", 11 | "Dark Theme" : "黑暗模式", 12 | "Choose Audio" : "选择音频", 13 | "Output Audio" : "Output Audio", 14 | "Text" : "文本", 15 | "Invert Selection": "反选", 16 | "Save File" : "保存文件", 17 | "Split Audio" : "分割音频", 18 | "Audio Split Point(s)" : "音频分割点(单位:秒)", 19 | "Index":"索引", 20 | "Interval":"合并间隔(单位:秒)" 21 | }, 22 | } 23 | 24 | 25 | class TextLanguage(): 26 | def __init__(self, language : str = "en") -> None: 27 | if language in LANG_CONFIG_MAP.keys(): 28 | self.language = language 29 | else: 30 | self.language = "en" 31 | pass 32 | 33 | def get_text(self, text : str) -> str: 34 | if self.language == "en": 35 | return text 36 | elif text in LANG_CONFIG_MAP[self.language].keys() : 37 | return LANG_CONFIG_MAP[self.language][text] 38 | else: 39 | return text 40 | 41 | def __call__(self, text : str) -> str: 42 | return self.get_text(text) -------------------------------------------------------------------------------- /subfix/webui/webui.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import json 4 | import os 5 | import uuid 6 | 7 | import librosa 8 | import gradio as gr 9 | import numpy as np 10 | import soundfile 11 | 12 | from .language import TextLanguage 13 | 14 | 
g_json_key_text = "" 15 | g_json_key_path = "" 16 | g_load_file = "" 17 | g_load_format = "" 18 | 19 | g_max_json_index = 0 20 | g_index = 0 21 | g_batch = 10 22 | g_text_list = [] 23 | g_audio_list = [] 24 | g_checkbox_list = [] 25 | g_data_json = [] 26 | g_language = None 27 | 28 | 29 | def reload_data(index, batch): 30 | global g_index 31 | g_index = index 32 | global g_batch 33 | g_batch = batch 34 | datas = g_data_json[index:index+batch] 35 | output = [] 36 | for d in datas: 37 | output.append( 38 | { 39 | g_json_key_text: d[g_json_key_text], 40 | g_json_key_path: d[g_json_key_path] 41 | } 42 | ) 43 | return output 44 | 45 | 46 | def b_change_index(index, batch): 47 | global g_index, g_batch 48 | g_index, g_batch = index, batch 49 | datas = reload_data(index, batch) 50 | output = [] 51 | for i , _ in enumerate(datas): 52 | output.append( 53 | gr.Textbox( 54 | label=f"Text {i+index}", 55 | value=_[g_json_key_text] 56 | ) 57 | ) 58 | for _ in range(g_batch - len(datas)): 59 | output.append( 60 | gr.Textbox( 61 | label=f"Text", 62 | value="" 63 | ) 64 | ) 65 | for _ in datas: 66 | output.append(_[g_json_key_path]) 67 | for _ in range(g_batch - len(datas)): 68 | output.append(None) 69 | for _ in range(g_batch): 70 | output.append(False) 71 | return output 72 | 73 | 74 | def b_next_index(index, batch): 75 | if (index + batch) <= g_max_json_index: 76 | return index + batch , *b_change_index(index + batch, batch) 77 | else: 78 | return index, *b_change_index(index, batch) 79 | 80 | 81 | def b_previous_index(index, batch): 82 | if (index - batch) >= 0: 83 | return index - batch , *b_change_index(index - batch, batch) 84 | else: 85 | return 0, *b_change_index(0, batch) 86 | 87 | 88 | def b_submit_change(*text_list): 89 | global g_data_json 90 | change = False 91 | for i, new_text in enumerate(text_list): 92 | if g_index + i <= g_max_json_index: 93 | new_text = new_text.strip()+' ' 94 | if (g_data_json[g_index + i][g_json_key_text] != new_text): 95 | g_data_json[g_index + i][g_json_key_text] = new_text 96 | change = True 97 | if change: 98 | b_save_file() 99 | return g_index, *b_change_index(g_index, g_batch) 100 | 101 | 102 | def b_delete_audio(*checkbox_list): 103 | global g_data_json, g_index, g_max_json_index 104 | change = False 105 | for i, checkbox in reversed(list(enumerate(checkbox_list))): 106 | if g_index + i < len(g_data_json): 107 | if (checkbox == True): 108 | if g_force_delete: 109 | print("remove",g_data_json[g_index + i][g_json_key_path]) 110 | os.remove(g_data_json[g_index + i][g_json_key_path]) 111 | g_data_json.pop(g_index + i) 112 | change = True 113 | 114 | g_max_json_index = len(g_data_json)-1 115 | if g_index > g_max_json_index: 116 | g_index = g_max_json_index 117 | g_index = g_index if g_index >= 0 else 0 118 | if g_force_delete and change: 119 | b_save_file() 120 | return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch) 121 | 122 | 123 | def b_invert_selection(*checkbox_list): 124 | new_list = [not item if item is True else True for item in checkbox_list] 125 | return new_list 126 | 127 | 128 | def get_next_path(filename): 129 | base_dir = os.path.dirname(filename) 130 | base_name = os.path.splitext(os.path.basename(filename))[0] 131 | for i in range(100): 132 | new_path = os.path.join(base_dir, f"{base_name}_{str(i).zfill(2)}.wav") 133 | if not os.path.exists(new_path) : 134 | return new_path 135 | return os.path.join(base_dir, f'{str(uuid.uuid4())}.wav') 136 | 137 | 138 | def 
b_audio_split(audio_breakpoint, *checkbox_list): 139 | global g_data_json , g_max_json_index 140 | checked_index = [] 141 | for i, checkbox in enumerate(checkbox_list): 142 | if (checkbox == True and g_index+i < len(g_data_json)): 143 | checked_index.append(g_index + i) 144 | if len(checked_index) == 1 : 145 | index = checked_index[0] 146 | audio_json = copy.deepcopy(g_data_json[index]) 147 | path = audio_json[g_json_key_path] 148 | data, sample_rate = librosa.load(path, sr=None, mono=True) 149 | audio_maxframe = len(data) 150 | break_frame = int(audio_breakpoint * sample_rate) 151 | 152 | if (break_frame >= 1 and break_frame < audio_maxframe): 153 | audio_first = data[0:break_frame] 154 | audio_second = data[break_frame:] 155 | nextpath = get_next_path(path) 156 | soundfile.write(nextpath, audio_second, sample_rate) 157 | soundfile.write(path, audio_first, sample_rate) 158 | g_data_json.insert(index + 1, audio_json) 159 | g_data_json[index + 1][g_json_key_path] = nextpath 160 | b_save_file() 161 | 162 | g_max_json_index = len(g_data_json) - 1 163 | return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) 164 | 165 | def b_merge_audio(interval_r, *checkbox_list): 166 | global g_data_json , g_max_json_index 167 | checked_index = [] 168 | audios_path = [] 169 | audios_text = [] 170 | delete_files = [] 171 | for i, checkbox in enumerate(checkbox_list): 172 | if (checkbox == True and g_index+i < len(g_data_json)): 173 | checked_index.append(g_index + i) 174 | 175 | if (len(checked_index)>1): 176 | for i in checked_index: 177 | audios_path.append(g_data_json[i][g_json_key_path]) 178 | audios_text.append(g_data_json[i][g_json_key_text]) 179 | for i in reversed(checked_index[1:]): 180 | delete_files.append(g_data_json[i][g_json_key_path]) 181 | g_data_json.pop(i) 182 | 183 | base_index = checked_index[0] 184 | base_path = audios_path[0] 185 | g_data_json[base_index][g_json_key_text] = "".join(audios_text) 186 | 187 | audio_list = [] 188 | l_sample_rate = None 189 | for i, path in enumerate(audios_path): 190 | data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True) 191 | l_sample_rate = sample_rate 192 | if (i > 0): 193 | silence = np.zeros(int(l_sample_rate * interval_r)) 194 | audio_list.append(silence) 195 | 196 | audio_list.append(data) 197 | 198 | audio_concat = np.concatenate(audio_list) 199 | 200 | for item_file in delete_files: 201 | os.remove(item_file) 202 | 203 | soundfile.write(base_path, audio_concat, l_sample_rate) 204 | 205 | b_save_file() 206 | 207 | g_max_json_index = len(g_data_json) - 1 208 | 209 | return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) 210 | 211 | 212 | def b_save_json(): 213 | with open(g_load_file,'w', encoding="utf-8") as file: 214 | for data in g_data_json: 215 | file.write(f'{json.dumps(data, ensure_ascii = False)}\n') 216 | 217 | 218 | def b_save_list(): 219 | with open(g_load_file,'w', encoding="utf-8") as file: 220 | for data in g_data_json: 221 | wav_path = data["wav_path"] 222 | speaker_name = data["speaker_name"] 223 | language = data["language"] 224 | text = data["text"] 225 | file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip()+'\n') 226 | 227 | 228 | def b_load_json(): 229 | global g_data_json, g_max_json_index 230 | with open(g_load_file, 'r', encoding="utf-8") as file: 231 | g_data_json = file.readlines() 232 | g_data_json = [json.loads(line) for line in g_data_json] 233 | g_max_json_index = len(g_data_json) - 1 234 | 235 | 236 | def b_load_list(): 
237 | global g_data_json, g_max_json_index 238 | with open(g_load_file, 'r', encoding="utf-8") as source: 239 | data_list = source.readlines() 240 | for _ in data_list: 241 | data = _.split('|') 242 | if (len(data) == 4): 243 | wav_path, speaker_name, language, text = data 244 | g_data_json.append( 245 | { 246 | 'wav_path':wav_path, 247 | 'speaker_name':speaker_name, 248 | 'language':language, 249 | 'text':text.strip() 250 | } 251 | ) 252 | else: 253 | print("error line:", data) 254 | g_max_json_index = len(g_data_json) - 1 255 | 256 | 257 | def b_save_file(): 258 | if g_load_format == "json": 259 | b_save_json() 260 | elif g_load_format == "list": 261 | b_save_list() 262 | 263 | 264 | def b_load_file(): 265 | if g_load_format == "json": 266 | b_load_json() 267 | elif g_load_format == "list": 268 | b_load_list() 269 | 270 | 271 | def set_global(load_json, load_list, json_key_text, json_key_path, batch, webui_language, force_delete): 272 | global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch, g_language, g_force_delete 273 | 274 | g_batch = int(batch) 275 | 276 | if (load_json != "None"): 277 | g_load_format = "json" 278 | g_load_file = load_json 279 | elif (load_list != "None"): 280 | g_load_format = "list" 281 | g_load_file = load_list 282 | else: 283 | g_load_format = "list" 284 | g_load_file = "demo.list" 285 | 286 | g_json_key_text = json_key_text 287 | g_json_key_path = json_key_path 288 | g_language = TextLanguage(webui_language) 289 | g_force_delete = force_delete 290 | 291 | b_load_file() 292 | 293 | 294 | def startwebui(args): 295 | 296 | set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch, args.webui_language, args.force_delete) 297 | 298 | with gr.Blocks() as demo: 299 | 300 | with gr.Row(): 301 | btn_change_index = gr.Button(g_language("Change Index")) 302 | btn_submit_change = gr.Button(g_language("Submit Text")) 303 | btn_merge_audio = gr.Button(g_language("Merge Audio")) 304 | btn_delete_audio = gr.Button(g_language("Delete Audio")) 305 | btn_previous_index = gr.Button(g_language("Previous Index")) 306 | btn_next_index = gr.Button(g_language("Next Index")) 307 | 308 | with gr.Row(): 309 | index_slider = gr.Slider( 310 | minimum=0, maximum=g_max_json_index, value=g_index, step=1, label=g_language("Index"), scale=3 311 | ) 312 | splitpoint_slider = gr.Slider( 313 | minimum=0, maximum=120.0, value=0, step=0.1, label=g_language("Audio Split Point(s)"), scale=3 314 | ) 315 | btn_audio_split = gr.Button(g_language("Split Audio"), scale=1) 316 | btn_save_json = gr.Button(g_language("Save File"), visible=True, scale=1) 317 | btn_invert_selection = gr.Button(g_language("Invert Selection"), scale=1) 318 | 319 | with gr.Row(): 320 | with gr.Column(): 321 | for _ in range(0,g_batch): 322 | with gr.Row(): 323 | text = gr.Textbox( 324 | label = "Text", 325 | visible = True, 326 | scale=5 327 | ) 328 | audio_output = gr.Audio( 329 | label= g_language("Output Audio"), 330 | visible = True, 331 | scale=5 332 | ) 333 | audio_check = gr.Checkbox( 334 | label="Yes", 335 | show_label = True, 336 | info = g_language("Choose Audio"), 337 | scale=1 338 | ) 339 | g_text_list.append(text) 340 | g_audio_list.append(audio_output) 341 | g_checkbox_list.append(audio_check) 342 | 343 | 344 | 345 | with gr.Row(): 346 | batchsize_slider = gr.Slider( 347 | minimum=1, maximum=g_batch, value=g_batch, step=1, label=g_language("Batch Size"), scale=3, interactive=False 348 | ) 349 | interval_slider = gr.Slider( 350 | minimum=0, maximum=2, 
value=0, step=0.01, label=g_language("Interval"), scale=3 351 | ) 352 | btn_theme_dark = gr.Button(g_language("Light Theme"), link="?__theme=light", scale=1) 353 | btn_theme_light = gr.Button(g_language("Dark Theme"), link="?__theme=dark", scale=1) 354 | 355 | btn_change_index.click( 356 | b_change_index, 357 | inputs=[ 358 | index_slider, 359 | batchsize_slider, 360 | ], 361 | outputs=[ 362 | *g_text_list, 363 | *g_audio_list, 364 | *g_checkbox_list 365 | ], 366 | ) 367 | 368 | 369 | btn_submit_change.click( 370 | b_submit_change, 371 | inputs=[ 372 | *g_text_list, 373 | ], 374 | outputs=[ 375 | index_slider, 376 | *g_text_list, 377 | *g_audio_list, 378 | *g_checkbox_list 379 | ], 380 | ) 381 | 382 | btn_previous_index.click( 383 | b_previous_index, 384 | inputs=[ 385 | index_slider, 386 | batchsize_slider, 387 | ], 388 | outputs=[ 389 | index_slider, 390 | *g_text_list, 391 | *g_audio_list, 392 | *g_checkbox_list 393 | ], 394 | ) 395 | 396 | btn_next_index.click( 397 | b_next_index, 398 | inputs=[ 399 | index_slider, 400 | batchsize_slider, 401 | ], 402 | outputs=[ 403 | index_slider, 404 | *g_text_list, 405 | *g_audio_list, 406 | *g_checkbox_list 407 | ], 408 | ) 409 | 410 | btn_delete_audio.click( 411 | b_delete_audio, 412 | inputs=[ 413 | *g_checkbox_list 414 | ], 415 | outputs=[ 416 | index_slider, 417 | *g_text_list, 418 | *g_audio_list, 419 | *g_checkbox_list 420 | ] 421 | ) 422 | 423 | btn_merge_audio.click( 424 | b_merge_audio, 425 | inputs=[ 426 | interval_slider, 427 | *g_checkbox_list 428 | ], 429 | outputs=[ 430 | index_slider, 431 | *g_text_list, 432 | *g_audio_list, 433 | *g_checkbox_list 434 | ] 435 | ) 436 | 437 | btn_audio_split.click( 438 | b_audio_split, 439 | inputs=[ 440 | splitpoint_slider, 441 | *g_checkbox_list 442 | ], 443 | outputs=[ 444 | index_slider, 445 | *g_text_list, 446 | *g_audio_list, 447 | *g_checkbox_list 448 | ] 449 | ) 450 | 451 | btn_invert_selection.click( 452 | b_invert_selection, 453 | inputs=[ 454 | *g_checkbox_list 455 | ], 456 | outputs=[ 457 | *g_checkbox_list 458 | ] 459 | ) 460 | 461 | btn_save_json.click( 462 | b_save_file 463 | ) 464 | 465 | demo.load( 466 | b_change_index, 467 | inputs=[ 468 | index_slider, 469 | batchsize_slider, 470 | ], 471 | outputs=[ 472 | *g_text_list, 473 | *g_audio_list, 474 | *g_checkbox_list 475 | ], 476 | ) 477 | 478 | demo.launch() -------------------------------------------------------------------------------- /subfix_webui.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import copy 3 | import json 4 | import os 5 | import uuid 6 | 7 | import librosa 8 | import gradio as gr 9 | import numpy as np 10 | import soundfile 11 | 12 | 13 | """ 14 | Apache License 15 | Version 2.0, January 2004 16 | https://www.apache.org/licenses/LICENSE-2.0 17 | 18 | SubFix 19 | cronrpc 20 | https://github.com/cronrpc/SubFix 21 | """ 22 | 23 | 24 | g_json_key_text = "" 25 | g_json_key_path = "" 26 | g_load_file = "" 27 | g_load_format = "" 28 | 29 | g_max_json_index = 0 30 | g_index = 0 31 | g_batch = 10 32 | g_text_list = [] 33 | g_audio_list = [] 34 | g_checkbox_list = [] 35 | g_data_json = [] 36 | g_language = None 37 | 38 | 39 | SUBFIX_LANG_CONFIG_MAP = { 40 | "zh": { 41 | "Change Index" : "改变索引", 42 | "Submit Text" : "保存文本", 43 | "Merge Audio" : "合并音频", 44 | "Delete Audio" : "删除音频", 45 | "Previous Index" : "前一页", 46 | "Next Index" : "后一页", 47 | "Light Theme" : "亮色模式", 48 | "Dark Theme" : "黑暗模式", 49 | "Choose Audio" : "选择音频", 50 | "Output Audio" : "Output Audio", 51 
| "Text" : "文本", 52 | "Invert Selection": "反选", 53 | "Save File" : "保存文件", 54 | "Split Audio" : "分割音频", 55 | "Audio Split Point(s)" : "音频分割点(单位:秒)", 56 | "Index":"索引", 57 | "Interval":"合并间隔(单位:秒)" 58 | }, 59 | } 60 | 61 | 62 | class SUBFIX_TextLanguage(): 63 | def __init__(self, language : str = "en") -> None: 64 | if language in SUBFIX_LANG_CONFIG_MAP.keys(): 65 | self.language = language 66 | else: 67 | self.language = "en" 68 | pass 69 | 70 | def get_text(self, text : str) -> str: 71 | if self.language == "en": 72 | return text 73 | elif text in SUBFIX_LANG_CONFIG_MAP[self.language].keys() : 74 | return SUBFIX_LANG_CONFIG_MAP[self.language][text] 75 | else: 76 | return text 77 | 78 | def __call__(self, text : str) -> str: 79 | return self.get_text(text) 80 | 81 | 82 | def reload_data(index, batch): 83 | global g_index 84 | g_index = index 85 | global g_batch 86 | g_batch = batch 87 | datas = g_data_json[index:index+batch] 88 | output = [] 89 | for d in datas: 90 | output.append( 91 | { 92 | g_json_key_text: d[g_json_key_text], 93 | g_json_key_path: d[g_json_key_path] 94 | } 95 | ) 96 | return output 97 | 98 | 99 | def b_change_index(index, batch): 100 | global g_index, g_batch 101 | g_index, g_batch = index, batch 102 | datas = reload_data(index, batch) 103 | output = [] 104 | for i , _ in enumerate(datas): 105 | output.append( 106 | gr.Textbox( 107 | label=f"Text {i+index}", 108 | value=_[g_json_key_text] 109 | ) 110 | ) 111 | for _ in range(g_batch - len(datas)): 112 | output.append( 113 | gr.Textbox( 114 | label=f"Text", 115 | value="" 116 | ) 117 | ) 118 | for _ in datas: 119 | output.append(_[g_json_key_path]) 120 | for _ in range(g_batch - len(datas)): 121 | output.append(None) 122 | for _ in range(g_batch): 123 | output.append(False) 124 | return output 125 | 126 | 127 | def b_next_index(index, batch): 128 | if (index + batch) <= g_max_json_index: 129 | return index + batch , *b_change_index(index + batch, batch) 130 | else: 131 | return index, *b_change_index(index, batch) 132 | 133 | 134 | def b_previous_index(index, batch): 135 | if (index - batch) >= 0: 136 | return index - batch , *b_change_index(index - batch, batch) 137 | else: 138 | return 0, *b_change_index(0, batch) 139 | 140 | 141 | def b_submit_change(*text_list): 142 | global g_data_json 143 | change = False 144 | for i, new_text in enumerate(text_list): 145 | if g_index + i <= g_max_json_index: 146 | new_text = new_text.strip()+' ' 147 | if (g_data_json[g_index + i][g_json_key_text] != new_text): 148 | g_data_json[g_index + i][g_json_key_text] = new_text 149 | change = True 150 | if change: 151 | b_save_file() 152 | return g_index, *b_change_index(g_index, g_batch) 153 | 154 | 155 | def b_delete_audio(*checkbox_list): 156 | global g_data_json, g_index, g_max_json_index 157 | change = False 158 | for i, checkbox in reversed(list(enumerate(checkbox_list))): 159 | if g_index + i < len(g_data_json): 160 | if (checkbox == True): 161 | if g_force_delete: 162 | print("remove",g_data_json[g_index + i][g_json_key_path]) 163 | os.remove(g_data_json[g_index + i][g_json_key_path]) 164 | g_data_json.pop(g_index + i) 165 | change = True 166 | 167 | g_max_json_index = len(g_data_json)-1 168 | if g_index > g_max_json_index: 169 | g_index = g_max_json_index 170 | g_index = g_index if g_index >= 0 else 0 171 | if g_force_delete and change: 172 | b_save_file() 173 | return gr.Slider(value=g_index, maximum=(g_max_json_index if g_max_json_index>=0 else 0)), *b_change_index(g_index, g_batch) 174 | 175 | 176 | def 
b_invert_selection(*checkbox_list): 177 | new_list = [not item if item is True else True for item in checkbox_list] 178 | return new_list 179 | 180 | 181 | def get_next_path(filename): 182 | base_dir = os.path.dirname(filename) 183 | base_name = os.path.splitext(os.path.basename(filename))[0] 184 | for i in range(100): 185 | new_path = os.path.join(base_dir, f"{base_name}_{str(i).zfill(2)}.wav") 186 | if not os.path.exists(new_path) : 187 | return new_path 188 | return os.path.join(base_dir, f'{str(uuid.uuid4())}.wav') 189 | 190 | 191 | def b_audio_split(audio_breakpoint, *checkbox_list): 192 | global g_data_json , g_max_json_index 193 | checked_index = [] 194 | for i, checkbox in enumerate(checkbox_list): 195 | if (checkbox == True and g_index+i < len(g_data_json)): 196 | checked_index.append(g_index + i) 197 | if len(checked_index) == 1 : 198 | index = checked_index[0] 199 | audio_json = copy.deepcopy(g_data_json[index]) 200 | path = audio_json[g_json_key_path] 201 | data, sample_rate = librosa.load(path, sr=None, mono=True) 202 | audio_maxframe = len(data) 203 | break_frame = int(audio_breakpoint * sample_rate) 204 | 205 | if (break_frame >= 1 and break_frame < audio_maxframe): 206 | audio_first = data[0:break_frame] 207 | audio_second = data[break_frame:] 208 | nextpath = get_next_path(path) 209 | soundfile.write(nextpath, audio_second, sample_rate) 210 | soundfile.write(path, audio_first, sample_rate) 211 | g_data_json.insert(index + 1, audio_json) 212 | g_data_json[index + 1][g_json_key_path] = nextpath 213 | b_save_file() 214 | 215 | g_max_json_index = len(g_data_json) - 1 216 | return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) 217 | 218 | def b_merge_audio(interval_r, *checkbox_list): 219 | global g_data_json , g_max_json_index 220 | checked_index = [] 221 | audios_path = [] 222 | audios_text = [] 223 | delete_files = [] 224 | for i, checkbox in enumerate(checkbox_list): 225 | if (checkbox == True and g_index+i < len(g_data_json)): 226 | checked_index.append(g_index + i) 227 | 228 | if (len(checked_index)>1): 229 | for i in checked_index: 230 | audios_path.append(g_data_json[i][g_json_key_path]) 231 | audios_text.append(g_data_json[i][g_json_key_text]) 232 | for i in reversed(checked_index[1:]): 233 | delete_files.append(g_data_json[i][g_json_key_path]) 234 | g_data_json.pop(i) 235 | 236 | base_index = checked_index[0] 237 | base_path = audios_path[0] 238 | g_data_json[base_index][g_json_key_text] = "".join(audios_text) 239 | 240 | audio_list = [] 241 | l_sample_rate = None 242 | for i, path in enumerate(audios_path): 243 | data, sample_rate = librosa.load(path, sr=l_sample_rate, mono=True) 244 | l_sample_rate = sample_rate 245 | if (i > 0): 246 | silence = np.zeros(int(l_sample_rate * interval_r)) 247 | audio_list.append(silence) 248 | 249 | audio_list.append(data) 250 | 251 | audio_concat = np.concatenate(audio_list) 252 | 253 | for item_file in delete_files: 254 | os.remove(item_file) 255 | 256 | soundfile.write(base_path, audio_concat, l_sample_rate) 257 | 258 | b_save_file() 259 | 260 | g_max_json_index = len(g_data_json) - 1 261 | 262 | return gr.Slider(value=g_index, maximum=g_max_json_index), *b_change_index(g_index, g_batch) 263 | 264 | 265 | def b_save_json(): 266 | with open(g_load_file,'w', encoding="utf-8") as file: 267 | for data in g_data_json: 268 | file.write(f'{json.dumps(data, ensure_ascii = False)}\n') 269 | 270 | 271 | def b_save_list(): 272 | with open(g_load_file,'w', encoding="utf-8") as file: 273 | for data in 
g_data_json: 274 | wav_path = data["wav_path"] 275 | speaker_name = data["speaker_name"] 276 | language = data["language"] 277 | text = data["text"] 278 | file.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip()+'\n') 279 | 280 | 281 | def b_load_json(): 282 | global g_data_json, g_max_json_index 283 | with open(g_load_file, 'r', encoding="utf-8") as file: 284 | g_data_json = file.readlines() 285 | g_data_json = [json.loads(line) for line in g_data_json] 286 | g_max_json_index = len(g_data_json) - 1 287 | 288 | 289 | def b_load_list(): 290 | global g_data_json, g_max_json_index 291 | with open(g_load_file, 'r', encoding="utf-8") as source: 292 | data_list = source.readlines() 293 | for _ in data_list: 294 | data = _.split('|') 295 | if (len(data) == 4): 296 | wav_path, speaker_name, language, text = data 297 | g_data_json.append( 298 | { 299 | 'wav_path':wav_path, 300 | 'speaker_name':speaker_name, 301 | 'language':language, 302 | 'text':text.strip() 303 | } 304 | ) 305 | else: 306 | print("error line:", data) 307 | g_max_json_index = len(g_data_json) - 1 308 | 309 | 310 | def b_save_file(): 311 | if g_load_format == "json": 312 | b_save_json() 313 | elif g_load_format == "list": 314 | b_save_list() 315 | 316 | 317 | def b_load_file(): 318 | if g_load_format == "json": 319 | b_load_json() 320 | elif g_load_format == "list": 321 | b_load_list() 322 | 323 | 324 | def set_global(load_json, load_list, json_key_text, json_key_path, batch, webui_language, force_delete): 325 | global g_json_key_text, g_json_key_path, g_load_file, g_load_format, g_batch, g_language, g_force_delete 326 | 327 | g_batch = int(batch) 328 | 329 | if (load_json != "None"): 330 | g_load_format = "json" 331 | g_load_file = load_json 332 | elif (load_list != "None"): 333 | g_load_format = "list" 334 | g_load_file = load_list 335 | else: 336 | g_load_format = "list" 337 | g_load_file = "demo.list" 338 | 339 | g_json_key_text = json_key_text 340 | g_json_key_path = json_key_path 341 | g_language = SUBFIX_TextLanguage(webui_language) 342 | g_force_delete = force_delete 343 | 344 | b_load_file() 345 | 346 | 347 | def subfix_startwebui(args): 348 | 349 | set_global(args.load_json, args.load_list, args.json_key_text, args.json_key_path, args.g_batch, args.webui_language, args.force_delete) 350 | 351 | with gr.Blocks() as demo: 352 | 353 | with gr.Row(): 354 | btn_change_index = gr.Button(g_language("Change Index")) 355 | btn_submit_change = gr.Button(g_language("Submit Text")) 356 | btn_merge_audio = gr.Button(g_language("Merge Audio")) 357 | btn_delete_audio = gr.Button(g_language("Delete Audio")) 358 | btn_previous_index = gr.Button(g_language("Previous Index")) 359 | btn_next_index = gr.Button(g_language("Next Index")) 360 | 361 | with gr.Row(): 362 | index_slider = gr.Slider( 363 | minimum=0, maximum=g_max_json_index, value=g_index, step=1, label=g_language("Index"), scale=3 364 | ) 365 | splitpoint_slider = gr.Slider( 366 | minimum=0, maximum=120.0, value=0, step=0.1, label=g_language("Audio Split Point(s)"), scale=3 367 | ) 368 | btn_audio_split = gr.Button(g_language("Split Audio"), scale=1) 369 | btn_save_json = gr.Button(g_language("Save File"), visible=True, scale=1) 370 | btn_invert_selection = gr.Button(g_language("Invert Selection"), scale=1) 371 | 372 | with gr.Row(): 373 | with gr.Column(): 374 | for _ in range(0,g_batch): 375 | with gr.Row(): 376 | text = gr.Textbox( 377 | label = "Text", 378 | visible = True, 379 | scale=5 380 | ) 381 | audio_output = gr.Audio( 382 | label= g_language("Output Audio"), 
383 | visible = True, 384 | scale=5 385 | ) 386 | audio_check = gr.Checkbox( 387 | label="Yes", 388 | show_label = True, 389 | info = g_language("Choose Audio"), 390 | scale=1 391 | ) 392 | g_text_list.append(text) 393 | g_audio_list.append(audio_output) 394 | g_checkbox_list.append(audio_check) 395 | 396 | 397 | 398 | with gr.Row(): 399 | batchsize_slider = gr.Slider( 400 | minimum=1, maximum=g_batch, value=g_batch, step=1, label=g_language("Batch Size"), scale=3, interactive=False 401 | ) 402 | interval_slider = gr.Slider( 403 | minimum=0, maximum=2, value=0, step=0.01, label=g_language("Interval"), scale=3 404 | ) 405 | btn_theme_dark = gr.Button(g_language("Light Theme"), link="?__theme=light", scale=1) 406 | btn_theme_light = gr.Button(g_language("Dark Theme"), link="?__theme=dark", scale=1) 407 | 408 | btn_change_index.click( 409 | b_change_index, 410 | inputs=[ 411 | index_slider, 412 | batchsize_slider, 413 | ], 414 | outputs=[ 415 | *g_text_list, 416 | *g_audio_list, 417 | *g_checkbox_list 418 | ], 419 | ) 420 | 421 | 422 | btn_submit_change.click( 423 | b_submit_change, 424 | inputs=[ 425 | *g_text_list, 426 | ], 427 | outputs=[ 428 | index_slider, 429 | *g_text_list, 430 | *g_audio_list, 431 | *g_checkbox_list 432 | ], 433 | ) 434 | 435 | btn_previous_index.click( 436 | b_previous_index, 437 | inputs=[ 438 | index_slider, 439 | batchsize_slider, 440 | ], 441 | outputs=[ 442 | index_slider, 443 | *g_text_list, 444 | *g_audio_list, 445 | *g_checkbox_list 446 | ], 447 | ) 448 | 449 | btn_next_index.click( 450 | b_next_index, 451 | inputs=[ 452 | index_slider, 453 | batchsize_slider, 454 | ], 455 | outputs=[ 456 | index_slider, 457 | *g_text_list, 458 | *g_audio_list, 459 | *g_checkbox_list 460 | ], 461 | ) 462 | 463 | btn_delete_audio.click( 464 | b_delete_audio, 465 | inputs=[ 466 | *g_checkbox_list 467 | ], 468 | outputs=[ 469 | index_slider, 470 | *g_text_list, 471 | *g_audio_list, 472 | *g_checkbox_list 473 | ] 474 | ) 475 | 476 | btn_merge_audio.click( 477 | b_merge_audio, 478 | inputs=[ 479 | interval_slider, 480 | *g_checkbox_list 481 | ], 482 | outputs=[ 483 | index_slider, 484 | *g_text_list, 485 | *g_audio_list, 486 | *g_checkbox_list 487 | ] 488 | ) 489 | 490 | btn_audio_split.click( 491 | b_audio_split, 492 | inputs=[ 493 | splitpoint_slider, 494 | *g_checkbox_list 495 | ], 496 | outputs=[ 497 | index_slider, 498 | *g_text_list, 499 | *g_audio_list, 500 | *g_checkbox_list 501 | ] 502 | ) 503 | 504 | btn_invert_selection.click( 505 | b_invert_selection, 506 | inputs=[ 507 | *g_checkbox_list 508 | ], 509 | outputs=[ 510 | *g_checkbox_list 511 | ] 512 | ) 513 | 514 | btn_save_json.click( 515 | b_save_file 516 | ) 517 | 518 | demo.load( 519 | b_change_index, 520 | inputs=[ 521 | index_slider, 522 | batchsize_slider, 523 | ], 524 | outputs=[ 525 | *g_text_list, 526 | *g_audio_list, 527 | *g_checkbox_list 528 | ], 529 | ) 530 | 531 | demo.launch(server_port = args.server_port) 532 | 533 | 534 | if __name__ == "__main__": 535 | parser_subfix_webui = argparse.ArgumentParser(description='Process some integers.') 536 | parser_subfix_webui.add_argument('--load_json', default="None", help='source file, like demo.json') 537 | parser_subfix_webui.add_argument('--load_list', default="None", help='source file, like demo.list') 538 | parser_subfix_webui.add_argument('--json_key_text', default="text", help='the text key name in json, Default: text') 539 | parser_subfix_webui.add_argument('--json_key_path', default="wav_path", help='the path key name in json, Default: wav_path') 540 | 
parser_subfix_webui.add_argument('--g_batch', default=10, help='maximum number of audio items to display per page, Default: 10') 541 | parser_subfix_webui.add_argument('--webui_language', default="en", type=str, help='webui language: en or zh, Default: en') 542 | parser_subfix_webui.add_argument('--force_delete', default="True", type=str, help='also delete the audio file on disk when an item is deleted, True or False, Default: True') 543 | parser_subfix_webui.add_argument('--server_port', default=7860, type=int, help='the webui port, Default: 7860') 544 | 545 | parser_subfix = parser_subfix_webui.parse_args() 546 | 547 | parser_subfix.force_delete = (parser_subfix.force_delete.upper() == "TRUE") 548 | 549 | subfix_startwebui(parser_subfix) --------------------------------------------------------------------------------
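
A few usage notes on the code above. The WebUI (both subfix/webui/webui.py and the standalone subfix_webui.py) edits annotations loaded either from a JSON-lines file or from a ".list" file whose rows are wav_path|speaker_name|language|text, which is exactly what b_load_list parses and b_save_list writes back. Below is a minimal sketch of producing such a file by hand; the wav paths and the speaker label are hypothetical examples, and each row must contain exactly four "|"-separated fields or the loader reports it as an error line.

# Minimal sketch: write a SubFix-compatible ".list" annotation file.
# The wav paths and the speaker label are hypothetical; each row needs
# exactly four "|"-separated fields, matching what b_load_list() expects.
rows = [
    ("dataset/alice/000001.wav", "alice", "ZH", "今天天气不错。"),
    ("dataset/alice/000002.wav", "alice", "ZH", "我们出去走走吧。"),
]

with open("demo.list", "w", encoding="utf-8") as f:
    for wav_path, speaker_name, language, text in rows:
        f.write(f"{wav_path}|{speaker_name}|{language}|{text}".strip() + "\n")

The standalone script can then be launched with, for example, python subfix_webui.py --load_list demo.list --webui_language zh --g_batch 10, using the flags defined at the bottom of subfix_webui.py; if neither --load_json nor --load_list is given, set_global falls back to loading demo.list.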
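
The model wrappers under subfix/models/audio share one pattern: the heavyweight pipeline is built in __init__ and __call__ simply forwards to infer, so an instance can be used as a plain callable. The sketch below assumes modelscope and funasr are installed and that the audio path (purely illustrative) points to an existing mono wav; Speech_Fsmn_Vad_Zh_16k_Common returns a list of {'start': seconds, 'end': seconds} segments, splitting any span longer than max_seconds, and Punctuation_FunASR returns the input text with punctuation restored.

from subfix.models.audio.punctuation import Punctuation_FunASR
from subfix.models.audio.vad.speech_fsmn_vad_zh import Speech_Fsmn_Vad_Zh_16k_Common

audio_path = "samples/example_16k.wav"    # hypothetical 16 kHz mono file

vad = Speech_Fsmn_Vad_Zh_16k_Common(max_seconds=60.0)
for seg in vad(audio_path):               # __call__ forwards to infer()
    print(f"speech segment: {seg['start']:.2f}s - {seg['end']:.2f}s")

punc = Punctuation_FunASR()
print(punc("今天天气不错我们出去走走吧"))  # text comes back with punctuation added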
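
The conversion helpers in subfix/utils do the resampling used by every solution script: convert_files walks the source directory with get_files_by_ext, then converts each matching file to mono wav at the requested sample rate, preferring ffmpeg when it is found on PATH and falling back to librosa plus soundfile otherwise; files whose converted counterpart already exists are skipped. A short sketch with illustrative directory names:

from subfix.utils import convert_files, get_files_by_ext

# Hypothetical directories; the target tree is created as needed and the
# conversion of each file runs in a thread pool (one task per file).
convert_files("raw_audio", "cache/subfix/origin/44100", sample_rate=44100)

# The same helper the converter uses internally; paths are returned
# relative to the directory that was walked, sorted alphabetically.
print(get_files_by_ext("cache/subfix/origin/44100", [".wav"]))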
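
create_list in subfix/solution/modelscope_multi_lang.py (and create_whisper_list for the Whisper variant) expects the source directory to hold one sub-directory per speaker containing wav files; it resamples into the cache directory, runs ASR, cuts the audio into sentence-sized slices with merge_audio_slice, and writes sliced_audio_path|speaker_name|language|text rows to the output list. Below is a hedged sketch of calling it directly, with illustrative paths and settings; the ModelScope models are downloaded on first use, so this is a usage outline rather than a ready-to-run command.

from subfix.solution.modelscope_multi_lang import create_list

# Illustrative layout: raw_audio/<speaker_name>/*.wav
create_list(
    source_dir="raw_audio",     # one sub-directory per speaker
    target_dir="dataset",       # slices land in dataset/<speaker_name>/000000.wav, 000001.wav, ...
    cache_dir="cache",          # resampled copies kept under cache/subfix/origin/<sample_rate>
    sample_rate=44100,
    language="ZH",              # "ZH" selects Paraformer; other codes use the UniASR multilingual model
    output_list="demo.list",    # the result can be opened in the WebUI afterwards
    max_seconds=15,
    absolute_path=False,        # True writes absolute paths into the list
)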