├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── Dockerfile ├── Dockerfile.cuda ├── LICENSE ├── README.md ├── autocut.py ├── autocut.spec ├── autocut ├── __init__.py ├── __main__.py ├── cut.py ├── daemon.py ├── main.py ├── transcribe.py └── utils.py ├── build.sh ├── imgs └── typora.jpg ├── requirements.txt ├── setup.cfg ├── setup.py ├── snakers4_silero-vad_master ├── .github │ └── ISSUE_TEMPLATE │ │ ├── bug_report.md │ │ ├── feature_request.md │ │ └── questions---help---support.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── examples │ ├── colab_record_example.ipynb │ ├── cpp │ │ ├── README.md │ │ ├── silero-vad-onnx.cpp │ │ └── wav.h │ ├── microphone_and_webRTC_integration │ │ ├── README.md │ │ └── microphone_and_webRTC_integration.py │ └── pyaudio-streaming │ │ ├── README.md │ │ └── pyaudio-streaming-examples.ipynb ├── files │ ├── lang_dict_95.json │ ├── lang_group_dict_95.json │ ├── silero_logo.jpg │ ├── silero_vad.jit │ └── silero_vad.onnx ├── hubconf.py ├── silero-vad.ipynb └── utils_vad.py └── test ├── config.py ├── content ├── test.srt ├── test_md.md └── test_srt.srt ├── media ├── test001.mp4 ├── test001_en.mp4 ├── test002.mov ├── test003.mkv ├── test004.flv ├── test005.mp3 └── test006.MP4 ├── test_cut.py └── test_transcribe.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | lint_and_test: 11 | runs-on: ${{ matrix.os }}-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.9', '3.10'] 15 | # macos did not support m1 for now 16 | os: [ubuntu, windows, macos] 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Set Variables 24 | id: set_variables 25 | shell: bash 26 | run: | 27 | echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT 28 | echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT 29 | - name: Cache PIP 30 | uses: actions/cache@v3 31 | with: 32 | path: ${{ steps.set_variables.outputs.PIP_CACHE }} 33 | key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }} 34 | 35 | - name: Setup ffmpeg for differnt platforms 36 | uses: FedericoCarboni/setup-ffmpeg@master 37 | 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install . 42 | pip install pytest black 43 | - name: Run Test 44 | run: pytest test/ 45 | - name: Run Lint 46 | run: black . --check 47 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # Push events to matching v*, i.e. 
v1.0, v20.15.10 7 | 8 | jobs: 9 | 10 | createrelease: 11 | name: Create Release 12 | runs-on: [ubuntu-latest] 13 | steps: 14 | - name: Create Release 15 | id: create_release 16 | uses: actions/create-release@v1 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | with: 20 | tag_name: ${{ github.ref }} 21 | release_name: Release ${{ github.ref }} 22 | draft: false 23 | prerelease: false 24 | - name: Output Release URL File 25 | run: echo "${{ steps.create_release.outputs.upload_url }}" > release_url.txt 26 | - name: Save Release URL File for publish 27 | uses: actions/upload-artifact@v1 28 | with: 29 | name: release_url 30 | path: release_url.txt 31 | 32 | build: 33 | name: Build packages 34 | needs: createrelease 35 | runs-on: ${{ matrix.os }} 36 | strategy: 37 | matrix: 38 | include: 39 | - os: macos-latest 40 | TARGET: macos 41 | OUT_FILE_NAME: autocut_macos.zip 42 | ASSET_MIME: application/zip 43 | - os: ubuntu-latest 44 | TARGET: linux 45 | OUT_FILE_NAME: autocut_linux.zip 46 | ASSET_MIME: application/zip 47 | - os: windows-latest 48 | TARGET: windows 49 | OUT_FILE_NAME: autocut_windows.zip 50 | ASSET_MIME: application/zip 51 | steps: 52 | - uses: actions/checkout@v1 53 | - name: Set up Python 3.9 54 | uses: actions/setup-python@v2 55 | with: 56 | python-version: 3.9 57 | - name: Install dependencies 58 | run: | 59 | python -m pip install --upgrade pip 60 | pip install virtualenv 61 | python -m virtualenv ./.venv 62 | - name: Build with pyinstaller for Windows 63 | if: runner.os == 'windows' 64 | run: | 65 | .venv\Scripts\activate 66 | pip install -r requirements.txt 67 | pyinstaller autocut.spec -y 68 | - name: Build with pyinstaller for Other-${{matrix.TARGET}} 69 | if: runner.os != 'windows' 70 | run: | 71 | source .venv/bin/activate 72 | pip install -r requirements.txt 73 | pyinstaller autocut.spec -y 74 | - name: Zip Files 75 | uses: vimtor/action-zip@v1 76 | with: 77 | files: ./dist 78 | dest: ./dist/autocut_${{matrix.TARGET}}.zip 79 | - name: Load Release URL File from release job 80 | uses: actions/download-artifact@v1 81 | with: 82 | name: release_url 83 | - name: Get Release File Name & Upload URL 84 | id: get_release_info 85 | shell: bash 86 | run: | 87 | value=`cat release_url/release_url.txt` 88 | echo ::set-output name=upload_url::$value 89 | - name: Upload Release Asset 90 | id: upload-release-asset 91 | uses: actions/upload-release-asset@v1 92 | env: 93 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 94 | with: 95 | upload_url: ${{ steps.get_release_info.outputs.upload_url }} 96 | asset_path: ./dist/${{ matrix.OUT_FILE_NAME}} 97 | asset_name: ${{ matrix.OUT_FILE_NAME}} 98 | asset_content_type: ${{ matrix.ASSET_MIME}} 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | log/ 131 | 132 | # vad_model 133 | # snakers4_silero-vad_master 134 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim as base 2 | 3 | RUN mkdir /autocut 4 | COPY ./ /autocut 5 | WORKDIR /autocut 6 | 7 | RUN apt update && \ 8 | apt install -y git && \ 9 | apt install -y ffmpeg 10 | 11 | RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu && \ 12 | pip install . -------------------------------------------------------------------------------- /Dockerfile.cuda: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime 2 | 3 | RUN mkdir /autocut 4 | COPY ./ /autocut 5 | WORKDIR /autocut 6 | 7 | RUN apt update && \ 8 | apt install -y git && \ 9 | apt install -y ffmpeg 10 | 11 | RUN pip install . -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoCut: 通过字幕来剪切视频 2 | 3 | AutoCut 对你的视频自动生成字幕。然后你选择需要保留的句子,AutoCut 将对你视频中对应的片段裁切并保存。你无需使用视频编辑软件,只需要编辑文本文件即可完成剪切。 4 | 5 | ## 使用例子 6 | 7 | 假如你录制的视频放在 `2022-11-04/` 这个文件夹里。那么运行 8 | 9 | ```bash 10 | autocut -d 2022-11-04 11 | ``` 12 | 13 | > 提示:如果你使用 OBS 录屏,可以在 `设置->高级->录像->文件名格式` 中将空格改成 `/`,即 `%CCYY-%MM-%DD/%hh-%mm-%ss`。那么视频文件将放在日期命名的文件夹里。 14 | 15 | AutoCut 将持续对这个文件夹里视频进行字幕抽取和剪切。例如,你刚完成一个视频录制,保存在 `11-28-18.mp4`。AutoCut 将生成 `11-28-18.md`。你在里面选择需要保留的句子后,AutoCut 将剪切出 `11-28-18_cut.mp4`,并生成 `11-28-18_cut.md` 来预览结果。 16 | 17 | 你可以使用任何的 Markdown 编辑器。例如我常用 VS Code 和 Typora。下图是通过 Typora 来对 `11-28-18.md` 编辑。 18 | 19 | ![](imgs/typora.jpg) 20 | 21 | 全部完成后在 `autocut.md` 里选择需要拼接的视频后,AutoCut 将输出 `autocut_merged.mp4` 和对应的字幕文件。 22 | 23 | ## 安装 24 | 25 | 首先安装 Python 包 26 | 27 | ``` 28 | pip install git+https://github.com/mli/autocut.git 29 | ``` 30 | 31 | ## 本地安装测试 32 | 33 | 34 | ``` 35 | git clone https://github.com/mli/autocut 36 | cd autocut 37 | pip install . 38 | ``` 39 | 40 | 41 | > 上面将安装 [pytorch](https://pytorch.org/)。如果你需要 GPU 运行,且默认安装的版本不匹配的话,你可以先安装 Pytorch。如果安装 Whipser 出现问题,请参考[官方文档](https://github.com/openai/whisper#setup)。 42 | 43 | 另外需要安装 [ffmpeg](https://ffmpeg.org/) 44 | 45 | ``` 46 | # on Ubuntu or Debian 47 | sudo apt update && sudo apt install ffmpeg 48 | 49 | # on Arch Linux 50 | sudo pacman -S ffmpeg 51 | 52 | # on MacOS using Homebrew (https://brew.sh/) 53 | brew install ffmpeg 54 | 55 | # on Windows using Scoop (https://scoop.sh/) 56 | scoop install ffmpeg 57 | ``` 58 | 59 | ## Docker 安装 60 | 61 | 首先将项目克隆到本地。 62 | 63 | ```bash 64 | git clone https://github.com/mli/autocut.git 65 | ``` 66 | 67 | ### 安装 CPU 版本 68 | 69 | 进入项目根目录,然后构建 docker 映像。 70 | 71 | ```bash 72 | docker build -t autocut . 73 | ``` 74 | 75 | 运行下面的命令创建 docker 容器,就可以直接使用了。 76 | 77 | ```bash 78 | docker run -it --rm -v E:\autocut:/autocut/video autocut /bin/bash 79 | ``` 80 | 81 | 其中 `-v` 是将主机存放视频的文件夹 `E:\autocut` 映射到虚拟机的 `/autocut/video` 目录。`E:\autocut` 是主机存放视频的目录,需修改为自己主机存放视频的目录。 82 | 83 | ### 安装 GPU 版本 84 | 85 | 使用 GPU 加速需要主机有 Nvidia 的显卡并安装好相应驱动。然后在项目根目录,执行下面的命令构建 docker 映像。 86 | 87 | ```bash 88 | docker build -f ./Dockerfile.cuda -t autocut-gpu . 89 | ``` 90 | 91 | 使用 GPU 加速时,运行 docker 容器需添加参数 `--gpus all`。 92 | 93 | ```bash 94 | docker run --gpus all -it --rm -v E:\autocut:/autocut/video autocut-gpu 95 | ``` 96 | 97 | ## 更多使用选项 98 | 99 | ### 转录某个视频生成 `.srt` 和 `.md` 结果。 100 | 101 | ```bash 102 | autocut -t 22-52-00.mp4 103 | ``` 104 | 105 | 1. 如果对转录质量不满意,可以使用更大的模型,例如 106 | 107 | ```bash 108 | autocut -t 22-52-00.mp4 --whisper-model large 109 | ``` 110 | 111 | 默认是 `small`。更好的模型是 `medium` 和 `large`,但推荐使用 GPU 获得更好的速度。也可以使用更快的 `tiny` 和 `base`,但转录质量会下降。 112 | 113 | 114 | ### 剪切某个视频 115 | 116 | ```bash 117 | autocut -c 22-52-00.mp4 22-52-00.srt 22-52-00.md 118 | ``` 119 | 120 | 1. 默认视频比特率是 `--bitrate 10m`,你可以根据需要调大调小。 121 | 2. 
如果不习惯 Markdown 格式文件,你也可以直接在 `srt` 文件里删除不要的句子,在剪切时不传入 `md` 文件名即可。就是 `autocut -c 22-52-00.mp4 22-52-00.srt` 122 | 3. 如果仅有 `srt` 文件,编辑不方便可以使用如下命令生成 `md` 文件,然后编辑 `md` 文件即可,但此时会完全对照 `srt` 生成,不会出现 `no speech` 等提示文本。 123 | 124 | ```bash 125 | autocut -m test.srt test.mp4 126 | autocut -m test.mp4 test.srt # 支持视频和字幕乱序传入 127 | autocut -m test.srt # 也可以只传入字幕文件 128 | ``` 129 | 130 | 131 | ### 一些小提示 132 | 133 | 134 | 1. 讲得流利的视频的转录质量会高一些,这因为是 Whisper 训练数据分布的缘故。对一个视频,你可以先粗选一下句子,然后在剪出来的视频上再剪一次。 135 | 2. ~~最终视频生成的字幕通常还需要做一些小编辑。你可以直接编辑`md`文件(比`srt`文件更紧凑,且嵌入了视频)。然后使用 `autocut -s 22-52-00.md 22-52-00.srt` 来生成更新的字幕 `22-52-00_edited.srt`。注意这里会无视句子是不是被选中,而是全部转换成 `srt`。~~ 136 | 3. 最终视频生成的字幕通常还需要做一些小编辑。但 `srt` 里面空行太多。你可以使用 `autocut -s 22-52-00.srt` 来生成一个紧凑些的版本 `22-52-00_compact.srt` 方便编辑(这个格式不合法,但编辑器,例如 VS Code,还是会进行语法高亮)。编辑完成后,`autocut -s 22-52-00_compact.srt` 转回正常格式。 137 | 4. 用 Typora 和 VS Code 编辑 Markdown 都很方便。他们都有对应的快捷键 mark 一行或者多行。但 VS Code 视频预览似乎有点问题。 138 | 5. 视频是通过 ffmpeg 导出。在 Apple M1 芯片上它用不了 GPU,导致导出速度不如专业视频软件。 139 | 140 | ### 常见问题 141 | 142 | 1. **输出的是乱码?** 143 | 144 | AutoCut 默认输出编码是 `utf-8`. 确保你的编辑器也使用了 `utf-8` 解码。你可以通过 `--encoding` 指定其他编码格式。但是需要注意生成字幕文件和使用字幕文件剪辑时的编码格式需要一致。例如使用 `gbk`。 145 | 146 | ```bash 147 | autocut -t test.mp4 --encoding=gbk 148 | autocut -c test.mp4 test.srt test.md --encoding=gbk 149 | ``` 150 | 151 | 如果使用了其他编码格式(如 `gbk` 等)生成 `md` 文件并用 Typora 打开后,该文件可能会被 Typora 自动转码为其他编码格式,此时再通过生成时指定的编码格式进行剪辑时可能会出现编码不支持等报错。因此可以在使用 Typora 编辑后再通过 VSCode 等修改到你需要的编码格式进行保存后再使用剪辑功能。 152 | 153 | 2. **如何使用 GPU 来转录?** 154 | 155 | 当你有 Nvidia GPU,而且安装了对应版本的 PyTorch 的时候,转录是在 GPU 上进行。你可以通过命令来查看当前是不是支持 GPU。 156 | 157 | ```bash 158 | python -c "import torch; print(torch.cuda.is_available())" 159 | ``` 160 | 161 | 否则你可以在安装 AutoCut 前手动安装对应的 GPU 版本 PyTorch。 162 | 163 | 3. **使用 GPU 时报错显存不够。** 164 | 165 | whisper 的大模型需要一定的 GPU 显存。如果你的显存不够,你可以用小一点的模型,例如 `small`。如果你仍然想用大模型,可以通过 `--device` 来强制使用 CPU。例如 166 | 167 | ```bash 168 | autocut -t 11-28-18.mp4 --whisper-model large --device cpu 169 | ``` 170 | 171 | 4. **能不能使用 `pip` 安装?** 172 | 173 | whisper已经发布到PyPI了,可以直接用`pip install openai-whisper`安装。 174 | 175 | [https://github.com/openai/whisper#setup](https://github.com/openai/whisper#setup) 176 | 177 | [https://pypi.org/project/openai-whisper/](https://pypi.org/project/openai-whisper/) 178 | 179 | ## 如何参与贡献 180 | 181 | [这里有一些想做的 feature](https://github.com/mli/autocut/issues/22),欢迎贡献。 182 | 183 | ### 代码结构 184 | ```text 185 | autocut 186 | │ .gitignore 187 | │ LICENSE 188 | │ README.md # 一般新增或修改需要让使用者知道就需要对应更新 README.md 内容 189 | │ setup.py 190 | │ 191 | └─autocut # 核心代码位于 autocut 文件夹中,新增功能的实现也一般在这里面进行修改或新增 192 | │ cut.py 193 | │ daemon.py 194 | │ main.py 195 | │ transcribe.py 196 | │ utils.py 197 | └─ __init__.py 198 | 199 | ``` 200 | 201 | ### 安装依赖 202 | 开始安装这个项目的需要的依赖之前,建议先了解一下 Anaconda 或者 venv 的虚拟环境使用,推荐**使用虚拟环境来搭建该项目的开发环境**。 203 | 具体安装方式为在你搭建搭建的虚拟环境之中按照[上方安装步骤](./README.md#安装)进行安装。 204 | 205 | > 为什么推荐使用虚拟环境开发? 206 | > 207 | > 一方面是保证各种不同的开发环境之间互相不污染。 208 | > 209 | > 更重要的是在于这个项目实际上是一个 Python Package,所以在你安装之后 AutoCut 的代码实际也会变成你的环境依赖。 210 | > **因此在你更新代码之后,你需要让将新代码重新安装到环境中,然后才能调用到新的代码。** 211 | 212 | ### 开发 213 | 214 | 1. 代码风格目前遵循 PEP-8,可以使用相关的自动格式化软件完成。 215 | 2. `utils.py` 主要是全局共用的一些工具方法。 216 | 3. `transcribe.py` 是调用模型生成`srt`和`md`的部分。 217 | 4. `cut.py` 提供根据标记后`md`或`srt`进行视频剪切合并的功能。 218 | 5. `daemon.py` 提供的是监听文件夹生成字幕和剪切视频的功能。 219 | 6. 
`main.py` 声明命令行参数,根据输入参数调用对应功能。 220 | 221 | 开发过程中请尽量保证修改在正确的地方,以及合理地复用代码, 222 | 同时工具函数请尽可能放在`utils.py`中。 223 | 代码格式目前是遵循 PEP-8,变量命名尽量语义化即可。 224 | 225 | 在开发完成之后,最重要的一点是需要进行**测试**,请保证提交之前对所有**与你修改直接相关的部分**以及**你修改会影响到的部分**都进行了测试,并保证功能的正常。 226 | 目前使用 `GitHub Actions` CI, Lint 使用 black 提交前请运行 `black`。 227 | 228 | ### 提交 229 | 230 | 1. commit 信息用英文描述清楚你做了哪些修改即可,小写字母开头。 231 | 2. 最好可以保证一次的 commit 涉及的修改比较小,可以简短地描述清楚,这样也方便之后有修改时的查找。 232 | 3. PR 的时候 title 简述有哪些修改, contents 可以具体写下修改内容。 233 | 4. run test `pip install pytest` then `pytest test` 234 | 5. run lint `pip install black` then `black .` 235 | -------------------------------------------------------------------------------- /autocut.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from autocut import main 3 | 4 | if __name__ == "__main__": 5 | multiprocessing.freeze_support() 6 | main.main() 7 | -------------------------------------------------------------------------------- /autocut.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8 -*- 2 | from PyInstaller.utils.hooks import copy_metadata, collect_data_files 3 | from os import path 4 | import platform 5 | plat = platform.system().lower() 6 | 7 | datas = [] 8 | datas += collect_data_files('torch') 9 | datas += copy_metadata('tqdm') 10 | datas += copy_metadata('regex') 11 | datas += copy_metadata('requests') 12 | datas += copy_metadata('packaging') 13 | datas += copy_metadata('filelock') 14 | datas += copy_metadata('numpy') 15 | datas += copy_metadata('tokenizers') 16 | datas += copy_metadata('torch') 17 | 18 | datas += collect_data_files('transformers', include_py_files=True) 19 | 20 | datas += [(path.join( 21 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 22 | 'moviepy' 23 | ), 'moviepy')] 24 | datas += [(path.join( 25 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 26 | 'imageio_ffmpeg' 27 | ), 'imageio_ffmpeg')] 28 | datas += [(path.join( 29 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 30 | 'torchaudio' 31 | ), 'torchaudio')] 32 | datas += [(path.join( 33 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 34 | 'whisper' 35 | ), 'whisper')] 36 | datas += [(path.join( 37 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 38 | 'opencc' 39 | ), 'opencc')] 40 | datas += [('./snakers4_silero-vad_master', './snakers4_silero-vad_master')] 41 | if not plat == 'windows': 42 | datas += [('./build.sh', './')] 43 | 44 | block_cipher = None 45 | 46 | 47 | a = Analysis( 48 | ['autocut.py'], 49 | pathex=[], 50 | binaries=[], 51 | datas=datas, 52 | hiddenimports=[], 53 | hookspath=[], 54 | hooksconfig={}, 55 | runtime_hooks=[], 56 | excludes=[], 57 | win_no_prefer_redirects=False, 58 | win_private_assemblies=False, 59 | cipher=block_cipher, 60 | noarchive=False, 61 | ) 62 | pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) 63 | 64 | exe = EXE( 65 | pyz, 66 | a.scripts, 67 | [], 68 | exclude_binaries=True, 69 | name='autocut', 70 | debug=False, 71 | bootloader_ignore_signals=False, 72 | strip=False, 73 | upx=True, 74 | console=True, 75 | disable_windowed_traceback=False, 76 | argv_emulation=False, 77 | target_arch=None, 78 | codesign_identity=None, 79 | entitlements_file=None, 80 | ) 81 | coll = COLLECT( 82 | exe, 83 | a.binaries, 84 | 
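    # COLLECT assembles the one-folder build: the executable plus the
    # binaries, zipfiles and datas gathered above (including the bundled
    # snakers4_silero-vad_master directory) are written to dist/autocut,
    # which release.yml later zips per platform.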
a.zipfiles, 85 | a.datas, 86 | strip=False, 87 | upx=True, 88 | upx_exclude=[], 89 | name='autocut', 90 | ) 91 | -------------------------------------------------------------------------------- /autocut/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.3" 2 | -------------------------------------------------------------------------------- /autocut/__main__.py: -------------------------------------------------------------------------------- 1 | from .main import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /autocut/cut.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | 5 | import srt 6 | from moviepy import editor 7 | 8 | from . import utils 9 | 10 | 11 | # Merge videos 12 | class Merger: 13 | def __init__(self, args): 14 | self.args = args 15 | 16 | def write_md(self, videos): 17 | md = utils.MD(self.args.inputs[0], self.args.encoding) 18 | num_tasks = len(md.tasks()) 19 | # Not overwrite if already marked as down or no new videos 20 | if md.done_editing() or num_tasks == len(videos) + 1: 21 | return 22 | 23 | md.clear() 24 | md.add_done_editing(False) 25 | md.add("\nSelect the files that will be used to generate `autocut_final.mp4`\n") 26 | base = lambda fn: os.path.basename(fn) 27 | for f in videos: 28 | md_fn = utils.change_ext(f, "md") 29 | video_md = utils.MD(md_fn, self.args.encoding) 30 | # select a few words to scribe the video 31 | desc = "" 32 | if len(video_md.tasks()) > 1: 33 | for _, t in video_md.tasks()[1:]: 34 | m = re.findall(r"\] (.*)", t) 35 | if m and "no speech" not in m[0].lower(): 36 | desc += m[0] + " " 37 | if len(desc) > 50: 38 | break 39 | md.add_task( 40 | False, 41 | f'[{base(f)}]({base(md_fn)}) {"[Edited]" if video_md.done_editing() else ""} {desc}', 42 | ) 43 | md.write() 44 | 45 | def run(self): 46 | md_fn = self.args.inputs[0] 47 | md = utils.MD(md_fn, self.args.encoding) 48 | if not md.done_editing(): 49 | return 50 | 51 | videos = [] 52 | for m, t in md.tasks(): 53 | if not m: 54 | continue 55 | m = re.findall(r"\[(.*)\]", t) 56 | if not m: 57 | continue 58 | fn = os.path.join(os.path.dirname(md_fn), m[0]) 59 | logging.info(f"Loading {fn}") 60 | videos.append(editor.VideoFileClip(fn)) 61 | 62 | dur = sum([v.duration for v in videos]) 63 | logging.info(f"Merging into a video with {dur / 60:.1f} min length") 64 | 65 | merged = editor.concatenate_videoclips(videos) 66 | fn = os.path.splitext(md_fn)[0] + "_merged.mp4" 67 | merged.write_videofile( 68 | fn, audio_codec="aac", bitrate=self.args.bitrate 69 | ) # logger=None, 70 | logging.info(f"Saved merged video to {fn}") 71 | 72 | 73 | # Cut media 74 | class Cutter: 75 | def __init__(self, args): 76 | self.args = args 77 | 78 | def run(self): 79 | fns = {"srt": None, "media": None, "md": None} 80 | for fn in self.args.inputs: 81 | ext = os.path.splitext(fn)[1][1:] 82 | fns[ext if ext in fns else "media"] = fn 83 | 84 | assert fns["media"], "must provide a media filename" 85 | assert fns["srt"], "must provide a srt filename" 86 | 87 | is_video_file = utils.is_video(fns["media"].lower()) 88 | outext = "mp4" if is_video_file else "mp3" 89 | output_fn = utils.change_ext(utils.add_cut(fns["media"]), outext) 90 | if utils.check_exists(output_fn, self.args.force): 91 | return 92 | 93 | with open(fns["srt"], encoding=self.args.encoding) as f: 94 | subs = list(srt.parse(f.read())) 
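        # `subs` now holds every entry parsed from the .srt; if a markdown
        # file was also passed in, the checked tasks below decide which
        # subtitle indices survive the cut.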
95 | 96 | if fns["md"]: 97 | md = utils.MD(fns["md"], self.args.encoding) 98 | if not md.done_editing(): 99 | return 100 | index = [] 101 | for mark, sent in md.tasks(): 102 | if not mark: 103 | continue 104 | m = re.match(r"\[(\d+)", sent.strip()) 105 | if m: 106 | index.append(int(m.groups()[0])) 107 | subs = [s for s in subs if s.index in index] 108 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]} and {fns["md"]}') 109 | else: 110 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]}') 111 | 112 | segments = [] 113 | # Avoid disordered subtitles 114 | subs.sort(key=lambda x: x.start) 115 | for x in subs: 116 | if len(segments) == 0: 117 | segments.append( 118 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()} 119 | ) 120 | else: 121 | if x.start.total_seconds() - segments[-1]["end"] < 0.5: 122 | segments[-1]["end"] = x.end.total_seconds() 123 | else: 124 | segments.append( 125 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()} 126 | ) 127 | 128 | if is_video_file: 129 | media = editor.VideoFileClip(fns["media"]) 130 | else: 131 | media = editor.AudioFileClip(fns["media"]) 132 | 133 | # Add a fade between two clips. Not quite necessary. keep code here for reference 134 | # fade = 0 135 | # segments = _expand_segments(segments, fade, 0, video.duration) 136 | # clips = [video.subclip( 137 | # s['start'], s['end']).crossfadein(fade) for s in segments] 138 | # final_clip = editor.concatenate_videoclips(clips, padding = -fade) 139 | 140 | clips = [media.subclip(s["start"], s["end"]) for s in segments] 141 | if is_video_file: 142 | final_clip: editor.VideoClip = editor.concatenate_videoclips(clips) 143 | logging.info( 144 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}" 145 | ) 146 | 147 | aud = final_clip.audio.set_fps(44100) 148 | final_clip = final_clip.without_audio().set_audio(aud) 149 | final_clip = final_clip.fx(editor.afx.audio_normalize) 150 | 151 | # an alternative to birate is use crf, e.g. ffmpeg_params=['-crf', '18'] 152 | final_clip.write_videofile( 153 | output_fn, audio_codec="aac", bitrate=self.args.bitrate 154 | ) 155 | else: 156 | final_clip: editor.AudioClip = editor.concatenate_audioclips(clips) 157 | logging.info( 158 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}" 159 | ) 160 | 161 | final_clip = final_clip.fx(editor.afx.audio_normalize) 162 | final_clip.write_audiofile( 163 | output_fn, codec="libmp3lame", fps=44100, bitrate=self.args.bitrate 164 | ) 165 | 166 | media.close() 167 | logging.info(f"Saved media to {output_fn}") 168 | -------------------------------------------------------------------------------- /autocut/daemon.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import glob 3 | import logging 4 | import os 5 | import time 6 | 7 | from . 
import cut, transcribe, utils 8 | 9 | 10 | class Daemon: 11 | def __init__(self, args): 12 | self.args = args 13 | self.sleep = 1 14 | 15 | def run(self): 16 | assert len(self.args.inputs) == 1, "Must provide a single folder" 17 | while True: 18 | self._iter() 19 | time.sleep(self.sleep) 20 | self.sleep = min(60, self.sleep + 1) 21 | 22 | def _iter(self): 23 | folder = self.args.inputs[0] 24 | files = sorted(list(glob.glob(os.path.join(folder, "*")))) 25 | media_files = [f for f in files if utils.is_video(f) or utils.is_audio(f)] 26 | args = copy.deepcopy(self.args) 27 | for f in media_files: 28 | srt_fn = utils.change_ext(f, "srt") 29 | md_fn = utils.change_ext(f, "md") 30 | is_video_file = utils.is_video(f) 31 | if srt_fn not in files or md_fn not in files: 32 | args.inputs = [f] 33 | try: 34 | transcribe.Transcribe(args).run() 35 | self.sleep = 1 36 | break 37 | except RuntimeError as e: 38 | logging.warn( 39 | "Failed, may be due to the video is still on recording" 40 | ) 41 | pass 42 | if md_fn in files: 43 | if utils.add_cut(md_fn) in files: 44 | continue 45 | md = utils.MD(md_fn, self.args.encoding) 46 | ext = "mp4" if is_video_file else "mp3" 47 | if not md.done_editing() or os.path.exists( 48 | utils.change_ext(utils.add_cut(f), ext) 49 | ): 50 | continue 51 | args.inputs = [f, md_fn, srt_fn] 52 | cut.Cutter(args).run() 53 | self.sleep = 1 54 | 55 | args.inputs = [os.path.join(folder, "autocut.md")] 56 | merger = cut.Merger(args) 57 | merger.write_md(media_files) 58 | merger.run() 59 | -------------------------------------------------------------------------------- /autocut/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | from whisper.tokenizer import LANGUAGES 6 | from . 
import utils 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser( 10 | description="Edit videos based on transcribed subtitles", 11 | formatter_class=argparse.RawDescriptionHelpFormatter, 12 | ) 13 | 14 | logging.basicConfig( 15 | format="[autocut:%(filename)s:L%(lineno)d] %(levelname)-6s %(message)s" 16 | ) 17 | logging.getLogger().setLevel(logging.INFO) 18 | 19 | parser.add_argument("inputs", type=str, nargs="+", help="Inputs filenames/folders") 20 | parser.add_argument( 21 | "-t", 22 | "--transcribe", 23 | help="Transcribe videos/audio into subtitles", 24 | action=argparse.BooleanOptionalAction, 25 | ) 26 | parser.add_argument( 27 | "-c", 28 | "--cut", 29 | help="Cut a video based on subtitles", 30 | action=argparse.BooleanOptionalAction, 31 | ) 32 | parser.add_argument( 33 | "-d", 34 | "--daemon", 35 | help="Monitor a folder to transcribe and cut", 36 | action=argparse.BooleanOptionalAction, 37 | ) 38 | parser.add_argument( 39 | "-s", 40 | help="Convert .srt to a compact format for easier editing", 41 | action=argparse.BooleanOptionalAction, 42 | ) 43 | parser.add_argument( 44 | "-m", 45 | "--to-md", 46 | help="Convert .srt to .md for easier editing", 47 | action=argparse.BooleanOptionalAction, 48 | ) 49 | parser.add_argument( 50 | "--lang", 51 | type=str, 52 | default="zh", 53 | choices=LANGUAGES.keys(), 54 | help="The output language of transcription", 55 | ) 56 | parser.add_argument( 57 | "--prompt", type=str, default="", help="initial prompt feed into whisper" 58 | ) 59 | parser.add_argument( 60 | "--whisper-model", 61 | type=str, 62 | default="small", 63 | choices=["tiny", "base", "small", "medium", "large"], 64 | help="The whisper model used to transcribe.", 65 | ) 66 | parser.add_argument( 67 | "--bitrate", 68 | type=str, 69 | default="10m", 70 | help="The bitrate to export the cutted video, such as 10m, 1m, or 500k", 71 | ) 72 | parser.add_argument( 73 | "--vad", help="If or not use VAD", choices=["1", "0", "auto"], default="auto" 74 | ) 75 | parser.add_argument( 76 | "--force", 77 | help="Force write even if files exist", 78 | action=argparse.BooleanOptionalAction, 79 | ) 80 | parser.add_argument( 81 | "--encoding", type=str, default="utf-8", help="Document encoding format" 82 | ) 83 | parser.add_argument( 84 | "--device", 85 | type=str, 86 | default=None, 87 | choices=["cpu", "cuda"], 88 | help="Force to CPU or GPU for transcribing. 
In default automatically use GPU if available.", 89 | ) 90 | 91 | args = parser.parse_args() 92 | 93 | if args.transcribe: 94 | from .transcribe import Transcribe 95 | 96 | Transcribe(args).run() 97 | elif args.to_md: 98 | from .utils import trans_srt_to_md 99 | 100 | if len(args.inputs) == 2: 101 | [input_1, input_2] = args.inputs 102 | base, ext = os.path.splitext(input_1) 103 | if ext != ".srt": 104 | input_1, input_2 = input_2, input_1 105 | trans_srt_to_md(args.encoding, args.force, input_1, input_2) 106 | elif len(args.inputs) == 1: 107 | trans_srt_to_md(args.encoding, args.force, args.inputs[0]) 108 | else: 109 | logging.warn( 110 | "Wrong number of files, please pass in a .srt file or an additional video file" 111 | ) 112 | elif args.cut: 113 | from .cut import Cutter 114 | 115 | Cutter(args).run() 116 | elif args.daemon: 117 | from .daemon import Daemon 118 | 119 | Daemon(args).run() 120 | elif args.s: 121 | utils.compact_rst(args.inputs[0], args.encoding) 122 | else: 123 | logging.warning("No action, use -c, -t or -d") 124 | 125 | 126 | if __name__ == "__main__": 127 | main() 128 | -------------------------------------------------------------------------------- /autocut/transcribe.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | 7 | import opencc 8 | import srt 9 | import torch 10 | import whisper 11 | 12 | from tqdm import tqdm 13 | 14 | from . import utils 15 | 16 | 17 | def process(whisper_model, audio, seg, lang, prompt): 18 | r = whisper_model.transcribe( 19 | audio[int(seg["start"]) : int(seg["end"])], 20 | task="transcribe", 21 | language=lang, 22 | initial_prompt=prompt, 23 | ) 24 | r["origin_timestamp"] = seg 25 | return r 26 | 27 | 28 | class Transcribe: 29 | def __init__(self, args): 30 | self.args = args 31 | self.sampling_rate = 16000 32 | self.whisper_model = None 33 | self.vad_model = None 34 | self.detect_speech = None 35 | 36 | def run(self): 37 | for input in self.args.inputs: 38 | logging.info(f"Transcribing {input}") 39 | name, _ = os.path.splitext(input) 40 | if utils.check_exists(name + ".md", self.args.force): 41 | continue 42 | 43 | audio = whisper.load_audio(input, sr=self.sampling_rate) 44 | if ( 45 | self.args.vad == "1" 46 | or self.args.vad == "auto" 47 | and not name.endswith("_cut") 48 | ): 49 | speech_timestamps = self._detect_voice_activity(audio) 50 | else: 51 | speech_timestamps = [{"start": 0, "end": len(audio)}] 52 | transcribe_results = self._transcribe(audio, speech_timestamps) 53 | 54 | output = name + ".srt" 55 | self._save_srt(output, transcribe_results) 56 | logging.info(f"Transcribed {input} to {output}") 57 | self._save_md(name + ".md", output, input) 58 | logging.info(f'Saved texts to {name + ".md"} to mark sentences') 59 | 60 | def _detect_voice_activity(self, audio): 61 | """Detect segments that have voice activities""" 62 | tic = time.time() 63 | if self.vad_model is None or self.detect_speech is None: 64 | # torch load limit https://github.com/pytorch/vision/issues/4156 65 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True 66 | self.vad_model, funcs = torch.hub.load( 67 | repo_or_dir=os.path.join(os.path.dirname(sys.executable), "snakers4_silero-vad_master"), 68 | source="local", 69 | model="silero_vad", 70 | trust_repo=True, 71 | silero_vad_source="local", 72 | ) 73 | 74 | self.detect_speech = funcs[0] 75 | 76 | speeches = self.detect_speech( 77 | audio, self.vad_model, 
sampling_rate=self.sampling_rate 78 | ) 79 | 80 | # Remove too short segments 81 | speeches = utils.remove_short_segments(speeches, 1.0 * self.sampling_rate) 82 | 83 | # Expand to avoid to tight cut. You can tune the pad length 84 | speeches = utils.expand_segments( 85 | speeches, 0.2 * self.sampling_rate, 0.0 * self.sampling_rate, audio.shape[0] 86 | ) 87 | 88 | # Merge very closed segments 89 | speeches = utils.merge_adjacent_segments(speeches, 0.5 * self.sampling_rate) 90 | 91 | logging.info(f"Done voice activity detection in {time.time() - tic:.1f} sec") 92 | return speeches if len(speeches) > 1 else [{"start": 0, "end": len(audio)}] 93 | 94 | def _transcribe(self, audio, speech_timestamps): 95 | tic = time.time() 96 | if self.whisper_model is None: 97 | self.whisper_model = whisper.load_model( 98 | self.args.whisper_model, self.args.device 99 | ) 100 | 101 | res = [] 102 | if self.args.device == "cpu" and len(speech_timestamps) > 1: 103 | from multiprocessing import Pool 104 | 105 | pbar = tqdm(total=len(speech_timestamps)) 106 | 107 | pool = Pool(processes=4) 108 | # TODO, a better way is merging these segments into a single one, so whisper can get more context 109 | for seg in speech_timestamps: 110 | res.append( 111 | pool.apply_async( 112 | process, 113 | ( 114 | self.whisper_model, 115 | audio, 116 | seg, 117 | self.args.lang, 118 | self.args.prompt, 119 | ), 120 | callback=lambda x: pbar.update(), 121 | ) 122 | ) 123 | pool.close() 124 | pool.join() 125 | pbar.close() 126 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec") 127 | return [i.get() for i in res] 128 | else: 129 | for seg in ( 130 | speech_timestamps 131 | if len(speech_timestamps) == 1 132 | else tqdm(speech_timestamps) 133 | ): 134 | r = self.whisper_model.transcribe( 135 | audio[int(seg["start"]) : int(seg["end"])], 136 | task="transcribe", 137 | language=self.args.lang, 138 | initial_prompt=self.args.prompt, 139 | verbose=False if len(speech_timestamps) == 1 else None, 140 | ) 141 | r["origin_timestamp"] = seg 142 | res.append(r) 143 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec") 144 | return res 145 | 146 | def _save_srt(self, output, transcribe_results): 147 | subs = [] 148 | # whisper sometimes generate traditional chinese, explicitly convert 149 | cc = opencc.OpenCC("t2s") 150 | 151 | def _add_sub(start, end, text): 152 | subs.append( 153 | srt.Subtitle( 154 | index=0, 155 | start=datetime.timedelta(seconds=start), 156 | end=datetime.timedelta(seconds=end), 157 | content=cc.convert(text.strip()), 158 | ) 159 | ) 160 | 161 | prev_end = 0 162 | for r in transcribe_results: 163 | origin = r["origin_timestamp"] 164 | for s in r["segments"]: 165 | start = s["start"] + origin["start"] / self.sampling_rate 166 | end = min( 167 | s["end"] + origin["start"] / self.sampling_rate, 168 | origin["end"] / self.sampling_rate, 169 | ) 170 | if start > end: 171 | continue 172 | # mark any empty segment that is not very short 173 | if start > prev_end + 1.0: 174 | _add_sub(prev_end, start, "< No Speech >") 175 | _add_sub(start, end, s["text"]) 176 | prev_end = end 177 | 178 | with open(output, "wb") as f: 179 | f.write(srt.compose(subs).encode(self.args.encoding, "replace")) 180 | 181 | def _save_md(self, md_fn, srt_fn, video_fn): 182 | with open(srt_fn, encoding=self.args.encoding) as f: 183 | subs = srt.parse(f.read()) 184 | 185 | md = utils.MD(md_fn, self.args.encoding) 186 | md.clear() 187 | md.add_done_editing(False) 188 | md.add_video(os.path.basename(video_fn)) 189 | md.add( 190 | 
f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)})." 191 | "Mark the sentences to keep for autocut.\n" 192 | "The format is [subtitle_index,duration_in_second] subtitle context.\n\n" 193 | ) 194 | 195 | for s in subs: 196 | sec = s.start.seconds 197 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]" 198 | md.add_task(False, f"{pre:11} {s.content.strip()}") 199 | md.write() 200 | -------------------------------------------------------------------------------- /autocut/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | 5 | import srt 6 | import opencc 7 | 8 | 9 | def is_video(filename): 10 | _, ext = os.path.splitext(filename) 11 | return ext in [".mp4", ".mov", ".mkv", ".avi", ".flv", ".f4v", ".webm"] 12 | 13 | 14 | def is_audio(filename): 15 | _, ext = os.path.splitext(filename) 16 | return ext in [".ogg", ".wav", ".mp3", ".flac", ".m4a"] 17 | 18 | 19 | def change_ext(filename, new_ext): 20 | # Change the extension of filename to new_ext 21 | base, _ = os.path.splitext(filename) 22 | if not new_ext.startswith("."): 23 | new_ext = "." + new_ext 24 | return base + new_ext 25 | 26 | 27 | def add_cut(filename): 28 | # Add cut mark to the filename 29 | base, ext = os.path.splitext(filename) 30 | if base.endswith("_cut"): 31 | base = base[:-4] + "_" + base[-4:] 32 | else: 33 | base += "_cut" 34 | return base + ext 35 | 36 | 37 | # a very simple markdown parser 38 | class MD: 39 | def __init__(self, filename, encoding): 40 | self.lines = [] 41 | self.EDIT_DONE_MAKR = "<-- Mark if you are done editing." 42 | self.encoding = encoding 43 | self.filename = filename 44 | if filename: 45 | self.load_file() 46 | 47 | def load_file(self): 48 | if os.path.exists(self.filename): 49 | with open(self.filename, encoding=self.encoding) as f: 50 | self.lines = f.readlines() 51 | 52 | def clear(self): 53 | self.lines = [] 54 | 55 | def write(self): 56 | with open(self.filename, "wb") as f: 57 | f.write("\n".join(self.lines).encode(self.encoding, "replace")) 58 | 59 | def tasks(self): 60 | # get all tasks with their status 61 | ret = [] 62 | for l in self.lines: 63 | mark, task = self._parse_task_status(l) 64 | if mark is not None: 65 | ret.append((mark, task)) 66 | return ret 67 | 68 | def done_editing(self): 69 | for m, t in self.tasks(): 70 | if m and self.EDIT_DONE_MAKR in t: 71 | return True 72 | return False 73 | 74 | def add(self, line): 75 | self.lines.append(line) 76 | 77 | def add_task(self, mark, contents): 78 | self.add(f'- [{"x" if mark else " "}] {contents.strip()}') 79 | 80 | def add_done_editing(self, mark): 81 | self.add_task(mark, self.EDIT_DONE_MAKR) 82 | 83 | def add_video(self, video_fn): 84 | ext = os.path.splitext(video_fn)[1][1:] 85 | self.add( 86 | f'\n\n' 87 | ) 88 | 89 | def _parse_task_status(self, line): 90 | # return (is_marked, rest) or (None, line) if not a task 91 | m = re.match(r"- +\[([ x])\] +(.*)", line) 92 | if not m: 93 | return None, line 94 | return m.groups()[0].lower() == "x", m.groups()[1] 95 | 96 | 97 | def check_exists(output, force): 98 | if os.path.exists(output): 99 | if force: 100 | logging.info(f"{output} exists. Will overwrite it") 101 | else: 102 | logging.info( 103 | f"{output} exists, skipping... 
Use the --force flag to overwrite" 104 | ) 105 | return True 106 | return False 107 | 108 | 109 | def expand_segments(segments, expand_head, expand_tail, total_length): 110 | # Pad head and tail for each time segment 111 | results = [] 112 | for i in range(len(segments)): 113 | t = segments[i] 114 | start = max(t["start"] - expand_head, segments[i - 1]["end"] if i > 0 else 0) 115 | end = min( 116 | t["end"] + expand_tail, 117 | segments[i + 1]["start"] if i < len(segments) - 1 else total_length, 118 | ) 119 | results.append({"start": start, "end": end}) 120 | return results 121 | 122 | 123 | def remove_short_segments(segments, threshold): 124 | # Remove segments whose length < threshold 125 | return [s for s in segments if s["end"] - s["start"] > threshold] 126 | 127 | 128 | def merge_adjacent_segments(segments, threshold): 129 | # Merge two adjacent segments if their distance < threshold 130 | results = [] 131 | i = 0 132 | while i < len(segments): 133 | s = segments[i] 134 | for j in range(i + 1, len(segments)): 135 | if segments[j]["start"] < s["end"] + threshold: 136 | s["end"] = segments[j]["end"] 137 | i = j 138 | else: 139 | break 140 | i += 1 141 | results.append(s) 142 | return results 143 | 144 | 145 | def compact_rst(sub_fn, encoding): 146 | cc = opencc.OpenCC("t2s") 147 | 148 | base, ext = os.path.splitext(sub_fn) 149 | COMPACT = "_compact" 150 | if ext != ".srt": 151 | logging.fatal("only .srt file is supported") 152 | 153 | if base.endswith(COMPACT): 154 | # to original rst 155 | with open(sub_fn, encoding=encoding) as f: 156 | lines = f.readlines() 157 | subs = [] 158 | for l in lines: 159 | items = l.split(" ") 160 | if len(items) < 4: 161 | continue 162 | subs.append( 163 | srt.Subtitle( 164 | index=0, 165 | start=srt.srt_timestamp_to_timedelta(items[0]), 166 | end=srt.srt_timestamp_to_timedelta(items[2]), 167 | content=" ".join(items[3:]).strip(), 168 | ) 169 | ) 170 | with open(base[: -len(COMPACT)] + ext, "wb") as f: 171 | f.write(srt.compose(subs).encode(encoding, "replace")) 172 | else: 173 | # to a compact version 174 | with open(sub_fn, encoding=encoding) as f: 175 | subs = srt.parse(f.read()) 176 | with open(base + COMPACT + ext, "wb") as f: 177 | for s in subs: 178 | f.write( 179 | f"{srt.timedelta_to_srt_timestamp(s.start)} --> {srt.timedelta_to_srt_timestamp(s.end)} " 180 | f"{cc.convert(s.content.strip())}\n".encode(encoding, "replace") 181 | ) 182 | 183 | 184 | def trans_srt_to_md(encoding, force, srt_fn, video_fn=None): 185 | base, ext = os.path.splitext(srt_fn) 186 | if ext != ".srt": 187 | logging.fatal("only .srt file is supported") 188 | md_fn = base + ext.split(".")[0] + ".md" 189 | 190 | check_exists(md_fn, force) 191 | 192 | with open(srt_fn, encoding=encoding) as f: 193 | subs = srt.parse(f.read()) 194 | 195 | md = MD(md_fn, encoding) 196 | md.clear() 197 | md.add_done_editing(False) 198 | if video_fn: 199 | if not is_video(video_fn): 200 | logging.fatal(f"{video_fn} may not be a video") 201 | md.add_video(os.path.basename(video_fn)) 202 | md.add( 203 | f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)})." 
204 | "Mark the sentences to keep for autocut.\n" 205 | "The format is [subtitle_index,duration_in_second] subtitle context.\n\n" 206 | ) 207 | 208 | for s in subs: 209 | sec = s.start.seconds 210 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]" 211 | md.add_task(False, f"{pre:11} {s.content.strip()}") 212 | md.write() 213 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | # run this script after pyinstaller 2 | # see https://github.com/pyinstaller/pyinstaller/issues/7582#issuecomment-1515434457 3 | 4 | rm -f libtorch* 5 | ln -s torch/lib/libtorch.dylib . 6 | ln -s torch/lib/libtorch_cpu.dylib . 7 | ln -s torch/lib/libtorch_python.dylib . 8 | 9 | ln -s torchaudio/lib/libtorchaudio.so . 10 | 11 | install_name_tool -add_rpath @loader_path/../.. torchaudio/lib/libtorchaudio.so 12 | -------------------------------------------------------------------------------- /imgs/typora.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/imgs/typora.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | altgraph==0.17.3 2 | attrs==22.1.0 3 | black==22.12.0 4 | certifi==2022.12.7 5 | charset-normalizer==2.1.1 6 | click==8.1.3 7 | colorama==0.4.6 8 | decorator==4.4.2 9 | exceptiongroup==1.0.4 10 | ffmpeg-python==0.2.0 11 | filelock==3.8.2 12 | future==0.18.2 13 | huggingface-hub==0.11.1 14 | idna==3.4 15 | imageio==2.22.4 16 | imageio-ffmpeg==0.4.7 17 | iniconfig==1.1.1 18 | more-itertools==9.0.0 19 | moviepy==1.0.3 20 | mypy-extensions==0.4.3 21 | numpy==1.23.5 22 | opencc-python-reimplemented==0.1.6 23 | packaging==22.0 24 | parameterized==0.8.1 25 | pathspec==0.10.3 26 | pefile==2022.5.30 27 | Pillow==9.3.0 28 | platformdirs==2.6.0 29 | pluggy==1.0.0 30 | proglog==0.1.10 31 | pyinstaller==5.7.0 32 | pyinstaller-hooks-contrib==2022.14 33 | pyparsing==3.0.9 34 | pytest==7.2.0 35 | pywin32-ctypes==0.2.0 36 | PyYAML==6.0 37 | regex==2022.10.31 38 | requests==2.28.1 39 | six==1.16.0 40 | srt==3.5.2 41 | tokenizers==0.13.2 42 | tomli==2.0.1 43 | torch==1.13.0 44 | torchaudio==0.13.0 45 | tqdm==4.64.1 46 | transformers==4.25.1 47 | typing_extensions==4.4.0 48 | urllib3==1.26.13 49 | whisper @ git+https://github.com/openai/whisper.git@02aa851a4910201f0db56960064d7e121a01002c 50 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = autocut 3 | version = attr: autocut.__version__ 4 | license = Apache Software License 5 | description = Cut video by subtitles 6 | long_description = file: README.md 7 | classifiers = 8 | License :: OSI Approved :: Apache Software License 9 | Operating System :: OS Independent 10 | Programming Language :: Python :: 3 11 | 12 | [options] 13 | packages = find: 14 | include_package_data = True 15 | python_requires = >= 3.9 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | requirements = [ 4 | "srt", 5 | "moviepy", 6 | "opencc-python-reimplemented", 7 | "torchaudio", 8 | 
"parameterized", 9 | "openai-whisper", 10 | "tqdm", 11 | ] 12 | 13 | 14 | setup( 15 | name="autocut", 16 | install_requires=requirements, 17 | packages=find_packages(), 18 | entry_points={ 19 | "console_scripts": [ 20 | "autocut = autocut.main:main", 21 | ] 22 | }, 23 | ) 24 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: Bug report - [X] 5 | labels: bug 6 | assignees: snakers4 7 | 8 | --- 9 | 10 | ## 🐛 Bug 11 | 12 | 13 | 14 | ## To Reproduce 15 | 16 | Steps to reproduce the behavior: 17 | 18 | 1. 19 | 2. 20 | 3. 21 | 22 | 23 | 24 | ## Expected behavior 25 | 26 | 27 | 28 | ## Environment 29 | 30 | Please copy and paste the output from this 31 | [environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py) 32 | (or fill out the checklist below manually). 33 | 34 | You can get the script and run it with: 35 | ``` 36 | wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py 37 | # For security purposes, please check the contents of collect_env.py before running it. 38 | python collect_env.py 39 | ``` 40 | 41 | - PyTorch Version (e.g., 1.0): 42 | - OS (e.g., Linux): 43 | - How you installed PyTorch (`conda`, `pip`, source): 44 | - Build command you used (if compiling from source): 45 | - Python version: 46 | - CUDA/cuDNN version: 47 | - GPU models and configuration: 48 | - Any other relevant information: 49 | 50 | ## Additional context 51 | 52 | 53 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: Feature request - [X] 5 | labels: enhancement 6 | assignees: snakers4 7 | 8 | --- 9 | 10 | ## 🚀 Feature 11 | 12 | 13 | ## Motivation 14 | 15 | 16 | 17 | ## Pitch 18 | 19 | 20 | 21 | ## Alternatives 22 | 23 | 24 | 25 | ## Additional context 26 | 27 | 28 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/questions---help---support.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Questions / Help / Support 3 | about: Ask for help, support or ask a question 4 | title: "❓ Questions / Help / Support" 5 | labels: help wanted 6 | assignees: snakers4 7 | 8 | --- 9 | 10 | ## ❓ Questions and Help 11 | 12 | We have a [wiki](https://github.com/snakers4/silero-models/wiki) available for our users. Please make sure you have checked it out first. 
13 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at aveysov@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-present Silero Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /snakers4_silero-vad_master/README.md: -------------------------------------------------------------------------------- 1 | [![Mailing list : test](http://img.shields.io/badge/Email-gray.svg?style=for-the-badge&logo=gmail)](mailto:hello@silero.ai) [![Mailing list : test](http://img.shields.io/badge/Telegram-blue.svg?style=for-the-badge&logo=telegram)](https://t.me/silero_speech) [![License: CC BY-NC 4.0](https://img.shields.io/badge/License-MIT-lightgrey.svg?style=for-the-badge)](https://github.com/snakers4/silero-vad/blob/master/LICENSE) 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) 4 | 5 | ![header](https://user-images.githubusercontent.com/12515440/89997349-b3523080-dc94-11ea-9906-ca2e8bc50535.png) 6 | 7 |
8 |

Silero VAD

9 |
10 | 11 | **Silero VAD** - pre-trained enterprise-grade [Voice Activity Detector](https://en.wikipedia.org/wiki/Voice_activity_detection) (also see our [STT models](https://github.com/snakers4/silero-models)). 12 | 13 | This repository also includes Number Detector and Language classifier [models](https://github.com/snakers4/silero-vad/wiki/Other-Models). 14 | 15 |
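For orientation, the typical way the detector is consumed (mirrored later in this listing by `hubconf.py` and `silero-vad.ipynb`) is to load the model and its helper utilities through `torch.hub`. A minimal sketch follows; the `en_example.wav` file name comes from the bundled notebook, and the exact keyword names of the helpers may differ between versions:

```python
# Minimal sketch: load Silero VAD via torch.hub and detect speech segments.
import torch

torch.set_num_threads(1)  # the bundled examples pin inference to one thread

model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    force_reload=True,
)
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

SAMPLING_RATE = 16000
wav = read_audio("en_example.wav", sampling_rate=SAMPLING_RATE)
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
print(speech_timestamps)  # list of {'start': ..., 'end': ...} sample offsets
```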
16 | 17 |

18 | 19 |

20 | 21 |
22 | Real Time Example 23 | 24 | https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-9be7-004c891dd481.mp4 25 | 26 |
27 | 28 |
29 |

Key Features

30 |
31 | 32 | - **Stellar accuracy** 33 | 34 | Silero VAD has [excellent results](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#vs-other-available-solutions) on speech detection tasks. 35 | 36 | - **Fast** 37 | 38 | One audio chunk (30+ ms) [takes](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics#silero-vad-performance-metrics) less than **1 ms** to process on a single CPU thread. Using batching or a GPU can improve performance considerably, and under certain conditions ONNX may run up to 4-5x faster. 39 | 40 | - **Lightweight** 41 | 42 | The JIT model is around one megabyte in size. 43 | 44 | - **General** 45 | 46 | Silero VAD was trained on huge corpora covering over **100** languages, and it performs well on audio from different domains with varying levels of background noise and quality. 47 | 48 | - **Flexible sampling rate** 49 | 50 | Silero VAD [supports](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#sample-rate-comparison) **8000 Hz** and **16000 Hz** [sampling rates](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate). 51 | 52 | - **Flexible chunk size** 53 | 54 | The model was trained on **30 ms** chunks. Longer chunks are supported directly; other sizes may work as well. 55 | 56 | - **Highly Portable** 57 | 58 | Silero VAD reaps the benefits of the rich ecosystems built around **PyTorch** and **ONNX**, running everywhere these runtimes are available. 59 | 60 | - **No Strings Attached** 61 | 62 | Published under a permissive license (MIT), Silero VAD has zero strings attached: no telemetry, no registration, no built-in expiration, and no keys or vendor lock. 63 |
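The **Fast** and **Flexible chunk size** points above are easiest to see in the chunk-by-chunk usage that the bundled example notebooks (`colab_record_example.ipynb` and the pyaudio streaming example) rely on: feed fixed-size windows to the model and read back one speech probability per window. A minimal sketch, assuming `model` was loaded via `torch.hub` as shown earlier and `wav` is a 1-D 16 kHz float tensor:

```python
# Per-chunk speech probabilities, following the pattern in the bundled notebooks:
# 512-sample windows at 16 kHz (~32 ms) are fed to the model one at a time.
import torch

SAMPLING_RATE = 16000
WINDOW_SIZE_SAMPLES = 512


def chunk_probabilities(model, wav: torch.Tensor) -> list:
    probs = []
    for i in range(0, len(wav), WINDOW_SIZE_SAMPLES):
        chunk = wav[i : i + WINDOW_SIZE_SAMPLES]
        if len(chunk) < WINDOW_SIZE_SAMPLES:
            break  # drop the trailing partial window, as the notebooks do
        probs.append(model(chunk, SAMPLING_RATE).item())
    model.reset_states()  # reset internal state before processing new audio
    return probs
```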
65 |

Typical Use Cases

66 |
67 | 68 | - Voice activity detection for IoT / edge / mobile use cases 69 | - Data cleaning and preparation, voice detection in general 70 | - Telephony and call-center automation, voice bots 71 | - Voice interfaces 72 | 73 |
74 |

Links

75 |
76 | 77 | 78 | - [Examples and Dependencies](https://github.com/snakers4/silero-vad/wiki/Examples-and-Dependencies#dependencies) 79 | - [Quality Metrics](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics) 80 | - [Performance Metrics](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics) 81 | - [Number Detector and Language classifier models](https://github.com/snakers4/silero-vad/wiki/Other-Models) 82 | - [Versions and Available Models](https://github.com/snakers4/silero-vad/wiki/Version-history-and-Available-Models) 83 | - [Further reading](https://github.com/snakers4/silero-models#further-reading) 84 | - [FAQ](https://github.com/snakers4/silero-vad/wiki/FAQ) 85 | 86 |
87 |

Get In Touch

88 |
89 | 90 | Try our models, create an [issue](https://github.com/snakers4/silero-vad/issues/new), start a [discussion](https://github.com/snakers4/silero-vad/discussions/new), join our telegram [chat](https://t.me/silero_speech), [email](mailto:hello@silero.ai) us, read our [news](https://t.me/silero_news). 91 | 92 | Please see our [wiki](https://github.com/snakers4/silero-models/wiki) and [tiers](https://github.com/snakers4/silero-models/wiki/Licensing-and-Tiers) for relevant information and [email](mailto:hello@silero.ai) us directly. 93 | 94 | **Citations** 95 | 96 | ``` 97 | @misc{Silero VAD, 98 | author = {Silero Team}, 99 | title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier}, 100 | year = {2021}, 101 | publisher = {GitHub}, 102 | journal = {GitHub repository}, 103 | howpublished = {\url{https://github.com/snakers4/silero-vad}}, 104 | commit = {insert_some_commit_here}, 105 | email = {hello@silero.ai} 106 | } 107 | ``` 108 | 109 |
110 |

Examples and VAD-based Community Apps

111 |
112 | 113 | - Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp) 114 | 115 | - Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web 116 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/colab_record_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "bccAucKjnPHm" 7 | }, 8 | "source": [ 9 | "### Dependencies and inputs" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "id": "cSih95WFmwgi" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "!pip -q install pydub\n", 21 | "from google.colab import output\n", 22 | "from base64 import b64decode, b64encode\n", 23 | "from io import BytesIO\n", 24 | "import numpy as np\n", 25 | "from pydub import AudioSegment\n", 26 | "from IPython.display import HTML, display\n", 27 | "import torch\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import moviepy.editor as mpe\n", 30 | "from matplotlib.animation import FuncAnimation, FFMpegWriter\n", 31 | "import matplotlib\n", 32 | "matplotlib.use('Agg')\n", 33 | "\n", 34 | "torch.set_num_threads(1)\n", 35 | "\n", 36 | "model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 37 | " model='silero_vad',\n", 38 | " force_reload=True)\n", 39 | "\n", 40 | "def int2float(sound):\n", 41 | " abs_max = np.abs(sound).max()\n", 42 | " sound = sound.astype('float32')\n", 43 | " if abs_max > 0:\n", 44 | " sound *= 1/abs_max\n", 45 | " sound = sound.squeeze()\n", 46 | " return sound\n", 47 | "\n", 48 | "AUDIO_HTML = \"\"\"\n", 49 | "\n", 126 | "\"\"\"\n", 127 | "\n", 128 | "def record(sec=10):\n", 129 | " display(HTML(AUDIO_HTML))\n", 130 | " s = output.eval_js(\"data\")\n", 131 | " b = b64decode(s.split(',')[1])\n", 132 | " audio = AudioSegment.from_file(BytesIO(b))\n", 133 | " audio.export('test.mp3', format='mp3')\n", 134 | " audio = audio.set_channels(1)\n", 135 | " audio = audio.set_frame_rate(16000)\n", 136 | " audio_float = int2float(np.array(audio.get_array_of_samples()))\n", 137 | " audio_tens = torch.tensor(audio_float )\n", 138 | " return audio_tens\n", 139 | "\n", 140 | "def make_animation(probs, audio_duration, interval=40):\n", 141 | " fig = plt.figure(figsize=(16, 9))\n", 142 | " ax = plt.axes(xlim=(0, audio_duration), ylim=(0, 1.02))\n", 143 | " line, = ax.plot([], [], lw=2)\n", 144 | " x = [i / 16000 * 512 for i in range(len(probs))]\n", 145 | " plt.xlabel('Time, seconds', fontsize=16)\n", 146 | " plt.ylabel('Speech Probability', fontsize=16)\n", 147 | "\n", 148 | " def init():\n", 149 | " plt.fill_between(x, probs, color='#064273')\n", 150 | " line.set_data([], [])\n", 151 | " line.set_color('#990000')\n", 152 | " return line,\n", 153 | "\n", 154 | " def animate(i):\n", 155 | " x = i * interval / 1000 - 0.04\n", 156 | " y = np.linspace(0, 1.02, 2)\n", 157 | " \n", 158 | " line.set_data(x, y)\n", 159 | " line.set_color('#990000')\n", 160 | " return line,\n", 161 | "\n", 162 | " anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n", 163 | "\n", 164 | " f = r\"animation.mp4\" \n", 165 | " writervideo = FFMpegWriter(fps=1000/interval) \n", 166 | " anim.save(f, writer=writervideo)\n", 167 | " plt.close('all')\n", 168 | "\n", 169 | "def combine_audio(vidname, audname, outname, 
fps=25): \n", 170 | " my_clip = mpe.VideoFileClip(vidname, verbose=False)\n", 171 | " audio_background = mpe.AudioFileClip(audname)\n", 172 | " final_clip = my_clip.set_audio(audio_background)\n", 173 | " final_clip.write_videofile(outname,fps=fps,verbose=False)\n", 174 | "\n", 175 | "def record_make_animation():\n", 176 | " tensor = record()\n", 177 | "\n", 178 | " print('Calculating probabilities...')\n", 179 | " speech_probs = []\n", 180 | " window_size_samples = 512\n", 181 | " for i in range(0, len(tensor), window_size_samples):\n", 182 | " if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n", 183 | " break\n", 184 | " speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n", 185 | " speech_probs.append(speech_prob)\n", 186 | " model.reset_states()\n", 187 | " print('Making animation...')\n", 188 | " make_animation(speech_probs, len(tensor) / 16000)\n", 189 | "\n", 190 | " print('Merging your voice with animation...')\n", 191 | " combine_audio('animation.mp4', 'test.mp3', 'merged.mp4')\n", 192 | " print('Done!')\n", 193 | " mp4 = open('merged.mp4','rb').read()\n", 194 | " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", 195 | " display(HTML(\"\"\"\n", 196 | " \n", 199 | " \"\"\" % data_url))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "id": "IFVs3GvTnpB1" 206 | }, 207 | "source": [ 208 | "## Record example" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "id": "5EBjrTwiqAaQ" 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "record_make_animation()" 220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "colab": { 225 | "collapsed_sections": [ 226 | "bccAucKjnPHm" 227 | ], 228 | "name": "Untitled2.ipynb", 229 | "provenance": [] 230 | }, 231 | "kernelspec": { 232 | "display_name": "Python 3", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "name": "python" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 0 241 | } 242 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/cpp/README.md: -------------------------------------------------------------------------------- 1 | # Stream example in C++ 2 | 3 | Here's a simple example of the vad model in c++ onnxruntime. 4 | 5 | 6 | 7 | ## Requirements 8 | 9 | Code are tested in the environments bellow, feel free to try others. 10 | 11 | - WSL2 + Debian-bullseye (docker) 12 | - gcc 12.2.0 13 | - onnxruntime-linux-x64-1.12.1 14 | 15 | 16 | 17 | ## Usage 18 | 19 | 1. Install gcc 12.2.0, or just pull the docker image with `docker pull gcc:12.2.0-bullseye` 20 | 21 | 2. Install onnxruntime-linux-x64-1.12.1 22 | 23 | - Download lib onnxruntime: 24 | 25 | `wget https://github.com/microsoft/onnxruntime/releases/download/v1.12.1/onnxruntime-linux-x64-1.12.1.tgz` 26 | 27 | - Unzip. Assume the path is `/root/onnxruntime-linux-x64-1.12.1` 28 | 29 | 3. Modify wav path & Test configs in main function 30 | 31 | `wav::WavReader wav_reader("${path_to_your_wav_file}");` 32 | 33 | test sample rate, frame per ms, threshold... 34 | 35 | 4. 
Build with gcc and run 36 | 37 | ```bash 38 | # Build 39 | g++ silero-vad-onnx.cpp -I /root/onnxruntime-linux-x64-1.12.1/include/ -L /root/onnxruntime-linux-x64-1.12.1/lib/ -lonnxruntime -Wl,-rpath,/root/onnxruntime-linux-x64-1.12.1/lib/ -o test 40 | 41 | # Run 42 | ./test 43 | ``` -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/cpp/silero-vad-onnx.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "onnxruntime_cxx_api.h" 8 | #include "wav.h" 9 | 10 | class VadIterator 11 | { 12 | // OnnxRuntime resources 13 | Ort::Env env; 14 | Ort::SessionOptions session_options; 15 | std::shared_ptr session = nullptr; 16 | Ort::AllocatorWithDefaultOptions allocator; 17 | Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU); 18 | 19 | public: 20 | void init_engine_threads(int inter_threads, int intra_threads) 21 | { 22 | // The method should be called in each thread/proc in multi-thread/proc work 23 | session_options.SetIntraOpNumThreads(intra_threads); 24 | session_options.SetInterOpNumThreads(inter_threads); 25 | session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); 26 | } 27 | 28 | void init_onnx_model(const std::string &model_path) 29 | { 30 | // Init threads = 1 for 31 | init_engine_threads(1, 1); 32 | // Load model 33 | session = std::make_shared(env, model_path.c_str(), session_options); 34 | } 35 | 36 | void reset_states() 37 | { 38 | // Call reset before each audio start 39 | std::memset(_h.data(), 0.0f, _h.size() * sizeof(float)); 40 | std::memset(_c.data(), 0.0f, _c.size() * sizeof(float)); 41 | triggerd = false; 42 | temp_end = 0; 43 | current_sample = 0; 44 | } 45 | 46 | // Call it in predict func. if you prefer raw bytes input. 
47 | void bytes_to_float_tensor(const char *pcm_bytes) 48 | { 49 | std::memcpy(input.data(), pcm_bytes, window_size_samples * sizeof(int16_t)); 50 | for (int i = 0; i < window_size_samples; i++) 51 | { 52 | input[i] = static_cast(input[i]) / 32768; // int16_t normalized to float 53 | } 54 | } 55 | 56 | 57 | void predict(const std::vector &data) 58 | { 59 | // bytes_to_float_tensor(data); 60 | 61 | // Infer 62 | // Create ort tensors 63 | input.assign(data.begin(), data.end()); 64 | Ort::Value input_ort = Ort::Value::CreateTensor( 65 | memory_info, input.data(), input.size(), input_node_dims, 2); 66 | Ort::Value sr_ort = Ort::Value::CreateTensor( 67 | memory_info, sr.data(), sr.size(), sr_node_dims, 1); 68 | Ort::Value h_ort = Ort::Value::CreateTensor( 69 | memory_info, _h.data(), _h.size(), hc_node_dims, 3); 70 | Ort::Value c_ort = Ort::Value::CreateTensor( 71 | memory_info, _c.data(), _c.size(), hc_node_dims, 3); 72 | 73 | // Clear and add inputs 74 | ort_inputs.clear(); 75 | ort_inputs.emplace_back(std::move(input_ort)); 76 | ort_inputs.emplace_back(std::move(sr_ort)); 77 | ort_inputs.emplace_back(std::move(h_ort)); 78 | ort_inputs.emplace_back(std::move(c_ort)); 79 | 80 | // Infer 81 | ort_outputs = session->Run( 82 | Ort::RunOptions{nullptr}, 83 | input_node_names.data(), ort_inputs.data(), ort_inputs.size(), 84 | output_node_names.data(), output_node_names.size()); 85 | 86 | // Output probability & update h,c recursively 87 | float output = ort_outputs[0].GetTensorMutableData()[0]; 88 | float *hn = ort_outputs[1].GetTensorMutableData(); 89 | std::memcpy(_h.data(), hn, size_hc * sizeof(float)); 90 | float *cn = ort_outputs[2].GetTensorMutableData(); 91 | std::memcpy(_c.data(), cn, size_hc * sizeof(float)); 92 | 93 | // Push forward sample index 94 | current_sample += window_size_samples; 95 | 96 | // Reset temp_end when > threshold 97 | if ((output >= threshold) && (temp_end != 0)) 98 | { 99 | temp_end = 0; 100 | } 101 | // 1) Silence 102 | if ((output < threshold) && (triggerd == false)) 103 | { 104 | // printf("{ silence: %.3f s }\n", 1.0 * current_sample / sample_rate); 105 | } 106 | // 2) Speaking 107 | if ((output >= (threshold - 0.15)) && (triggerd == true)) 108 | { 109 | // printf("{ speaking_2: %.3f s }\n", 1.0 * current_sample / sample_rate); 110 | } 111 | 112 | // 3) Start 113 | if ((output >= threshold) && (triggerd == false)) 114 | { 115 | triggerd = true; 116 | speech_start = current_sample - window_size_samples - speech_pad_samples; // minus window_size_samples to get precise start time point. 117 | printf("{ start: %.3f s }\n", 1.0 * speech_start / sample_rate); 118 | } 119 | 120 | // 4) End 121 | if ((output < (threshold - 0.15)) && (triggerd == true)) 122 | { 123 | 124 | if (temp_end != 0) 125 | { 126 | temp_end = current_sample; 127 | } 128 | // a. silence < min_slience_samples, continue speaking 129 | if ((current_sample - temp_end) < min_silence_samples) 130 | { 131 | // printf("{ speaking_4: %.3f s }\n", 1.0 * current_sample / sample_rate); 132 | // printf(""); 133 | } 134 | // b. silence >= min_slience_samples, end speaking 135 | else 136 | { 137 | speech_end = current_sample + speech_pad_samples; 138 | temp_end = 0; 139 | triggerd = false; 140 | printf("{ end: %.3f s }\n", 1.0 * speech_end / sample_rate); 141 | } 142 | } 143 | 144 | 145 | } 146 | 147 | private: 148 | // model config 149 | int64_t window_size_samples; // Assign when init, support 256 512 768 for 8k; 512 1024 1536 for 16k. 
150 | int sample_rate; 151 | int sr_per_ms; // Assign when init, support 8 or 16 152 | float threshold; 153 | int min_silence_samples; // sr_per_ms * #ms 154 | int speech_pad_samples; // usually a 155 | 156 | // model states 157 | bool triggerd = false; 158 | unsigned int speech_start = 0; 159 | unsigned int speech_end = 0; 160 | unsigned int temp_end = 0; 161 | unsigned int current_sample = 0; 162 | // MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes 163 | float output; 164 | 165 | // Onnx model 166 | // Inputs 167 | std::vector ort_inputs; 168 | 169 | std::vector input_node_names = {"input", "sr", "h", "c"}; 170 | std::vector input; 171 | std::vector sr; 172 | unsigned int size_hc = 2 * 1 * 64; // It's FIXED. 173 | std::vector _h; 174 | std::vector _c; 175 | 176 | int64_t input_node_dims[2] = {}; 177 | const int64_t sr_node_dims[1] = {1}; 178 | const int64_t hc_node_dims[3] = {2, 1, 64}; 179 | 180 | // Outputs 181 | std::vector ort_outputs; 182 | std::vector output_node_names = {"output", "hn", "cn"}; 183 | 184 | 185 | public: 186 | // Construction 187 | VadIterator(const std::string ModelPath, 188 | int Sample_rate, int frame_size, 189 | float Threshold, int min_silence_duration_ms, int speech_pad_ms) 190 | { 191 | init_onnx_model(ModelPath); 192 | sample_rate = Sample_rate; 193 | sr_per_ms = sample_rate / 1000; 194 | threshold = Threshold; 195 | min_silence_samples = sr_per_ms * min_silence_duration_ms; 196 | speech_pad_samples = sr_per_ms * speech_pad_ms; 197 | window_size_samples = frame_size * sr_per_ms; 198 | 199 | input.resize(window_size_samples); 200 | input_node_dims[0] = 1; 201 | input_node_dims[1] = window_size_samples; 202 | // std::cout << "== Input size" << input.size() << std::endl; 203 | _h.resize(size_hc); 204 | _c.resize(size_hc); 205 | sr.resize(1); 206 | } 207 | 208 | }; 209 | 210 | int main() 211 | { 212 | 213 | // Read wav 214 | wav::WavReader wav_reader("./test_for_vad.wav"); 215 | std::vector data(wav_reader.num_samples()); 216 | std::vector input_wav(wav_reader.num_samples()); 217 | 218 | for (int i = 0; i < wav_reader.num_samples(); i++) 219 | { 220 | data[i] = static_cast(*(wav_reader.data() + i)); 221 | } 222 | 223 | for (int i = 0; i < wav_reader.num_samples(); i++) 224 | { 225 | input_wav[i] = static_cast(data[i]) / 32768; 226 | } 227 | 228 | // ===== Test configs ===== 229 | std::string path = "../files/silero_vad.onnx"; 230 | int test_sr = 8000; 231 | int test_frame_ms = 64; 232 | float test_threshold = 0.5f; 233 | int test_min_silence_duration_ms = 0; 234 | int test_speech_pad_ms = 0; 235 | int test_window_samples = test_frame_ms * (test_sr/1000); 236 | 237 | VadIterator vad( 238 | path, test_sr, test_frame_ms, test_threshold, 239 | test_min_silence_duration_ms, test_speech_pad_ms); 240 | 241 | for (int j = 0; j < wav_reader.num_samples(); j += test_window_samples) 242 | { 243 | // std::cout << "== 4" << std::endl; 244 | std::vector r{&input_wav[0] + j, &input_wav[0] + j + test_window_samples}; 245 | auto start = std::chrono::high_resolution_clock::now(); 246 | // Predict and print throughout process time 247 | vad.predict(r); 248 | auto end = std::chrono::high_resolution_clock::now(); 249 | auto elapsed_time = std::chrono::duration_cast(end-start); 250 | // std::cout << "== Elapsed time: " << 1.0*elapsed_time.count()/1000000 << "ms" << " ==" < 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | // #include "utils/log.h" 28 | 29 | namespace wav { 30 | 31 | struct WavHeader { 32 | char riff[4]; // "riff" 
33 | unsigned int size; 34 | char wav[4]; // "WAVE" 35 | char fmt[4]; // "fmt " 36 | unsigned int fmt_size; 37 | uint16_t format; 38 | uint16_t channels; 39 | unsigned int sample_rate; 40 | unsigned int bytes_per_second; 41 | uint16_t block_size; 42 | uint16_t bit; 43 | char data[4]; // "data" 44 | unsigned int data_size; 45 | }; 46 | 47 | class WavReader { 48 | public: 49 | WavReader() : data_(nullptr) {} 50 | explicit WavReader(const std::string& filename) { Open(filename); } 51 | 52 | bool Open(const std::string& filename) { 53 | FILE* fp = fopen(filename.c_str(), "rb"); //文件读取 54 | if (NULL == fp) { 55 | std::cout << "Error in read " << filename; 56 | return false; 57 | } 58 | 59 | WavHeader header; 60 | fread(&header, 1, sizeof(header), fp); 61 | if (header.fmt_size < 16) { 62 | fprintf(stderr, 63 | "WaveData: expect PCM format data " 64 | "to have fmt chunk of at least size 16.\n"); 65 | return false; 66 | } else if (header.fmt_size > 16) { 67 | int offset = 44 - 8 + header.fmt_size - 16; 68 | fseek(fp, offset, SEEK_SET); 69 | fread(header.data, 8, sizeof(char), fp); 70 | } 71 | // check "riff" "WAVE" "fmt " "data" 72 | 73 | // Skip any sub-chunks between "fmt" and "data". Usually there will 74 | // be a single "fact" sub chunk, but on Windows there can also be a 75 | // "list" sub chunk. 76 | while (0 != strncmp(header.data, "data", 4)) { 77 | // We will just ignore the data in these chunks. 78 | fseek(fp, header.data_size, SEEK_CUR); 79 | // read next sub chunk 80 | fread(header.data, 8, sizeof(char), fp); 81 | } 82 | 83 | num_channel_ = header.channels; 84 | sample_rate_ = header.sample_rate; 85 | bits_per_sample_ = header.bit; 86 | int num_data = header.data_size / (bits_per_sample_ / 8); 87 | data_ = new float[num_data]; // Create 1-dim array 88 | num_samples_ = num_data / num_channel_; 89 | 90 | for (int i = 0; i < num_data; ++i) { 91 | switch (bits_per_sample_) { 92 | case 8: { 93 | char sample; 94 | fread(&sample, 1, sizeof(char), fp); 95 | data_[i] = static_cast(sample); 96 | break; 97 | } 98 | case 16: { 99 | int16_t sample; 100 | fread(&sample, 1, sizeof(int16_t), fp); 101 | // std::cout << sample; 102 | data_[i] = static_cast(sample); 103 | // std::cout << data_[i]; 104 | break; 105 | } 106 | case 32: { 107 | int sample; 108 | fread(&sample, 1, sizeof(int), fp); 109 | data_[i] = static_cast(sample); 110 | break; 111 | } 112 | default: 113 | fprintf(stderr, "unsupported quantization bits"); 114 | exit(1); 115 | } 116 | } 117 | fclose(fp); 118 | return true; 119 | } 120 | 121 | int num_channel() const { return num_channel_; } 122 | int sample_rate() const { return sample_rate_; } 123 | int bits_per_sample() const { return bits_per_sample_; } 124 | int num_samples() const { return num_samples_; } 125 | 126 | ~WavReader() { 127 | delete[] data_; 128 | } 129 | 130 | const float* data() const { return data_; } 131 | 132 | private: 133 | int num_channel_; 134 | int sample_rate_; 135 | int bits_per_sample_; 136 | int num_samples_; // sample points per channel 137 | float* data_; 138 | }; 139 | 140 | class WavWriter { 141 | public: 142 | WavWriter(const float* data, int num_samples, int num_channel, 143 | int sample_rate, int bits_per_sample) 144 | : data_(data), 145 | num_samples_(num_samples), 146 | num_channel_(num_channel), 147 | sample_rate_(sample_rate), 148 | bits_per_sample_(bits_per_sample) {} 149 | 150 | void Write(const std::string& filename) { 151 | FILE* fp = fopen(filename.c_str(), "w"); 152 | // init char 'riff' 'WAVE' 'fmt ' 'data' 153 | WavHeader header; 154 | 
char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57, 155 | 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00, 156 | 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 157 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 158 | 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00}; 159 | memcpy(&header, wav_header, sizeof(header)); 160 | header.channels = num_channel_; 161 | header.bit = bits_per_sample_; 162 | header.sample_rate = sample_rate_; 163 | header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8); 164 | header.size = sizeof(header) - 8 + header.data_size; 165 | header.bytes_per_second = 166 | sample_rate_ * num_channel_ * (bits_per_sample_ / 8); 167 | header.block_size = num_channel_ * (bits_per_sample_ / 8); 168 | 169 | fwrite(&header, 1, sizeof(header), fp); 170 | 171 | for (int i = 0; i < num_samples_; ++i) { 172 | for (int j = 0; j < num_channel_; ++j) { 173 | switch (bits_per_sample_) { 174 | case 8: { 175 | char sample = static_cast(data_[i * num_channel_ + j]); 176 | fwrite(&sample, 1, sizeof(sample), fp); 177 | break; 178 | } 179 | case 16: { 180 | int16_t sample = static_cast(data_[i * num_channel_ + j]); 181 | fwrite(&sample, 1, sizeof(sample), fp); 182 | break; 183 | } 184 | case 32: { 185 | int sample = static_cast(data_[i * num_channel_ + j]); 186 | fwrite(&sample, 1, sizeof(sample), fp); 187 | break; 188 | } 189 | } 190 | } 191 | } 192 | fclose(fp); 193 | } 194 | 195 | private: 196 | const float* data_; 197 | int num_samples_; // total float points in data_ 198 | int num_channel_; 199 | int sample_rate_; 200 | int bits_per_sample_; 201 | }; 202 | 203 | } // namespace wenet 204 | 205 | #endif // FRONTEND_WAV_H_ 206 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/README.md: -------------------------------------------------------------------------------- 1 | 2 | In this example, an integration with the microphone and the webRTC VAD has been done. I used [this](https://github.com/mozilla/DeepSpeech-examples/tree/r0.8/mic_vad_streaming) as a draft. 3 | Here a short video to present the results: 4 | 5 | https://user-images.githubusercontent.com/28188499/116685087-182ff100-a9b2-11eb-927d-ed9f621226ee.mp4 6 | 7 | # Requirements: 8 | The libraries used for the following example are: 9 | ``` 10 | Python == 3.6.9 11 | webrtcvad >= 2.0.10 12 | torchaudio >= 0.8.1 13 | torch >= 1.8.1 14 | halo >= 0.0.31 15 | Soundfile >= 0.13.3 16 | ``` 17 | Using pip3: 18 | ``` 19 | pip3 install webrtcvad 20 | pip3 install torchaudio 21 | pip3 install torch 22 | pip3 install halo 23 | pip3 install soundfile 24 | ``` 25 | Moreover, to make the code easier, the default sample_rate is 16KHz without resampling. 26 | 27 | This example has been tested on ``` ubuntu 18.04.3 LTS``` 28 | 29 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/microphone_and_webRTC_integration.py: -------------------------------------------------------------------------------- 1 | import collections, queue 2 | import numpy as np 3 | import pyaudio 4 | import webrtcvad 5 | from halo import Halo 6 | import torch 7 | import torchaudio 8 | 9 | class Audio(object): 10 | """Streams raw audio from microphone. 
Data is received in a separate thread, and stored in a buffer, to be read from.""" 11 | 12 | FORMAT = pyaudio.paInt16 13 | # Network/VAD rate-space 14 | RATE_PROCESS = 16000 15 | CHANNELS = 1 16 | BLOCKS_PER_SECOND = 50 17 | 18 | def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS): 19 | def proxy_callback(in_data, frame_count, time_info, status): 20 | #pylint: disable=unused-argument 21 | callback(in_data) 22 | return (None, pyaudio.paContinue) 23 | if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data) 24 | self.buffer_queue = queue.Queue() 25 | self.device = device 26 | self.input_rate = input_rate 27 | self.sample_rate = self.RATE_PROCESS 28 | self.block_size = int(self.RATE_PROCESS / float(self.BLOCKS_PER_SECOND)) 29 | self.block_size_input = int(self.input_rate / float(self.BLOCKS_PER_SECOND)) 30 | self.pa = pyaudio.PyAudio() 31 | 32 | kwargs = { 33 | 'format': self.FORMAT, 34 | 'channels': self.CHANNELS, 35 | 'rate': self.input_rate, 36 | 'input': True, 37 | 'frames_per_buffer': self.block_size_input, 38 | 'stream_callback': proxy_callback, 39 | } 40 | 41 | self.chunk = None 42 | # if not default device 43 | if self.device: 44 | kwargs['input_device_index'] = self.device 45 | 46 | self.stream = self.pa.open(**kwargs) 47 | self.stream.start_stream() 48 | 49 | def read(self): 50 | """Return a block of audio data, blocking if necessary.""" 51 | return self.buffer_queue.get() 52 | 53 | def destroy(self): 54 | self.stream.stop_stream() 55 | self.stream.close() 56 | self.pa.terminate() 57 | 58 | frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate) 59 | 60 | 61 | class VADAudio(Audio): 62 | """Filter & segment audio with voice activity detection.""" 63 | 64 | def __init__(self, aggressiveness=3, device=None, input_rate=None): 65 | super().__init__(device=device, input_rate=input_rate) 66 | self.vad = webrtcvad.Vad(aggressiveness) 67 | 68 | def frame_generator(self): 69 | """Generator that yields all audio frames from microphone.""" 70 | if self.input_rate == self.RATE_PROCESS: 71 | while True: 72 | yield self.read() 73 | else: 74 | raise Exception("Resampling required") 75 | 76 | def vad_collector(self, padding_ms=300, ratio=0.75, frames=None): 77 | """Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None. 78 | Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered. 79 | Example: (frame, ..., frame, None, frame, ..., frame, None, ...) 
80 | |---utterence---| |---utterence---| 81 | """ 82 | if frames is None: frames = self.frame_generator() 83 | num_padding_frames = padding_ms // self.frame_duration_ms 84 | ring_buffer = collections.deque(maxlen=num_padding_frames) 85 | triggered = False 86 | 87 | for frame in frames: 88 | if len(frame) < 640: 89 | return 90 | 91 | is_speech = self.vad.is_speech(frame, self.sample_rate) 92 | 93 | if not triggered: 94 | ring_buffer.append((frame, is_speech)) 95 | num_voiced = len([f for f, speech in ring_buffer if speech]) 96 | if num_voiced > ratio * ring_buffer.maxlen: 97 | triggered = True 98 | for f, s in ring_buffer: 99 | yield f 100 | ring_buffer.clear() 101 | 102 | else: 103 | yield frame 104 | ring_buffer.append((frame, is_speech)) 105 | num_unvoiced = len([f for f, speech in ring_buffer if not speech]) 106 | if num_unvoiced > ratio * ring_buffer.maxlen: 107 | triggered = False 108 | yield None 109 | ring_buffer.clear() 110 | 111 | def main(ARGS): 112 | # Start audio with VAD 113 | vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness, 114 | device=ARGS.device, 115 | input_rate=ARGS.rate) 116 | 117 | print("Listening (ctrl-C to exit)...") 118 | frames = vad_audio.vad_collector() 119 | 120 | # load silero VAD 121 | torchaudio.set_audio_backend("soundfile") 122 | model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', 123 | model=ARGS.silaro_model_name, 124 | force_reload= ARGS.reload) 125 | (get_speech_ts,_,_, _,_, _, _) = utils 126 | 127 | 128 | # Stream from microphone to DeepSpeech using VAD 129 | spinner = None 130 | if not ARGS.nospinner: 131 | spinner = Halo(spinner='line') 132 | wav_data = bytearray() 133 | for frame in frames: 134 | if frame is not None: 135 | if spinner: spinner.start() 136 | 137 | wav_data.extend(frame) 138 | else: 139 | if spinner: spinner.stop() 140 | print("webRTC has detected a possible speech") 141 | 142 | newsound= np.frombuffer(wav_data,np.int16) 143 | audio_float32=Int2Float(newsound) 144 | time_stamps =get_speech_ts(audio_float32, model,num_steps=ARGS.num_steps,trig_sum=ARGS.trig_sum,neg_trig_sum=ARGS.neg_trig_sum, 145 | num_samples_per_window=ARGS.num_samples_per_window,min_speech_samples=ARGS.min_speech_samples, 146 | min_silence_samples=ARGS.min_silence_samples) 147 | 148 | if(len(time_stamps)>0): 149 | print("silero VAD has detected a possible speech") 150 | else: 151 | print("silero VAD has detected a noise") 152 | print() 153 | wav_data = bytearray() 154 | 155 | 156 | def Int2Float(sound): 157 | _sound = np.copy(sound) # 158 | abs_max = np.abs(_sound).max() 159 | _sound = _sound.astype('float32') 160 | if abs_max > 0: 161 | _sound *= 1/abs_max 162 | audio_float32 = torch.from_numpy(_sound.squeeze()) 163 | return audio_float32 164 | 165 | if __name__ == '__main__': 166 | DEFAULT_SAMPLE_RATE = 16000 167 | 168 | import argparse 169 | parser = argparse.ArgumentParser(description="Stream from microphone to webRTC and silero VAD") 170 | 171 | parser.add_argument('-v', '--webRTC_aggressiveness', type=int, default=3, 172 | help="Set aggressiveness of webRTC: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3") 173 | parser.add_argument('--nospinner', action='store_true', 174 | help="Disable spinner") 175 | parser.add_argument('-d', '--device', type=int, default=None, 176 | help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). 
If not provided, falls back to PyAudio.get_default_device().") 177 | 178 | parser.add_argument('-name', '--silaro_model_name', type=str, default="silero_vad", 179 | help="select the name of the model. You can select between 'silero_vad',''silero_vad_micro','silero_vad_micro_8k','silero_vad_mini','silero_vad_mini_8k'") 180 | parser.add_argument('--reload', action='store_true',help="download the last version of the silero vad") 181 | 182 | parser.add_argument('-ts', '--trig_sum', type=float, default=0.25, 183 | help="overlapping windows are used for each audio chunk, trig sum defines average probability among those windows for switching into triggered state (speech state)") 184 | 185 | parser.add_argument('-nts', '--neg_trig_sum', type=float, default=0.07, 186 | help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)") 187 | 188 | parser.add_argument('-N', '--num_steps', type=int, default=8, 189 | help="nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8)") 190 | 191 | parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000, 192 | help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)") 193 | 194 | parser.add_argument('-msps', '--min_speech_samples', type=int, default=10000, 195 | help="minimum speech chunk duration in samples") 196 | 197 | parser.add_argument('-msis', '--min_silence_samples', type=int, default=500, 198 | help=" minimum silence duration in samples between to separate speech chunks") 199 | ARGS = parser.parse_args() 200 | ARGS.rate=DEFAULT_SAMPLE_RATE 201 | main(ARGS) -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/pyaudio-streaming/README.md: -------------------------------------------------------------------------------- 1 | # Pyaudio Streaming Example 2 | 3 | This example notebook shows how micophone audio fetched by pyaudio can be processed with Silero-VAD. 4 | 5 | It has been designed as a low-level example for binary real-time streaming using only the prediction of the model, processing the binary data and plotting the speech probabilities at the end to visualize it. 6 | 7 | Currently, the notebook consits of two examples: 8 | - One that records audio of a predefined length from the microphone, process it with Silero-VAD, and plots it afterwards. 9 | - The other one plots the speech probabilities in real-time (using jupyterplot) and records the audio until you press enter. 10 | 11 | ## Example Video for the Real-Time Visualization 12 | 13 | 14 | https://user-images.githubusercontent.com/8079748/117580455-4622dd00-b0f8-11eb-858d-e6368ed4eada.mp4 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "62a0cccb", 6 | "metadata": {}, 7 | "source": [ 8 | "# Pyaudio Microphone Streaming Examples\n", 9 | "\n", 10 | "A simple notebook that uses pyaudio to get the microphone audio and feeds this audio then to Silero VAD.\n", 11 | "\n", 12 | "I created it as an example on how binary data from a stream could be feed into Silero VAD.\n", 13 | "\n", 14 | "\n", 15 | "Has been tested on Ubuntu 21.04 (x86). 
After you installed the dependencies below, no additional setup is required." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "64cbe1eb", 21 | "metadata": {}, 22 | "source": [ 23 | "## Dependencies\n", 24 | "The cell below lists all used dependencies and the used versions. Uncomment to install them from within the notebook." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "57bc2aac", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "#!pip install numpy==1.20.2\n", 35 | "#!pip install torch==1.9.0\n", 36 | "#!pip install matplotlib==3.4.2\n", 37 | "#!pip install torchaudio==0.9.0\n", 38 | "#!pip install soundfile==0.10.3.post1\n", 39 | "#!pip install pyaudio==0.2.11" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "110de761", 45 | "metadata": {}, 46 | "source": [ 47 | "## Imports" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "5a647d8d", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "import io\n", 58 | "import numpy as np\n", 59 | "import torch\n", 60 | "torch.set_num_threads(1)\n", 61 | "import torchaudio\n", 62 | "import matplotlib\n", 63 | "import matplotlib.pylab as plt\n", 64 | "torchaudio.set_audio_backend(\"soundfile\")\n", 65 | "import pyaudio" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "725d7066", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 76 | " model='silero_vad',\n", 77 | " force_reload=True)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "1c0b2ea7", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "(get_speech_timestamps,\n", 88 | " save_audio,\n", 89 | " read_audio,\n", 90 | " VADIterator,\n", 91 | " collect_chunks) = utils" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "f9112603", 97 | "metadata": {}, 98 | "source": [ 99 | "### Helper Methods" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "5abc6330", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# Taken from utils_vad.py\n", 110 | "def validate(model,\n", 111 | " inputs: torch.Tensor):\n", 112 | " with torch.no_grad():\n", 113 | " outs = model(inputs)\n", 114 | " return outs\n", 115 | "\n", 116 | "# Provided by Alexander Veysov\n", 117 | "def int2float(sound):\n", 118 | " abs_max = np.abs(sound).max()\n", 119 | " sound = sound.astype('float32')\n", 120 | " if abs_max > 0:\n", 121 | " sound *= 1/abs_max\n", 122 | " sound = sound.squeeze() # depends on the use case\n", 123 | " return sound" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "5124095e", 129 | "metadata": {}, 130 | "source": [ 131 | "## Pyaudio Set-up" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "a845356e", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "FORMAT = pyaudio.paInt16\n", 142 | "CHANNELS = 1\n", 143 | "SAMPLE_RATE = 16000\n", 144 | "CHUNK = int(SAMPLE_RATE / 10)\n", 145 | "\n", 146 | "audio = pyaudio.PyAudio()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "0b910c99", 152 | "metadata": {}, 153 | "source": [ 154 | "## Simple Example\n", 155 | "The following example reads the audio as 250ms chunks from the microphone, converts them to a Pytorch Tensor, and gets the probabilities/confidences if the model thinks the frame is 
voiced." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "9d3d2c10", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "num_samples = 1536" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "3cb44a4a", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "stream = audio.open(format=FORMAT,\n", 176 | " channels=CHANNELS,\n", 177 | " rate=SAMPLE_RATE,\n", 178 | " input=True,\n", 179 | " frames_per_buffer=CHUNK)\n", 180 | "data = []\n", 181 | "voiced_confidences = []\n", 182 | "\n", 183 | "print(\"Started Recording\")\n", 184 | "for i in range(0, frames_to_record):\n", 185 | " \n", 186 | " audio_chunk = stream.read(num_samples)\n", 187 | " \n", 188 | " # in case you want to save the audio later\n", 189 | " data.append(audio_chunk)\n", 190 | " \n", 191 | " audio_int16 = np.frombuffer(audio_chunk, np.int16);\n", 192 | "\n", 193 | " audio_float32 = int2float(audio_int16)\n", 194 | " \n", 195 | " # get the confidences and add them to the list to plot them later\n", 196 | " new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n", 197 | " voiced_confidences.append(new_confidence)\n", 198 | " \n", 199 | "print(\"Stopped the recording\")\n", 200 | "\n", 201 | "# plot the confidences for the speech\n", 202 | "plt.figure(figsize=(20,6))\n", 203 | "plt.plot(voiced_confidences)\n", 204 | "plt.show()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "a3dda982", 210 | "metadata": {}, 211 | "source": [ 212 | "## Real Time Visualization\n", 213 | "\n", 214 | "As an enhancement to plot the speech probabilities in real time I added the implementation below.\n", 215 | "In contrast to the simeple one, it records the audio until to stop the recording by pressing enter.\n", 216 | "While looking into good ways to update matplotlib plots in real-time, I found a simple libarary that does the job. 
https://github.com/lvwerra/jupyterplot It has some limitations, but works for this use case really well.\n" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "05ef4100", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "#!pip install jupyterplot==0.0.3" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "id": "d1d4cdd6", 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "from jupyterplot import ProgressPlot\n", 237 | "import threading\n", 238 | "\n", 239 | "continue_recording = True\n", 240 | "\n", 241 | "def stop():\n", 242 | " input(\"Press Enter to stop the recording:\")\n", 243 | " global continue_recording\n", 244 | " continue_recording = False\n", 245 | "\n", 246 | "def start_recording():\n", 247 | " \n", 248 | " stream = audio.open(format=FORMAT,\n", 249 | " channels=CHANNELS,\n", 250 | " rate=SAMPLE_RATE,\n", 251 | " input=True,\n", 252 | " frames_per_buffer=CHUNK)\n", 253 | "\n", 254 | " data = []\n", 255 | " voiced_confidences = []\n", 256 | " \n", 257 | " global continue_recording\n", 258 | " continue_recording = True\n", 259 | " \n", 260 | " pp = ProgressPlot(plot_names=[\"Silero VAD\"],line_names=[\"speech probabilities\"], x_label=\"audio chunks\")\n", 261 | " \n", 262 | " stop_listener = threading.Thread(target=stop)\n", 263 | " stop_listener.start()\n", 264 | "\n", 265 | " while continue_recording:\n", 266 | " \n", 267 | " audio_chunk = stream.read(num_samples)\n", 268 | " \n", 269 | " # in case you want to save the audio later\n", 270 | " data.append(audio_chunk)\n", 271 | " \n", 272 | " audio_int16 = np.frombuffer(audio_chunk, np.int16);\n", 273 | "\n", 274 | " audio_float32 = int2float(audio_int16)\n", 275 | " \n", 276 | " # get the confidences and add them to the list to plot them later\n", 277 | " new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n", 278 | " voiced_confidences.append(new_confidence)\n", 279 | " \n", 280 | " pp.update(new_confidence)\n", 281 | "\n", 282 | "\n", 283 | " pp.finalize()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "1e398009", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "start_recording()" 294 | ] 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 3", 300 | "language": "python", 301 | "name": "python3" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 3 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython3", 313 | "version": "3.7.10" 314 | }, 315 | "toc": { 316 | "base_numbering": 1, 317 | "nav_menu": {}, 318 | "number_sections": true, 319 | "sideBar": true, 320 | "skip_h1_title": false, 321 | "title_cell": "Table of Contents", 322 | "title_sidebar": "Contents", 323 | "toc_cell": false, 324 | "toc_position": {}, 325 | "toc_section_display": true, 326 | "toc_window_display": false 327 | } 328 | }, 329 | "nbformat": 4, 330 | "nbformat_minor": 5 331 | } 332 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/lang_dict_95.json: -------------------------------------------------------------------------------- 1 | {"59": "mg, Malagasy", "76": "tk, Turkmen", "20": "lb, Luxembourgish, Letzeburgesch", "62": "or, Oriya", "30": "en, English", "26": "oc, Occitan", "69": "no, 
Norwegian", "77": "sr, Serbian", "90": "bs, Bosnian", "71": "el, Greek, Modern (1453\u2013)", "15": "az, Azerbaijani", "12": "lo, Lao", "85": "zh-HK, Chinese", "79": "cs, Czech", "43": "sv, Swedish", "37": "mn, Mongolian", "32": "fi, Finnish", "51": "tg, Tajik", "46": "am, Amharic", "17": "nn, Norwegian Nynorsk", "40": "ja, Japanese", "8": "it, Italian", "21": "ha, Hausa", "11": "as, Assamese", "29": "fa, Persian", "82": "bn, Bengali", "54": "mk, Macedonian", "31": "sw, Swahili", "45": "vi, Vietnamese", "41": "ur, Urdu", "74": "bo, Tibetan", "4": "hi, Hindi", "86": "mr, Marathi", "3": "fy-NL, Western Frisian", "65": "sk, Slovak", "2": "ln, Lingala", "92": "gl, Galician", "53": "sn, Shona", "87": "su, Sundanese", "35": "tt, Tatar", "93": "kn, Kannada", "6": "yo, Yoruba", "27": "ps, Pashto, Pushto", "34": "hy, Armenian", "25": "pa-IN, Punjabi, Panjabi", "23": "nl, Dutch, Flemish", "48": "th, Thai", "73": "mt, Maltese", "55": "ar, Arabic", "89": "ba, Bashkir", "78": "bg, Bulgarian", "42": "yi, Yiddish", "5": "ru, Russian", "84": "sv-SE, Swedish", "80": "tr, Turkish", "33": "sq, Albanian", "38": "kk, Kazakh", "50": "pl, Polish", "9": "hr, Croatian", "66": "ky, Kirghiz, Kyrgyz", "49": "hu, Hungarian", "10": "si, Sinhala, Sinhalese", "56": "la, Latin", "75": "de, German", "14": "ko, Korean", "22": "id, Indonesian", "47": "sl, Slovenian", "57": "be, Belarusian", "36": "ta, Tamil", "7": "da, Danish", "91": "sd, Sindhi", "28": "et, Estonian", "63": "pt, Portuguese", "60": "ne, Nepali", "94": "zh-TW, Chinese", "18": "zh-CN, Chinese", "88": "rw, Kinyarwanda", "19": "es, Spanish, Castilian", "39": "ht, Haitian, Haitian Creole", "64": "tl, Tagalog", "83": "ms, Malay", "70": "ro, Romanian, Moldavian, Moldovan", "68": "pa, Punjabi, Panjabi", "52": "uz, Uzbek", "58": "km, Central Khmer", "67": "my, Burmese", "0": "fr, French", "24": "af, Afrikaans", "16": "gu, Gujarati", "81": "so, Somali", "13": "uk, Ukrainian", "44": "ca, Catalan, Valencian", "72": "ml, Malayalam", "61": "te, Telugu", "1": "zh, Chinese"} -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/lang_group_dict_95.json: -------------------------------------------------------------------------------- 1 | {"0": ["Afrikaans", "Dutch, Flemish", "Western Frisian"], "1": ["Turkish", "Azerbaijani"], "2": ["Russian", "Slovak", "Ukrainian", "Czech", "Polish", "Belarusian"], "3": ["Bulgarian", "Macedonian", "Serbian", "Croatian", "Bosnian", "Slovenian"], "4": ["Norwegian Nynorsk", "Swedish", "Danish", "Norwegian"], "5": ["English"], "6": ["Finnish", "Estonian"], "7": ["Yiddish", "Luxembourgish, Letzeburgesch", "German"], "8": ["Spanish", "Occitan", "Portuguese", "Catalan, Valencian", "Galician", "Spanish, Castilian", "Italian"], "9": ["Maltese", "Arabic"], "10": ["Marathi"], "11": ["Hindi", "Urdu"], "12": ["Lao", "Thai"], "13": ["Malay", "Indonesian"], "14": ["Romanian, Moldavian, Moldovan"], "15": ["Tagalog"], "16": ["Tajik", "Persian"], "17": ["Kazakh", "Uzbek", "Kirghiz, Kyrgyz"], "18": ["Kinyarwanda"], "19": ["Tatar", "Bashkir"], "20": ["French"], "21": ["Chinese"], "22": ["Lingala"], "23": ["Yoruba"], "24": ["Sinhala, Sinhalese"], "25": ["Assamese"], "26": ["Korean"], "27": ["Gujarati"], "28": ["Hausa"], "29": ["Punjabi, Panjabi"], "30": ["Pashto, Pushto"], "31": ["Swahili"], "32": ["Albanian"], "33": ["Armenian"], "34": ["Mongolian"], "35": ["Tamil"], "36": ["Haitian, Haitian Creole"], "37": ["Japanese"], "38": ["Vietnamese"], "39": ["Amharic"], "40": ["Hungarian"], "41": ["Shona"], 
"42": ["Latin"], "43": ["Central Khmer"], "44": ["Malagasy"], "45": ["Nepali"], "46": ["Telugu"], "47": ["Oriya"], "48": ["Burmese"], "49": ["Greek, Modern (1453\u2013)"], "50": ["Malayalam"], "51": ["Tibetan"], "52": ["Turkmen"], "53": ["Somali"], "54": ["Bengali"], "55": ["Sundanese"], "56": ["Sindhi"], "57": ["Kannada"]} -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/silero_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/snakers4_silero-vad_master/files/silero_logo.jpg -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/silero_vad.jit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/snakers4_silero-vad_master/files/silero_vad.jit -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/silero_vad.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/snakers4_silero-vad_master/files/silero_vad.onnx -------------------------------------------------------------------------------- /snakers4_silero-vad_master/hubconf.py: -------------------------------------------------------------------------------- 1 | dependencies = ['torch', 'torchaudio'] 2 | import torch 3 | import json 4 | from utils_vad import (init_jit_model, 5 | get_speech_timestamps, 6 | get_number_ts, 7 | get_language, 8 | get_language_and_group, 9 | save_audio, 10 | read_audio, 11 | VADIterator, 12 | collect_chunks, 13 | drop_chunks, 14 | Validator, 15 | OnnxWrapper) 16 | 17 | 18 | def versiontuple(v): 19 | return tuple(map(int, (v.split('+')[0].split(".")))) 20 | 21 | 22 | def silero_vad(onnx=False, force_onnx_cpu=False, silero_vad_source='github'): 23 | """Silero Voice Activity Detector 24 | Returns a model with a set of utils 25 | Please see https://github.com/snakers4/silero-vad for usage examples 26 | """ 27 | 28 | if not onnx: 29 | installed_version = torch.__version__ 30 | supported_version = '1.12.0' 31 | if versiontuple(installed_version) < versiontuple(supported_version): 32 | raise Exception(f'Please install torch {supported_version} or greater ({installed_version} installed)') 33 | import os 34 | import sys 35 | hub_dir = torch.hub.get_dir() if silero_vad_source == 'github' else os.path.dirname(sys.executable) 36 | if onnx: 37 | model = OnnxWrapper(f'{hub_dir}/snakers4_silero-vad_master/files/silero_vad.onnx', force_onnx_cpu) 38 | else: 39 | model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/silero_vad.jit') 40 | utils = (get_speech_timestamps, 41 | save_audio, 42 | read_audio, 43 | VADIterator, 44 | collect_chunks) 45 | 46 | return model, utils 47 | 48 | 49 | def silero_number_detector(onnx=False, force_onnx_cpu=False): 50 | """Silero Number Detector 51 | Returns a model with a set of utils 52 | Please see https://github.com/snakers4/silero-vad for usage examples 53 | """ 54 | if onnx: 55 | url = 'https://models.silero.ai/vad_models/number_detector.onnx' 56 | else: 57 | url = 'https://models.silero.ai/vad_models/number_detector.jit' 58 | model = Validator(url, force_onnx_cpu) 59 | utils = (get_number_ts, 60 | save_audio, 61 | 
read_audio, 62 | collect_chunks, 63 | drop_chunks) 64 | 65 | return model, utils 66 | 67 | 68 | def silero_lang_detector(onnx=False, force_onnx_cpu=False): 69 | """Silero Language Classifier 70 | Returns a model with a set of utils 71 | Please see https://github.com/snakers4/silero-vad for usage examples 72 | """ 73 | if onnx: 74 | url = 'https://models.silero.ai/vad_models/number_detector.onnx' 75 | else: 76 | url = 'https://models.silero.ai/vad_models/number_detector.jit' 77 | model = Validator(url, force_onnx_cpu) 78 | utils = (get_language, 79 | read_audio) 80 | 81 | return model, utils 82 | 83 | 84 | def silero_lang_detector_95(onnx=False, force_onnx_cpu=False): 85 | """Silero Language Classifier (95 languages) 86 | Returns a model with a set of utils 87 | Please see https://github.com/snakers4/silero-vad for usage examples 88 | """ 89 | 90 | hub_dir = torch.hub.get_dir() 91 | if onnx: 92 | url = 'https://models.silero.ai/vad_models/lang_classifier_95.onnx' 93 | else: 94 | url = 'https://models.silero.ai/vad_models/lang_classifier_95.jit' 95 | model = Validator(url, force_onnx_cpu) 96 | 97 | with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_dict_95.json', 'r') as f: 98 | lang_dict = json.load(f) 99 | 100 | with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_group_dict_95.json', 'r') as f: 101 | lang_group_dict = json.load(f) 102 | 103 | utils = (get_language_and_group, read_audio) 104 | 105 | return model, lang_dict, lang_group_dict, utils 106 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/silero-vad.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "FpMplOCA2Fwp" 7 | }, 8 | "source": [ 9 | "#VAD" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "heading_collapsed": true, 16 | "id": "62A6F_072Fwq" 17 | }, 18 | "source": [ 19 | "## Install Dependencies" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "hidden": true, 27 | "id": "5w5AkskZ2Fwr" 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "#@title Install and Import Dependencies\n", 32 | "\n", 33 | "# this assumes that you have a relevant version of PyTorch installed\n", 34 | "!pip install -q torchaudio\n", 35 | "\n", 36 | "SAMPLING_RATE = 16000\n", 37 | "\n", 38 | "import torch\n", 39 | "torch.set_num_threads(1)\n", 40 | "\n", 41 | "from IPython.display import Audio\n", 42 | "from pprint import pprint\n", 43 | "# download example\n", 44 | "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "id": "pSifus5IilRp" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "USE_ONNX = False # change this to True if you want to test onnx model\n", 56 | "if USE_ONNX:\n", 57 | " !pip install -q onnxruntime\n", 58 | " \n", 59 | "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 60 | " model='silero_vad',\n", 61 | " force_reload=True,\n", 62 | " onnx=USE_ONNX)\n", 63 | "\n", 64 | "(get_speech_timestamps,\n", 65 | " save_audio,\n", 66 | " read_audio,\n", 67 | " VADIterator,\n", 68 | " collect_chunks) = utils" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "fXbbaUO3jsrw" 75 | }, 76 | "source": [ 77 | "## Full Audio" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | 
"metadata": { 83 | "id": "RAfJPb_a-Auj" 84 | }, 85 | "source": [ 86 | "**Speech timestapms from full audio**" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "id": "aI_eydBPjsrx" 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n", 98 | "# get speech timestamps from full audio file\n", 99 | "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)\n", 100 | "pprint(speech_timestamps)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "id": "OuEobLchjsry" 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "# merge all speech chunks to one audio\n", 112 | "save_audio('only_speech.wav',\n", 113 | " collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE) \n", 114 | "Audio('only_speech.wav')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "id": "iDKQbVr8jsry" 121 | }, 122 | "source": [ 123 | "## Stream imitation example" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "id": "q-lql_2Wjsry" 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "## using VADIterator class\n", 135 | "\n", 136 | "vad_iterator = VADIterator(model)\n", 137 | "wav = read_audio(f'en_example.wav', sampling_rate=SAMPLING_RATE)\n", 138 | "\n", 139 | "window_size_samples = 1536 # number of samples in a single audio chunk\n", 140 | "for i in range(0, len(wav), window_size_samples):\n", 141 | " chunk = wav[i: i+ window_size_samples]\n", 142 | " if len(chunk) < window_size_samples:\n", 143 | " break\n", 144 | " speech_dict = vad_iterator(chunk, return_seconds=True)\n", 145 | " if speech_dict:\n", 146 | " print(speech_dict, end=' ')\n", 147 | "vad_iterator.reset_states() # reset model states after each audio" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "id": "BX3UgwwB2Fwv" 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "## just probabilities\n", 159 | "\n", 160 | "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n", 161 | "speech_probs = []\n", 162 | "window_size_samples = 1536\n", 163 | "for i in range(0, len(wav), window_size_samples):\n", 164 | " chunk = wav[i: i+ window_size_samples]\n", 165 | " if len(chunk) < window_size_samples:\n", 166 | " break\n", 167 | " speech_prob = model(chunk, SAMPLING_RATE).item()\n", 168 | " speech_probs.append(speech_prob)\n", 169 | "vad_iterator.reset_states() # reset model states after each audio\n", 170 | "\n", 171 | "print(speech_probs[:10]) # first 10 chunks predicts" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "heading_collapsed": true, 178 | "id": "36jY0niD2Fww" 179 | }, 180 | "source": [ 181 | "# Number detector" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "heading_collapsed": true, 188 | "hidden": true, 189 | "id": "scd1DlS42Fwx" 190 | }, 191 | "source": [ 192 | "## Install Dependencies" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "hidden": true, 200 | "id": "Kq5gQuYq2Fwx" 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "#@title Install and Import Dependencies\n", 205 | "\n", 206 | "# this assumes that you have a relevant version of PyTorch installed\n", 207 | "!pip install -q torchaudio\n", 208 | "\n", 209 | "SAMPLING_RATE = 16000\n", 210 | 
"\n", 211 | "import torch\n", 212 | "torch.set_num_threads(1)\n", 213 | "\n", 214 | "from IPython.display import Audio\n", 215 | "from pprint import pprint\n", 216 | "# download example\n", 217 | "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en_num.wav', 'en_number_example.wav')" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "id": "dPwCFHmFycUF" 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "USE_ONNX = False # change this to True if you want to test onnx model\n", 229 | "if USE_ONNX:\n", 230 | " !pip install -q onnxruntime\n", 231 | " \n", 232 | "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 233 | " model='silero_number_detector',\n", 234 | " force_reload=True,\n", 235 | " onnx=USE_ONNX)\n", 236 | "\n", 237 | "(get_number_ts,\n", 238 | " save_audio,\n", 239 | " read_audio,\n", 240 | " collect_chunks,\n", 241 | " drop_chunks) = utils\n" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": { 247 | "heading_collapsed": true, 248 | "hidden": true, 249 | "id": "qhPa30ij2Fwy" 250 | }, 251 | "source": [ 252 | "## Full audio" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "hidden": true, 260 | "id": "EXpau6xq2Fwy" 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "wav = read_audio('en_number_example.wav', sampling_rate=SAMPLING_RATE)\n", 265 | "# get number timestamps from full audio file\n", 266 | "number_timestamps = get_number_ts(wav, model)\n", 267 | "pprint(number_timestamps)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "hidden": true, 275 | "id": "u-KfXRhZ2Fwy" 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "# convert ms in timestamps to samples\n", 280 | "for timestamp in number_timestamps:\n", 281 | " timestamp['start'] = int(timestamp['start'] * SAMPLING_RATE / 1000)\n", 282 | " timestamp['end'] = int(timestamp['end'] * SAMPLING_RATE / 1000)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "hidden": true, 290 | "id": "iwYEC4aZ2Fwy" 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "# merge all number chunks to one audio\n", 295 | "save_audio('only_numbers.wav',\n", 296 | " collect_chunks(number_timestamps, wav), SAMPLING_RATE) \n", 297 | "Audio('only_numbers.wav')" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "hidden": true, 305 | "id": "fHaYejX12Fwy" 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "# drop all number chunks from audio\n", 310 | "save_audio('no_numbers.wav',\n", 311 | " drop_chunks(number_timestamps, wav), SAMPLING_RATE) \n", 312 | "Audio('no_numbers.wav')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": { 318 | "heading_collapsed": true, 319 | "id": "PnKtJKbq2Fwz" 320 | }, 321 | "source": [ 322 | "# Language detector" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "heading_collapsed": true, 329 | "hidden": true, 330 | "id": "F5cAmMbP2Fwz" 331 | }, 332 | "source": [ 333 | "## Install Dependencies" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "hidden": true, 341 | "id": "Zu9D0t6n2Fwz" 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "#@title Install and Import Dependencies\n", 346 | "\n", 347 | "# this assumes that you have a relevant version 
of PyTorch installed\n", 348 | "!pip install -q torchaudio\n", 349 | "\n", 350 | "SAMPLING_RATE = 16000\n", 351 | "\n", 352 | "import torch\n", 353 | "torch.set_num_threads(1)\n", 354 | "\n", 355 | "from IPython.display import Audio\n", 356 | "from pprint import pprint\n", 357 | "# download example\n", 358 | "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "id": "JfRKDZiRztFe" 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "USE_ONNX = False # change this to True if you want to test onnx model\n", 370 | "if USE_ONNX:\n", 371 | " !pip install -q onnxruntime\n", 372 | " \n", 373 | "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 374 | " model='silero_lang_detector',\n", 375 | " force_reload=True,\n", 376 | " onnx=USE_ONNX)\n", 377 | "\n", 378 | "get_language, read_audio = utils" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "heading_collapsed": true, 385 | "hidden": true, 386 | "id": "iC696eMX2Fwz" 387 | }, 388 | "source": [ 389 | "## Full audio" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": { 396 | "hidden": true, 397 | "id": "c8UYnYBF2Fw0" 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n", 402 | "lang = get_language(wav, model)\n", 403 | "print(lang)" 404 | ] 405 | } 406 | ], 407 | "metadata": { 408 | "colab": { 409 | "name": "silero-vad.ipynb", 410 | "provenance": [] 411 | }, 412 | "kernelspec": { 413 | "display_name": "Python 3", 414 | "language": "python", 415 | "name": "python3" 416 | }, 417 | "language_info": { 418 | "codemirror_mode": { 419 | "name": "ipython", 420 | "version": 3 421 | }, 422 | "file_extension": ".py", 423 | "mimetype": "text/x-python", 424 | "name": "python", 425 | "nbconvert_exporter": "python", 426 | "pygments_lexer": "ipython3", 427 | "version": "3.8.8" 428 | }, 429 | "toc": { 430 | "base_numbering": 1, 431 | "nav_menu": {}, 432 | "number_sections": true, 433 | "sideBar": true, 434 | "skip_h1_title": false, 435 | "title_cell": "Table of Contents", 436 | "title_sidebar": "Contents", 437 | "toc_cell": false, 438 | "toc_position": {}, 439 | "toc_section_display": true, 440 | "toc_window_display": false 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 0 445 | } 446 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/utils_vad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | from typing import List 4 | import torch.nn.functional as F 5 | import warnings 6 | 7 | languages = ['ru', 'en', 'de', 'es'] 8 | 9 | 10 | class OnnxWrapper(): 11 | 12 | def __init__(self, path, force_onnx_cpu=False): 13 | import numpy as np 14 | global np 15 | import onnxruntime 16 | if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers(): 17 | self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider']) 18 | else: 19 | self.session = onnxruntime.InferenceSession(path) 20 | self.session.intra_op_num_threads = 1 21 | self.session.inter_op_num_threads = 1 22 | 23 | self.reset_states() 24 | self.sample_rates = [8000, 16000] 25 | 26 | def _validate_input(self, x, sr: int): 27 | if x.dim() == 1: 28 | x = x.unsqueeze(0) 29 | if x.dim() > 2: 30 | raise 
ValueError(f"Too many dimensions for input audio chunk {x.dim()}") 31 | 32 | if sr != 16000 and (sr % 16000 == 0): 33 | step = sr // 16000 34 | x = x[::step] 35 | sr = 16000 36 | 37 | if sr not in self.sample_rates: 38 | raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)") 39 | 40 | if sr / x.shape[1] > 31.25: 41 | raise ValueError("Input audio chunk is too short") 42 | 43 | return x, sr 44 | 45 | def reset_states(self, batch_size=1): 46 | self._h = np.zeros((2, batch_size, 64)).astype('float32') 47 | self._c = np.zeros((2, batch_size, 64)).astype('float32') 48 | self._last_sr = 0 49 | self._last_batch_size = 0 50 | 51 | def __call__(self, x, sr: int): 52 | 53 | x, sr = self._validate_input(x, sr) 54 | batch_size = x.shape[0] 55 | 56 | if not self._last_batch_size: 57 | self.reset_states(batch_size) 58 | if (self._last_sr) and (self._last_sr != sr): 59 | self.reset_states(batch_size) 60 | if (self._last_batch_size) and (self._last_batch_size != batch_size): 61 | self.reset_states(batch_size) 62 | 63 | if sr in [8000, 16000]: 64 | ort_inputs = {'input': x.numpy(), 'h': self._h, 'c': self._c, 'sr': np.array(sr, dtype='int64')} 65 | ort_outs = self.session.run(None, ort_inputs) 66 | out, self._h, self._c = ort_outs 67 | else: 68 | raise ValueError() 69 | 70 | self._last_sr = sr 71 | self._last_batch_size = batch_size 72 | 73 | out = torch.tensor(out) 74 | return out 75 | 76 | def audio_forward(self, x, sr: int, num_samples: int = 512): 77 | outs = [] 78 | x, sr = self._validate_input(x, sr) 79 | 80 | if x.shape[1] % num_samples: 81 | pad_num = num_samples - (x.shape[1] % num_samples) 82 | x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0) 83 | 84 | self.reset_states(x.shape[0]) 85 | for i in range(0, x.shape[1], num_samples): 86 | wavs_batch = x[:, i:i+num_samples] 87 | out_chunk = self.__call__(wavs_batch, sr) 88 | outs.append(out_chunk) 89 | 90 | stacked = torch.cat(outs, dim=1) 91 | return stacked.cpu() 92 | 93 | 94 | class Validator(): 95 | def __init__(self, url, force_onnx_cpu): 96 | self.onnx = True if url.endswith('.onnx') else False 97 | torch.hub.download_url_to_file(url, 'inf.model') 98 | if self.onnx: 99 | import onnxruntime 100 | if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers(): 101 | self.model = onnxruntime.InferenceSession('inf.model', providers=['CPUExecutionProvider']) 102 | else: 103 | self.model = onnxruntime.InferenceSession('inf.model') 104 | else: 105 | self.model = init_jit_model(model_path='inf.model') 106 | 107 | def __call__(self, inputs: torch.Tensor): 108 | with torch.no_grad(): 109 | if self.onnx: 110 | ort_inputs = {'input': inputs.cpu().numpy()} 111 | outs = self.model.run(None, ort_inputs) 112 | outs = [torch.Tensor(x) for x in outs] 113 | else: 114 | outs = self.model(inputs) 115 | 116 | return outs 117 | 118 | 119 | def read_audio(path: str, 120 | sampling_rate: int = 16000): 121 | 122 | wav, sr = torchaudio.load(path) 123 | 124 | if wav.size(0) > 1: 125 | wav = wav.mean(dim=0, keepdim=True) 126 | 127 | if sr != sampling_rate: 128 | transform = torchaudio.transforms.Resample(orig_freq=sr, 129 | new_freq=sampling_rate) 130 | wav = transform(wav) 131 | sr = sampling_rate 132 | 133 | assert sr == sampling_rate 134 | return wav.squeeze(0) 135 | 136 | 137 | def save_audio(path: str, 138 | tensor: torch.Tensor, 139 | sampling_rate: int = 16000): 140 | torchaudio.save(path, tensor.unsqueeze(0), sampling_rate, bits_per_sample=16) 141 | 142 | 143 | def 
init_jit_model(model_path: str, 144 | device=torch.device('cpu')): 145 | torch.set_grad_enabled(False) 146 | model = torch.jit.load(model_path, map_location=device) 147 | model.eval() 148 | return model 149 | 150 | 151 | def make_visualization(probs, step): 152 | import pandas as pd 153 | pd.DataFrame({'probs': probs}, 154 | index=[x * step for x in range(len(probs))]).plot(figsize=(16, 8), 155 | kind='area', ylim=[0, 1.05], xlim=[0, len(probs) * step], 156 | xlabel='seconds', 157 | ylabel='speech probability', 158 | colormap='tab20') 159 | 160 | 161 | def get_speech_timestamps(audio: torch.Tensor, 162 | model, 163 | threshold: float = 0.5, 164 | sampling_rate: int = 16000, 165 | min_speech_duration_ms: int = 250, 166 | max_speech_duration_s: float = float('inf'), 167 | min_silence_duration_ms: int = 100, 168 | window_size_samples: int = 512, 169 | speech_pad_ms: int = 30, 170 | return_seconds: bool = False, 171 | visualize_probs: bool = False): 172 | 173 | """ 174 | This method is used for splitting long audios into speech chunks using silero VAD 175 | 176 | Parameters 177 | ---------- 178 | audio: torch.Tensor, one dimensional 179 | One dimensional float torch.Tensor, other types are cast to torch.Tensor if possible 180 | 181 | model: preloaded .jit silero VAD model 182 | 183 | threshold: float (default - 0.5) 184 | Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. 185 | It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. 186 | 187 | sampling_rate: int (default - 16000) 188 | Currently silero VAD models support 8000 and 16000 sample rates 189 | 190 | min_speech_duration_ms: int (default - 250 milliseconds) 191 | Final speech chunks shorter than min_speech_duration_ms are thrown out 192 | 193 | max_speech_duration_s: float (default - inf) 194 | Maximum duration of speech chunks in seconds 195 | Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 98 ms (if any), to prevent aggressive cutting. 196 | Otherwise, they will be split aggressively just before max_speech_duration_s. 197 | 198 | min_silence_duration_ms: int (default - 100 milliseconds) 199 | At the end of each speech chunk, wait for min_silence_duration_ms before separating it 200 | 201 | window_size_samples: int (default - 512 samples) 202 | Audio chunks of window_size_samples size are fed to the silero VAD model. 203 | WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples for 8000 sample rate. 204 | Values other than these may affect model performance! 205 | 206 | speech_pad_ms: int (default - 30 milliseconds) 207 | Final speech chunks are padded by speech_pad_ms on each side 208 | 209 | return_seconds: bool (default - False) 210 | whether to return timestamps in seconds (default - samples) 211 | 212 | visualize_probs: bool (default - False) 213 | whether to plot the speech probabilities or not 214 | 215 | Returns 216 | ---------- 217 | speeches: list of dicts 218 | list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds) 219 | """ 220 | 221 | if not torch.is_tensor(audio): 222 | try: 223 | audio = torch.Tensor(audio) 224 | except: 225 | raise TypeError("Audio cannot be cast to tensor. 
Cast it manually") 226 | 227 | if len(audio.shape) > 1: 228 | for i in range(len(audio.shape)): # trying to squeeze empty dimensions 229 | audio = audio.squeeze(0) 230 | if len(audio.shape) > 1: 231 | raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?") 232 | 233 | if sampling_rate > 16000 and (sampling_rate % 16000 == 0): 234 | step = sampling_rate // 16000 235 | sampling_rate = 16000 236 | audio = audio[::step] 237 | warnings.warn('Sampling rate is a multiply of 16000, casting to 16000 manually!') 238 | else: 239 | step = 1 240 | 241 | if sampling_rate == 8000 and window_size_samples > 768: 242 | warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 768 for 8000 sample rate!') 243 | if window_size_samples not in [256, 512, 768, 1024, 1536]: 244 | warnings.warn('Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate') 245 | 246 | model.reset_states() 247 | min_speech_samples = sampling_rate * min_speech_duration_ms / 1000 248 | speech_pad_samples = sampling_rate * speech_pad_ms / 1000 249 | max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples 250 | min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 251 | min_silence_samples_at_max_speech = sampling_rate * 98 / 1000 252 | 253 | audio_length_samples = len(audio) 254 | 255 | speech_probs = [] 256 | for current_start_sample in range(0, audio_length_samples, window_size_samples): 257 | chunk = audio[current_start_sample: current_start_sample + window_size_samples] 258 | if len(chunk) < window_size_samples: 259 | chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk)))) 260 | speech_prob = model(chunk, sampling_rate).item() 261 | speech_probs.append(speech_prob) 262 | 263 | triggered = False 264 | speeches = [] 265 | current_speech = {} 266 | neg_threshold = threshold - 0.15 267 | temp_end = 0 # to save potential segment end (and tolerate some silence) 268 | prev_end = next_start = 0 # to save potential segment limits in case of maximum segment size reached 269 | 270 | for i, speech_prob in enumerate(speech_probs): 271 | if (speech_prob >= threshold) and temp_end: 272 | temp_end = 0 273 | if next_start < prev_end: 274 | next_start = window_size_samples * i 275 | 276 | if (speech_prob >= threshold) and not triggered: 277 | triggered = True 278 | current_speech['start'] = window_size_samples * i 279 | continue 280 | 281 | if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples: 282 | if prev_end: 283 | current_speech['end'] = prev_end 284 | speeches.append(current_speech) 285 | current_speech = {} 286 | if next_start < prev_end: # previously reached silence (< neg_thres) and is still not speech (< thres) 287 | triggered = False 288 | else: 289 | current_speech['start'] = next_start 290 | prev_end = next_start = temp_end = 0 291 | else: 292 | current_speech['end'] = window_size_samples * i 293 | speeches.append(current_speech) 294 | current_speech = {} 295 | prev_end = next_start = temp_end = 0 296 | triggered = False 297 | continue 298 | 299 | 300 | if (speech_prob < neg_threshold) and triggered: 301 | if not temp_end: 302 | temp_end = window_size_samples * i 303 | if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech : # condition to avoid cutting in very short silence 304 | prev_end = 
temp_end 305 | if (window_size_samples * i) - temp_end < min_silence_samples: 306 | continue 307 | else: 308 | current_speech['end'] = temp_end 309 | if (current_speech['end'] - current_speech['start']) > min_speech_samples: 310 | speeches.append(current_speech) 311 | current_speech = {} 312 | prev_end = next_start = temp_end = 0 313 | triggered = False 314 | continue 315 | 316 | if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples: 317 | current_speech['end'] = audio_length_samples 318 | speeches.append(current_speech) 319 | 320 | for i, speech in enumerate(speeches): 321 | if i == 0: 322 | speech['start'] = int(max(0, speech['start'] - speech_pad_samples)) 323 | if i != len(speeches) - 1: 324 | silence_duration = speeches[i+1]['start'] - speech['end'] 325 | if silence_duration < 2 * speech_pad_samples: 326 | speech['end'] += int(silence_duration // 2) 327 | speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - silence_duration // 2)) 328 | else: 329 | speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples)) 330 | speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - speech_pad_samples)) 331 | else: 332 | speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples)) 333 | 334 | if return_seconds: 335 | for speech_dict in speeches: 336 | speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1) 337 | speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1) 338 | elif step > 1: 339 | for speech_dict in speeches: 340 | speech_dict['start'] *= step 341 | speech_dict['end'] *= step 342 | 343 | if visualize_probs: 344 | make_visualization(speech_probs, window_size_samples / sampling_rate) 345 | 346 | return speeches 347 | 348 | 349 | def get_number_ts(wav: torch.Tensor, 350 | model, 351 | model_stride=8, 352 | hop_length=160, 353 | sample_rate=16000): 354 | wav = torch.unsqueeze(wav, dim=0) 355 | perframe_logits = model(wav)[0] 356 | perframe_preds = torch.argmax(torch.softmax(perframe_logits, dim=1), dim=1).squeeze() # (1, num_frames_strided) 357 | extended_preds = [] 358 | for i in perframe_preds: 359 | extended_preds.extend([i.item()] * model_stride) 360 | # len(extended_preds) is *num_frames_real*; for each frame of audio we know if it has a number in it. 
361 | triggered = False 362 | timings = [] 363 | cur_timing = {} 364 | for i, pred in enumerate(extended_preds): 365 | if pred == 1: 366 | if not triggered: 367 | cur_timing['start'] = int((i * hop_length) / (sample_rate / 1000)) 368 | triggered = True 369 | elif pred == 0: 370 | if triggered: 371 | cur_timing['end'] = int((i * hop_length) / (sample_rate / 1000)) 372 | timings.append(cur_timing) 373 | cur_timing = {} 374 | triggered = False 375 | if cur_timing: 376 | cur_timing['end'] = int(len(wav) / (sample_rate / 1000)) 377 | timings.append(cur_timing) 378 | return timings 379 | 380 | 381 | def get_language(wav: torch.Tensor, 382 | model): 383 | wav = torch.unsqueeze(wav, dim=0) 384 | lang_logits = model(wav)[2] 385 | lang_pred = torch.argmax(torch.softmax(lang_logits, dim=1), dim=1).item() # from 0 to len(languages) - 1 386 | assert lang_pred < len(languages) 387 | return languages[lang_pred] 388 | 389 | 390 | def get_language_and_group(wav: torch.Tensor, 391 | model, 392 | lang_dict: dict, 393 | lang_group_dict: dict, 394 | top_n=1): 395 | wav = torch.unsqueeze(wav, dim=0) 396 | lang_logits, lang_group_logits = model(wav) 397 | 398 | softm = torch.softmax(lang_logits, dim=1).squeeze() 399 | softm_group = torch.softmax(lang_group_logits, dim=1).squeeze() 400 | 401 | srtd = torch.argsort(softm, descending=True) 402 | srtd_group = torch.argsort(softm_group, descending=True) 403 | 404 | outs = [] 405 | outs_group = [] 406 | for i in range(top_n): 407 | prob = round(softm[srtd[i]].item(), 2) 408 | prob_group = round(softm_group[srtd_group[i]].item(), 2) 409 | outs.append((lang_dict[str(srtd[i].item())], prob)) 410 | outs_group.append((lang_group_dict[str(srtd_group[i].item())], prob_group)) 411 | 412 | return outs, outs_group 413 | 414 | 415 | class VADIterator: 416 | def __init__(self, 417 | model, 418 | threshold: float = 0.5, 419 | sampling_rate: int = 16000, 420 | min_silence_duration_ms: int = 100, 421 | speech_pad_ms: int = 30 422 | ): 423 | 424 | """ 425 | Class for stream imitation 426 | 427 | Parameters 428 | ---------- 429 | model: preloaded .jit silero VAD model 430 | 431 | threshold: float (default - 0.5) 432 | Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. 433 | It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. 
434 | 435 | sampling_rate: int (default - 16000) 436 | Currently silero VAD models support 8000 and 16000 sample rates 437 | 438 | min_silence_duration_ms: int (default - 100 milliseconds) 439 | At the end of each speech chunk, wait for min_silence_duration_ms before separating it 440 | 441 | speech_pad_ms: int (default - 30 milliseconds) 442 | Final speech chunks are padded by speech_pad_ms on each side 443 | """ 444 | 445 | self.model = model 446 | self.threshold = threshold 447 | self.sampling_rate = sampling_rate 448 | 449 | if sampling_rate not in [8000, 16000]: 450 | raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]') 451 | 452 | self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 453 | self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000 454 | self.reset_states() 455 | 456 | def reset_states(self): 457 | 458 | self.model.reset_states() 459 | self.triggered = False 460 | self.temp_end = 0 461 | self.current_sample = 0 462 | 463 | def __call__(self, x, return_seconds=False): 464 | """ 465 | x: torch.Tensor 466 | audio chunk (see examples in repo) 467 | 468 | return_seconds: bool (default - False) 469 | whether to return timestamps in seconds (default - samples) 470 | """ 471 | 472 | if not torch.is_tensor(x): 473 | try: 474 | x = torch.Tensor(x) 475 | except: 476 | raise TypeError("Audio cannot be cast to tensor. Cast it manually") 477 | 478 | window_size_samples = len(x[0]) if x.dim() == 2 else len(x) 479 | self.current_sample += window_size_samples 480 | 481 | speech_prob = self.model(x, self.sampling_rate).item() 482 | 483 | if (speech_prob >= self.threshold) and self.temp_end: 484 | self.temp_end = 0 485 | 486 | if (speech_prob >= self.threshold) and not self.triggered: 487 | self.triggered = True 488 | speech_start = self.current_sample - self.speech_pad_samples 489 | return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)} 490 | 491 | if (speech_prob < self.threshold - 0.15) and self.triggered: 492 | if not self.temp_end: 493 | self.temp_end = self.current_sample 494 | if self.current_sample - self.temp_end < self.min_silence_samples: 495 | return None 496 | else: 497 | speech_end = self.temp_end + self.speech_pad_samples 498 | self.temp_end = 0 499 | self.triggered = False 500 | return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)} 501 | 502 | return None 503 | 504 | 505 | def collect_chunks(tss: List[dict], 506 | wav: torch.Tensor): 507 | chunks = [] 508 | for i in tss: 509 | chunks.append(wav[i['start']: i['end']]) 510 | return torch.cat(chunks) 511 | 512 | 513 | def drop_chunks(tss: List[dict], 514 | wav: torch.Tensor): 515 | chunks = [] 516 | cur_start = 0 517 | for i in tss: 518 | chunks.append((wav[cur_start: i['start']])) 519 | cur_start = i['end'] 520 | return torch.cat(chunks) 521 | -------------------------------------------------------------------------------- /test/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | # Create a logger 5 | logger = logging.getLogger() 6 | # Set the logger level; if not set, only warning and above are collected by default 7 | logger.setLevel("DEBUG") 8 | # Set the log format 9 | fmt = logging.Formatter("%(filename)s-%(lineno)d-%(asctime)s-%(levelname)s-%(message)s") 10 | # Set up a file handler and its encoding 11 | if not os.path.exists("./log"): 12 | os.makedirs("./log") 13 | file_handler = logging.FileHandler("./log/log.txt", encoding="utf-8") 14 | # Set the handler level 15 | file_handler.setLevel("DEBUG") 16 | # The handler outputs logs in the specified format 17 | file_handler.setFormatter(fmt) 18 | # Output to the console 19 | ch = logging.StreamHandler() 20 | # Set the handler level 21 | ch.setLevel("DEBUG") 22 | # The handler outputs logs in the specified format 23 | ch.setFormatter(fmt) 24 | # Attach the handlers to the logger to define the output channels 25 | # Log output to file 26 | logger.addHandler(file_handler) 27 | # Log output to console 28 | logger.addHandler(ch) 29 | 30 | TEST_MEDIA_PATH = "./test/media/" 31 | TEST_CONTENT_PATH = "./test/content/" 32 | TEST_MEDIA_FILE = [ 33 | "test001.mp4", 34 | "test002.mov", 35 | "test003.mkv", 36 | "test004.flv", 37 | "test005.mp3", 38 | "test006.MP4", 39 | ] 40 | 41 | TEST_MEDIA_FILE_LANG = ["test001_en.mp4"] 42 | TEST_MEDIA_FILE_SIMPLE = ["test001.mp4", "test005.mp3"] 43 | 44 | 45 | class TestArgs: 46 | def __init__( 47 | self, 48 | encoding="utf-8", 49 | sampling_rate=16000, 50 | bitrate="10m", 51 | lang="zh", 52 | prompt="", 53 | whisper_model="small", 54 | device=None, 55 | vad=False, 56 | force=False, 57 | ): 58 | self.inputs = [] 59 | self.bitrate = bitrate 60 | self.encoding = encoding 61 | self.sampling_rate = sampling_rate 62 | self.lang = lang 63 | self.prompt = prompt 64 | self.whisper_model = whisper_model 65 | self.device = device 66 | self.vad = vad 67 | self.force = force 68 | -------------------------------------------------------------------------------- /test/content/test.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:05,000 3 | 大家好,我的名字是AutoCut.这是一条用于测试的视频。 4 | 5 | 2 6 | 00:00:05,000 --> 00:00:10,260 7 | Hello, my name is AutoCut. This is a video for testing. 8 | 9 | -------------------------------------------------------------------------------- /test/content/test_md.md: -------------------------------------------------------------------------------- 1 | - [x] <-- Mark if you are done editing. 2 | 3 | 4 | 5 | Texts generated from [test001.srt](test001.srt).Mark the sentences to keep for autocut. 6 | The format is [subtitle_index,duration_in_second] subtitle context. 7 | 8 | - [ ] [1,00:00] 大家好,我的名字是AutoCut.这是一条用于测试的视频。 9 | - [x] [2,00:05] Hello, my name is AutoCut. This is a video for testing. 
10 | -------------------------------------------------------------------------------- /test/content/test_srt.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:05,000 3 | 大家好,我的名字是AutoCut.这是一条用于测试的视频。 4 | 5 | -------------------------------------------------------------------------------- /test/media/test001.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test001.mp4 -------------------------------------------------------------------------------- /test/media/test001_en.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test001_en.mp4 -------------------------------------------------------------------------------- /test/media/test002.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test002.mov -------------------------------------------------------------------------------- /test/media/test003.mkv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test003.mkv -------------------------------------------------------------------------------- /test/media/test004.flv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test004.flv -------------------------------------------------------------------------------- /test/media/test005.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test005.mp3 -------------------------------------------------------------------------------- /test/media/test006.MP4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test006.MP4 -------------------------------------------------------------------------------- /test/test_cut.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from parameterized import parameterized, param 6 | 7 | from autocut.cut import Cutter 8 | from config import TestArgs, TEST_MEDIA_PATH, TEST_MEDIA_FILE_SIMPLE, TEST_CONTENT_PATH 9 | 10 | 11 | class TestCut(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | logging.info("检查测试文件是否正常存在") 15 | scan_file = os.listdir(TEST_MEDIA_PATH) 16 | logging.info( 17 | "应存在文件列表:" + str(TEST_MEDIA_FILE_SIMPLE) + " 扫描到文件列表:" + str(scan_file) 18 | ) 19 | for file in TEST_MEDIA_FILE_SIMPLE: 20 | assert file in scan_file 21 | 22 | def tearDown(self): 23 | for file in TEST_MEDIA_FILE_SIMPLE: 24 | namepart = os.path.join( 25 | TEST_MEDIA_PATH, os.path.splitext(file)[0] + "_cut." 
26 | ) 27 | if os.path.exists(namepart + "mp4"): 28 | os.remove(namepart + "mp4") 29 | if os.path.exists(namepart + "mp3"): 30 | os.remove(namepart + "mp3") 31 | 32 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 33 | def test_srt_cut(self, file_name): 34 | args = TestArgs() 35 | args.inputs = [ 36 | os.path.join(TEST_MEDIA_PATH, file_name), 37 | os.path.join(TEST_CONTENT_PATH, "test_srt.srt"), 38 | ] 39 | cut = Cutter(args) 40 | cut.run() 41 | namepart = os.path.join( 42 | TEST_MEDIA_PATH, os.path.splitext(file_name)[0] + "_cut." 43 | ) 44 | self.assertTrue( 45 | os.path.exists(namepart + "mp4") or os.path.exists(namepart + "mp3") 46 | ) 47 | 48 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 49 | def test_md_cut(self, file_name): 50 | args = TestArgs() 51 | args.inputs = [ 52 | TEST_MEDIA_PATH + file_name, 53 | os.path.join(TEST_CONTENT_PATH, "test.srt"), 54 | os.path.join(TEST_CONTENT_PATH, "test_md.md"), 55 | ] 56 | cut = Cutter(args) 57 | cut.run() 58 | namepart = os.path.join( 59 | TEST_MEDIA_PATH, os.path.splitext(file_name)[0] + "_cut." 60 | ) 61 | self.assertTrue( 62 | os.path.exists(namepart + "mp4") or os.path.exists(namepart + "mp3") 63 | ) 64 | -------------------------------------------------------------------------------- /test/test_transcribe.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from parameterized import parameterized, param 6 | 7 | from autocut.utils import MD 8 | from config import ( 9 | TEST_MEDIA_FILE, 10 | TestArgs, 11 | TEST_MEDIA_FILE_SIMPLE, 12 | TEST_MEDIA_FILE_LANG, 13 | TEST_MEDIA_PATH, 14 | ) 15 | from autocut.transcribe import Transcribe 16 | 17 | 18 | class TestTranscribe(unittest.TestCase): 19 | @classmethod 20 | def setUpClass(cls): 21 | logging.info("检查测试文件是否正常存在") 22 | scan_file = os.listdir(TEST_MEDIA_PATH) 23 | logging.info( 24 | "应存在文件列表:" 25 | + str(TEST_MEDIA_FILE) 26 | + str(TEST_MEDIA_FILE_LANG) 27 | + str(TEST_MEDIA_FILE_SIMPLE) 28 | + " 扫描到文件列表:" 29 | + str(scan_file) 30 | ) 31 | for file in TEST_MEDIA_FILE: 32 | assert file in scan_file 33 | for file in TEST_MEDIA_FILE_LANG: 34 | assert file in scan_file 35 | for file in TEST_MEDIA_FILE_SIMPLE: 36 | assert file in scan_file 37 | 38 | @classmethod 39 | def tearDownClass(cls): 40 | for file in os.listdir(TEST_MEDIA_PATH): 41 | if file.endswith("md") or file.endswith("srt"): 42 | os.remove(TEST_MEDIA_PATH + file) 43 | 44 | def tearDown(self): 45 | for file in TEST_MEDIA_FILE_SIMPLE: 46 | if os.path.exists(TEST_MEDIA_PATH + file.split(".")[0] + ".md"): 47 | os.remove(TEST_MEDIA_PATH + file.split(".")[0] + ".md") 48 | if os.path.exists(TEST_MEDIA_PATH + file.split(".")[0] + ".srt"): 49 | os.remove(TEST_MEDIA_PATH + file.split(".")[0] + ".srt") 50 | 51 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE]) 52 | def test_default_transcribe(self, file_name): 53 | logging.info("检查默认参数生成字幕") 54 | args = TestArgs() 55 | args.inputs = [TEST_MEDIA_PATH + file_name] 56 | transcribe = Transcribe(args) 57 | transcribe.run() 58 | self.assertTrue( 59 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 60 | ) 61 | 62 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE]) 63 | def test_jump_done_transcribe(self, file_name): 64 | logging.info("检查默认参数跳过生成字幕") 65 | args = TestArgs() 66 | args.inputs = [TEST_MEDIA_PATH + file_name] 67 | transcribe = Transcribe(args) 68 | transcribe.run() 69 | self.assertTrue( 70 | 
os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 71 | ) 72 | 73 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_LANG]) 74 | def test_en_transcribe(self, file_name): 75 | logging.info("检查--lang='en'参数生成字幕") 76 | args = TestArgs() 77 | args.lang = "en" 78 | args.inputs = [TEST_MEDIA_PATH + file_name] 79 | transcribe = Transcribe(args) 80 | transcribe.run() 81 | self.assertTrue( 82 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 83 | ) 84 | 85 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_LANG]) 86 | def test_force_transcribe(self, file_name): 87 | logging.info("检查--force参数生成字幕") 88 | args = TestArgs() 89 | args.force = True 90 | args.inputs = [TEST_MEDIA_PATH + file_name] 91 | md0_lens = len( 92 | "".join( 93 | MD( 94 | TEST_MEDIA_PATH + file_name.split(".")[0] + ".md", args.encoding 95 | ).lines 96 | ) 97 | ) 98 | transcribe = Transcribe(args) 99 | transcribe.run() 100 | md1_lens = len( 101 | "".join( 102 | MD( 103 | TEST_MEDIA_PATH + file_name.split(".")[0] + ".md", args.encoding 104 | ).lines 105 | ) 106 | ) 107 | self.assertLessEqual(md1_lens, md0_lens) 108 | 109 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 110 | def test_encoding_transcribe(self, file_name): 111 | logging.info("检查--encoding参数生成字幕") 112 | args = TestArgs() 113 | args.encoding = "gbk" 114 | args.inputs = [TEST_MEDIA_PATH + file_name] 115 | transcribe = Transcribe(args) 116 | transcribe.run() 117 | with open( 118 | os.path.join(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md"), 119 | encoding="gbk", 120 | ): 121 | self.assertTrue(True) 122 | 123 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 124 | def test_vad_transcribe(self, file_name): 125 | logging.info("检查--vad参数生成字幕") 126 | args = TestArgs() 127 | args.force = True 128 | args.vad = True 129 | args.inputs = [TEST_MEDIA_PATH + file_name] 130 | transcribe = Transcribe(args) 131 | transcribe.run() 132 | self.assertTrue( 133 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 134 | ) 135 | --------------------------------------------------------------------------------
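The bundled snakers4_silero-vad_master copy can also be exercised directly, without going through torch.hub as the notebooks do. The sketch below is a minimal offline variant of the "Full Audio" cells in silero-vad.ipynb, built only from functions shown in utils_vad.py (init_jit_model, read_audio, get_speech_timestamps). It assumes torch and torchaudio are installed, that it is run from the repository root so the vendored directory is importable, and that "example.wav" is a placeholder for any audio file you supply; none of these assumptions come from the repository itself.

# Minimal offline VAD sketch. Assumptions: run from the repository root,
# torch/torchaudio installed, and "example.wav" is a user-supplied placeholder.
import sys

sys.path.append("snakers4_silero-vad_master")  # make the vendored utils_vad importable

from utils_vad import init_jit_model, read_audio, get_speech_timestamps

SAMPLING_RATE = 16000

# Load the bundled TorchScript model instead of downloading it via torch.hub.
model = init_jit_model("snakers4_silero-vad_master/files/silero_vad.jit")

# read_audio converts to 16 kHz mono, matching the notebook usage.
wav = read_audio("example.wav", sampling_rate=SAMPLING_RATE)

# Same call as in silero-vad.ipynb; returns a list of {'start': ..., 'end': ...} dicts in samples.
speech_timestamps = get_speech_timestamps(wav, model, threshold=0.5, sampling_rate=SAMPLING_RATE)
print(speech_timestamps)

hubconf.py's silero_vad() resolves the same files/silero_vad.jit path (relative to the hub directory), so this is effectively the offline counterpart of the torch.hub.load calls used in the notebooks.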