├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── Dockerfile ├── Dockerfile.cuda ├── LICENSE ├── README.md ├── autocut.py ├── autocut.spec ├── autocut ├── __init__.py ├── __main__.py ├── cut.py ├── daemon.py ├── main.py ├── transcribe.py └── utils.py ├── build.sh ├── imgs └── typora.jpg ├── requirements.txt ├── setup.cfg ├── setup.py ├── snakers4_silero-vad_master ├── .github │ └── ISSUE_TEMPLATE │ │ ├── bug_report.md │ │ ├── feature_request.md │ │ └── questions---help---support.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── examples │ ├── colab_record_example.ipynb │ ├── cpp │ │ ├── README.md │ │ ├── silero-vad-onnx.cpp │ │ └── wav.h │ ├── microphone_and_webRTC_integration │ │ ├── README.md │ │ └── microphone_and_webRTC_integration.py │ └── pyaudio-streaming │ │ ├── README.md │ │ └── pyaudio-streaming-examples.ipynb ├── files │ ├── lang_dict_95.json │ ├── lang_group_dict_95.json │ ├── silero_logo.jpg │ ├── silero_vad.jit │ └── silero_vad.onnx ├── hubconf.py ├── silero-vad.ipynb └── utils_vad.py └── test ├── config.py ├── content ├── test.srt ├── test_md.md └── test_srt.srt ├── media ├── test001.mp4 ├── test001_en.mp4 ├── test002.mov ├── test003.mkv ├── test004.flv ├── test005.mp3 └── test006.MP4 ├── test_cut.py └── test_transcribe.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | lint_and_test: 11 | runs-on: ${{ matrix.os }}-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.9', '3.10'] 15 | # macos did not support m1 for now 16 | os: [ubuntu, windows, macos] 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Set Variables 24 | id: set_variables 25 | shell: bash 26 | run: | 27 | echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT 28 | echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT 29 | - name: Cache PIP 30 | uses: actions/cache@v3 31 | with: 32 | path: ${{ steps.set_variables.outputs.PIP_CACHE }} 33 | key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }} 34 | 35 | - name: Setup ffmpeg for differnt platforms 36 | uses: FedericoCarboni/setup-ffmpeg@master 37 | 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install . 42 | pip install pytest black 43 | - name: Run Test 44 | run: pytest test/ 45 | - name: Run Lint 46 | run: black . --check 47 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' # Push events to matching v*, i.e. 
v1.0, v20.15.10 7 | 8 | jobs: 9 | 10 | createrelease: 11 | name: Create Release 12 | runs-on: [ubuntu-latest] 13 | steps: 14 | - name: Create Release 15 | id: create_release 16 | uses: actions/create-release@v1 17 | env: 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | with: 20 | tag_name: ${{ github.ref }} 21 | release_name: Release ${{ github.ref }} 22 | draft: false 23 | prerelease: false 24 | - name: Output Release URL File 25 | run: echo "${{ steps.create_release.outputs.upload_url }}" > release_url.txt 26 | - name: Save Release URL File for publish 27 | uses: actions/upload-artifact@v1 28 | with: 29 | name: release_url 30 | path: release_url.txt 31 | 32 | build: 33 | name: Build packages 34 | needs: createrelease 35 | runs-on: ${{ matrix.os }} 36 | strategy: 37 | matrix: 38 | include: 39 | - os: macos-latest 40 | TARGET: macos 41 | OUT_FILE_NAME: autocut_macos.zip 42 | ASSET_MIME: application/zip 43 | - os: ubuntu-latest 44 | TARGET: linux 45 | OUT_FILE_NAME: autocut_linux.zip 46 | ASSET_MIME: application/zip 47 | - os: windows-latest 48 | TARGET: windows 49 | OUT_FILE_NAME: autocut_windows.zip 50 | ASSET_MIME: application/zip 51 | steps: 52 | - uses: actions/checkout@v1 53 | - name: Set up Python 3.9 54 | uses: actions/setup-python@v2 55 | with: 56 | python-version: 3.9 57 | - name: Install dependencies 58 | run: | 59 | python -m pip install --upgrade pip 60 | pip install virtualenv 61 | python -m virtualenv ./.venv 62 | - name: Build with pyinstaller for Windows 63 | if: runner.os == 'windows' 64 | run: | 65 | .venv\Scripts\activate 66 | pip install -r requirements.txt 67 | pyinstaller autocut.spec -y 68 | - name: Build with pyinstaller for Other-${{matrix.TARGET}} 69 | if: runner.os != 'windows' 70 | run: | 71 | source .venv/bin/activate 72 | pip install -r requirements.txt 73 | pyinstaller autocut.spec -y 74 | - name: Zip Files 75 | uses: vimtor/action-zip@v1 76 | with: 77 | files: ./dist 78 | dest: ./dist/autocut_${{matrix.TARGET}}.zip 79 | - name: Load Release URL File from release job 80 | uses: actions/download-artifact@v1 81 | with: 82 | name: release_url 83 | - name: Get Release File Name & Upload URL 84 | id: get_release_info 85 | shell: bash 86 | run: | 87 | value=`cat release_url/release_url.txt` 88 | echo ::set-output name=upload_url::$value 89 | - name: Upload Release Asset 90 | id: upload-release-asset 91 | uses: actions/upload-release-asset@v1 92 | env: 93 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 94 | with: 95 | upload_url: ${{ steps.get_release_info.outputs.upload_url }} 96 | asset_path: ./dist/${{ matrix.OUT_FILE_NAME}} 97 | asset_name: ${{ matrix.OUT_FILE_NAME}} 98 | asset_content_type: ${{ matrix.ASSET_MIME}} 99 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | log/ 131 | 132 | # vad_model 133 | # snakers4_silero-vad_master 134 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim as base 2 | 3 | RUN mkdir /autocut 4 | COPY ./ /autocut 5 | WORKDIR /autocut 6 | 7 | RUN apt update && \ 8 | apt install -y git && \ 9 | apt install -y ffmpeg 10 | 11 | RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu && \ 12 | pip install . -------------------------------------------------------------------------------- /Dockerfile.cuda: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime 2 | 3 | RUN mkdir /autocut 4 | COPY ./ /autocut 5 | WORKDIR /autocut 6 | 7 | RUN apt update && \ 8 | apt install -y git && \ 9 | apt install -y ffmpeg 10 | 11 | RUN pip install . -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoCut: 通过字幕来剪切视频 2 | 3 | AutoCut 对你的视频自动生成字幕。然后你选择需要保留的句子,AutoCut 将对你视频中对应的片段裁切并保存。你无需使用视频编辑软件,只需要编辑文本文件即可完成剪切。 4 | 5 | ## 使用例子 6 | 7 | 假如你录制的视频放在 `2022-11-04/` 这个文件夹里。那么运行 8 | 9 | ```bash 10 | autocut -d 2022-11-04 11 | ``` 12 | 13 | > 提示:如果你使用 OBS 录屏,可以在 `设置->高级->录像->文件名格式` 中将空格改成 `/`,即 `%CCYY-%MM-%DD/%hh-%mm-%ss`。那么视频文件将放在日期命名的文件夹里。 14 | 15 | AutoCut 将持续对这个文件夹里视频进行字幕抽取和剪切。例如,你刚完成一个视频录制,保存在 `11-28-18.mp4`。AutoCut 将生成 `11-28-18.md`。你在里面选择需要保留的句子后,AutoCut 将剪切出 `11-28-18_cut.mp4`,并生成 `11-28-18_cut.md` 来预览结果。 16 | 17 | 你可以使用任何的 Markdown 编辑器。例如我常用 VS Code 和 Typora。下图是通过 Typora 来对 `11-28-18.md` 编辑。 18 | 19 | ![](imgs/typora.jpg) 20 | 21 | 全部完成后在 `autocut.md` 里选择需要拼接的视频后,AutoCut 将输出 `autocut_merged.mp4` 和对应的字幕文件。 22 | 23 | ## 安装 24 | 25 | 首先安装 Python 包 26 | 27 | ``` 28 | pip install git+https://github.com/mli/autocut.git 29 | ``` 30 | 31 | ## 本地安装测试 32 | 33 | 34 | ``` 35 | git clone https://github.com/mli/autocut 36 | cd autocut 37 | pip install . 38 | ``` 39 | 40 | 41 | > 上面将安装 [pytorch](https://pytorch.org/)。如果你需要 GPU 运行,且默认安装的版本不匹配的话,你可以先安装 Pytorch。如果安装 Whipser 出现问题,请参考[官方文档](https://github.com/openai/whisper#setup)。 42 | 43 | 另外需要安装 [ffmpeg](https://ffmpeg.org/) 44 | 45 | ``` 46 | # on Ubuntu or Debian 47 | sudo apt update && sudo apt install ffmpeg 48 | 49 | # on Arch Linux 50 | sudo pacman -S ffmpeg 51 | 52 | # on MacOS using Homebrew (https://brew.sh/) 53 | brew install ffmpeg 54 | 55 | # on Windows using Scoop (https://scoop.sh/) 56 | scoop install ffmpeg 57 | ``` 58 | 59 | ## Docker 安装 60 | 61 | 首先将项目克隆到本地。 62 | 63 | ```bash 64 | git clone https://github.com/mli/autocut.git 65 | ``` 66 | 67 | ### 安装 CPU 版本 68 | 69 | 进入项目根目录,然后构建 docker 映像。 70 | 71 | ```bash 72 | docker build -t autocut . 73 | ``` 74 | 75 | 运行下面的命令创建 docker 容器,就可以直接使用了。 76 | 77 | ```bash 78 | docker run -it --rm -v E:\autocut:/autocut/video autocut /bin/bash 79 | ``` 80 | 81 | 其中 `-v` 是将主机存放视频的文件夹 `E:\autocut` 映射到虚拟机的 `/autocut/video` 目录。`E:\autocut` 是主机存放视频的目录,需修改为自己主机存放视频的目录。 82 | 83 | ### 安装 GPU 版本 84 | 85 | 使用 GPU 加速需要主机有 Nvidia 的显卡并安装好相应驱动。然后在项目根目录,执行下面的命令构建 docker 映像。 86 | 87 | ```bash 88 | docker build -f ./Dockerfile.cuda -t autocut-gpu . 89 | ``` 90 | 91 | 使用 GPU 加速时,运行 docker 容器需添加参数 `--gpus all`。 92 | 93 | ```bash 94 | docker run --gpus all -it --rm -v E:\autocut:/autocut/video autocut-gpu 95 | ``` 96 | 97 | ## 更多使用选项 98 | 99 | ### 转录某个视频生成 `.srt` 和 `.md` 结果。 100 | 101 | ```bash 102 | autocut -t 22-52-00.mp4 103 | ``` 104 | 105 | 1. 如果对转录质量不满意,可以使用更大的模型,例如 106 | 107 | ```bash 108 | autocut -t 22-52-00.mp4 --whisper-model large 109 | ``` 110 | 111 | 默认是 `small`。更好的模型是 `medium` 和 `large`,但推荐使用 GPU 获得更好的速度。也可以使用更快的 `tiny` 和 `base`,但转录质量会下降。 112 | 113 | 114 | ### 剪切某个视频 115 | 116 | ```bash 117 | autocut -c 22-52-00.mp4 22-52-00.srt 22-52-00.md 118 | ``` 119 | 120 | 1. 默认视频比特率是 `--bitrate 10m`,你可以根据需要调大调小。 121 | 2. 
如果不习惯 Markdown 格式文件,你也可以直接在 `srt` 文件里删除不要的句子,在剪切时不传入 `md` 文件名即可。就是 `autocut -c 22-52-00.mp4 22-52-00.srt` 122 | 3. 如果仅有 `srt` 文件,编辑不方便可以使用如下命令生成 `md` 文件,然后编辑 `md` 文件即可,但此时会完全对照 `srt` 生成,不会出现 `no speech` 等提示文本。 123 | 124 | ```bash 125 | autocut -m test.srt test.mp4 126 | autocut -m test.mp4 test.srt # 支持视频和字幕乱序传入 127 | autocut -m test.srt # 也可以只传入字幕文件 128 | ``` 129 | 130 | 131 | ### 一些小提示 132 | 133 | 134 | 1. 讲得流利的视频的转录质量会高一些,这因为是 Whisper 训练数据分布的缘故。对一个视频,你可以先粗选一下句子,然后在剪出来的视频上再剪一次。 135 | 2. ~~最终视频生成的字幕通常还需要做一些小编辑。你可以直接编辑`md`文件(比`srt`文件更紧凑,且嵌入了视频)。然后使用 `autocut -s 22-52-00.md 22-52-00.srt` 来生成更新的字幕 `22-52-00_edited.srt`。注意这里会无视句子是不是被选中,而是全部转换成 `srt`。~~ 136 | 3. 最终视频生成的字幕通常还需要做一些小编辑。但 `srt` 里面空行太多。你可以使用 `autocut -s 22-52-00.srt` 来生成一个紧凑些的版本 `22-52-00_compact.srt` 方便编辑(这个格式不合法,但编辑器,例如 VS Code,还是会进行语法高亮)。编辑完成后,`autocut -s 22-52-00_compact.srt` 转回正常格式。 137 | 4. 用 Typora 和 VS Code 编辑 Markdown 都很方便。他们都有对应的快捷键 mark 一行或者多行。但 VS Code 视频预览似乎有点问题。 138 | 5. 视频是通过 ffmpeg 导出。在 Apple M1 芯片上它用不了 GPU,导致导出速度不如专业视频软件。 139 | 140 | ### 常见问题 141 | 142 | 1. **输出的是乱码?** 143 | 144 | AutoCut 默认输出编码是 `utf-8`. 确保你的编辑器也使用了 `utf-8` 解码。你可以通过 `--encoding` 指定其他编码格式。但是需要注意生成字幕文件和使用字幕文件剪辑时的编码格式需要一致。例如使用 `gbk`。 145 | 146 | ```bash 147 | autocut -t test.mp4 --encoding=gbk 148 | autocut -c test.mp4 test.srt test.md --encoding=gbk 149 | ``` 150 | 151 | 如果使用了其他编码格式(如 `gbk` 等)生成 `md` 文件并用 Typora 打开后,该文件可能会被 Typora 自动转码为其他编码格式,此时再通过生成时指定的编码格式进行剪辑时可能会出现编码不支持等报错。因此可以在使用 Typora 编辑后再通过 VSCode 等修改到你需要的编码格式进行保存后再使用剪辑功能。 152 | 153 | 2. **如何使用 GPU 来转录?** 154 | 155 | 当你有 Nvidia GPU,而且安装了对应版本的 PyTorch 的时候,转录是在 GPU 上进行。你可以通过命令来查看当前是不是支持 GPU。 156 | 157 | ```bash 158 | python -c "import torch; print(torch.cuda.is_available())" 159 | ``` 160 | 161 | 否则你可以在安装 AutoCut 前手动安装对应的 GPU 版本 PyTorch。 162 | 163 | 3. **使用 GPU 时报错显存不够。** 164 | 165 | whisper 的大模型需要一定的 GPU 显存。如果你的显存不够,你可以用小一点的模型,例如 `small`。如果你仍然想用大模型,可以通过 `--device` 来强制使用 CPU。例如 166 | 167 | ```bash 168 | autocut -t 11-28-18.mp4 --whisper-model large --device cpu 169 | ``` 170 | 171 | 4. **能不能使用 `pip` 安装?** 172 | 173 | whisper已经发布到PyPI了,可以直接用`pip install openai-whisper`安装。 174 | 175 | [https://github.com/openai/whisper#setup](https://github.com/openai/whisper#setup) 176 | 177 | [https://pypi.org/project/openai-whisper/](https://pypi.org/project/openai-whisper/) 178 | 179 | ## 如何参与贡献 180 | 181 | [这里有一些想做的 feature](https://github.com/mli/autocut/issues/22),欢迎贡献。 182 | 183 | ### 代码结构 184 | ```text 185 | autocut 186 | │ .gitignore 187 | │ LICENSE 188 | │ README.md # 一般新增或修改需要让使用者知道就需要对应更新 README.md 内容 189 | │ setup.py 190 | │ 191 | └─autocut # 核心代码位于 autocut 文件夹中,新增功能的实现也一般在这里面进行修改或新增 192 | │ cut.py 193 | │ daemon.py 194 | │ main.py 195 | │ transcribe.py 196 | │ utils.py 197 | └─ __init__.py 198 | 199 | ``` 200 | 201 | ### 安装依赖 202 | 开始安装这个项目的需要的依赖之前,建议先了解一下 Anaconda 或者 venv 的虚拟环境使用,推荐**使用虚拟环境来搭建该项目的开发环境**。 203 | 具体安装方式为在你搭建搭建的虚拟环境之中按照[上方安装步骤](./README.md#安装)进行安装。 204 | 205 | > 为什么推荐使用虚拟环境开发? 206 | > 207 | > 一方面是保证各种不同的开发环境之间互相不污染。 208 | > 209 | > 更重要的是在于这个项目实际上是一个 Python Package,所以在你安装之后 AutoCut 的代码实际也会变成你的环境依赖。 210 | > **因此在你更新代码之后,你需要让将新代码重新安装到环境中,然后才能调用到新的代码。** 211 | 212 | ### 开发 213 | 214 | 1. 代码风格目前遵循 PEP-8,可以使用相关的自动格式化软件完成。 215 | 2. `utils.py` 主要是全局共用的一些工具方法。 216 | 3. `transcribe.py` 是调用模型生成`srt`和`md`的部分。 217 | 4. `cut.py` 提供根据标记后`md`或`srt`进行视频剪切合并的功能。 218 | 5. `daemon.py` 提供的是监听文件夹生成字幕和剪切视频的功能。 219 | 6. 
`main.py` 声明命令行参数,根据输入参数调用对应功能。 220 | 221 | 开发过程中请尽量保证修改在正确的地方,以及合理地复用代码, 222 | 同时工具函数请尽可能放在`utils.py`中。 223 | 代码格式目前是遵循 PEP-8,变量命名尽量语义化即可。 224 | 225 | 在开发完成之后,最重要的一点是需要进行**测试**,请保证提交之前对所有**与你修改直接相关的部分**以及**你修改会影响到的部分**都进行了测试,并保证功能的正常。 226 | 目前使用 `GitHub Actions` CI, Lint 使用 black 提交前请运行 `black`。 227 | 228 | ### 提交 229 | 230 | 1. commit 信息用英文描述清楚你做了哪些修改即可,小写字母开头。 231 | 2. 最好可以保证一次的 commit 涉及的修改比较小,可以简短地描述清楚,这样也方便之后有修改时的查找。 232 | 3. PR 的时候 title 简述有哪些修改, contents 可以具体写下修改内容。 233 | 4. run test `pip install pytest` then `pytest test` 234 | 5. run lint `pip install black` then `black .` 235 | -------------------------------------------------------------------------------- /autocut.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from autocut import main 3 | 4 | if __name__ == "__main__": 5 | multiprocessing.freeze_support() 6 | main.main() 7 | -------------------------------------------------------------------------------- /autocut.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8 -*- 2 | from PyInstaller.utils.hooks import copy_metadata, collect_data_files 3 | from os import path 4 | import platform 5 | plat = platform.system().lower() 6 | 7 | datas = [] 8 | datas += collect_data_files('torch') 9 | datas += copy_metadata('tqdm') 10 | datas += copy_metadata('regex') 11 | datas += copy_metadata('requests') 12 | datas += copy_metadata('packaging') 13 | datas += copy_metadata('filelock') 14 | datas += copy_metadata('numpy') 15 | datas += copy_metadata('tokenizers') 16 | datas += copy_metadata('torch') 17 | 18 | datas += collect_data_files('transformers', include_py_files=True) 19 | 20 | datas += [(path.join( 21 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 22 | 'moviepy' 23 | ), 'moviepy')] 24 | datas += [(path.join( 25 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 26 | 'imageio_ffmpeg' 27 | ), 'imageio_ffmpeg')] 28 | datas += [(path.join( 29 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 30 | 'torchaudio' 31 | ), 'torchaudio')] 32 | datas += [(path.join( 33 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 34 | 'whisper' 35 | ), 'whisper')] 36 | datas += [(path.join( 37 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages', 38 | 'opencc' 39 | ), 'opencc')] 40 | datas += [('./snakers4_silero-vad_master', './snakers4_silero-vad_master')] 41 | if not plat == 'windows': 42 | datas += [('./build.sh', './')] 43 | 44 | block_cipher = None 45 | 46 | 47 | a = Analysis( 48 | ['autocut.py'], 49 | pathex=[], 50 | binaries=[], 51 | datas=datas, 52 | hiddenimports=[], 53 | hookspath=[], 54 | hooksconfig={}, 55 | runtime_hooks=[], 56 | excludes=[], 57 | win_no_prefer_redirects=False, 58 | win_private_assemblies=False, 59 | cipher=block_cipher, 60 | noarchive=False, 61 | ) 62 | pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) 63 | 64 | exe = EXE( 65 | pyz, 66 | a.scripts, 67 | [], 68 | exclude_binaries=True, 69 | name='autocut', 70 | debug=False, 71 | bootloader_ignore_signals=False, 72 | strip=False, 73 | upx=True, 74 | console=True, 75 | disable_windowed_traceback=False, 76 | argv_emulation=False, 77 | target_arch=None, 78 | codesign_identity=None, 79 | entitlements_file=None, 80 | ) 81 | coll = COLLECT( 82 | exe, 83 | a.binaries, 84 | 
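    # COLLECT assembles the one-folder build: the executable plus the
    # binaries, zipfiles and datas gathered above (including the bundled
    # snakers4_silero-vad_master directory) are written to dist/autocut,
    # which release.yml later zips per platform.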
a.zipfiles, 85 | a.datas, 86 | strip=False, 87 | upx=True, 88 | upx_exclude=[], 89 | name='autocut', 90 | ) 91 | -------------------------------------------------------------------------------- /autocut/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.3" 2 | -------------------------------------------------------------------------------- /autocut/__main__.py: -------------------------------------------------------------------------------- 1 | from .main import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /autocut/cut.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | 5 | import srt 6 | from moviepy import editor 7 | 8 | from . import utils 9 | 10 | 11 | # Merge videos 12 | class Merger: 13 | def __init__(self, args): 14 | self.args = args 15 | 16 | def write_md(self, videos): 17 | md = utils.MD(self.args.inputs[0], self.args.encoding) 18 | num_tasks = len(md.tasks()) 19 | # Not overwrite if already marked as down or no new videos 20 | if md.done_editing() or num_tasks == len(videos) + 1: 21 | return 22 | 23 | md.clear() 24 | md.add_done_editing(False) 25 | md.add("\nSelect the files that will be used to generate `autocut_final.mp4`\n") 26 | base = lambda fn: os.path.basename(fn) 27 | for f in videos: 28 | md_fn = utils.change_ext(f, "md") 29 | video_md = utils.MD(md_fn, self.args.encoding) 30 | # select a few words to scribe the video 31 | desc = "" 32 | if len(video_md.tasks()) > 1: 33 | for _, t in video_md.tasks()[1:]: 34 | m = re.findall(r"\] (.*)", t) 35 | if m and "no speech" not in m[0].lower(): 36 | desc += m[0] + " " 37 | if len(desc) > 50: 38 | break 39 | md.add_task( 40 | False, 41 | f'[{base(f)}]({base(md_fn)}) {"[Edited]" if video_md.done_editing() else ""} {desc}', 42 | ) 43 | md.write() 44 | 45 | def run(self): 46 | md_fn = self.args.inputs[0] 47 | md = utils.MD(md_fn, self.args.encoding) 48 | if not md.done_editing(): 49 | return 50 | 51 | videos = [] 52 | for m, t in md.tasks(): 53 | if not m: 54 | continue 55 | m = re.findall(r"\[(.*)\]", t) 56 | if not m: 57 | continue 58 | fn = os.path.join(os.path.dirname(md_fn), m[0]) 59 | logging.info(f"Loading {fn}") 60 | videos.append(editor.VideoFileClip(fn)) 61 | 62 | dur = sum([v.duration for v in videos]) 63 | logging.info(f"Merging into a video with {dur / 60:.1f} min length") 64 | 65 | merged = editor.concatenate_videoclips(videos) 66 | fn = os.path.splitext(md_fn)[0] + "_merged.mp4" 67 | merged.write_videofile( 68 | fn, audio_codec="aac", bitrate=self.args.bitrate 69 | ) # logger=None, 70 | logging.info(f"Saved merged video to {fn}") 71 | 72 | 73 | # Cut media 74 | class Cutter: 75 | def __init__(self, args): 76 | self.args = args 77 | 78 | def run(self): 79 | fns = {"srt": None, "media": None, "md": None} 80 | for fn in self.args.inputs: 81 | ext = os.path.splitext(fn)[1][1:] 82 | fns[ext if ext in fns else "media"] = fn 83 | 84 | assert fns["media"], "must provide a media filename" 85 | assert fns["srt"], "must provide a srt filename" 86 | 87 | is_video_file = utils.is_video(fns["media"].lower()) 88 | outext = "mp4" if is_video_file else "mp3" 89 | output_fn = utils.change_ext(utils.add_cut(fns["media"]), outext) 90 | if utils.check_exists(output_fn, self.args.force): 91 | return 92 | 93 | with open(fns["srt"], encoding=self.args.encoding) as f: 94 | subs = list(srt.parse(f.read())) 
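        # `subs` now holds every entry parsed from the .srt; if a markdown
        # file was also passed in, the checked tasks below decide which
        # subtitle indices survive the cut.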
95 | 96 | if fns["md"]: 97 | md = utils.MD(fns["md"], self.args.encoding) 98 | if not md.done_editing(): 99 | return 100 | index = [] 101 | for mark, sent in md.tasks(): 102 | if not mark: 103 | continue 104 | m = re.match(r"\[(\d+)", sent.strip()) 105 | if m: 106 | index.append(int(m.groups()[0])) 107 | subs = [s for s in subs if s.index in index] 108 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]} and {fns["md"]}') 109 | else: 110 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]}') 111 | 112 | segments = [] 113 | # Avoid disordered subtitles 114 | subs.sort(key=lambda x: x.start) 115 | for x in subs: 116 | if len(segments) == 0: 117 | segments.append( 118 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()} 119 | ) 120 | else: 121 | if x.start.total_seconds() - segments[-1]["end"] < 0.5: 122 | segments[-1]["end"] = x.end.total_seconds() 123 | else: 124 | segments.append( 125 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()} 126 | ) 127 | 128 | if is_video_file: 129 | media = editor.VideoFileClip(fns["media"]) 130 | else: 131 | media = editor.AudioFileClip(fns["media"]) 132 | 133 | # Add a fade between two clips. Not quite necessary. keep code here for reference 134 | # fade = 0 135 | # segments = _expand_segments(segments, fade, 0, video.duration) 136 | # clips = [video.subclip( 137 | # s['start'], s['end']).crossfadein(fade) for s in segments] 138 | # final_clip = editor.concatenate_videoclips(clips, padding = -fade) 139 | 140 | clips = [media.subclip(s["start"], s["end"]) for s in segments] 141 | if is_video_file: 142 | final_clip: editor.VideoClip = editor.concatenate_videoclips(clips) 143 | logging.info( 144 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}" 145 | ) 146 | 147 | aud = final_clip.audio.set_fps(44100) 148 | final_clip = final_clip.without_audio().set_audio(aud) 149 | final_clip = final_clip.fx(editor.afx.audio_normalize) 150 | 151 | # an alternative to birate is use crf, e.g. ffmpeg_params=['-crf', '18'] 152 | final_clip.write_videofile( 153 | output_fn, audio_codec="aac", bitrate=self.args.bitrate 154 | ) 155 | else: 156 | final_clip: editor.AudioClip = editor.concatenate_audioclips(clips) 157 | logging.info( 158 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}" 159 | ) 160 | 161 | final_clip = final_clip.fx(editor.afx.audio_normalize) 162 | final_clip.write_audiofile( 163 | output_fn, codec="libmp3lame", fps=44100, bitrate=self.args.bitrate 164 | ) 165 | 166 | media.close() 167 | logging.info(f"Saved media to {output_fn}") 168 | -------------------------------------------------------------------------------- /autocut/daemon.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import glob 3 | import logging 4 | import os 5 | import time 6 | 7 | from . 
import cut, transcribe, utils 8 | 9 | 10 | class Daemon: 11 | def __init__(self, args): 12 | self.args = args 13 | self.sleep = 1 14 | 15 | def run(self): 16 | assert len(self.args.inputs) == 1, "Must provide a single folder" 17 | while True: 18 | self._iter() 19 | time.sleep(self.sleep) 20 | self.sleep = min(60, self.sleep + 1) 21 | 22 | def _iter(self): 23 | folder = self.args.inputs[0] 24 | files = sorted(list(glob.glob(os.path.join(folder, "*")))) 25 | media_files = [f for f in files if utils.is_video(f) or utils.is_audio(f)] 26 | args = copy.deepcopy(self.args) 27 | for f in media_files: 28 | srt_fn = utils.change_ext(f, "srt") 29 | md_fn = utils.change_ext(f, "md") 30 | is_video_file = utils.is_video(f) 31 | if srt_fn not in files or md_fn not in files: 32 | args.inputs = [f] 33 | try: 34 | transcribe.Transcribe(args).run() 35 | self.sleep = 1 36 | break 37 | except RuntimeError as e: 38 | logging.warn( 39 | "Failed, may be due to the video is still on recording" 40 | ) 41 | pass 42 | if md_fn in files: 43 | if utils.add_cut(md_fn) in files: 44 | continue 45 | md = utils.MD(md_fn, self.args.encoding) 46 | ext = "mp4" if is_video_file else "mp3" 47 | if not md.done_editing() or os.path.exists( 48 | utils.change_ext(utils.add_cut(f), ext) 49 | ): 50 | continue 51 | args.inputs = [f, md_fn, srt_fn] 52 | cut.Cutter(args).run() 53 | self.sleep = 1 54 | 55 | args.inputs = [os.path.join(folder, "autocut.md")] 56 | merger = cut.Merger(args) 57 | merger.write_md(media_files) 58 | merger.run() 59 | -------------------------------------------------------------------------------- /autocut/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | from whisper.tokenizer import LANGUAGES 6 | from . 
import utils 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser( 10 | description="Edit videos based on transcribed subtitles", 11 | formatter_class=argparse.RawDescriptionHelpFormatter, 12 | ) 13 | 14 | logging.basicConfig( 15 | format="[autocut:%(filename)s:L%(lineno)d] %(levelname)-6s %(message)s" 16 | ) 17 | logging.getLogger().setLevel(logging.INFO) 18 | 19 | parser.add_argument("inputs", type=str, nargs="+", help="Inputs filenames/folders") 20 | parser.add_argument( 21 | "-t", 22 | "--transcribe", 23 | help="Transcribe videos/audio into subtitles", 24 | action=argparse.BooleanOptionalAction, 25 | ) 26 | parser.add_argument( 27 | "-c", 28 | "--cut", 29 | help="Cut a video based on subtitles", 30 | action=argparse.BooleanOptionalAction, 31 | ) 32 | parser.add_argument( 33 | "-d", 34 | "--daemon", 35 | help="Monitor a folder to transcribe and cut", 36 | action=argparse.BooleanOptionalAction, 37 | ) 38 | parser.add_argument( 39 | "-s", 40 | help="Convert .srt to a compact format for easier editing", 41 | action=argparse.BooleanOptionalAction, 42 | ) 43 | parser.add_argument( 44 | "-m", 45 | "--to-md", 46 | help="Convert .srt to .md for easier editing", 47 | action=argparse.BooleanOptionalAction, 48 | ) 49 | parser.add_argument( 50 | "--lang", 51 | type=str, 52 | default="zh", 53 | choices=LANGUAGES.keys(), 54 | help="The output language of transcription", 55 | ) 56 | parser.add_argument( 57 | "--prompt", type=str, default="", help="initial prompt feed into whisper" 58 | ) 59 | parser.add_argument( 60 | "--whisper-model", 61 | type=str, 62 | default="small", 63 | choices=["tiny", "base", "small", "medium", "large"], 64 | help="The whisper model used to transcribe.", 65 | ) 66 | parser.add_argument( 67 | "--bitrate", 68 | type=str, 69 | default="10m", 70 | help="The bitrate to export the cutted video, such as 10m, 1m, or 500k", 71 | ) 72 | parser.add_argument( 73 | "--vad", help="If or not use VAD", choices=["1", "0", "auto"], default="auto" 74 | ) 75 | parser.add_argument( 76 | "--force", 77 | help="Force write even if files exist", 78 | action=argparse.BooleanOptionalAction, 79 | ) 80 | parser.add_argument( 81 | "--encoding", type=str, default="utf-8", help="Document encoding format" 82 | ) 83 | parser.add_argument( 84 | "--device", 85 | type=str, 86 | default=None, 87 | choices=["cpu", "cuda"], 88 | help="Force to CPU or GPU for transcribing. 
In default automatically use GPU if available.", 89 | ) 90 | 91 | args = parser.parse_args() 92 | 93 | if args.transcribe: 94 | from .transcribe import Transcribe 95 | 96 | Transcribe(args).run() 97 | elif args.to_md: 98 | from .utils import trans_srt_to_md 99 | 100 | if len(args.inputs) == 2: 101 | [input_1, input_2] = args.inputs 102 | base, ext = os.path.splitext(input_1) 103 | if ext != ".srt": 104 | input_1, input_2 = input_2, input_1 105 | trans_srt_to_md(args.encoding, args.force, input_1, input_2) 106 | elif len(args.inputs) == 1: 107 | trans_srt_to_md(args.encoding, args.force, args.inputs[0]) 108 | else: 109 | logging.warn( 110 | "Wrong number of files, please pass in a .srt file or an additional video file" 111 | ) 112 | elif args.cut: 113 | from .cut import Cutter 114 | 115 | Cutter(args).run() 116 | elif args.daemon: 117 | from .daemon import Daemon 118 | 119 | Daemon(args).run() 120 | elif args.s: 121 | utils.compact_rst(args.inputs[0], args.encoding) 122 | else: 123 | logging.warning("No action, use -c, -t or -d") 124 | 125 | 126 | if __name__ == "__main__": 127 | main() 128 | -------------------------------------------------------------------------------- /autocut/transcribe.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import os 4 | import sys 5 | import time 6 | 7 | import opencc 8 | import srt 9 | import torch 10 | import whisper 11 | 12 | from tqdm import tqdm 13 | 14 | from . import utils 15 | 16 | 17 | def process(whisper_model, audio, seg, lang, prompt): 18 | r = whisper_model.transcribe( 19 | audio[int(seg["start"]) : int(seg["end"])], 20 | task="transcribe", 21 | language=lang, 22 | initial_prompt=prompt, 23 | ) 24 | r["origin_timestamp"] = seg 25 | return r 26 | 27 | 28 | class Transcribe: 29 | def __init__(self, args): 30 | self.args = args 31 | self.sampling_rate = 16000 32 | self.whisper_model = None 33 | self.vad_model = None 34 | self.detect_speech = None 35 | 36 | def run(self): 37 | for input in self.args.inputs: 38 | logging.info(f"Transcribing {input}") 39 | name, _ = os.path.splitext(input) 40 | if utils.check_exists(name + ".md", self.args.force): 41 | continue 42 | 43 | audio = whisper.load_audio(input, sr=self.sampling_rate) 44 | if ( 45 | self.args.vad == "1" 46 | or self.args.vad == "auto" 47 | and not name.endswith("_cut") 48 | ): 49 | speech_timestamps = self._detect_voice_activity(audio) 50 | else: 51 | speech_timestamps = [{"start": 0, "end": len(audio)}] 52 | transcribe_results = self._transcribe(audio, speech_timestamps) 53 | 54 | output = name + ".srt" 55 | self._save_srt(output, transcribe_results) 56 | logging.info(f"Transcribed {input} to {output}") 57 | self._save_md(name + ".md", output, input) 58 | logging.info(f'Saved texts to {name + ".md"} to mark sentences') 59 | 60 | def _detect_voice_activity(self, audio): 61 | """Detect segments that have voice activities""" 62 | tic = time.time() 63 | if self.vad_model is None or self.detect_speech is None: 64 | # torch load limit https://github.com/pytorch/vision/issues/4156 65 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True 66 | self.vad_model, funcs = torch.hub.load( 67 | repo_or_dir=os.path.join(os.path.dirname(sys.executable), "snakers4_silero-vad_master"), 68 | source="local", 69 | model="silero_vad", 70 | trust_repo=True, 71 | silero_vad_source="local", 72 | ) 73 | 74 | self.detect_speech = funcs[0] 75 | 76 | speeches = self.detect_speech( 77 | audio, self.vad_model, 
sampling_rate=self.sampling_rate 78 | ) 79 | 80 | # Remove too short segments 81 | speeches = utils.remove_short_segments(speeches, 1.0 * self.sampling_rate) 82 | 83 | # Expand to avoid to tight cut. You can tune the pad length 84 | speeches = utils.expand_segments( 85 | speeches, 0.2 * self.sampling_rate, 0.0 * self.sampling_rate, audio.shape[0] 86 | ) 87 | 88 | # Merge very closed segments 89 | speeches = utils.merge_adjacent_segments(speeches, 0.5 * self.sampling_rate) 90 | 91 | logging.info(f"Done voice activity detection in {time.time() - tic:.1f} sec") 92 | return speeches if len(speeches) > 1 else [{"start": 0, "end": len(audio)}] 93 | 94 | def _transcribe(self, audio, speech_timestamps): 95 | tic = time.time() 96 | if self.whisper_model is None: 97 | self.whisper_model = whisper.load_model( 98 | self.args.whisper_model, self.args.device 99 | ) 100 | 101 | res = [] 102 | if self.args.device == "cpu" and len(speech_timestamps) > 1: 103 | from multiprocessing import Pool 104 | 105 | pbar = tqdm(total=len(speech_timestamps)) 106 | 107 | pool = Pool(processes=4) 108 | # TODO, a better way is merging these segments into a single one, so whisper can get more context 109 | for seg in speech_timestamps: 110 | res.append( 111 | pool.apply_async( 112 | process, 113 | ( 114 | self.whisper_model, 115 | audio, 116 | seg, 117 | self.args.lang, 118 | self.args.prompt, 119 | ), 120 | callback=lambda x: pbar.update(), 121 | ) 122 | ) 123 | pool.close() 124 | pool.join() 125 | pbar.close() 126 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec") 127 | return [i.get() for i in res] 128 | else: 129 | for seg in ( 130 | speech_timestamps 131 | if len(speech_timestamps) == 1 132 | else tqdm(speech_timestamps) 133 | ): 134 | r = self.whisper_model.transcribe( 135 | audio[int(seg["start"]) : int(seg["end"])], 136 | task="transcribe", 137 | language=self.args.lang, 138 | initial_prompt=self.args.prompt, 139 | verbose=False if len(speech_timestamps) == 1 else None, 140 | ) 141 | r["origin_timestamp"] = seg 142 | res.append(r) 143 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec") 144 | return res 145 | 146 | def _save_srt(self, output, transcribe_results): 147 | subs = [] 148 | # whisper sometimes generate traditional chinese, explicitly convert 149 | cc = opencc.OpenCC("t2s") 150 | 151 | def _add_sub(start, end, text): 152 | subs.append( 153 | srt.Subtitle( 154 | index=0, 155 | start=datetime.timedelta(seconds=start), 156 | end=datetime.timedelta(seconds=end), 157 | content=cc.convert(text.strip()), 158 | ) 159 | ) 160 | 161 | prev_end = 0 162 | for r in transcribe_results: 163 | origin = r["origin_timestamp"] 164 | for s in r["segments"]: 165 | start = s["start"] + origin["start"] / self.sampling_rate 166 | end = min( 167 | s["end"] + origin["start"] / self.sampling_rate, 168 | origin["end"] / self.sampling_rate, 169 | ) 170 | if start > end: 171 | continue 172 | # mark any empty segment that is not very short 173 | if start > prev_end + 1.0: 174 | _add_sub(prev_end, start, "< No Speech >") 175 | _add_sub(start, end, s["text"]) 176 | prev_end = end 177 | 178 | with open(output, "wb") as f: 179 | f.write(srt.compose(subs).encode(self.args.encoding, "replace")) 180 | 181 | def _save_md(self, md_fn, srt_fn, video_fn): 182 | with open(srt_fn, encoding=self.args.encoding) as f: 183 | subs = srt.parse(f.read()) 184 | 185 | md = utils.MD(md_fn, self.args.encoding) 186 | md.clear() 187 | md.add_done_editing(False) 188 | md.add_video(os.path.basename(video_fn)) 189 | md.add( 190 | 
f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)})." 191 | "Mark the sentences to keep for autocut.\n" 192 | "The format is [subtitle_index,duration_in_second] subtitle context.\n\n" 193 | ) 194 | 195 | for s in subs: 196 | sec = s.start.seconds 197 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]" 198 | md.add_task(False, f"{pre:11} {s.content.strip()}") 199 | md.write() 200 | -------------------------------------------------------------------------------- /autocut/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | 5 | import srt 6 | import opencc 7 | 8 | 9 | def is_video(filename): 10 | _, ext = os.path.splitext(filename) 11 | return ext in [".mp4", ".mov", ".mkv", ".avi", ".flv", ".f4v", ".webm"] 12 | 13 | 14 | def is_audio(filename): 15 | _, ext = os.path.splitext(filename) 16 | return ext in [".ogg", ".wav", ".mp3", ".flac", ".m4a"] 17 | 18 | 19 | def change_ext(filename, new_ext): 20 | # Change the extension of filename to new_ext 21 | base, _ = os.path.splitext(filename) 22 | if not new_ext.startswith("."): 23 | new_ext = "." + new_ext 24 | return base + new_ext 25 | 26 | 27 | def add_cut(filename): 28 | # Add cut mark to the filename 29 | base, ext = os.path.splitext(filename) 30 | if base.endswith("_cut"): 31 | base = base[:-4] + "_" + base[-4:] 32 | else: 33 | base += "_cut" 34 | return base + ext 35 | 36 | 37 | # a very simple markdown parser 38 | class MD: 39 | def __init__(self, filename, encoding): 40 | self.lines = [] 41 | self.EDIT_DONE_MAKR = "<-- Mark if you are done editing." 42 | self.encoding = encoding 43 | self.filename = filename 44 | if filename: 45 | self.load_file() 46 | 47 | def load_file(self): 48 | if os.path.exists(self.filename): 49 | with open(self.filename, encoding=self.encoding) as f: 50 | self.lines = f.readlines() 51 | 52 | def clear(self): 53 | self.lines = [] 54 | 55 | def write(self): 56 | with open(self.filename, "wb") as f: 57 | f.write("\n".join(self.lines).encode(self.encoding, "replace")) 58 | 59 | def tasks(self): 60 | # get all tasks with their status 61 | ret = [] 62 | for l in self.lines: 63 | mark, task = self._parse_task_status(l) 64 | if mark is not None: 65 | ret.append((mark, task)) 66 | return ret 67 | 68 | def done_editing(self): 69 | for m, t in self.tasks(): 70 | if m and self.EDIT_DONE_MAKR in t: 71 | return True 72 | return False 73 | 74 | def add(self, line): 75 | self.lines.append(line) 76 | 77 | def add_task(self, mark, contents): 78 | self.add(f'- [{"x" if mark else " "}] {contents.strip()}') 79 | 80 | def add_done_editing(self, mark): 81 | self.add_task(mark, self.EDIT_DONE_MAKR) 82 | 83 | def add_video(self, video_fn): 84 | ext = os.path.splitext(video_fn)[1][1:] 85 | self.add( 86 | f'\n\n' 87 | ) 88 | 89 | def _parse_task_status(self, line): 90 | # return (is_marked, rest) or (None, line) if not a task 91 | m = re.match(r"- +\[([ x])\] +(.*)", line) 92 | if not m: 93 | return None, line 94 | return m.groups()[0].lower() == "x", m.groups()[1] 95 | 96 | 97 | def check_exists(output, force): 98 | if os.path.exists(output): 99 | if force: 100 | logging.info(f"{output} exists. Will overwrite it") 101 | else: 102 | logging.info( 103 | f"{output} exists, skipping... 
Use the --force flag to overwrite" 104 | ) 105 | return True 106 | return False 107 | 108 | 109 | def expand_segments(segments, expand_head, expand_tail, total_length): 110 | # Pad head and tail for each time segment 111 | results = [] 112 | for i in range(len(segments)): 113 | t = segments[i] 114 | start = max(t["start"] - expand_head, segments[i - 1]["end"] if i > 0 else 0) 115 | end = min( 116 | t["end"] + expand_tail, 117 | segments[i + 1]["start"] if i < len(segments) - 1 else total_length, 118 | ) 119 | results.append({"start": start, "end": end}) 120 | return results 121 | 122 | 123 | def remove_short_segments(segments, threshold): 124 | # Remove segments whose length < threshold 125 | return [s for s in segments if s["end"] - s["start"] > threshold] 126 | 127 | 128 | def merge_adjacent_segments(segments, threshold): 129 | # Merge two adjacent segments if their distance < threshold 130 | results = [] 131 | i = 0 132 | while i < len(segments): 133 | s = segments[i] 134 | for j in range(i + 1, len(segments)): 135 | if segments[j]["start"] < s["end"] + threshold: 136 | s["end"] = segments[j]["end"] 137 | i = j 138 | else: 139 | break 140 | i += 1 141 | results.append(s) 142 | return results 143 | 144 | 145 | def compact_rst(sub_fn, encoding): 146 | cc = opencc.OpenCC("t2s") 147 | 148 | base, ext = os.path.splitext(sub_fn) 149 | COMPACT = "_compact" 150 | if ext != ".srt": 151 | logging.fatal("only .srt file is supported") 152 | 153 | if base.endswith(COMPACT): 154 | # to original rst 155 | with open(sub_fn, encoding=encoding) as f: 156 | lines = f.readlines() 157 | subs = [] 158 | for l in lines: 159 | items = l.split(" ") 160 | if len(items) < 4: 161 | continue 162 | subs.append( 163 | srt.Subtitle( 164 | index=0, 165 | start=srt.srt_timestamp_to_timedelta(items[0]), 166 | end=srt.srt_timestamp_to_timedelta(items[2]), 167 | content=" ".join(items[3:]).strip(), 168 | ) 169 | ) 170 | with open(base[: -len(COMPACT)] + ext, "wb") as f: 171 | f.write(srt.compose(subs).encode(encoding, "replace")) 172 | else: 173 | # to a compact version 174 | with open(sub_fn, encoding=encoding) as f: 175 | subs = srt.parse(f.read()) 176 | with open(base + COMPACT + ext, "wb") as f: 177 | for s in subs: 178 | f.write( 179 | f"{srt.timedelta_to_srt_timestamp(s.start)} --> {srt.timedelta_to_srt_timestamp(s.end)} " 180 | f"{cc.convert(s.content.strip())}\n".encode(encoding, "replace") 181 | ) 182 | 183 | 184 | def trans_srt_to_md(encoding, force, srt_fn, video_fn=None): 185 | base, ext = os.path.splitext(srt_fn) 186 | if ext != ".srt": 187 | logging.fatal("only .srt file is supported") 188 | md_fn = base + ext.split(".")[0] + ".md" 189 | 190 | check_exists(md_fn, force) 191 | 192 | with open(srt_fn, encoding=encoding) as f: 193 | subs = srt.parse(f.read()) 194 | 195 | md = MD(md_fn, encoding) 196 | md.clear() 197 | md.add_done_editing(False) 198 | if video_fn: 199 | if not is_video(video_fn): 200 | logging.fatal(f"{video_fn} may not be a video") 201 | md.add_video(os.path.basename(video_fn)) 202 | md.add( 203 | f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)})." 
204 | "Mark the sentences to keep for autocut.\n" 205 | "The format is [subtitle_index,duration_in_second] subtitle context.\n\n" 206 | ) 207 | 208 | for s in subs: 209 | sec = s.start.seconds 210 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]" 211 | md.add_task(False, f"{pre:11} {s.content.strip()}") 212 | md.write() 213 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | # run this script after pyinstaller 2 | # see https://github.com/pyinstaller/pyinstaller/issues/7582#issuecomment-1515434457 3 | 4 | rm -f libtorch* 5 | ln -s torch/lib/libtorch.dylib . 6 | ln -s torch/lib/libtorch_cpu.dylib . 7 | ln -s torch/lib/libtorch_python.dylib . 8 | 9 | ln -s torchaudio/lib/libtorchaudio.so . 10 | 11 | install_name_tool -add_rpath @loader_path/../.. torchaudio/lib/libtorchaudio.so 12 | -------------------------------------------------------------------------------- /imgs/typora.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/imgs/typora.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | altgraph==0.17.3 2 | attrs==22.1.0 3 | black==22.12.0 4 | certifi==2022.12.7 5 | charset-normalizer==2.1.1 6 | click==8.1.3 7 | colorama==0.4.6 8 | decorator==4.4.2 9 | exceptiongroup==1.0.4 10 | ffmpeg-python==0.2.0 11 | filelock==3.8.2 12 | future==0.18.2 13 | huggingface-hub==0.11.1 14 | idna==3.4 15 | imageio==2.22.4 16 | imageio-ffmpeg==0.4.7 17 | iniconfig==1.1.1 18 | more-itertools==9.0.0 19 | moviepy==1.0.3 20 | mypy-extensions==0.4.3 21 | numpy==1.23.5 22 | opencc-python-reimplemented==0.1.6 23 | packaging==22.0 24 | parameterized==0.8.1 25 | pathspec==0.10.3 26 | pefile==2022.5.30 27 | Pillow==9.3.0 28 | platformdirs==2.6.0 29 | pluggy==1.0.0 30 | proglog==0.1.10 31 | pyinstaller==5.7.0 32 | pyinstaller-hooks-contrib==2022.14 33 | pyparsing==3.0.9 34 | pytest==7.2.0 35 | pywin32-ctypes==0.2.0 36 | PyYAML==6.0 37 | regex==2022.10.31 38 | requests==2.28.1 39 | six==1.16.0 40 | srt==3.5.2 41 | tokenizers==0.13.2 42 | tomli==2.0.1 43 | torch==1.13.0 44 | torchaudio==0.13.0 45 | tqdm==4.64.1 46 | transformers==4.25.1 47 | typing_extensions==4.4.0 48 | urllib3==1.26.13 49 | whisper @ git+https://github.com/openai/whisper.git@02aa851a4910201f0db56960064d7e121a01002c 50 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = autocut 3 | version = attr: autocut.__version__ 4 | license = Apache Software License 5 | description = Cut video by subtitles 6 | long_description = file: README.md 7 | classifiers = 8 | License :: OSI Approved :: Apache Software License 9 | Operating System :: OS Independent 10 | Programming Language :: Python :: 3 11 | 12 | [options] 13 | packages = find: 14 | include_package_data = True 15 | python_requires = >= 3.9 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | requirements = [ 4 | "srt", 5 | "moviepy", 6 | "opencc-python-reimplemented", 7 | "torchaudio", 8 | 
"parameterized", 9 | "openai-whisper", 10 | "tqdm", 11 | ] 12 | 13 | 14 | setup( 15 | name="autocut", 16 | install_requires=requirements, 17 | packages=find_packages(), 18 | entry_points={ 19 | "console_scripts": [ 20 | "autocut = autocut.main:main", 21 | ] 22 | }, 23 | ) 24 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: Bug report - [X] 5 | labels: bug 6 | assignees: snakers4 7 | 8 | --- 9 | 10 | ## 🐛 Bug 11 | 12 | 13 | 14 | ## To Reproduce 15 | 16 | Steps to reproduce the behavior: 17 | 18 | 1. 19 | 2. 20 | 3. 21 | 22 | 23 | 24 | ## Expected behavior 25 | 26 | 27 | 28 | ## Environment 29 | 30 | Please copy and paste the output from this 31 | [environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py) 32 | (or fill out the checklist below manually). 33 | 34 | You can get the script and run it with: 35 | ``` 36 | wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py 37 | # For security purposes, please check the contents of collect_env.py before running it. 38 | python collect_env.py 39 | ``` 40 | 41 | - PyTorch Version (e.g., 1.0): 42 | - OS (e.g., Linux): 43 | - How you installed PyTorch (`conda`, `pip`, source): 44 | - Build command you used (if compiling from source): 45 | - Python version: 46 | - CUDA/cuDNN version: 47 | - GPU models and configuration: 48 | - Any other relevant information: 49 | 50 | ## Additional context 51 | 52 | 53 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: Feature request - [X] 5 | labels: enhancement 6 | assignees: snakers4 7 | 8 | --- 9 | 10 | ## 🚀 Feature 11 | 12 | 13 | ## Motivation 14 | 15 | 16 | 17 | ## Pitch 18 | 19 | 20 | 21 | ## Alternatives 22 | 23 | 24 | 25 | ## Additional context 26 | 27 | 28 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/questions---help---support.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Questions / Help / Support 3 | about: Ask for help, support or ask a question 4 | title: "❓ Questions / Help / Support" 5 | labels: help wanted 6 | assignees: snakers4 7 | 8 | --- 9 | 10 | ## ❓ Questions and Help 11 | 12 | We have a [wiki](https://github.com/snakers4/silero-models/wiki) available for our users. Please make sure you have checked it out first. 
13 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at aveysov@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-present Silero Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /snakers4_silero-vad_master/README.md: -------------------------------------------------------------------------------- 1 | [![Mailing list : test](http://img.shields.io/badge/Email-gray.svg?style=for-the-badge&logo=gmail)](mailto:hello@silero.ai) [![Mailing list : test](http://img.shields.io/badge/Telegram-blue.svg?style=for-the-badge&logo=telegram)](https://t.me/silero_speech) [![License: CC BY-NC 4.0](https://img.shields.io/badge/License-MIT-lightgrey.svg?style=for-the-badge)](https://github.com/snakers4/silero-vad/blob/master/LICENSE) 2 | 3 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb) 4 | 5 | ![header](https://user-images.githubusercontent.com/12515440/89997349-b3523080-dc94-11ea-9906-ca2e8bc50535.png) 6 | 7 |
8 |

Silero VAD

9 |
10 | 11 | **Silero VAD** - pre-trained enterprise-grade [Voice Activity Detector](https://en.wikipedia.org/wiki/Voice_activity_detection) (also see our [STT models](https://github.com/snakers4/silero-models)). 12 | 13 | This repository also includes Number Detector and Language classifier [models](https://github.com/snakers4/silero-vad/wiki/Other-Models). 14 | 15 |
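For orientation, the typical way the detector is consumed (mirrored later in this listing by `hubconf.py` and `silero-vad.ipynb`) is to load the model and its helper utilities through `torch.hub`. A minimal sketch follows; the `en_example.wav` file name comes from the bundled notebook, and the exact keyword names of the helpers may differ between versions:

```python
# Minimal sketch: load Silero VAD via torch.hub and detect speech segments.
import torch

torch.set_num_threads(1)  # the bundled examples pin inference to one thread

model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    force_reload=True,
)
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

SAMPLING_RATE = 16000
wav = read_audio("en_example.wav", sampling_rate=SAMPLING_RATE)
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)
print(speech_timestamps)  # list of {'start': ..., 'end': ...} sample offsets
```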
16 | 17 |

18 | 19 |

20 | 21 |
22 | Real Time Example 23 | 24 | https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-9be7-004c891dd481.mp4 25 | 26 |
27 | 28 |
29 |

Key Features

30 |
31 | 32 | - **Stellar accuracy** 33 | 34 | Silero VAD has [excellent results](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#vs-other-available-solutions) on speech detection tasks. 35 | 36 | - **Fast** 37 | 38 | One audio chunk (30+ ms) [takes](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics#silero-vad-performance-metrics) less than **1 ms** to process on a single CPU thread. Using batching or a GPU can improve performance considerably, and under certain conditions ONNX may run up to 4-5x faster. 39 | 40 | - **Lightweight** 41 | 42 | The JIT model is around one megabyte in size. 43 | 44 | - **General** 45 | 46 | Silero VAD was trained on huge corpora covering over **100** languages, and it performs well on audio from different domains with varying levels of background noise and quality. 47 | 48 | - **Flexible sampling rate** 49 | 50 | Silero VAD [supports](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#sample-rate-comparison) **8000 Hz** and **16000 Hz** [sampling rates](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate). 51 | 52 | - **Flexible chunk size** 53 | 54 | The model was trained on **30 ms** chunks. Longer chunks are supported directly; other sizes may work as well. 55 | 56 | - **Highly Portable** 57 | 58 | Silero VAD reaps the benefits of the rich ecosystems built around **PyTorch** and **ONNX**, running everywhere these runtimes are available. 59 | 60 | - **No Strings Attached** 61 | 62 | Published under a permissive license (MIT), Silero VAD has zero strings attached: no telemetry, no registration, no built-in expiration, and no keys or vendor lock. 63 |
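The **Fast** and **Flexible chunk size** points above are easiest to see in the chunk-by-chunk usage that the bundled example notebooks (`colab_record_example.ipynb` and the pyaudio streaming example) rely on: feed fixed-size windows to the model and read back one speech probability per window. A minimal sketch, assuming `model` was loaded via `torch.hub` as shown earlier and `wav` is a 1-D 16 kHz float tensor:

```python
# Per-chunk speech probabilities, following the pattern in the bundled notebooks:
# 512-sample windows at 16 kHz (~32 ms) are fed to the model one at a time.
import torch

SAMPLING_RATE = 16000
WINDOW_SIZE_SAMPLES = 512


def chunk_probabilities(model, wav: torch.Tensor) -> list:
    probs = []
    for i in range(0, len(wav), WINDOW_SIZE_SAMPLES):
        chunk = wav[i : i + WINDOW_SIZE_SAMPLES]
        if len(chunk) < WINDOW_SIZE_SAMPLES:
            break  # drop the trailing partial window, as the notebooks do
        probs.append(model(chunk, SAMPLING_RATE).item())
    model.reset_states()  # reset internal state before processing new audio
    return probs
```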
65 |

Typical Use Cases

66 |
67 | 68 | - Voice activity detection for IoT / edge / mobile use cases 69 | - Data cleaning and preparation, voice detection in general 70 | - Telephony and call-center automation, voice bots 71 | - Voice interfaces 72 | 73 |
74 |

Links

75 |
76 | 77 | 78 | - [Examples and Dependencies](https://github.com/snakers4/silero-vad/wiki/Examples-and-Dependencies#dependencies) 79 | - [Quality Metrics](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics) 80 | - [Performance Metrics](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics) 81 | - [Number Detector and Language classifier models](https://github.com/snakers4/silero-vad/wiki/Other-Models) 82 | - [Versions and Available Models](https://github.com/snakers4/silero-vad/wiki/Version-history-and-Available-Models) 83 | - [Further reading](https://github.com/snakers4/silero-models#further-reading) 84 | - [FAQ](https://github.com/snakers4/silero-vad/wiki/FAQ) 85 | 86 |
87 |

Get In Touch

88 |
89 | 90 | Try our models, create an [issue](https://github.com/snakers4/silero-vad/issues/new), start a [discussion](https://github.com/snakers4/silero-vad/discussions/new), join our telegram [chat](https://t.me/silero_speech), [email](mailto:hello@silero.ai) us, read our [news](https://t.me/silero_news). 91 | 92 | Please see our [wiki](https://github.com/snakers4/silero-models/wiki) and [tiers](https://github.com/snakers4/silero-models/wiki/Licensing-and-Tiers) for relevant information and [email](mailto:hello@silero.ai) us directly. 93 | 94 | **Citations** 95 | 96 | ``` 97 | @misc{Silero VAD, 98 | author = {Silero Team}, 99 | title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier}, 100 | year = {2021}, 101 | publisher = {GitHub}, 102 | journal = {GitHub repository}, 103 | howpublished = {\url{https://github.com/snakers4/silero-vad}}, 104 | commit = {insert_some_commit_here}, 105 | email = {hello@silero.ai} 106 | } 107 | ``` 108 | 109 |
110 |

Examples and VAD-based Community Apps

111 |
112 | 113 | - Example of VAD ONNX Runtime model usage in [C++](https://github.com/snakers4/silero-vad/tree/master/examples/cpp) 114 | 115 | - Voice activity detection for the [browser](https://github.com/ricky0123/vad) using ONNX Runtime Web 116 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/colab_record_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "bccAucKjnPHm" 7 | }, 8 | "source": [ 9 | "### Dependencies and inputs" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "id": "cSih95WFmwgi" 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "!pip -q install pydub\n", 21 | "from google.colab import output\n", 22 | "from base64 import b64decode, b64encode\n", 23 | "from io import BytesIO\n", 24 | "import numpy as np\n", 25 | "from pydub import AudioSegment\n", 26 | "from IPython.display import HTML, display\n", 27 | "import torch\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "import moviepy.editor as mpe\n", 30 | "from matplotlib.animation import FuncAnimation, FFMpegWriter\n", 31 | "import matplotlib\n", 32 | "matplotlib.use('Agg')\n", 33 | "\n", 34 | "torch.set_num_threads(1)\n", 35 | "\n", 36 | "model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 37 | " model='silero_vad',\n", 38 | " force_reload=True)\n", 39 | "\n", 40 | "def int2float(sound):\n", 41 | " abs_max = np.abs(sound).max()\n", 42 | " sound = sound.astype('float32')\n", 43 | " if abs_max > 0:\n", 44 | " sound *= 1/abs_max\n", 45 | " sound = sound.squeeze()\n", 46 | " return sound\n", 47 | "\n", 48 | "AUDIO_HTML = \"\"\"\n", 49 | "\n", 126 | "\"\"\"\n", 127 | "\n", 128 | "def record(sec=10):\n", 129 | " display(HTML(AUDIO_HTML))\n", 130 | " s = output.eval_js(\"data\")\n", 131 | " b = b64decode(s.split(',')[1])\n", 132 | " audio = AudioSegment.from_file(BytesIO(b))\n", 133 | " audio.export('test.mp3', format='mp3')\n", 134 | " audio = audio.set_channels(1)\n", 135 | " audio = audio.set_frame_rate(16000)\n", 136 | " audio_float = int2float(np.array(audio.get_array_of_samples()))\n", 137 | " audio_tens = torch.tensor(audio_float )\n", 138 | " return audio_tens\n", 139 | "\n", 140 | "def make_animation(probs, audio_duration, interval=40):\n", 141 | " fig = plt.figure(figsize=(16, 9))\n", 142 | " ax = plt.axes(xlim=(0, audio_duration), ylim=(0, 1.02))\n", 143 | " line, = ax.plot([], [], lw=2)\n", 144 | " x = [i / 16000 * 512 for i in range(len(probs))]\n", 145 | " plt.xlabel('Time, seconds', fontsize=16)\n", 146 | " plt.ylabel('Speech Probability', fontsize=16)\n", 147 | "\n", 148 | " def init():\n", 149 | " plt.fill_between(x, probs, color='#064273')\n", 150 | " line.set_data([], [])\n", 151 | " line.set_color('#990000')\n", 152 | " return line,\n", 153 | "\n", 154 | " def animate(i):\n", 155 | " x = i * interval / 1000 - 0.04\n", 156 | " y = np.linspace(0, 1.02, 2)\n", 157 | " \n", 158 | " line.set_data(x, y)\n", 159 | " line.set_color('#990000')\n", 160 | " return line,\n", 161 | "\n", 162 | " anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n", 163 | "\n", 164 | " f = r\"animation.mp4\" \n", 165 | " writervideo = FFMpegWriter(fps=1000/interval) \n", 166 | " anim.save(f, writer=writervideo)\n", 167 | " plt.close('all')\n", 168 | "\n", 169 | "def combine_audio(vidname, audname, outname, 
fps=25): \n", 170 | " my_clip = mpe.VideoFileClip(vidname, verbose=False)\n", 171 | " audio_background = mpe.AudioFileClip(audname)\n", 172 | " final_clip = my_clip.set_audio(audio_background)\n", 173 | " final_clip.write_videofile(outname,fps=fps,verbose=False)\n", 174 | "\n", 175 | "def record_make_animation():\n", 176 | " tensor = record()\n", 177 | "\n", 178 | " print('Calculating probabilities...')\n", 179 | " speech_probs = []\n", 180 | " window_size_samples = 512\n", 181 | " for i in range(0, len(tensor), window_size_samples):\n", 182 | " if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n", 183 | " break\n", 184 | " speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n", 185 | " speech_probs.append(speech_prob)\n", 186 | " model.reset_states()\n", 187 | " print('Making animation...')\n", 188 | " make_animation(speech_probs, len(tensor) / 16000)\n", 189 | "\n", 190 | " print('Merging your voice with animation...')\n", 191 | " combine_audio('animation.mp4', 'test.mp3', 'merged.mp4')\n", 192 | " print('Done!')\n", 193 | " mp4 = open('merged.mp4','rb').read()\n", 194 | " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", 195 | " display(HTML(\"\"\"\n", 196 | " \n", 199 | " \"\"\" % data_url))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "id": "IFVs3GvTnpB1" 206 | }, 207 | "source": [ 208 | "## Record example" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "id": "5EBjrTwiqAaQ" 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "record_make_animation()" 220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "colab": { 225 | "collapsed_sections": [ 226 | "bccAucKjnPHm" 227 | ], 228 | "name": "Untitled2.ipynb", 229 | "provenance": [] 230 | }, 231 | "kernelspec": { 232 | "display_name": "Python 3", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "name": "python" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 0 241 | } 242 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/cpp/README.md: -------------------------------------------------------------------------------- 1 | # Stream example in C++ 2 | 3 | Here's a simple example of the vad model in c++ onnxruntime. 4 | 5 | 6 | 7 | ## Requirements 8 | 9 | Code are tested in the environments bellow, feel free to try others. 10 | 11 | - WSL2 + Debian-bullseye (docker) 12 | - gcc 12.2.0 13 | - onnxruntime-linux-x64-1.12.1 14 | 15 | 16 | 17 | ## Usage 18 | 19 | 1. Install gcc 12.2.0, or just pull the docker image with `docker pull gcc:12.2.0-bullseye` 20 | 21 | 2. Install onnxruntime-linux-x64-1.12.1 22 | 23 | - Download lib onnxruntime: 24 | 25 | `wget https://github.com/microsoft/onnxruntime/releases/download/v1.12.1/onnxruntime-linux-x64-1.12.1.tgz` 26 | 27 | - Unzip. Assume the path is `/root/onnxruntime-linux-x64-1.12.1` 28 | 29 | 3. Modify wav path & Test configs in main function 30 | 31 | `wav::WavReader wav_reader("${path_to_your_wav_file}");` 32 | 33 | test sample rate, frame per ms, threshold... 34 | 35 | 4. 
Build with gcc and run 36 | 37 | ```bash 38 | # Build 39 | g++ silero-vad-onnx.cpp -I /root/onnxruntime-linux-x64-1.12.1/include/ -L /root/onnxruntime-linux-x64-1.12.1/lib/ -lonnxruntime -Wl,-rpath,/root/onnxruntime-linux-x64-1.12.1/lib/ -o test 40 | 41 | # Run 42 | ./test 43 | ``` -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/cpp/silero-vad-onnx.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "onnxruntime_cxx_api.h" 8 | #include "wav.h" 9 | 10 | class VadIterator 11 | { 12 | // OnnxRuntime resources 13 | Ort::Env env; 14 | Ort::SessionOptions session_options; 15 | std::shared_ptr session = nullptr; 16 | Ort::AllocatorWithDefaultOptions allocator; 17 | Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU); 18 | 19 | public: 20 | void init_engine_threads(int inter_threads, int intra_threads) 21 | { 22 | // The method should be called in each thread/proc in multi-thread/proc work 23 | session_options.SetIntraOpNumThreads(intra_threads); 24 | session_options.SetInterOpNumThreads(inter_threads); 25 | session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); 26 | } 27 | 28 | void init_onnx_model(const std::string &model_path) 29 | { 30 | // Init threads = 1 for 31 | init_engine_threads(1, 1); 32 | // Load model 33 | session = std::make_shared(env, model_path.c_str(), session_options); 34 | } 35 | 36 | void reset_states() 37 | { 38 | // Call reset before each audio start 39 | std::memset(_h.data(), 0.0f, _h.size() * sizeof(float)); 40 | std::memset(_c.data(), 0.0f, _c.size() * sizeof(float)); 41 | triggerd = false; 42 | temp_end = 0; 43 | current_sample = 0; 44 | } 45 | 46 | // Call it in predict func. if you prefer raw bytes input. 
47 | void bytes_to_float_tensor(const char *pcm_bytes) 48 | { 49 | std::memcpy(input.data(), pcm_bytes, window_size_samples * sizeof(int16_t)); 50 | for (int i = 0; i < window_size_samples; i++) 51 | { 52 | input[i] = static_cast(input[i]) / 32768; // int16_t normalized to float 53 | } 54 | } 55 | 56 | 57 | void predict(const std::vector &data) 58 | { 59 | // bytes_to_float_tensor(data); 60 | 61 | // Infer 62 | // Create ort tensors 63 | input.assign(data.begin(), data.end()); 64 | Ort::Value input_ort = Ort::Value::CreateTensor( 65 | memory_info, input.data(), input.size(), input_node_dims, 2); 66 | Ort::Value sr_ort = Ort::Value::CreateTensor( 67 | memory_info, sr.data(), sr.size(), sr_node_dims, 1); 68 | Ort::Value h_ort = Ort::Value::CreateTensor( 69 | memory_info, _h.data(), _h.size(), hc_node_dims, 3); 70 | Ort::Value c_ort = Ort::Value::CreateTensor( 71 | memory_info, _c.data(), _c.size(), hc_node_dims, 3); 72 | 73 | // Clear and add inputs 74 | ort_inputs.clear(); 75 | ort_inputs.emplace_back(std::move(input_ort)); 76 | ort_inputs.emplace_back(std::move(sr_ort)); 77 | ort_inputs.emplace_back(std::move(h_ort)); 78 | ort_inputs.emplace_back(std::move(c_ort)); 79 | 80 | // Infer 81 | ort_outputs = session->Run( 82 | Ort::RunOptions{nullptr}, 83 | input_node_names.data(), ort_inputs.data(), ort_inputs.size(), 84 | output_node_names.data(), output_node_names.size()); 85 | 86 | // Output probability & update h,c recursively 87 | float output = ort_outputs[0].GetTensorMutableData()[0]; 88 | float *hn = ort_outputs[1].GetTensorMutableData(); 89 | std::memcpy(_h.data(), hn, size_hc * sizeof(float)); 90 | float *cn = ort_outputs[2].GetTensorMutableData(); 91 | std::memcpy(_c.data(), cn, size_hc * sizeof(float)); 92 | 93 | // Push forward sample index 94 | current_sample += window_size_samples; 95 | 96 | // Reset temp_end when > threshold 97 | if ((output >= threshold) && (temp_end != 0)) 98 | { 99 | temp_end = 0; 100 | } 101 | // 1) Silence 102 | if ((output < threshold) && (triggerd == false)) 103 | { 104 | // printf("{ silence: %.3f s }\n", 1.0 * current_sample / sample_rate); 105 | } 106 | // 2) Speaking 107 | if ((output >= (threshold - 0.15)) && (triggerd == true)) 108 | { 109 | // printf("{ speaking_2: %.3f s }\n", 1.0 * current_sample / sample_rate); 110 | } 111 | 112 | // 3) Start 113 | if ((output >= threshold) && (triggerd == false)) 114 | { 115 | triggerd = true; 116 | speech_start = current_sample - window_size_samples - speech_pad_samples; // minus window_size_samples to get precise start time point. 117 | printf("{ start: %.3f s }\n", 1.0 * speech_start / sample_rate); 118 | } 119 | 120 | // 4) End 121 | if ((output < (threshold - 0.15)) && (triggerd == true)) 122 | { 123 | 124 | if (temp_end != 0) 125 | { 126 | temp_end = current_sample; 127 | } 128 | // a. silence < min_slience_samples, continue speaking 129 | if ((current_sample - temp_end) < min_silence_samples) 130 | { 131 | // printf("{ speaking_4: %.3f s }\n", 1.0 * current_sample / sample_rate); 132 | // printf(""); 133 | } 134 | // b. silence >= min_slience_samples, end speaking 135 | else 136 | { 137 | speech_end = current_sample + speech_pad_samples; 138 | temp_end = 0; 139 | triggerd = false; 140 | printf("{ end: %.3f s }\n", 1.0 * speech_end / sample_rate); 141 | } 142 | } 143 | 144 | 145 | } 146 | 147 | private: 148 | // model config 149 | int64_t window_size_samples; // Assign when init, support 256 512 768 for 8k; 512 1024 1536 for 16k. 
150 | int sample_rate; 151 | int sr_per_ms; // Assign when init, support 8 or 16 152 | float threshold; 153 | int min_silence_samples; // sr_per_ms * #ms 154 | int speech_pad_samples; // usually a 155 | 156 | // model states 157 | bool triggerd = false; 158 | unsigned int speech_start = 0; 159 | unsigned int speech_end = 0; 160 | unsigned int temp_end = 0; 161 | unsigned int current_sample = 0; 162 | // MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes 163 | float output; 164 | 165 | // Onnx model 166 | // Inputs 167 | std::vector ort_inputs; 168 | 169 | std::vector input_node_names = {"input", "sr", "h", "c"}; 170 | std::vector input; 171 | std::vector sr; 172 | unsigned int size_hc = 2 * 1 * 64; // It's FIXED. 173 | std::vector _h; 174 | std::vector _c; 175 | 176 | int64_t input_node_dims[2] = {}; 177 | const int64_t sr_node_dims[1] = {1}; 178 | const int64_t hc_node_dims[3] = {2, 1, 64}; 179 | 180 | // Outputs 181 | std::vector ort_outputs; 182 | std::vector output_node_names = {"output", "hn", "cn"}; 183 | 184 | 185 | public: 186 | // Construction 187 | VadIterator(const std::string ModelPath, 188 | int Sample_rate, int frame_size, 189 | float Threshold, int min_silence_duration_ms, int speech_pad_ms) 190 | { 191 | init_onnx_model(ModelPath); 192 | sample_rate = Sample_rate; 193 | sr_per_ms = sample_rate / 1000; 194 | threshold = Threshold; 195 | min_silence_samples = sr_per_ms * min_silence_duration_ms; 196 | speech_pad_samples = sr_per_ms * speech_pad_ms; 197 | window_size_samples = frame_size * sr_per_ms; 198 | 199 | input.resize(window_size_samples); 200 | input_node_dims[0] = 1; 201 | input_node_dims[1] = window_size_samples; 202 | // std::cout << "== Input size" << input.size() << std::endl; 203 | _h.resize(size_hc); 204 | _c.resize(size_hc); 205 | sr.resize(1); 206 | } 207 | 208 | }; 209 | 210 | int main() 211 | { 212 | 213 | // Read wav 214 | wav::WavReader wav_reader("./test_for_vad.wav"); 215 | std::vector data(wav_reader.num_samples()); 216 | std::vector input_wav(wav_reader.num_samples()); 217 | 218 | for (int i = 0; i < wav_reader.num_samples(); i++) 219 | { 220 | data[i] = static_cast(*(wav_reader.data() + i)); 221 | } 222 | 223 | for (int i = 0; i < wav_reader.num_samples(); i++) 224 | { 225 | input_wav[i] = static_cast(data[i]) / 32768; 226 | } 227 | 228 | // ===== Test configs ===== 229 | std::string path = "../files/silero_vad.onnx"; 230 | int test_sr = 8000; 231 | int test_frame_ms = 64; 232 | float test_threshold = 0.5f; 233 | int test_min_silence_duration_ms = 0; 234 | int test_speech_pad_ms = 0; 235 | int test_window_samples = test_frame_ms * (test_sr/1000); 236 | 237 | VadIterator vad( 238 | path, test_sr, test_frame_ms, test_threshold, 239 | test_min_silence_duration_ms, test_speech_pad_ms); 240 | 241 | for (int j = 0; j < wav_reader.num_samples(); j += test_window_samples) 242 | { 243 | // std::cout << "== 4" << std::endl; 244 | std::vector r{&input_wav[0] + j, &input_wav[0] + j + test_window_samples}; 245 | auto start = std::chrono::high_resolution_clock::now(); 246 | // Predict and print throughout process time 247 | vad.predict(r); 248 | auto end = std::chrono::high_resolution_clock::now(); 249 | auto elapsed_time = std::chrono::duration_cast(end-start); 250 | // std::cout << "== Elapsed time: " << 1.0*elapsed_time.count()/1000000 << "ms" << " ==" < 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | // #include "utils/log.h" 28 | 29 | namespace wav { 30 | 31 | struct WavHeader { 32 | char riff[4]; // "riff" 
33 | unsigned int size; 34 | char wav[4]; // "WAVE" 35 | char fmt[4]; // "fmt " 36 | unsigned int fmt_size; 37 | uint16_t format; 38 | uint16_t channels; 39 | unsigned int sample_rate; 40 | unsigned int bytes_per_second; 41 | uint16_t block_size; 42 | uint16_t bit; 43 | char data[4]; // "data" 44 | unsigned int data_size; 45 | }; 46 | 47 | class WavReader { 48 | public: 49 | WavReader() : data_(nullptr) {} 50 | explicit WavReader(const std::string& filename) { Open(filename); } 51 | 52 | bool Open(const std::string& filename) { 53 | FILE* fp = fopen(filename.c_str(), "rb"); //文件读取 54 | if (NULL == fp) { 55 | std::cout << "Error in read " << filename; 56 | return false; 57 | } 58 | 59 | WavHeader header; 60 | fread(&header, 1, sizeof(header), fp); 61 | if (header.fmt_size < 16) { 62 | fprintf(stderr, 63 | "WaveData: expect PCM format data " 64 | "to have fmt chunk of at least size 16.\n"); 65 | return false; 66 | } else if (header.fmt_size > 16) { 67 | int offset = 44 - 8 + header.fmt_size - 16; 68 | fseek(fp, offset, SEEK_SET); 69 | fread(header.data, 8, sizeof(char), fp); 70 | } 71 | // check "riff" "WAVE" "fmt " "data" 72 | 73 | // Skip any sub-chunks between "fmt" and "data". Usually there will 74 | // be a single "fact" sub chunk, but on Windows there can also be a 75 | // "list" sub chunk. 76 | while (0 != strncmp(header.data, "data", 4)) { 77 | // We will just ignore the data in these chunks. 78 | fseek(fp, header.data_size, SEEK_CUR); 79 | // read next sub chunk 80 | fread(header.data, 8, sizeof(char), fp); 81 | } 82 | 83 | num_channel_ = header.channels; 84 | sample_rate_ = header.sample_rate; 85 | bits_per_sample_ = header.bit; 86 | int num_data = header.data_size / (bits_per_sample_ / 8); 87 | data_ = new float[num_data]; // Create 1-dim array 88 | num_samples_ = num_data / num_channel_; 89 | 90 | for (int i = 0; i < num_data; ++i) { 91 | switch (bits_per_sample_) { 92 | case 8: { 93 | char sample; 94 | fread(&sample, 1, sizeof(char), fp); 95 | data_[i] = static_cast(sample); 96 | break; 97 | } 98 | case 16: { 99 | int16_t sample; 100 | fread(&sample, 1, sizeof(int16_t), fp); 101 | // std::cout << sample; 102 | data_[i] = static_cast(sample); 103 | // std::cout << data_[i]; 104 | break; 105 | } 106 | case 32: { 107 | int sample; 108 | fread(&sample, 1, sizeof(int), fp); 109 | data_[i] = static_cast(sample); 110 | break; 111 | } 112 | default: 113 | fprintf(stderr, "unsupported quantization bits"); 114 | exit(1); 115 | } 116 | } 117 | fclose(fp); 118 | return true; 119 | } 120 | 121 | int num_channel() const { return num_channel_; } 122 | int sample_rate() const { return sample_rate_; } 123 | int bits_per_sample() const { return bits_per_sample_; } 124 | int num_samples() const { return num_samples_; } 125 | 126 | ~WavReader() { 127 | delete[] data_; 128 | } 129 | 130 | const float* data() const { return data_; } 131 | 132 | private: 133 | int num_channel_; 134 | int sample_rate_; 135 | int bits_per_sample_; 136 | int num_samples_; // sample points per channel 137 | float* data_; 138 | }; 139 | 140 | class WavWriter { 141 | public: 142 | WavWriter(const float* data, int num_samples, int num_channel, 143 | int sample_rate, int bits_per_sample) 144 | : data_(data), 145 | num_samples_(num_samples), 146 | num_channel_(num_channel), 147 | sample_rate_(sample_rate), 148 | bits_per_sample_(bits_per_sample) {} 149 | 150 | void Write(const std::string& filename) { 151 | FILE* fp = fopen(filename.c_str(), "w"); 152 | // init char 'riff' 'WAVE' 'fmt ' 'data' 153 | WavHeader header; 154 | 
char wav_header[44] = {0x52, 0x49, 0x46, 0x46, 0x00, 0x00, 0x00, 0x00, 0x57, 155 | 0x41, 0x56, 0x45, 0x66, 0x6d, 0x74, 0x20, 0x10, 0x00, 156 | 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 157 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 158 | 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00}; 159 | memcpy(&header, wav_header, sizeof(header)); 160 | header.channels = num_channel_; 161 | header.bit = bits_per_sample_; 162 | header.sample_rate = sample_rate_; 163 | header.data_size = num_samples_ * num_channel_ * (bits_per_sample_ / 8); 164 | header.size = sizeof(header) - 8 + header.data_size; 165 | header.bytes_per_second = 166 | sample_rate_ * num_channel_ * (bits_per_sample_ / 8); 167 | header.block_size = num_channel_ * (bits_per_sample_ / 8); 168 | 169 | fwrite(&header, 1, sizeof(header), fp); 170 | 171 | for (int i = 0; i < num_samples_; ++i) { 172 | for (int j = 0; j < num_channel_; ++j) { 173 | switch (bits_per_sample_) { 174 | case 8: { 175 | char sample = static_cast(data_[i * num_channel_ + j]); 176 | fwrite(&sample, 1, sizeof(sample), fp); 177 | break; 178 | } 179 | case 16: { 180 | int16_t sample = static_cast(data_[i * num_channel_ + j]); 181 | fwrite(&sample, 1, sizeof(sample), fp); 182 | break; 183 | } 184 | case 32: { 185 | int sample = static_cast(data_[i * num_channel_ + j]); 186 | fwrite(&sample, 1, sizeof(sample), fp); 187 | break; 188 | } 189 | } 190 | } 191 | } 192 | fclose(fp); 193 | } 194 | 195 | private: 196 | const float* data_; 197 | int num_samples_; // total float points in data_ 198 | int num_channel_; 199 | int sample_rate_; 200 | int bits_per_sample_; 201 | }; 202 | 203 | } // namespace wenet 204 | 205 | #endif // FRONTEND_WAV_H_ 206 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/README.md: -------------------------------------------------------------------------------- 1 | 2 | In this example, an integration with the microphone and the webRTC VAD has been done. I used [this](https://github.com/mozilla/DeepSpeech-examples/tree/r0.8/mic_vad_streaming) as a draft. 3 | Here a short video to present the results: 4 | 5 | https://user-images.githubusercontent.com/28188499/116685087-182ff100-a9b2-11eb-927d-ed9f621226ee.mp4 6 | 7 | # Requirements: 8 | The libraries used for the following example are: 9 | ``` 10 | Python == 3.6.9 11 | webrtcvad >= 2.0.10 12 | torchaudio >= 0.8.1 13 | torch >= 1.8.1 14 | halo >= 0.0.31 15 | Soundfile >= 0.13.3 16 | ``` 17 | Using pip3: 18 | ``` 19 | pip3 install webrtcvad 20 | pip3 install torchaudio 21 | pip3 install torch 22 | pip3 install halo 23 | pip3 install soundfile 24 | ``` 25 | Moreover, to make the code easier, the default sample_rate is 16KHz without resampling. 26 | 27 | This example has been tested on ``` ubuntu 18.04.3 LTS``` 28 | 29 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/microphone_and_webRTC_integration/microphone_and_webRTC_integration.py: -------------------------------------------------------------------------------- 1 | import collections, queue 2 | import numpy as np 3 | import pyaudio 4 | import webrtcvad 5 | from halo import Halo 6 | import torch 7 | import torchaudio 8 | 9 | class Audio(object): 10 | """Streams raw audio from microphone. 
Data is received in a separate thread, and stored in a buffer, to be read from.""" 11 | 12 | FORMAT = pyaudio.paInt16 13 | # Network/VAD rate-space 14 | RATE_PROCESS = 16000 15 | CHANNELS = 1 16 | BLOCKS_PER_SECOND = 50 17 | 18 | def __init__(self, callback=None, device=None, input_rate=RATE_PROCESS): 19 | def proxy_callback(in_data, frame_count, time_info, status): 20 | #pylint: disable=unused-argument 21 | callback(in_data) 22 | return (None, pyaudio.paContinue) 23 | if callback is None: callback = lambda in_data: self.buffer_queue.put(in_data) 24 | self.buffer_queue = queue.Queue() 25 | self.device = device 26 | self.input_rate = input_rate 27 | self.sample_rate = self.RATE_PROCESS 28 | self.block_size = int(self.RATE_PROCESS / float(self.BLOCKS_PER_SECOND)) 29 | self.block_size_input = int(self.input_rate / float(self.BLOCKS_PER_SECOND)) 30 | self.pa = pyaudio.PyAudio() 31 | 32 | kwargs = { 33 | 'format': self.FORMAT, 34 | 'channels': self.CHANNELS, 35 | 'rate': self.input_rate, 36 | 'input': True, 37 | 'frames_per_buffer': self.block_size_input, 38 | 'stream_callback': proxy_callback, 39 | } 40 | 41 | self.chunk = None 42 | # if not default device 43 | if self.device: 44 | kwargs['input_device_index'] = self.device 45 | 46 | self.stream = self.pa.open(**kwargs) 47 | self.stream.start_stream() 48 | 49 | def read(self): 50 | """Return a block of audio data, blocking if necessary.""" 51 | return self.buffer_queue.get() 52 | 53 | def destroy(self): 54 | self.stream.stop_stream() 55 | self.stream.close() 56 | self.pa.terminate() 57 | 58 | frame_duration_ms = property(lambda self: 1000 * self.block_size // self.sample_rate) 59 | 60 | 61 | class VADAudio(Audio): 62 | """Filter & segment audio with voice activity detection.""" 63 | 64 | def __init__(self, aggressiveness=3, device=None, input_rate=None): 65 | super().__init__(device=device, input_rate=input_rate) 66 | self.vad = webrtcvad.Vad(aggressiveness) 67 | 68 | def frame_generator(self): 69 | """Generator that yields all audio frames from microphone.""" 70 | if self.input_rate == self.RATE_PROCESS: 71 | while True: 72 | yield self.read() 73 | else: 74 | raise Exception("Resampling required") 75 | 76 | def vad_collector(self, padding_ms=300, ratio=0.75, frames=None): 77 | """Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None. 78 | Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered. 79 | Example: (frame, ..., frame, None, frame, ..., frame, None, ...) 
80 | |---utterence---| |---utterence---| 81 | """ 82 | if frames is None: frames = self.frame_generator() 83 | num_padding_frames = padding_ms // self.frame_duration_ms 84 | ring_buffer = collections.deque(maxlen=num_padding_frames) 85 | triggered = False 86 | 87 | for frame in frames: 88 | if len(frame) < 640: 89 | return 90 | 91 | is_speech = self.vad.is_speech(frame, self.sample_rate) 92 | 93 | if not triggered: 94 | ring_buffer.append((frame, is_speech)) 95 | num_voiced = len([f for f, speech in ring_buffer if speech]) 96 | if num_voiced > ratio * ring_buffer.maxlen: 97 | triggered = True 98 | for f, s in ring_buffer: 99 | yield f 100 | ring_buffer.clear() 101 | 102 | else: 103 | yield frame 104 | ring_buffer.append((frame, is_speech)) 105 | num_unvoiced = len([f for f, speech in ring_buffer if not speech]) 106 | if num_unvoiced > ratio * ring_buffer.maxlen: 107 | triggered = False 108 | yield None 109 | ring_buffer.clear() 110 | 111 | def main(ARGS): 112 | # Start audio with VAD 113 | vad_audio = VADAudio(aggressiveness=ARGS.webRTC_aggressiveness, 114 | device=ARGS.device, 115 | input_rate=ARGS.rate) 116 | 117 | print("Listening (ctrl-C to exit)...") 118 | frames = vad_audio.vad_collector() 119 | 120 | # load silero VAD 121 | torchaudio.set_audio_backend("soundfile") 122 | model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', 123 | model=ARGS.silaro_model_name, 124 | force_reload= ARGS.reload) 125 | (get_speech_ts,_,_, _,_, _, _) = utils 126 | 127 | 128 | # Stream from microphone to DeepSpeech using VAD 129 | spinner = None 130 | if not ARGS.nospinner: 131 | spinner = Halo(spinner='line') 132 | wav_data = bytearray() 133 | for frame in frames: 134 | if frame is not None: 135 | if spinner: spinner.start() 136 | 137 | wav_data.extend(frame) 138 | else: 139 | if spinner: spinner.stop() 140 | print("webRTC has detected a possible speech") 141 | 142 | newsound= np.frombuffer(wav_data,np.int16) 143 | audio_float32=Int2Float(newsound) 144 | time_stamps =get_speech_ts(audio_float32, model,num_steps=ARGS.num_steps,trig_sum=ARGS.trig_sum,neg_trig_sum=ARGS.neg_trig_sum, 145 | num_samples_per_window=ARGS.num_samples_per_window,min_speech_samples=ARGS.min_speech_samples, 146 | min_silence_samples=ARGS.min_silence_samples) 147 | 148 | if(len(time_stamps)>0): 149 | print("silero VAD has detected a possible speech") 150 | else: 151 | print("silero VAD has detected a noise") 152 | print() 153 | wav_data = bytearray() 154 | 155 | 156 | def Int2Float(sound): 157 | _sound = np.copy(sound) # 158 | abs_max = np.abs(_sound).max() 159 | _sound = _sound.astype('float32') 160 | if abs_max > 0: 161 | _sound *= 1/abs_max 162 | audio_float32 = torch.from_numpy(_sound.squeeze()) 163 | return audio_float32 164 | 165 | if __name__ == '__main__': 166 | DEFAULT_SAMPLE_RATE = 16000 167 | 168 | import argparse 169 | parser = argparse.ArgumentParser(description="Stream from microphone to webRTC and silero VAD") 170 | 171 | parser.add_argument('-v', '--webRTC_aggressiveness', type=int, default=3, 172 | help="Set aggressiveness of webRTC: an integer between 0 and 3, 0 being the least aggressive about filtering out non-speech, 3 the most aggressive. Default: 3") 173 | parser.add_argument('--nospinner', action='store_true', 174 | help="Disable spinner") 175 | parser.add_argument('-d', '--device', type=int, default=None, 176 | help="Device input index (Int) as listed by pyaudio.PyAudio.get_device_info_by_index(). 
If not provided, falls back to PyAudio.get_default_device().") 177 | 178 | parser.add_argument('-name', '--silaro_model_name', type=str, default="silero_vad", 179 | help="select the name of the model. You can select between 'silero_vad',''silero_vad_micro','silero_vad_micro_8k','silero_vad_mini','silero_vad_mini_8k'") 180 | parser.add_argument('--reload', action='store_true',help="download the last version of the silero vad") 181 | 182 | parser.add_argument('-ts', '--trig_sum', type=float, default=0.25, 183 | help="overlapping windows are used for each audio chunk, trig sum defines average probability among those windows for switching into triggered state (speech state)") 184 | 185 | parser.add_argument('-nts', '--neg_trig_sum', type=float, default=0.07, 186 | help="same as trig_sum, but for switching from triggered to non-triggered state (non-speech)") 187 | 188 | parser.add_argument('-N', '--num_steps', type=int, default=8, 189 | help="nubmer of overlapping windows to split audio chunk into (we recommend 4 or 8)") 190 | 191 | parser.add_argument('-nspw', '--num_samples_per_window', type=int, default=4000, 192 | help="number of samples in each window, our models were trained using 4000 samples (250 ms) per window, so this is preferable value (lesser values reduce quality)") 193 | 194 | parser.add_argument('-msps', '--min_speech_samples', type=int, default=10000, 195 | help="minimum speech chunk duration in samples") 196 | 197 | parser.add_argument('-msis', '--min_silence_samples', type=int, default=500, 198 | help=" minimum silence duration in samples between to separate speech chunks") 199 | ARGS = parser.parse_args() 200 | ARGS.rate=DEFAULT_SAMPLE_RATE 201 | main(ARGS) -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/pyaudio-streaming/README.md: -------------------------------------------------------------------------------- 1 | # Pyaudio Streaming Example 2 | 3 | This example notebook shows how micophone audio fetched by pyaudio can be processed with Silero-VAD. 4 | 5 | It has been designed as a low-level example for binary real-time streaming using only the prediction of the model, processing the binary data and plotting the speech probabilities at the end to visualize it. 6 | 7 | Currently, the notebook consits of two examples: 8 | - One that records audio of a predefined length from the microphone, process it with Silero-VAD, and plots it afterwards. 9 | - The other one plots the speech probabilities in real-time (using jupyterplot) and records the audio until you press enter. 10 | 11 | ## Example Video for the Real-Time Visualization 12 | 13 | 14 | https://user-images.githubusercontent.com/8079748/117580455-4622dd00-b0f8-11eb-858d-e6368ed4eada.mp4 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/examples/pyaudio-streaming/pyaudio-streaming-examples.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "62a0cccb", 6 | "metadata": {}, 7 | "source": [ 8 | "# Pyaudio Microphone Streaming Examples\n", 9 | "\n", 10 | "A simple notebook that uses pyaudio to get the microphone audio and feeds this audio then to Silero VAD.\n", 11 | "\n", 12 | "I created it as an example on how binary data from a stream could be feed into Silero VAD.\n", 13 | "\n", 14 | "\n", 15 | "Has been tested on Ubuntu 21.04 (x86). 
After you installed the dependencies below, no additional setup is required." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "64cbe1eb", 21 | "metadata": {}, 22 | "source": [ 23 | "## Dependencies\n", 24 | "The cell below lists all used dependencies and the used versions. Uncomment to install them from within the notebook." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "57bc2aac", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "#!pip install numpy==1.20.2\n", 35 | "#!pip install torch==1.9.0\n", 36 | "#!pip install matplotlib==3.4.2\n", 37 | "#!pip install torchaudio==0.9.0\n", 38 | "#!pip install soundfile==0.10.3.post1\n", 39 | "#!pip install pyaudio==0.2.11" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "110de761", 45 | "metadata": {}, 46 | "source": [ 47 | "## Imports" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "5a647d8d", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "import io\n", 58 | "import numpy as np\n", 59 | "import torch\n", 60 | "torch.set_num_threads(1)\n", 61 | "import torchaudio\n", 62 | "import matplotlib\n", 63 | "import matplotlib.pylab as plt\n", 64 | "torchaudio.set_audio_backend(\"soundfile\")\n", 65 | "import pyaudio" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "725d7066", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 76 | " model='silero_vad',\n", 77 | " force_reload=True)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "1c0b2ea7", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "(get_speech_timestamps,\n", 88 | " save_audio,\n", 89 | " read_audio,\n", 90 | " VADIterator,\n", 91 | " collect_chunks) = utils" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "f9112603", 97 | "metadata": {}, 98 | "source": [ 99 | "### Helper Methods" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "5abc6330", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# Taken from utils_vad.py\n", 110 | "def validate(model,\n", 111 | " inputs: torch.Tensor):\n", 112 | " with torch.no_grad():\n", 113 | " outs = model(inputs)\n", 114 | " return outs\n", 115 | "\n", 116 | "# Provided by Alexander Veysov\n", 117 | "def int2float(sound):\n", 118 | " abs_max = np.abs(sound).max()\n", 119 | " sound = sound.astype('float32')\n", 120 | " if abs_max > 0:\n", 121 | " sound *= 1/abs_max\n", 122 | " sound = sound.squeeze() # depends on the use case\n", 123 | " return sound" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "5124095e", 129 | "metadata": {}, 130 | "source": [ 131 | "## Pyaudio Set-up" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "a845356e", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "FORMAT = pyaudio.paInt16\n", 142 | "CHANNELS = 1\n", 143 | "SAMPLE_RATE = 16000\n", 144 | "CHUNK = int(SAMPLE_RATE / 10)\n", 145 | "\n", 146 | "audio = pyaudio.PyAudio()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "0b910c99", 152 | "metadata": {}, 153 | "source": [ 154 | "## Simple Example\n", 155 | "The following example reads the audio as 250ms chunks from the microphone, converts them to a Pytorch Tensor, and gets the probabilities/confidences if the model thinks the frame is 
voiced." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "9d3d2c10", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "num_samples = 1536" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "3cb44a4a", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "stream = audio.open(format=FORMAT,\n", 176 | " channels=CHANNELS,\n", 177 | " rate=SAMPLE_RATE,\n", 178 | " input=True,\n", 179 | " frames_per_buffer=CHUNK)\n", 180 | "data = []\n", 181 | "voiced_confidences = []\n", 182 | "\n", 183 | "print(\"Started Recording\")\n", 184 | "for i in range(0, frames_to_record):\n", 185 | " \n", 186 | " audio_chunk = stream.read(num_samples)\n", 187 | " \n", 188 | " # in case you want to save the audio later\n", 189 | " data.append(audio_chunk)\n", 190 | " \n", 191 | " audio_int16 = np.frombuffer(audio_chunk, np.int16);\n", 192 | "\n", 193 | " audio_float32 = int2float(audio_int16)\n", 194 | " \n", 195 | " # get the confidences and add them to the list to plot them later\n", 196 | " new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n", 197 | " voiced_confidences.append(new_confidence)\n", 198 | " \n", 199 | "print(\"Stopped the recording\")\n", 200 | "\n", 201 | "# plot the confidences for the speech\n", 202 | "plt.figure(figsize=(20,6))\n", 203 | "plt.plot(voiced_confidences)\n", 204 | "plt.show()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "a3dda982", 210 | "metadata": {}, 211 | "source": [ 212 | "## Real Time Visualization\n", 213 | "\n", 214 | "As an enhancement to plot the speech probabilities in real time I added the implementation below.\n", 215 | "In contrast to the simeple one, it records the audio until to stop the recording by pressing enter.\n", 216 | "While looking into good ways to update matplotlib plots in real-time, I found a simple libarary that does the job. 
https://github.com/lvwerra/jupyterplot It has some limitations, but works for this use case really well.\n" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "05ef4100", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "#!pip install jupyterplot==0.0.3" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "id": "d1d4cdd6", 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "from jupyterplot import ProgressPlot\n", 237 | "import threading\n", 238 | "\n", 239 | "continue_recording = True\n", 240 | "\n", 241 | "def stop():\n", 242 | " input(\"Press Enter to stop the recording:\")\n", 243 | " global continue_recording\n", 244 | " continue_recording = False\n", 245 | "\n", 246 | "def start_recording():\n", 247 | " \n", 248 | " stream = audio.open(format=FORMAT,\n", 249 | " channels=CHANNELS,\n", 250 | " rate=SAMPLE_RATE,\n", 251 | " input=True,\n", 252 | " frames_per_buffer=CHUNK)\n", 253 | "\n", 254 | " data = []\n", 255 | " voiced_confidences = []\n", 256 | " \n", 257 | " global continue_recording\n", 258 | " continue_recording = True\n", 259 | " \n", 260 | " pp = ProgressPlot(plot_names=[\"Silero VAD\"],line_names=[\"speech probabilities\"], x_label=\"audio chunks\")\n", 261 | " \n", 262 | " stop_listener = threading.Thread(target=stop)\n", 263 | " stop_listener.start()\n", 264 | "\n", 265 | " while continue_recording:\n", 266 | " \n", 267 | " audio_chunk = stream.read(num_samples)\n", 268 | " \n", 269 | " # in case you want to save the audio later\n", 270 | " data.append(audio_chunk)\n", 271 | " \n", 272 | " audio_int16 = np.frombuffer(audio_chunk, np.int16);\n", 273 | "\n", 274 | " audio_float32 = int2float(audio_int16)\n", 275 | " \n", 276 | " # get the confidences and add them to the list to plot them later\n", 277 | " new_confidence = model(torch.from_numpy(audio_float32), 16000).item()\n", 278 | " voiced_confidences.append(new_confidence)\n", 279 | " \n", 280 | " pp.update(new_confidence)\n", 281 | "\n", 282 | "\n", 283 | " pp.finalize()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "1e398009", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "start_recording()" 294 | ] 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 3", 300 | "language": "python", 301 | "name": "python3" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 3 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython3", 313 | "version": "3.7.10" 314 | }, 315 | "toc": { 316 | "base_numbering": 1, 317 | "nav_menu": {}, 318 | "number_sections": true, 319 | "sideBar": true, 320 | "skip_h1_title": false, 321 | "title_cell": "Table of Contents", 322 | "title_sidebar": "Contents", 323 | "toc_cell": false, 324 | "toc_position": {}, 325 | "toc_section_display": true, 326 | "toc_window_display": false 327 | } 328 | }, 329 | "nbformat": 4, 330 | "nbformat_minor": 5 331 | } 332 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/lang_dict_95.json: -------------------------------------------------------------------------------- 1 | {"59": "mg, Malagasy", "76": "tk, Turkmen", "20": "lb, Luxembourgish, Letzeburgesch", "62": "or, Oriya", "30": "en, English", "26": "oc, Occitan", "69": "no, 
Norwegian", "77": "sr, Serbian", "90": "bs, Bosnian", "71": "el, Greek, Modern (1453\u2013)", "15": "az, Azerbaijani", "12": "lo, Lao", "85": "zh-HK, Chinese", "79": "cs, Czech", "43": "sv, Swedish", "37": "mn, Mongolian", "32": "fi, Finnish", "51": "tg, Tajik", "46": "am, Amharic", "17": "nn, Norwegian Nynorsk", "40": "ja, Japanese", "8": "it, Italian", "21": "ha, Hausa", "11": "as, Assamese", "29": "fa, Persian", "82": "bn, Bengali", "54": "mk, Macedonian", "31": "sw, Swahili", "45": "vi, Vietnamese", "41": "ur, Urdu", "74": "bo, Tibetan", "4": "hi, Hindi", "86": "mr, Marathi", "3": "fy-NL, Western Frisian", "65": "sk, Slovak", "2": "ln, Lingala", "92": "gl, Galician", "53": "sn, Shona", "87": "su, Sundanese", "35": "tt, Tatar", "93": "kn, Kannada", "6": "yo, Yoruba", "27": "ps, Pashto, Pushto", "34": "hy, Armenian", "25": "pa-IN, Punjabi, Panjabi", "23": "nl, Dutch, Flemish", "48": "th, Thai", "73": "mt, Maltese", "55": "ar, Arabic", "89": "ba, Bashkir", "78": "bg, Bulgarian", "42": "yi, Yiddish", "5": "ru, Russian", "84": "sv-SE, Swedish", "80": "tr, Turkish", "33": "sq, Albanian", "38": "kk, Kazakh", "50": "pl, Polish", "9": "hr, Croatian", "66": "ky, Kirghiz, Kyrgyz", "49": "hu, Hungarian", "10": "si, Sinhala, Sinhalese", "56": "la, Latin", "75": "de, German", "14": "ko, Korean", "22": "id, Indonesian", "47": "sl, Slovenian", "57": "be, Belarusian", "36": "ta, Tamil", "7": "da, Danish", "91": "sd, Sindhi", "28": "et, Estonian", "63": "pt, Portuguese", "60": "ne, Nepali", "94": "zh-TW, Chinese", "18": "zh-CN, Chinese", "88": "rw, Kinyarwanda", "19": "es, Spanish, Castilian", "39": "ht, Haitian, Haitian Creole", "64": "tl, Tagalog", "83": "ms, Malay", "70": "ro, Romanian, Moldavian, Moldovan", "68": "pa, Punjabi, Panjabi", "52": "uz, Uzbek", "58": "km, Central Khmer", "67": "my, Burmese", "0": "fr, French", "24": "af, Afrikaans", "16": "gu, Gujarati", "81": "so, Somali", "13": "uk, Ukrainian", "44": "ca, Catalan, Valencian", "72": "ml, Malayalam", "61": "te, Telugu", "1": "zh, Chinese"} -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/lang_group_dict_95.json: -------------------------------------------------------------------------------- 1 | {"0": ["Afrikaans", "Dutch, Flemish", "Western Frisian"], "1": ["Turkish", "Azerbaijani"], "2": ["Russian", "Slovak", "Ukrainian", "Czech", "Polish", "Belarusian"], "3": ["Bulgarian", "Macedonian", "Serbian", "Croatian", "Bosnian", "Slovenian"], "4": ["Norwegian Nynorsk", "Swedish", "Danish", "Norwegian"], "5": ["English"], "6": ["Finnish", "Estonian"], "7": ["Yiddish", "Luxembourgish, Letzeburgesch", "German"], "8": ["Spanish", "Occitan", "Portuguese", "Catalan, Valencian", "Galician", "Spanish, Castilian", "Italian"], "9": ["Maltese", "Arabic"], "10": ["Marathi"], "11": ["Hindi", "Urdu"], "12": ["Lao", "Thai"], "13": ["Malay", "Indonesian"], "14": ["Romanian, Moldavian, Moldovan"], "15": ["Tagalog"], "16": ["Tajik", "Persian"], "17": ["Kazakh", "Uzbek", "Kirghiz, Kyrgyz"], "18": ["Kinyarwanda"], "19": ["Tatar", "Bashkir"], "20": ["French"], "21": ["Chinese"], "22": ["Lingala"], "23": ["Yoruba"], "24": ["Sinhala, Sinhalese"], "25": ["Assamese"], "26": ["Korean"], "27": ["Gujarati"], "28": ["Hausa"], "29": ["Punjabi, Panjabi"], "30": ["Pashto, Pushto"], "31": ["Swahili"], "32": ["Albanian"], "33": ["Armenian"], "34": ["Mongolian"], "35": ["Tamil"], "36": ["Haitian, Haitian Creole"], "37": ["Japanese"], "38": ["Vietnamese"], "39": ["Amharic"], "40": ["Hungarian"], "41": ["Shona"], 
"42": ["Latin"], "43": ["Central Khmer"], "44": ["Malagasy"], "45": ["Nepali"], "46": ["Telugu"], "47": ["Oriya"], "48": ["Burmese"], "49": ["Greek, Modern (1453\u2013)"], "50": ["Malayalam"], "51": ["Tibetan"], "52": ["Turkmen"], "53": ["Somali"], "54": ["Bengali"], "55": ["Sundanese"], "56": ["Sindhi"], "57": ["Kannada"]} -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/silero_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/snakers4_silero-vad_master/files/silero_logo.jpg -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/silero_vad.jit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/snakers4_silero-vad_master/files/silero_vad.jit -------------------------------------------------------------------------------- /snakers4_silero-vad_master/files/silero_vad.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/snakers4_silero-vad_master/files/silero_vad.onnx -------------------------------------------------------------------------------- /snakers4_silero-vad_master/hubconf.py: -------------------------------------------------------------------------------- 1 | dependencies = ['torch', 'torchaudio'] 2 | import torch 3 | import json 4 | from utils_vad import (init_jit_model, 5 | get_speech_timestamps, 6 | get_number_ts, 7 | get_language, 8 | get_language_and_group, 9 | save_audio, 10 | read_audio, 11 | VADIterator, 12 | collect_chunks, 13 | drop_chunks, 14 | Validator, 15 | OnnxWrapper) 16 | 17 | 18 | def versiontuple(v): 19 | return tuple(map(int, (v.split('+')[0].split(".")))) 20 | 21 | 22 | def silero_vad(onnx=False, force_onnx_cpu=False, silero_vad_source='github'): 23 | """Silero Voice Activity Detector 24 | Returns a model with a set of utils 25 | Please see https://github.com/snakers4/silero-vad for usage examples 26 | """ 27 | 28 | if not onnx: 29 | installed_version = torch.__version__ 30 | supported_version = '1.12.0' 31 | if versiontuple(installed_version) < versiontuple(supported_version): 32 | raise Exception(f'Please install torch {supported_version} or greater ({installed_version} installed)') 33 | import os 34 | import sys 35 | hub_dir = torch.hub.get_dir() if silero_vad_source == 'github' else os.path.dirname(sys.executable) 36 | if onnx: 37 | model = OnnxWrapper(f'{hub_dir}/snakers4_silero-vad_master/files/silero_vad.onnx', force_onnx_cpu) 38 | else: 39 | model = init_jit_model(model_path=f'{hub_dir}/snakers4_silero-vad_master/files/silero_vad.jit') 40 | utils = (get_speech_timestamps, 41 | save_audio, 42 | read_audio, 43 | VADIterator, 44 | collect_chunks) 45 | 46 | return model, utils 47 | 48 | 49 | def silero_number_detector(onnx=False, force_onnx_cpu=False): 50 | """Silero Number Detector 51 | Returns a model with a set of utils 52 | Please see https://github.com/snakers4/silero-vad for usage examples 53 | """ 54 | if onnx: 55 | url = 'https://models.silero.ai/vad_models/number_detector.onnx' 56 | else: 57 | url = 'https://models.silero.ai/vad_models/number_detector.jit' 58 | model = Validator(url, force_onnx_cpu) 59 | utils = (get_number_ts, 60 | save_audio, 61 | 
read_audio, 62 | collect_chunks, 63 | drop_chunks) 64 | 65 | return model, utils 66 | 67 | 68 | def silero_lang_detector(onnx=False, force_onnx_cpu=False): 69 | """Silero Language Classifier 70 | Returns a model with a set of utils 71 | Please see https://github.com/snakers4/silero-vad for usage examples 72 | """ 73 | if onnx: 74 | url = 'https://models.silero.ai/vad_models/number_detector.onnx' 75 | else: 76 | url = 'https://models.silero.ai/vad_models/number_detector.jit' 77 | model = Validator(url, force_onnx_cpu) 78 | utils = (get_language, 79 | read_audio) 80 | 81 | return model, utils 82 | 83 | 84 | def silero_lang_detector_95(onnx=False, force_onnx_cpu=False): 85 | """Silero Language Classifier (95 languages) 86 | Returns a model with a set of utils 87 | Please see https://github.com/snakers4/silero-vad for usage examples 88 | """ 89 | 90 | hub_dir = torch.hub.get_dir() 91 | if onnx: 92 | url = 'https://models.silero.ai/vad_models/lang_classifier_95.onnx' 93 | else: 94 | url = 'https://models.silero.ai/vad_models/lang_classifier_95.jit' 95 | model = Validator(url, force_onnx_cpu) 96 | 97 | with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_dict_95.json', 'r') as f: 98 | lang_dict = json.load(f) 99 | 100 | with open(f'{hub_dir}/snakers4_silero-vad_master/files/lang_group_dict_95.json', 'r') as f: 101 | lang_group_dict = json.load(f) 102 | 103 | utils = (get_language_and_group, read_audio) 104 | 105 | return model, lang_dict, lang_group_dict, utils 106 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/silero-vad.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "FpMplOCA2Fwp" 7 | }, 8 | "source": [ 9 | "#VAD" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "heading_collapsed": true, 16 | "id": "62A6F_072Fwq" 17 | }, 18 | "source": [ 19 | "## Install Dependencies" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "hidden": true, 27 | "id": "5w5AkskZ2Fwr" 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "#@title Install and Import Dependencies\n", 32 | "\n", 33 | "# this assumes that you have a relevant version of PyTorch installed\n", 34 | "!pip install -q torchaudio\n", 35 | "\n", 36 | "SAMPLING_RATE = 16000\n", 37 | "\n", 38 | "import torch\n", 39 | "torch.set_num_threads(1)\n", 40 | "\n", 41 | "from IPython.display import Audio\n", 42 | "from pprint import pprint\n", 43 | "# download example\n", 44 | "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "id": "pSifus5IilRp" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "USE_ONNX = False # change this to True if you want to test onnx model\n", 56 | "if USE_ONNX:\n", 57 | " !pip install -q onnxruntime\n", 58 | " \n", 59 | "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 60 | " model='silero_vad',\n", 61 | " force_reload=True,\n", 62 | " onnx=USE_ONNX)\n", 63 | "\n", 64 | "(get_speech_timestamps,\n", 65 | " save_audio,\n", 66 | " read_audio,\n", 67 | " VADIterator,\n", 68 | " collect_chunks) = utils" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "fXbbaUO3jsrw" 75 | }, 76 | "source": [ 77 | "## Full Audio" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | 
"metadata": { 83 | "id": "RAfJPb_a-Auj" 84 | }, 85 | "source": [ 86 | "**Speech timestapms from full audio**" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "id": "aI_eydBPjsrx" 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n", 98 | "# get speech timestamps from full audio file\n", 99 | "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)\n", 100 | "pprint(speech_timestamps)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "id": "OuEobLchjsry" 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "# merge all speech chunks to one audio\n", 112 | "save_audio('only_speech.wav',\n", 113 | " collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE) \n", 114 | "Audio('only_speech.wav')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "id": "iDKQbVr8jsry" 121 | }, 122 | "source": [ 123 | "## Stream imitation example" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "id": "q-lql_2Wjsry" 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "## using VADIterator class\n", 135 | "\n", 136 | "vad_iterator = VADIterator(model)\n", 137 | "wav = read_audio(f'en_example.wav', sampling_rate=SAMPLING_RATE)\n", 138 | "\n", 139 | "window_size_samples = 1536 # number of samples in a single audio chunk\n", 140 | "for i in range(0, len(wav), window_size_samples):\n", 141 | " chunk = wav[i: i+ window_size_samples]\n", 142 | " if len(chunk) < window_size_samples:\n", 143 | " break\n", 144 | " speech_dict = vad_iterator(chunk, return_seconds=True)\n", 145 | " if speech_dict:\n", 146 | " print(speech_dict, end=' ')\n", 147 | "vad_iterator.reset_states() # reset model states after each audio" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "id": "BX3UgwwB2Fwv" 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "## just probabilities\n", 159 | "\n", 160 | "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n", 161 | "speech_probs = []\n", 162 | "window_size_samples = 1536\n", 163 | "for i in range(0, len(wav), window_size_samples):\n", 164 | " chunk = wav[i: i+ window_size_samples]\n", 165 | " if len(chunk) < window_size_samples:\n", 166 | " break\n", 167 | " speech_prob = model(chunk, SAMPLING_RATE).item()\n", 168 | " speech_probs.append(speech_prob)\n", 169 | "vad_iterator.reset_states() # reset model states after each audio\n", 170 | "\n", 171 | "print(speech_probs[:10]) # first 10 chunks predicts" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "heading_collapsed": true, 178 | "id": "36jY0niD2Fww" 179 | }, 180 | "source": [ 181 | "# Number detector" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "heading_collapsed": true, 188 | "hidden": true, 189 | "id": "scd1DlS42Fwx" 190 | }, 191 | "source": [ 192 | "## Install Dependencies" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "hidden": true, 200 | "id": "Kq5gQuYq2Fwx" 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "#@title Install and Import Dependencies\n", 205 | "\n", 206 | "# this assumes that you have a relevant version of PyTorch installed\n", 207 | "!pip install -q torchaudio\n", 208 | "\n", 209 | "SAMPLING_RATE = 16000\n", 210 | 
"\n", 211 | "import torch\n", 212 | "torch.set_num_threads(1)\n", 213 | "\n", 214 | "from IPython.display import Audio\n", 215 | "from pprint import pprint\n", 216 | "# download example\n", 217 | "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en_num.wav', 'en_number_example.wav')" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "id": "dPwCFHmFycUF" 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "USE_ONNX = False # change this to True if you want to test onnx model\n", 229 | "if USE_ONNX:\n", 230 | " !pip install -q onnxruntime\n", 231 | " \n", 232 | "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 233 | " model='silero_number_detector',\n", 234 | " force_reload=True,\n", 235 | " onnx=USE_ONNX)\n", 236 | "\n", 237 | "(get_number_ts,\n", 238 | " save_audio,\n", 239 | " read_audio,\n", 240 | " collect_chunks,\n", 241 | " drop_chunks) = utils\n" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": { 247 | "heading_collapsed": true, 248 | "hidden": true, 249 | "id": "qhPa30ij2Fwy" 250 | }, 251 | "source": [ 252 | "## Full audio" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "hidden": true, 260 | "id": "EXpau6xq2Fwy" 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "wav = read_audio('en_number_example.wav', sampling_rate=SAMPLING_RATE)\n", 265 | "# get number timestamps from full audio file\n", 266 | "number_timestamps = get_number_ts(wav, model)\n", 267 | "pprint(number_timestamps)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "hidden": true, 275 | "id": "u-KfXRhZ2Fwy" 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "# convert ms in timestamps to samples\n", 280 | "for timestamp in number_timestamps:\n", 281 | " timestamp['start'] = int(timestamp['start'] * SAMPLING_RATE / 1000)\n", 282 | " timestamp['end'] = int(timestamp['end'] * SAMPLING_RATE / 1000)" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "hidden": true, 290 | "id": "iwYEC4aZ2Fwy" 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "# merge all number chunks to one audio\n", 295 | "save_audio('only_numbers.wav',\n", 296 | " collect_chunks(number_timestamps, wav), SAMPLING_RATE) \n", 297 | "Audio('only_numbers.wav')" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "hidden": true, 305 | "id": "fHaYejX12Fwy" 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "# drop all number chunks from audio\n", 310 | "save_audio('no_numbers.wav',\n", 311 | " drop_chunks(number_timestamps, wav), SAMPLING_RATE) \n", 312 | "Audio('no_numbers.wav')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": { 318 | "heading_collapsed": true, 319 | "id": "PnKtJKbq2Fwz" 320 | }, 321 | "source": [ 322 | "# Language detector" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "heading_collapsed": true, 329 | "hidden": true, 330 | "id": "F5cAmMbP2Fwz" 331 | }, 332 | "source": [ 333 | "## Install Dependencies" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "hidden": true, 341 | "id": "Zu9D0t6n2Fwz" 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "#@title Install and Import Dependencies\n", 346 | "\n", 347 | "# this assumes that you have a relevant version 
of PyTorch installed\n", 348 | "!pip install -q torchaudio\n", 349 | "\n", 350 | "SAMPLING_RATE = 16000\n", 351 | "\n", 352 | "import torch\n", 353 | "torch.set_num_threads(1)\n", 354 | "\n", 355 | "from IPython.display import Audio\n", 356 | "from pprint import pprint\n", 357 | "# download example\n", 358 | "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "id": "JfRKDZiRztFe" 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "USE_ONNX = False # change this to True if you want to test onnx model\n", 370 | "if USE_ONNX:\n", 371 | " !pip install -q onnxruntime\n", 372 | " \n", 373 | "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", 374 | " model='silero_lang_detector',\n", 375 | " force_reload=True,\n", 376 | " onnx=USE_ONNX)\n", 377 | "\n", 378 | "get_language, read_audio = utils" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": { 384 | "heading_collapsed": true, 385 | "hidden": true, 386 | "id": "iC696eMX2Fwz" 387 | }, 388 | "source": [ 389 | "## Full audio" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": { 396 | "hidden": true, 397 | "id": "c8UYnYBF2Fw0" 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n", 402 | "lang = get_language(wav, model)\n", 403 | "print(lang)" 404 | ] 405 | } 406 | ], 407 | "metadata": { 408 | "colab": { 409 | "name": "silero-vad.ipynb", 410 | "provenance": [] 411 | }, 412 | "kernelspec": { 413 | "display_name": "Python 3", 414 | "language": "python", 415 | "name": "python3" 416 | }, 417 | "language_info": { 418 | "codemirror_mode": { 419 | "name": "ipython", 420 | "version": 3 421 | }, 422 | "file_extension": ".py", 423 | "mimetype": "text/x-python", 424 | "name": "python", 425 | "nbconvert_exporter": "python", 426 | "pygments_lexer": "ipython3", 427 | "version": "3.8.8" 428 | }, 429 | "toc": { 430 | "base_numbering": 1, 431 | "nav_menu": {}, 432 | "number_sections": true, 433 | "sideBar": true, 434 | "skip_h1_title": false, 435 | "title_cell": "Table of Contents", 436 | "title_sidebar": "Contents", 437 | "toc_cell": false, 438 | "toc_position": {}, 439 | "toc_section_display": true, 440 | "toc_window_display": false 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 0 445 | } 446 | -------------------------------------------------------------------------------- /snakers4_silero-vad_master/utils_vad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchaudio 3 | from typing import List 4 | import torch.nn.functional as F 5 | import warnings 6 | 7 | languages = ['ru', 'en', 'de', 'es'] 8 | 9 | 10 | class OnnxWrapper(): 11 | 12 | def __init__(self, path, force_onnx_cpu=False): 13 | import numpy as np 14 | global np 15 | import onnxruntime 16 | if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers(): 17 | self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider']) 18 | else: 19 | self.session = onnxruntime.InferenceSession(path) 20 | self.session.intra_op_num_threads = 1 21 | self.session.inter_op_num_threads = 1 22 | 23 | self.reset_states() 24 | self.sample_rates = [8000, 16000] 25 | 26 | def _validate_input(self, x, sr: int): 27 | if x.dim() == 1: 28 | x = x.unsqueeze(0) 29 | if x.dim() > 2: 30 | raise 
ValueError(f"Too many dimensions for input audio chunk {x.dim()}") 31 | 32 | if sr != 16000 and (sr % 16000 == 0): 33 | step = sr // 16000 34 | x = x[::step] 35 | sr = 16000 36 | 37 | if sr not in self.sample_rates: 38 | raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)") 39 | 40 | if sr / x.shape[1] > 31.25: 41 | raise ValueError("Input audio chunk is too short") 42 | 43 | return x, sr 44 | 45 | def reset_states(self, batch_size=1): 46 | self._h = np.zeros((2, batch_size, 64)).astype('float32') 47 | self._c = np.zeros((2, batch_size, 64)).astype('float32') 48 | self._last_sr = 0 49 | self._last_batch_size = 0 50 | 51 | def __call__(self, x, sr: int): 52 | 53 | x, sr = self._validate_input(x, sr) 54 | batch_size = x.shape[0] 55 | 56 | if not self._last_batch_size: 57 | self.reset_states(batch_size) 58 | if (self._last_sr) and (self._last_sr != sr): 59 | self.reset_states(batch_size) 60 | if (self._last_batch_size) and (self._last_batch_size != batch_size): 61 | self.reset_states(batch_size) 62 | 63 | if sr in [8000, 16000]: 64 | ort_inputs = {'input': x.numpy(), 'h': self._h, 'c': self._c, 'sr': np.array(sr, dtype='int64')} 65 | ort_outs = self.session.run(None, ort_inputs) 66 | out, self._h, self._c = ort_outs 67 | else: 68 | raise ValueError() 69 | 70 | self._last_sr = sr 71 | self._last_batch_size = batch_size 72 | 73 | out = torch.tensor(out) 74 | return out 75 | 76 | def audio_forward(self, x, sr: int, num_samples: int = 512): 77 | outs = [] 78 | x, sr = self._validate_input(x, sr) 79 | 80 | if x.shape[1] % num_samples: 81 | pad_num = num_samples - (x.shape[1] % num_samples) 82 | x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0) 83 | 84 | self.reset_states(x.shape[0]) 85 | for i in range(0, x.shape[1], num_samples): 86 | wavs_batch = x[:, i:i+num_samples] 87 | out_chunk = self.__call__(wavs_batch, sr) 88 | outs.append(out_chunk) 89 | 90 | stacked = torch.cat(outs, dim=1) 91 | return stacked.cpu() 92 | 93 | 94 | class Validator(): 95 | def __init__(self, url, force_onnx_cpu): 96 | self.onnx = True if url.endswith('.onnx') else False 97 | torch.hub.download_url_to_file(url, 'inf.model') 98 | if self.onnx: 99 | import onnxruntime 100 | if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers(): 101 | self.model = onnxruntime.InferenceSession('inf.model', providers=['CPUExecutionProvider']) 102 | else: 103 | self.model = onnxruntime.InferenceSession('inf.model') 104 | else: 105 | self.model = init_jit_model(model_path='inf.model') 106 | 107 | def __call__(self, inputs: torch.Tensor): 108 | with torch.no_grad(): 109 | if self.onnx: 110 | ort_inputs = {'input': inputs.cpu().numpy()} 111 | outs = self.model.run(None, ort_inputs) 112 | outs = [torch.Tensor(x) for x in outs] 113 | else: 114 | outs = self.model(inputs) 115 | 116 | return outs 117 | 118 | 119 | def read_audio(path: str, 120 | sampling_rate: int = 16000): 121 | 122 | wav, sr = torchaudio.load(path) 123 | 124 | if wav.size(0) > 1: 125 | wav = wav.mean(dim=0, keepdim=True) 126 | 127 | if sr != sampling_rate: 128 | transform = torchaudio.transforms.Resample(orig_freq=sr, 129 | new_freq=sampling_rate) 130 | wav = transform(wav) 131 | sr = sampling_rate 132 | 133 | assert sr == sampling_rate 134 | return wav.squeeze(0) 135 | 136 | 137 | def save_audio(path: str, 138 | tensor: torch.Tensor, 139 | sampling_rate: int = 16000): 140 | torchaudio.save(path, tensor.unsqueeze(0), sampling_rate, bits_per_sample=16) 141 | 142 | 143 | def 
init_jit_model(model_path: str, 144 | device=torch.device('cpu')): 145 | torch.set_grad_enabled(False) 146 | model = torch.jit.load(model_path, map_location=device) 147 | model.eval() 148 | return model 149 | 150 | 151 | def make_visualization(probs, step): 152 | import pandas as pd 153 | pd.DataFrame({'probs': probs}, 154 | index=[x * step for x in range(len(probs))]).plot(figsize=(16, 8), 155 | kind='area', ylim=[0, 1.05], xlim=[0, len(probs) * step], 156 | xlabel='seconds', 157 | ylabel='speech probability', 158 | colormap='tab20') 159 | 160 | 161 | def get_speech_timestamps(audio: torch.Tensor, 162 | model, 163 | threshold: float = 0.5, 164 | sampling_rate: int = 16000, 165 | min_speech_duration_ms: int = 250, 166 | max_speech_duration_s: float = float('inf'), 167 | min_silence_duration_ms: int = 100, 168 | window_size_samples: int = 512, 169 | speech_pad_ms: int = 30, 170 | return_seconds: bool = False, 171 | visualize_probs: bool = False): 172 | 173 | """ 174 | This method is used for splitting long audios into speech chunks using silero VAD 175 | 176 | Parameters 177 | ---------- 178 | audio: torch.Tensor, one dimensional 179 | One dimensional float torch.Tensor, other types are cast to torch.Tensor if possible 180 | 181 | model: preloaded .jit silero VAD model 182 | 183 | threshold: float (default - 0.5) 184 | Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. 185 | It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. 186 | 187 | sampling_rate: int (default - 16000) 188 | Currently silero VAD models support 8000 and 16000 sample rates 189 | 190 | min_speech_duration_ms: int (default - 250 milliseconds) 191 | Final speech chunks shorter than min_speech_duration_ms are thrown out 192 | 193 | max_speech_duration_s: float (default - inf) 194 | Maximum duration of speech chunks in seconds 195 | Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 98 ms (if any), to prevent aggressive cutting. 196 | Otherwise, they will be split aggressively just before max_speech_duration_s. 197 | 198 | min_silence_duration_ms: int (default - 100 milliseconds) 199 | At the end of each speech chunk, wait for min_silence_duration_ms before separating it 200 | 201 | window_size_samples: int (default - 512 samples) 202 | Audio chunks of window_size_samples size are fed to the silero VAD model. 203 | WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples for 8000 sample rate. 204 | Values other than these may affect model performance! 205 | 206 | speech_pad_ms: int (default - 30 milliseconds) 207 | Final speech chunks are padded by speech_pad_ms on each side 208 | 209 | return_seconds: bool (default - False) 210 | whether to return timestamps in seconds (default - samples) 211 | 212 | visualize_probs: bool (default - False) 213 | whether to plot the speech probabilities or not 214 | 215 | Returns 216 | ---------- 217 | speeches: list of dicts 218 | list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds) 219 | """ 220 | 221 | if not torch.is_tensor(audio): 222 | try: 223 | audio = torch.Tensor(audio) 224 | except: 225 | raise TypeError("Audio cannot be cast to tensor. 
Cast it manually") 226 | 227 | if len(audio.shape) > 1: 228 | for i in range(len(audio.shape)): # trying to squeeze empty dimensions 229 | audio = audio.squeeze(0) 230 | if len(audio.shape) > 1: 231 | raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?") 232 | 233 | if sampling_rate > 16000 and (sampling_rate % 16000 == 0): 234 | step = sampling_rate // 16000 235 | sampling_rate = 16000 236 | audio = audio[::step] 237 | warnings.warn('Sampling rate is a multiply of 16000, casting to 16000 manually!') 238 | else: 239 | step = 1 240 | 241 | if sampling_rate == 8000 and window_size_samples > 768: 242 | warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 768 for 8000 sample rate!') 243 | if window_size_samples not in [256, 512, 768, 1024, 1536]: 244 | warnings.warn('Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate') 245 | 246 | model.reset_states() 247 | min_speech_samples = sampling_rate * min_speech_duration_ms / 1000 248 | speech_pad_samples = sampling_rate * speech_pad_ms / 1000 249 | max_speech_samples = sampling_rate * max_speech_duration_s - window_size_samples - 2 * speech_pad_samples 250 | min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 251 | min_silence_samples_at_max_speech = sampling_rate * 98 / 1000 252 | 253 | audio_length_samples = len(audio) 254 | 255 | speech_probs = [] 256 | for current_start_sample in range(0, audio_length_samples, window_size_samples): 257 | chunk = audio[current_start_sample: current_start_sample + window_size_samples] 258 | if len(chunk) < window_size_samples: 259 | chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk)))) 260 | speech_prob = model(chunk, sampling_rate).item() 261 | speech_probs.append(speech_prob) 262 | 263 | triggered = False 264 | speeches = [] 265 | current_speech = {} 266 | neg_threshold = threshold - 0.15 267 | temp_end = 0 # to save potential segment end (and tolerate some silence) 268 | prev_end = next_start = 0 # to save potential segment limits in case of maximum segment size reached 269 | 270 | for i, speech_prob in enumerate(speech_probs): 271 | if (speech_prob >= threshold) and temp_end: 272 | temp_end = 0 273 | if next_start < prev_end: 274 | next_start = window_size_samples * i 275 | 276 | if (speech_prob >= threshold) and not triggered: 277 | triggered = True 278 | current_speech['start'] = window_size_samples * i 279 | continue 280 | 281 | if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples: 282 | if prev_end: 283 | current_speech['end'] = prev_end 284 | speeches.append(current_speech) 285 | current_speech = {} 286 | if next_start < prev_end: # previously reached silence (< neg_thres) and is still not speech (< thres) 287 | triggered = False 288 | else: 289 | current_speech['start'] = next_start 290 | prev_end = next_start = temp_end = 0 291 | else: 292 | current_speech['end'] = window_size_samples * i 293 | speeches.append(current_speech) 294 | current_speech = {} 295 | prev_end = next_start = temp_end = 0 296 | triggered = False 297 | continue 298 | 299 | 300 | if (speech_prob < neg_threshold) and triggered: 301 | if not temp_end: 302 | temp_end = window_size_samples * i 303 | if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech : # condition to avoid cutting in very short silence 304 | prev_end = 
temp_end 305 | if (window_size_samples * i) - temp_end < min_silence_samples: 306 | continue 307 | else: 308 | current_speech['end'] = temp_end 309 | if (current_speech['end'] - current_speech['start']) > min_speech_samples: 310 | speeches.append(current_speech) 311 | current_speech = {} 312 | prev_end = next_start = temp_end = 0 313 | triggered = False 314 | continue 315 | 316 | if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples: 317 | current_speech['end'] = audio_length_samples 318 | speeches.append(current_speech) 319 | 320 | for i, speech in enumerate(speeches): 321 | if i == 0: 322 | speech['start'] = int(max(0, speech['start'] - speech_pad_samples)) 323 | if i != len(speeches) - 1: 324 | silence_duration = speeches[i+1]['start'] - speech['end'] 325 | if silence_duration < 2 * speech_pad_samples: 326 | speech['end'] += int(silence_duration // 2) 327 | speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - silence_duration // 2)) 328 | else: 329 | speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples)) 330 | speeches[i+1]['start'] = int(max(0, speeches[i+1]['start'] - speech_pad_samples)) 331 | else: 332 | speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples)) 333 | 334 | if return_seconds: 335 | for speech_dict in speeches: 336 | speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1) 337 | speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1) 338 | elif step > 1: 339 | for speech_dict in speeches: 340 | speech_dict['start'] *= step 341 | speech_dict['end'] *= step 342 | 343 | if visualize_probs: 344 | make_visualization(speech_probs, window_size_samples / sampling_rate) 345 | 346 | return speeches 347 | 348 | 349 | def get_number_ts(wav: torch.Tensor, 350 | model, 351 | model_stride=8, 352 | hop_length=160, 353 | sample_rate=16000): 354 | wav = torch.unsqueeze(wav, dim=0) 355 | perframe_logits = model(wav)[0] 356 | perframe_preds = torch.argmax(torch.softmax(perframe_logits, dim=1), dim=1).squeeze() # (1, num_frames_strided) 357 | extended_preds = [] 358 | for i in perframe_preds: 359 | extended_preds.extend([i.item()] * model_stride) 360 | # len(extended_preds) is *num_frames_real*; for each frame of audio we know if it has a number in it. 
361 | triggered = False 362 | timings = [] 363 | cur_timing = {} 364 | for i, pred in enumerate(extended_preds): 365 | if pred == 1: 366 | if not triggered: 367 | cur_timing['start'] = int((i * hop_length) / (sample_rate / 1000)) 368 | triggered = True 369 | elif pred == 0: 370 | if triggered: 371 | cur_timing['end'] = int((i * hop_length) / (sample_rate / 1000)) 372 | timings.append(cur_timing) 373 | cur_timing = {} 374 | triggered = False 375 | if cur_timing: 376 | cur_timing['end'] = int(len(wav) / (sample_rate / 1000)) 377 | timings.append(cur_timing) 378 | return timings 379 | 380 | 381 | def get_language(wav: torch.Tensor, 382 | model): 383 | wav = torch.unsqueeze(wav, dim=0) 384 | lang_logits = model(wav)[2] 385 | lang_pred = torch.argmax(torch.softmax(lang_logits, dim=1), dim=1).item() # from 0 to len(languages) - 1 386 | assert lang_pred < len(languages) 387 | return languages[lang_pred] 388 | 389 | 390 | def get_language_and_group(wav: torch.Tensor, 391 | model, 392 | lang_dict: dict, 393 | lang_group_dict: dict, 394 | top_n=1): 395 | wav = torch.unsqueeze(wav, dim=0) 396 | lang_logits, lang_group_logits = model(wav) 397 | 398 | softm = torch.softmax(lang_logits, dim=1).squeeze() 399 | softm_group = torch.softmax(lang_group_logits, dim=1).squeeze() 400 | 401 | srtd = torch.argsort(softm, descending=True) 402 | srtd_group = torch.argsort(softm_group, descending=True) 403 | 404 | outs = [] 405 | outs_group = [] 406 | for i in range(top_n): 407 | prob = round(softm[srtd[i]].item(), 2) 408 | prob_group = round(softm_group[srtd_group[i]].item(), 2) 409 | outs.append((lang_dict[str(srtd[i].item())], prob)) 410 | outs_group.append((lang_group_dict[str(srtd_group[i].item())], prob_group)) 411 | 412 | return outs, outs_group 413 | 414 | 415 | class VADIterator: 416 | def __init__(self, 417 | model, 418 | threshold: float = 0.5, 419 | sampling_rate: int = 16000, 420 | min_silence_duration_ms: int = 100, 421 | speech_pad_ms: int = 30 422 | ): 423 | 424 | """ 425 | Class for stream imitation 426 | 427 | Parameters 428 | ---------- 429 | model: preloaded .jit silero VAD model 430 | 431 | threshold: float (default - 0.5) 432 | Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. 433 | It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. 
434 | 435 | sampling_rate: int (default - 16000) 436 | Currently silero VAD models support 8000 and 16000 sample rates 437 | 438 | min_silence_duration_ms: int (default - 100 milliseconds) 439 | At the end of each speech chunk, wait for min_silence_duration_ms before separating it 440 | 441 | speech_pad_ms: int (default - 30 milliseconds) 442 | Final speech chunks are padded by speech_pad_ms on each side 443 | """ 444 | 445 | self.model = model 446 | self.threshold = threshold 447 | self.sampling_rate = sampling_rate 448 | 449 | if sampling_rate not in [8000, 16000]: 450 | raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]') 451 | 452 | self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 453 | self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000 454 | self.reset_states() 455 | 456 | def reset_states(self): 457 | 458 | self.model.reset_states() 459 | self.triggered = False 460 | self.temp_end = 0 461 | self.current_sample = 0 462 | 463 | def __call__(self, x, return_seconds=False): 464 | """ 465 | x: torch.Tensor 466 | audio chunk (see examples in repo) 467 | 468 | return_seconds: bool (default - False) 469 | whether to return timestamps in seconds (default - samples) 470 | """ 471 | 472 | if not torch.is_tensor(x): 473 | try: 474 | x = torch.Tensor(x) 475 | except: 476 | raise TypeError("Audio cannot be cast to tensor. Cast it manually") 477 | 478 | window_size_samples = len(x[0]) if x.dim() == 2 else len(x) 479 | self.current_sample += window_size_samples 480 | 481 | speech_prob = self.model(x, self.sampling_rate).item() 482 | 483 | if (speech_prob >= self.threshold) and self.temp_end: 484 | self.temp_end = 0 485 | 486 | if (speech_prob >= self.threshold) and not self.triggered: 487 | self.triggered = True 488 | speech_start = self.current_sample - self.speech_pad_samples 489 | return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)} 490 | 491 | if (speech_prob < self.threshold - 0.15) and self.triggered: 492 | if not self.temp_end: 493 | self.temp_end = self.current_sample 494 | if self.current_sample - self.temp_end < self.min_silence_samples: 495 | return None 496 | else: 497 | speech_end = self.temp_end + self.speech_pad_samples 498 | self.temp_end = 0 499 | self.triggered = False 500 | return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)} 501 | 502 | return None 503 | 504 | 505 | def collect_chunks(tss: List[dict], 506 | wav: torch.Tensor): 507 | chunks = [] 508 | for i in tss: 509 | chunks.append(wav[i['start']: i['end']]) 510 | return torch.cat(chunks) 511 | 512 | 513 | def drop_chunks(tss: List[dict], 514 | wav: torch.Tensor): 515 | chunks = [] 516 | cur_start = 0 517 | for i in tss: 518 | chunks.append((wav[cur_start: i['start']])) 519 | cur_start = i['end'] 520 | return torch.cat(chunks) 521 | -------------------------------------------------------------------------------- /test/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | # Create a logger 5 | logger = logging.getLogger() 6 | # Set the logger level; if not set, only warning and above are collected by default 7 | logger.setLevel("DEBUG") 8 | # Set the log format 9 | fmt = logging.Formatter("%(filename)s-%(lineno)d-%(asctime)s-%(levelname)s-%(message)s") 10 | # Set up a file handler and its encoding 11 | if not os.path.exists("./log"): 12 | os.makedirs("./log") 13 | file_handler = logging.FileHandler("./log/log.txt", encoding="utf-8") 14 | # Set the handler level 15 | file_handler.setLevel("DEBUG") 16 | # The handler outputs logs in the specified format 17 | file_handler.setFormatter(fmt) 18 | # Output to the console 19 | ch = logging.StreamHandler() 20 | # Set the handler level 21 | ch.setLevel("DEBUG") 22 | # The handler outputs logs in the specified format 23 | ch.setFormatter(fmt) 24 | # Attach the handlers to the logger to define the output channels 25 | # Log output to file 26 | logger.addHandler(file_handler) 27 | # Log output to console 28 | logger.addHandler(ch) 29 | 30 | TEST_MEDIA_PATH = "./test/media/" 31 | TEST_CONTENT_PATH = "./test/content/" 32 | TEST_MEDIA_FILE = [ 33 | "test001.mp4", 34 | "test002.mov", 35 | "test003.mkv", 36 | "test004.flv", 37 | "test005.mp3", 38 | "test006.MP4", 39 | ] 40 | 41 | TEST_MEDIA_FILE_LANG = ["test001_en.mp4"] 42 | TEST_MEDIA_FILE_SIMPLE = ["test001.mp4", "test005.mp3"] 43 | 44 | 45 | class TestArgs: 46 | def __init__( 47 | self, 48 | encoding="utf-8", 49 | sampling_rate=16000, 50 | bitrate="10m", 51 | lang="zh", 52 | prompt="", 53 | whisper_model="small", 54 | device=None, 55 | vad=False, 56 | force=False, 57 | ): 58 | self.inputs = [] 59 | self.bitrate = bitrate 60 | self.encoding = encoding 61 | self.sampling_rate = sampling_rate 62 | self.lang = lang 63 | self.prompt = prompt 64 | self.whisper_model = whisper_model 65 | self.device = device 66 | self.vad = vad 67 | self.force = force 68 | -------------------------------------------------------------------------------- /test/content/test.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:05,000 3 | 大家好,我的名字是AutoCut.这是一条用于测试的视频。 4 | 5 | 2 6 | 00:00:05,000 --> 00:00:10,260 7 | Hello, my name is AutoCut. This is a video for testing. 8 | 9 | -------------------------------------------------------------------------------- /test/content/test_md.md: -------------------------------------------------------------------------------- 1 | - [x] <-- Mark if you are done editing. 2 | 3 | 4 | 5 | Texts generated from [test001.srt](test001.srt).Mark the sentences to keep for autocut. 6 | The format is [subtitle_index,duration_in_second] subtitle context. 7 | 8 | - [ ] [1,00:00] 大家好,我的名字是AutoCut.这是一条用于测试的视频。 9 | - [x] [2,00:05] Hello, my name is AutoCut. This is a video for testing. 
10 | -------------------------------------------------------------------------------- /test/content/test_srt.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:05,000 3 | 大家好,我的名字是AutoCut.这是一条用于测试的视频。 4 | 5 | -------------------------------------------------------------------------------- /test/media/test001.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test001.mp4 -------------------------------------------------------------------------------- /test/media/test001_en.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test001_en.mp4 -------------------------------------------------------------------------------- /test/media/test002.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test002.mov -------------------------------------------------------------------------------- /test/media/test003.mkv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test003.mkv -------------------------------------------------------------------------------- /test/media/test004.flv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test004.flv -------------------------------------------------------------------------------- /test/media/test005.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test005.mp3 -------------------------------------------------------------------------------- /test/media/test006.MP4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/test/media/test006.MP4 -------------------------------------------------------------------------------- /test/test_cut.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from parameterized import parameterized, param 6 | 7 | from autocut.cut import Cutter 8 | from config import TestArgs, TEST_MEDIA_PATH, TEST_MEDIA_FILE_SIMPLE, TEST_CONTENT_PATH 9 | 10 | 11 | class TestCut(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | logging.info("检查测试文件是否正常存在") 15 | scan_file = os.listdir(TEST_MEDIA_PATH) 16 | logging.info( 17 | "应存在文件列表:" + str(TEST_MEDIA_FILE_SIMPLE) + " 扫描到文件列表:" + str(scan_file) 18 | ) 19 | for file in TEST_MEDIA_FILE_SIMPLE: 20 | assert file in scan_file 21 | 22 | def tearDown(self): 23 | for file in TEST_MEDIA_FILE_SIMPLE: 24 | namepart = os.path.join( 25 | TEST_MEDIA_PATH, os.path.splitext(file)[0] + "_cut." 
26 | ) 27 | if os.path.exists(namepart + "mp4"): 28 | os.remove(namepart + "mp4") 29 | if os.path.exists(namepart + "mp3"): 30 | os.remove(namepart + "mp3") 31 | 32 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 33 | def test_srt_cut(self, file_name): 34 | args = TestArgs() 35 | args.inputs = [ 36 | os.path.join(TEST_MEDIA_PATH, file_name), 37 | os.path.join(TEST_CONTENT_PATH, "test_srt.srt"), 38 | ] 39 | cut = Cutter(args) 40 | cut.run() 41 | namepart = os.path.join( 42 | TEST_MEDIA_PATH, os.path.splitext(file_name)[0] + "_cut." 43 | ) 44 | self.assertTrue( 45 | os.path.exists(namepart + "mp4") or os.path.exists(namepart + "mp3") 46 | ) 47 | 48 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 49 | def test_md_cut(self, file_name): 50 | args = TestArgs() 51 | args.inputs = [ 52 | TEST_MEDIA_PATH + file_name, 53 | os.path.join(TEST_CONTENT_PATH, "test.srt"), 54 | os.path.join(TEST_CONTENT_PATH, "test_md.md"), 55 | ] 56 | cut = Cutter(args) 57 | cut.run() 58 | namepart = os.path.join( 59 | TEST_MEDIA_PATH, os.path.splitext(file_name)[0] + "_cut." 60 | ) 61 | self.assertTrue( 62 | os.path.exists(namepart + "mp4") or os.path.exists(namepart + "mp3") 63 | ) 64 | -------------------------------------------------------------------------------- /test/test_transcribe.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from parameterized import parameterized, param 6 | 7 | from autocut.utils import MD 8 | from config import ( 9 | TEST_MEDIA_FILE, 10 | TestArgs, 11 | TEST_MEDIA_FILE_SIMPLE, 12 | TEST_MEDIA_FILE_LANG, 13 | TEST_MEDIA_PATH, 14 | ) 15 | from autocut.transcribe import Transcribe 16 | 17 | 18 | class TestTranscribe(unittest.TestCase): 19 | @classmethod 20 | def setUpClass(cls): 21 | logging.info("检查测试文件是否正常存在") 22 | scan_file = os.listdir(TEST_MEDIA_PATH) 23 | logging.info( 24 | "应存在文件列表:" 25 | + str(TEST_MEDIA_FILE) 26 | + str(TEST_MEDIA_FILE_LANG) 27 | + str(TEST_MEDIA_FILE_SIMPLE) 28 | + " 扫描到文件列表:" 29 | + str(scan_file) 30 | ) 31 | for file in TEST_MEDIA_FILE: 32 | assert file in scan_file 33 | for file in TEST_MEDIA_FILE_LANG: 34 | assert file in scan_file 35 | for file in TEST_MEDIA_FILE_SIMPLE: 36 | assert file in scan_file 37 | 38 | @classmethod 39 | def tearDownClass(cls): 40 | for file in os.listdir(TEST_MEDIA_PATH): 41 | if file.endswith("md") or file.endswith("srt"): 42 | os.remove(TEST_MEDIA_PATH + file) 43 | 44 | def tearDown(self): 45 | for file in TEST_MEDIA_FILE_SIMPLE: 46 | if os.path.exists(TEST_MEDIA_PATH + file.split(".")[0] + ".md"): 47 | os.remove(TEST_MEDIA_PATH + file.split(".")[0] + ".md") 48 | if os.path.exists(TEST_MEDIA_PATH + file.split(".")[0] + ".srt"): 49 | os.remove(TEST_MEDIA_PATH + file.split(".")[0] + ".srt") 50 | 51 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE]) 52 | def test_default_transcribe(self, file_name): 53 | logging.info("检查默认参数生成字幕") 54 | args = TestArgs() 55 | args.inputs = [TEST_MEDIA_PATH + file_name] 56 | transcribe = Transcribe(args) 57 | transcribe.run() 58 | self.assertTrue( 59 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 60 | ) 61 | 62 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE]) 63 | def test_jump_done_transcribe(self, file_name): 64 | logging.info("检查默认参数跳过生成字幕") 65 | args = TestArgs() 66 | args.inputs = [TEST_MEDIA_PATH + file_name] 67 | transcribe = Transcribe(args) 68 | transcribe.run() 69 | self.assertTrue( 70 | 
os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 71 | ) 72 | 73 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_LANG]) 74 | def test_en_transcribe(self, file_name): 75 | logging.info("检查--lang='en'参数生成字幕") 76 | args = TestArgs() 77 | args.lang = "en" 78 | args.inputs = [TEST_MEDIA_PATH + file_name] 79 | transcribe = Transcribe(args) 80 | transcribe.run() 81 | self.assertTrue( 82 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 83 | ) 84 | 85 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_LANG]) 86 | def test_force_transcribe(self, file_name): 87 | logging.info("检查--force参数生成字幕") 88 | args = TestArgs() 89 | args.force = True 90 | args.inputs = [TEST_MEDIA_PATH + file_name] 91 | md0_lens = len( 92 | "".join( 93 | MD( 94 | TEST_MEDIA_PATH + file_name.split(".")[0] + ".md", args.encoding 95 | ).lines 96 | ) 97 | ) 98 | transcribe = Transcribe(args) 99 | transcribe.run() 100 | md1_lens = len( 101 | "".join( 102 | MD( 103 | TEST_MEDIA_PATH + file_name.split(".")[0] + ".md", args.encoding 104 | ).lines 105 | ) 106 | ) 107 | self.assertLessEqual(md1_lens, md0_lens) 108 | 109 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 110 | def test_encoding_transcribe(self, file_name): 111 | logging.info("检查--encoding参数生成字幕") 112 | args = TestArgs() 113 | args.encoding = "gbk" 114 | args.inputs = [TEST_MEDIA_PATH + file_name] 115 | transcribe = Transcribe(args) 116 | transcribe.run() 117 | with open( 118 | os.path.join(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md"), 119 | encoding="gbk", 120 | ): 121 | self.assertTrue(True) 122 | 123 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 124 | def test_vad_transcribe(self, file_name): 125 | logging.info("检查--vad参数生成字幕") 126 | args = TestArgs() 127 | args.force = True 128 | args.vad = True 129 | args.inputs = [TEST_MEDIA_PATH + file_name] 130 | transcribe = Transcribe(args) 131 | transcribe.run() 132 | self.assertTrue( 133 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 134 | ) 135 | --------------------------------------------------------------------------------
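The bundled snakers4_silero-vad_master copy can also be exercised directly, without going through torch.hub as the notebooks do. The sketch below is a minimal offline variant of the "Full Audio" cells in silero-vad.ipynb, built only from functions shown in utils_vad.py (init_jit_model, read_audio, get_speech_timestamps). It assumes torch and torchaudio are installed, that it is run from the repository root so the vendored directory is importable, and that "example.wav" is a placeholder for any audio file you supply; none of these assumptions come from the repository itself.

# Minimal offline VAD sketch. Assumptions: run from the repository root,
# torch/torchaudio installed, and "example.wav" is a user-supplied placeholder.
import sys

sys.path.append("snakers4_silero-vad_master")  # make the vendored utils_vad importable

from utils_vad import init_jit_model, read_audio, get_speech_timestamps

SAMPLING_RATE = 16000

# Load the bundled TorchScript model instead of downloading it via torch.hub.
model = init_jit_model("snakers4_silero-vad_master/files/silero_vad.jit")

# read_audio converts to 16 kHz mono, matching the notebook usage.
wav = read_audio("example.wav", sampling_rate=SAMPLING_RATE)

# Same call as in silero-vad.ipynb; returns a list of {'start': ..., 'end': ...} dicts in samples.
speech_timestamps = get_speech_timestamps(wav, model, threshold=0.5, sampling_rate=SAMPLING_RATE)
print(speech_timestamps)

hubconf.py's silero_vad() resolves the same files/silero_vad.jit path (relative to the hub directory), so this is effectively the offline counterpart of the torch.hub.load calls used in the notebooks.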