├── .github
│   └── workflows
│       ├── ci.yml
│       └── release.yml
├── .gitignore
├── Dockerfile
├── Dockerfile.cuda
├── LICENSE
├── README.md
├── autocut.py
├── autocut.spec
├── autocut
│   ├── __init__.py
│   ├── __main__.py
│   ├── cut.py
│   ├── daemon.py
│   ├── main.py
│   ├── transcribe.py
│   └── utils.py
├── build.sh
├── imgs
│   └── typora.jpg
├── requirements.txt
├── setup.cfg
├── setup.py
├── snakers4_silero-vad_master
│   ├── .github
│   │   └── ISSUE_TEMPLATE
│   │       ├── bug_report.md
│   │       ├── feature_request.md
│   │       └── questions---help---support.md
│   ├── CODE_OF_CONDUCT.md
│   ├── LICENSE
│   ├── README.md
│   ├── examples
│   │   ├── colab_record_example.ipynb
│   │   ├── cpp
│   │   │   ├── README.md
│   │   │   ├── silero-vad-onnx.cpp
│   │   │   └── wav.h
│   │   ├── microphone_and_webRTC_integration
│   │   │   ├── README.md
│   │   │   └── microphone_and_webRTC_integration.py
│   │   └── pyaudio-streaming
│   │       ├── README.md
│   │       └── pyaudio-streaming-examples.ipynb
│   ├── files
│   │   ├── lang_dict_95.json
│   │   ├── lang_group_dict_95.json
│   │   ├── silero_logo.jpg
│   │   ├── silero_vad.jit
│   │   └── silero_vad.onnx
│   ├── hubconf.py
│   ├── silero-vad.ipynb
│   └── utils_vad.py
└── test
    ├── config.py
    ├── content
    │   ├── test.srt
    │   ├── test_md.md
    │   └── test_srt.srt
    ├── media
    │   ├── test001.mp4
    │   ├── test001_en.mp4
    │   ├── test002.mov
    │   ├── test003.mkv
    │   ├── test004.flv
    │   ├── test005.mp3
    │   └── test006.MP4
    ├── test_cut.py
    └── test_transcribe.py
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches:
7 | - main
8 |
9 | jobs:
10 | lint_and_test:
11 | runs-on: ${{ matrix.os }}-latest
12 | strategy:
13 | matrix:
14 | python-version: ['3.9', '3.10']
15 |         # macOS runners do not support M1 yet
16 | os: [ubuntu, windows, macos]
17 | steps:
18 | - uses: actions/checkout@v3
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 | - name: Set Variables
24 | id: set_variables
25 | shell: bash
26 | run: |
27 | echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT
28 | echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT
29 | - name: Cache PIP
30 | uses: actions/cache@v3
31 | with:
32 | path: ${{ steps.set_variables.outputs.PIP_CACHE }}
33 | key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }}
34 |
35 |       - name: Set up ffmpeg for different platforms
36 | uses: FedericoCarboni/setup-ffmpeg@master
37 |
38 | - name: Install dependencies
39 | run: |
40 | python -m pip install --upgrade pip
41 | pip install .
42 | pip install pytest black
43 | - name: Run Test
44 | run: pytest test/
45 | - name: Run Lint
46 | run: black . --check
47 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10
7 |
8 | jobs:
9 |
10 | createrelease:
11 | name: Create Release
12 | runs-on: [ubuntu-latest]
13 | steps:
14 | - name: Create Release
15 | id: create_release
16 | uses: actions/create-release@v1
17 | env:
18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
19 | with:
20 | tag_name: ${{ github.ref }}
21 | release_name: Release ${{ github.ref }}
22 | draft: false
23 | prerelease: false
24 | - name: Output Release URL File
25 | run: echo "${{ steps.create_release.outputs.upload_url }}" > release_url.txt
26 | - name: Save Release URL File for publish
27 | uses: actions/upload-artifact@v1
28 | with:
29 | name: release_url
30 | path: release_url.txt
31 |
32 | build:
33 | name: Build packages
34 | needs: createrelease
35 | runs-on: ${{ matrix.os }}
36 | strategy:
37 | matrix:
38 | include:
39 | - os: macos-latest
40 | TARGET: macos
41 | OUT_FILE_NAME: autocut_macos.zip
42 | ASSET_MIME: application/zip
43 | - os: ubuntu-latest
44 | TARGET: linux
45 | OUT_FILE_NAME: autocut_linux.zip
46 | ASSET_MIME: application/zip
47 | - os: windows-latest
48 | TARGET: windows
49 | OUT_FILE_NAME: autocut_windows.zip
50 | ASSET_MIME: application/zip
51 | steps:
52 | - uses: actions/checkout@v1
53 | - name: Set up Python 3.9
54 | uses: actions/setup-python@v2
55 | with:
56 | python-version: 3.9
57 | - name: Install dependencies
58 | run: |
59 | python -m pip install --upgrade pip
60 | pip install virtualenv
61 | python -m virtualenv ./.venv
62 | - name: Build with pyinstaller for Windows
63 | if: runner.os == 'windows'
64 | run: |
65 | .venv\Scripts\activate
66 | pip install -r requirements.txt
67 | pyinstaller autocut.spec -y
68 | - name: Build with pyinstaller for Other-${{matrix.TARGET}}
69 | if: runner.os != 'windows'
70 | run: |
71 | source .venv/bin/activate
72 | pip install -r requirements.txt
73 | pyinstaller autocut.spec -y
74 | - name: Zip Files
75 | uses: vimtor/action-zip@v1
76 | with:
77 | files: ./dist
78 | dest: ./dist/autocut_${{matrix.TARGET}}.zip
79 | - name: Load Release URL File from release job
80 | uses: actions/download-artifact@v1
81 | with:
82 | name: release_url
83 | - name: Get Release File Name & Upload URL
84 | id: get_release_info
85 | shell: bash
86 | run: |
87 | value=`cat release_url/release_url.txt`
88 |           echo "upload_url=$value" >> $GITHUB_OUTPUT
89 | - name: Upload Release Asset
90 | id: upload-release-asset
91 | uses: actions/upload-release-asset@v1
92 | env:
93 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
94 | with:
95 | upload_url: ${{ steps.get_release_info.outputs.upload_url }}
96 | asset_path: ./dist/${{ matrix.OUT_FILE_NAME}}
97 | asset_name: ${{ matrix.OUT_FILE_NAME}}
98 | asset_content_type: ${{ matrix.ASSET_MIME}}
99 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | log/
131 |
132 | # vad_model
133 | # snakers4_silero-vad_master
134 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9-slim as base
2 |
3 | RUN mkdir /autocut
4 | COPY ./ /autocut
5 | WORKDIR /autocut
6 |
7 | RUN apt update && \
8 | apt install -y git && \
9 | apt install -y ffmpeg
10 |
11 | RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu && \
12 | pip install .
--------------------------------------------------------------------------------
/Dockerfile.cuda:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime
2 |
3 | RUN mkdir /autocut
4 | COPY ./ /autocut
5 | WORKDIR /autocut
6 |
7 | RUN apt update && \
8 | apt install -y git && \
9 | apt install -y ffmpeg
10 |
11 | RUN pip install .
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoCut: Cut Videos by Subtitles
2 |
3 | AutoCut automatically generates subtitles for your video. You then mark the sentences to keep, and AutoCut cuts the corresponding segments out of the video and saves them. There is no need for video-editing software; editing a text file is all it takes.
4 |
5 | ## Usage Example
6 |
7 | Suppose your recorded videos are in the folder `2022-11-04/`. Then run
8 |
9 | ```bash
10 | autocut -d 2022-11-04
11 | ```
12 |
13 | > Tip: if you record your screen with OBS, you can replace the space in `Settings -> Advanced -> Recording -> Filename Formatting` with `/`, i.e. `%CCYY-%MM-%DD/%hh-%mm-%ss`, so that videos are saved into folders named by date.
14 |
15 | AutoCut will keep extracting subtitles from, and cutting, the videos in this folder. For example, as soon as you finish recording a video saved as `11-28-18.mp4`, AutoCut generates `11-28-18.md`. Once you have marked the sentences to keep in it, AutoCut cuts out `11-28-18_cut.mp4` and generates `11-28-18_cut.md` for previewing the result.
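
For reference, a freshly generated `11-28-18.md` looks roughly like the sketch below (the subtitle lines are invented, and the exact embed markup and wording may differ between versions):

```markdown
- [ ] <-- Mark if you are done editing.

<video controls="true" allowfullscreen="true"> <source src="11-28-18.mp4" type="video/mp4"> </video>

Texts generated from [11-28-18.srt](11-28-18.srt). Mark the sentences to keep for autocut.
The format is [subtitle_index,duration_in_second] subtitle context.

- [x] [1,00:00] the first sentence, marked to keep
- [ ] [2,00:07] < No Speech >
- [x] [3,00:12] another sentence to keep
```

Checking the box on the `<-- Mark if you are done editing.` line is what tells AutoCut you are done; only then is the cut produced.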
16 |
17 | You can use any Markdown editor. I often use VS Code and Typora, for example. The image below shows `11-28-18.md` being edited in Typora.
18 |
19 | ![Editing the generated Markdown in Typora](imgs/typora.jpg)
20 |
21 | After everything is done, mark the videos to concatenate in `autocut.md`, and AutoCut will output `autocut_merged.mp4` together with the corresponding subtitle file.
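
The merge list `autocut.md` uses the same checkbox format; below is a sketch with invented filenames (the short description after each entry is taken from the first few transcribed words):

```markdown
- [ ] <-- Mark if you are done editing.

Select the files that will be used to generate `autocut_merged.mp4`

- [x] [11-28-18.mp4](11-28-18.md) [Edited] welcome to this video today we will …
- [ ] [11-40-03.mp4](11-40-03.md)
```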
22 |
23 | ## Installation
24 |
25 | First install the Python package
26 |
27 | ```
28 | pip install git+https://github.com/mli/autocut.git
29 | ```
30 |
31 | ## Installing Locally for Testing
32 |
33 |
34 | ```
35 | git clone https://github.com/mli/autocut
36 | cd autocut
37 | pip install .
38 | ```
39 |
40 |
41 | > The above installs [PyTorch](https://pytorch.org/). If you want to run on a GPU and the default version does not match your setup, install PyTorch first. If installing Whisper fails, see the [official documentation](https://github.com/openai/whisper#setup).
42 |
43 | You also need to install [ffmpeg](https://ffmpeg.org/)
44 |
45 | ```
46 | # on Ubuntu or Debian
47 | sudo apt update && sudo apt install ffmpeg
48 |
49 | # on Arch Linux
50 | sudo pacman -S ffmpeg
51 |
52 | # on macOS using Homebrew (https://brew.sh/)
53 | brew install ffmpeg
54 |
55 | # on Windows using Scoop (https://scoop.sh/)
56 | scoop install ffmpeg
57 | ```
58 |
59 | ## Docker Installation
60 |
61 | First clone the project locally.
62 |
63 | ```bash
64 | git clone https://github.com/mli/autocut.git
65 | ```
66 |
67 | ### CPU Version
68 |
69 | Go to the project root directory and build the Docker image.
70 |
71 | ```bash
72 | docker build -t autocut .
73 | ```
74 |
75 | Run the command below to create a Docker container, and you can use it right away.
76 |
77 | ```bash
78 | docker run -it --rm -v E:\autocut:/autocut/video autocut /bin/bash
79 | ```
80 |
81 | Here `-v` maps `E:\autocut`, the host folder holding your videos, to the `/autocut/video` directory inside the container. Change `E:\autocut` to wherever your videos are stored.
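
Inside the container you can then run AutoCut against the mounted directory, for example (assuming the mapping above):

```bash
autocut -d /autocut/video
```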
82 |
83 | ### GPU Version
84 |
85 | GPU acceleration requires an NVIDIA graphics card on the host with the matching driver installed. Then, from the project root directory, run the command below to build the Docker image.
86 |
87 | ```bash
88 | docker build -f ./Dockerfile.cuda -t autocut-gpu .
89 | ```
90 |
91 | When using GPU acceleration, add the `--gpus all` flag when running the Docker container.
92 |
93 | ```bash
94 | docker run --gpus all -it --rm -v E:\autocut:/autocut/video autocut-gpu
95 | ```
96 |
97 | ## More Usage Options
98 |
99 | ### Transcribe a video into `.srt` and `.md` results
100 |
101 | ```bash
102 | autocut -t 22-52-00.mp4
103 | ```
104 |
105 | 1. If you are not satisfied with the transcription quality, you can use a larger model, e.g.
106 |
107 | ```bash
108 | autocut -t 22-52-00.mp4 --whisper-model large
109 | ```
110 |
111 | The default is `small`. The better models are `medium` and `large`, but a GPU is recommended for reasonable speed. The faster `tiny` and `base` also work, at the cost of transcription quality.
112 |
113 |
114 | ### Cut a video
115 |
116 | ```bash
117 | autocut -c 22-52-00.mp4 22-52-00.srt 22-52-00.md
118 | ```
119 |
120 | 1. The default video bitrate is `--bitrate 10m`; raise or lower it as needed.
121 | 2. If you are not comfortable with the Markdown file, you can instead delete unwanted sentences directly in the `srt` file and simply omit the `md` filename when cutting, i.e. `autocut -c 22-52-00.mp4 22-52-00.srt`.
122 | 3. If you only have a `srt` file that is inconvenient to edit, use the commands below to generate a `md` file and edit that instead. In this case the `md` is generated strictly from the `srt`, so hint texts such as `no speech` will not appear.
123 |
124 | ```bash
125 | autocut -m test.srt test.mp4
126 | autocut -m test.mp4 test.srt # the video and subtitle can be passed in either order
127 | autocut -m test.srt # you can also pass just the subtitle file
128 | ```
129 |
130 |
131 | ### Tips
132 |
133 |
134 | 1. Fluently spoken videos transcribe better, a consequence of the distribution of Whisper's training data. For a given video you can do a rough first pass of sentence selection, then cut the resulting video a second time.
135 | 2. ~~The subtitles of the final video usually still need some light editing. You can edit the `md` file directly (it is more compact than the `srt` file and embeds the video), then run `autocut -s 22-52-00.md 22-52-00.srt` to generate the updated subtitles `22-52-00_edited.srt`. Note that this ignores whether a sentence is selected and converts everything to `srt`.~~
136 | 3. The subtitles of the final video usually still need some light editing, but the `srt` file has too many blank lines. You can run `autocut -s 22-52-00.srt` to generate a more compact version, `22-52-00_compact.srt`, that is easier to edit (the format is not valid srt, but editors such as VS Code will still syntax-highlight it); see the example after this list. When you are done editing, `autocut -s 22-52-00_compact.srt` converts it back to the normal format.
137 | 4. Typora and VS Code are both convenient for editing the Markdown, and each has shortcuts to mark one or more lines. Video preview in VS Code seems a bit flaky, though.
138 | 5. Videos are exported with ffmpeg. On Apple M1 chips it cannot use the GPU, so exporting is slower than in professional video software.
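
For tip 3, the compact format written by `autocut -s` puts each subtitle on a single line as `start --> end text`; a sketch with invented timestamps and text:

```text
00:00:00,000 --> 00:00:01,500 the first subtitle
00:00:01,900 --> 00:00:03,600 the second subtitle
```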
139 |
140 | ### FAQ
141 |
142 | 1. **The output is garbled?**
143 |
144 | AutoCut outputs `utf-8` by default. Make sure your editor also decodes with `utf-8`. You can specify another encoding with `--encoding`, but note that the encoding used when generating subtitle files must match the one used when cutting with them. For example, with `gbk`:
145 |
146 | ```bash
147 | autocut -t test.mp4 --encoding=gbk
148 | autocut -c test.mp4 test.srt test.md --encoding=gbk
149 | ```
150 |
151 | If a `md` file generated with another encoding (such as `gbk`) is opened in Typora, Typora may automatically transcode it to yet another encoding; cutting with the encoding you originally specified may then fail with errors such as unsupported encoding. So after editing in Typora, re-save the file in the encoding you need with an editor such as VS Code before cutting.
152 |
153 | 2. **How do I transcribe on a GPU?**
154 |
155 | When you have an NVIDIA GPU and the matching version of PyTorch installed, transcription runs on the GPU. You can check whether GPU support is available with:
156 |
157 | ```bash
158 | python -c "import torch; print(torch.cuda.is_available())"
159 | ```
160 |
161 | Otherwise, manually install the matching GPU build of PyTorch before installing AutoCut.
162 |
163 | 3. **Out-of-memory errors on the GPU.**
164 |
165 | Whisper's large models need a fair amount of GPU memory. If yours is not enough, use a smaller model such as `small`. If you still want a large model, force the CPU with `--device`, e.g.
166 |
167 | ```bash
168 | autocut -t 11-28-18.mp4 --whisper-model large --device cpu
169 | ```
170 |
171 | 4. **Can I install it with `pip`?**
172 |
173 | Whisper is now published on PyPI, so you can install it directly with `pip install openai-whisper`.
174 |
175 | [https://github.com/openai/whisper#setup](https://github.com/openai/whisper#setup)
176 |
177 | [https://pypi.org/project/openai-whisper/](https://pypi.org/project/openai-whisper/)
178 |
179 | ## Contributing
180 |
181 | [Here are some features we would like to build](https://github.com/mli/autocut/issues/22); contributions are welcome.
182 |
183 | ### Code Structure
184 | ```text
185 | autocut
186 | │ .gitignore
187 | │ LICENSE
188 | │  README.md # update README.md whenever an addition or change should be visible to users
189 | │ setup.py
190 | │
191 | └─autocut # the core code lives in the autocut folder; new features are generally implemented or modified here
192 | │ cut.py
193 | │ daemon.py
194 | │ main.py
195 | │ transcribe.py
196 | │ utils.py
197 | └─ __init__.py
198 |
199 | ```
200 |
201 | ### Installing Dependencies
202 | Before installing this project's dependencies, it is worth getting familiar with virtual environments via Anaconda or venv; we recommend **setting up the development environment for this project inside a virtual environment**.
203 | Concretely, follow the [installation steps above](./README.md#installation) inside the virtual environment you created.
204 |
205 | > Why develop in a virtual environment?
206 | >
207 | > For one thing, it keeps different development environments from polluting each other.
208 | >
209 | > More importantly, this project is itself a Python package, so after installation AutoCut's code also becomes a dependency of your environment.
210 | > **Therefore, after updating the code you must reinstall it into the environment before the new code can be invoked.**
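
A possible setup along these lines, assuming Python 3.9+ and the repository root as the working directory:

```bash
python -m venv .venv
source .venv/bin/activate   # on Windows: .venv\Scripts\activate
pip install .

# after changing the code, reinstall so the environment picks up the new code
pip install .
```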
211 |
212 | ### Development
213 |
214 | 1. The code style currently follows PEP 8; you can use an auto-formatter for this.
215 | 2. `utils.py` holds globally shared utility functions.
216 | 3. `transcribe.py` invokes the model to generate the `srt` and `md` files.
217 | 4. `cut.py` cuts and merges videos based on the marked-up `md` or `srt` files.
218 | 5. `daemon.py` watches a folder, generating subtitles and cutting videos.
219 | 6. `main.py` declares the command-line arguments and dispatches to the corresponding functionality (see the sketch after this list).
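
As a sketch of how these pieces fit together: the CLI builds an argparse namespace and hands it to one of the classes above, and you can do the same programmatically. The field values below are illustrative defaults mirroring the arguments declared in `main.py`; this is not an official API.

```python
import argparse

from autocut.transcribe import Transcribe

# mirror the arguments declared in main.py; values here are illustrative
args = argparse.Namespace(
    inputs=["11-28-18.mp4"],
    lang="zh",
    prompt="",
    whisper_model="small",
    bitrate="10m",
    vad="auto",
    force=False,
    encoding="utf-8",
    device=None,
)
Transcribe(args).run()  # writes 11-28-18.srt and 11-28-18.md next to the video
```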
220 |
221 | During development, please make sure changes go in the right place and reuse code sensibly;
222 | put utility functions in `utils.py` whenever possible.
223 | The code format follows PEP 8; just keep variable names semantically meaningful.
224 |
225 | After development, the most important step is **testing**: before committing, please test every part **directly related to your change** as well as every part **your change may affect**, and make sure everything still works.
226 | CI currently runs on GitHub Actions and linting uses black, so run `black` before committing.
227 |
228 | ### Committing
229 |
230 | 1. Describe your changes clearly in English in the commit message, starting with a lowercase letter.
231 | 2. Keep each commit reasonably small so it can be described briefly; this also makes later changes easier to track down.
232 | 3. In a PR, summarize the changes in the title and describe them in detail in the contents.
233 | 4. Run the tests: `pip install pytest`, then `pytest test`.
234 | 5. Run the linter: `pip install black`, then `black .`.
235 |
--------------------------------------------------------------------------------
/autocut.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | from autocut import main
3 |
4 | if __name__ == "__main__":
5 | multiprocessing.freeze_support()
6 | main.main()
7 |
--------------------------------------------------------------------------------
/autocut.spec:
--------------------------------------------------------------------------------
1 | # -*- mode: python ; coding: utf-8 -*-
2 | from PyInstaller.utils.hooks import copy_metadata, collect_data_files
3 | from os import path
4 | import platform
5 | plat = platform.system().lower()
6 |
7 | datas = []
8 | datas += collect_data_files('torch')
9 | datas += copy_metadata('tqdm')
10 | datas += copy_metadata('regex')
11 | datas += copy_metadata('requests')
12 | datas += copy_metadata('packaging')
13 | datas += copy_metadata('filelock')
14 | datas += copy_metadata('numpy')
15 | datas += copy_metadata('tokenizers')
16 | datas += copy_metadata('torch')
17 |
18 | datas += collect_data_files('transformers', include_py_files=True)
19 |
20 | datas += [(path.join(
21 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages',
22 | 'moviepy'
23 | ), 'moviepy')]
24 | datas += [(path.join(
25 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages',
26 | 'imageio_ffmpeg'
27 | ), 'imageio_ffmpeg')]
28 | datas += [(path.join(
29 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages',
30 | 'torchaudio'
31 | ), 'torchaudio')]
32 | datas += [(path.join(
33 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages',
34 | 'whisper'
35 | ), 'whisper')]
36 | datas += [(path.join(
37 | './.venv/Lib/site-packages' if plat == 'windows' else './.venv/lib/python3.9/site-packages',
38 | 'opencc'
39 | ), 'opencc')]
40 | datas += [('./snakers4_silero-vad_master', './snakers4_silero-vad_master')]
41 | if not plat == 'windows':
42 | datas += [('./build.sh', './')]
43 |
44 | block_cipher = None
45 |
46 |
47 | a = Analysis(
48 | ['autocut.py'],
49 | pathex=[],
50 | binaries=[],
51 | datas=datas,
52 | hiddenimports=[],
53 | hookspath=[],
54 | hooksconfig={},
55 | runtime_hooks=[],
56 | excludes=[],
57 | win_no_prefer_redirects=False,
58 | win_private_assemblies=False,
59 | cipher=block_cipher,
60 | noarchive=False,
61 | )
62 | pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
63 |
64 | exe = EXE(
65 | pyz,
66 | a.scripts,
67 | [],
68 | exclude_binaries=True,
69 | name='autocut',
70 | debug=False,
71 | bootloader_ignore_signals=False,
72 | strip=False,
73 | upx=True,
74 | console=True,
75 | disable_windowed_traceback=False,
76 | argv_emulation=False,
77 | target_arch=None,
78 | codesign_identity=None,
79 | entitlements_file=None,
80 | )
81 | coll = COLLECT(
82 | exe,
83 | a.binaries,
84 | a.zipfiles,
85 | a.datas,
86 | strip=False,
87 | upx=True,
88 | upx_exclude=[],
89 | name='autocut',
90 | )
91 |
--------------------------------------------------------------------------------
/autocut/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.3"
2 |
--------------------------------------------------------------------------------
/autocut/__main__.py:
--------------------------------------------------------------------------------
1 | from .main import main
2 |
3 | if __name__ == "__main__":
4 | main()
5 |
--------------------------------------------------------------------------------
/autocut/cut.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import re
4 |
5 | import srt
6 | from moviepy import editor
7 |
8 | from . import utils
9 |
10 |
11 | # Merge videos
12 | class Merger:
13 | def __init__(self, args):
14 | self.args = args
15 |
16 | def write_md(self, videos):
17 | md = utils.MD(self.args.inputs[0], self.args.encoding)
18 | num_tasks = len(md.tasks())
19 |         # Don't overwrite if it is already marked as done or there are no new videos
20 | if md.done_editing() or num_tasks == len(videos) + 1:
21 | return
22 |
23 | md.clear()
24 | md.add_done_editing(False)
25 |         md.add("\nSelect the files that will be used to generate `autocut_merged.mp4`\n")
26 | base = lambda fn: os.path.basename(fn)
27 | for f in videos:
28 | md_fn = utils.change_ext(f, "md")
29 | video_md = utils.MD(md_fn, self.args.encoding)
30 |             # select a few words to describe the video
31 | desc = ""
32 | if len(video_md.tasks()) > 1:
33 | for _, t in video_md.tasks()[1:]:
34 | m = re.findall(r"\] (.*)", t)
35 | if m and "no speech" not in m[0].lower():
36 | desc += m[0] + " "
37 | if len(desc) > 50:
38 | break
39 | md.add_task(
40 | False,
41 | f'[{base(f)}]({base(md_fn)}) {"[Edited]" if video_md.done_editing() else ""} {desc}',
42 | )
43 | md.write()
44 |
45 | def run(self):
46 | md_fn = self.args.inputs[0]
47 | md = utils.MD(md_fn, self.args.encoding)
48 | if not md.done_editing():
49 | return
50 |
51 | videos = []
52 | for m, t in md.tasks():
53 | if not m:
54 | continue
55 | m = re.findall(r"\[(.*)\]", t)
56 | if not m:
57 | continue
58 | fn = os.path.join(os.path.dirname(md_fn), m[0])
59 | logging.info(f"Loading {fn}")
60 | videos.append(editor.VideoFileClip(fn))
61 |
62 | dur = sum([v.duration for v in videos])
63 | logging.info(f"Merging into a video with {dur / 60:.1f} min length")
64 |
65 | merged = editor.concatenate_videoclips(videos)
66 | fn = os.path.splitext(md_fn)[0] + "_merged.mp4"
67 | merged.write_videofile(
68 | fn, audio_codec="aac", bitrate=self.args.bitrate
69 | ) # logger=None,
70 | logging.info(f"Saved merged video to {fn}")
71 |
72 |
73 | # Cut media
74 | class Cutter:
75 | def __init__(self, args):
76 | self.args = args
77 |
78 | def run(self):
79 | fns = {"srt": None, "media": None, "md": None}
80 | for fn in self.args.inputs:
81 | ext = os.path.splitext(fn)[1][1:]
82 | fns[ext if ext in fns else "media"] = fn
83 |
84 | assert fns["media"], "must provide a media filename"
85 | assert fns["srt"], "must provide a srt filename"
86 |
87 | is_video_file = utils.is_video(fns["media"].lower())
88 | outext = "mp4" if is_video_file else "mp3"
89 | output_fn = utils.change_ext(utils.add_cut(fns["media"]), outext)
90 | if utils.check_exists(output_fn, self.args.force):
91 | return
92 |
93 | with open(fns["srt"], encoding=self.args.encoding) as f:
94 | subs = list(srt.parse(f.read()))
95 |
96 | if fns["md"]:
97 | md = utils.MD(fns["md"], self.args.encoding)
98 | if not md.done_editing():
99 | return
100 | index = []
101 | for mark, sent in md.tasks():
102 | if not mark:
103 | continue
104 | m = re.match(r"\[(\d+)", sent.strip())
105 | if m:
106 | index.append(int(m.groups()[0]))
107 | subs = [s for s in subs if s.index in index]
108 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]} and {fns["md"]}')
109 | else:
110 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]}')
111 |
112 | segments = []
113 | # Avoid disordered subtitles
114 | subs.sort(key=lambda x: x.start)
115 | for x in subs:
116 | if len(segments) == 0:
117 | segments.append(
118 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()}
119 | )
120 | else:
121 | if x.start.total_seconds() - segments[-1]["end"] < 0.5:
122 | segments[-1]["end"] = x.end.total_seconds()
123 | else:
124 | segments.append(
125 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()}
126 | )
127 |
128 | if is_video_file:
129 | media = editor.VideoFileClip(fns["media"])
130 | else:
131 | media = editor.AudioFileClip(fns["media"])
132 |
133 |         # Add a fade between two clips. Not strictly necessary; keep the code here for reference
134 | # fade = 0
135 | # segments = _expand_segments(segments, fade, 0, video.duration)
136 | # clips = [video.subclip(
137 | # s['start'], s['end']).crossfadein(fade) for s in segments]
138 | # final_clip = editor.concatenate_videoclips(clips, padding = -fade)
139 |
140 | clips = [media.subclip(s["start"], s["end"]) for s in segments]
141 | if is_video_file:
142 | final_clip: editor.VideoClip = editor.concatenate_videoclips(clips)
143 | logging.info(
144 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}"
145 | )
146 |
147 | aud = final_clip.audio.set_fps(44100)
148 | final_clip = final_clip.without_audio().set_audio(aud)
149 | final_clip = final_clip.fx(editor.afx.audio_normalize)
150 |
151 |             # an alternative to bitrate is to use crf, e.g. ffmpeg_params=['-crf', '18']
152 | final_clip.write_videofile(
153 | output_fn, audio_codec="aac", bitrate=self.args.bitrate
154 | )
155 | else:
156 | final_clip: editor.AudioClip = editor.concatenate_audioclips(clips)
157 | logging.info(
158 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}"
159 | )
160 |
161 | final_clip = final_clip.fx(editor.afx.audio_normalize)
162 | final_clip.write_audiofile(
163 | output_fn, codec="libmp3lame", fps=44100, bitrate=self.args.bitrate
164 | )
165 |
166 | media.close()
167 | logging.info(f"Saved media to {output_fn}")
168 |
--------------------------------------------------------------------------------
/autocut/daemon.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import glob
3 | import logging
4 | import os
5 | import time
6 |
7 | from . import cut, transcribe, utils
8 |
9 |
10 | class Daemon:
11 | def __init__(self, args):
12 | self.args = args
13 | self.sleep = 1
14 |
15 | def run(self):
16 | assert len(self.args.inputs) == 1, "Must provide a single folder"
17 | while True:
18 | self._iter()
19 | time.sleep(self.sleep)
20 | self.sleep = min(60, self.sleep + 1)
21 |
22 | def _iter(self):
23 | folder = self.args.inputs[0]
24 | files = sorted(list(glob.glob(os.path.join(folder, "*"))))
25 | media_files = [f for f in files if utils.is_video(f) or utils.is_audio(f)]
26 | args = copy.deepcopy(self.args)
27 | for f in media_files:
28 | srt_fn = utils.change_ext(f, "srt")
29 | md_fn = utils.change_ext(f, "md")
30 | is_video_file = utils.is_video(f)
31 | if srt_fn not in files or md_fn not in files:
32 | args.inputs = [f]
33 | try:
34 | transcribe.Transcribe(args).run()
35 | self.sleep = 1
36 | break
37 |                 except RuntimeError:
38 |                     logging.warning(
39 |                         "Failed, possibly because the video is still being recorded"
40 |                     )
41 |                     # fall through; retry on a later iteration
42 | if md_fn in files:
43 | if utils.add_cut(md_fn) in files:
44 | continue
45 | md = utils.MD(md_fn, self.args.encoding)
46 | ext = "mp4" if is_video_file else "mp3"
47 | if not md.done_editing() or os.path.exists(
48 | utils.change_ext(utils.add_cut(f), ext)
49 | ):
50 | continue
51 | args.inputs = [f, md_fn, srt_fn]
52 | cut.Cutter(args).run()
53 | self.sleep = 1
54 |
55 | args.inputs = [os.path.join(folder, "autocut.md")]
56 | merger = cut.Merger(args)
57 | merger.write_md(media_files)
58 | merger.run()
59 |
--------------------------------------------------------------------------------
/autocut/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 |
5 | from whisper.tokenizer import LANGUAGES
6 | from . import utils
7 |
8 | def main():
9 | parser = argparse.ArgumentParser(
10 | description="Edit videos based on transcribed subtitles",
11 | formatter_class=argparse.RawDescriptionHelpFormatter,
12 | )
13 |
14 | logging.basicConfig(
15 | format="[autocut:%(filename)s:L%(lineno)d] %(levelname)-6s %(message)s"
16 | )
17 | logging.getLogger().setLevel(logging.INFO)
18 |
19 | parser.add_argument("inputs", type=str, nargs="+", help="Inputs filenames/folders")
20 | parser.add_argument(
21 | "-t",
22 | "--transcribe",
23 | help="Transcribe videos/audio into subtitles",
24 | action=argparse.BooleanOptionalAction,
25 | )
26 | parser.add_argument(
27 | "-c",
28 | "--cut",
29 | help="Cut a video based on subtitles",
30 | action=argparse.BooleanOptionalAction,
31 | )
32 | parser.add_argument(
33 | "-d",
34 | "--daemon",
35 | help="Monitor a folder to transcribe and cut",
36 | action=argparse.BooleanOptionalAction,
37 | )
38 | parser.add_argument(
39 | "-s",
40 | help="Convert .srt to a compact format for easier editing",
41 | action=argparse.BooleanOptionalAction,
42 | )
43 | parser.add_argument(
44 | "-m",
45 | "--to-md",
46 | help="Convert .srt to .md for easier editing",
47 | action=argparse.BooleanOptionalAction,
48 | )
49 | parser.add_argument(
50 | "--lang",
51 | type=str,
52 | default="zh",
53 | choices=LANGUAGES.keys(),
54 | help="The output language of transcription",
55 | )
56 | parser.add_argument(
57 | "--prompt", type=str, default="", help="initial prompt feed into whisper"
58 | )
59 | parser.add_argument(
60 | "--whisper-model",
61 | type=str,
62 | default="small",
63 | choices=["tiny", "base", "small", "medium", "large"],
64 | help="The whisper model used to transcribe.",
65 | )
66 | parser.add_argument(
67 | "--bitrate",
68 | type=str,
69 | default="10m",
70 |         help="The bitrate to export the cut video, such as 10m, 1m, or 500k",
71 | )
72 | parser.add_argument(
73 |         "--vad", help="Whether to use VAD", choices=["1", "0", "auto"], default="auto"
74 | )
75 | parser.add_argument(
76 | "--force",
77 | help="Force write even if files exist",
78 | action=argparse.BooleanOptionalAction,
79 | )
80 | parser.add_argument(
81 | "--encoding", type=str, default="utf-8", help="Document encoding format"
82 | )
83 | parser.add_argument(
84 | "--device",
85 | type=str,
86 | default=None,
87 | choices=["cpu", "cuda"],
88 |         help="Force CPU or GPU for transcribing. By default, the GPU is used automatically if available.",
89 | )
90 |
91 | args = parser.parse_args()
92 |
93 | if args.transcribe:
94 | from .transcribe import Transcribe
95 |
96 | Transcribe(args).run()
97 | elif args.to_md:
98 | from .utils import trans_srt_to_md
99 |
100 | if len(args.inputs) == 2:
101 | [input_1, input_2] = args.inputs
102 | base, ext = os.path.splitext(input_1)
103 | if ext != ".srt":
104 | input_1, input_2 = input_2, input_1
105 | trans_srt_to_md(args.encoding, args.force, input_1, input_2)
106 | elif len(args.inputs) == 1:
107 | trans_srt_to_md(args.encoding, args.force, args.inputs[0])
108 | else:
109 |             logging.warning(
110 |                 "Wrong number of files; please pass in a .srt file, optionally together with a video file"
111 |             )
112 | elif args.cut:
113 | from .cut import Cutter
114 |
115 | Cutter(args).run()
116 | elif args.daemon:
117 | from .daemon import Daemon
118 |
119 | Daemon(args).run()
120 | elif args.s:
121 | utils.compact_rst(args.inputs[0], args.encoding)
122 | else:
123 | logging.warning("No action, use -c, -t or -d")
124 |
125 |
126 | if __name__ == "__main__":
127 | main()
128 |
--------------------------------------------------------------------------------
/autocut/transcribe.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import os
4 | import sys
5 | import time
6 |
7 | import opencc
8 | import srt
9 | import torch
10 | import whisper
11 |
12 | from tqdm import tqdm
13 |
14 | from . import utils
15 |
16 |
17 | def process(whisper_model, audio, seg, lang, prompt):
18 | r = whisper_model.transcribe(
19 | audio[int(seg["start"]) : int(seg["end"])],
20 | task="transcribe",
21 | language=lang,
22 | initial_prompt=prompt,
23 | )
24 | r["origin_timestamp"] = seg
25 | return r
26 |
27 |
28 | class Transcribe:
29 | def __init__(self, args):
30 | self.args = args
31 | self.sampling_rate = 16000
32 | self.whisper_model = None
33 | self.vad_model = None
34 | self.detect_speech = None
35 |
36 | def run(self):
37 | for input in self.args.inputs:
38 | logging.info(f"Transcribing {input}")
39 | name, _ = os.path.splitext(input)
40 | if utils.check_exists(name + ".md", self.args.force):
41 | continue
42 |
43 | audio = whisper.load_audio(input, sr=self.sampling_rate)
44 | if (
45 | self.args.vad == "1"
46 | or self.args.vad == "auto"
47 | and not name.endswith("_cut")
48 | ):
49 | speech_timestamps = self._detect_voice_activity(audio)
50 | else:
51 | speech_timestamps = [{"start": 0, "end": len(audio)}]
52 | transcribe_results = self._transcribe(audio, speech_timestamps)
53 |
54 | output = name + ".srt"
55 | self._save_srt(output, transcribe_results)
56 | logging.info(f"Transcribed {input} to {output}")
57 | self._save_md(name + ".md", output, input)
58 | logging.info(f'Saved texts to {name + ".md"} to mark sentences')
59 |
60 | def _detect_voice_activity(self, audio):
61 | """Detect segments that have voice activities"""
62 | tic = time.time()
63 | if self.vad_model is None or self.detect_speech is None:
64 | # torch load limit https://github.com/pytorch/vision/issues/4156
65 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
66 | self.vad_model, funcs = torch.hub.load(
67 | repo_or_dir=os.path.join(os.path.dirname(sys.executable), "snakers4_silero-vad_master"),
68 | source="local",
69 | model="silero_vad",
70 | trust_repo=True,
71 | silero_vad_source="local",
72 | )
73 |
74 | self.detect_speech = funcs[0]
75 |
76 | speeches = self.detect_speech(
77 | audio, self.vad_model, sampling_rate=self.sampling_rate
78 | )
79 |
80 | # Remove too short segments
81 | speeches = utils.remove_short_segments(speeches, 1.0 * self.sampling_rate)
82 |
83 |         # Expand the segments to avoid too tight a cut. You can tune the pad length
84 | speeches = utils.expand_segments(
85 | speeches, 0.2 * self.sampling_rate, 0.0 * self.sampling_rate, audio.shape[0]
86 | )
87 |
88 |         # Merge very close segments
89 | speeches = utils.merge_adjacent_segments(speeches, 0.5 * self.sampling_rate)
90 |
91 | logging.info(f"Done voice activity detection in {time.time() - tic:.1f} sec")
92 | return speeches if len(speeches) > 1 else [{"start": 0, "end": len(audio)}]
93 |
94 | def _transcribe(self, audio, speech_timestamps):
95 | tic = time.time()
96 | if self.whisper_model is None:
97 | self.whisper_model = whisper.load_model(
98 | self.args.whisper_model, self.args.device
99 | )
100 |
101 | res = []
102 | if self.args.device == "cpu" and len(speech_timestamps) > 1:
103 | from multiprocessing import Pool
104 |
105 | pbar = tqdm(total=len(speech_timestamps))
106 |
107 | pool = Pool(processes=4)
108 | # TODO, a better way is merging these segments into a single one, so whisper can get more context
109 | for seg in speech_timestamps:
110 | res.append(
111 | pool.apply_async(
112 | process,
113 | (
114 | self.whisper_model,
115 | audio,
116 | seg,
117 | self.args.lang,
118 | self.args.prompt,
119 | ),
120 | callback=lambda x: pbar.update(),
121 | )
122 | )
123 | pool.close()
124 | pool.join()
125 | pbar.close()
126 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec")
127 | return [i.get() for i in res]
128 | else:
129 | for seg in (
130 | speech_timestamps
131 | if len(speech_timestamps) == 1
132 | else tqdm(speech_timestamps)
133 | ):
134 | r = self.whisper_model.transcribe(
135 | audio[int(seg["start"]) : int(seg["end"])],
136 | task="transcribe",
137 | language=self.args.lang,
138 | initial_prompt=self.args.prompt,
139 | verbose=False if len(speech_timestamps) == 1 else None,
140 | )
141 | r["origin_timestamp"] = seg
142 | res.append(r)
143 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec")
144 | return res
145 |
146 | def _save_srt(self, output, transcribe_results):
147 | subs = []
148 |         # whisper sometimes generates traditional Chinese; explicitly convert it to simplified
149 | cc = opencc.OpenCC("t2s")
150 |
151 | def _add_sub(start, end, text):
152 | subs.append(
153 | srt.Subtitle(
154 | index=0,
155 | start=datetime.timedelta(seconds=start),
156 | end=datetime.timedelta(seconds=end),
157 | content=cc.convert(text.strip()),
158 | )
159 | )
160 |
161 | prev_end = 0
162 | for r in transcribe_results:
163 | origin = r["origin_timestamp"]
164 | for s in r["segments"]:
165 | start = s["start"] + origin["start"] / self.sampling_rate
166 | end = min(
167 | s["end"] + origin["start"] / self.sampling_rate,
168 | origin["end"] / self.sampling_rate,
169 | )
170 | if start > end:
171 | continue
172 |                 # mark any no-speech gap that is not very short
173 | if start > prev_end + 1.0:
174 | _add_sub(prev_end, start, "< No Speech >")
175 | _add_sub(start, end, s["text"])
176 | prev_end = end
177 |
178 | with open(output, "wb") as f:
179 | f.write(srt.compose(subs).encode(self.args.encoding, "replace"))
180 |
181 | def _save_md(self, md_fn, srt_fn, video_fn):
182 | with open(srt_fn, encoding=self.args.encoding) as f:
183 | subs = srt.parse(f.read())
184 |
185 | md = utils.MD(md_fn, self.args.encoding)
186 | md.clear()
187 | md.add_done_editing(False)
188 | md.add_video(os.path.basename(video_fn))
189 | md.add(
190 | f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)})."
191 | "Mark the sentences to keep for autocut.\n"
192 | "The format is [subtitle_index,duration_in_second] subtitle context.\n\n"
193 | )
194 |
195 | for s in subs:
196 | sec = s.start.seconds
197 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]"
198 | md.add_task(False, f"{pre:11} {s.content.strip()}")
199 | md.write()
200 |
--------------------------------------------------------------------------------
/autocut/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import re
4 |
5 | import srt
6 | import opencc
7 |
8 |
9 | def is_video(filename):
10 | _, ext = os.path.splitext(filename)
11 | return ext in [".mp4", ".mov", ".mkv", ".avi", ".flv", ".f4v", ".webm"]
12 |
13 |
14 | def is_audio(filename):
15 | _, ext = os.path.splitext(filename)
16 | return ext in [".ogg", ".wav", ".mp3", ".flac", ".m4a"]
17 |
18 |
19 | def change_ext(filename, new_ext):
20 | # Change the extension of filename to new_ext
21 | base, _ = os.path.splitext(filename)
22 | if not new_ext.startswith("."):
23 | new_ext = "." + new_ext
24 | return base + new_ext
25 |
26 |
27 | def add_cut(filename):
28 | # Add cut mark to the filename
29 | base, ext = os.path.splitext(filename)
30 | if base.endswith("_cut"):
31 | base = base[:-4] + "_" + base[-4:]
32 | else:
33 | base += "_cut"
34 | return base + ext
35 |
36 |
37 | # a very simple markdown parser
38 | class MD:
39 | def __init__(self, filename, encoding):
40 | self.lines = []
41 |         self.EDIT_DONE_MARK = "<-- Mark if you are done editing."
42 | self.encoding = encoding
43 | self.filename = filename
44 | if filename:
45 | self.load_file()
46 |
47 | def load_file(self):
48 | if os.path.exists(self.filename):
49 | with open(self.filename, encoding=self.encoding) as f:
50 | self.lines = f.readlines()
51 |
52 | def clear(self):
53 | self.lines = []
54 |
55 | def write(self):
56 | with open(self.filename, "wb") as f:
57 | f.write("\n".join(self.lines).encode(self.encoding, "replace"))
58 |
59 | def tasks(self):
60 | # get all tasks with their status
61 | ret = []
62 | for l in self.lines:
63 | mark, task = self._parse_task_status(l)
64 | if mark is not None:
65 | ret.append((mark, task))
66 | return ret
67 |
68 | def done_editing(self):
69 | for m, t in self.tasks():
70 |             if m and self.EDIT_DONE_MARK in t:
71 | return True
72 | return False
73 |
74 | def add(self, line):
75 | self.lines.append(line)
76 |
77 | def add_task(self, mark, contents):
78 | self.add(f'- [{"x" if mark else " "}] {contents.strip()}')
79 |
80 | def add_done_editing(self, mark):
81 |         self.add_task(mark, self.EDIT_DONE_MARK)
82 |
83 | def add_video(self, video_fn):
84 | ext = os.path.splitext(video_fn)[1][1:]
85 |         self.add(
86 |             f'\n<video controls="true" allowfullscreen="true"> <source src="{video_fn}" type="video/{ext}"> </video>\n\n'
87 |         )
88 |
89 | def _parse_task_status(self, line):
90 | # return (is_marked, rest) or (None, line) if not a task
91 | m = re.match(r"- +\[([ x])\] +(.*)", line)
92 | if not m:
93 | return None, line
94 | return m.groups()[0].lower() == "x", m.groups()[1]
95 |
96 |
97 | def check_exists(output, force):
98 | if os.path.exists(output):
99 | if force:
100 | logging.info(f"{output} exists. Will overwrite it")
101 | else:
102 | logging.info(
103 | f"{output} exists, skipping... Use the --force flag to overwrite"
104 | )
105 | return True
106 | return False
107 |
108 |
109 | def expand_segments(segments, expand_head, expand_tail, total_length):
110 | # Pad head and tail for each time segment
111 | results = []
112 | for i in range(len(segments)):
113 | t = segments[i]
114 | start = max(t["start"] - expand_head, segments[i - 1]["end"] if i > 0 else 0)
115 | end = min(
116 | t["end"] + expand_tail,
117 | segments[i + 1]["start"] if i < len(segments) - 1 else total_length,
118 | )
119 | results.append({"start": start, "end": end})
120 | return results
121 |
122 |
123 | def remove_short_segments(segments, threshold):
124 | # Remove segments whose length < threshold
125 | return [s for s in segments if s["end"] - s["start"] > threshold]
126 |
127 |
128 | def merge_adjacent_segments(segments, threshold):
129 | # Merge two adjacent segments if their distance < threshold
130 | results = []
131 | i = 0
132 | while i < len(segments):
133 | s = segments[i]
134 | for j in range(i + 1, len(segments)):
135 | if segments[j]["start"] < s["end"] + threshold:
136 | s["end"] = segments[j]["end"]
137 | i = j
138 | else:
139 | break
140 | i += 1
141 | results.append(s)
142 | return results
143 |
144 |
145 | def compact_rst(sub_fn, encoding):
146 | cc = opencc.OpenCC("t2s")
147 |
148 | base, ext = os.path.splitext(sub_fn)
149 | COMPACT = "_compact"
150 | if ext != ".srt":
151 |         logging.fatal("only .srt files are supported")
152 |
153 | if base.endswith(COMPACT):
154 |         # convert back to the original srt
155 | with open(sub_fn, encoding=encoding) as f:
156 | lines = f.readlines()
157 | subs = []
158 | for l in lines:
159 | items = l.split(" ")
160 | if len(items) < 4:
161 | continue
162 | subs.append(
163 | srt.Subtitle(
164 | index=0,
165 | start=srt.srt_timestamp_to_timedelta(items[0]),
166 | end=srt.srt_timestamp_to_timedelta(items[2]),
167 | content=" ".join(items[3:]).strip(),
168 | )
169 | )
170 | with open(base[: -len(COMPACT)] + ext, "wb") as f:
171 | f.write(srt.compose(subs).encode(encoding, "replace"))
172 | else:
173 | # to a compact version
174 | with open(sub_fn, encoding=encoding) as f:
175 | subs = srt.parse(f.read())
176 | with open(base + COMPACT + ext, "wb") as f:
177 | for s in subs:
178 | f.write(
179 | f"{srt.timedelta_to_srt_timestamp(s.start)} --> {srt.timedelta_to_srt_timestamp(s.end)} "
180 | f"{cc.convert(s.content.strip())}\n".encode(encoding, "replace")
181 | )
182 |
183 |
184 | def trans_srt_to_md(encoding, force, srt_fn, video_fn=None):
185 | base, ext = os.path.splitext(srt_fn)
186 | if ext != ".srt":
187 |         logging.fatal("only .srt files are supported")
188 |     md_fn = base + ".md"
189 |
190 |     if check_exists(md_fn, force): return
191 |
192 | with open(srt_fn, encoding=encoding) as f:
193 | subs = srt.parse(f.read())
194 |
195 | md = MD(md_fn, encoding)
196 | md.clear()
197 | md.add_done_editing(False)
198 | if video_fn:
199 | if not is_video(video_fn):
200 | logging.fatal(f"{video_fn} may not be a video")
201 | md.add_video(os.path.basename(video_fn))
202 | md.add(
203 | f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)})."
204 | "Mark the sentences to keep for autocut.\n"
205 | "The format is [subtitle_index,duration_in_second] subtitle context.\n\n"
206 | )
207 |
208 | for s in subs:
209 | sec = s.start.seconds
210 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]"
211 | md.add_task(False, f"{pre:11} {s.content.strip()}")
212 | md.write()
213 |
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
1 | # run this script after pyinstaller
2 | # see https://github.com/pyinstaller/pyinstaller/issues/7582#issuecomment-1515434457
3 |
4 | rm -f libtorch*
5 | ln -s torch/lib/libtorch.dylib .
6 | ln -s torch/lib/libtorch_cpu.dylib .
7 | ln -s torch/lib/libtorch_python.dylib .
8 |
9 | ln -s torchaudio/lib/libtorchaudio.so .
10 |
11 | install_name_tool -add_rpath @loader_path/../.. torchaudio/lib/libtorchaudio.so
12 |
--------------------------------------------------------------------------------
/imgs/typora.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zcf0508/autocut/36f60160d7bd661d79303b53777bfc87833b6dc7/imgs/typora.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | altgraph==0.17.3
2 | attrs==22.1.0
3 | black==22.12.0
4 | certifi==2022.12.7
5 | charset-normalizer==2.1.1
6 | click==8.1.3
7 | colorama==0.4.6
8 | decorator==4.4.2
9 | exceptiongroup==1.0.4
10 | ffmpeg-python==0.2.0
11 | filelock==3.8.2
12 | future==0.18.2
13 | huggingface-hub==0.11.1
14 | idna==3.4
15 | imageio==2.22.4
16 | imageio-ffmpeg==0.4.7
17 | iniconfig==1.1.1
18 | more-itertools==9.0.0
19 | moviepy==1.0.3
20 | mypy-extensions==0.4.3
21 | numpy==1.23.5
22 | opencc-python-reimplemented==0.1.6
23 | packaging==22.0
24 | parameterized==0.8.1
25 | pathspec==0.10.3
26 | pefile==2022.5.30
27 | Pillow==9.3.0
28 | platformdirs==2.6.0
29 | pluggy==1.0.0
30 | proglog==0.1.10
31 | pyinstaller==5.7.0
32 | pyinstaller-hooks-contrib==2022.14
33 | pyparsing==3.0.9
34 | pytest==7.2.0
35 | pywin32-ctypes==0.2.0
36 | PyYAML==6.0
37 | regex==2022.10.31
38 | requests==2.28.1
39 | six==1.16.0
40 | srt==3.5.2
41 | tokenizers==0.13.2
42 | tomli==2.0.1
43 | torch==1.13.0
44 | torchaudio==0.13.0
45 | tqdm==4.64.1
46 | transformers==4.25.1
47 | typing_extensions==4.4.0
48 | urllib3==1.26.13
49 | whisper @ git+https://github.com/openai/whisper.git@02aa851a4910201f0db56960064d7e121a01002c
50 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = autocut
3 | version = attr: autocut.__version__
4 | license = Apache Software License
5 | description = Cut video by subtitles
6 | long_description = file: README.md
7 | classifiers =
8 | License :: OSI Approved :: Apache Software License
9 | Operating System :: OS Independent
10 | Programming Language :: Python :: 3
11 |
12 | [options]
13 | packages = find:
14 | include_package_data = True
15 | python_requires = >= 3.9
16 |
--------------------------------------------------------------------------------
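The `version = attr: autocut.__version__` entry tells setuptools to read the version string from the package itself at build time. A sketch of the attribute it resolves; the value below is a hypothetical placeholder (the real string lives in `autocut/__init__.py`):

```python
# autocut/__init__.py (sketch): the attribute that setup.cfg's
# `version = attr: autocut.__version__` resolves at build time.
__version__ = "0.0.0"  # hypothetical placeholder, not the repository's value
```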
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | requirements = [
4 | "srt",
5 | "moviepy",
6 | "opencc-python-reimplemented",
7 | "torchaudio",
8 | "parameterized",
9 | "openai-whisper",
10 | "tqdm",
11 | ]
12 |
13 |
14 | setup(
15 | name="autocut",
16 | install_requires=requirements,
17 | packages=find_packages(),
18 | entry_points={
19 | "console_scripts": [
20 | "autocut = autocut.main:main",
21 | ]
22 | },
23 | )
24 |
--------------------------------------------------------------------------------
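The `console_scripts` entry point above is what exposes the `autocut` command after installation, mapping it to `autocut.main:main`. A minimal sketch of the equivalent direct invocation, assuming only that the package is importable:

```python
# Equivalent of the installed `autocut` console script: the entry point
# "autocut = autocut.main:main" resolves to this call.
from autocut.main import main

if __name__ == "__main__":
    main()  # parses sys.argv just as the installed CLI would
```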
/snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: Bug report - [X]
5 | labels: bug
6 | assignees: snakers4
7 |
8 | ---
9 |
10 | ## 🐛 Bug
11 |
12 |
13 |
14 | ## To Reproduce
15 |
16 | Steps to reproduce the behavior:
17 |
18 | 1.
19 | 2.
20 | 3.
21 |
22 |
23 |
24 | ## Expected behavior
25 |
26 |
27 |
28 | ## Environment
29 |
30 | Please copy and paste the output from this
31 | [environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py)
32 | (or fill out the checklist below manually).
33 |
34 | You can get the script and run it with:
35 | ```
36 | wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
37 | # For security purposes, please check the contents of collect_env.py before running it.
38 | python collect_env.py
39 | ```
40 |
41 | - PyTorch Version (e.g., 1.0):
42 | - OS (e.g., Linux):
43 | - How you installed PyTorch (`conda`, `pip`, source):
44 | - Build command you used (if compiling from source):
45 | - Python version:
46 | - CUDA/cuDNN version:
47 | - GPU models and configuration:
48 | - Any other relevant information:
49 |
50 | ## Additional context
51 |
52 |
53 |
--------------------------------------------------------------------------------
/snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: Feature request - [X]
5 | labels: enhancement
6 | assignees: snakers4
7 |
8 | ---
9 |
10 | ## 🚀 Feature
11 |
12 |
13 | ## Motivation
14 |
15 |
16 |
17 | ## Pitch
18 |
19 |
20 |
21 | ## Alternatives
22 |
23 |
24 |
25 | ## Additional context
26 |
27 |
28 |
--------------------------------------------------------------------------------
/snakers4_silero-vad_master/.github/ISSUE_TEMPLATE/questions---help---support.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Questions / Help / Support
3 | about: Ask for help, support or ask a question
4 | title: "❓ Questions / Help / Support"
5 | labels: help wanted
6 | assignees: snakers4
7 |
8 | ---
9 |
10 | ## ❓ Questions and Help
11 |
12 | We have a [wiki](https://github.com/snakers4/silero-models/wiki) available for our users. Please make sure you have checked it out first.
13 |
--------------------------------------------------------------------------------
/snakers4_silero-vad_master/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at aveysov@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/snakers4_silero-vad_master/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020-present Silero Team
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/snakers4_silero-vad_master/README.md:
--------------------------------------------------------------------------------
1 | [](mailto:hello@silero.ai) [](https://t.me/silero_speech) [](https://github.com/snakers4/silero-vad/blob/master/LICENSE)
2 |
3 | [](https://colab.research.google.com/github/snakers4/silero-vad/blob/master/silero-vad.ipynb)
4 |
5 | 
6 |
7 |
8 | # Silero VAD
9 |
10 |
11 | **Silero VAD**: a pre-trained, enterprise-grade [Voice Activity Detector](https://en.wikipedia.org/wiki/Voice_activity_detection) (also see our [STT models](https://github.com/snakers4/silero-models)).
12 |
13 | This repository also includes Number Detector and Language Classifier [models](https://github.com/snakers4/silero-vad/wiki/Other-Models).
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | ## Real Time Example
23 |
24 | https://user-images.githubusercontent.com/36505480/144874384-95f80f6d-a4f1-42cc-9be7-004c891dd481.mp4
25 |
26 |
27 |
28 |
29 | ## Key Features
30 |
31 |
32 | - **Stellar accuracy**
33 |
34 | Silero VAD has [excellent results](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#vs-other-available-solutions) on speech detection tasks.
35 |
36 | - **Fast**
37 |
38 | One audio chunk (30+ ms) [takes](https://github.com/snakers4/silero-vad/wiki/Performance-Metrics#silero-vad-performance-metrics) less than **1 ms** to process on a single CPU thread. Batching or a GPU can improve performance considerably, and under certain conditions ONNX may run up to 4-5x faster.
39 |
40 | - **Lightweight**
41 |
42 | JIT model is around one megabyte in size.
43 |
44 | - **General**
45 |
46 | Silero VAD was trained on a huge corpus covering over **100** languages, and it performs well on audio from different domains with varying background noise and quality levels.
47 |
48 | - **Flexible sampling rate**
49 |
50 | Silero VAD [supports](https://github.com/snakers4/silero-vad/wiki/Quality-Metrics#sample-rate-comparison) **8000 Hz** and **16000 Hz** [sampling rates](https://en.wikipedia.org/wiki/Sampling_(signal_processing)#Sampling_rate).
51 |
52 | - **Flexible chunk size**
53 |
54 | The model was trained on **30 ms** chunks. Longer chunks are supported directly; other sizes may work as well.
55 |
56 | - **Highly Portable**
57 |
58 | Silero VAD benefits from the rich ecosystems built around **PyTorch** and **ONNX**, and runs everywhere these runtimes are available.
59 |
60 | - **No Strings Attached**
61 |
62 | Published under a permissive license (MIT), Silero VAD has zero strings attached: no telemetry, no registration, no built-in expiration, no keys, and no vendor lock-in.
63 |
64 |
65 | ## Typical Use Cases
66 |
67 |
68 | - Voice activity detection for IoT / edge / mobile use cases
69 | - Data cleaning and preparation, voice detection in general
70 | - Telephony and call-center automation, voice bots
71 | - Voice interfaces
72 |
73 |
74 |
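To make the features above concrete, the model is typically loaded through `torch.hub` via this repository's `hubconf.py`. A minimal sketch, assuming the hub entry returns the usual `(model, utils)` pair and using a hypothetical `example.wav`:

```python
# Hypothetical minimal VAD run via torch.hub; `example.wav` is a placeholder.
import torch

model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils

wav = read_audio("example.wav", sampling_rate=16000)
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)
print(speech_timestamps)  # list of {'start': ..., 'end': ...} sample offsets
```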