├── .github
│   └── workflows
│       ├── base.yml
│       ├── faster-whisper
│       └── lint.yml
├── .gitignore
├── Dockerfile
├── Dockerfile.cuda
├── LICENSE
├── README.md
├── autocut
│   ├── __init__.py
│   ├── __main__.py
│   ├── cut.py
│   ├── daemon.py
│   ├── main.py
│   ├── package_transcribe.py
│   ├── transcribe.py
│   ├── type.py
│   ├── utils.py
│   └── whisper_model.py
├── imgs
│   └── typora.jpg
├── setup.cfg
├── setup.py
├── tea.yaml
└── test
    ├── config.py
    ├── content
    │   ├── test.srt
    │   ├── test_md.md
    │   └── test_srt.srt
    ├── media
    │   ├── test001.mp4
    │   ├── test001_en.mp4
    │   ├── test002.mov
    │   ├── test003.mkv
    │   ├── test004.flv
    │   ├── test005.mp3
    │   └── test006.MP4
    ├── test_cut.py
    └── test_transcribe.py
/.github/workflows/base.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches:
7 | - main
8 |
9 | jobs:
10 | lint_and_test:
11 |     runs-on: ${{ matrix.os }}
12 | strategy:
13 | matrix:
14 | python-version: ['3.9', '3.10']
15 | # Wait for fix on macos-m1: https://github.com/federicocarboni/setup-ffmpeg/issues/21
16 |         os: [ubuntu-latest, windows-latest, macos-12]
17 | steps:
18 | - uses: actions/checkout@v3
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 | - name: Set Variables
24 | id: set_variables
25 | shell: bash
26 | run: |
27 | echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT
28 | echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT
29 | - name: Cache PIP
30 | uses: actions/cache@v3
31 | with:
32 | path: ${{ steps.set_variables.outputs.PIP_CACHE }}
33 | key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }}
34 |
35 | - name: Setup ffmpeg for different platforms
36 | uses: FedericoCarboni/setup-ffmpeg@v3
37 |
38 | - name: Install dependencies
39 | run: |
40 | python -m pip install --upgrade pip
41 | pip install .
42 | pip install pytest
43 | - name: Run Test
44 | run: pytest test/
45 |
--------------------------------------------------------------------------------
/.github/workflows/faster-whisper:
--------------------------------------------------------------------------------
1 | name: Test Faster Whisper
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches:
7 | - main
8 |
9 | jobs:
10 | lint_and_test:
11 | runs-on: ${{ matrix.os }}-latest
12 | strategy:
13 | matrix:
14 | python-version: ['3.9', '3.10']
15 |         # macOS runners do not support M1 for now
16 | os: [ubuntu, windows, macos]
17 | steps:
18 | - uses: actions/checkout@v3
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 | - name: Set Variables
24 | id: set_variables
25 | shell: bash
26 | run: |
27 | echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT
28 | echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT
29 | - name: Cache PIP
30 | uses: actions/cache@v3
31 | with:
32 | path: ${{ steps.set_variables.outputs.PIP_CACHE }}
33 | key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }}
34 |
35 |       - name: Setup ffmpeg for different platforms
36 | uses: FedericoCarboni/setup-ffmpeg@master
37 |
38 | - name: Install dependencies
39 | run: |
40 | python -m pip install --upgrade pip
41 | pip install ".[faster]"
42 | pip install pytest
43 | - name: Run Test
44 | run: WHISPER_MODE=faster pytest test/
45 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | name: Test Lint
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches:
7 | - main
8 |
9 | jobs:
10 | lint:
11 | runs-on: ${{ matrix.os }}-latest
12 | strategy:
13 | matrix:
14 | python-version: ['3.9']
15 | os: [ubuntu]
16 | steps:
17 | - uses: actions/checkout@v3
18 | - name: Set up Python ${{ matrix.python-version }}
19 | uses: actions/setup-python@v4
20 | with:
21 | python-version: ${{ matrix.python-version }}
22 | - name: Set Variables
23 | id: set_variables
24 | shell: bash
25 | run: |
26 | echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT
27 | echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT
28 | - name: Cache PIP
29 | uses: actions/cache@v3
30 | with:
31 | path: ${{ steps.set_variables.outputs.PIP_CACHE }}
32 | key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }}
33 |
34 | - name: Install dependencies
35 | run: |
36 | python -m pip install --upgrade pip
37 | pip install black
38 |
39 | - name: Run Lint
40 | run: black . --check
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | log/
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9-slim as base
2 |
3 | RUN mkdir /autocut
4 | COPY ./ /autocut
5 | WORKDIR /autocut
6 |
7 | RUN apt update && \
8 | apt install -y git && \
9 | apt install -y ffmpeg
10 |
11 | RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu && \
12 | pip install .
--------------------------------------------------------------------------------
/Dockerfile.cuda:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime
2 |
3 | RUN mkdir /autocut
4 | COPY ./ /autocut
5 | WORKDIR /autocut
6 |
7 | RUN apt update && \
8 | apt install -y git && \
9 | apt install -y ffmpeg
10 |
11 | RUN pip install .
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AutoCut: Cut Videos by Editing Subtitles
2 | 
3 | AutoCut automatically generates subtitles for your video. You then mark the sentences you want to keep, and AutoCut cuts out the corresponding segments and saves the result. No video editing software is needed; editing a text file is all it takes to cut a video.
4 | 
5 | **Update 2024.10.05**: the `large-v3-turbo` [model](https://github.com/openai/whisper/discussions/2363) is supported, providing faster transcription.
6 | 
7 | ```shell
8 | autocut -t xxx --whisper-model large-v3-turbo
9 | ```
10 | 
11 | **Update 2024.03.10**: pip installation is supported, and the transcription functionality can be imported as a library.
12 |
13 | ```shell
14 | # Install
15 | pip install autocut-sub
16 | ```
17 |
18 | ```python
19 | from autocut import Transcribe, load_audio
20 | ```
21 |
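A minimal usage sketch of the imported API (based on `package_transcribe.Transcribe` and `utils.load_audio` in this repository; the file name `test.mp4` is just a placeholder):

```python
from autocut import Transcribe, load_audio

# load the audio track resampled to 16 kHz, as the model expects
audio = load_audio("test.mp4", sr=16000)

# whisper_mode can be "whisper" or "faster"; vad toggles voice activity detection
transcriber = Transcribe(whisper_mode="whisper", whisper_model_size="small", vad=True)
results = transcriber.run(audio, lang="zh")
subtitles = transcriber.format_results_to_srt(results)  # a list of srt.Subtitle objects
```

The resulting subtitles can then be written out with the `srt` package, e.g. via `srt.compose(subtitles)`.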
22 |
23 | **Update 2023.10.14**: faster-whisper and optional dependency groups are supported (the faster-whisper test run is temporarily disabled due to GitHub Actions limitations).
24 |
25 | ```shell
26 | # for whisper only
27 | pip install .
28 |
29 | # for whisper and faster-whisper
30 | pip install '.[faster]'
31 |
32 | # for whisper and openai-whisper
33 | pip install '.[openai]'
34 |
35 | # for all
36 | pip install '.[all]'
37 | ```
38 |
39 | ```shell
40 | # using faster-whisper
41 | autocut -t xxx --whisper-mode=faster
42 | ```
43 |
44 | ```shell
45 | # using openai api
46 | export OPENAI_API_KEY=sk-xxx
47 | autocut -t xxx --whisper-mode=openai --openai-rpm=3
48 | ```
49 |
50 | **Update 2023.8.13**: calling the OpenAI Whisper API is supported.
51 | ```shell
52 | export OPENAI_API_KEY=sk-xxx
53 | autocut -t xxx --whisper-mode=openai --openai-rpm=3
54 | ```
55 |
56 | ## Usage Example
57 | 
58 | Suppose your recorded videos are in the folder `2022-11-04/`. Then run
59 |
60 | ```bash
61 | autocut -d 2022-11-04
62 | ```
63 |
64 | > Tip: if you record with OBS, you can change the spaces in `Settings -> Advanced -> Recording -> Filename Formatting` to `/`, i.e. `%CCYY-%MM-%DD/%hh-%mm-%ss`, so that recordings are saved into date-named folders.
65 |
66 | AutoCut will keep watching this folder, extracting subtitles from its videos and cutting them. For example, as soon as you finish a recording saved as `11-28-18.mp4`, AutoCut generates `11-28-18.md`. After you mark the sentences to keep in that file, AutoCut cuts the video into `11-28-18_cut.mp4` and generates `11-28-18_cut.md` so you can preview the result.
67 |
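The generated Markdown file is a task list, roughly like the sketch below (based on `transcribe.py` and `utils.py` in this repository; the subtitle text is only a placeholder):

```markdown
- [ ] <-- Mark if you are done editing.

Texts generated from [11-28-18.srt](11-28-18.srt). Mark the sentences to keep for autocut.
The format is [subtitle_index,start_time] subtitle content.

- [x] [1,00:00] A sentence that will be kept, because its box is checked.
- [ ] [2,00:05] A sentence that will be cut away.
```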
68 | You can use any Markdown editor. I often use VS Code and Typora. The screenshot below shows `11-28-18.md` being edited in Typora.
69 |
70 | ![Editing 11-28-18.md in Typora](imgs/typora.jpg)
71 |
72 | Once everything is done, mark the videos you want to concatenate in `autocut.md`, and AutoCut will output `autocut_merged.mp4` together with its subtitle file.
73 |
74 | ## Installation
75 | 
76 | First install the Python package
77 |
78 | ```
79 | pip install git+https://github.com/mli/autocut.git
80 | ```
81 |
82 | ## Install Locally for Testing
83 |
84 |
85 | ```
86 | git clone https://github.com/mli/autocut
87 | cd autocut
88 | pip install .
89 | ```
90 |
91 |
92 | > The step above installs [PyTorch](https://pytorch.org/). If you need to run on a GPU and the default version does not match your setup, install PyTorch first. If installing Whisper fails, refer to the [official documentation](https://github.com/openai/whisper#setup).
93 | 
94 | You also need to install [ffmpeg](https://ffmpeg.org/)
95 |
96 | ```
97 | # on Ubuntu or Debian
98 | sudo apt update && sudo apt install ffmpeg
99 |
100 | # on Arch Linux
101 | sudo pacman -S ffmpeg
102 |
103 | # on macOS using Homebrew (https://brew.sh/)
104 | brew install ffmpeg
105 |
106 | # on Windows using Scoop (https://scoop.sh/)
107 | scoop install ffmpeg
108 | ```
109 |
110 | ## Docker Installation
111 | 
112 | First clone the project to your machine.
113 |
114 | ```bash
115 | git clone https://github.com/mli/autocut.git
116 | ```
117 |
118 | ### CPU Version
119 | 
120 | Enter the project root directory, then build the Docker image.
121 |
122 | ```bash
123 | docker build -t autocut .
124 | ```
125 |
126 | Run the command below to create a Docker container, and you can use it directly.
127 |
128 | ```bash
129 | docker run -it --rm -v E:\autocut:/autocut/video autocut /bin/bash
130 | ```
131 |
132 | Here `-v` maps `E:\autocut`, the host folder where your videos are stored, to the `/autocut/video` directory inside the container. Change `E:\autocut` to the directory where your own videos live.
133 | 
134 | ### GPU Version
135 | 
136 | GPU acceleration requires an NVIDIA GPU on the host with the matching driver installed. Then, in the project root, build the Docker image with the command below.
137 |
138 | ```bash
139 | docker build -f ./Dockerfile.cuda -t autocut-gpu .
140 | ```
141 |
142 | When using GPU acceleration, add the `--gpus all` parameter when running the Docker container.
143 |
144 | ```bash
145 | docker run --gpus all -it --rm -v E:\autocut:/autocut/video autocut-gpu
146 | ```
147 |
148 | ## More Options
149 | 
150 | ### Transcribe a video into `.srt` and `.md` results
151 |
152 | ```bash
153 | autocut -t 22-52-00.mp4
154 | ```
155 |
156 | 1. If you are not satisfied with the transcription quality, use a larger model, for example
157 |
158 | ```bash
159 | autocut -t 22-52-00.mp4 --whisper-model large
160 | ```
161 |
162 |     The default is `small`. The `medium` and `large` models give better quality, but a GPU is recommended for reasonable speed. The faster `tiny` and `base` models are also available, at the cost of transcription quality.
163 |
164 |
165 | ### Cut a video
166 |
167 | ```bash
168 | autocut -c 22-52-00.mp4 22-52-00.srt 22-52-00.md
169 | ```
170 |
171 | 1. The default video bitrate is `--bitrate 10m`; adjust it up or down as needed.
172 | 2. If you are not comfortable with the Markdown file, you can also delete unwanted sentences directly in the `srt` file and simply omit the `md` filename when cutting, i.e. `autocut -c 22-52-00.mp4 22-52-00.srt`.
173 | 3. If you only have an `srt` file and find it inconvenient to edit, you can use the commands below to generate an `md` file and edit that instead. In this case the `md` is generated strictly from the `srt`, so hint texts such as `no speech` will not appear.
174 |
175 | ```bash
176 | autocut -m test.srt test.mp4
177 | autocut -m test.mp4 test.srt  # the video and subtitle can be passed in either order
178 | autocut -m test.srt  # passing only the subtitle file also works
179 | ```
180 |
181 |
182 | ### Tips
183 |
184 |
185 | 1. Videos with fluent speech transcribe better, which is due to the distribution of Whisper's training data. For a video you can first make a rough selection of sentences, then cut again on the resulting video.
186 | 2. The subtitles of the final video usually still need some small edits, but the `srt` file has too many blank lines. You can run `autocut -s 22-52-00.srt` to generate a more compact version, `22-52-00_compact.srt`, that is easier to edit (the format is not valid SRT, but editors such as VS Code still highlight it; see the short example after this list). When you are done, `autocut -s 22-52-00_compact.srt` converts it back to the normal format.
187 | 3. Typora and VS Code are both convenient for editing Markdown. Both have shortcuts for marking one or more lines. Video preview in VS Code, however, seems a bit unreliable.
188 | 4. Videos are exported with ffmpeg. On Apple M1 chips it cannot use the GPU, so exporting is slower than in professional video software.
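Each line of the compact version is simply the start and end timestamps followed by the subtitle text on one line (a minimal illustration based on `utils.compact_rst` in this repository; the text is a placeholder):

```text
00:00:01,000 --> 00:00:03,500 The first subtitle sentence.
00:00:04,000 --> 00:00:06,200 The second subtitle sentence.
```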
189 |
190 | ### FAQ
191 | 
192 | 1. **The output is garbled?**
193 | 
194 |     AutoCut's default output encoding is `utf-8`. Make sure your editor also decodes the file as `utf-8`. You can specify another encoding with `--encoding`, but note that the subtitle file must be generated and used for cutting with the same encoding. For example, with `gbk`:
195 |
196 | ```bash
197 | autocut -t test.mp4 --encoding=gbk
198 | autocut -c test.mp4 test.srt test.md --encoding=gbk
199 | ```
200 |
201 |     If you generate the `md` file with another encoding (such as `gbk`) and open it with Typora, Typora may automatically re-encode the file, and cutting with the encoding you originally specified may then fail with errors such as unsupported encoding. In that case, after editing in Typora, re-save the file in the encoding you need with an editor such as VS Code before cutting.
202 |
203 | 2. **How do I transcribe on a GPU?**
204 | 
205 |     When you have an NVIDIA GPU and the matching version of PyTorch installed, transcription runs on the GPU. You can check whether GPU support is available with the following command.
206 |
207 | ```bash
208 | python -c "import torch; print(torch.cuda.is_available())"
209 | ```
210 |
211 |     Otherwise you can manually install the matching GPU version of PyTorch before installing AutoCut.
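    For example, something along the following lines; the exact index URL depends on your CUDA version, so check https://pytorch.org/ for the command that matches your setup:

    ```bash
    pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
    ```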
212 |
213 | 3. **The GPU runs out of memory.**
214 | 
215 |     The larger Whisper models need a certain amount of GPU memory. If your GPU memory is not enough, use a smaller model such as `small`. If you still want to use a large model, force CPU with `--device`, for example
216 |
217 | ```bash
218 | autocut -t 11-28-18.mp4 --whisper-model large --device cpu
219 | ```
220 |
221 | 4. **Can it be installed with `pip`?**
222 | 
223 |     Whisper is published on PyPI and can be installed directly with `pip install openai-whisper`.
224 |
225 | [https://github.com/openai/whisper#setup](https://github.com/openai/whisper#setup)
226 |
227 | [https://pypi.org/project/openai-whisper/](https://pypi.org/project/openai-whisper/)
228 |
229 | ## Contributing
230 | 
231 | [Here are some features we would like to add](https://github.com/mli/autocut/issues/22); contributions are welcome.
232 | 
233 | ### Code Structure
234 | ```text
235 | autocut
236 | │ .gitignore
237 | │ LICENSE
238 | │  README.md  # update README.md whenever a change needs to be visible to users
239 | │ setup.py
240 | │
241 | └─autocut  # the core code lives in the autocut folder; new features are usually implemented or modified here
242 | │ cut.py
243 | │ daemon.py
244 | │ main.py
245 | │ transcribe.py
246 | │ utils.py
247 | └─ __init__.py
248 |
249 | ```
250 |
251 | ### Installing Dependencies
252 | Before installing the dependencies this project needs, it is worth getting familiar with Anaconda or venv virtual environments; we recommend **setting up the development environment for this project inside a virtual environment**.
253 | Concretely, inside the virtual environment you created, follow the [installation steps above](./README.md#installation).
254 |
255 | > Why is developing in a virtual environment recommended?
256 | >
257 | > For one thing, it keeps different development environments from polluting each other.
258 | >
259 | > More importantly, this project is itself a Python package, so after installation the AutoCut code becomes a dependency of your environment.
260 | > **Therefore, after you change the code, you need to reinstall it into the environment before the new code is actually used.**
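A minimal sketch of that workflow (the environment directory name `.venv` is arbitrary):

```shell
# create and activate a virtual environment
python -m venv .venv
source .venv/bin/activate   # on Windows: .venv\Scripts\activate

# install AutoCut and its dependencies into the environment
pip install .

# after changing the code, reinstall so the environment picks up the new code
pip install .
```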
261 |
262 | ### Development
263 |
264 | 1. The code style currently follows PEP-8; you can use an automatic formatter to enforce it.
265 | 2. `utils.py` holds utility functions shared across the project.
266 | 3. `transcribe.py` calls the model to generate the `srt` and `md` files.
267 | 4. `cut.py` cuts and merges videos according to the marked `md` or `srt` files.
268 | 5. `daemon.py` watches a folder, generating subtitles and cutting videos.
269 | 6. `main.py` declares the command-line arguments and dispatches to the corresponding functionality.
270 |
271 | During development, please make sure changes go in the right place and reuse code sensibly;
272 | put utility functions in `utils.py` whenever possible.
273 | The code style follows PEP-8, and variable names should be as descriptive as possible.
274 | 
275 | After development, the most important step is **testing**: before submitting, make sure everything **directly related to your change** and everything **your change may affect** has been tested and works correctly.
276 | CI currently runs on `GitHub Actions`; linting uses black, so please run `black` before committing.
277 |
278 | ### Submitting
279 |
280 | 1. Commit messages should describe in English what you changed, starting with a lowercase letter.
281 | 2. Keep each commit small enough to describe briefly; this makes later changes easier to trace.
282 | 3. In a PR, summarize the changes in the title and describe them in detail in the contents.
283 | 4. Run tests: `pip install pytest`, then `pytest test`.
284 | 5. Run lint: `pip install black`, then `black .`.
285 |
--------------------------------------------------------------------------------
/autocut/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "1.1.0"
2 |
3 | from .type import LANG, WhisperModel, WhisperMode
4 | from .utils import load_audio
5 | from .package_transcribe import Transcribe
6 |
7 | __all__ = ["Transcribe", "load_audio", "WhisperMode", "WhisperModel", "LANG"]
8 |
--------------------------------------------------------------------------------
/autocut/__main__.py:
--------------------------------------------------------------------------------
1 | from .main import main
2 |
3 | if __name__ == "__main__":
4 | main()
5 |
--------------------------------------------------------------------------------
/autocut/cut.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import re
4 |
5 | import srt
6 | from moviepy import editor
7 |
8 | from . import utils
9 |
10 |
11 | # Merge videos
12 | class Merger:
13 | def __init__(self, args):
14 | self.args = args
15 |
16 | def write_md(self, videos):
17 | md = utils.MD(self.args.inputs[0], self.args.encoding)
18 | num_tasks = len(md.tasks())
19 |         # Don't overwrite if it is already marked as done or there are no new videos
20 | if md.done_editing() or num_tasks == len(videos) + 1:
21 | return
22 |
23 | md.clear()
24 | md.add_done_editing(False)
25 | md.add("\nSelect the files that will be used to generate `autocut_final.mp4`\n")
26 | base = lambda fn: os.path.basename(fn)
27 | for f in videos:
28 | md_fn = utils.change_ext(f, "md")
29 | video_md = utils.MD(md_fn, self.args.encoding)
30 |             # select a few words to describe the video
31 | desc = ""
32 | if len(video_md.tasks()) > 1:
33 | for _, t in video_md.tasks()[1:]:
34 | m = re.findall(r"\] (.*)", t)
35 | if m and "no speech" not in m[0].lower():
36 | desc += m[0] + " "
37 | if len(desc) > 50:
38 | break
39 | md.add_task(
40 | False,
41 | f'[{base(f)}]({base(md_fn)}) {"[Edited]" if video_md.done_editing() else ""} {desc}',
42 | )
43 | md.write()
44 |
45 | def run(self):
46 | md_fn = self.args.inputs[0]
47 | md = utils.MD(md_fn, self.args.encoding)
48 | if not md.done_editing():
49 | return
50 |
51 | videos = []
52 | for m, t in md.tasks():
53 | if not m:
54 | continue
55 | m = re.findall(r"\[(.*)\]", t)
56 | if not m:
57 | continue
58 | fn = os.path.join(os.path.dirname(md_fn), m[0])
59 | logging.info(f"Loading {fn}")
60 | videos.append(editor.VideoFileClip(fn))
61 |
62 | dur = sum([v.duration for v in videos])
63 | logging.info(f"Merging into a video with {dur / 60:.1f} min length")
64 |
65 | merged = editor.concatenate_videoclips(videos)
66 | fn = os.path.splitext(md_fn)[0] + "_merged.mp4"
67 | merged.write_videofile(
68 | fn, audio_codec="aac", bitrate=self.args.bitrate
69 | ) # logger=None,
70 | logging.info(f"Saved merged video to {fn}")
71 |
72 |
73 | # Cut media
74 | class Cutter:
75 | def __init__(self, args):
76 | self.args = args
77 |
78 | def run(self):
79 | fns = {"srt": None, "media": None, "md": None}
80 | for fn in self.args.inputs:
81 | ext = os.path.splitext(fn)[1][1:]
82 | fns[ext if ext in fns else "media"] = fn
83 |
84 | assert fns["media"], "must provide a media filename"
85 | assert fns["srt"], "must provide a srt filename"
86 |
87 | is_video_file = utils.is_video(fns["media"].lower())
88 | outext = "mp4" if is_video_file else "mp3"
89 | output_fn = utils.change_ext(utils.add_cut(fns["media"]), outext)
90 | if utils.check_exists(output_fn, self.args.force):
91 | return
92 |
93 | with open(fns["srt"], encoding=self.args.encoding) as f:
94 | subs = list(srt.parse(f.read()))
95 |
96 | if fns["md"]:
97 | md = utils.MD(fns["md"], self.args.encoding)
98 | if not md.done_editing():
99 | return
100 | index = []
101 | for mark, sent in md.tasks():
102 | if not mark:
103 | continue
104 | m = re.match(r"\[(\d+)", sent.strip())
105 | if m:
106 | index.append(int(m.groups()[0]))
107 | subs = [s for s in subs if s.index in index]
108 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]} and {fns["md"]}')
109 | else:
110 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]}')
111 |
112 | segments = []
113 | # Avoid disordered subtitles
114 | subs.sort(key=lambda x: x.start)
115 | for x in subs:
116 | if len(segments) == 0:
117 | segments.append(
118 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()}
119 | )
120 | else:
121 | if x.start.total_seconds() - segments[-1]["end"] < 0.5:
122 | segments[-1]["end"] = x.end.total_seconds()
123 | else:
124 | segments.append(
125 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()}
126 | )
127 |
128 | if is_video_file:
129 | media = editor.VideoFileClip(fns["media"])
130 | else:
131 | media = editor.AudioFileClip(fns["media"])
132 |
133 |         # Add a fade between two clips. Not strictly necessary; kept here for reference
134 | # fade = 0
135 | # segments = _expand_segments(segments, fade, 0, video.duration)
136 | # clips = [video.subclip(
137 | # s['start'], s['end']).crossfadein(fade) for s in segments]
138 | # final_clip = editor.concatenate_videoclips(clips, padding = -fade)
139 |
140 | clips = [media.subclip(s["start"], s["end"]) for s in segments]
141 | if is_video_file:
142 | final_clip: editor.VideoClip = editor.concatenate_videoclips(clips)
143 | logging.info(
144 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}"
145 | )
146 |
147 | aud = final_clip.audio.set_fps(44100)
148 | final_clip = final_clip.without_audio().set_audio(aud)
149 | final_clip = final_clip.fx(editor.afx.audio_normalize)
150 |
151 |             # an alternative to bitrate is to use crf, e.g. ffmpeg_params=['-crf', '18']
152 | final_clip.write_videofile(
153 | output_fn, audio_codec="aac", bitrate=self.args.bitrate
154 | )
155 | else:
156 | final_clip: editor.AudioClip = editor.concatenate_audioclips(clips)
157 | logging.info(
158 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}"
159 | )
160 |
161 | final_clip = final_clip.fx(editor.afx.audio_normalize)
162 | final_clip.write_audiofile(
163 | output_fn, codec="libmp3lame", fps=44100, bitrate=self.args.bitrate
164 | )
165 |
166 | media.close()
167 | logging.info(f"Saved media to {output_fn}")
168 |
--------------------------------------------------------------------------------
/autocut/daemon.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import glob
3 | import logging
4 | import os
5 | import time
6 |
7 | from . import cut, transcribe, utils
8 |
9 |
10 | class Daemon:
11 | def __init__(self, args):
12 | self.args = args
13 | self.sleep = 1
14 |
15 | def run(self):
16 | assert len(self.args.inputs) == 1, "Must provide a single folder"
17 | while True:
18 | self._iter()
19 | time.sleep(self.sleep)
20 | self.sleep = min(60, self.sleep + 1)
21 |
22 | def _iter(self):
23 | folder = self.args.inputs[0]
24 | files = sorted(list(glob.glob(os.path.join(folder, "*"))))
25 | media_files = [f for f in files if utils.is_video(f) or utils.is_audio(f)]
26 | args = copy.deepcopy(self.args)
27 | for f in media_files:
28 | srt_fn = utils.change_ext(f, "srt")
29 | md_fn = utils.change_ext(f, "md")
30 | is_video_file = utils.is_video(f)
31 | if srt_fn not in files or md_fn not in files:
32 | args.inputs = [f]
33 | try:
34 | transcribe.Transcribe(args).run()
35 | self.sleep = 1
36 | break
37 | except RuntimeError as e:
38 |                     logging.warning(
39 |                         "Failed, possibly because the video is still being recorded"
40 |                     )
41 | pass
42 | if md_fn in files:
43 | if utils.add_cut(md_fn) in files:
44 | continue
45 | md = utils.MD(md_fn, self.args.encoding)
46 | ext = "mp4" if is_video_file else "mp3"
47 | if not md.done_editing() or os.path.exists(
48 | utils.change_ext(utils.add_cut(f), ext)
49 | ):
50 | continue
51 | args.inputs = [f, md_fn, srt_fn]
52 | cut.Cutter(args).run()
53 | self.sleep = 1
54 |
55 | args.inputs = [os.path.join(folder, "autocut.md")]
56 | merger = cut.Merger(args)
57 | merger.write_md(media_files)
58 | merger.run()
59 |
--------------------------------------------------------------------------------
/autocut/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 |
5 | from . import utils
6 | from .type import WhisperMode, WhisperModel
7 |
8 |
9 | def main():
10 | parser = argparse.ArgumentParser(
11 | description="Edit videos based on transcribed subtitles",
12 | formatter_class=argparse.RawDescriptionHelpFormatter,
13 | )
14 |
15 | logging.basicConfig(
16 | format="[autocut:%(filename)s:L%(lineno)d] %(levelname)-6s %(message)s"
17 | )
18 | logging.getLogger().setLevel(logging.INFO)
19 |
20 |     parser.add_argument("inputs", type=str, nargs="+", help="Input filenames/folders")
21 | parser.add_argument(
22 | "-t",
23 | "--transcribe",
24 | help="Transcribe videos/audio into subtitles",
25 | action=argparse.BooleanOptionalAction,
26 | )
27 | parser.add_argument(
28 | "-c",
29 | "--cut",
30 | help="Cut a video based on subtitles",
31 | action=argparse.BooleanOptionalAction,
32 | )
33 | parser.add_argument(
34 | "-d",
35 | "--daemon",
36 | help="Monitor a folder to transcribe and cut",
37 | action=argparse.BooleanOptionalAction,
38 | )
39 | parser.add_argument(
40 | "-s",
41 | help="Convert .srt to a compact format for easier editing",
42 | action=argparse.BooleanOptionalAction,
43 | )
44 | parser.add_argument(
45 | "-m",
46 | "--to-md",
47 | help="Convert .srt to .md for easier editing",
48 | action=argparse.BooleanOptionalAction,
49 | )
50 | parser.add_argument(
51 | "--lang",
52 | type=str,
53 | default="zh",
54 | choices=[
55 | "zh",
56 | "en",
57 | "Afrikaans",
58 | "Arabic",
59 | "Armenian",
60 | "Azerbaijani",
61 | "Belarusian",
62 | "Bosnian",
63 | "Bulgarian",
64 | "Catalan",
65 | "Croatian",
66 | "Czech",
67 | "Danish",
68 | "Dutch",
69 | "Estonian",
70 | "Finnish",
71 | "French",
72 | "Galician",
73 | "German",
74 | "Greek",
75 | "Hebrew",
76 | "Hindi",
77 | "Hungarian",
78 | "Icelandic",
79 | "Indonesian",
80 | "Italian",
81 | "Japanese",
82 | "Kannada",
83 | "Kazakh",
84 | "Korean",
85 | "Latvian",
86 | "Lithuanian",
87 | "Macedonian",
88 | "Malay",
89 | "Marathi",
90 | "Maori",
91 | "Nepali",
92 | "Norwegian",
93 | "Persian",
94 | "Polish",
95 | "Portuguese",
96 | "Romanian",
97 | "Russian",
98 | "Serbian",
99 | "Slovak",
100 | "Slovenian",
101 | "Spanish",
102 | "Swahili",
103 | "Swedish",
104 | "Tagalog",
105 | "Tamil",
106 | "Thai",
107 | "Turkish",
108 | "Ukrainian",
109 | "Urdu",
110 | "Vietnamese",
111 | "Welsh",
112 | ],
113 | help="The output language of transcription",
114 | )
115 | parser.add_argument(
116 |         "--prompt", type=str, default="", help="initial prompt fed into whisper"
117 | )
118 | parser.add_argument(
119 | "--whisper-mode",
120 | type=str,
121 | default=WhisperMode.WHISPER.value,
122 | choices=WhisperMode.get_values(),
123 |         help="Whisper inference mode: whisper: run whisper locally; openai: use the OpenAI API; faster: run faster-whisper locally.",
124 | )
125 | parser.add_argument(
126 | "--openai-rpm",
127 | type=int,
128 | default=3,
129 | choices=[3, 50],
130 |         help="OpenAI Whisper API requests per minute (free users: 3 RPM; paid users: 50 RPM). "
131 | "More info: https://platform.openai.com/docs/guides/rate-limits/overview",
132 | )
133 | parser.add_argument(
134 | "--whisper-model",
135 | type=str,
136 | default=WhisperModel.SMALL.value,
137 | choices=WhisperModel.get_values(),
138 | help="The whisper model used to transcribe.",
139 | )
140 | parser.add_argument(
141 | "--bitrate",
142 | type=str,
143 | default="10m",
144 |         help="The bitrate used to export the cut video, such as 10m, 1m, or 500k",
145 | )
146 | parser.add_argument(
147 |         "--vad", help="Whether to use VAD", choices=["1", "0", "auto"], default="auto"
148 | )
149 | parser.add_argument(
150 | "--force",
151 | help="Force write even if files exist",
152 | action=argparse.BooleanOptionalAction,
153 | )
154 | parser.add_argument(
155 | "--encoding", type=str, default="utf-8", help="Document encoding format"
156 | )
157 | parser.add_argument(
158 | "--device",
159 | type=str,
160 | default=None,
161 | choices=["cpu", "cuda"],
162 |         help="Force transcription on CPU or GPU. By default the GPU is used automatically if available.",
163 | )
164 |
165 | args = parser.parse_args()
166 |
167 | if args.transcribe:
168 | from .transcribe import Transcribe
169 |
170 | Transcribe(args).run()
171 | elif args.to_md:
172 | from .utils import trans_srt_to_md
173 |
174 | if len(args.inputs) == 2:
175 | [input_1, input_2] = args.inputs
176 | base, ext = os.path.splitext(input_1)
177 | if ext != ".srt":
178 | input_1, input_2 = input_2, input_1
179 | trans_srt_to_md(args.encoding, args.force, input_1, input_2)
180 | elif len(args.inputs) == 1:
181 | trans_srt_to_md(args.encoding, args.force, args.inputs[0])
182 | else:
183 | logging.warning(
184 |             "Wrong number of files; pass a .srt file, optionally together with a video file"
185 | )
186 | elif args.cut:
187 | from .cut import Cutter
188 |
189 | Cutter(args).run()
190 | elif args.daemon:
191 | from .daemon import Daemon
192 |
193 | Daemon(args).run()
194 | elif args.s:
195 | utils.compact_rst(args.inputs[0], args.encoding)
196 | else:
197 | logging.warning("No action, use -c, -t or -d")
198 |
199 |
200 | if __name__ == "__main__":
201 | main()
202 |
--------------------------------------------------------------------------------
/autocut/package_transcribe.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | from typing import List, Any, Union, Literal
4 |
5 | import numpy as np
6 | import torch
7 |
8 | from . import utils, whisper_model
9 | from .type import WhisperMode, SPEECH_ARRAY_INDEX, WhisperModel, LANG
10 |
11 |
12 | class Transcribe:
13 | def __init__(
14 | self,
15 |         whisper_mode: Literal[
16 |             "whisper", "faster"
17 |         ] = WhisperMode.WHISPER.value,
18 |         whisper_model_size: str = WhisperModel.SMALL.value,
19 | vad: bool = True,
20 | device: Union[Literal["cpu", "cuda"], None] = None,
21 | ):
22 | self.whisper_mode = whisper_mode
23 | self.whisper_model_size = whisper_model_size
24 | self.vad = vad
25 | self.device = device
26 | self.sampling_rate = 16000
27 | self.whisper_model = None
28 | self.vad_model = None
29 | self.detect_speech = None
30 |
31 | tic = time.time()
32 | if self.whisper_model is None:
33 | if self.whisper_mode == WhisperMode.WHISPER.value:
34 | self.whisper_model = whisper_model.WhisperModel(self.sampling_rate)
35 | self.whisper_model.load(self.whisper_model_size, self.device)
36 | elif self.whisper_mode == WhisperMode.FASTER.value:
37 | self.whisper_model = whisper_model.FasterWhisperModel(
38 | self.sampling_rate
39 | )
40 | self.whisper_model.load(self.whisper_model_size, self.device)
41 | logging.info(f"Done Init model in {time.time() - tic:.1f} sec")
42 |
43 | def run(self, audio: np.ndarray, lang: LANG, prompt: str = ""):
44 | speech_array_indices = self._detect_voice_activity(audio)
45 | transcribe_results = self._transcribe(audio, speech_array_indices, lang, prompt)
46 | return transcribe_results
47 |
48 | def format_results_to_srt(self, transcribe_results: List[Any]):
49 | return self.whisper_model.gen_srt(transcribe_results)
50 |
51 | def _detect_voice_activity(self, audio) -> List[SPEECH_ARRAY_INDEX]:
52 | """Detect segments that have voice activities"""
53 | if self.vad is False:
54 | return [{"start": 0, "end": len(audio)}]
55 |
56 | tic = time.time()
57 | if self.vad_model is None or self.detect_speech is None:
58 | # torch load limit https://github.com/pytorch/vision/issues/4156
59 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
60 | self.vad_model, funcs = torch.hub.load(
61 | repo_or_dir="snakers4/silero-vad", model="silero_vad", trust_repo=True
62 | )
63 |
64 | self.detect_speech = funcs[0]
65 |
66 | speeches = self.detect_speech(
67 | audio, self.vad_model, sampling_rate=self.sampling_rate
68 | )
69 |
70 | # Remove too short segments
71 | speeches = utils.remove_short_segments(speeches, 1.0 * self.sampling_rate)
72 |
73 |         # Expand to avoid too tight a cut. You can tune the pad length
74 | speeches = utils.expand_segments(
75 | speeches, 0.2 * self.sampling_rate, 0.0 * self.sampling_rate, audio.shape[0]
76 | )
77 |
78 |         # Merge very close segments
79 | speeches = utils.merge_adjacent_segments(speeches, 0.5 * self.sampling_rate)
80 |
81 | logging.info(f"Done voice activity detection in {time.time() - tic:.1f} sec")
82 | return speeches if len(speeches) > 1 else [{"start": 0, "end": len(audio)}]
83 |
84 | def _transcribe(
85 | self,
86 | audio: np.ndarray,
87 | speech_array_indices: List[SPEECH_ARRAY_INDEX],
88 | lang: LANG,
89 | prompt: str = "",
90 | ) -> List[Any]:
91 | tic = time.time()
92 | res = self.whisper_model.transcribe(audio, speech_array_indices, lang, prompt)
93 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec")
94 | return res
95 |
--------------------------------------------------------------------------------
/autocut/transcribe.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import time
4 | from typing import List, Any
5 |
6 | import numpy as np
7 | import srt
8 | import torch
9 |
10 | from . import utils, whisper_model
11 | from .type import WhisperMode, SPEECH_ARRAY_INDEX
12 |
13 |
14 | class Transcribe:
15 | def __init__(self, args):
16 | self.args = args
17 | self.sampling_rate = 16000
18 | self.whisper_model = None
19 | self.vad_model = None
20 | self.detect_speech = None
21 |
22 | tic = time.time()
23 | if self.whisper_model is None:
24 | if self.args.whisper_mode == WhisperMode.WHISPER.value:
25 | self.whisper_model = whisper_model.WhisperModel(self.sampling_rate)
26 | self.whisper_model.load(self.args.whisper_model, self.args.device)
27 | elif self.args.whisper_mode == WhisperMode.OPENAI.value:
28 | self.whisper_model = whisper_model.OpenAIModel(
29 | self.args.openai_rpm, self.sampling_rate
30 | )
31 | self.whisper_model.load()
32 | elif self.args.whisper_mode == WhisperMode.FASTER.value:
33 | self.whisper_model = whisper_model.FasterWhisperModel(
34 | self.sampling_rate
35 | )
36 | self.whisper_model.load(self.args.whisper_model, self.args.device)
37 | logging.info(f"Done Init model in {time.time() - tic:.1f} sec")
38 |
39 | def run(self):
40 | for input in self.args.inputs:
41 | logging.info(f"Transcribing {input}")
42 | name, _ = os.path.splitext(input)
43 | if utils.check_exists(name + ".md", self.args.force):
44 | continue
45 |
46 | audio = utils.load_audio(input, sr=self.sampling_rate)
47 | speech_array_indices = self._detect_voice_activity(audio)
48 | transcribe_results = self._transcribe(input, audio, speech_array_indices)
49 |
50 | output = name + ".srt"
51 | self._save_srt(output, transcribe_results)
52 | logging.info(f"Transcribed {input} to {output}")
53 | self._save_md(name + ".md", output, input)
54 | logging.info(f'Saved texts to {name + ".md"} to mark sentences')
55 |
56 | def _detect_voice_activity(self, audio) -> List[SPEECH_ARRAY_INDEX]:
57 | """Detect segments that have voice activities"""
58 | if self.args.vad == "0":
59 | return [{"start": 0, "end": len(audio)}]
60 |
61 | tic = time.time()
62 | if self.vad_model is None or self.detect_speech is None:
63 | # torch load limit https://github.com/pytorch/vision/issues/4156
64 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
65 | self.vad_model, funcs = torch.hub.load(
66 | repo_or_dir="snakers4/silero-vad", model="silero_vad", trust_repo=True
67 | )
68 |
69 | self.detect_speech = funcs[0]
70 |
71 | speeches = self.detect_speech(
72 | audio, self.vad_model, sampling_rate=self.sampling_rate
73 | )
74 |
75 | # Remove too short segments
76 | speeches = utils.remove_short_segments(speeches, 1.0 * self.sampling_rate)
77 |
78 |         # Expand to avoid too tight a cut. You can tune the pad length
79 | speeches = utils.expand_segments(
80 | speeches, 0.2 * self.sampling_rate, 0.0 * self.sampling_rate, audio.shape[0]
81 | )
82 |
83 |         # Merge very close segments
84 | speeches = utils.merge_adjacent_segments(speeches, 0.5 * self.sampling_rate)
85 |
86 | logging.info(f"Done voice activity detection in {time.time() - tic:.1f} sec")
87 | return speeches if len(speeches) > 1 else [{"start": 0, "end": len(audio)}]
88 |
89 | def _transcribe(
90 | self,
91 | input: str,
92 | audio: np.ndarray,
93 | speech_array_indices: List[SPEECH_ARRAY_INDEX],
94 | ) -> List[Any]:
95 | tic = time.time()
96 | res = (
97 | self.whisper_model.transcribe(
98 | audio, speech_array_indices, self.args.lang, self.args.prompt
99 | )
100 | if self.args.whisper_mode == WhisperMode.WHISPER.value
101 | or self.args.whisper_mode == WhisperMode.FASTER.value
102 | else self.whisper_model.transcribe(
103 | input, audio, speech_array_indices, self.args.lang, self.args.prompt
104 | )
105 | )
106 |
107 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec")
108 | return res
109 |
110 | def _save_srt(self, output, transcribe_results):
111 | subs = self.whisper_model.gen_srt(transcribe_results)
112 | with open(output, "wb") as f:
113 | f.write(srt.compose(subs).encode(self.args.encoding, "replace"))
114 |
115 | def _save_md(self, md_fn, srt_fn, video_fn):
116 | with open(srt_fn, encoding=self.args.encoding) as f:
117 | subs = srt.parse(f.read())
118 |
119 | md = utils.MD(md_fn, self.args.encoding)
120 | md.clear()
121 | md.add_done_editing(False)
122 | md.add_video(os.path.basename(video_fn))
123 |         md.add(
124 |             f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)}). "
125 |             "Mark the sentences to keep for autocut.\n"
126 |             "The format is [subtitle_index,start_time] subtitle content.\n\n"
127 |         )
128 |
129 | for s in subs:
130 | sec = s.start.seconds
131 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]"
132 | md.add_task(False, f"{pre:11} {s.content.strip()}")
133 | md.write()
134 |
--------------------------------------------------------------------------------
/autocut/type.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import TypedDict, Literal
3 |
4 | SPEECH_ARRAY_INDEX = TypedDict("SPEECH_ARRAY_INDEX", {"start": float, "end": float})
5 |
6 | LANG = Literal[
7 | "zh",
8 | "en",
9 | "Afrikaans",
10 | "Arabic",
11 | "Armenian",
12 | "Azerbaijani",
13 | "Belarusian",
14 | "Bosnian",
15 | "Bulgarian",
16 | "Catalan",
17 | "Croatian",
18 | "Czech",
19 | "Danish",
20 | "Dutch",
21 | "Estonian",
22 | "Finnish",
23 | "French",
24 | "Galician",
25 | "German",
26 | "Greek",
27 | "Hebrew",
28 | "Hindi",
29 | "Hungarian",
30 | "Icelandic",
31 | "Indonesian",
32 | "Italian",
33 | "Japanese",
34 | "Kannada",
35 | "Kazakh",
36 | "Korean",
37 | "Latvian",
38 | "Lithuanian",
39 | "Macedonian",
40 | "Malay",
41 | "Marathi",
42 | "Maori",
43 | "Nepali",
44 | "Norwegian",
45 | "Persian",
46 | "Polish",
47 | "Portuguese",
48 | "Romanian",
49 | "Russian",
50 | "Serbian",
51 | "Slovak",
52 | "Slovenian",
53 | "Spanish",
54 | "Swahili",
55 | "Swedish",
56 | "Tagalog",
57 | "Tamil",
58 | "Thai",
59 | "Turkish",
60 | "Ukrainian",
61 | "Urdu",
62 | "Vietnamese",
63 | "Welsh",
64 | ]
65 |
66 |
67 | class WhisperModel(Enum):
68 | TINY = "tiny"
69 | BASE = "base"
70 | SMALL = "small"
71 | MEDIUM = "medium"
72 | LARGE = "large"
73 | LARGE_V2 = "large-v2"
74 | LARGE_V3 = "large-v3"
75 | LARGE_V3_TURBO = "large-v3-turbo"
76 |
77 | @staticmethod
78 | def get_values():
79 | return [i.value for i in WhisperModel]
80 |
81 |
82 | class WhisperMode(Enum):
83 | WHISPER = "whisper"
84 | OPENAI = "openai"
85 | FASTER = "faster"
86 |
87 | @staticmethod
88 | def get_values():
89 | return [i.value for i in WhisperMode]
90 |
--------------------------------------------------------------------------------
/autocut/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import re
4 |
5 | import ffmpeg
6 | import numpy as np
7 | import opencc
8 | import srt
9 |
10 |
11 | def load_audio(file: str, sr: int = 16000) -> np.ndarray:
12 | try:
13 | out, _ = (
14 | ffmpeg.input(file, threads=0)
15 | .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
17 | )
18 | except ffmpeg.Error as e:
19 | raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
20 |
21 | return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
22 |
23 |
24 | def is_video(filename):
25 | _, ext = os.path.splitext(filename)
26 | return ext in [".mp4", ".mov", ".mkv", ".avi", ".flv", ".f4v", ".webm"]
27 |
28 |
29 | def is_audio(filename):
30 | _, ext = os.path.splitext(filename)
31 | return ext in [".ogg", ".wav", ".mp3", ".flac", ".m4a"]
32 |
33 |
34 | def change_ext(filename, new_ext):
35 | # Change the extension of filename to new_ext
36 | base, _ = os.path.splitext(filename)
37 | if not new_ext.startswith("."):
38 | new_ext = "." + new_ext
39 | return base + new_ext
40 |
41 |
42 | def add_cut(filename):
43 | # Add cut mark to the filename
44 | base, ext = os.path.splitext(filename)
45 | if base.endswith("_cut"):
46 | base = base[:-4] + "_" + base[-4:]
47 | else:
48 | base += "_cut"
49 | return base + ext
50 |
51 |
52 | # a very simple markdown parser
53 | class MD:
54 | def __init__(self, filename, encoding):
55 | self.lines = []
56 | self.EDIT_DONE_MAKR = "<-- Mark if you are done editing."
57 | self.encoding = encoding
58 | self.filename = filename
59 | if filename:
60 | self.load_file()
61 |
62 | def load_file(self):
63 | if os.path.exists(self.filename):
64 | with open(self.filename, encoding=self.encoding) as f:
65 | self.lines = f.readlines()
66 |
67 | def clear(self):
68 | self.lines = []
69 |
70 | def write(self):
71 | with open(self.filename, "wb") as f:
72 | f.write("\n".join(self.lines).encode(self.encoding, "replace"))
73 |
74 | def tasks(self):
75 | # get all tasks with their status
76 | ret = []
77 | for l in self.lines:
78 | mark, task = self._parse_task_status(l)
79 | if mark is not None:
80 | ret.append((mark, task))
81 | return ret
82 |
83 | def done_editing(self):
84 | for m, t in self.tasks():
85 | if m and self.EDIT_DONE_MAKR in t:
86 | return True
87 | return False
88 |
89 | def add(self, line):
90 | self.lines.append(line)
91 |
92 | def add_task(self, mark, contents):
93 | self.add(f'- [{"x" if mark else " "}] {contents.strip()}')
94 |
95 | def add_done_editing(self, mark):
96 | self.add_task(mark, self.EDIT_DONE_MAKR)
97 |
98 | def add_video(self, video_fn):
99 | ext = os.path.splitext(video_fn)[1][1:]
100 | self.add(
101 |             f'\n<video controls="true" allowfullscreen="true" src="{video_fn}" type="video/{ext}"></video>\n'
102 | )
103 |
104 | def _parse_task_status(self, line):
105 | # return (is_marked, rest) or (None, line) if not a task
106 | m = re.match(r"- +\[([ xX])\] +(.*)", line)
107 | if not m:
108 | return None, line
109 | return m.groups()[0].lower() == "x", m.groups()[1]
110 |
111 |
112 | def check_exists(output, force):
113 | if os.path.exists(output):
114 | if force:
115 | logging.info(f"{output} exists. Will overwrite it")
116 | else:
117 | logging.info(
118 | f"{output} exists, skipping... Use the --force flag to overwrite"
119 | )
120 | return True
121 | return False
122 |
123 |
124 | def expand_segments(segments, expand_head, expand_tail, total_length):
125 | # Pad head and tail for each time segment
126 | results = []
127 | for i in range(len(segments)):
128 | t = segments[i]
129 | start = max(t["start"] - expand_head, segments[i - 1]["end"] if i > 0 else 0)
130 | end = min(
131 | t["end"] + expand_tail,
132 | segments[i + 1]["start"] if i < len(segments) - 1 else total_length,
133 | )
134 | results.append({"start": start, "end": end})
135 | return results
136 |
137 |
138 | def remove_short_segments(segments, threshold):
139 | # Remove segments whose length < threshold
140 | return [s for s in segments if s["end"] - s["start"] > threshold]
141 |
142 |
143 | def merge_adjacent_segments(segments, threshold):
144 | # Merge two adjacent segments if their distance < threshold
145 | results = []
146 | i = 0
147 | while i < len(segments):
148 | s = segments[i]
149 | for j in range(i + 1, len(segments)):
150 | if segments[j]["start"] < s["end"] + threshold:
151 | s["end"] = segments[j]["end"]
152 | i = j
153 | else:
154 | break
155 | i += 1
156 | results.append(s)
157 | return results
158 |
159 |
160 | def compact_rst(sub_fn, encoding):
161 | cc = opencc.OpenCC("t2s")
162 |
163 | base, ext = os.path.splitext(sub_fn)
164 | COMPACT = "_compact"
165 | if ext != ".srt":
166 | logging.fatal("only .srt file is supported")
167 |
168 | if base.endswith(COMPACT):
169 | # to original rst
170 | with open(sub_fn, encoding=encoding) as f:
171 | lines = f.readlines()
172 | subs = []
173 | for l in lines:
174 | items = l.split(" ")
175 | if len(items) < 4:
176 | continue
177 | subs.append(
178 | srt.Subtitle(
179 | index=0,
180 | start=srt.srt_timestamp_to_timedelta(items[0]),
181 | end=srt.srt_timestamp_to_timedelta(items[2]),
182 | content=" ".join(items[3:]).strip(),
183 | )
184 | )
185 | with open(base[: -len(COMPACT)] + ext, "wb") as f:
186 | f.write(srt.compose(subs).encode(encoding, "replace"))
187 | else:
188 | # to a compact version
189 | with open(sub_fn, encoding=encoding) as f:
190 | subs = srt.parse(f.read())
191 | with open(base + COMPACT + ext, "wb") as f:
192 | for s in subs:
193 | f.write(
194 | f"{srt.timedelta_to_srt_timestamp(s.start)} --> {srt.timedelta_to_srt_timestamp(s.end)} "
195 | f"{cc.convert(s.content.strip())}\n".encode(encoding, "replace")
196 | )
197 |
198 |
199 | def trans_srt_to_md(encoding, force, srt_fn, video_fn=None):
200 | base, ext = os.path.splitext(srt_fn)
201 | if ext != ".srt":
202 |         logging.fatal("only .srt files are supported")
203 | md_fn = base + ext.split(".")[0] + ".md"
204 |
205 | check_exists(md_fn, force)
206 |
207 | with open(srt_fn, encoding=encoding) as f:
208 | subs = srt.parse(f.read())
209 |
210 | md = MD(md_fn, encoding)
211 | md.clear()
212 | md.add_done_editing(False)
213 | if video_fn:
214 | if not is_video(video_fn):
215 | logging.fatal(f"{video_fn} may not be a video")
216 | md.add_video(os.path.basename(video_fn))
217 | md.add(
218 | f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)})."
219 | "Mark the sentences to keep for autocut.\n"
220 | "The format is [subtitle_index,duration_in_second] subtitle context.\n\n"
221 | )
222 |
223 | for s in subs:
224 | sec = s.start.seconds
225 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]"
226 | md.add_task(False, f"{pre:11} {s.content.strip()}")
227 | md.write()
228 |
--------------------------------------------------------------------------------
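Note: the three segment helpers above (expand_segments, remove_short_segments, merge_adjacent_segments) are small, pure functions over lists of {"start", "end"} dicts. The sketch below is not part of the repository; it chains them in an illustrative order, with hand-written segments measured in seconds and arbitrary thresholds, just to show the shape of their inputs and outputs.

# Illustrative sketch only; ordering, units and thresholds are assumptions,
# not the values the project uses internally.
from autocut.utils import (
    expand_segments,
    merge_adjacent_segments,
    remove_short_segments,
)

raw = [
    {"start": 1.0, "end": 1.1},  # a 0.1 s blip
    {"start": 3.0, "end": 6.0},
    {"start": 6.3, "end": 9.0},  # close to the previous segment
]

kept = remove_short_segments(raw, threshold=0.5)             # drops the 0.1 s blip
padded = expand_segments(kept, 0.5, 0.5, total_length=10.0)  # pad, clamped to neighbours and clip length
merged = merge_adjacent_segments(padded, threshold=0.5)      # fuse segments closer than 0.5 s
print(merged)  # [{'start': 2.5, 'end': 9.5}]
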
/autocut/whisper_model.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import os
4 | from abc import ABC, abstractmethod
5 | from typing import Literal, Union, List, Any, TypedDict
6 |
7 | import numpy as np
8 | import opencc
9 | import srt
10 | from pydub import AudioSegment
11 | from tqdm import tqdm
12 |
13 | from .type import SPEECH_ARRAY_INDEX, LANG
14 |
15 | # whisper sometimes generates traditional Chinese; explicitly convert it to simplified
16 | cc = opencc.OpenCC("t2s")
17 |
18 |
19 | class AbstractWhisperModel(ABC):
20 | def __init__(self, mode, sample_rate=16000):
21 | self.mode = mode
22 | self.whisper_model = None
23 | self.sample_rate = sample_rate
24 |
25 | @abstractmethod
26 | def load(self, *args, **kwargs):
27 | pass
28 |
29 | @abstractmethod
30 | def transcribe(self, *args, **kwargs):
31 | pass
32 |
33 | @abstractmethod
34 | def _transcribe(self, *args, **kwargs):
35 | pass
36 |
37 | @abstractmethod
38 | def gen_srt(self, transcribe_results: List[Any]) -> List[srt.Subtitle]:
39 | pass
40 |
41 |
42 | class WhisperModel(AbstractWhisperModel):
43 | def __init__(self, sample_rate=16000):
44 | super().__init__("whisper", sample_rate)
45 | self.device = None
46 |
47 | def load(
48 | self,
49 | model_name: Literal[
50 | "tiny", "base", "small", "medium", "large", "large-v2"
51 | ] = "small",
52 | device: Union[Literal["cpu", "cuda"], None] = None,
53 | ):
54 | self.device = device
55 |
56 | import whisper
57 |
58 | self.whisper_model = whisper.load_model(model_name, device)
59 |
60 | def _transcribe(self, audio, seg, lang, prompt):
61 | r = self.whisper_model.transcribe(
62 | audio[int(seg["start"]) : int(seg["end"])],
63 | task="transcribe",
64 | language=lang,
65 | initial_prompt=prompt,
66 | )
67 | r["origin_timestamp"] = seg
68 | return r
69 |
70 | def transcribe(
71 | self,
72 | audio: np.ndarray,
73 | speech_array_indices: List[SPEECH_ARRAY_INDEX],
74 | lang: LANG,
75 | prompt: str,
76 | ):
77 | res = []
78 | if self.device == "cpu" and len(speech_array_indices) > 1:
79 | from multiprocessing import Pool
80 |
81 | pbar = tqdm(total=len(speech_array_indices))
82 |
83 | pool = Pool(processes=4)
84 | sub_res = []
85 |             # TODO: a better approach is to merge these segments into one so whisper gets more context
86 | for seg in speech_array_indices:
87 | sub_res.append(
88 | pool.apply_async(
89 | self._transcribe,
90 | (
91 |                             # pass only the arguments _transcribe(audio, seg, lang, prompt) expects
92 |                             audio,
93 |                             seg,
94 |                             lang,
95 |                             prompt,
96 | ),
97 | callback=lambda x: pbar.update(),
98 | )
99 | )
100 | pool.close()
101 | pool.join()
102 | pbar.close()
103 | res = [i.get() for i in sub_res]
104 | else:
105 | for seg in (
106 | speech_array_indices
107 | if len(speech_array_indices) == 1
108 | else tqdm(speech_array_indices)
109 | ):
110 | r = self.whisper_model.transcribe(
111 | audio[int(seg["start"]) : int(seg["end"])],
112 | task="transcribe",
113 | language=lang,
114 | initial_prompt=prompt,
115 | verbose=False if len(speech_array_indices) == 1 else None,
116 | )
117 | r["origin_timestamp"] = seg
118 | res.append(r)
119 | return res
120 |
121 | def gen_srt(self, transcribe_results):
122 | subs = []
123 |
124 | def _add_sub(start, end, text):
125 | subs.append(
126 | srt.Subtitle(
127 | index=0,
128 | start=datetime.timedelta(seconds=start),
129 | end=datetime.timedelta(seconds=end),
130 | content=cc.convert(text.strip()),
131 | )
132 | )
133 |
134 | prev_end = 0
135 | for r in transcribe_results:
136 | origin = r["origin_timestamp"]
137 | for s in r["segments"]:
138 | start = s["start"] + origin["start"] / self.sample_rate
139 | end = min(
140 | s["end"] + origin["start"] / self.sample_rate,
141 | origin["end"] / self.sample_rate,
142 | )
143 | if start > end:
144 | continue
145 |                 # insert a "< No Speech >" marker for any silent gap longer than one second
146 | if start > prev_end + 1.0:
147 | _add_sub(prev_end, start, "< No Speech >")
148 | _add_sub(start, end, s["text"])
149 | prev_end = end
150 |
151 | return subs
152 |
153 |
154 | class OpenAIModel(AbstractWhisperModel):
155 | max_single_audio_bytes = 25 * 2**20 # 25MB
156 |     split_audio_bytes = 23 * 2**20  # 23MB, keeping 2MB for safety (header, etc.)
157 | rpm = 3
158 |
159 | def __init__(self, rpm: int, sample_rate=16000):
160 | super().__init__("openai_whisper-1", sample_rate)
161 | self.rpm = rpm
162 | if (
163 | os.environ.get("OPENAI_API_KEY") is None
164 | and os.environ.get("OPENAI_API_KEY_PATH") is None
165 | ):
166 | raise Exception("OPENAI_API_KEY is not set")
167 |
168 | def load(self, model_name: Literal["whisper-1"] = "whisper-1"):
169 | try:
170 | import openai
171 | except ImportError:
172 | raise Exception(
173 |                 "Please use openai mode (pip install '.[openai]') or all mode (pip install '.[all]')"
174 | )
175 | from functools import partial
176 |
177 | self.whisper_model = partial(openai.Audio.transcribe, model=model_name)
178 |
179 | def transcribe(
180 | self,
181 |         input: str,
182 | audio: np.ndarray,
183 | speech_array_indices: List[SPEECH_ARRAY_INDEX],
184 | lang: LANG,
185 | prompt: str,
186 | ) -> List[srt.Subtitle]:
187 | res = []
188 | name, _ = os.path.splitext(input)
189 | raw_audio = AudioSegment.from_file(input)
190 |         ms_bytes = len(raw_audio[:1].raw_data)  # bytes per millisecond of audio
191 | audios: List[
192 | TypedDict(
193 | "AudioInfo", {"input": str, "audio": AudioSegment, "start_ms": float}
194 | )
195 | ] = []
196 |
197 | i = 0
198 | for index in speech_array_indices:
199 | start = int(index["start"]) / self.sample_rate * 1000
200 | end = int(index["end"]) / self.sample_rate * 1000
201 | audio_seg = raw_audio[start:end]
202 | if len(audio_seg.raw_data) < self.split_audio_bytes:
203 | temp_file = f"{name}_temp_{i}.wav"
204 | audios.append(
205 | {"input": temp_file, "audio": audio_seg, "start_ms": start}
206 | )
207 | else:
208 | logging.info(
209 |                     f"Long audio with a size ({len(audio_seg.raw_data)} bytes) greater than 25MB ({25 * 2 ** 20} bytes) "
210 |                     "will be segmented "
211 |                     "because OpenAI's API only accepts files smaller than 25MB"
212 | )
213 | split_num = len(audio_seg.raw_data) // self.split_audio_bytes + 1
214 | for j in range(split_num):
215 | temp_file = f"{name}_{i}_temp_{j}.wav"
216 | split_audio = audio_seg[
217 | j
218 | * self.split_audio_bytes
219 | // ms_bytes : (j + 1)
220 | * self.split_audio_bytes
221 | // ms_bytes
222 | ]
223 | audios.append(
224 | {
225 | "input": temp_file,
226 | "audio": split_audio,
227 | "start_ms": start + j * self.split_audio_bytes // ms_bytes,
228 | }
229 | )
230 | i += 1
231 |
232 | if len(audios) > 1:
233 | from multiprocessing import Pool
234 |
235 | pbar = tqdm(total=len(audios))
236 |
237 | pool = Pool(processes=min(8, self.rpm))
238 | sub_res = []
239 | for audio in audios:
240 | sub_res.append(
241 | pool.apply_async(
242 | self._transcribe,
243 | (
244 | audio["input"],
245 | audio["audio"],
246 | prompt,
247 | lang,
248 | audio["start_ms"],
249 | ),
250 | callback=lambda x: pbar.update(),
251 | )
252 | )
253 | pool.close()
254 | pool.join()
255 | pbar.close()
256 | for subs in sub_res:
257 | subtitles = subs.get()
258 | res.extend(subtitles)
259 | else:
260 | res = self._transcribe(
261 | audios[0]["input"],
262 | audios[0]["audio"],
263 | prompt,
264 | lang,
265 | audios[0]["start_ms"],
266 | )
267 |
268 | return res
269 |
270 | def _transcribe(
271 |         self, input: str, audio: AudioSegment, prompt: str, lang: LANG, start_ms: float
272 | ):
273 | audio.export(input, "wav")
274 | subtitles = self.whisper_model(
275 | file=open(input, "rb"), prompt=prompt, language=lang, response_format="srt"
276 | )
277 | os.remove(input)
278 | return list(
279 | map(
280 | lambda x: (
281 | setattr(
282 | x, "start", x.start + datetime.timedelta(milliseconds=start_ms)
283 | ),
284 | setattr(
285 | x, "end", x.end + datetime.timedelta(milliseconds=start_ms)
286 | ),
287 | x,
288 | )[-1],
289 | list(srt.parse(subtitles)),
290 | )
291 | )
292 |
293 | def gen_srt(self, transcribe_results: List[srt.Subtitle]):
294 | if len(transcribe_results) == 0:
295 | return []
296 | if len(transcribe_results) == 1:
297 | return transcribe_results
298 | subs = [transcribe_results[0]]
299 | for subtitle in transcribe_results[1:]:
300 | if subtitle.start - subs[-1].end > datetime.timedelta(seconds=1):
301 | subs.append(
302 | srt.Subtitle(
303 | index=0,
304 | start=subs[-1].end,
305 | end=subtitle.start,
306 | content="< No Speech >",
307 | )
308 | )
309 | subs.append(subtitle)
310 | return subs
311 |
312 |
313 | class FasterWhisperModel(AbstractWhisperModel):
314 | def __init__(self, sample_rate=16000):
315 | super().__init__("faster-whisper", sample_rate)
316 | self.device = None
317 |
318 | def load(
319 | self,
320 | model_name: Literal[
321 | "tiny", "base", "small", "medium", "large", "large-v2"
322 | ] = "small",
323 | device: Union[Literal["cpu", "cuda"], None] = None,
324 | ):
325 | try:
326 | from faster_whisper import WhisperModel
327 | except ImportError:
328 | raise Exception(
329 |                 "Please use faster mode (pip install '.[faster]') or all mode (pip install '.[all]')"
330 | )
331 |
332 | self.device = device if device else "cpu"
333 | self.whisper_model = WhisperModel(model_name, self.device)
334 |
335 | def _transcribe(self):
336 | raise Exception("Not implemented")
337 |
338 | def transcribe(
339 | self,
340 | audio: np.ndarray,
341 | speech_array_indices: List[SPEECH_ARRAY_INDEX],
342 | lang: LANG,
343 | prompt: str,
344 | ):
345 | res = []
346 | for seg in speech_array_indices:
347 | segments, info = self.whisper_model.transcribe(
348 | audio[int(seg["start"]) : int(seg["end"])],
349 | task="transcribe",
350 | language=lang,
351 | initial_prompt=prompt,
352 | vad_filter=False,
353 | )
354 | segments = list(segments) # The transcription will actually run here.
355 | r = {"origin_timestamp": seg, "segments": segments, "info": info}
356 | res.append(r)
357 | return res
358 |
359 | def gen_srt(self, transcribe_results):
360 | subs = []
361 |
362 | def _add_sub(start, end, text):
363 | subs.append(
364 | srt.Subtitle(
365 | index=0,
366 | start=datetime.timedelta(seconds=start),
367 | end=datetime.timedelta(seconds=end),
368 | content=cc.convert(text.strip()),
369 | )
370 | )
371 |
372 | prev_end = 0
373 | for r in transcribe_results:
374 | origin = r["origin_timestamp"]
375 | for seg in r["segments"]:
376 | s = dict(start=seg.start, end=seg.end, text=seg.text)
377 | start = s["start"] + origin["start"] / self.sample_rate
378 | end = min(
379 | s["end"] + origin["start"] / self.sample_rate,
380 | origin["end"] / self.sample_rate,
381 | )
382 | if start > end:
383 | continue
384 |                 # insert a "< No Speech >" marker for any silent gap longer than one second
385 | if start > prev_end + 1.0:
386 | _add_sub(prev_end, start, "< No Speech >")
387 | _add_sub(start, end, s["text"])
388 | prev_end = end
389 |
390 | return subs
391 |
--------------------------------------------------------------------------------
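Note: the model classes above are normally driven by autocut's transcription pipeline, but they can also be exercised directly. The following is a minimal sketch, not code from the repository: it assumes the openai-whisper package is installed (it provides whisper.load_audio) and feeds a single speech segment spanning the whole clip instead of real voice-activity indices.

# Minimal sketch; the file path and the single whole-clip segment are illustrative.
import srt
import whisper

from autocut.whisper_model import WhisperModel

audio = whisper.load_audio("test/media/test001.mp4")  # float32 samples at 16 kHz
model = WhisperModel(sample_rate=16000)
model.load("small", device="cpu")

indices = [{"start": 0, "end": len(audio)}]  # one segment covering the whole clip
results = model.transcribe(audio, indices, lang="zh", prompt="")
subs = model.gen_srt(results)

with open("test/media/test001.srt", "wb") as f:
    f.write(srt.compose(subs).encode("utf-8", "replace"))
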
/imgs/typora.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/imgs/typora.jpg
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = autocut
3 | version = attr: autocut.__version__
4 | license = Apache Software License
5 | description = Cut video by subtitles
6 | long_description = file: README.md
7 | classifiers =
8 | License :: OSI Approved :: Apache Software License
9 | Operating System :: OS Independent
10 | Programming Language :: Python :: 3
11 |
12 | [options]
13 | packages = find:
14 | include_package_data = True
15 | python_requires = >= 3.9
16 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | requirements = [
4 | "ffmpeg-python",
5 | "moviepy",
6 | "openai-whisper",
7 | "opencc-python-reimplemented",
8 | "parameterized",
9 | "pydub",
10 | "srt",
11 | "torchaudio",
12 | "tqdm",
13 | ]
14 |
15 |
16 | setup(
17 | name="autocut-sub",
18 | install_requires=requirements,
19 | url="https://github.com/mli/autocut",
20 | project_urls={
21 | "source": "https://github.com/mli/autocut",
22 | },
23 | license="Apache License 2.0",
24 | long_description=open("README.md", "r", encoding="utf-8").read(),
25 | long_description_content_type="text/markdown",
26 | extras_require={
27 | "all": ["openai", "faster-whisper"],
28 | "openai": ["openai"],
29 | "faster": ["faster-whisper"],
30 | },
31 | packages=find_packages(),
32 | entry_points={
33 | "console_scripts": [
34 | "autocut = autocut.main:main",
35 | ]
36 | },
37 | )
38 |
--------------------------------------------------------------------------------
/tea.yaml:
--------------------------------------------------------------------------------
1 | # https://tea.xyz/what-is-this-file
2 | ---
3 | version: 1.0.0
4 | codeOwners:
5 | - '0x1e292d6f2D09dc8ffDDb5B8Fd6b641e180224D84'
6 | quorum: 1
7 |
--------------------------------------------------------------------------------
/test/config.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | # Create the root logger
5 | logger = logging.getLogger()
6 | # Set the logger level; if unset, only WARNING and above are collected by default
7 | logger.setLevel("DEBUG")
8 | # Set the log format
9 | fmt = logging.Formatter("%(filename)s-%(lineno)d-%(asctime)s-%(levelname)s-%(message)s")
10 | # Handler that writes logs to a file with the given encoding
11 | if not os.path.exists("./log"):
12 |     os.makedirs("./log")
13 | file_handler = logging.FileHandler("./log/log.txt", encoding="utf-8")
14 | # Set the handler level
15 | file_handler.setLevel("DEBUG")
16 | # Emit logs in the format defined above
17 | file_handler.setFormatter(fmt)
18 | # Also log to the console
19 | ch = logging.StreamHandler()
20 | # Set the handler level
21 | ch.setLevel("DEBUG")
22 | # Emit logs in the format defined above
23 | ch.setFormatter(fmt)
24 | # Attach the handlers to the logger to select the output destinations
25 | # Logs go to the file
26 | logger.addHandler(file_handler)
27 | # Logs go to the console
28 | logger.addHandler(ch)
29 |
30 | TEST_MEDIA_PATH = "./test/media/"
31 | TEST_CONTENT_PATH = "./test/content/"
32 | TEST_MEDIA_FILE = [
33 | "test001.mp4",
34 | "test002.mov",
35 | "test003.mkv",
36 | "test004.flv",
37 | "test005.mp3",
38 | "test006.MP4",
39 | ]
40 |
41 | TEST_MEDIA_FILE_LANG = ["test001_en.mp4"]
42 | TEST_MEDIA_FILE_SIMPLE = ["test001.mp4", "test005.mp3"]
43 |
44 |
45 | class TestArgs:
46 | def __init__(self):
47 | self.inputs = []
48 | self.bitrate = "10m"
49 | self.encoding = "utf-8"
50 | self.sampling_rate = 16000
51 | self.lang = "zh"
52 | self.prompt = ""
53 | self.whisper_model = "small"
54 | self.device = None
55 | self.vad = False
56 | self.force = False
57 | self.whisper_mode = (
58 | "faster" if os.environ.get("WHISPER_MODE") == "faster" else "whisper"
59 | )
60 | self.openai_rpm = 3
61 |
--------------------------------------------------------------------------------
/test/content/test.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:00:00,000 --> 00:00:05,000
3 | 大家好,我的名字是AutoCut.这是一条用于测试的视频。
4 |
5 | 2
6 | 00:00:05,000 --> 00:00:10,260
7 | Hello, my name is AutoCut. This is a video for testing.
8 |
9 |
--------------------------------------------------------------------------------
/test/content/test_md.md:
--------------------------------------------------------------------------------
1 | - [x] <-- Mark if you are done editing.
2 |
3 |
4 |
5 | Texts generated from [test001.srt](test001.srt).Mark the sentences to keep for autocut.
6 | The format is [subtitle_index,duration_in_second] subtitle context.
7 |
8 | - [ ] [1,00:00] 大家好,我的名字是AutoCut.这是一条用于测试的视频。
9 | - [x] [2,00:05] Hello, my name is AutoCut. This is a video for testing.
10 |
--------------------------------------------------------------------------------
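Note: the checklist format above matches what the MD helper in autocut/utils.py writes. A minimal sketch (not part of the repository) of reading the fixture back:

# Reads the fixture with the MD helper and inspects the task checkboxes.
from autocut.utils import MD

md = MD("test/content/test_md.md", "utf-8")
print(md.done_editing())  # True: the "<-- Mark if you are done editing." task is ticked
for marked, text in md.tasks():
    print(marked, text.strip())
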
/test/content/test_srt.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:00:00,000 --> 00:00:05,000
3 | 大家好,我的名字是AutoCut.这是一条用于测试的视频。
4 |
5 |
--------------------------------------------------------------------------------
/test/media/test001.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test001.mp4
--------------------------------------------------------------------------------
/test/media/test001_en.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test001_en.mp4
--------------------------------------------------------------------------------
/test/media/test002.mov:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test002.mov
--------------------------------------------------------------------------------
/test/media/test003.mkv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test003.mkv
--------------------------------------------------------------------------------
/test/media/test004.flv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test004.flv
--------------------------------------------------------------------------------
/test/media/test005.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test005.mp3
--------------------------------------------------------------------------------
/test/media/test006.MP4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test006.MP4
--------------------------------------------------------------------------------
/test/test_cut.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import unittest
4 |
5 | from parameterized import parameterized, param
6 |
7 | from autocut.cut import Cutter
8 | from config import TestArgs, TEST_MEDIA_PATH, TEST_MEDIA_FILE_SIMPLE, TEST_CONTENT_PATH
9 |
10 |
11 | class TestCut(unittest.TestCase):
12 | @classmethod
13 | def setUpClass(cls):
14 |         logging.info("Check that the required test media files exist")
15 |         scan_file = os.listdir(TEST_MEDIA_PATH)
16 |         logging.info(
17 |             "Expected files: "
18 |             + str(TEST_MEDIA_FILE_SIMPLE)
19 |             + " Found files: "
20 | + str(scan_file)
21 | )
22 | for file in TEST_MEDIA_FILE_SIMPLE:
23 | assert file in scan_file
24 |
25 | def tearDown(self):
26 | for file in TEST_MEDIA_FILE_SIMPLE:
27 | namepart = os.path.join(
28 | TEST_MEDIA_PATH, os.path.splitext(file)[0] + "_cut."
29 | )
30 | if os.path.exists(namepart + "mp4"):
31 | os.remove(namepart + "mp4")
32 | if os.path.exists(namepart + "mp3"):
33 | os.remove(namepart + "mp3")
34 |
35 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE])
36 | def test_srt_cut(self, file_name):
37 | args = TestArgs()
38 | args.inputs = [
39 | os.path.join(TEST_MEDIA_PATH, file_name),
40 | os.path.join(TEST_CONTENT_PATH, "test_srt.srt"),
41 | ]
42 | cut = Cutter(args)
43 | cut.run()
44 | namepart = os.path.join(
45 | TEST_MEDIA_PATH, os.path.splitext(file_name)[0] + "_cut."
46 | )
47 | self.assertTrue(
48 | os.path.exists(namepart + "mp4") or os.path.exists(namepart + "mp3")
49 | )
50 |
51 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE])
52 | def test_md_cut(self, file_name):
53 | args = TestArgs()
54 | args.inputs = [
55 | TEST_MEDIA_PATH + file_name,
56 | os.path.join(TEST_CONTENT_PATH, "test.srt"),
57 | os.path.join(TEST_CONTENT_PATH, "test_md.md"),
58 | ]
59 | cut = Cutter(args)
60 | cut.run()
61 | namepart = os.path.join(
62 | TEST_MEDIA_PATH, os.path.splitext(file_name)[0] + "_cut."
63 | )
64 | self.assertTrue(
65 | os.path.exists(namepart + "mp4") or os.path.exists(namepart + "mp3")
66 | )
67 |
--------------------------------------------------------------------------------
/test/test_transcribe.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import unittest
4 |
5 | from parameterized import parameterized, param
6 |
7 | from autocut.utils import MD
8 | from config import (
9 | TEST_MEDIA_FILE,
10 | TestArgs,
11 | TEST_MEDIA_FILE_SIMPLE,
12 | TEST_MEDIA_FILE_LANG,
13 | TEST_MEDIA_PATH,
14 | )
15 | from autocut.transcribe import Transcribe
16 |
17 |
18 | class TestTranscribe(unittest.TestCase):
19 | @classmethod
20 | def setUpClass(cls):
21 |         logging.info("Check that the required test media files exist")
22 |         scan_file = os.listdir(TEST_MEDIA_PATH)
23 |         logging.info(
24 |             "Expected files: "
25 |             + str(TEST_MEDIA_FILE)
26 |             + str(TEST_MEDIA_FILE_LANG)
27 |             + str(TEST_MEDIA_FILE_SIMPLE)
28 |             + " Found files: "
29 | + str(scan_file)
30 | )
31 | for file in TEST_MEDIA_FILE:
32 | assert file in scan_file
33 | for file in TEST_MEDIA_FILE_LANG:
34 | assert file in scan_file
35 | for file in TEST_MEDIA_FILE_SIMPLE:
36 | assert file in scan_file
37 |
38 | @classmethod
39 | def tearDownClass(cls):
40 | for file in os.listdir(TEST_MEDIA_PATH):
41 | if file.endswith("md") or file.endswith("srt"):
42 | os.remove(TEST_MEDIA_PATH + file)
43 |
44 | def tearDown(self):
45 | for file in TEST_MEDIA_FILE_SIMPLE:
46 | if os.path.exists(TEST_MEDIA_PATH + file.split(".")[0] + ".md"):
47 | os.remove(TEST_MEDIA_PATH + file.split(".")[0] + ".md")
48 | if os.path.exists(TEST_MEDIA_PATH + file.split(".")[0] + ".srt"):
49 | os.remove(TEST_MEDIA_PATH + file.split(".")[0] + ".srt")
50 |
51 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE])
52 | def test_default_transcribe(self, file_name):
53 |         logging.info("Check subtitle generation with default arguments")
54 | args = TestArgs()
55 | args.inputs = [TEST_MEDIA_PATH + file_name]
56 | transcribe = Transcribe(args)
57 | transcribe.run()
58 | self.assertTrue(
59 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md")
60 | )
61 |
62 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE])
63 | def test_jump_done_transcribe(self, file_name):
64 |         logging.info("Check that subtitle generation is skipped with default arguments")
65 | args = TestArgs()
66 | args.inputs = [TEST_MEDIA_PATH + file_name]
67 | transcribe = Transcribe(args)
68 | transcribe.run()
69 | self.assertTrue(
70 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md")
71 | )
72 |
73 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_LANG])
74 | def test_en_transcribe(self, file_name):
75 |         logging.info("Check subtitle generation with --lang='en'")
76 | args = TestArgs()
77 | args.lang = "en"
78 | args.inputs = [TEST_MEDIA_PATH + file_name]
79 | transcribe = Transcribe(args)
80 | transcribe.run()
81 | self.assertTrue(
82 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md")
83 | )
84 |
85 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_LANG])
86 | def test_force_transcribe(self, file_name):
87 |         logging.info("Check subtitle generation with --force")
88 | args = TestArgs()
89 | args.force = True
90 | args.inputs = [TEST_MEDIA_PATH + file_name]
91 | md0_lens = len(
92 | "".join(
93 | MD(
94 | TEST_MEDIA_PATH + file_name.split(".")[0] + ".md", args.encoding
95 | ).lines
96 | )
97 | )
98 | transcribe = Transcribe(args)
99 | transcribe.run()
100 | md1_lens = len(
101 | "".join(
102 | MD(
103 | TEST_MEDIA_PATH + file_name.split(".")[0] + ".md", args.encoding
104 | ).lines
105 | )
106 | )
107 | self.assertLessEqual(md1_lens, md0_lens)
108 |
109 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE])
110 | def test_encoding_transcribe(self, file_name):
111 |         logging.info("Check subtitle generation with --encoding")
112 | args = TestArgs()
113 | args.encoding = "gbk"
114 | args.inputs = [TEST_MEDIA_PATH + file_name]
115 | transcribe = Transcribe(args)
116 | transcribe.run()
117 | with open(
118 | os.path.join(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md"),
119 | encoding="gbk",
120 | ):
121 | self.assertTrue(True)
122 |
123 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE])
124 | def test_vad_transcribe(self, file_name):
125 |         logging.info("Check subtitle generation with --vad")
126 | args = TestArgs()
127 | args.force = True
128 | args.vad = True
129 | args.inputs = [TEST_MEDIA_PATH + file_name]
130 | transcribe = Transcribe(args)
131 | transcribe.run()
132 | self.assertTrue(
133 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md")
134 | )
135 |
--------------------------------------------------------------------------------