├── .github └── workflows │ ├── base.yml │ ├── faster-whisper │ └── lint.yml ├── .gitignore ├── Dockerfile ├── Dockerfile.cuda ├── LICENSE ├── README.md ├── autocut ├── __init__.py ├── __main__.py ├── cut.py ├── daemon.py ├── main.py ├── package_transcribe.py ├── transcribe.py ├── type.py ├── utils.py └── whisper_model.py ├── imgs └── typora.jpg ├── setup.cfg ├── setup.py ├── tea.yaml └── test ├── config.py ├── content ├── test.srt ├── test_md.md └── test_srt.srt ├── media ├── test001.mp4 ├── test001_en.mp4 ├── test002.mov ├── test003.mkv ├── test004.flv ├── test005.mp3 └── test006.MP4 ├── test_cut.py └── test_transcribe.py /.github/workflows/base.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | lint_and_test: 11 | runs-on: ${{ matrix.os }}-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.9', '3.10'] 15 | # Wait for fix on macos-m1: https://github.com/federicocarboni/setup-ffmpeg/issues/21 16 | os: [ubuntu, windows, macos-12] 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Set Variables 24 | id: set_variables 25 | shell: bash 26 | run: | 27 | echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT 28 | echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT 29 | - name: Cache PIP 30 | uses: actions/cache@v3 31 | with: 32 | path: ${{ steps.set_variables.outputs.PIP_CACHE }} 33 | key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }} 34 | 35 | - name: Setup ffmpeg for different platforms 36 | uses: FedericoCarboni/setup-ffmpeg@v3 37 | 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install . 
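# pytest is installed separately here because it is only needed to run the test suite in CI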
42 | pip install pytest 43 | - name: Run Test 44 | run: pytest test/ 45 | -------------------------------------------------------------------------------- /.github/workflows/faster-whisper: -------------------------------------------------------------------------------- 1 | name: Test Faster Whisper 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | lint_and_test: 11 | runs-on: ${{ matrix.os }}-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.9', '3.10'] 15 | # macos did not support m1 for now 16 | os: [ubuntu, windows, macos] 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Set Variables 24 | id: set_variables 25 | shell: bash 26 | run: | 27 | echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT 28 | echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT 29 | - name: Cache PIP 30 | uses: actions/cache@v3 31 | with: 32 | path: ${{ steps.set_variables.outputs.PIP_CACHE }} 33 | key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }} 34 | 35 | - name: Setup ffmpeg for differnt platforms 36 | uses: FedericoCarboni/setup-ffmpeg@master 37 | 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | pip install ".[faster]" 42 | pip install pytest 43 | - name: Run Test 44 | run: WHISPER_MODE=faster pytest test/ 45 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Test Lint 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | lint: 11 | runs-on: ${{ matrix.os }}-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.9'] 15 | os: [ubuntu] 16 | steps: 17 | - uses: actions/checkout@v3 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v4 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Set Variables 23 | id: set_variables 24 | shell: bash 25 | run: | 26 | echo "PY=$(python -c 'import hashlib, sys;print(hashlib.sha256(sys.version.encode()+sys.executable.encode()).hexdigest())')" >> $GITHUB_OUTPUT 27 | echo "PIP_CACHE=$(pip cache dir)" >> $GITHUB_OUTPUT 28 | - name: Cache PIP 29 | uses: actions/cache@v3 30 | with: 31 | path: ${{ steps.set_variables.outputs.PIP_CACHE }} 32 | key: ${{ runner.os }}-pip-${{ steps.set_variables.outputs.PY }} 33 | 34 | - name: Install dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | pip install black 38 | 39 | - name: Run Lint 40 | run: black . 
--check -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | log/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim as base 2 | 3 | RUN mkdir /autocut 4 | COPY ./ /autocut 5 | WORKDIR /autocut 6 | 7 | RUN apt update && \ 8 | apt install -y git && \ 9 | apt install -y ffmpeg 10 | 11 | RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu && \ 12 | pip install . -------------------------------------------------------------------------------- /Dockerfile.cuda: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime 2 | 3 | RUN mkdir /autocut 4 | COPY ./ /autocut 5 | WORKDIR /autocut 6 | 7 | RUN apt update && \ 8 | apt install -y git && \ 9 | apt install -y ffmpeg 10 | 11 | RUN pip install . 
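A minimal sketch of building and using these images, assuming your recordings live in a local folder that is mounted at `/autocut/video` as in the README's Docker section; the host path below is an illustrative placeholder.

```bash
# Build the CPU image from the repository root
docker build -t autocut .

# Or build the GPU image (requires an Nvidia GPU with drivers; add --gpus all when running)
docker build -f ./Dockerfile.cuda -t autocut-gpu .

# Mount a folder with your videos and let AutoCut watch it (daemon mode)
docker run -it --rm -v /path/to/videos:/autocut/video autocut \
    autocut -d /autocut/video
```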
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoCut: 通过字幕来剪切视频 2 | 3 | AutoCut 对你的视频自动生成字幕。然后你选择需要保留的句子,AutoCut 将对你视频中对应的片段裁切并保存。你无需使用视频编辑软件,只需要编辑文本文件即可完成剪切。 4 | 5 | **2024.10.05更新**:支持 `large-v3-turbo` [模型](https://github.com/openai/whisper/discussions/2363),提供更快的转录速度。 6 | 7 | ```shell 8 | autocut -t xxx --whisper-model large-v3-turbo 9 | ```` 10 | 11 | **2024.03.10更新**:支持 pip 安装和提供 import 转录相关的功能 12 | 13 | ```shell 14 | # Install 15 | pip install autocut-sub 16 | ``` 17 | 18 | ```python 19 | from autocut import Transcribe, load_audio 20 | ``` 21 | 22 | 23 | **2023.10.14更新**:支持 faster-whisper 和指定依赖(但由于 Action 限制暂时移除了 faster-whisper 的测试运行) 24 | 25 | ```shell 26 | # for whisper only 27 | pip install . 28 | 29 | # for whisper and faster-whisper 30 | pip install '.[faster]' 31 | 32 | # for whisper and openai-whisper 33 | pip install '.[openai]' 34 | 35 | # for all 36 | pip install '.[all]' 37 | ``` 38 | 39 | ```shell 40 | # using faster-whisper 41 | autocut -t xxx --whisper-mode=faster 42 | ``` 43 | 44 | ```shell 45 | # using openai api 46 | export OPENAI_API_KEY=sk-xxx 47 | autocut -t xxx --whisper-mode=openai --openai-rpm=3 48 | ``` 49 | 50 | **2023.8.13更新**:支持调用 Openai Whisper API 51 | ```shell 52 | export OPENAI_API_KEY=sk-xxx 53 | autocut -t xxx --whisper-mode=openai --openai-rpm=3 54 | ``` 55 | 56 | ## 使用例子 57 | 58 | 假如你录制的视频放在 `2022-11-04/` 这个文件夹里。那么运行 59 | 60 | ```bash 61 | autocut -d 2022-11-04 62 | ``` 63 | 64 | > 提示:如果你使用 OBS 录屏,可以在 `设置->高级->录像->文件名格式` 中将空格改成 `/`,即 `%CCYY-%MM-%DD/%hh-%mm-%ss`。那么视频文件将放在日期命名的文件夹里。 65 | 66 | AutoCut 将持续对这个文件夹里视频进行字幕抽取和剪切。例如,你刚完成一个视频录制,保存在 `11-28-18.mp4`。AutoCut 将生成 `11-28-18.md`。你在里面选择需要保留的句子后,AutoCut 将剪切出 `11-28-18_cut.mp4`,并生成 `11-28-18_cut.md` 来预览结果。 67 | 68 | 你可以使用任何的 Markdown 编辑器。例如我常用 VS Code 和 Typora。下图是通过 Typora 来对 `11-28-18.md` 编辑。 69 | 70 | ![](imgs/typora.jpg) 71 | 72 | 全部完成后在 `autocut.md` 里选择需要拼接的视频后,AutoCut 将输出 `autocut_merged.mp4` 和对应的字幕文件。 73 | 74 | ## 安装 75 | 76 | 首先安装 Python 包 77 | 78 | ``` 79 | pip install git+https://github.com/mli/autocut.git 80 | ``` 81 | 82 | ## 本地安装测试 83 | 84 | 85 | ``` 86 | git clone https://github.com/mli/autocut 87 | cd autocut 88 | pip install . 
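# Optional, for development: an editable install picks up local code changes
# without reinstalling the package (see the contributing section below)
# pip install -e .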
89 | ``` 90 | 91 | 92 | > 上面将安装 [pytorch](https://pytorch.org/)。如果你需要 GPU 运行,且默认安装的版本不匹配的话,你可以先安装 Pytorch。如果安装 Whipser 出现问题,请参考[官方文档](https://github.com/openai/whisper#setup)。 93 | 94 | 另外需要安装 [ffmpeg](https://ffmpeg.org/) 95 | 96 | ``` 97 | # on Ubuntu or Debian 98 | sudo apt update && sudo apt install ffmpeg 99 | 100 | # on Arch Linux 101 | sudo pacman -S ffmpeg 102 | 103 | # on MacOS using Homebrew (https://brew.sh/) 104 | brew install ffmpeg 105 | 106 | # on Windows using Scoop (https://scoop.sh/) 107 | scoop install ffmpeg 108 | ``` 109 | 110 | ## Docker 安装 111 | 112 | 首先将项目克隆到本地。 113 | 114 | ```bash 115 | git clone https://github.com/mli/autocut.git 116 | ``` 117 | 118 | ### 安装 CPU 版本 119 | 120 | 进入项目根目录,然后构建 docker 映像。 121 | 122 | ```bash 123 | docker build -t autocut . 124 | ``` 125 | 126 | 运行下面的命令创建 docker 容器,就可以直接使用了。 127 | 128 | ```bash 129 | docker run -it --rm -v E:\autocut:/autocut/video autocut /bin/bash 130 | ``` 131 | 132 | 其中 `-v` 是将主机存放视频的文件夹 `E:\autocut` 映射到虚拟机的 `/autocut/video` 目录。`E:\autocut` 是主机存放视频的目录,需修改为自己主机存放视频的目录。 133 | 134 | ### 安装 GPU 版本 135 | 136 | 使用 GPU 加速需要主机有 Nvidia 的显卡并安装好相应驱动。然后在项目根目录,执行下面的命令构建 docker 映像。 137 | 138 | ```bash 139 | docker build -f ./Dockerfile.cuda -t autocut-gpu . 140 | ``` 141 | 142 | 使用 GPU 加速时,运行 docker 容器需添加参数 `--gpus all`。 143 | 144 | ```bash 145 | docker run --gpus all -it --rm -v E:\autocut:/autocut/video autocut-gpu 146 | ``` 147 | 148 | ## 更多使用选项 149 | 150 | ### 转录某个视频生成 `.srt` 和 `.md` 结果。 151 | 152 | ```bash 153 | autocut -t 22-52-00.mp4 154 | ``` 155 | 156 | 1. 如果对转录质量不满意,可以使用更大的模型,例如 157 | 158 | ```bash 159 | autocut -t 22-52-00.mp4 --whisper-model large 160 | ``` 161 | 162 | 默认是 `small`。更好的模型是 `medium` 和 `large`,但推荐使用 GPU 获得更好的速度。也可以使用更快的 `tiny` 和 `base`,但转录质量会下降。 163 | 164 | 165 | ### 剪切某个视频 166 | 167 | ```bash 168 | autocut -c 22-52-00.mp4 22-52-00.srt 22-52-00.md 169 | ``` 170 | 171 | 1. 默认视频比特率是 `--bitrate 10m`,你可以根据需要调大调小。 172 | 2. 如果不习惯 Markdown 格式文件,你也可以直接在 `srt` 文件里删除不要的句子,在剪切时不传入 `md` 文件名即可。就是 `autocut -c 22-52-00.mp4 22-52-00.srt` 173 | 3. 如果仅有 `srt` 文件,编辑不方便可以使用如下命令生成 `md` 文件,然后编辑 `md` 文件即可,但此时会完全对照 `srt` 生成,不会出现 `no speech` 等提示文本。 174 | 175 | ```bash 176 | autocut -m test.srt test.mp4 177 | autocut -m test.mp4 test.srt # 支持视频和字幕乱序传入 178 | autocut -m test.srt # 也可以只传入字幕文件 179 | ``` 180 | 181 | 182 | ### 一些小提示 183 | 184 | 185 | 1. 讲得流利的视频的转录质量会高一些,这因为是 Whisper 训练数据分布的缘故。对一个视频,你可以先粗选一下句子,然后在剪出来的视频上再剪一次。 186 | 2. 最终视频生成的字幕通常还需要做一些小编辑。但 `srt` 里面空行太多。你可以使用 `autocut -s 22-52-00.srt` 来生成一个紧凑些的版本 `22-52-00_compact.srt` 方便编辑(这个格式不合法,但编辑器,例如 VS Code,还是会进行语法高亮)。编辑完成后,`autocut -s 22-52-00_compact.srt` 转回正常格式。 187 | 3. 用 Typora 和 VS Code 编辑 Markdown 都很方便。他们都有对应的快捷键 mark 一行或者多行。但 VS Code 视频预览似乎有点问题。 188 | 4. 视频是通过 ffmpeg 导出。在 Apple M1 芯片上它用不了 GPU,导致导出速度不如专业视频软件。 189 | 190 | ### 常见问题 191 | 192 | 1. **输出的是乱码?** 193 | 194 | AutoCut 默认输出编码是 `utf-8`. 确保你的编辑器也使用了 `utf-8` 解码。你可以通过 `--encoding` 指定其他编码格式。但是需要注意生成字幕文件和使用字幕文件剪辑时的编码格式需要一致。例如使用 `gbk`。 195 | 196 | ```bash 197 | autocut -t test.mp4 --encoding=gbk 198 | autocut -c test.mp4 test.srt test.md --encoding=gbk 199 | ``` 200 | 201 | 如果使用了其他编码格式(如 `gbk` 等)生成 `md` 文件并用 Typora 打开后,该文件可能会被 Typora 自动转码为其他编码格式,此时再通过生成时指定的编码格式进行剪辑时可能会出现编码不支持等报错。因此可以在使用 Typora 编辑后再通过 VSCode 等修改到你需要的编码格式进行保存后再使用剪辑功能。 202 | 203 | 2. **如何使用 GPU 来转录?** 204 | 205 | 当你有 Nvidia GPU,而且安装了对应版本的 PyTorch 的时候,转录是在 GPU 上进行。你可以通过命令来查看当前是不是支持 GPU。 206 | 207 | ```bash 208 | python -c "import torch; print(torch.cuda.is_available())" 209 | ``` 210 | 211 | 否则你可以在安装 AutoCut 前手动安装对应的 GPU 版本 PyTorch。 212 | 213 | 3. 
**使用 GPU 时报错显存不够。** 214 | 215 | whisper 的大模型需要一定的 GPU 显存。如果你的显存不够,你可以用小一点的模型,例如 `small`。如果你仍然想用大模型,可以通过 `--device` 来强制使用 CPU。例如 216 | 217 | ```bash 218 | autocut -t 11-28-18.mp4 --whisper-model large --device cpu 219 | ``` 220 | 221 | 4. **能不能使用 `pip` 安装?** 222 | 223 | whisper已经发布到PyPI了,可以直接用`pip install openai-whisper`安装。 224 | 225 | [https://github.com/openai/whisper#setup](https://github.com/openai/whisper#setup) 226 | 227 | [https://pypi.org/project/openai-whisper/](https://pypi.org/project/openai-whisper/) 228 | 229 | ## 如何参与贡献 230 | 231 | [这里有一些想做的 feature](https://github.com/mli/autocut/issues/22),欢迎贡献。 232 | 233 | ### 代码结构 234 | ```text 235 | autocut 236 | │ .gitignore 237 | │ LICENSE 238 | │ README.md # 一般新增或修改需要让使用者知道就需要对应更新 README.md 内容 239 | │ setup.py 240 | │ 241 | └─autocut # 核心代码位于 autocut 文件夹中,新增功能的实现也一般在这里面进行修改或新增 242 | │ cut.py 243 | │ daemon.py 244 | │ main.py 245 | │ transcribe.py 246 | │ utils.py 247 | └─ __init__.py 248 | 249 | ``` 250 | 251 | ### 安装依赖 252 | 开始安装这个项目的需要的依赖之前,建议先了解一下 Anaconda 或者 venv 的虚拟环境使用,推荐**使用虚拟环境来搭建该项目的开发环境**。 253 | 具体安装方式为在你搭建搭建的虚拟环境之中按照[上方安装步骤](./README.md#安装)进行安装。 254 | 255 | > 为什么推荐使用虚拟环境开发? 256 | > 257 | > 一方面是保证各种不同的开发环境之间互相不污染。 258 | > 259 | > 更重要的是在于这个项目实际上是一个 Python Package,所以在你安装之后 AutoCut 的代码实际也会变成你的环境依赖。 260 | > **因此在你更新代码之后,你需要让将新代码重新安装到环境中,然后才能调用到新的代码。** 261 | 262 | ### 开发 263 | 264 | 1. 代码风格目前遵循 PEP-8,可以使用相关的自动格式化软件完成。 265 | 2. `utils.py` 主要是全局共用的一些工具方法。 266 | 3. `transcribe.py` 是调用模型生成`srt`和`md`的部分。 267 | 4. `cut.py` 提供根据标记后`md`或`srt`进行视频剪切合并的功能。 268 | 5. `daemon.py` 提供的是监听文件夹生成字幕和剪切视频的功能。 269 | 6. `main.py` 声明命令行参数,根据输入参数调用对应功能。 270 | 271 | 开发过程中请尽量保证修改在正确的地方,以及合理地复用代码, 272 | 同时工具函数请尽可能放在`utils.py`中。 273 | 代码格式目前是遵循 PEP-8,变量命名尽量语义化即可。 274 | 275 | 在开发完成之后,最重要的一点是需要进行**测试**,请保证提交之前对所有**与你修改直接相关的部分**以及**你修改会影响到的部分**都进行了测试,并保证功能的正常。 276 | 目前使用 `GitHub Actions` CI, Lint 使用 black 提交前请运行 `black`。 277 | 278 | ### 提交 279 | 280 | 1. commit 信息用英文描述清楚你做了哪些修改即可,小写字母开头。 281 | 2. 最好可以保证一次的 commit 涉及的修改比较小,可以简短地描述清楚,这样也方便之后有修改时的查找。 282 | 3. PR 的时候 title 简述有哪些修改, contents 可以具体写下修改内容。 283 | 4. run test `pip install pytest` then `pytest test` 284 | 5. run lint `pip install black` then `black .` 285 | -------------------------------------------------------------------------------- /autocut/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.0" 2 | 3 | from .type import LANG, WhisperModel, WhisperMode 4 | from .utils import load_audio 5 | from .package_transcribe import Transcribe 6 | 7 | __all__ = ["Transcribe", "load_audio", "WhisperMode", "WhisperModel", "LANG"] 8 | -------------------------------------------------------------------------------- /autocut/__main__.py: -------------------------------------------------------------------------------- 1 | from .main import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /autocut/cut.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | 5 | import srt 6 | from moviepy import editor 7 | 8 | from . 
import utils 9 | 10 | 11 | # Merge videos 12 | class Merger: 13 | def __init__(self, args): 14 | self.args = args 15 | 16 | def write_md(self, videos): 17 | md = utils.MD(self.args.inputs[0], self.args.encoding) 18 | num_tasks = len(md.tasks()) 19 | # Not overwrite if already marked as down or no new videos 20 | if md.done_editing() or num_tasks == len(videos) + 1: 21 | return 22 | 23 | md.clear() 24 | md.add_done_editing(False) 25 | md.add("\nSelect the files that will be used to generate `autocut_final.mp4`\n") 26 | base = lambda fn: os.path.basename(fn) 27 | for f in videos: 28 | md_fn = utils.change_ext(f, "md") 29 | video_md = utils.MD(md_fn, self.args.encoding) 30 | # select a few words to scribe the video 31 | desc = "" 32 | if len(video_md.tasks()) > 1: 33 | for _, t in video_md.tasks()[1:]: 34 | m = re.findall(r"\] (.*)", t) 35 | if m and "no speech" not in m[0].lower(): 36 | desc += m[0] + " " 37 | if len(desc) > 50: 38 | break 39 | md.add_task( 40 | False, 41 | f'[{base(f)}]({base(md_fn)}) {"[Edited]" if video_md.done_editing() else ""} {desc}', 42 | ) 43 | md.write() 44 | 45 | def run(self): 46 | md_fn = self.args.inputs[0] 47 | md = utils.MD(md_fn, self.args.encoding) 48 | if not md.done_editing(): 49 | return 50 | 51 | videos = [] 52 | for m, t in md.tasks(): 53 | if not m: 54 | continue 55 | m = re.findall(r"\[(.*)\]", t) 56 | if not m: 57 | continue 58 | fn = os.path.join(os.path.dirname(md_fn), m[0]) 59 | logging.info(f"Loading {fn}") 60 | videos.append(editor.VideoFileClip(fn)) 61 | 62 | dur = sum([v.duration for v in videos]) 63 | logging.info(f"Merging into a video with {dur / 60:.1f} min length") 64 | 65 | merged = editor.concatenate_videoclips(videos) 66 | fn = os.path.splitext(md_fn)[0] + "_merged.mp4" 67 | merged.write_videofile( 68 | fn, audio_codec="aac", bitrate=self.args.bitrate 69 | ) # logger=None, 70 | logging.info(f"Saved merged video to {fn}") 71 | 72 | 73 | # Cut media 74 | class Cutter: 75 | def __init__(self, args): 76 | self.args = args 77 | 78 | def run(self): 79 | fns = {"srt": None, "media": None, "md": None} 80 | for fn in self.args.inputs: 81 | ext = os.path.splitext(fn)[1][1:] 82 | fns[ext if ext in fns else "media"] = fn 83 | 84 | assert fns["media"], "must provide a media filename" 85 | assert fns["srt"], "must provide a srt filename" 86 | 87 | is_video_file = utils.is_video(fns["media"].lower()) 88 | outext = "mp4" if is_video_file else "mp3" 89 | output_fn = utils.change_ext(utils.add_cut(fns["media"]), outext) 90 | if utils.check_exists(output_fn, self.args.force): 91 | return 92 | 93 | with open(fns["srt"], encoding=self.args.encoding) as f: 94 | subs = list(srt.parse(f.read())) 95 | 96 | if fns["md"]: 97 | md = utils.MD(fns["md"], self.args.encoding) 98 | if not md.done_editing(): 99 | return 100 | index = [] 101 | for mark, sent in md.tasks(): 102 | if not mark: 103 | continue 104 | m = re.match(r"\[(\d+)", sent.strip()) 105 | if m: 106 | index.append(int(m.groups()[0])) 107 | subs = [s for s in subs if s.index in index] 108 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]} and {fns["md"]}') 109 | else: 110 | logging.info(f'Cut {fns["media"]} based on {fns["srt"]}') 111 | 112 | segments = [] 113 | # Avoid disordered subtitles 114 | subs.sort(key=lambda x: x.start) 115 | for x in subs: 116 | if len(segments) == 0: 117 | segments.append( 118 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()} 119 | ) 120 | else: 121 | if x.start.total_seconds() - segments[-1]["end"] < 0.5: 122 | segments[-1]["end"] = 
x.end.total_seconds() 123 | else: 124 | segments.append( 125 | {"start": x.start.total_seconds(), "end": x.end.total_seconds()} 126 | ) 127 | 128 | if is_video_file: 129 | media = editor.VideoFileClip(fns["media"]) 130 | else: 131 | media = editor.AudioFileClip(fns["media"]) 132 | 133 | # Add a fade between two clips. Not quite necessary. keep code here for reference 134 | # fade = 0 135 | # segments = _expand_segments(segments, fade, 0, video.duration) 136 | # clips = [video.subclip( 137 | # s['start'], s['end']).crossfadein(fade) for s in segments] 138 | # final_clip = editor.concatenate_videoclips(clips, padding = -fade) 139 | 140 | clips = [media.subclip(s["start"], s["end"]) for s in segments] 141 | if is_video_file: 142 | final_clip: editor.VideoClip = editor.concatenate_videoclips(clips) 143 | logging.info( 144 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}" 145 | ) 146 | 147 | aud = final_clip.audio.set_fps(44100) 148 | final_clip = final_clip.without_audio().set_audio(aud) 149 | final_clip = final_clip.fx(editor.afx.audio_normalize) 150 | 151 | # an alternative to birate is use crf, e.g. ffmpeg_params=['-crf', '18'] 152 | final_clip.write_videofile( 153 | output_fn, audio_codec="aac", bitrate=self.args.bitrate 154 | ) 155 | else: 156 | final_clip: editor.AudioClip = editor.concatenate_audioclips(clips) 157 | logging.info( 158 | f"Reduced duration from {media.duration:.1f} to {final_clip.duration:.1f}" 159 | ) 160 | 161 | final_clip = final_clip.fx(editor.afx.audio_normalize) 162 | final_clip.write_audiofile( 163 | output_fn, codec="libmp3lame", fps=44100, bitrate=self.args.bitrate 164 | ) 165 | 166 | media.close() 167 | logging.info(f"Saved media to {output_fn}") 168 | -------------------------------------------------------------------------------- /autocut/daemon.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import glob 3 | import logging 4 | import os 5 | import time 6 | 7 | from . 
import cut, transcribe, utils 8 | 9 | 10 | class Daemon: 11 | def __init__(self, args): 12 | self.args = args 13 | self.sleep = 1 14 | 15 | def run(self): 16 | assert len(self.args.inputs) == 1, "Must provide a single folder" 17 | while True: 18 | self._iter() 19 | time.sleep(self.sleep) 20 | self.sleep = min(60, self.sleep + 1) 21 | 22 | def _iter(self): 23 | folder = self.args.inputs[0] 24 | files = sorted(list(glob.glob(os.path.join(folder, "*")))) 25 | media_files = [f for f in files if utils.is_video(f) or utils.is_audio(f)] 26 | args = copy.deepcopy(self.args) 27 | for f in media_files: 28 | srt_fn = utils.change_ext(f, "srt") 29 | md_fn = utils.change_ext(f, "md") 30 | is_video_file = utils.is_video(f) 31 | if srt_fn not in files or md_fn not in files: 32 | args.inputs = [f] 33 | try: 34 | transcribe.Transcribe(args).run() 35 | self.sleep = 1 36 | break 37 | except RuntimeError as e: 38 | logging.warn( 39 | "Failed, may be due to the video is still on recording" 40 | ) 41 | pass 42 | if md_fn in files: 43 | if utils.add_cut(md_fn) in files: 44 | continue 45 | md = utils.MD(md_fn, self.args.encoding) 46 | ext = "mp4" if is_video_file else "mp3" 47 | if not md.done_editing() or os.path.exists( 48 | utils.change_ext(utils.add_cut(f), ext) 49 | ): 50 | continue 51 | args.inputs = [f, md_fn, srt_fn] 52 | cut.Cutter(args).run() 53 | self.sleep = 1 54 | 55 | args.inputs = [os.path.join(folder, "autocut.md")] 56 | merger = cut.Merger(args) 57 | merger.write_md(media_files) 58 | merger.run() 59 | -------------------------------------------------------------------------------- /autocut/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | from . import utils 6 | from .type import WhisperMode, WhisperModel 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser( 11 | description="Edit videos based on transcribed subtitles", 12 | formatter_class=argparse.RawDescriptionHelpFormatter, 13 | ) 14 | 15 | logging.basicConfig( 16 | format="[autocut:%(filename)s:L%(lineno)d] %(levelname)-6s %(message)s" 17 | ) 18 | logging.getLogger().setLevel(logging.INFO) 19 | 20 | parser.add_argument("inputs", type=str, nargs="+", help="Inputs filenames/folders") 21 | parser.add_argument( 22 | "-t", 23 | "--transcribe", 24 | help="Transcribe videos/audio into subtitles", 25 | action=argparse.BooleanOptionalAction, 26 | ) 27 | parser.add_argument( 28 | "-c", 29 | "--cut", 30 | help="Cut a video based on subtitles", 31 | action=argparse.BooleanOptionalAction, 32 | ) 33 | parser.add_argument( 34 | "-d", 35 | "--daemon", 36 | help="Monitor a folder to transcribe and cut", 37 | action=argparse.BooleanOptionalAction, 38 | ) 39 | parser.add_argument( 40 | "-s", 41 | help="Convert .srt to a compact format for easier editing", 42 | action=argparse.BooleanOptionalAction, 43 | ) 44 | parser.add_argument( 45 | "-m", 46 | "--to-md", 47 | help="Convert .srt to .md for easier editing", 48 | action=argparse.BooleanOptionalAction, 49 | ) 50 | parser.add_argument( 51 | "--lang", 52 | type=str, 53 | default="zh", 54 | choices=[ 55 | "zh", 56 | "en", 57 | "Afrikaans", 58 | "Arabic", 59 | "Armenian", 60 | "Azerbaijani", 61 | "Belarusian", 62 | "Bosnian", 63 | "Bulgarian", 64 | "Catalan", 65 | "Croatian", 66 | "Czech", 67 | "Danish", 68 | "Dutch", 69 | "Estonian", 70 | "Finnish", 71 | "French", 72 | "Galician", 73 | "German", 74 | "Greek", 75 | "Hebrew", 76 | "Hindi", 77 | "Hungarian", 78 | "Icelandic", 79 | "Indonesian", 80 | 
"Italian", 81 | "Japanese", 82 | "Kannada", 83 | "Kazakh", 84 | "Korean", 85 | "Latvian", 86 | "Lithuanian", 87 | "Macedonian", 88 | "Malay", 89 | "Marathi", 90 | "Maori", 91 | "Nepali", 92 | "Norwegian", 93 | "Persian", 94 | "Polish", 95 | "Portuguese", 96 | "Romanian", 97 | "Russian", 98 | "Serbian", 99 | "Slovak", 100 | "Slovenian", 101 | "Spanish", 102 | "Swahili", 103 | "Swedish", 104 | "Tagalog", 105 | "Tamil", 106 | "Thai", 107 | "Turkish", 108 | "Ukrainian", 109 | "Urdu", 110 | "Vietnamese", 111 | "Welsh", 112 | ], 113 | help="The output language of transcription", 114 | ) 115 | parser.add_argument( 116 | "--prompt", type=str, default="", help="initial prompt feed into whisper" 117 | ) 118 | parser.add_argument( 119 | "--whisper-mode", 120 | type=str, 121 | default=WhisperMode.WHISPER.value, 122 | choices=WhisperMode.get_values(), 123 | help="Whisper inference mode: whisper: run whisper locally; openai: use openai api.", 124 | ) 125 | parser.add_argument( 126 | "--openai-rpm", 127 | type=int, 128 | default=3, 129 | choices=[3, 50], 130 | help="Openai Whisper API REQUESTS PER MINUTE(FREE USERS: 3RPM; PAID USERS: 50RPM). " 131 | "More info: https://platform.openai.com/docs/guides/rate-limits/overview", 132 | ) 133 | parser.add_argument( 134 | "--whisper-model", 135 | type=str, 136 | default=WhisperModel.SMALL.value, 137 | choices=WhisperModel.get_values(), 138 | help="The whisper model used to transcribe.", 139 | ) 140 | parser.add_argument( 141 | "--bitrate", 142 | type=str, 143 | default="10m", 144 | help="The bitrate to export the cutted video, such as 10m, 1m, or 500k", 145 | ) 146 | parser.add_argument( 147 | "--vad", help="If or not use VAD", choices=["1", "0", "auto"], default="auto" 148 | ) 149 | parser.add_argument( 150 | "--force", 151 | help="Force write even if files exist", 152 | action=argparse.BooleanOptionalAction, 153 | ) 154 | parser.add_argument( 155 | "--encoding", type=str, default="utf-8", help="Document encoding format" 156 | ) 157 | parser.add_argument( 158 | "--device", 159 | type=str, 160 | default=None, 161 | choices=["cpu", "cuda"], 162 | help="Force to CPU or GPU for transcribing. 
In default automatically use GPU if available.", 163 | ) 164 | 165 | args = parser.parse_args() 166 | 167 | if args.transcribe: 168 | from .transcribe import Transcribe 169 | 170 | Transcribe(args).run() 171 | elif args.to_md: 172 | from .utils import trans_srt_to_md 173 | 174 | if len(args.inputs) == 2: 175 | [input_1, input_2] = args.inputs 176 | base, ext = os.path.splitext(input_1) 177 | if ext != ".srt": 178 | input_1, input_2 = input_2, input_1 179 | trans_srt_to_md(args.encoding, args.force, input_1, input_2) 180 | elif len(args.inputs) == 1: 181 | trans_srt_to_md(args.encoding, args.force, args.inputs[0]) 182 | else: 183 | logging.warning( 184 | "Wrong number of files, please pass in a .srt file or an additional video file" 185 | ) 186 | elif args.cut: 187 | from .cut import Cutter 188 | 189 | Cutter(args).run() 190 | elif args.daemon: 191 | from .daemon import Daemon 192 | 193 | Daemon(args).run() 194 | elif args.s: 195 | utils.compact_rst(args.inputs[0], args.encoding) 196 | else: 197 | logging.warning("No action, use -c, -t or -d") 198 | 199 | 200 | if __name__ == "__main__": 201 | main() 202 | -------------------------------------------------------------------------------- /autocut/package_transcribe.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from typing import List, Any, Union, Literal 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from . import utils, whisper_model 9 | from .type import WhisperMode, SPEECH_ARRAY_INDEX, WhisperModel, LANG 10 | 11 | 12 | class Transcribe: 13 | def __init__( 14 | self, 15 | whisper_mode: Union[ 16 | WhisperMode.WHISPER.value, WhisperMode.FASTER.value 17 | ] = WhisperMode.WHISPER.value, 18 | whisper_model_size: WhisperModel.get_values() = "small", 19 | vad: bool = True, 20 | device: Union[Literal["cpu", "cuda"], None] = None, 21 | ): 22 | self.whisper_mode = whisper_mode 23 | self.whisper_model_size = whisper_model_size 24 | self.vad = vad 25 | self.device = device 26 | self.sampling_rate = 16000 27 | self.whisper_model = None 28 | self.vad_model = None 29 | self.detect_speech = None 30 | 31 | tic = time.time() 32 | if self.whisper_model is None: 33 | if self.whisper_mode == WhisperMode.WHISPER.value: 34 | self.whisper_model = whisper_model.WhisperModel(self.sampling_rate) 35 | self.whisper_model.load(self.whisper_model_size, self.device) 36 | elif self.whisper_mode == WhisperMode.FASTER.value: 37 | self.whisper_model = whisper_model.FasterWhisperModel( 38 | self.sampling_rate 39 | ) 40 | self.whisper_model.load(self.whisper_model_size, self.device) 41 | logging.info(f"Done Init model in {time.time() - tic:.1f} sec") 42 | 43 | def run(self, audio: np.ndarray, lang: LANG, prompt: str = ""): 44 | speech_array_indices = self._detect_voice_activity(audio) 45 | transcribe_results = self._transcribe(audio, speech_array_indices, lang, prompt) 46 | return transcribe_results 47 | 48 | def format_results_to_srt(self, transcribe_results: List[Any]): 49 | return self.whisper_model.gen_srt(transcribe_results) 50 | 51 | def _detect_voice_activity(self, audio) -> List[SPEECH_ARRAY_INDEX]: 52 | """Detect segments that have voice activities""" 53 | if self.vad is False: 54 | return [{"start": 0, "end": len(audio)}] 55 | 56 | tic = time.time() 57 | if self.vad_model is None or self.detect_speech is None: 58 | # torch load limit https://github.com/pytorch/vision/issues/4156 59 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True 60 | self.vad_model, funcs = 
torch.hub.load( 61 | repo_or_dir="snakers4/silero-vad", model="silero_vad", trust_repo=True 62 | ) 63 | 64 | self.detect_speech = funcs[0] 65 | 66 | speeches = self.detect_speech( 67 | audio, self.vad_model, sampling_rate=self.sampling_rate 68 | ) 69 | 70 | # Remove too short segments 71 | speeches = utils.remove_short_segments(speeches, 1.0 * self.sampling_rate) 72 | 73 | # Expand to avoid to tight cut. You can tune the pad length 74 | speeches = utils.expand_segments( 75 | speeches, 0.2 * self.sampling_rate, 0.0 * self.sampling_rate, audio.shape[0] 76 | ) 77 | 78 | # Merge very closed segments 79 | speeches = utils.merge_adjacent_segments(speeches, 0.5 * self.sampling_rate) 80 | 81 | logging.info(f"Done voice activity detection in {time.time() - tic:.1f} sec") 82 | return speeches if len(speeches) > 1 else [{"start": 0, "end": len(audio)}] 83 | 84 | def _transcribe( 85 | self, 86 | audio: np.ndarray, 87 | speech_array_indices: List[SPEECH_ARRAY_INDEX], 88 | lang: LANG, 89 | prompt: str = "", 90 | ) -> List[Any]: 91 | tic = time.time() 92 | res = self.whisper_model.transcribe(audio, speech_array_indices, lang, prompt) 93 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec") 94 | return res 95 | -------------------------------------------------------------------------------- /autocut/transcribe.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | from typing import List, Any 5 | 6 | import numpy as np 7 | import srt 8 | import torch 9 | 10 | from . import utils, whisper_model 11 | from .type import WhisperMode, SPEECH_ARRAY_INDEX 12 | 13 | 14 | class Transcribe: 15 | def __init__(self, args): 16 | self.args = args 17 | self.sampling_rate = 16000 18 | self.whisper_model = None 19 | self.vad_model = None 20 | self.detect_speech = None 21 | 22 | tic = time.time() 23 | if self.whisper_model is None: 24 | if self.args.whisper_mode == WhisperMode.WHISPER.value: 25 | self.whisper_model = whisper_model.WhisperModel(self.sampling_rate) 26 | self.whisper_model.load(self.args.whisper_model, self.args.device) 27 | elif self.args.whisper_mode == WhisperMode.OPENAI.value: 28 | self.whisper_model = whisper_model.OpenAIModel( 29 | self.args.openai_rpm, self.sampling_rate 30 | ) 31 | self.whisper_model.load() 32 | elif self.args.whisper_mode == WhisperMode.FASTER.value: 33 | self.whisper_model = whisper_model.FasterWhisperModel( 34 | self.sampling_rate 35 | ) 36 | self.whisper_model.load(self.args.whisper_model, self.args.device) 37 | logging.info(f"Done Init model in {time.time() - tic:.1f} sec") 38 | 39 | def run(self): 40 | for input in self.args.inputs: 41 | logging.info(f"Transcribing {input}") 42 | name, _ = os.path.splitext(input) 43 | if utils.check_exists(name + ".md", self.args.force): 44 | continue 45 | 46 | audio = utils.load_audio(input, sr=self.sampling_rate) 47 | speech_array_indices = self._detect_voice_activity(audio) 48 | transcribe_results = self._transcribe(input, audio, speech_array_indices) 49 | 50 | output = name + ".srt" 51 | self._save_srt(output, transcribe_results) 52 | logging.info(f"Transcribed {input} to {output}") 53 | self._save_md(name + ".md", output, input) 54 | logging.info(f'Saved texts to {name + ".md"} to mark sentences') 55 | 56 | def _detect_voice_activity(self, audio) -> List[SPEECH_ARRAY_INDEX]: 57 | """Detect segments that have voice activities""" 58 | if self.args.vad == "0": 59 | return [{"start": 0, "end": len(audio)}] 60 | 61 | tic = time.time() 62 | if 
self.vad_model is None or self.detect_speech is None: 63 | # torch load limit https://github.com/pytorch/vision/issues/4156 64 | torch.hub._validate_not_a_forked_repo = lambda a, b, c: True 65 | self.vad_model, funcs = torch.hub.load( 66 | repo_or_dir="snakers4/silero-vad", model="silero_vad", trust_repo=True 67 | ) 68 | 69 | self.detect_speech = funcs[0] 70 | 71 | speeches = self.detect_speech( 72 | audio, self.vad_model, sampling_rate=self.sampling_rate 73 | ) 74 | 75 | # Remove too short segments 76 | speeches = utils.remove_short_segments(speeches, 1.0 * self.sampling_rate) 77 | 78 | # Expand to avoid to tight cut. You can tune the pad length 79 | speeches = utils.expand_segments( 80 | speeches, 0.2 * self.sampling_rate, 0.0 * self.sampling_rate, audio.shape[0] 81 | ) 82 | 83 | # Merge very closed segments 84 | speeches = utils.merge_adjacent_segments(speeches, 0.5 * self.sampling_rate) 85 | 86 | logging.info(f"Done voice activity detection in {time.time() - tic:.1f} sec") 87 | return speeches if len(speeches) > 1 else [{"start": 0, "end": len(audio)}] 88 | 89 | def _transcribe( 90 | self, 91 | input: str, 92 | audio: np.ndarray, 93 | speech_array_indices: List[SPEECH_ARRAY_INDEX], 94 | ) -> List[Any]: 95 | tic = time.time() 96 | res = ( 97 | self.whisper_model.transcribe( 98 | audio, speech_array_indices, self.args.lang, self.args.prompt 99 | ) 100 | if self.args.whisper_mode == WhisperMode.WHISPER.value 101 | or self.args.whisper_mode == WhisperMode.FASTER.value 102 | else self.whisper_model.transcribe( 103 | input, audio, speech_array_indices, self.args.lang, self.args.prompt 104 | ) 105 | ) 106 | 107 | logging.info(f"Done transcription in {time.time() - tic:.1f} sec") 108 | return res 109 | 110 | def _save_srt(self, output, transcribe_results): 111 | subs = self.whisper_model.gen_srt(transcribe_results) 112 | with open(output, "wb") as f: 113 | f.write(srt.compose(subs).encode(self.args.encoding, "replace")) 114 | 115 | def _save_md(self, md_fn, srt_fn, video_fn): 116 | with open(srt_fn, encoding=self.args.encoding) as f: 117 | subs = srt.parse(f.read()) 118 | 119 | md = utils.MD(md_fn, self.args.encoding) 120 | md.clear() 121 | md.add_done_editing(False) 122 | md.add_video(os.path.basename(video_fn)) 123 | md.add( 124 | f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)})." 
125 | "Mark the sentences to keep for autocut.\n" 126 | "The format is [subtitle_index,duration_in_second] subtitle context.\n\n" 127 | ) 128 | 129 | for s in subs: 130 | sec = s.start.seconds 131 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]" 132 | md.add_task(False, f"{pre:11} {s.content.strip()}") 133 | md.write() 134 | -------------------------------------------------------------------------------- /autocut/type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import TypedDict, Literal 3 | 4 | SPEECH_ARRAY_INDEX = TypedDict("SPEECH_ARRAY_INDEX", {"start": float, "end": float}) 5 | 6 | LANG = Literal[ 7 | "zh", 8 | "en", 9 | "Afrikaans", 10 | "Arabic", 11 | "Armenian", 12 | "Azerbaijani", 13 | "Belarusian", 14 | "Bosnian", 15 | "Bulgarian", 16 | "Catalan", 17 | "Croatian", 18 | "Czech", 19 | "Danish", 20 | "Dutch", 21 | "Estonian", 22 | "Finnish", 23 | "French", 24 | "Galician", 25 | "German", 26 | "Greek", 27 | "Hebrew", 28 | "Hindi", 29 | "Hungarian", 30 | "Icelandic", 31 | "Indonesian", 32 | "Italian", 33 | "Japanese", 34 | "Kannada", 35 | "Kazakh", 36 | "Korean", 37 | "Latvian", 38 | "Lithuanian", 39 | "Macedonian", 40 | "Malay", 41 | "Marathi", 42 | "Maori", 43 | "Nepali", 44 | "Norwegian", 45 | "Persian", 46 | "Polish", 47 | "Portuguese", 48 | "Romanian", 49 | "Russian", 50 | "Serbian", 51 | "Slovak", 52 | "Slovenian", 53 | "Spanish", 54 | "Swahili", 55 | "Swedish", 56 | "Tagalog", 57 | "Tamil", 58 | "Thai", 59 | "Turkish", 60 | "Ukrainian", 61 | "Urdu", 62 | "Vietnamese", 63 | "Welsh", 64 | ] 65 | 66 | 67 | class WhisperModel(Enum): 68 | TINY = "tiny" 69 | BASE = "base" 70 | SMALL = "small" 71 | MEDIUM = "medium" 72 | LARGE = "large" 73 | LARGE_V2 = "large-v2" 74 | LARGE_V3 = "large-v3" 75 | LARGE_V3_TURBO = "large-v3-turbo" 76 | 77 | @staticmethod 78 | def get_values(): 79 | return [i.value for i in WhisperModel] 80 | 81 | 82 | class WhisperMode(Enum): 83 | WHISPER = "whisper" 84 | OPENAI = "openai" 85 | FASTER = "faster" 86 | 87 | @staticmethod 88 | def get_values(): 89 | return [i.value for i in WhisperMode] 90 | -------------------------------------------------------------------------------- /autocut/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | 5 | import ffmpeg 6 | import numpy as np 7 | import opencc 8 | import srt 9 | 10 | 11 | def load_audio(file: str, sr: int = 16000) -> np.ndarray: 12 | try: 13 | out, _ = ( 14 | ffmpeg.input(file, threads=0) 15 | .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr) 16 | .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) 17 | ) 18 | except ffmpeg.Error as e: 19 | raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e 20 | 21 | return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 22 | 23 | 24 | def is_video(filename): 25 | _, ext = os.path.splitext(filename) 26 | return ext in [".mp4", ".mov", ".mkv", ".avi", ".flv", ".f4v", ".webm"] 27 | 28 | 29 | def is_audio(filename): 30 | _, ext = os.path.splitext(filename) 31 | return ext in [".ogg", ".wav", ".mp3", ".flac", ".m4a"] 32 | 33 | 34 | def change_ext(filename, new_ext): 35 | # Change the extension of filename to new_ext 36 | base, _ = os.path.splitext(filename) 37 | if not new_ext.startswith("."): 38 | new_ext = "." 
+ new_ext 39 | return base + new_ext 40 | 41 | 42 | def add_cut(filename): 43 | # Add cut mark to the filename 44 | base, ext = os.path.splitext(filename) 45 | if base.endswith("_cut"): 46 | base = base[:-4] + "_" + base[-4:] 47 | else: 48 | base += "_cut" 49 | return base + ext 50 | 51 | 52 | # a very simple markdown parser 53 | class MD: 54 | def __init__(self, filename, encoding): 55 | self.lines = [] 56 | self.EDIT_DONE_MAKR = "<-- Mark if you are done editing." 57 | self.encoding = encoding 58 | self.filename = filename 59 | if filename: 60 | self.load_file() 61 | 62 | def load_file(self): 63 | if os.path.exists(self.filename): 64 | with open(self.filename, encoding=self.encoding) as f: 65 | self.lines = f.readlines() 66 | 67 | def clear(self): 68 | self.lines = [] 69 | 70 | def write(self): 71 | with open(self.filename, "wb") as f: 72 | f.write("\n".join(self.lines).encode(self.encoding, "replace")) 73 | 74 | def tasks(self): 75 | # get all tasks with their status 76 | ret = [] 77 | for l in self.lines: 78 | mark, task = self._parse_task_status(l) 79 | if mark is not None: 80 | ret.append((mark, task)) 81 | return ret 82 | 83 | def done_editing(self): 84 | for m, t in self.tasks(): 85 | if m and self.EDIT_DONE_MAKR in t: 86 | return True 87 | return False 88 | 89 | def add(self, line): 90 | self.lines.append(line) 91 | 92 | def add_task(self, mark, contents): 93 | self.add(f'- [{"x" if mark else " "}] {contents.strip()}') 94 | 95 | def add_done_editing(self, mark): 96 | self.add_task(mark, self.EDIT_DONE_MAKR) 97 | 98 | def add_video(self, video_fn): 99 | ext = os.path.splitext(video_fn)[1][1:] 100 | self.add( 101 | f'\n\n' 102 | ) 103 | 104 | def _parse_task_status(self, line): 105 | # return (is_marked, rest) or (None, line) if not a task 106 | m = re.match(r"- +\[([ xX])\] +(.*)", line) 107 | if not m: 108 | return None, line 109 | return m.groups()[0].lower() == "x", m.groups()[1] 110 | 111 | 112 | def check_exists(output, force): 113 | if os.path.exists(output): 114 | if force: 115 | logging.info(f"{output} exists. Will overwrite it") 116 | else: 117 | logging.info( 118 | f"{output} exists, skipping... 
Use the --force flag to overwrite" 119 | ) 120 | return True 121 | return False 122 | 123 | 124 | def expand_segments(segments, expand_head, expand_tail, total_length): 125 | # Pad head and tail for each time segment 126 | results = [] 127 | for i in range(len(segments)): 128 | t = segments[i] 129 | start = max(t["start"] - expand_head, segments[i - 1]["end"] if i > 0 else 0) 130 | end = min( 131 | t["end"] + expand_tail, 132 | segments[i + 1]["start"] if i < len(segments) - 1 else total_length, 133 | ) 134 | results.append({"start": start, "end": end}) 135 | return results 136 | 137 | 138 | def remove_short_segments(segments, threshold): 139 | # Remove segments whose length < threshold 140 | return [s for s in segments if s["end"] - s["start"] > threshold] 141 | 142 | 143 | def merge_adjacent_segments(segments, threshold): 144 | # Merge two adjacent segments if their distance < threshold 145 | results = [] 146 | i = 0 147 | while i < len(segments): 148 | s = segments[i] 149 | for j in range(i + 1, len(segments)): 150 | if segments[j]["start"] < s["end"] + threshold: 151 | s["end"] = segments[j]["end"] 152 | i = j 153 | else: 154 | break 155 | i += 1 156 | results.append(s) 157 | return results 158 | 159 | 160 | def compact_rst(sub_fn, encoding): 161 | cc = opencc.OpenCC("t2s") 162 | 163 | base, ext = os.path.splitext(sub_fn) 164 | COMPACT = "_compact" 165 | if ext != ".srt": 166 | logging.fatal("only .srt file is supported") 167 | 168 | if base.endswith(COMPACT): 169 | # to original rst 170 | with open(sub_fn, encoding=encoding) as f: 171 | lines = f.readlines() 172 | subs = [] 173 | for l in lines: 174 | items = l.split(" ") 175 | if len(items) < 4: 176 | continue 177 | subs.append( 178 | srt.Subtitle( 179 | index=0, 180 | start=srt.srt_timestamp_to_timedelta(items[0]), 181 | end=srt.srt_timestamp_to_timedelta(items[2]), 182 | content=" ".join(items[3:]).strip(), 183 | ) 184 | ) 185 | with open(base[: -len(COMPACT)] + ext, "wb") as f: 186 | f.write(srt.compose(subs).encode(encoding, "replace")) 187 | else: 188 | # to a compact version 189 | with open(sub_fn, encoding=encoding) as f: 190 | subs = srt.parse(f.read()) 191 | with open(base + COMPACT + ext, "wb") as f: 192 | for s in subs: 193 | f.write( 194 | f"{srt.timedelta_to_srt_timestamp(s.start)} --> {srt.timedelta_to_srt_timestamp(s.end)} " 195 | f"{cc.convert(s.content.strip())}\n".encode(encoding, "replace") 196 | ) 197 | 198 | 199 | def trans_srt_to_md(encoding, force, srt_fn, video_fn=None): 200 | base, ext = os.path.splitext(srt_fn) 201 | if ext != ".srt": 202 | logging.fatal("only .srt file is supported") 203 | md_fn = base + ext.split(".")[0] + ".md" 204 | 205 | check_exists(md_fn, force) 206 | 207 | with open(srt_fn, encoding=encoding) as f: 208 | subs = srt.parse(f.read()) 209 | 210 | md = MD(md_fn, encoding) 211 | md.clear() 212 | md.add_done_editing(False) 213 | if video_fn: 214 | if not is_video(video_fn): 215 | logging.fatal(f"{video_fn} may not be a video") 216 | md.add_video(os.path.basename(video_fn)) 217 | md.add( 218 | f"\nTexts generated from [{os.path.basename(srt_fn)}]({os.path.basename(srt_fn)})." 
219 | "Mark the sentences to keep for autocut.\n" 220 | "The format is [subtitle_index,duration_in_second] subtitle context.\n\n" 221 | ) 222 | 223 | for s in subs: 224 | sec = s.start.seconds 225 | pre = f"[{s.index},{sec // 60:02d}:{sec % 60:02d}]" 226 | md.add_task(False, f"{pre:11} {s.content.strip()}") 227 | md.write() 228 | -------------------------------------------------------------------------------- /autocut/whisper_model.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import os 4 | from abc import ABC, abstractmethod 5 | from typing import Literal, Union, List, Any, TypedDict 6 | 7 | import numpy as np 8 | import opencc 9 | import srt 10 | from pydub import AudioSegment 11 | from tqdm import tqdm 12 | 13 | from .type import SPEECH_ARRAY_INDEX, LANG 14 | 15 | # whisper sometimes generate traditional chinese, explicitly convert 16 | cc = opencc.OpenCC("t2s") 17 | 18 | 19 | class AbstractWhisperModel(ABC): 20 | def __init__(self, mode, sample_rate=16000): 21 | self.mode = mode 22 | self.whisper_model = None 23 | self.sample_rate = sample_rate 24 | 25 | @abstractmethod 26 | def load(self, *args, **kwargs): 27 | pass 28 | 29 | @abstractmethod 30 | def transcribe(self, *args, **kwargs): 31 | pass 32 | 33 | @abstractmethod 34 | def _transcribe(self, *args, **kwargs): 35 | pass 36 | 37 | @abstractmethod 38 | def gen_srt(self, transcribe_results: List[Any]) -> List[srt.Subtitle]: 39 | pass 40 | 41 | 42 | class WhisperModel(AbstractWhisperModel): 43 | def __init__(self, sample_rate=16000): 44 | super().__init__("whisper", sample_rate) 45 | self.device = None 46 | 47 | def load( 48 | self, 49 | model_name: Literal[ 50 | "tiny", "base", "small", "medium", "large", "large-v2" 51 | ] = "small", 52 | device: Union[Literal["cpu", "cuda"], None] = None, 53 | ): 54 | self.device = device 55 | 56 | import whisper 57 | 58 | self.whisper_model = whisper.load_model(model_name, device) 59 | 60 | def _transcribe(self, audio, seg, lang, prompt): 61 | r = self.whisper_model.transcribe( 62 | audio[int(seg["start"]) : int(seg["end"])], 63 | task="transcribe", 64 | language=lang, 65 | initial_prompt=prompt, 66 | ) 67 | r["origin_timestamp"] = seg 68 | return r 69 | 70 | def transcribe( 71 | self, 72 | audio: np.ndarray, 73 | speech_array_indices: List[SPEECH_ARRAY_INDEX], 74 | lang: LANG, 75 | prompt: str, 76 | ): 77 | res = [] 78 | if self.device == "cpu" and len(speech_array_indices) > 1: 79 | from multiprocessing import Pool 80 | 81 | pbar = tqdm(total=len(speech_array_indices)) 82 | 83 | pool = Pool(processes=4) 84 | sub_res = [] 85 | # TODO, a better way is merging these segments into a single one, so whisper can get more context 86 | for seg in speech_array_indices: 87 | sub_res.append( 88 | pool.apply_async( 89 | self._transcribe, 90 | ( 91 | self.whisper_model, 92 | audio, 93 | seg, 94 | lang, 95 | prompt, 96 | ), 97 | callback=lambda x: pbar.update(), 98 | ) 99 | ) 100 | pool.close() 101 | pool.join() 102 | pbar.close() 103 | res = [i.get() for i in sub_res] 104 | else: 105 | for seg in ( 106 | speech_array_indices 107 | if len(speech_array_indices) == 1 108 | else tqdm(speech_array_indices) 109 | ): 110 | r = self.whisper_model.transcribe( 111 | audio[int(seg["start"]) : int(seg["end"])], 112 | task="transcribe", 113 | language=lang, 114 | initial_prompt=prompt, 115 | verbose=False if len(speech_array_indices) == 1 else None, 116 | ) 117 | r["origin_timestamp"] = seg 118 | res.append(r) 119 | return res 120 | 121 | def 
gen_srt(self, transcribe_results): 122 | subs = [] 123 | 124 | def _add_sub(start, end, text): 125 | subs.append( 126 | srt.Subtitle( 127 | index=0, 128 | start=datetime.timedelta(seconds=start), 129 | end=datetime.timedelta(seconds=end), 130 | content=cc.convert(text.strip()), 131 | ) 132 | ) 133 | 134 | prev_end = 0 135 | for r in transcribe_results: 136 | origin = r["origin_timestamp"] 137 | for s in r["segments"]: 138 | start = s["start"] + origin["start"] / self.sample_rate 139 | end = min( 140 | s["end"] + origin["start"] / self.sample_rate, 141 | origin["end"] / self.sample_rate, 142 | ) 143 | if start > end: 144 | continue 145 | # mark any empty segment that is not very short 146 | if start > prev_end + 1.0: 147 | _add_sub(prev_end, start, "< No Speech >") 148 | _add_sub(start, end, s["text"]) 149 | prev_end = end 150 | 151 | return subs 152 | 153 | 154 | class OpenAIModel(AbstractWhisperModel): 155 | max_single_audio_bytes = 25 * 2**20 # 25MB 156 | split_audio_bytes = 23 * 2**20 # 23MB, keep 2MB of headroom for safety (header, etc.) 157 | rpm = 3 158 | 159 | def __init__(self, rpm: int, sample_rate=16000): 160 | super().__init__("openai_whisper-1", sample_rate) 161 | self.rpm = rpm 162 | if ( 163 | os.environ.get("OPENAI_API_KEY") is None 164 | and os.environ.get("OPENAI_API_KEY_PATH") is None 165 | ): 166 | raise Exception("OPENAI_API_KEY is not set") 167 | 168 | def load(self, model_name: Literal["whisper-1"] = "whisper-1"): 169 | try: 170 | import openai 171 | except ImportError: 172 | raise Exception( 173 | "Please use openai mode (pip install '.[openai]') or all mode (pip install '.[all]')" 174 | ) 175 | from functools import partial 176 | 177 | self.whisper_model = partial(openai.Audio.transcribe, model=model_name) 178 | 179 | def transcribe( 180 | self, 181 | input: srt, 182 | audio: np.ndarray, 183 | speech_array_indices: List[SPEECH_ARRAY_INDEX], 184 | lang: LANG, 185 | prompt: str, 186 | ) -> List[srt.Subtitle]: 187 | res = [] 188 | name, _ = os.path.splitext(input) 189 | raw_audio = AudioSegment.from_file(input) 190 | ms_bytes = len(raw_audio[:1].raw_data) 191 | audios: List[ 192 | TypedDict( 193 | "AudioInfo", {"input": str, "audio": AudioSegment, "start_ms": float} 194 | ) 195 | ] = [] 196 | 197 | i = 0 198 | for index in speech_array_indices: 199 | start = int(index["start"]) / self.sample_rate * 1000 200 | end = int(index["end"]) / self.sample_rate * 1000 201 | audio_seg = raw_audio[start:end] 202 | if len(audio_seg.raw_data) < self.split_audio_bytes: 203 | temp_file = f"{name}_temp_{i}.wav" 204 | audios.append( 205 | {"input": temp_file, "audio": audio_seg, "start_ms": start} 206 | ) 207 | else: 208 | logging.info( 209 | f"Long audio with a size ({len(audio_seg.raw_data)} bytes) greater than 25M ({25 * 2 ** 20} bytes) " 210 | "will be segmented " 211 | "because OpenAI's API only accepts files smaller than 25M" 212 | ) 213 | split_num = len(audio_seg.raw_data) // self.split_audio_bytes + 1 214 | for j in range(split_num): 215 | temp_file = f"{name}_{i}_temp_{j}.wav" 216 | split_audio = audio_seg[ 217 | j 218 | * self.split_audio_bytes 219 | // ms_bytes : (j + 1) 220 | * self.split_audio_bytes 221 | // ms_bytes 222 | ] 223 | audios.append( 224 | { 225 | "input": temp_file, 226 | "audio": split_audio, 227 | "start_ms": start + j * self.split_audio_bytes // ms_bytes, 228 | } 229 | ) 230 | i += 1 231 | 232 | if len(audios) > 1: 233 | from multiprocessing import Pool 234 | 235 | pbar = tqdm(total=len(audios)) 236 | 237 | pool = Pool(processes=min(8, self.rpm)) 238 | sub_res = [] 239 | for 
audio in audios: 240 | sub_res.append( 241 | pool.apply_async( 242 | self._transcribe, 243 | ( 244 | audio["input"], 245 | audio["audio"], 246 | prompt, 247 | lang, 248 | audio["start_ms"], 249 | ), 250 | callback=lambda x: pbar.update(), 251 | ) 252 | ) 253 | pool.close() 254 | pool.join() 255 | pbar.close() 256 | for subs in sub_res: 257 | subtitles = subs.get() 258 | res.extend(subtitles) 259 | else: 260 | res = self._transcribe( 261 | audios[0]["input"], 262 | audios[0]["audio"], 263 | prompt, 264 | lang, 265 | audios[0]["start_ms"], 266 | ) 267 | 268 | return res 269 | 270 | def _transcribe( 271 | self, input: srt, audio: AudioSegment, prompt: str, lang: LANG, start_ms: float 272 | ): 273 | audio.export(input, "wav") 274 | subtitles = self.whisper_model( 275 | file=open(input, "rb"), prompt=prompt, language=lang, response_format="srt" 276 | ) 277 | os.remove(input) 278 | return list( 279 | map( 280 | lambda x: ( 281 | setattr( 282 | x, "start", x.start + datetime.timedelta(milliseconds=start_ms) 283 | ), 284 | setattr( 285 | x, "end", x.end + datetime.timedelta(milliseconds=start_ms) 286 | ), 287 | x, 288 | )[-1], 289 | list(srt.parse(subtitles)), 290 | ) 291 | ) 292 | 293 | def gen_srt(self, transcribe_results: List[srt.Subtitle]): 294 | if len(transcribe_results) == 0: 295 | return [] 296 | if len(transcribe_results) == 1: 297 | return transcribe_results 298 | subs = [transcribe_results[0]] 299 | for subtitle in transcribe_results[1:]: 300 | if subtitle.start - subs[-1].end > datetime.timedelta(seconds=1): 301 | subs.append( 302 | srt.Subtitle( 303 | index=0, 304 | start=subs[-1].end, 305 | end=subtitle.start, 306 | content="< No Speech >", 307 | ) 308 | ) 309 | subs.append(subtitle) 310 | return subs 311 | 312 | 313 | class FasterWhisperModel(AbstractWhisperModel): 314 | def __init__(self, sample_rate=16000): 315 | super().__init__("faster-whisper", sample_rate) 316 | self.device = None 317 | 318 | def load( 319 | self, 320 | model_name: Literal[ 321 | "tiny", "base", "small", "medium", "large", "large-v2" 322 | ] = "small", 323 | device: Union[Literal["cpu", "cuda"], None] = None, 324 | ): 325 | try: 326 | from faster_whisper import WhisperModel 327 | except ImportError: 328 | raise Exception( 329 | "Please use faster mode(pip install '.[faster]') or all mode(pip install '.[all]')" 330 | ) 331 | 332 | self.device = device if device else "cpu" 333 | self.whisper_model = WhisperModel(model_name, self.device) 334 | 335 | def _transcribe(self): 336 | raise Exception("Not implemented") 337 | 338 | def transcribe( 339 | self, 340 | audio: np.ndarray, 341 | speech_array_indices: List[SPEECH_ARRAY_INDEX], 342 | lang: LANG, 343 | prompt: str, 344 | ): 345 | res = [] 346 | for seg in speech_array_indices: 347 | segments, info = self.whisper_model.transcribe( 348 | audio[int(seg["start"]) : int(seg["end"])], 349 | task="transcribe", 350 | language=lang, 351 | initial_prompt=prompt, 352 | vad_filter=False, 353 | ) 354 | segments = list(segments) # The transcription will actually run here. 
355 | r = {"origin_timestamp": seg, "segments": segments, "info": info} 356 | res.append(r) 357 | return res 358 | 359 | def gen_srt(self, transcribe_results): 360 | subs = [] 361 | 362 | def _add_sub(start, end, text): 363 | subs.append( 364 | srt.Subtitle( 365 | index=0, 366 | start=datetime.timedelta(seconds=start), 367 | end=datetime.timedelta(seconds=end), 368 | content=cc.convert(text.strip()), 369 | ) 370 | ) 371 | 372 | prev_end = 0 373 | for r in transcribe_results: 374 | origin = r["origin_timestamp"] 375 | for seg in r["segments"]: 376 | s = dict(start=seg.start, end=seg.end, text=seg.text) 377 | start = s["start"] + origin["start"] / self.sample_rate 378 | end = min( 379 | s["end"] + origin["start"] / self.sample_rate, 380 | origin["end"] / self.sample_rate, 381 | ) 382 | if start > end: 383 | continue 384 | # mark any empty segment that is not very short 385 | if start > prev_end + 1.0: 386 | _add_sub(prev_end, start, "< No Speech >") 387 | _add_sub(start, end, s["text"]) 388 | prev_end = end 389 | 390 | return subs 391 | -------------------------------------------------------------------------------- /imgs/typora.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/imgs/typora.jpg -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = autocut 3 | version = attr: autocut.__version__ 4 | license = Apache Software License 5 | description = Cut video by subtitles 6 | long_description = file: README.md 7 | classifiers = 8 | License :: OSI Approved :: Apache Software License 9 | Operating System :: OS Independent 10 | Programming Language :: Python :: 3 11 | 12 | [options] 13 | packages = find: 14 | include_package_data = True 15 | python_requires = >= 3.9 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | requirements = [ 4 | "ffmpeg-python", 5 | "moviepy", 6 | "openai-whisper", 7 | "opencc-python-reimplemented", 8 | "parameterized", 9 | "pydub", 10 | "srt", 11 | "torchaudio", 12 | "tqdm", 13 | ] 14 | 15 | 16 | setup( 17 | name="autocut-sub", 18 | install_requires=requirements, 19 | url="https://github.com/mli/autocut", 20 | project_urls={ 21 | "source": "https://github.com/mli/autocut", 22 | }, 23 | license="Apache License 2.0", 24 | long_description=open("README.md", "r", encoding="utf-8").read(), 25 | long_description_content_type="text/markdown", 26 | extras_require={ 27 | "all": ["openai", "faster-whisper"], 28 | "openai": ["openai"], 29 | "faster": ["faster-whisper"], 30 | }, 31 | packages=find_packages(), 32 | entry_points={ 33 | "console_scripts": [ 34 | "autocut = autocut.main:main", 35 | ] 36 | }, 37 | ) 38 | -------------------------------------------------------------------------------- /tea.yaml: -------------------------------------------------------------------------------- 1 | # https://tea.xyz/what-is-this-file 2 | --- 3 | version: 1.0.0 4 | codeOwners: 5 | - '0x1e292d6f2D09dc8ffDDb5B8Fd6b641e180224D84' 6 | quorum: 1 7 | -------------------------------------------------------------------------------- /test/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 
import os 3 | 4 | # Create the logger 5 | logger = logging.getLogger() 6 | # Set the logger level; if unset, only warning and above are collected by default 7 | logger.setLevel("DEBUG") 8 | # Set the log format 9 | fmt = logging.Formatter("%(filename)s-%(lineno)d-%(asctime)s-%(levelname)s-%(message)s") 10 | # Set up a handler that writes to a file with the given encoding 11 | if not os.path.exists("./log"): 12 | os.makedirs("./log") 13 | file_handler = logging.FileHandler("./log/log.txt", encoding="utf-8") 14 | # Set the handler level 15 | file_handler.setLevel("DEBUG") 16 | # The handler emits logs in the specified format 17 | file_handler.setFormatter(fmt) 18 | # Also output to the console 19 | ch = logging.StreamHandler() 20 | # Set the handler level 21 | ch.setLevel("DEBUG") 22 | # The handler emits logs in the specified format 23 | ch.setFormatter(fmt) 24 | # Attach the handlers to the logger to define the output channels 25 | # Log to the file 26 | logger.addHandler(file_handler) 27 | # Log to the console 28 | logger.addHandler(ch) 29 | 30 | TEST_MEDIA_PATH = "./test/media/" 31 | TEST_CONTENT_PATH = "./test/content/" 32 | TEST_MEDIA_FILE = [ 33 | "test001.mp4", 34 | "test002.mov", 35 | "test003.mkv", 36 | "test004.flv", 37 | "test005.mp3", 38 | "test006.MP4", 39 | ] 40 | 41 | TEST_MEDIA_FILE_LANG = ["test001_en.mp4"] 42 | TEST_MEDIA_FILE_SIMPLE = ["test001.mp4", "test005.mp3"] 43 | 44 | 45 | class TestArgs: 46 | def __init__(self): 47 | self.inputs = [] 48 | self.bitrate = "10m" 49 | self.encoding = "utf-8" 50 | self.sampling_rate = 16000 51 | self.lang = "zh" 52 | self.prompt = "" 53 | self.whisper_model = "small" 54 | self.device = None 55 | self.vad = False 56 | self.force = False 57 | self.whisper_mode = ( 58 | "faster" if os.environ.get("WHISPER_MODE") == "faster" else "whisper" 59 | ) 60 | self.openai_rpm = 3 61 | -------------------------------------------------------------------------------- /test/content/test.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:05,000 3 | 大家好,我的名字是AutoCut.这是一条用于测试的视频。 4 | 5 | 2 6 | 00:00:05,000 --> 00:00:10,260 7 | Hello, my name is AutoCut. This is a video for testing. 8 | 9 | -------------------------------------------------------------------------------- /test/content/test_md.md: -------------------------------------------------------------------------------- 1 | - [x] <-- Mark if you are done editing. 2 | 3 | 4 | 5 | Texts generated from [test001.srt](test001.srt).Mark the sentences to keep for autocut. 6 | The format is [subtitle_index,duration_in_second] subtitle context. 7 | 8 | - [ ] [1,00:00] 大家好,我的名字是AutoCut.这是一条用于测试的视频。 9 | - [x] [2,00:05] Hello, my name is AutoCut. This is a video for testing. 
10 | -------------------------------------------------------------------------------- /test/content/test_srt.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,000 --> 00:00:05,000 3 | 大家好,我的名字是AutoCut.这是一条用于测试的视频。 4 | 5 | -------------------------------------------------------------------------------- /test/media/test001.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test001.mp4 -------------------------------------------------------------------------------- /test/media/test001_en.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test001_en.mp4 -------------------------------------------------------------------------------- /test/media/test002.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test002.mov -------------------------------------------------------------------------------- /test/media/test003.mkv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test003.mkv -------------------------------------------------------------------------------- /test/media/test004.flv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test004.flv -------------------------------------------------------------------------------- /test/media/test005.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test005.mp3 -------------------------------------------------------------------------------- /test/media/test006.MP4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mli/autocut/ba2bb3bfbd57454727780eafad2861d66af58567/test/media/test006.MP4 -------------------------------------------------------------------------------- /test/test_cut.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from parameterized import parameterized, param 6 | 7 | from autocut.cut import Cutter 8 | from config import TestArgs, TEST_MEDIA_PATH, TEST_MEDIA_FILE_SIMPLE, TEST_CONTENT_PATH 9 | 10 | 11 | class TestCut(unittest.TestCase): 12 | @classmethod 13 | def setUpClass(cls): 14 | logging.info("检查测试文件是否正常存在") 15 | scan_file = os.listdir(TEST_MEDIA_PATH) 16 | logging.info( 17 | "应存在文件列表:" 18 | + str(TEST_MEDIA_FILE_SIMPLE) 19 | + " 扫描到文件列表:" 20 | + str(scan_file) 21 | ) 22 | for file in TEST_MEDIA_FILE_SIMPLE: 23 | assert file in scan_file 24 | 25 | def tearDown(self): 26 | for file in TEST_MEDIA_FILE_SIMPLE: 27 | namepart = os.path.join( 28 | TEST_MEDIA_PATH, os.path.splitext(file)[0] + "_cut." 
29 | ) 30 | if os.path.exists(namepart + "mp4"): 31 | os.remove(namepart + "mp4") 32 | if os.path.exists(namepart + "mp3"): 33 | os.remove(namepart + "mp3") 34 | 35 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 36 | def test_srt_cut(self, file_name): 37 | args = TestArgs() 38 | args.inputs = [ 39 | os.path.join(TEST_MEDIA_PATH, file_name), 40 | os.path.join(TEST_CONTENT_PATH, "test_srt.srt"), 41 | ] 42 | cut = Cutter(args) 43 | cut.run() 44 | namepart = os.path.join( 45 | TEST_MEDIA_PATH, os.path.splitext(file_name)[0] + "_cut." 46 | ) 47 | self.assertTrue( 48 | os.path.exists(namepart + "mp4") or os.path.exists(namepart + "mp3") 49 | ) 50 | 51 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 52 | def test_md_cut(self, file_name): 53 | args = TestArgs() 54 | args.inputs = [ 55 | TEST_MEDIA_PATH + file_name, 56 | os.path.join(TEST_CONTENT_PATH, "test.srt"), 57 | os.path.join(TEST_CONTENT_PATH, "test_md.md"), 58 | ] 59 | cut = Cutter(args) 60 | cut.run() 61 | namepart = os.path.join( 62 | TEST_MEDIA_PATH, os.path.splitext(file_name)[0] + "_cut." 63 | ) 64 | self.assertTrue( 65 | os.path.exists(namepart + "mp4") or os.path.exists(namepart + "mp3") 66 | ) 67 | -------------------------------------------------------------------------------- /test/test_transcribe.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import unittest 4 | 5 | from parameterized import parameterized, param 6 | 7 | from autocut.utils import MD 8 | from config import ( 9 | TEST_MEDIA_FILE, 10 | TestArgs, 11 | TEST_MEDIA_FILE_SIMPLE, 12 | TEST_MEDIA_FILE_LANG, 13 | TEST_MEDIA_PATH, 14 | ) 15 | from autocut.transcribe import Transcribe 16 | 17 | 18 | class TestTranscribe(unittest.TestCase): 19 | @classmethod 20 | def setUpClass(cls): 21 | logging.info("检查测试文件是否正常存在") 22 | scan_file = os.listdir(TEST_MEDIA_PATH) 23 | logging.info( 24 | "应存在文件列表:" 25 | + str(TEST_MEDIA_FILE) 26 | + str(TEST_MEDIA_FILE_LANG) 27 | + str(TEST_MEDIA_FILE_SIMPLE) 28 | + " 扫描到文件列表:" 29 | + str(scan_file) 30 | ) 31 | for file in TEST_MEDIA_FILE: 32 | assert file in scan_file 33 | for file in TEST_MEDIA_FILE_LANG: 34 | assert file in scan_file 35 | for file in TEST_MEDIA_FILE_SIMPLE: 36 | assert file in scan_file 37 | 38 | @classmethod 39 | def tearDownClass(cls): 40 | for file in os.listdir(TEST_MEDIA_PATH): 41 | if file.endswith("md") or file.endswith("srt"): 42 | os.remove(TEST_MEDIA_PATH + file) 43 | 44 | def tearDown(self): 45 | for file in TEST_MEDIA_FILE_SIMPLE: 46 | if os.path.exists(TEST_MEDIA_PATH + file.split(".")[0] + ".md"): 47 | os.remove(TEST_MEDIA_PATH + file.split(".")[0] + ".md") 48 | if os.path.exists(TEST_MEDIA_PATH + file.split(".")[0] + ".srt"): 49 | os.remove(TEST_MEDIA_PATH + file.split(".")[0] + ".srt") 50 | 51 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE]) 52 | def test_default_transcribe(self, file_name): 53 | logging.info("检查默认参数生成字幕") 54 | args = TestArgs() 55 | args.inputs = [TEST_MEDIA_PATH + file_name] 56 | transcribe = Transcribe(args) 57 | transcribe.run() 58 | self.assertTrue( 59 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 60 | ) 61 | 62 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE]) 63 | def test_jump_done_transcribe(self, file_name): 64 | logging.info("检查默认参数跳过生成字幕") 65 | args = TestArgs() 66 | args.inputs = [TEST_MEDIA_PATH + file_name] 67 | transcribe = Transcribe(args) 68 | transcribe.run() 69 | self.assertTrue( 70 | 
os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 71 | ) 72 | 73 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_LANG]) 74 | def test_en_transcribe(self, file_name): 75 | logging.info("检查--lang='en'参数生成字幕") 76 | args = TestArgs() 77 | args.lang = "en" 78 | args.inputs = [TEST_MEDIA_PATH + file_name] 79 | transcribe = Transcribe(args) 80 | transcribe.run() 81 | self.assertTrue( 82 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 83 | ) 84 | 85 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_LANG]) 86 | def test_force_transcribe(self, file_name): 87 | logging.info("检查--force参数生成字幕") 88 | args = TestArgs() 89 | args.force = True 90 | args.inputs = [TEST_MEDIA_PATH + file_name] 91 | md0_lens = len( 92 | "".join( 93 | MD( 94 | TEST_MEDIA_PATH + file_name.split(".")[0] + ".md", args.encoding 95 | ).lines 96 | ) 97 | ) 98 | transcribe = Transcribe(args) 99 | transcribe.run() 100 | md1_lens = len( 101 | "".join( 102 | MD( 103 | TEST_MEDIA_PATH + file_name.split(".")[0] + ".md", args.encoding 104 | ).lines 105 | ) 106 | ) 107 | self.assertLessEqual(md1_lens, md0_lens) 108 | 109 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 110 | def test_encoding_transcribe(self, file_name): 111 | logging.info("检查--encoding参数生成字幕") 112 | args = TestArgs() 113 | args.encoding = "gbk" 114 | args.inputs = [TEST_MEDIA_PATH + file_name] 115 | transcribe = Transcribe(args) 116 | transcribe.run() 117 | with open( 118 | os.path.join(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md"), 119 | encoding="gbk", 120 | ): 121 | self.assertTrue(True) 122 | 123 | @parameterized.expand([param(file) for file in TEST_MEDIA_FILE_SIMPLE]) 124 | def test_vad_transcribe(self, file_name): 125 | logging.info("检查--vad参数生成字幕") 126 | args = TestArgs() 127 | args.force = True 128 | args.vad = True 129 | args.inputs = [TEST_MEDIA_PATH + file_name] 130 | transcribe = Transcribe(args) 131 | transcribe.run() 132 | self.assertTrue( 133 | os.path.exists(TEST_MEDIA_PATH + file_name.split(".")[0] + ".md") 134 | ) 135 | --------------------------------------------------------------------------------