├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── .vscode └── settings.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE.md ├── LICENSE ├── PULL_REQUEST_TEMPLATE.md ├── README.md ├── SECURITY.md ├── docs ├── index.html └── vtt_to_srt.html ├── requirements.txt ├── setup.py ├── tests ├── conftest.py ├── idd.vtt ├── idd_format.vtt ├── input_alternative_iso-8859-2.vtt ├── input_alternative_utf8.vtt ├── input_iso-8859-2.vtt ├── input_utf8.vtt ├── test_base.py ├── test_convert_directory.py ├── test_convert_file.py ├── test_vtt_to_str.py ├── valid_output_idd.srt ├── valid_output_idd_format.srt ├── valid_output_iso-8859-2.srt └── valid_output_utf8.srt └── vtt_to_srt ├── __init__.py └── vtt_to_srt.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: jsonzilla 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: jsonzilla 7 | 8 | --- 9 | 10 | **Note: for support questions, please use stackoverflow**. 11 | This repository's issues are reserved for feature requests and bug reports. 12 | Your issue may already be reported! Please search on the [issue tab](../) before creating one. 13 | 14 | 15 | 16 | ## Expected Behavior 17 | 18 | 19 | ## Current Behavior 20 | 21 | 22 | ## Possible Solution 23 | 24 | 25 | ## Steps to Reproduce 26 | 27 | 28 | 1. 29 | 2. 30 | 3. 31 | 4. 
32 | 33 | ## Context (Environment) 34 | 35 | 36 | 37 | 38 | * **Version**: 39 | * **Platform**: 40 | * **Subsystem**: 41 | * **Files**: 42 | 43 | ## Detailed Description 44 | 45 | 46 | ## Possible Implementation 47 | 48 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[REQUEST]" 5 | labels: enhancement 6 | assignees: jsonzilla 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Note: for support questions, please use stackoverflow**. 14 | This repository's issues are reserved for feature requests and bug reports. 15 | Your issue may already be reported! Please search on the [issue tab](../) before creating one. 16 | 17 | **Describe the solution you'd like** 18 | A clear and concise description of what you want to happen. 19 | 20 | **Describe alternatives you've considered** 21 | A clear and concise description of any alternative solutions or features you've considered. 22 | 23 | **Additional context** 24 | Add any other context or screenshots about the feature request here. 25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [master] 9 | pull_request: 10 | branches: [master] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | python-version: ["3.8", "3.9", "3.10", "3.11"] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | pytest 40 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Python 2 | build/ 3 | develop-eggs/ 4 | dist/ 5 | downloads/ 6 | eggs/ 7 | .eggs/ 8 | lib/ 9 | lib64/ 10 | parts/ 11 | pytest_cache/ 12 | sdist/ 13 | var/ 14 | wheels/ 15 | *.egg-info/ 16 | .installed.cfg 17 | *.egg 18 | *.pyc 19 | MANIFEST 20 | 21 | 
-------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnType": true, 3 | "editor.formatOnSave": true, 4 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | j@jsonzilla.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Request for changes/ Pull Requests 4 | You first need to create a fork of the repository to commit your changes to it. Methods to fork a repository can be found in the [GitHub Documentation](https://docs.github.com/en/get-started/quickstart/fork-a-repo). 
5 | 6 | Then add your fork as a local project: 7 | 8 | ```sh 9 | # Using HTTPS 10 | git clone https://github.com/jsonzilla/REPOSITORY.git 11 | 12 | # Using SSH 13 | git clone git@github.com:jsonzilla/REPOSITORY.git 14 | ``` 15 | 16 | > [Which remote URL should be used?](https://docs.github.com/en/get-started/getting-started-with-git/about-remote-repositories) 17 | 18 | Then, go to your local folder: 19 | 20 | ```sh 21 | cd REPOSITORY 22 | ``` 23 | 24 | Add the git remotes: 25 | 26 | ```sh 27 | # Using HTTPS 28 | git remote add fork https://github.com/YOUR-USERNAME/REPOSITORY.git 29 | git remote add upstream https://github.com/jsonzilla/REPOSITORY.git 30 | 31 | 32 | # Using SSH 33 | git remote add fork git@github.com:YOUR-USERNAME/REPOSITORY.git 34 | git remote add upstream git@github.com:jsonzilla/REPOSITORY.git 35 | ``` 36 | 37 | You can now verify that you have your two git remotes: 38 | 39 | ```sh 40 | git remote -v 41 | ``` 42 | 43 | ## Receive remote updates 44 | To stay up to date with the central repository: 45 | 46 | ```sh 47 | git pull upstream master 48 | ``` 49 | 50 | ## Choose a base branch 51 | Before starting development, you need to know which branch to base your modifications/additions on. When in doubt, use master. 
52 | 53 | | Type of change | | Branches | 54 | | :------------------ |:---------:| ---------------------:| 55 | | Documentation | | `master` or `main` | 56 | | Bug fixes | | `master` or `main` | 57 | | New features | | `master` or `main` | 58 | | New issues models | | `YOUR-USERNAME:patch` | 59 | 60 | ```sh 61 | # Switch to the desired branch 62 | git switch master 63 | # or 64 | git switch main 65 | 66 | # Pull down any upstream changes 67 | git pull 68 | 69 | # Create a new branch to work on 70 | git switch --create patch/1234-name-issue 71 | ``` 72 | 73 | Commit your changes, then push the branch to your fork with `git push -u fork` and open a pull request on [the REPOSITORY repository](https://github.com/jsonzilla/REPOSITORY/) following the template provided. -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **Note: for support questions, please use stackoverflow**. 2 | This repository's issues are reserved for feature requests and bug reports. 3 | Your issue may already be reported! Please search on the [issue tab](../) before creating one. 4 | 5 | 6 | 7 | ## I'm submitting a ... 8 | - [ ] bug report 9 | - [ ] feature request 10 | - [ ] support request => Please do not submit support request here, see note at the top of this template. 11 | 12 | ## Do you want to request a *feature* or report a *bug*? 13 | 14 | ## Expected Behavior 15 | 16 | 17 | ## Current Behavior 18 | 19 | 20 | ## Possible Solution 21 | 22 | 23 | ## Steps to Reproduce 24 | 25 | 26 | 1. 27 | 2. 28 | 3. 29 | 4. 
30 | 31 | ## Context (Environment) 32 | 33 | 34 | 35 | 36 | * **Version**: 37 | * **Platform**: 38 | * **Subsystem**: 39 | * **Files**: 40 | 41 | ## Detailed Description 42 | 43 | 44 | ## Possible Implementation 45 | 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | A similar PR may already be submitted! Please search among the [Pull request](../) before creating one. 2 | 3 | Thanks for submitting a pull request! Please provide enough information so that others can review your pull request: 4 | 5 | For more information, see the `CONTRIBUTING` guide. 6 | 7 | 8 | ## Summary 9 | 10 | 11 | 12 | This PR fixes/implements the following **bugs/features** 13 | 14 | * [ ] Bug 1 15 | * [ ] Bug 2 16 | * [ ] Feature 1 17 | * [ ] Feature 2 18 | * [ ] Breaking changes 19 | 20 | 21 | 22 | * **What kind of change does this PR introduce?** (Bug fix, feature, docs update, ...) 23 | 24 | 25 | 26 | * **What is the current behavior?** (You can also link to an open issue here) 27 | 28 | 29 | 30 | * **What is the new behavior (if this is a feature change)?** 31 | 32 | 33 | 34 | * **Does this PR introduce a breaking change?** (What changes might users need to make in their application due to this PR?) 
35 | 36 | 37 | 38 | ## Checklist 39 | 40 | * **Please check if the PR fulfills these requirements** 41 | - [ ] The commit message follows our guidelines 42 | - [ ] Tests for the changes have been added (for bug fixes / features) 43 | - [ ] Docs have been added / updated (for bug fixes / features) 44 | 45 | ## Test plan (required) 46 | 47 | Demonstrate the code is solid. Example: The exact commands you ran and their output, screenshots / videos if the pull request changes UI. 48 | 49 | 50 | 51 | ## Closing issues 52 | 53 | 54 | Fixes # 55 | 56 | ## Other information: 57 | 58 | 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vtt_to_srt3 2 | Convert vtt files to srt subtitle format 3 | > For Python 3.x 4 | * [you can get a new version for Python 2.7 here](https://github.com/jsonzilla/vtt_to_srt2) 5 | * [you can get the old version for Python 2.7 here](https://github.com/jansenicus/vtt-to-srt.py) 6 | 7 | ## Docs 8 | [https://jsonzilla.github.io/vtt_to_srt3/](https://jsonzilla.github.io/vtt_to_srt3/) 9 | 10 | 11 | ## Installation 12 | ```shell 13 | pip install vtt_to_srt3 14 | ``` 15 | 16 | ```cmd 17 | python -m pip install vtt_to_srt3 18 | ``` 19 | 20 | ## Usage from terminal 21 | 22 | ```shell 23 | usage: vtt_to_srt [-h] [-r] [-e ENCODING] [-rf] pathname 24 | 25 | Convert vtt files to srt files 26 | 27 | positional arguments: 28 | pathname a file or directory with files to be converted 29 | 30 | options: 31 | -h, --help show this help message and exit 32 | -r, --recursive walk path recursively 33 | -e ENCODING, --encoding ENCODING 34 | encoding format for input and output files 35 | -rf, --remove_format remove the format tags like bold & italic from output files 36 | ``` 37 | 38 | ## Usage as a lib 39 | 40 | Convert vtt file 41 | ```python 42 | from vtt_to_srt.vtt_to_srt import ConvertFile 43 | 44 | convert_file = 
ConvertFile("input_utf8.vtt", "utf-8") 45 | convert_file.convert() 46 | ``` 47 | 48 | Convert all vtt files in a directory (set `recursive = True` to also walk subdirectories) 49 | ```python 50 | from vtt_to_srt.vtt_to_srt import ConvertDirectories 51 | 52 | recursive = False 53 | convert_file = ConvertDirectories(".", recursive, "utf-8") 54 | convert_file.convert() 55 | ``` 56 | 57 | ## Manual build 58 | 59 | Generate wheel 60 | ```shell 61 | python -m pip install --upgrade setuptools wheel build 62 | python -m build 63 | ``` 64 | 65 | ## Generate documentation 66 | 67 | Generate documentation 68 | ```shell 69 | python -m pip install pdoc3 70 | pdoc --html vtt_to_srt/vtt_to_srt.py -o docs 71 | mv docs/vtt_to_srt.html docs/index.html 72 | rm -rf docs/vtt_to_srt 73 | ``` 74 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting Security Issues 4 | 5 | **Please do not report security vulnerabilities through public GitHub issues.** 6 | 7 | If you prefer to submit without logging in, send email to [j@jsonzilla.com](mailto:j@jsonzilla.com). If possible, encrypt your message with our PGP key; please download it from the [PGP Key page](https://keys.openpgp.org/vks/v1/by-fingerprint/D9EAB1475BF5E6D2E13C0AEE4EA9E3DAFD05CFD4). 8 | 9 | You should receive a response within 72 hours. If for some reason you do not, please follow up via email to ensure we received your original message. 10 | 11 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 12 | 13 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
14 | * Full paths of source file(s) related to the manifestation of the issue 15 | * The location of the affected source code (tag/branch/commit or direct URL) 16 | * Any special configuration required to reproduce the issue 17 | * Step-by-step instructions to reproduce the issue 18 | * Proof-of-concept or exploit code (if possible) 19 | * Impact of the issue, including how an attacker might exploit the issue 20 | 21 | ## Preferred Languages 22 | 23 | We prefer all communications to be in English. 24 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | vtt_to_srt API documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
20 |
21 |
22 |

Module vtt_to_srt

23 |
24 |
25 |

Convert of vtt to srt format

26 |
27 | 28 | Expand source code 29 | 30 |
#!/usr/bin/python
 31 | # Jansen A. Simanullang / Jeison Cardoso
 32 | 
 33 | """Convert of vtt to srt format"""
 34 | 
 35 | import os
 36 | import re
 37 | import argparse
 38 | from string import Template
 39 | from stat import S_ISDIR, ST_MODE, S_ISREG
 40 | 
 41 | 
 42 | class VttToStr:
 43 |     """Convert vtt to srt"""
 44 | 
 45 |     def __init__(self) -> None:
 46 |         pass
 47 | 
 48 |     def convert_header(self, contents):
 49 |         """Convert of vtt header to srt format
 50 | 
 51 |         :contents -- contents of vtt file
 52 |         """
 53 |         replacement = re.sub(r"WEBVTT\n", "", contents)
 54 |         replacement = re.sub(r"Kind:[ \-\w]+\n", "", replacement)
 55 |         replacement = re.sub(r"Language:[ \-\w]+\n", "", replacement)
 56 |         return replacement
 57 | 
 58 |     def add_padding_to_timestamp(self, contents):
 59 |         """Add 00 to padding timestamp of to srt format
 60 | 
 61 |         :contents -- contents of vtt file
 62 |         """
 63 |         find_srt = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
 64 |         minute = r"((?:\d\d:){1}\d\d)"
 65 |         second = r"((?:\d\d:){0}\d\d)"
 66 |         padding_minute = find_srt.substitute(a=minute, b=r"(\d{0,3})")
 67 |         padding_second = find_srt.substitute(a=second, b=r"(\d{0,3})")
 68 |         replacement = re.sub(
 69 |             padding_minute, r"00:\1,\2 --> 00:\3,\4\n", contents)
 70 |         return re.sub(padding_second, r"00:00:\1,\2 --> 00:00:\3,\4\n", replacement)
 71 | 
 72 |     def convert_timestamp(self, contents):
 73 |         """Convert timestamp of vtt file to srt format
 74 | 
 75 |         :contents -- contents of vtt file
 76 |         """
 77 |         find_vtt = Template(r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
 78 |         all_timestamp = find_vtt.substitute(
 79 |             a=r"((?:\d\d:){0,2}\d\d)", b=r"(\d{0,3})")
 80 |         return self.add_padding_to_timestamp(re.sub(all_timestamp, r"\1,\2 --> \3,\4\n", contents))
 81 | 
 82 |     def convert_content(self, contents):
 83 |         """Convert content of vtt file to srt format
 84 | 
 85 |         :contents -- contents of vtt file
 86 |         """
 87 |         replacement = self.convert_timestamp(contents)
 88 |         replacement = self.convert_header(replacement)
 89 |         replacement = re.sub(r"<c[.\w\d]*>", "", replacement)
 90 |         replacement = re.sub(r"</c>", "", replacement)
 91 |         replacement = re.sub(r"<\d\d:\d\d:\d\d.\d\d\d>", "", replacement)
 92 |         replacement = re.sub(
 93 |             r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n", "", replacement)
 94 |         replacement = re.sub(r"Style:\n##\n", "", replacement)
 95 |         replacement = self.add_sequence_numbers(replacement)
 96 | 
 97 |         return replacement
 98 | 
 99 |     def has_timestamp(self, content):
100 |         """Check if line is a timestamp srt format
101 | 
102 |         :contents -- contents of vtt file
103 |         """
104 |         return re.match(r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})", content) is not None
105 | 
106 |     def add_sequence_numbers(self, contents):
107 |         """Adds sequence numbers to subtitle contents and returns new subtitle contents
108 | 
109 |         :contents -- contents of vtt file
110 |         """
111 |         output = ''
112 |         lines = contents.split('\n')
113 |         i = 1
114 |         for line in lines:
115 |             if self.has_timestamp(line):
116 |                 output += str(i) + '\n'
117 |                 i += 1
118 |             output += line + '\n'
119 |         return output
120 | 
121 |     def write_file(self, filename: str, data, encoding_format: str = "utf-8"):
122 |         """Create a file with some data
123 | 
124 |         :filename -- filename pat
125 |         :data -- data to write
126 |         :encoding_format -- encoding format
127 |         """
128 |         try:
129 |             with open(filename, "w", encoding=encoding_format) as file:
130 |                 file.writelines(str(data))
131 |         except IOError:
132 |             filename = filename.split(os.sep)[-1]
133 |             with open(filename, "w", encoding=encoding_format) as file:
134 |                 file.writelines(str(data))
135 |         print(f"file created {filename}\n")
136 | 
137 |     def read_file(self, filename: str, encoding_format: str = "utf-8"):
138 |         """Read a file text
139 | 
140 |         :filename -- filename path
141 |         :encoding_format -- encoding format
142 |         """
143 |         content: str = ''
144 |         with open(filename, mode="r", encoding=encoding_format) as file:
145 |             print(f"file being read: {filename}\n")
146 |             content = file.read()
147 | 
148 |         return content
149 | 
150 |     def process(self, filename: str, encoding_format: str = "utf-8"):
151 |         """Convert vtt file to a srt file
152 | 
153 |         :str_name_file -- filename path
154 |         :encoding_format -- encoding format
155 |         """
156 |         file_contents: str = self.read_file(filename, encoding_format)
157 |         str_data: str = ""
158 |         str_data = str_data + self.convert_content(file_contents)
159 |         filename = filename.replace(".vtt", ".srt")
160 |         self.write_file(filename, str_data, encoding_format)
161 | 
162 | 
class ConvertFile:
    """Convert a single vtt file to an srt file"""

    def __init__(self, pathname: str, encoding_format: str):
        """Constructor

        :pathname -- path to the vtt file
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def convert(self):
        """Convert the file if its name ends with ``.vtt``; otherwise do nothing"""
        # Test the extension, not a substring: a ".vtt" inside a directory
        # name must not make an unrelated file eligible.
        if self.pathname.endswith(".vtt"):
            self.vtt_to_str.process(self.pathname, self.encoding_format)
180 | 
181 | 
class ConvertDirectories:
    """Convert every vtt file found in a directory to an srt file"""

    def __init__(self, pathname: str, enable_recursive: bool, encoding_format: str):
        """Constructor

        :pathname -- directory to scan
        :enable_recursive -- when true, subdirectories are scanned as well
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.enable_recursive = enable_recursive
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def _walk_dir(self, top_most_path: str, callback):
        """Call the callback for each non-directory entry of one directory

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            if not os.path.isdir(pathname):
                # It's a file, call the callback function
                callback(pathname)

    def _walk_tree(self, top_most_path, callback):
        """Recursively descend the directory tree rooted at top_most_path,
        calling the callback function for each regular file

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            mode = os.stat(pathname)[ST_MODE]
            if S_ISDIR(mode):
                # It's a directory, recurse into it
                self._walk_tree(pathname, callback)
            elif S_ISREG(mode):
                # It's a file, call the callback function
                callback(pathname)
            else:
                # Unknown file type, print a message
                print(f"Skipping {pathname}")

    def convert_vtt_to_str(self, file: str):
        """Convert one file if its name ends with ``.vtt``

        A UnicodeDecodeError raised during conversion is reported on stdout
        and the file is skipped.

        :file -- file to convert
        """
        # Test the extension, not a substring: files below a directory whose
        # name contains ".vtt" must not all be treated as subtitles.
        if file.endswith(".vtt"):
            try:
                self.vtt_to_str.process(file, self.encoding_format)
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError: {file}")

    def _vtt_to_srt_batch(self, directory: str):
        """Walk down directory searching for vtt files

        :directory -- path to search
        """
        if self.enable_recursive:
            self._walk_tree(directory, self.convert_vtt_to_str)
        else:
            self._walk_dir(directory, self.convert_vtt_to_str)

    def convert(self):
        """Convert vtt files to srt files"""
        self._vtt_to_srt_batch(self.pathname)
254 | 
255 | 
def _show_usage():
    """Print a short usage summary to stdout"""
    usage_lines = (
        "\nUsage:\tvtt_to_srt pathname [-r]\n",
        "\tpathname\t- a file or directory with files to be converted",
        "\t-r\t\t- walk path recursively\n",
    )
    for text in usage_lines:
        print(text)
261 | 
262 | 
def _parse_args():
    """Build the command line parser and return the parsed arguments

    The namespace carries ``pathname``, ``recursive`` and ``encoding``.
    """
    cli = argparse.ArgumentParser(
        description='Convert vtt files to srt files')
    cli.add_argument(
        "pathname", help="a file or directory with files to be converted")
    cli.add_argument(
        "-r", "--recursive", action="store_true", help="walk path recursively")
    cli.add_argument(
        "-e", "--encoding", help="encoding format for input and output files")
    return cli.parse_args()
276 | 
277 | 
def main():
    """Command line entry point: convert the file or directory given as argument"""
    args = _parse_args()
    pathname = args.pathname
    # Fall back to utf-8 when no encoding was given.
    encoding = args.encoding or "utf-8"

    if os.path.isfile(pathname):
        print(f"file being converted: {pathname}\n")
        ConvertFile(pathname, encoding).convert()
    elif os.path.isdir(pathname):
        print(f"directory being converted: {pathname}\n")
        ConvertDirectories(pathname, args.recursive, encoding).convert()
    else:
        print(f"pathname is not a file or directory: {pathname}\n")
        _show_usage()
300 | 
301 | 
302 | if __name__ == "__main__":
303 |     main()
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |

Functions

312 |
313 |
314 | def main() 315 |
316 |
317 |

main function

318 |
319 | 320 | Expand source code 321 | 322 |
def main():
    """Command line entry point: convert the file or directory given as argument"""
    args = _parse_args()
    pathname = args.pathname
    # Fall back to utf-8 when no encoding was given.
    encoding = args.encoding or "utf-8"

    if os.path.isfile(pathname):
        print(f"file being converted: {pathname}\n")
        ConvertFile(pathname, encoding).convert()
    elif os.path.isdir(pathname):
        print(f"directory being converted: {pathname}\n")
        ConvertDirectories(pathname, args.recursive, encoding).convert()
    else:
        print(f"pathname is not a file or directory: {pathname}\n")
        _show_usage()
344 |
345 |
346 |
347 |
348 |
349 |

Classes

350 |
351 |
352 | class ConvertDirectories 353 | (pathname: str, enable_recursive: bool, encoding_format: str) 354 |
355 |
356 |

Convert vtt files to srt files

357 |

Constructor

358 |

pathname – path to file or directory 359 | :enable_recursive – enable recursive 360 | :encoding_format – encoding format

361 |
362 | 363 | Expand source code 364 | 365 |
class ConvertDirectories:
    """Convert every vtt file found in a directory to an srt file"""

    def __init__(self, pathname: str, enable_recursive: bool, encoding_format: str):
        """Constructor

        :pathname -- directory to scan
        :enable_recursive -- when true, subdirectories are scanned as well
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.enable_recursive = enable_recursive
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def _walk_dir(self, top_most_path: str, callback):
        """Call the callback for each non-directory entry of one directory

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            if not os.path.isdir(pathname):
                # It's a file, call the callback function
                callback(pathname)

    def _walk_tree(self, top_most_path, callback):
        """Recursively descend the directory tree rooted at top_most_path,
        calling the callback function for each regular file

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            mode = os.stat(pathname)[ST_MODE]
            if S_ISDIR(mode):
                # It's a directory, recurse into it
                self._walk_tree(pathname, callback)
            elif S_ISREG(mode):
                # It's a file, call the callback function
                callback(pathname)
            else:
                # Unknown file type, print a message
                print(f"Skipping {pathname}")

    def convert_vtt_to_str(self, file: str):
        """Convert one file if its name ends with ``.vtt``

        A UnicodeDecodeError raised during conversion is reported on stdout
        and the file is skipped.

        :file -- file to convert
        """
        # Test the extension, not a substring: files below a directory whose
        # name contains ".vtt" must not all be treated as subtitles.
        if file.endswith(".vtt"):
            try:
                self.vtt_to_str.process(file, self.encoding_format)
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError: {file}")

    def _vtt_to_srt_batch(self, directory: str):
        """Walk down directory searching for vtt files

        :directory -- path to search
        """
        if self.enable_recursive:
            self._walk_tree(directory, self.convert_vtt_to_str)
        else:
            self._walk_dir(directory, self.convert_vtt_to_str)

    def convert(self):
        """Convert vtt files to srt files"""
        self._vtt_to_srt_batch(self.pathname)
437 |
438 |

Methods

439 |
440 |
441 | def convert(self) 442 |
443 |
444 |

Convert vtt files to srt files

445 |
446 | 447 | Expand source code 448 | 449 |
def convert(self):
    """Run the batch conversion on the directory stored in ``self.pathname``"""
    start_directory = self.pathname
    self._vtt_to_srt_batch(start_directory)
452 |
453 |
454 |
455 | def convert_vtt_to_str(self, file: str) 456 |
457 |
458 |

Convert vtt file to string

459 |

:file – file to convert

460 |
461 | 462 | Expand source code 463 | 464 |
def convert_vtt_to_str(self, file: str):
    """Convert one file if its name ends with ``.vtt``

    A UnicodeDecodeError raised during conversion is reported on stdout
    and the file is skipped.

    :file -- file to convert
    """
    # Test the extension, not a substring: files below a directory whose
    # name contains ".vtt" must not all be treated as subtitles.
    if file.endswith(".vtt"):
        try:
            self.vtt_to_str.process(file, self.encoding_format)
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError: {file}")
474 |
475 |
476 |
477 |
478 |
479 | class ConvertFile 480 | (pathname: str, encoding_format: str) 481 |
482 |
483 |

Convert vtt file to srt file

484 |

Constructor

485 |

:pathname – path to file or directory 486 | :encoding_format – encoding format

487 |
488 | 489 | Expand source code 490 | 491 |
class ConvertFile:
    """Convert a single vtt file to an srt file"""

    def __init__(self, pathname: str, encoding_format: str):
        """Constructor

        :pathname -- path to the vtt file
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def convert(self):
        """Convert the file if its name ends with ``.vtt``; otherwise do nothing"""
        # Test the extension, not a substring: a ".vtt" inside a directory
        # name must not make an unrelated file eligible.
        if self.pathname.endswith(".vtt"):
            self.vtt_to_str.process(self.pathname, self.encoding_format)
508 |
509 |

Methods

510 |
511 |
512 | def convert(self) 513 |
514 |
515 |

Convert vtt file to srt file

516 |
517 | 518 | Expand source code 519 | 520 |
def convert(self):
    """Convert the file if its name ends with ``.vtt``; otherwise do nothing"""
    # Test the extension, not a substring: a ".vtt" inside a directory
    # name must not make an unrelated file eligible.
    if self.pathname.endswith(".vtt"):
        self.vtt_to_str.process(self.pathname, self.encoding_format)
524 |
525 |
526 |
527 |
528 |
529 | class VttToStr 530 |
531 |
532 |

Convert vtt to srt

533 |
534 | 535 | Expand source code 536 | 537 |
class VttToStr:
    """Convert WebVTT (vtt) subtitle text and files to the SubRip (srt) format."""

    def __init__(self) -> None:
        pass

    def convert_header(self, contents):
        """Remove the vtt header lines from the contents

        Every ``WEBVTT``, ``Kind: ...`` and ``Language: ...`` line is deleted.

        :contents -- contents of vtt file
        """
        replacement = re.sub(r"WEBVTT\n", "", contents)
        replacement = re.sub(r"Kind:[ \-\w]+\n", "", replacement)
        replacement = re.sub(r"Language:[ \-\w]+\n", "", replacement)
        return replacement

    def add_padding_to_timestamp(self, contents):
        """Pad short cue timestamps up to the hh:mm:ss form

        Expects comma-separated timestamps (the output of convert_timestamp).
        ``mm:ss`` cue lines get a ``00:`` prefix and ``ss`` cue lines get a
        ``00:00:`` prefix; any cue settings after the end time are dropped.

        :contents -- contents of vtt file
        """
        find_srt = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
        minute = r"((?:\d\d:){1}\d\d)"
        second = r"((?:\d\d:){0}\d\d)"
        padding_minute = find_srt.substitute(a=minute, b=r"(\d{0,3})")
        padding_second = find_srt.substitute(a=second, b=r"(\d{0,3})")
        # Minutes first: once a cue is hh:mm:ss neither pattern matches it again.
        replacement = re.sub(
            padding_minute, r"00:\1,\2 --> 00:\3,\4\n", contents)
        return re.sub(padding_second, r"00:00:\1,\2 --> 00:00:\3,\4\n", replacement)

    def convert_timestamp(self, contents):
        """Convert the cue timing lines of a vtt file to srt format

        The fraction separator becomes a comma, trailing cue settings are
        removed and short timestamps are padded to hh:mm:ss.

        :contents -- contents of vtt file
        """
        # The "." between $a and $b is an unescaped regex wildcard, so any
        # single separator character is accepted here.
        find_vtt = Template(r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
        all_timestamp = find_vtt.substitute(
            a=r"((?:\d\d:){0,2}\d\d)", b=r"(\d{0,3})")
        return self.add_padding_to_timestamp(re.sub(all_timestamp, r"\1,\2 --> \3,\4\n", contents))

    def convert_content(self, contents):
        """Convert content of vtt file to srt format

        Converts the timestamps, strips the header, ``<c>`` tags, inline
        ``<hh:mm:ss.mmm>`` tags and style blocks, then numbers the cues.

        :contents -- contents of vtt file
        """
        replacement = self.convert_timestamp(contents)
        replacement = self.convert_header(replacement)
        replacement = re.sub(r"<c[.\w\d]*>", "", replacement)
        replacement = re.sub(r"</c>", "", replacement)
        replacement = re.sub(r"<\d\d:\d\d:\d\d.\d\d\d>", "", replacement)
        replacement = re.sub(
            r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n", "", replacement)
        replacement = re.sub(r"Style:\n##\n", "", replacement)
        replacement = self.add_sequence_numbers(replacement)

        return replacement

    def has_timestamp(self, content):
        """Check if a line starts with a timestamp in srt format

        :content -- a single line of subtitle text
        """
        return re.match(r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})", content) is not None

    def add_sequence_numbers(self, contents):
        """Adds sequence numbers to subtitle contents and returns new subtitle contents

        A counter line (1, 2, 3, ...) is inserted before every timestamp line.
        Every input line, including the empty piece after a final newline,
        is emitted followed by a newline.

        :contents -- subtitle contents with srt timestamps
        """
        pieces = []
        sequence = 1
        for line in contents.split('\n'):
            if self.has_timestamp(line):
                pieces.append(str(sequence))
                sequence += 1
            pieces.append(line)
        # Join once instead of growing a string inside the loop.
        return "".join(piece + '\n' for piece in pieces)

    def write_file(self, filename: str, data, encoding_format: str = "utf-8"):
        """Create a file with some data

        If the target cannot be opened, the data is written to a file with
        the same base name in the current working directory instead.

        :filename -- filename path
        :data -- data to write
        :encoding_format -- encoding format
        """
        try:
            with open(filename, "w", encoding=encoding_format) as file:
                file.write(str(data))
        except IOError:
            # Best-effort fallback: keep only the file name.
            filename = os.path.basename(filename)
            with open(filename, "w", encoding=encoding_format) as file:
                file.write(str(data))
        print(f"file created {filename}\n")

    def read_file(self, filename: str, encoding_format: str = "utf-8"):
        """Read a text file and return its contents

        :filename -- filename path
        :encoding_format -- encoding format
        """
        with open(filename, mode="r", encoding=encoding_format) as file:
            print(f"file being read: {filename}\n")
            content = file.read()

        return content

    def process(self, filename: str, encoding_format: str = "utf-8"):
        """Convert vtt file to a srt file

        The output is written next to the input with the final extension
        replaced by ``.srt``. Only the extension is touched, so a ``.vtt``
        appearing in a directory name is left alone.

        :filename -- filename path
        :encoding_format -- encoding format
        """
        file_contents: str = self.read_file(filename, encoding_format)
        str_data: str = self.convert_content(file_contents)
        root, _extension = os.path.splitext(filename)
        self.write_file(root + ".srt", str_data, encoding_format)
656 |
657 |

Methods

658 |
659 |
660 | def add_padding_to_timestamp(self, contents) 661 |
662 |
663 |

Add 00 to padding timestamp of to srt format

664 |

:contents – contents of vtt file

665 |
666 | 667 | Expand source code 668 | 669 |
def add_padding_to_timestamp(self, contents):
    """Pad short comma-separated cue timestamps up to the hh:mm:ss form

    ``mm:ss`` cue lines get a ``00:`` prefix and ``ss`` cue lines get a
    ``00:00:`` prefix; any cue settings after the end time are dropped.

    :contents -- contents of vtt file
    """
    cue_line = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
    fraction = r"(\d{0,3})"
    # (clock pattern, rewritten cue line); the minute pass must run first.
    passes = (
        (r"((?:\d\d:){1}\d\d)", r"00:\1,\2 --> 00:\3,\4\n"),
        (r"((?:\d\d:){0}\d\d)", r"00:00:\1,\2 --> 00:00:\3,\4\n"),
    )
    padded = contents
    for clock, rewritten in passes:
        padded = re.sub(cue_line.substitute(a=clock, b=fraction), rewritten, padded)
    return padded
682 |
683 |
684 |
685 | def add_sequence_numbers(self, contents) 686 |
687 |
688 |

Adds sequence numbers to subtitle contents and returns new subtitle contents

689 |

:contents – contents of vtt file

690 |
691 | 692 | Expand source code 693 | 694 |
def add_sequence_numbers(self, contents):
    """Insert a running counter line before every timestamp line

    Every input line, including the empty piece after a final newline, is
    emitted followed by a newline.

    :contents -- subtitle contents with srt timestamps
    """
    pieces = []
    sequence = 1
    for row in contents.split('\n'):
        if self.has_timestamp(row):
            pieces.append(str(sequence))
            sequence += 1
        pieces.append(row)
    return "".join(piece + '\n' for piece in pieces)
708 |
709 |
710 |
711 | def convert_content(self, contents) 712 |
713 |
714 |

Convert content of vtt file to srt format

715 |

:contents – contents of vtt file

716 |
717 | 718 | Expand source code 719 | 720 |
def convert_content(self, contents):
    """Turn the full text of a vtt file into srt text

    Timestamps are converted first, then the header, ``<c>`` tags, inline
    timestamp tags and style blocks are removed, and finally the cues are
    numbered.

    :contents -- contents of vtt file
    """
    text = self.convert_header(self.convert_timestamp(contents))
    # Applied in this order; each pattern is deleted wherever it occurs.
    noise_patterns = (
        r"<c[.\w\d]*>",
        r"</c>",
        r"<\d\d:\d\d:\d\d.\d\d\d>",
        r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n",
        r"Style:\n##\n",
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, "", text)
    return self.add_sequence_numbers(text)
736 |
737 |
738 |
739 | def convert_header(self, contents) 740 |
741 |
742 |

Convert of vtt header to srt format

743 |

:contents – contents of vtt file

744 |
745 | 746 | Expand source code 747 | 748 |
def convert_header(self, contents):
    """Delete the ``WEBVTT``, ``Kind:`` and ``Language:`` header lines

    :contents -- contents of vtt file
    """
    stripped = contents
    for header_line in (r"WEBVTT\n", r"Kind:[ \-\w]+\n", r"Language:[ \-\w]+\n"):
        stripped = re.sub(header_line, "", stripped)
    return stripped
757 |
758 |
759 |
760 | def convert_timestamp(self, contents) 761 |
762 |
763 |

Convert timestamp of vtt file to srt format

764 |

:contents – contents of vtt file

765 |
766 | 767 | Expand source code 768 | 769 |
def convert_timestamp(self, contents):
    """Rewrite vtt cue timing lines in srt form

    The fraction separator becomes a comma and trailing cue settings are
    dropped; the result is then handed to add_padding_to_timestamp.

    :contents -- contents of vtt file
    """
    clock = r"((?:\d\d:){0,2}\d\d)"
    fraction = r"(\d{0,3})"
    vtt_cue = Template(
        r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n').substitute(a=clock, b=fraction)
    with_commas = re.sub(vtt_cue, r"\1,\2 --> \3,\4\n", contents)
    return self.add_padding_to_timestamp(with_commas)
778 |
779 |
780 |
781 | def has_timestamp(self, content) 782 |
783 |
784 |

Check if line is a timestamp srt format

785 |

:contents – contents of vtt file

786 |
787 | 788 | Expand source code 789 | 790 |
def has_timestamp(self, content):
    """Tell whether a line starts with an srt timestamp range

    :content -- a single line of subtitle text
    """
    cue = re.match(r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})", content)
    if cue is None:
        return False
    return True
796 |
797 |
798 |
799 | def process(self, filename: str, encoding_format: str = 'utf-8') 800 |
801 |
802 |

Convert vtt file to a srt file

803 |

:str_name_file – filename path 804 | :encoding_format – encoding format

805 |
806 | 807 | Expand source code 808 | 809 |
def process(self, filename: str, encoding_format: str = "utf-8"):
    """Convert vtt file to a srt file

    The output is written next to the input with the final extension
    replaced by ``.srt``. Only the extension is touched, so a ``.vtt``
    appearing in a directory name is left alone.

    :filename -- filename path
    :encoding_format -- encoding format
    """
    file_contents: str = self.read_file(filename, encoding_format)
    str_data: str = self.convert_content(file_contents)
    root, _extension = os.path.splitext(filename)
    self.write_file(root + ".srt", str_data, encoding_format)
820 |
821 |
822 |
823 | def read_file(self, filename: str, encoding_format: str = 'utf-8') 824 |
825 |
826 |

Read a file text

827 |

:filename – filename path 828 | :encoding_format – encoding format

829 |
830 | 831 | Expand source code 832 | 833 |
def read_file(self, filename: str, encoding_format: str = "utf-8"):
    """Return the whole text of a file, announcing the read on stdout

    :filename -- filename path
    :encoding_format -- encoding format
    """
    with open(filename, mode="r", encoding=encoding_format) as handle:
        print(f"file being read: {filename}\n")
        return handle.read()
845 |
846 |
847 |
848 | def write_file(self, filename: str, data, encoding_format: str = 'utf-8') 849 |
850 |
851 |

Create a file with some data

852 |

:filename – filename pat 853 | :data – data to write 854 | :encoding_format – encoding format

855 |
856 | 857 | Expand source code 858 | 859 |
def write_file(self, filename: str, data, encoding_format: str = "utf-8"):
    """Write data, converted to text, into a file

    If the requested path cannot be opened, the file is written under its
    bare name (last os.sep-separated component) instead.

    :filename -- filename path
    :data -- data to write
    :encoding_format -- encoding format
    """
    def _dump(target):
        with open(target, "w", encoding=encoding_format) as handle:
            handle.write(str(data))

    try:
        _dump(filename)
    except IOError:
        filename = filename.split(os.sep)[-1]
        _dump(filename)
    print(f"file created {filename}\n")
874 |
875 |
876 |
877 |
878 |
879 |
880 |
881 | 925 |
926 | 929 | 930 | -------------------------------------------------------------------------------- /docs/vtt_to_srt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | vtt_to_srt API documentation 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
20 |
21 |
22 |

Module vtt_to_srt

23 |
24 |
25 |

Convert of vtt to srt format

26 |
27 | 28 | Expand source code 29 | 30 |
#!/usr/bin/python
 31 | # Jansen A. Simanullang / Jeison Cardoso
 32 | 
 33 | """Convert of vtt to srt format"""
 34 | 
 35 | import os
 36 | import re
 37 | import argparse
 38 | from string import Template
 39 | from stat import S_ISDIR, ST_MODE, S_ISREG
 40 | 
 41 | 
 42 | class VttToStr:
 43 |     """Convert vtt to srt"""
 44 | 
 45 |     def __init__(self) -> None:
 46 |         pass
 47 | 
 48 |     def convert_header(self, contents: str) -> str:
 49 |         """Convert of vtt header to srt format
 50 | 
 51 |         :contents -- contents of vtt file
 52 |         """
 53 |         replacement = re.sub(r"WEBVTT\n", "", contents)
 54 |         replacement = re.sub(r"Kind:[ \-\w]+\n", "", replacement)
 55 |         replacement = re.sub(r"Language:[ \-\w]+\n", "", replacement)
 56 |         return replacement
 57 | 
 58 |     def add_padding_to_timestamp(self, contents: str) -> str:
 59 |         """Add 00 to padding timestamp of to srt format
 60 | 
 61 |         :contents -- contents of vtt file
 62 |         """
 63 |         find_srt = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
 64 |         minute = r"((?:\d\d:){1}\d\d)"
 65 |         second = r"((?:\d\d:){0}\d\d)"
 66 |         padding_minute = find_srt.substitute(a=minute, b=r"(\d{0,3})")
 67 |         padding_second = find_srt.substitute(a=second, b=r"(\d{0,3})")
 68 |         replacement = re.sub(
 69 |             padding_minute, r"00:\1,\2 --> 00:\3,\4\n", contents)
 70 |         return re.sub(padding_second, r"00:00:\1,\2 --> 00:00:\3,\4\n", replacement)
 71 | 
 72 |     def convert_timestamp(self, contents: str) -> str:
 73 |         """Convert timestamp of vtt file to srt format
 74 | 
 75 |         :contents -- contents of vtt file
 76 |         """
 77 |         find_vtt = Template(r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
 78 |         all_timestamp = find_vtt.substitute(
 79 |             a=r"((?:\d\d:){0,2}\d\d)", b=r"(\d{0,3})")
 80 |         return self.add_padding_to_timestamp(re.sub(all_timestamp, r"\1,\2 --> \3,\4\n", contents))
 81 | 
 82 |     def convert_content(self, contents: str) -> str:
 83 |         """Convert content of vtt file to srt format
 84 | 
 85 |         :contents -- contents of vtt file
 86 |         """
 87 |         replacement = self.convert_timestamp(contents)
 88 |         replacement = self.convert_header(replacement)
 89 |         replacement = re.sub(r"<c[.\w\d]*>", "", replacement)
 90 |         replacement = re.sub(r"</c>", "", replacement)
 91 |         replacement = re.sub(r"<\d\d:\d\d:\d\d.\d\d\d>", "", replacement)
 92 |         replacement = re.sub(
 93 |             r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n", "", replacement)
 94 |         replacement = re.sub(r"Style:\n##\n", "", replacement)
 95 |         replacement = self.remove_simple_identifiers(replacement)
 96 |         replacement = self.add_sequence_numbers(replacement)
 97 | 
 98 |         return replacement
 99 | 
100 |     def has_timestamp(self, content: str) -> bool:
101 |         """Check if line is a timestamp srt format
102 | 
103 |         :contents -- contents of vtt file
104 |         """
105 |         return re.match(r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})", content) is not None
106 | 
107 |     def add_sequence_numbers(self, contents: str) -> str:
108 |         """Adds sequence numbers to subtitle contents and returns new subtitle contents
109 | 
110 |         :contents -- contents of vtt file
111 |         """
112 |         lines = contents.split('\n')
113 |         out = ''
114 |         counter = 1
115 |         for line in lines:
116 |             if self.has_timestamp(line):
117 |                 out += str(counter) + '\n'
118 |                 counter += 1
119 |             out += line + '\n'
120 |         return out
121 | 
122 |     def remove_simple_identifiers(self, contents: str) -> str:
123 |         """Remove simple identifiers of vtt file
124 | 
125 |         :contents -- contents of vtt file
126 |         """
127 |         lines = contents.split('\n')
128 |         out = []
129 |         for i, line in enumerate(lines):
130 |             if self.has_timestamp(line):
131 |                 if re.match(r"^\d+$", lines[i - 1]):
132 |                     out.pop()
133 |             out.append(line)
134 |         return '\n'.join(out)
135 | 
136 |     def write_file(self, filename: str, data, encoding_format: str = "utf-8"):
137 |         """Create a file with some data
138 | 
139 |         :filename -- filename pat
140 |         :data -- data to write
141 |         :encoding_format -- encoding format
142 |         """
143 |         try:
144 |             with open(filename, "w", encoding=encoding_format) as file:
145 |                 file.writelines(str(data))
146 |         except IOError:
147 |             filename = filename.split(os.sep)[-1]
148 |             with open(filename, "w", encoding=encoding_format) as file:
149 |                 file.writelines(str(data))
150 |         print(f"file created {filename}\n")
151 | 
152 |     def read_file(self, filename: str, encoding_format: str = "utf-8"):
153 |         """Read a file text
154 | 
155 |         :filename -- filename path
156 |         :encoding_format -- encoding format
157 |         """
158 |         content: str = ''
159 |         with open(filename, mode="r", encoding=encoding_format) as file:
160 |             print(f"file being read: {filename}\n")
161 |             content = file.read()
162 | 
163 |         return content
164 | 
165 |     def process(self, filename: str, encoding_format: str = "utf-8"):
166 |         """Convert vtt file to a srt file
167 | 
168 |         :str_name_file -- filename path
169 |         :encoding_format -- encoding format
170 |         """
171 |         file_contents: str = self.read_file(filename, encoding_format)
172 |         str_data: str = ""
173 |         str_data = str_data + self.convert_content(file_contents)
174 |         filename = filename.replace(".vtt", ".srt")
175 |         self.write_file(filename, str_data, encoding_format)
176 | 
177 | 
class ConvertFile:
    """Convert vtt file to srt file"""

    def __init__(self, pathname: str, encoding_format: str):
        """Constructor

        :pathname -- path to the file to convert
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def convert(self):
        """Convert the file when its name ends with ".vtt"

        The extension is tested rather than searching for ".vtt" anywhere
        in the path, so a directory name containing ".vtt" does not make an
        unrelated file look like a subtitle file.
        """
        if self.pathname.endswith(".vtt"):
            self.vtt_to_str.process(self.pathname, self.encoding_format)
195 | 
196 | 
class ConvertDirectories:
    """Convert vtt files to srt files"""

    def __init__(self, pathname: str, enable_recursive: bool, encoding_format: str):
        """Constructor

        :pathname -- path to the directory to convert
        :enable_recursive -- descend into sub-directories when true
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.enable_recursive = enable_recursive
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def _walk_dir(self, top_most_path: str, callback):
        """Call the callback for every non-directory entry of one directory

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            if not os.path.isdir(pathname):
                # It's a file, call the callback function
                callback(pathname)

    def _walk_tree(self, top_most_path, callback):
        """Recursively descend the directory tree rooted at top_most_path,
        calling the callback function for each regular file

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            mode = os.stat(pathname)[ST_MODE]
            if S_ISDIR(mode):
                # It's a directory, recurse into it
                self._walk_tree(pathname, callback)
            elif S_ISREG(mode):
                # It's a file, call the callback function
                callback(pathname)
            else:
                # Unknown file type, print a message
                print(f"Skipping {pathname}")

    def convert_vtt_to_str(self, file: str):
        """Convert one file to srt when its name ends with ".vtt"

        The extension is tested instead of searching the whole path, so
        files under a directory whose name contains ".vtt" are not
        converted by mistake. Files that cannot be decoded are reported
        and skipped.

        :file -- file to convert
        """
        if file.endswith(".vtt"):
            try:
                self.vtt_to_str.process(file, self.encoding_format)
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError: {file}")

    def _vtt_to_srt_batch(self, directory: str):
        """Walk down directory searching for vtt files

        :directory -- path to search
        """
        top_most_path = directory
        if self.enable_recursive:
            self._walk_tree(top_most_path, self.convert_vtt_to_str)
        else:
            self._walk_dir(top_most_path, self.convert_vtt_to_str)

    def convert(self):
        """Convert vtt files to srt files"""
        self._vtt_to_srt_batch(self.pathname)
269 | 
270 | 
def _show_usage():
    """Print a short usage message to standard output"""
    usage_lines = (
        "\nUsage:\tvtt_to_srt pathname [-r]\n",
        "\tpathname\t- a file or directory with files to be converted",
        "\t-r\t\t- walk path recursively\n",
    )
    for text in usage_lines:
        print(text)
276 | 
277 | 
def _parse_args():
    """Build the command-line parser and return the parsed arguments"""
    cli = argparse.ArgumentParser(
        description='Convert vtt files to srt files')
    cli.add_argument(
        "pathname", help="a file or directory with files to be converted")
    cli.add_argument(
        "-r", "--recursive", action="store_true", help="walk path recursively")
    cli.add_argument(
        "-e", "--encoding", help="encoding format for input and output files")
    return cli.parse_args()
291 | 
292 | 
def main():
    """Command-line entry point: convert the file or directory given as argument"""
    args = _parse_args()
    target = args.pathname
    recursive = args.recursive
    # Fall back to utf-8 when no encoding was given on the command line.
    encoding = args.encoding or "utf-8"

    if os.path.isfile(target):
        print(f"file being converted: {target}\n")
        ConvertFile(target, encoding).convert()
    elif os.path.isdir(target):
        print(f"directory being converted: {target}\n")
        ConvertDirectories(target, recursive, encoding).convert()
    else:
        print(f"pathname is not a file or directory: {target}\n")
        _show_usage()
315 | 
316 | 
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |

Functions

327 |
328 |
329 | def main() 330 |
331 |
332 |

main function

333 |
334 | 335 | Expand source code 336 | 337 |
def main():
    """Command-line entry point: convert the file or directory given as argument"""
    options = _parse_args()
    location = options.pathname
    walk_recursively = options.recursive
    # Fall back to utf-8 when no encoding was given on the command line.
    encoding = options.encoding if options.encoding else "utf-8"

    if os.path.isfile(location):
        print(f"file being converted: {location}\n")
        ConvertFile(location, encoding).convert()
    elif os.path.isdir(location):
        print(f"directory being converted: {location}\n")
        ConvertDirectories(location, walk_recursively, encoding).convert()
    else:
        print(f"pathname is not a file or directory: {location}\n")
        _show_usage()
359 |
360 |
361 |
362 |
363 |
364 |

Classes

365 |
366 |
367 | class ConvertDirectories 368 | (pathname: str, enable_recursive: bool, encoding_format: str) 369 |
370 |
371 |

Convert vtt files to srt files

372 |

Constructor

373 |

pathname – path to file or directory 374 | :enable_recursive – enable recursive 375 | :encoding_format – encoding format

376 |
377 | 378 | Expand source code 379 | 380 |
class ConvertDirectories:
    """Convert vtt files to srt files"""

    def __init__(self, pathname: str, enable_recursive: bool, encoding_format: str):
        """Constructor

        :pathname -- path to the directory to convert
        :enable_recursive -- descend into sub-directories when true
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.enable_recursive = enable_recursive
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def _walk_dir(self, top_most_path: str, callback):
        """Call the callback for every non-directory entry of one directory

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            if not os.path.isdir(pathname):
                # It's a file, call the callback function
                callback(pathname)

    def _walk_tree(self, top_most_path, callback):
        """Recursively descend the directory tree rooted at top_most_path,
        calling the callback function for each regular file

        :top_most_path -- parent directory
        :callback -- function to call
        """
        for file in os.listdir(top_most_path):
            pathname = os.path.join(top_most_path, file)
            mode = os.stat(pathname)[ST_MODE]
            if S_ISDIR(mode):
                # It's a directory, recurse into it
                self._walk_tree(pathname, callback)
            elif S_ISREG(mode):
                # It's a file, call the callback function
                callback(pathname)
            else:
                # Unknown file type, print a message
                print(f"Skipping {pathname}")

    def convert_vtt_to_str(self, file: str):
        """Convert one file to srt when its name ends with ".vtt"

        The extension is tested instead of searching the whole path, so
        files under a directory whose name contains ".vtt" are not
        converted by mistake. Files that cannot be decoded are reported
        and skipped.

        :file -- file to convert
        """
        if file.endswith(".vtt"):
            try:
                self.vtt_to_str.process(file, self.encoding_format)
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError: {file}")

    def _vtt_to_srt_batch(self, directory: str):
        """Walk down directory searching for vtt files

        :directory -- path to search
        """
        top_most_path = directory
        if self.enable_recursive:
            self._walk_tree(top_most_path, self.convert_vtt_to_str)
        else:
            self._walk_dir(top_most_path, self.convert_vtt_to_str)

    def convert(self):
        """Convert vtt files to srt files"""
        self._vtt_to_srt_batch(self.pathname)
452 |
453 |

Methods

454 |
455 |
456 | def convert(self) 457 |
458 |
459 |

Convert vtt files to srt files

460 |
461 | 462 | Expand source code 463 | 464 |
def convert(self):
    """Run the batch conversion on the configured pathname"""
    start_directory = self.pathname
    self._vtt_to_srt_batch(start_directory)
467 |
468 |
469 |
470 | def convert_vtt_to_str(self, file: str) 471 |
472 |
473 |

Convert vtt file to string

474 |

:file – file to convert

475 |
476 | 477 | Expand source code 478 | 479 |
def convert_vtt_to_str(self, file: str):
    """Convert one file to srt when its name ends with ".vtt"

    The extension is tested instead of searching the whole path, so files
    under a directory whose name contains ".vtt" are not converted by
    mistake. Files that cannot be decoded are reported and skipped.

    :file -- file to convert
    """
    if file.endswith(".vtt"):
        try:
            self.vtt_to_str.process(file, self.encoding_format)
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError: {file}")
489 |
490 |
491 |
492 |
493 |
494 | class ConvertFile 495 | (pathname: str, encoding_format: str) 496 |
497 |
498 |

Convert vtt file to srt file

499 |

Constructor

500 |

:pathname – path to file or directory 501 | :encoding_format – encoding format

502 |
503 | 504 | Expand source code 505 | 506 |
class ConvertFile:
    """Convert vtt file to srt file"""

    def __init__(self, pathname: str, encoding_format: str):
        """Constructor

        :pathname -- path to the file to convert
        :encoding_format -- encoding format
        """
        self.pathname = pathname
        self.encoding_format = encoding_format
        self.vtt_to_str = VttToStr()

    def convert(self):
        """Convert the file when its name ends with ".vtt"

        The extension is tested rather than searching for ".vtt" anywhere
        in the path, so a directory name containing ".vtt" does not make an
        unrelated file look like a subtitle file.
        """
        if self.pathname.endswith(".vtt"):
            self.vtt_to_str.process(self.pathname, self.encoding_format)
523 |
524 |

Methods

525 |
526 |
527 | def convert(self) 528 |
529 |
530 |

Convert vtt file to srt file

531 |
532 | 533 | Expand source code 534 | 535 |
def convert(self):
    """Convert the file when its name ends with ".vtt"

    The extension is tested rather than searching for ".vtt" anywhere in
    the path, so a directory name containing ".vtt" does not make an
    unrelated file look like a subtitle file.
    """
    if self.pathname.endswith(".vtt"):
        self.vtt_to_str.process(self.pathname, self.encoding_format)
539 |
540 |
541 |
542 |
543 |
544 | class VttToStr 545 |
546 |
547 |

Convert vtt to srt

548 |
549 | 550 | Expand source code 551 | 552 |
class VttToStr:
    """Convert vtt to srt"""

    def __init__(self) -> None:
        pass

    def convert_header(self, contents: str) -> str:
        """Remove the vtt header lines (WEBVTT, Kind, Language)

        :contents -- contents of vtt file
        """
        replacement = re.sub(r"WEBVTT\n", "", contents)
        replacement = re.sub(r"Kind:[ \-\w]+\n", "", replacement)
        replacement = re.sub(r"Language:[ \-\w]+\n", "", replacement)
        return replacement

    def add_padding_to_timestamp(self, contents: str) -> str:
        """Prefix mm:ss and ss timestamps with the missing "00:" fields

        :contents -- contents with comma-separated (srt style) timestamps
        """
        find_srt = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
        minute = r"((?:\d\d:){1}\d\d)"
        second = r"((?:\d\d:){0}\d\d)"
        padding_minute = find_srt.substitute(a=minute, b=r"(\d{0,3})")
        padding_second = find_srt.substitute(a=second, b=r"(\d{0,3})")
        replacement = re.sub(
            padding_minute, r"00:\1,\2 --> 00:\3,\4\n", contents)
        return re.sub(padding_second, r"00:00:\1,\2 --> 00:00:\3,\4\n", replacement)

    def convert_timestamp(self, contents: str) -> str:
        """Convert timestamp lines of a vtt file to srt format

        Trailing "name:value" groups on the timestamp line are dropped by
        the substitution, then short timestamps are zero-padded.

        :contents -- contents of vtt file
        """
        find_vtt = Template(r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
        all_timestamp = find_vtt.substitute(
            a=r"((?:\d\d:){0,2}\d\d)", b=r"(\d{0,3})")
        return self.add_padding_to_timestamp(re.sub(all_timestamp, r"\1,\2 --> \3,\4\n", contents))

    def convert_content(self, contents: str) -> str:
        """Convert content of vtt file to srt format

        :contents -- contents of vtt file
        """
        replacement = self.convert_timestamp(contents)
        replacement = self.convert_header(replacement)
        replacement = re.sub(r"<c[.\w\d]*>", "", replacement)
        replacement = re.sub(r"</c>", "", replacement)
        replacement = re.sub(r"<\d\d:\d\d:\d\d.\d\d\d>", "", replacement)
        replacement = re.sub(
            r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n", "", replacement)
        replacement = re.sub(r"Style:\n##\n", "", replacement)
        replacement = self.remove_simple_identifiers(replacement)
        replacement = self.add_sequence_numbers(replacement)

        return replacement

    def has_timestamp(self, content: str) -> bool:
        """Check if a line starts with a timestamp range in srt format

        :content -- single line to test
        """
        return re.match(r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})", content) is not None

    def add_sequence_numbers(self, contents: str) -> str:
        """Insert a 1-based sequence number line before every timestamp line

        Every line of the result, including the last, ends with a newline.

        :contents -- subtitle contents with srt style timestamps
        """
        numbered = []
        counter = 1
        for line in contents.split('\n'):
            if self.has_timestamp(line):
                numbered.append(str(counter))
                counter += 1
            numbered.append(line)
        # split() always yields at least one item, so this matches appending
        # a newline after every emitted line.
        return '\n'.join(numbered) + '\n'

    def remove_simple_identifiers(self, contents: str) -> str:
        """Drop digit-only lines that sit directly above a timestamp line

        :contents -- subtitle contents with srt style timestamps
        """
        out = []
        for line in contents.split('\n'):
            # out[-1] is always the previous input line; checking that out is
            # non-empty keeps a timestamp on the very first line from wrapping
            # around to the last line and popping an empty list.
            if self.has_timestamp(line) and out and re.match(r"^\d+$", out[-1]):
                out.pop()
            out.append(line)
        return '\n'.join(out)

    def write_file(self, filename: str, data, encoding_format: str = "utf-8"):
        """Create a file with some data

        If the requested path cannot be opened, the file is written under
        its bare name instead.

        :filename -- filename path
        :data -- data to write (converted with str())
        :encoding_format -- encoding format
        """
        try:
            with open(filename, "w", encoding=encoding_format) as file:
                file.write(str(data))
        except IOError:
            filename = os.path.basename(filename)
            with open(filename, "w", encoding=encoding_format) as file:
                file.write(str(data))
        print(f"file created {filename}\n")

    def read_file(self, filename: str, encoding_format: str = "utf-8"):
        """Read a text file and return its whole content

        :filename -- filename path
        :encoding_format -- encoding format
        """
        content: str = ''
        with open(filename, mode="r", encoding=encoding_format) as file:
            print(f"file being read: {filename}\n")
            content = file.read()

        return content

    def process(self, filename: str, encoding_format: str = "utf-8"):
        """Convert a vtt file to a srt file written next to it

        Only the last path component is renamed, so a ".vtt" inside a
        directory name is left alone, and the output path is never allowed
        to equal the input path.

        :filename -- path of the vtt file to convert
        :encoding_format -- encoding format
        """
        file_contents: str = self.read_file(filename, encoding_format)
        str_data: str = self.convert_content(file_contents)

        # Split off the last path component without normalising separators.
        base_name = os.path.basename(filename)
        directory = filename[:len(filename) - len(base_name)]
        stem, extension = os.path.splitext(base_name)
        if extension.lower() == ".vtt":
            base_name = stem + ".srt"
        else:
            # No trailing ".vtt": keep the historical substring replacement,
            # but restricted to the file name itself.
            base_name = base_name.replace(".vtt", ".srt")

        target = directory + base_name
        if target == filename:
            # Nothing to rename: never overwrite the input file.
            target = filename + ".srt"
        self.write_file(target, str_data, encoding_format)
686 |
687 |

Methods

688 |
689 |
690 | def add_padding_to_timestamp(self, contents: str) ‑> str 691 |
692 |
693 |

Add 00 padding to timestamps for the srt format

694 |

:contents – contents of vtt file

695 |
696 | 697 | Expand source code 698 | 699 |
def add_padding_to_timestamp(self, contents: str) -> str:
    """Left-pad mm:ss and ss timestamps with the missing "00:" fields

    :contents -- contents with comma-separated (srt style) timestamps
    """
    cue_line = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
    millis = r"(\d{0,3})"
    # (pattern for the clock part, replacement restoring the missing fields);
    # the mm:ss rule runs before the ss rule.
    rules = (
        (r"((?:\d\d:){1}\d\d)", r"00:\1,\2 --> 00:\3,\4\n"),
        (r"((?:\d\d:){0}\d\d)", r"00:00:\1,\2 --> 00:00:\3,\4\n"),
    )
    padded = contents
    for clock, fixed in rules:
        padded = re.sub(cue_line.substitute(a=clock, b=millis), fixed, padded)
    return padded
712 |
713 |
714 |
715 | def add_sequence_numbers(self, contents: str) ‑> str 716 |
717 |
718 |

Adds sequence numbers to subtitle contents and returns new subtitle contents

719 |

:contents – contents of vtt file

720 |
721 | 722 | Expand source code 723 | 724 |
def add_sequence_numbers(self, contents: str) -> str:
    """Insert a 1-based sequence number line before every timestamp line

    :contents -- subtitle contents with srt style timestamps
    """
    pieces = []
    next_number = 1
    for row in contents.split('\n'):
        if self.has_timestamp(row):
            pieces.append(str(next_number) + '\n')
            next_number += 1
        pieces.append(row + '\n')
    return ''.join(pieces)
738 |
739 |
740 |
741 | def convert_content(self, contents: str) ‑> str 742 |
743 |
744 |

Convert content of vtt file to srt format

745 |

:contents – contents of vtt file

746 |
747 | 748 | Expand source code 749 | 750 |
def convert_content(self, contents: str) -> str:
    """Run the full vtt-to-srt text conversion

    Timestamps and header are converted first, then the markup patterns
    below are removed in order, and finally identifiers are dropped and
    sequence numbers added.

    :contents -- contents of vtt file
    """
    text = self.convert_header(self.convert_timestamp(contents))
    removable = (
        r"<c[.\w\d]*>",
        r"</c>",
        r"<\d\d:\d\d:\d\d.\d\d\d>",
        r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n",
        r"Style:\n##\n",
    )
    for pattern in removable:
        text = re.sub(pattern, "", text)
    return self.add_sequence_numbers(self.remove_simple_identifiers(text))
767 |
768 |
769 |
770 | def convert_header(self, contents: str) ‑> str 771 |
772 |
773 |

Convert the vtt header to srt format

774 |

:contents – contents of vtt file

775 |
776 | 777 | Expand source code 778 | 779 |
def convert_header(self, contents: str) -> str:
    """Remove the vtt header lines (WEBVTT, Kind, Language)

    :contents -- contents of vtt file
    """
    stripped = contents
    for header_line in (r"WEBVTT\n", r"Kind:[ \-\w]+\n", r"Language:[ \-\w]+\n"):
        stripped = re.sub(header_line, "", stripped)
    return stripped
788 |
789 |
790 |
791 | def convert_timestamp(self, contents: str) ‑> str 792 |
793 |
794 |

Convert timestamp of vtt file to srt format

795 |

:contents – contents of vtt file

796 |
797 | 798 | Expand source code 799 | 800 |
def convert_timestamp(self, contents: str) -> str:
    """Rewrite vtt timestamp lines in srt form, then zero-pad them

    :contents -- contents of vtt file
    """
    cue_line = Template(r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n')
    vtt_cue = cue_line.substitute(a=r"((?:\d\d:){0,2}\d\d)", b=r"(\d{0,3})")
    with_commas = re.sub(vtt_cue, r"\1,\2 --> \3,\4\n", contents)
    return self.add_padding_to_timestamp(with_commas)
809 |
810 |
811 |
812 | def has_timestamp(self, content: str) ‑> bool 813 |
814 |
815 |

Check if line is a timestamp srt format

816 |

:contents – contents of vtt file

817 |
818 | 819 | Expand source code 820 | 821 |
def has_timestamp(self, content: str) -> bool:
    """Tell whether a line starts with an srt-style timestamp range

    :content -- single line of text to test
    """
    srt_range = r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})"
    found = re.match(srt_range, content)
    return found is not None
827 |
828 |
829 |
830 | def process(self, filename: str, encoding_format: str = 'utf-8') 831 |
832 |
833 |

Convert vtt file to a srt file

834 |

:str_name_file – filename path 835 | :encoding_format – encoding format

836 |
837 | 838 | Expand source code 839 | 840 |
def process(self, filename: str, encoding_format: str = "utf-8"):
    """Convert a vtt file to a srt file written next to it

    Only the last path component is renamed, so a ".vtt" that appears in a
    directory name no longer redirects the output. The output path is also
    never allowed to equal the input path.

    :filename -- path of the vtt file to convert
    :encoding_format -- encoding used to read the input and write the output
    """
    file_contents: str = self.read_file(filename, encoding_format)
    str_data: str = self.convert_content(file_contents)

    # Split off the last path component without normalising separators.
    base_name = os.path.basename(filename)
    directory = filename[:len(filename) - len(base_name)]
    stem, extension = os.path.splitext(base_name)
    if extension.lower() == ".vtt":
        base_name = stem + ".srt"
    else:
        # No trailing ".vtt": keep the historical substring replacement,
        # but restricted to the file name itself.
        base_name = base_name.replace(".vtt", ".srt")

    target = directory + base_name
    if target == filename:
        # Nothing to rename: append the extension instead of clobbering
        # the input file with the converted text.
        target = filename + ".srt"
    self.write_file(target, str_data, encoding_format)
851 |
852 |
853 |
854 | def read_file(self, filename: str, encoding_format: str = 'utf-8') 855 |
856 |
857 |

Read a file text

858 |

:filename – filename path 859 | :encoding_format – encoding format

860 |
861 | 862 | Expand source code 863 | 864 |
def read_file(self, filename: str, encoding_format: str = "utf-8"):
    """Load the whole text of a file

    :filename -- path of the file to read
    :encoding_format -- encoding used to decode the file
    """
    with open(filename, mode="r", encoding=encoding_format) as source:
        print(f"file being read: {filename}\n")
        text = source.read()
    return text
876 |
877 |
878 |
879 | def remove_simple_identifiers(self, contents: str) ‑> str 880 |
881 |
882 |

Remove simple identifiers of vtt file

883 |

:contents – contents of vtt file

884 |
885 | 886 | Expand source code 887 | 888 |
def remove_simple_identifiers(self, contents: str) -> str:
    """Drop digit-only lines that sit directly above a timestamp line

    :contents -- subtitle contents with srt style timestamps
    """
    out = []
    for line in contents.split('\n'):
        # out[-1] is always the previous input line; checking that out is
        # non-empty keeps a timestamp on the very first line from wrapping
        # around to the last line and popping an empty list.
        if self.has_timestamp(line) and out and re.match(r"^\d+$", out[-1]):
            out.pop()
        out.append(line)
    return '\n'.join(out)
901 |
902 |
903 |
904 | def write_file(self, filename: str, data, encoding_format: str = 'utf-8') 905 |
906 |
907 |

Create a file with some data

908 |

:filename – filename path 909 | :data – data to write 910 | :encoding_format – encoding format

911 |
912 | 913 | Expand source code 914 | 915 |
def write_file(self, filename: str, data, encoding_format: str = "utf-8"):
    """Create a file with some data

    If the file cannot be created at the requested path, the data is
    written to a file of the same base name in the current working
    directory instead.

    :filename -- filename path
    :data -- data to write (converted with str())
    :encoding_format -- encoding format
    """
    text = str(data)
    try:
        with open(filename, "w", encoding=encoding_format) as file:
            file.write(text)
    except IOError:
        # Best-effort fallback: drop the directory part and retry in the
        # current directory. basename also strips "/"-separated directories
        # on platforms whose os.sep is a backslash.
        filename = os.path.basename(filename)
        with open(filename, "w", encoding=encoding_format) as file:
            file.write(text)
    print(f"file created {filename}\n")
930 |
931 |
932 |
933 |
934 |
935 |
936 |
937 | 982 |
983 | 986 | 987 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsonzilla/vtt_to_srt3/c1032e45aaaad0de121e7e039814a3be4ac94849/requirements.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open(file="README.md", mode="r", encoding="utf-8") as fh: 4 | 5 | long_description = fh.read() 6 | 7 | setuptools.setup(name='vtt_to_srt3', 8 | version='0.2.0.4', 9 | author="Jeison Cardoso", 10 | author_email="j@jsonzilla.com", 11 | maintainer="Jeison Cardoso", 12 | description="vtt to srt subtitles converter package", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/jsonzilla/vtt_to_srt3", 16 | packages=setuptools.find_packages(exclude=["tests", "tests.*"]), 17 | classifiers=["Programming Language :: Python :: 3.7", 18 | "Programming Language :: Python :: 3.8", 19 | "Programming Language :: Python :: 3.9", 20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Operating System :: OS Independent"], 23 | entry_points={ 24 | "console_scripts": 25 | ["vtt_to_srt=vtt_to_srt.vtt_to_srt:main"] 26 | }, 27 | ) 28 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Jeison Cardoso 3 | 4 | import os 5 | import sys 6 | 7 | sys.path.append(os.path.realpath(os.path.dirname(__file__) + "/..")) 8 | -------------------------------------------------------------------------------- /tests/idd.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 
00:00:08.393 --> 00:00:10.437 5 | ♪ ♪ 6 | 7 | 2 8 | 00:00:11.688 --> 00:00:14.941 9 | Narrator: blaba 10 | 2 11 | 12 | 13 | 3 14 | 00:00:15.024 --> 00:00:15.817 15 | bla 16 | 17 | 4 18 | 00:00:15.900 --> 00:00:18.820 19 | blaba, 20 | 21 | 5 22 | 00:00:18.903 --> 00:00:21.573 23 | blaba. 24 | 25 | 6 26 | 00:00:22.907 --> 00:00:25.910 27 | oh 28 | ah 29 | 30 | 7 31 | 00:00:25.994 --> 00:00:28.371 32 | blaba 33 | 34 | 35 | 8 36 | 00:00:28.455 --> 00:00:32.125 37 | blaba 38 | blaba 39 | 40 | 41 | 9 42 | 00:00:32.208 --> 00:00:34.002 43 | blaba 44 | 1010 45 | -------------------------------------------------------------------------------- /tests/idd_format.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:08.393 --> 00:00:10.437 5 | ♪ ♪ 6 | 7 | 2 8 | 00:00:11.688 --> 00:00:14.941 9 | Narrator: blaba 10 | 2 11 | 12 | 13 | 3 14 | 00:00:15.024 --> 00:00:15.817 15 | bla 16 | 17 | 4 18 | 00:00:15.900 --> 00:00:18.820 19 | blaba, 20 | 21 | 5 22 | 00:00:18.903 --> 00:00:21.573 23 | blaba. 24 | 25 | 6 26 | 00:00:22.907 --> 00:00:25.910 27 | oh 28 | ah 29 | 30 | 7 31 | 00:00:25.994 --> 00:00:28.371 32 | blaba 33 | 34 | 35 | 8 36 | 00:00:28.455 --> 00:00:32.125 37 | blaba 38 | blaba 39 | 40 | 41 | 9 42 | 00:00:32.208 --> 00:00:34.002 43 | blaba 44 | 1010 45 | -------------------------------------------------------------------------------- /tests/input_alternative_iso-8859-2.vtt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsonzilla/vtt_to_srt3/c1032e45aaaad0de121e7e039814a3be4ac94849/tests/input_alternative_iso-8859-2.vtt -------------------------------------------------------------------------------- /tests/input_alternative_utf8.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 00:01.000 --> 00:04.000 4 | - Never drink liquid nitrogen. 
5 | 6 | 00:05.000 --> 00:09.000 7 | - It will perforate your stomach. 8 | - You could die. 9 | -------------------------------------------------------------------------------- /tests/input_iso-8859-2.vtt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsonzilla/vtt_to_srt3/c1032e45aaaad0de121e7e039814a3be4ac94849/tests/input_iso-8859-2.vtt -------------------------------------------------------------------------------- /tests/input_utf8.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 00:01.000 --> 00:04.000 4 | - Never drink liquid nitrogen. 5 | 6 | 00:05.000 --> 00:09.000 7 | - It will perforate your stomach. 8 | - You could die. 9 | -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Jeison Cardoso 3 | 4 | 5 | import os 6 | import pytest 7 | 8 | 9 | def _clean(): 10 | """Remove all files with .srt extension without valid_output in name recursively""" 11 | for root, _, files in os.walk(os.path.dirname(__file__)): 12 | for file in files: 13 | if file.endswith(".srt") and "valid_output" not in file: 14 | os.remove(os.path.join(root, file)) 15 | 16 | 17 | @pytest.fixture(autouse=True, scope="module") 18 | def clean_files(): 19 | """Clean files""" 20 | _clean() 21 | yield 22 | _clean() 23 | 24 | 25 | def concat_path(pathname): 26 | """Concat path to file for unix and windows""" 27 | return os.path.join(os.path.dirname(__file__), pathname) 28 | 29 | 30 | def equals_files(file_a, file_b, encoding): 31 | """Compare two text files independently of line endings""" 32 | with open(concat_path(file_a), "r", encoding=encoding) as file_a: 33 | with open(concat_path(file_b), "r", encoding=encoding) as file_b: 34 | a = file_a.read() 35 | b = file_b.read() 36 | return a == b 37 | 
-------------------------------------------------------------------------------- /tests/test_convert_directory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Jeison Cardoso 3 | 4 | import os 5 | import pytest 6 | 7 | from test_base import concat_path, equals_files, clean_files 8 | from vtt_to_srt.vtt_to_srt import ConvertDirectories 9 | 10 | 11 | class TestConvertDirectories: 12 | """Test ConvertFile class""" 13 | 14 | def test_convert_directory(self, clean_files): 15 | """Test convert file""" 16 | convert_file = ConvertDirectories( 17 | concat_path("."), False, "utf-8") 18 | convert_file.convert() 19 | 20 | assert equals_files("input_alternative_utf8.srt", 21 | "valid_output_utf8.srt", "utf-8") 22 | 23 | def test_convert_directory_recursive(self, clean_files): 24 | """Test convert file""" 25 | convert_file = ConvertDirectories( 26 | concat_path("."), True, "utf-8") 27 | convert_file.convert() 28 | 29 | assert equals_files("input_alternative_utf8.srt", 30 | "valid_output_utf8.srt", "utf-8") 31 | -------------------------------------------------------------------------------- /tests/test_convert_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Jeison Cardoso 3 | 4 | import os 5 | import pytest 6 | 7 | from test_base import concat_path, equals_files, clean_files 8 | from vtt_to_srt.vtt_to_srt import ConvertFile 9 | 10 | 11 | class TestConvertFile: 12 | """Test ConvertFile class""" 13 | 14 | def test_convert_file(self, clean_files): 15 | """Test convert file""" 16 | convert_file = ConvertFile( 17 | concat_path("input_utf8.vtt"), "utf-8") 18 | convert_file.convert() 19 | 20 | assert equals_files("input_utf8.srt", 21 | "valid_output_utf8.srt", "utf-8") 22 | 23 | def test_convert_file_with_simple_identifier(self, clean_files): 24 | """Test convert file with simple identifier""" 25 | convert_file = ConvertFile(concat_path("idd.vtt"), 
"utf-8") 26 | convert_file.convert() 27 | 28 | assert equals_files("idd.srt", 29 | "valid_output_idd.srt", "utf-8") 30 | 31 | def test_convert_file_not_utf8(self, clean_files): 32 | """Test convert file with not utf-8 encoding""" 33 | convert_file = ConvertFile( 34 | concat_path("input_iso-8859-2.vtt"), "ISO-8859-2") 35 | convert_file.convert() 36 | 37 | assert equals_files("input_iso-8859-2.srt", 38 | "valid_output_iso-8859-2.srt", "ISO-8859-2") 39 | 40 | def test_convert_file_no_format(self, clean_files): 41 | """ Test convert file with remove format tags """ 42 | convert_file = ConvertFile(concat_path("idd_format.vtt"), "utf-8", True) 43 | convert_file.convert() 44 | 45 | assert equals_files("idd_format.srt", 46 | "valid_output_idd_format.srt", "utf-8") 47 | 48 | -------------------------------------------------------------------------------- /tests/test_vtt_to_str.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Jeison Cardoso 3 | 4 | import pytest 5 | 6 | from vtt_to_srt.vtt_to_srt import VttToStr 7 | 8 | 9 | class TestVttToStr: 10 | def test_convert_header(self): 11 | assert repr(VttToStr().convert_header( 12 | "WEBVTT\nKind: captions\nLanguage: zh-TW")) == repr("Language: zh-TW") 13 | 14 | def test_convert_timestamp(self): 15 | vtt_to_str = VttToStr() 16 | assert repr(vtt_to_str.convert_timestamp("00:03:08.500 --> 00:03:15.300\n") 17 | ) == repr("00:03:08,500 --> 00:03:15,300\n") 18 | assert repr(vtt_to_str.convert_timestamp("03:08.500 --> 03:15.300\n") 19 | ) == repr("00:03:08,500 --> 00:03:15,300\n") 20 | assert repr(vtt_to_str.convert_timestamp("08.500 --> 15.300\n") 21 | ) == repr("00:00:08,500 --> 00:00:15,300\n") 22 | 23 | def test_not_add_sequence_before(self): 24 | vtt_to_str = VttToStr() 25 | assert repr(vtt_to_str.add_sequence_numbers("What you got, a billion could've never bought (oooh)")) == repr( 26 | "What you got, a billion could've never bought (oooh)\n") 27 | assert 
repr(vtt_to_str.add_sequence_numbers("") 28 | ) == repr("\n") 29 | assert repr(vtt_to_str.add_sequence_numbers("告訴你,今晚我想帶你出去。")) == repr( 30 | "告訴你,今晚我想帶你出去。\n") 31 | assert repr(vtt_to_str.add_sequence_numbers("Hi --> MAX") 32 | ) == repr("Hi --> MAX\n") 33 | 34 | def test_add_sequence_before_timestamp(self): 35 | vtt_to_str = VttToStr() 36 | assert repr(vtt_to_str.add_sequence_numbers("00:03:08,500 --> 00:03:15,300") 37 | ) == repr("1\n00:03:08,500 --> 00:03:15,300\n") 38 | 39 | def test_convert_empty_return_newline(self): 40 | vtt_to_str = VttToStr() 41 | assert repr(vtt_to_str.convert_content("")) == repr("\n") 42 | 43 | def test_convert_header_language(self): 44 | vtt_to_str = VttToStr() 45 | assert repr(vtt_to_str.convert_content("WEBVTT\nKind: captions\nLanguage: zh-TW") 46 | ) == repr("Language: zh-TW\n") 47 | 48 | def test_text(self): 49 | vtt_to_str = VttToStr() 50 | assert repr(vtt_to_str.convert_content("告訴你,今晚我想帶你出去。")) == repr( 51 | "告訴你,今晚我想帶你出去。\n") 52 | assert repr(vtt_to_str.convert_content("What you got, a billion could've never bought (oooh)")) == repr( 53 | "What you got, a billion could've never bought (oooh)\n") 54 | -------------------------------------------------------------------------------- /tests/valid_output_idd.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:08,393 --> 00:00:10,437 3 | ♪ ♪ 4 | 5 | 2 6 | 00:00:11,688 --> 00:00:14,941 7 | Narrator: blaba 8 | 2 9 | 10 | 3 11 | 00:00:15,024 --> 00:00:15,817 12 | bla 13 | 14 | 4 15 | 00:00:15,900 --> 00:00:18,820 16 | blaba, 17 | 18 | 5 19 | 00:00:18,903 --> 00:00:21,573 20 | blaba. 
21 | 22 | 6 23 | 00:00:22,907 --> 00:00:25,910 24 | oh 25 | ah 26 | 27 | 7 28 | 00:00:25,994 --> 00:00:28,371 29 | blaba 30 | 31 | 8 32 | 00:00:28,455 --> 00:00:32,125 33 | blaba 34 | blaba 35 | 36 | 9 37 | 00:00:32,208 --> 00:00:34,002 38 | blaba 39 | 1010 40 | -------------------------------------------------------------------------------- /tests/valid_output_idd_format.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:08,393 --> 00:00:10,437 3 | ♪ ♪ 4 | 5 | 2 6 | 00:00:11,688 --> 00:00:14,941 7 | Narrator: blaba 8 | 2 9 | 10 | 3 11 | 00:00:15,024 --> 00:00:15,817 12 | bla 13 | 14 | 4 15 | 00:00:15,900 --> 00:00:18,820 16 | blaba, 17 | 18 | 5 19 | 00:00:18,903 --> 00:00:21,573 20 | blaba. 21 | 22 | 6 23 | 00:00:22,907 --> 00:00:25,910 24 | oh 25 | ah 26 | 27 | 7 28 | 00:00:25,994 --> 00:00:28,371 29 | blaba 30 | 31 | 8 32 | 00:00:28,455 --> 00:00:32,125 33 | blaba 34 | blaba 35 | 36 | 9 37 | 00:00:32,208 --> 00:00:34,002 38 | blaba 39 | 1010 40 | -------------------------------------------------------------------------------- /tests/valid_output_iso-8859-2.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsonzilla/vtt_to_srt3/c1032e45aaaad0de121e7e039814a3be4ac94849/tests/valid_output_iso-8859-2.srt -------------------------------------------------------------------------------- /tests/valid_output_utf8.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:01,000 --> 00:00:04,000 3 | - Never drink liquid nitrogen. 4 | 5 | 2 6 | 00:00:05,000 --> 00:00:09,000 7 | - It will perforate your stomach. 8 | - You could die. 
9 | -------------------------------------------------------------------------------- /vtt_to_srt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsonzilla/vtt_to_srt3/c1032e45aaaad0de121e7e039814a3be4ac94849/vtt_to_srt/__init__.py -------------------------------------------------------------------------------- /vtt_to_srt/vtt_to_srt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Jansen A. Simanullang / Jeison Cardoso 3 | 4 | """Convert of vtt to srt format""" 5 | 6 | import os 7 | import re 8 | import argparse 9 | from string import Template 10 | from stat import S_ISDIR, ST_MODE, S_ISREG 11 | 12 | 13 | class VttToStr: 14 | """Convert vtt to srt""" 15 | 16 | def __init__(self) -> None: 17 | pass 18 | 19 | def convert_header(self, contents: str) -> str: 20 | """Convert of vtt header to srt format 21 | 22 | :contents -- contents of vtt file 23 | """ 24 | replacement = re.sub(r"WEBVTT\n", "", contents) 25 | replacement = re.sub(r"Kind:[ \-\w]+\n", "", replacement) 26 | replacement = re.sub(r"Language:[ \-\w]+\n", "", replacement) 27 | return replacement 28 | 29 | def add_padding_to_timestamp(self, contents: str) -> str: 30 | """Add 00 to padding timestamp of to srt format 31 | 32 | :contents -- contents of vtt file 33 | """ 34 | find_srt = Template(r'$a,$b --> $a,$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n') 35 | minute = r"((?:\d\d:){1}\d\d)" 36 | second = r"((?:\d\d:){0}\d\d)" 37 | padding_minute = find_srt.substitute(a=minute, b=r"(\d{0,3})") 38 | padding_second = find_srt.substitute(a=second, b=r"(\d{0,3})") 39 | replacement = re.sub( 40 | padding_minute, r"00:\1,\2 --> 00:\3,\4\n", contents) 41 | return re.sub(padding_second, r"00:00:\1,\2 --> 00:00:\3,\4\n", replacement) 42 | 43 | def convert_timestamp(self, contents: str) -> str: 44 | """Convert timestamp of vtt file to srt format 45 | 46 | :contents -- contents of vtt file 47 | 
""" 48 | find_vtt = Template(r'$a.$b --> $a.$b(?:[ \-\w]+:[\w\%\d:,.]+)*\n') 49 | all_timestamp = find_vtt.substitute( 50 | a=r"((?:\d\d:){0,2}\d\d)", b=r"(\d{0,3})") 51 | return self.add_padding_to_timestamp(re.sub(all_timestamp, r"\1,\2 --> \3,\4\n", contents)) 52 | 53 | def convert_content(self, contents: str, remove_format: bool = False) -> str: 54 | """Convert content of vtt file to srt format 55 | 56 | :contents -- contents of vtt file 57 | """ 58 | replacement = self.convert_timestamp(contents) 59 | replacement = self.convert_header(replacement) 60 | replacement = re.sub(r"", "", replacement) 61 | replacement = re.sub(r"", "", replacement) 62 | replacement = re.sub(r"<\d\d:\d\d:\d\d.\d\d\d>", "", replacement) 63 | replacement = re.sub( 64 | r"::[\-\w]+\([\-.\w\d]+\)[ ]*{[.,:;\(\) \-\w\d]+\n }\n", "", replacement) 65 | replacement = re.sub(r"Style:\n##\n", "", replacement) 66 | if remove_format: 67 | replacement = re.sub(r"<[^>]*>", "", replacement) 68 | replacement = self.remove_blank_lines(replacement) 69 | replacement = self.remove_simple_identifiers(replacement) 70 | replacement = self.add_sequence_numbers(replacement) 71 | 72 | return replacement 73 | 74 | def has_timestamp(self, content: str) -> bool: 75 | """Check if line is a timestamp srt format 76 | 77 | :contents -- contents of vtt file 78 | """ 79 | return re.match(r"((\d\d:){2}\d\d),(\d{3}) --> ((\d\d:){2}\d\d),(\d{3})", content) is not None 80 | 81 | def add_sequence_numbers(self, contents: str) -> str: 82 | """Adds sequence numbers to subtitle contents and returns new subtitle contents 83 | 84 | :contents -- contents of vtt file 85 | """ 86 | lines = contents.split('\n') 87 | out = '' 88 | counter = 1 89 | for line in lines: 90 | if self.has_timestamp(line): 91 | out += str(counter) + '\n' 92 | counter += 1 93 | out += line + '\n' 94 | return out 95 | 96 | def remove_blank_lines(self, contents: str) -> str: 97 | # Remove useless blank lines from the vtt file 98 | lines = contents.split('\n') 99 
| lines = [x for x in lines if x != ''] 100 | lines.append('') 101 | out = [] 102 | num = 0 103 | while num < len(lines) : 104 | if re.match(r"^\d+$", lines[num]) and self.has_timestamp(lines[num + 1]): 105 | if num == 0 : 106 | pass 107 | else: 108 | out.append('') 109 | out.append(lines[num]) 110 | out.append(lines[num + 1]) 111 | num += 2 112 | elif self.has_timestamp(lines[num]): 113 | if num == 0 : 114 | pass 115 | else : 116 | out.append('') 117 | out.append(lines[num]) 118 | num += 1 119 | else: 120 | out.append(lines[num]) 121 | num += 1 122 | out.pop() 123 | return '\n'.join(out) 124 | 125 | def remove_simple_identifiers(self, contents: str) -> str: 126 | """Remove simple identifiers of vtt file 127 | 128 | :contents -- contents of vtt file 129 | """ 130 | lines = contents.split('\n') 131 | out = [] 132 | for i, line in enumerate(lines): 133 | if self.has_timestamp(line): 134 | if re.match(r"^\d+$", lines[i - 1]): 135 | out.pop() 136 | out.append(line) 137 | return '\n'.join(out) 138 | 139 | def write_file(self, filename: str, data, encoding_format: str = "utf-8"): 140 | """Create a file with some data 141 | 142 | :filename -- filename pat 143 | :data -- data to write 144 | :encoding_format -- encoding format 145 | """ 146 | try: 147 | with open(filename, "w", encoding=encoding_format) as file: 148 | file.writelines(str(data)) 149 | except IOError: 150 | filename = filename.split(os.sep)[-1] 151 | with open(filename, "w", encoding=encoding_format) as file: 152 | file.writelines(str(data)) 153 | print(f"file created {filename}\n") 154 | 155 | def read_file(self, filename: str, encoding_format: str = "utf-8"): 156 | """Read a file text 157 | 158 | :filename -- filename path 159 | :encoding_format -- encoding format 160 | """ 161 | content: str = '' 162 | with open(filename, mode="r", encoding=encoding_format) as file: 163 | print(f"file being read: {filename}\n") 164 | content = file.read() 165 | 166 | return content 167 | 168 | def process(self, filename: 
str, remove_format : bool, encoding_format: str = "utf-8"): 169 | """Convert vtt file to a srt file 170 | 171 | :str_name_file -- filename path 172 | :encoding_format -- encoding format 173 | """ 174 | file_contents: str = self.read_file(filename, encoding_format) 175 | str_data: str = "" 176 | str_data = str_data + self.convert_content(file_contents, remove_format) 177 | filename = filename.replace(".vtt", ".srt") 178 | self.write_file(filename, str_data, encoding_format) 179 | 180 | 181 | class ConvertFile: 182 | """Convert vtt file to srt file""" 183 | 184 | def __init__(self, pathname: str, encoding_format: str, remove_format: bool = False): 185 | """Constructor 186 | 187 | :pathname -- path to file or directory 188 | :encoding_format -- encoding format 189 | """ 190 | self.pathname = pathname 191 | self.encoding_format = encoding_format 192 | self.remove_format = remove_format 193 | self.vtt_to_str = VttToStr() 194 | 195 | def convert(self): 196 | """Convert vtt file to srt file""" 197 | if ".vtt" in self.pathname: 198 | self.vtt_to_str.process(self.pathname,self.remove_format, self.encoding_format) 199 | 200 | 201 | class ConvertDirectories: 202 | """Convert vtt files to srt files""" 203 | 204 | def __init__(self, pathname: str, enable_recursive: bool, encoding_format: str, remove_format: bool = False): 205 | """Constructor 206 | 207 | pathname -- path to file or directory 208 | :enable_recursive -- enable recursive 209 | :encoding_format -- encoding format 210 | """ 211 | self.pathname = pathname 212 | self.enable_recursive = enable_recursive 213 | self.encoding_format = encoding_format 214 | self.remove_format = remove_format 215 | self.vtt_to_str = VttToStr() 216 | 217 | def _walk_dir(self, top_most_path: str, callback): 218 | """Walk a directory 219 | 220 | :top_most_path -- parent directory 221 | :callback -- function to call 222 | """ 223 | for file in os.listdir(top_most_path): 224 | pathname = os.path.join(top_most_path, file) 225 | if not 
os.path.isdir(pathname): 226 | # It"s a file, call the callback function 227 | callback(pathname) 228 | 229 | def _walk_tree(self, top_most_path, callback): 230 | """Recursively descend the directory tree rooted at top_most_path, 231 | calling the callback function for each regular file 232 | 233 | :top_most_path -- parent directory 234 | :callback -- function to call 235 | """ 236 | for file in os.listdir(top_most_path): 237 | pathname = os.path.join(top_most_path, file) 238 | mode = os.stat(pathname)[ST_MODE] 239 | if S_ISDIR(mode): 240 | # It's a directory, recurse into it 241 | self._walk_tree(pathname, callback) 242 | elif S_ISREG(mode): 243 | # It's a file, call the callback function 244 | callback(pathname) 245 | else: 246 | # Unknown file type, print a message 247 | print(f"Skipping {pathname}") 248 | 249 | def convert_vtt_to_str(self, file: str): 250 | """Convert vtt file to string 251 | 252 | :file -- file to convert 253 | """ 254 | if ".vtt" in file: 255 | try: 256 | self.vtt_to_str.process(file, self.remove_format, self.encoding_format) 257 | except UnicodeDecodeError: 258 | print(f"UnicodeDecodeError: {file}") 259 | 260 | def _vtt_to_srt_batch(self, directory: str): 261 | """Walk down directory searching for vtt files 262 | 263 | :directory -- path to search 264 | """ 265 | top_most_path = directory 266 | if self.enable_recursive: 267 | self._walk_tree(top_most_path, self.convert_vtt_to_str) 268 | else: 269 | self._walk_dir(top_most_path, self.convert_vtt_to_str) 270 | 271 | def convert(self): 272 | """Convert vtt files to srt files""" 273 | self._vtt_to_srt_batch(self.pathname) 274 | 275 | 276 | def _show_usage(): 277 | """Show a info message about the usage""" 278 | print("\nUsage:\tvtt_to_srt pathname [-r]\n") 279 | print("\tpathname\t- a file or directory with files to be converted") 280 | print("\t-r\t\t- walk path recursively\n") 281 | print("\t-rf\t\t- remove the format tags like bold & italic from output files\n") 282 | 283 | 284 | def 
_parse_args(): 285 | """Parse command line arguments""" 286 | parser = argparse.ArgumentParser( 287 | description='Convert vtt files to srt files') 288 | parser.add_argument( 289 | "pathname", help="a file or directory with files to be converted") 290 | parser.add_argument("-r", "--recursive", 291 | help="walk path recursively", action="store_true") 292 | parser.add_argument("-e", "--encoding", 293 | help="encoding format for input and output files") 294 | parser.add_argument("-rf", "--remove_format", 295 | help="remove the format tags like bold & italic from output files", action="store_true") 296 | 297 | args = parser.parse_args() 298 | return args 299 | 300 | 301 | def main(): 302 | """main function""" 303 | 304 | args = _parse_args() 305 | pathname = args.pathname 306 | recursive = args.recursive 307 | encoding = args.encoding 308 | remove_format = args.remove_format 309 | 310 | if not encoding: 311 | encoding = "utf-8" 312 | 313 | if os.path.isfile(pathname): 314 | print(f"file being converted: {pathname}\n") 315 | ConvertFile(pathname, encoding, remove_format).convert() 316 | 317 | if os.path.isdir(pathname): 318 | print(f"directory being converted: {pathname}\n") 319 | ConvertDirectories(pathname, recursive, encoding, remove_format).convert() 320 | 321 | if not os.path.isfile(pathname) and not os.path.isdir(pathname): 322 | print(f"pathname is not a file or directory: {pathname}\n") 323 | _show_usage() 324 | 325 | 326 | if __name__ == "__main__": 327 | main() 328 | --------------------------------------------------------------------------------