├── srt_tools ├── __init__.py ├── tests │ ├── __init__.py │ ├── files │ │ ├── gb2312.srt │ │ └── ascii.srt │ └── test_srt_tools.py ├── srt-normalise ├── srt-fixed-timeshift ├── srt ├── srt-play ├── srt-process ├── srt-lines-matching ├── README.rst ├── srt-deduplicate ├── srt-linear-timeshift ├── srt-mux └── utils.py ├── docs ├── requirements.txt ├── api.rst ├── index.rst ├── conf.py └── quickstart.rst ├── .coveragerc ├── MANIFEST.in ├── tests ├── requirements.txt └── test_srt.py ├── LICENSE ├── .github └── workflows │ └── ci.yml ├── tox.ini ├── setup.py ├── README.rst └── srt.py /srt_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /srt_tools/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.* 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | relative_files = True 3 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API documentation 2 | ================= 3 | 4 | .. 
automodule:: srt 5 | :members: 6 | -------------------------------------------------------------------------------- /srt_tools/tests/files/gb2312.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdown/srt/HEAD/srt_tools/tests/files/gb2312.srt -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include MANIFEST.in 3 | include README.rst 4 | recursive-include docs * 5 | recursive-include tests * 6 | 7 | recursive-exclude * *.py[co] 8 | recursive-exclude * __pycache__ 9 | -------------------------------------------------------------------------------- /srt_tools/tests/files/ascii.srt: -------------------------------------------------------------------------------- 1 | 2 2 | 00:00:27,000 --> 00:00:30,730 3 | ascii 4 | 5 | 4 6 | 00:00:31,500 --> 00:00:34,100 7 | oh look 8 | 9 | 6 10 | 00:00:34,100 --> 00:00:36,570 11 | ascii everywhere 12 | 13 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==4.*; python_version < '3.0' 2 | pytest==6.*; python_version >= '3.0' 3 | pytest-xdist==1.*; python_version < '3.0' 4 | pytest-xdist==2.*; python_version >= '3.0' 5 | pytest-cov==2.* 6 | hypothesis==4.*; python_version < '3.6' 7 | hypothesis==6.*; python_version >= '3.6' 8 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | `srt`: Parse SubRip files 2 | ========================= 3 | 4 | srt_ is a tiny Python library for parsing, modifying, and composing SRT files. 5 | 6 | .. _srt: https://github.com/cdown/srt 7 | 8 | Documentation 9 | ============= 10 | 11 | .. 
def main():
    """Entry point for the ``srt normalise`` command.

    Builds the shared argument parser (passing ``hide_no_strict=True``),
    configures logging from ``args.log_level``, recomposes ``args.input`` with
    ``srt_tools.utils.compose_suggest_on_fail`` and writes the resulting text
    to ``args.output``.
    """
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples={"Normalise a subtitle": "srt normalise -i bad.srt -o good.srt"},
        hide_no_strict=True,
    )
    args = parser.parse_args()
    logging.basicConfig(level=args.log_level)
    # NOTE(review): set_basic_args is defined in srt_tools.utils, which is not
    # visible here; presumably it prepares args.input / args.output -- confirm.
    srt_tools.utils.set_basic_args(args)
    normalised = srt_tools.utils.compose_suggest_on_fail(
        args.input, strict=args.strict
    )

    try:
        args.output.write(normalised)
    except (UnicodeEncodeError, TypeError):
        # Python 2 fallback: retry with the text encoded as args.encoding.
        args.output.write(normalised.encode(args.encoding))
latex_documents = [("index", "srt.tex", "srt Documentation", "Chris Down", "manual")] 26 | 27 | man_pages = [("index", "srt", "srt Documentation", ["Chris Down"], 1)] 28 | 29 | texinfo_documents = [ 30 | ( 31 | "index", 32 | "srt", 33 | "srt Documentation", 34 | "Chris Down", 35 | "srt", 36 | "One line description of project.", 37 | "Miscellaneous", 38 | ) 39 | ] 40 | 41 | intersphinx_mapping = {"python": ("https://docs.python.org/3.8", None)} 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2014-present Christopher Down 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | jobs: 2 | build_and_test: 3 | name: CI 4 | strategy: 5 | matrix: 6 | # Pin to 20.04 for 3.6: https://github.com/actions/setup-python/issues/544 7 | os: [ubuntu-20.04, macos-latest, windows-latest] 8 | python-version: ['3.5', '3.6', '3.7', '3.8', '3.9', '3.10', '3.11'] 9 | runs-on: ${{ matrix.os }} 10 | 11 | steps: 12 | - uses: actions/checkout@v3 13 | 14 | - uses: actions/setup-python@v4 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - run: python --version 18 | 19 | - run: pip install -U pip 20 | - run: pip install -U tox 21 | 22 | - if: matrix.python-version == '3.9' && startsWith(matrix.os, 'ubuntu-') 23 | run: | 24 | echo "TOXENV=doctest,black,pylint,pytype,bandit,coverage" >> "$GITHUB_ENV" 25 | 26 | - run: tox 27 | env: 28 | TOXENV: ${{ env.TOXENV }} 29 | 30 | - if: matrix.python-version == '3.9' && startsWith(matrix.os, 'ubuntu-') 31 | uses: AndreMiras/coveralls-python-action@develop 32 | 33 | on: 34 | push: 35 | pull_request: 36 | workflow_dispatch: 37 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Parse an SRT to Python objects 5 | ------------------------------ 6 | 7 | .. code:: python 8 | 9 | >>> import srt 10 | >>> subtitle_generator = srt.parse('''\ 11 | ... 1 12 | ... 00:31:37,894 --> 00:31:39,928 13 | ... OK, look, I think I have a plan here. 14 | ... 15 | ... 2 16 | ... 00:31:39,931 --> 00:31:41,931 17 | ... Using mainly spoons, 18 | ... 19 | ... 3 20 | ... 00:31:41,933 --> 00:31:43,435 21 | ... we dig a tunnel under the city and release it into the wild. 22 | ... 23 | ... 
''') 24 | >>> subtitles = list(subtitle_generator) 25 | >>> 26 | >>> subtitles[0].start 27 | datetime.timedelta(0, 1897, 894000) 28 | >>> subtitles[1].content 29 | 'Using mainly spoons,' 30 | 31 | Compose an SRT from Python objects 32 | ---------------------------------- 33 | 34 | .. code:: python 35 | 36 | >>> print(srt.compose(subtitles)) 37 | 1 38 | 00:31:37,894 --> 00:31:39,928 39 | OK, look, I think I have a plan here. 40 | 41 | 2 42 | 00:31:39,931 --> 00:31:41,931 43 | Using mainly spoons, 44 | 45 | 3 46 | 00:31:41,933 --> 00:31:43,435 47 | we dig a tunnel under the city and release it into the wild. 48 | 49 | 50 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = python 3 | 4 | [testenv] 5 | deps = 6 | -rtests/requirements.txt 7 | commands = 8 | {basepython} --version 9 | pytest -vv -n auto 10 | allowlist_externals = 11 | {basepython} 12 | pytest 13 | setenv= 14 | release: HYPOTHESIS_PROFILE=release 15 | 16 | [testenv:doctest] 17 | deps = 18 | {[testenv]deps} 19 | commands = 20 | pytest --doctest-modules 21 | 22 | [testenv:coverage] 23 | passenv = 24 | TRAVIS 25 | TRAVIS_JOB_ID 26 | TRAVIS_BRANCH 27 | deps = 28 | {[testenv]deps} 29 | coverage 30 | commands = 31 | coverage erase 32 | pytest -vv --cov=srt --cov-branch --cov-fail-under=100 --cov-report term-missing 33 | 34 | [testenv:pylint] 35 | skipsdist = True 36 | deps = 37 | {[testenv]deps} 38 | pylint 39 | commands = 40 | # C0330: https://github.com/psf/black/issues/1178 41 | # R0913: These are intentional design decisions, so leave them. 42 | # R0205, R1725, C0209: We still support py2. 43 | pylint --disable=C0330,R0913,R0205,R1725,C0209 srt.py 44 | 45 | [testenv:black] 46 | skipsdist = True 47 | allowlist_externals = sh 48 | deps = 49 | black 50 | commands = 51 | black --check . 
def parse_args():
    """Parse the command line for ``srt fixed-timeshift``.

    Extends the shared parser from ``srt_tools.utils.basic_parser`` with a
    required ``--seconds`` option that is converted to ``float``.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    usage_examples = {
        "Make all subtitles 5 seconds later": "srt fixed-timeshift --seconds 5",
        "Make all subtitles 5 seconds earlier": "srt fixed-timeshift --seconds -5",
    }
    parser = srt_tools.utils.basic_parser(
        description=__doc__, examples=usage_examples
    )
    parser.add_argument(
        "--seconds",
        help="how many seconds to shift",
        required=True,
        type=float,
    )
    return parser.parse_args()


def scalar_correct_subs(subtitles, seconds_to_shift):
    """Shift every subtitle by a constant number of seconds.

    Args:
        subtitles: iterable of objects with ``start`` and ``end`` attributes
            that support ``+`` with a ``datetime.timedelta``.
        seconds_to_shift: offset in seconds; a negative value moves the
            subtitles earlier.

    Yields:
        The same subtitle objects, mutated in place, in input order.
    """
    offset = datetime.timedelta(seconds=seconds_to_shift)
    for sub in subtitles:
        # Update start first, then end, exactly one assignment each.
        for attr in ("start", "end"):
            setattr(sub, attr, getattr(sub, attr) + offset)
        yield sub


def main():
    """Entry point: shift ``args.input`` by ``args.seconds`` and write it out."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    # NOTE(review): set_basic_args is an opaque helper from srt_tools.utils;
    # presumably it prepares args.input / args.output -- confirm there.
    srt_tools.utils.set_basic_args(args)
    shifted = scalar_correct_subs(args.input, args.seconds)
    composed = srt_tools.utils.compose_suggest_on_fail(shifted, strict=args.strict)

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):
        # Python 2 fallback: retry with the text encoded as args.encoding.
        args.output.write(composed.encode(args.encoding))
# Executables named with this prefix are treated as srt subcommands.
SRT_BIN_PREFIX = "srt-"


def find_srt_commands_in_path():
    """Yield the subcommand name of every ``srt-*`` entry found on ``PATH``.

    Each directory listed in the ``PATH`` environment variable is scanned with
    ``os.listdir``. Directories that do not exist, or entries that are not
    directories, are skipped; any other ``OSError`` propagates. Names are
    yielded with the ``srt-`` prefix removed and may repeat when the same
    name appears in several directories.
    """
    skippable = (errno.ENOENT, errno.ENOTDIR)
    prefix_len = len(SRT_BIN_PREFIX)

    for directory in os.environ.get("PATH", "").split(os.pathsep):
        try:
            entries = os.listdir(directory)
        except OSError as exc:
            if exc.errno not in skippable:
                raise
            continue

        for entry in entries:
            if entry.startswith(SRT_BIN_PREFIX):
                yield entry[prefix_len:]


def show_help():
    """Print the sorted, de-duplicated list of available subcommands."""
    header = (
        "Available commands "
        "(pass --help to a specific command for usage information):\n"
    )
    print(header)
    for name in sorted(set(find_srt_commands_in_path())):
        print("- {}".format(name))


def main():
    """Dispatch ``srt <command> [args ...]`` to the ``srt-<command>`` program.

    With no arguments, or when the first argument starts with ``-``, the help
    listing is printed and the process exits with status 0. An unknown command
    prints an error plus the help listing and exits with status 1. Otherwise
    the current process is replaced via ``os.execvp``.
    """
    cli_args = sys.argv[1:]
    if not cli_args or cli_args[0].startswith("-"):
        show_help()
        sys.exit(0)

    command = cli_args[0]
    known = any(candidate == command for candidate in find_srt_commands_in_path())

    if not known:
        print('Unknown command: "{}"\n'.format(command))
        show_help()
        sys.exit(1)

    real_command = SRT_BIN_PREFIX + command
    os.execvp(real_command, [real_command] + cli_args[1:])
def print_sub(sub, encoding):
    """Write one subtitle's content, followed by a blank line, to stdout.

    Runs as the callback of the timers created in ``schedule``. The write and
    flush happen while holding the module-level ``output_lock``.

    Args:
        sub: object whose ``content`` attribute is the text to print.
        encoding: encoding used for the Python 2 fallback write.
    """
    log.debug("Timer woke up to print %s", sub.content)

    with output_lock:
        try:
            sys.stdout.write(sub.content + "\n\n")
        except UnicodeEncodeError:
            # Python 2 fallback: retry with explicitly encoded content.
            sys.stdout.write(sub.content.encode(encoding) + "\n\n")
        sys.stdout.flush()


def schedule(subs, encoding):
    """Print each subtitle ``sub.start`` seconds after scheduling begins.

    One daemon ``threading.Timer`` is created per subtitle; all timers are
    created first and started afterwards. The call returns once no timer
    thread is alive any more, polling every half second.

    Args:
        subs: iterable of objects with ``start`` (a timedelta), ``index`` and
            ``content`` attributes.
        encoding: forwarded to ``print_sub``.
    """
    pending = set()
    log.debug("Scheduling subtitles")

    for sub in subs:
        delay = sub.start.total_seconds()
        timer = Timer(delay, print_sub, args=[sub, encoding])
        timer.name = "%s:%s" % (sub.index, delay)
        timer.daemon = True
        log.debug('Adding "%s" to schedule queue', timer.name)
        pending.add(timer)

    for timer in pending:
        log.debug('Starting timer for "%s"', timer.name)
        timer.start()

    # Poll instead of joining so the loop wakes up every 0.5 seconds.
    while True:
        if not any(timer.is_alive() for timer in pending):
            break
        time.sleep(0.5)


def main():
    """Entry point for ``srt play``: parse arguments and schedule the input."""
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples={"Play a subtitle": "srt play -i foo.srt"},
        no_output=True,
    )
    args = parser.parse_args()
    logging.basicConfig(level=args.log_level)
    # NOTE(review): set_basic_args is an opaque helper from srt_tools.utils;
    # presumably it prepares args.input -- confirm there.
    srt_tools.utils.set_basic_args(args)
    schedule(args.input, args.encoding)
def parse_args():
    """Parse the command line for ``srt process``.

    Adds a required ``-f/--func`` option and a repeatable ``-m/--module``
    option (collected into a list, empty by default) to the shared parser
    from ``srt_tools.utils.basic_parser``.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    usage_examples = {
        "Strip HTML-like symbols from a subtitle": """srt process -m re -f 'lambda sub: re.sub("<[^<]+?>", "", sub)'"""
    }
    parser = srt_tools.utils.basic_parser(
        description=__doc__, examples=usage_examples
    )
    parser.add_argument(
        "-f",
        "--func",
        required=True,
        help="a function to use to process lines",
    )
    parser.add_argument(
        "-m",
        "--module",
        action="append",
        default=[],
        help="modules to import in the function context",
    )
    return parser.parse_args()


def main():
    """Entry point: apply ``args.func`` to every subtitle and write the result."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    # NOTE(review): set_basic_args is an opaque helper from srt_tools.utils;
    # presumably it prepares args.input / args.output -- confirm there.
    srt_tools.utils.set_basic_args(args)
    transformed = strip_to_matching_lines_only(args.input, args.module, args.func)
    composed = srt_tools.utils.compose_suggest_on_fail(
        transformed, strict=args.strict
    )

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):
        # Python 2 fallback: retry with the text encoded as args.encoding.
        args.output.write(composed.encode(args.encoding))
def strip_to_matching_lines_only(subtitles, imports, func_str, invert, per_sub):
    """Blank out subtitle text that a user-supplied predicate rejects.

    Args:
        subtitles: iterable of objects with a string ``content`` attribute.
        imports: module names to import and expose as module globals so the
            predicate expression can refer to them.
        func_str: Python expression evaluating to a one-argument callable.
            It is passed to ``eval`` and therefore runs arbitrary code; it
            comes straight from the command line by design.
        invert: when true, keep only text for which the predicate is falsy.
        per_sub: when true, test each subtitle's whole content and set it to
            ``""`` on rejection; otherwise test line by line and keep only
            the accepted lines, re-joined with ``"\\n"``.

    Yields:
        Every input subtitle, mutated in place, in input order.
    """
    for import_name in imports:
        real_import = importlib.import_module(import_name)
        globals()[import_name] = real_import

    raw_func = eval(func_str)  # pylint: disable=eval-used

    keep = (lambda text: not raw_func(text)) if invert else raw_func

    for subtitle in subtitles:
        if not per_sub:
            kept_lines = [
                line for line in subtitle.content.splitlines() if keep(line)
            ]
            subtitle.content = "\n".join(kept_lines)
        elif not keep(subtitle.content):
            subtitle.content = ""

        yield subtitle


def parse_args():
    """Parse the command line for ``srt lines-matching``.

    Adds ``-f/--func`` (required), repeatable ``-m/--module``, and the
    ``-s/--per-subtitle`` and ``-v/--invert`` flags to the shared parser from
    ``srt_tools.utils.basic_parser``.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    usage_examples = {
        "Only include Chinese lines": "srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese",
        "Exclude all lines which only contain numbers": "srt lines-matching -v -f 'lambda x: x.isdigit()'",
    }
    parser = srt_tools.utils.basic_parser(
        description=__doc__, examples=usage_examples
    )
    parser.add_argument(
        "-f",
        "--func",
        required=True,
        help="a function to use to match lines",
    )
    parser.add_argument(
        "-m",
        "--module",
        action="append",
        default=[],
        help="modules to import in the function context",
    )
    parser.add_argument(
        "-s",
        "--per-subtitle",
        action="store_true",
        help="match the content of each subtitle, not each line",
    )
    parser.add_argument(
        "-v",
        "--invert",
        action="store_true",
        help="invert matching -- only match lines returning False",
    )
    return parser.parse_args()


def main():
    """Entry point: filter ``args.input`` with the predicate and write it out."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    # NOTE(review): set_basic_args is an opaque helper from srt_tools.utils;
    # presumably it prepares args.input / args.output -- confirm there.
    srt_tools.utils.set_basic_args(args)
    filtered = strip_to_matching_lines_only(
        args.input, args.module, args.func, args.invert, args.per_subtitle
    )
    composed = srt_tools.utils.compose_suggest_on_fail(filtered, strict=args.strict)

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):
        # Python 2 fallback: retry with the text encoded as args.encoding.
        args.output.write(composed.encode(args.encoding))
utilities written to process SRT files. All utilities use 2 | the Python srt_ library internally. 3 | 4 | .. _srt: https://github.com/cdown/srt 5 | 6 | Usage 7 | ----- 8 | 9 | You can call ``srt`` directly to see a list of all available utilities. 10 | 11 | .. code:: 12 | 13 | srt [utility-name] [args ...] 14 | 15 | Arbitrary things can be done with *srt process* and *srt lines-matching*, for 16 | example: 17 | 18 | .. code:: 19 | 20 | # Strip HTML 21 | srt process -m re -f 'lambda sub: re.sub("<[^<]+?>", "", sub)' 22 | 23 | # Only keep Chinese subtitles 24 | srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese 25 | 26 | Utilities 27 | --------- 28 | 29 | - *deduplicate* removes subtitles with duplicate content. If you have subtitles 30 | which mistakenly repeat the same content in different subs at roughly the 31 | same time, you can run this tool to remove them. 32 | - *fixed-timeshift* does fixed time correction. For example, if you have a 33 | movie that is consistently out of sync by two seconds, you can run this tool 34 | to shift the entire subtitle two seconds ahead or behind. 35 | - *linear-timeshift* does linear time correction. If you have a movie that 36 | runs slower or faster than the subtitle that you have, it will repeatedly 37 | lose sync. This tool can apply linear time corrections to all subtitles in 38 | the SRT, resyncing it with the video. 39 | - *lines-matching* takes a function and removes lines that don't return true 40 | when passed to it. For example, you can keep only lines that contain Chinese 41 | by installing the hanzidentifier_ package, and running 42 | ``srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese < input``. 43 | - *mux* can mux_ multiple subtitles together into one. For example, if you 44 | have a Chinese subtitle and an English subtitle, and you want to have one 45 | subtitle file that contains both, this tool can do that for you. 
It also 46 | supports clamping subtitles starting or ending at similar times to the same 47 | time to avoid subtitles jumping around the screen. 48 | - *normalise* standardises and cleans up SRT files. For example, it removes 49 | spurious newlines, normalises timestamps, and fixes subtitle indexing to a 50 | format that all media players should accept, with no noncompliant data. 51 | - *play* plays subtitles in the terminal at the time they are scheduled to 52 | display (note: it does not clear them from the screen afterwards). If you 53 | need to fast-forward to some point, you can combine it with 54 | *fixed-timeshift*. 55 | - *process* allows processing text freely. It takes a function, similarly to 56 | *lines-matching*, and changes SRT content into the return value. For example, 57 | you can naively strip some basic HTML-like markup with 58 | ``srt process -m re -f 'lambda sub: re.sub("<[^<]+?>", "", sub)'``. HTML-like 59 | syntax is especially prevalent in `SSA/ASS`_ subtitles that have been 60 | directly converted to SRT. 61 | 62 | .. _mux: https://en.wikipedia.org/wiki/Multiplexing 63 | .. _`SSA/ASS`: https://en.wikipedia.org/wiki/SubStation_Alpha 64 | ..
def parse_args():
    """Parse the command line for ``srt deduplicate``.

    Adds ``-t/--ms`` to the shared parser from
    ``srt_tools.utils.basic_parser``. The value is converted to a
    ``datetime.timedelta`` and defaults to 5000 milliseconds.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    examples = {
        "Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt",
        "Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt",
        "Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt",
    }
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples=examples,
    )
    parser.add_argument(
        "-t",
        "--ms",
        metavar="MILLISECONDS",
        default=datetime.timedelta(milliseconds=5000),
        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
        help="how many milliseconds distance a subtitle start time must be "
        "within of another to be considered a duplicate "
        "(default: 5000ms)",
    )

    return parser.parse_args()


def deduplicate_subs(orig_subs, acceptable_diff):
    """Remove subtitles with duplicated content, in place.

    Subtitles are ordered by ``(content, start)`` and each one is compared
    with its predecessor in that order. When both have equal content and the
    later one starts no more than ``acceptable_diff`` after the earlier one,
    the later one is removed from ``orig_subs``.

    Args:
        orig_subs: mutable list of objects with ``content``, ``start`` and
            ``index`` attributes; matching entries are deleted from it.
        acceptable_diff: maximum start-time distance, added to ``start`` for
            the comparison. A falsy value (such as a zero timedelta) disables
            the time check, so equal content alone marks a duplicate.

    Returns:
        None. ``orig_subs`` is modified directly.
    """
    indices_to_remove = []

    # If we only store the subtitle itself and compare that, it's possible that
    # we'll not only remove the duplicate, but also the _original_ subtitle if
    # they have the same sub index/times/etc.
    #
    # As such, we need to also store the index in the original subs list that
    # this entry belongs to for each subtitle prior to sorting.
    sorted_subs = sorted(
        enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)
    )

    # NOTE(review): sliding_window is defined in srt_tools.utils (not visible
    # here); with width=2 it presumably yields adjacent pairs -- confirm.
    for subs in srt_tools.utils.sliding_window(sorted_subs, width=2, inclusive=False):
        cur_idx, cur_sub = subs[0]
        next_idx, next_sub = subs[1]

        if cur_sub.content == next_sub.content and (
            not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start
        ):
            log.debug(
                "Marking l%d/s%d for removal, duplicate of l%d/s%d",
                next_idx,
                next_sub.index,
                cur_idx,
                cur_sub.index,
            )
            indices_to_remove.append(next_idx)

    # The indices were collected in content-sorted order, not list order, so
    # compensating with a running offset would delete the wrong entries.
    # Deleting from the highest position downwards means no deletion can
    # shift the position of an entry that still has to be removed.
    for idx in sorted(set(indices_to_remove), reverse=True):
        del orig_subs[idx]


def main():
    """Entry point: deduplicate ``args.input`` and write the composed result."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)

    # NOTE(review): set_basic_args is an opaque helper from srt_tools.utils;
    # presumably it prepares args.input / args.output -- confirm there.
    srt_tools.utils.set_basic_args(args)

    # Materialise the input because deduplicate_subs deletes by list index.
    subs = list(args.input)
    deduplicate_subs(subs, args.ms)

    output = srt_tools.utils.compose_suggest_on_fail(subs, strict=args.strict)

    try:
        args.output.write(output)
    except (UnicodeEncodeError, TypeError):
        # Python 2 fallback: retry with the text encoded as args.encoding.
        args.output.write(output.encode(args.encoding))
ValueError: 24 | parser.error("not a valid SRT timestamp: %s" % arg) 25 | else: 26 | return timedelta_to_milliseconds(delta) 27 | 28 | examples = { 29 | "Stretch out a subtitle so that second 1 is 1, 2 is 3, 3 is 5, etc": "srt linear-timeshift --f1 00:00:01,000 --t1 00:00:01,000 --f2 00:00:02,000 --t2 00:00:03,000" 30 | } 31 | 32 | parser = srt_tools.utils.basic_parser(description=__doc__, examples=examples) 33 | parser.add_argument( 34 | "--from-start", 35 | "--f1", 36 | type=lambda arg: srt_timestamp_to_milliseconds(parser, arg), 37 | required=True, 38 | help="the first desynchronised timestamp", 39 | ) 40 | parser.add_argument( 41 | "--to-start", 42 | "--t1", 43 | type=lambda arg: srt_timestamp_to_milliseconds(parser, arg), 44 | required=True, 45 | help="the first synchronised timestamp", 46 | ) 47 | parser.add_argument( 48 | "--from-end", 49 | "--f2", 50 | type=lambda arg: srt_timestamp_to_milliseconds(parser, arg), 51 | required=True, 52 | help="the second desynchronised timestamp", 53 | ) 54 | parser.add_argument( 55 | "--to-end", 56 | "--t2", 57 | type=lambda arg: srt_timestamp_to_milliseconds(parser, arg), 58 | required=True, 59 | help="the second synchronised timestamp", 60 | ) 61 | return parser.parse_args() 62 | 63 | 64 | def calc_correction(to_start, to_end, from_start, from_end): 65 | angular = (to_end - to_start) / (from_end - from_start) 66 | linear = to_end - angular * from_end 67 | return angular, linear 68 | 69 | 70 | def correct_time(current_msecs, angular, linear): 71 | return round(current_msecs * angular + linear) 72 | 73 | 74 | def correct_timedelta(bad_delta, angular, linear): 75 | bad_msecs = timedelta_to_milliseconds(bad_delta) 76 | good_msecs = correct_time(bad_msecs, angular, linear) 77 | good_delta = datetime.timedelta(milliseconds=good_msecs) 78 | return good_delta 79 | 80 | 81 | def linear_correct_subs(subtitles, angular, linear): 82 | for subtitle in subtitles: 83 | subtitle.start = correct_timedelta(subtitle.start, angular, linear) 
def parse_args():
    """
    Build the ``srt mux`` command line parser and parse the arguments.

    The common options come from ``srt_tools.utils.basic_parser`` in
    multi-input mode; the mux-specific options controlling time matching and
    top/bottom placement are registered on top of it.

    :returns: The namespace produced by ``parse_args()`` on that parser
    """
    mux_parser = srt_tools.utils.basic_parser(
        description=__doc__,
        multi_input=True,
        examples={
            "Merge English and Chinese subtitles": "srt mux -i eng.srt -i chs.srt -o both.srt",
            "Merge subtitles, with one on top and one at the bottom": "srt mux -t -i eng.srt -i chs.srt -o both.srt",
        },
    )

    # (flags, keyword options) pairs, registered in this order so that the
    # generated --help output lists them in the same sequence.
    option_specs = (
        (
            ("--ms",),
            dict(
                metavar="MILLISECONDS",
                default=datetime.timedelta(milliseconds=600),
                type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
                help="if subs being muxed are within this number of milliseconds "
                "of each other, they will have their times matched (default: 600)",
            ),
        ),
        (
            ("-w", "--width"),
            dict(
                default=5,
                type=int,
                help="how many subs to consider for time matching at once (default: %(default)s)",
            ),
        ),
        (
            ("-t", "--top-and-bottom"),
            dict(
                action="store_true",
                help="use SSA-style tags to place files at the top and bottom, respectively. Turns off time matching",
            ),
        ),
        (
            ("--no-time-matching",),
            dict(
                action="store_true",
                help="don't try to do time matching for close subtitles (see --ms)",
            ),
        ),
    )
    for flags, options in option_specs:
        mux_parser.add_argument(*flags, **options)

    return mux_parser.parse_args()
def main():
    """
    Entry point for ``srt mux``: merge every input into a single SRT output.

    Subtitles from all inputs are collected into one list. With
    ``--top-and-bottom``, the content of each subtitle is prefixed with an
    SSA-style position tag, alternating between inputs. Unless time matching
    was disabled, nearby start and end times are then aligned with
    ``merge_subs`` before the result is composed and written to the output.
    """
    args = parse_args()
    logging.basicConfig(level=args.log_level)

    srt_tools.utils.set_basic_args(args)

    muxed_subs = []
    for idx, subs in enumerate(args.input):
        # Even-numbered inputs go to the top, odd-numbered ones to the bottom.
        position_tag = TOP if idx % 2 == 0 else BOTTOM
        for sub in subs:
            if args.top_and_bottom:
                sub.content = position_tag + sub.content
            muxed_subs.append(sub)

    # Both --no-time-matching and --top-and-bottom are documented as turning
    # time matching off, so only match when neither was requested.
    if not (args.no_time_matching or args.top_and_bottom):
        merge_subs(muxed_subs, args.ms, "start", args.width)
        merge_subs(muxed_subs, args.ms, "end", args.width)

    output = srt_tools.utils.compose_suggest_on_fail(muxed_subs, strict=args.strict)

    try:
        args.output.write(output)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(output.encode(args.encoding))
def assert_supports_all_io_methods(cmd, exclude_output=False, exclude_stdin=False):
    """
    Run one srt tool through every supported input/output combination.

    The tool is run with ``-i FILE`` and, unless excluded, with stdin
    redirection; all captured stdout results must be identical. Unless
    output is excluded, the ``-o FILE`` and ``> FILE`` variants are also run,
    including with the gb2312 encoding, to check that they complete
    successfully.

    :param cmd: The tool name followed by its arguments, e.g.
                ``["srt-fixed-timeshift", "--seconds", "5"]``. The list is
                not modified.
    :param bool exclude_output: Skip the variants that write to a file
    :param bool exclude_stdin: Skip the variants that read from stdin
    :raises AssertionError: If the outputs differ, or if the line endings do
                            not match the platform
    """
    # Build a new list rather than editing the caller's one in place, so a
    # reused argument list is not prefixed twice.
    cmd = [sys.executable, "srt_tools/" + cmd[0]] + list(cmd[1:])
    in_file = os.path.join(sample_dir, "ascii.srt")
    in_file_gb = os.path.join(sample_dir, "gb2312.srt")
    fd, out_file = tempfile.mkstemp()

    # This is accessed by filename, not fd
    os.close(fd)

    outputs = []
    cmd_string = " ".join(quote(x) for x in cmd)

    try:
        outputs.append(run_srt_util(cmd + ["-i", in_file]))
        if not exclude_stdin:
            outputs.append(
                run_srt_util("%s < %s" % (cmd_string, quote(in_file)), shell=True)
            )
        if not exclude_output:
            run_srt_util(cmd + ["-i", in_file, "-o", out_file])
            run_srt_util(
                cmd + ["-i", in_file_gb, "-o", out_file, "-e", "gb2312"],
                encoding="gb2312",
            )
            if not exclude_stdin:
                run_srt_util(
                    "%s < %s > %s" % (cmd_string, quote(in_file), quote(out_file)),
                    shell=True,
                )
                run_srt_util(
                    "%s < %s > %s"
                    % (cmd_string + " -e gb2312", quote(in_file), quote(out_file)),
                    shell=True,
                    encoding="gb2312",
                )
        assert len(set(outputs)) == 1, repr(outputs)

        # Output is composed with the platform's line separator
        if os.name == "nt":
            assert "\r\n" in outputs[0]
        else:
            assert "\r\n" not in outputs[0]
    finally:
        os.remove(out_file)
103 | "--t2", 104 | "00:00:04,000", 105 | ], 106 | False, 107 | ), 108 | (["srt-lines-matching", "-f", "lambda x: True"], False), 109 | (["srt-process", "-f", "lambda x: x"], False), 110 | (["srt-mux"], False, True), 111 | (["srt-mux", "-t"], False, True), 112 | # Need to sort out time/thread issues 113 | # (('srt-play'), True), 114 | ] 115 | 116 | for args in matrix: 117 | assert_supports_all_io_methods(*args) 118 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | |ghactions| |coveralls| 2 | 3 | .. |ghactions| image:: https://img.shields.io/github/actions/workflow/status/cdown/srt/ci.yml?branch=develop 4 | :target: https://github.com/cdown/srt/actions?query=branch%3Adevelop 5 | :alt: Tests 6 | 7 | .. |coveralls| image:: https://img.shields.io/coveralls/cdown/srt/develop.svg?label=test%20coverage 8 | :target: https://coveralls.io/github/cdown/srt?branch=develop 9 | :alt: Coverage 10 | 11 | srt is a tiny but featureful Python library for parsing, modifying, and 12 | composing `SRT files`_. Take a look at the quickstart_ for a basic overview of 13 | the library. `Detailed API documentation`_ is also available. 14 | 15 | Want to see some examples of its use? Take a look at the `tools shipped with 16 | the library`_. This library is also used internally by projects like 17 | `subsync`_, `NVIDIA RAD-TTS`_, `manim`_, `kinobot`_, `bw_plex`_, and many more. 18 | 19 | .. _subsync: https://github.com/smacke/subsync 20 | .. _`NVIDIA RAD-TTS`: https://github.com/NVIDIA/radtts 21 | .. _bw_plex: https://github.com/Hellowlol/bw_plex 22 | .. _manim: https://github.com/ManimCommunity/manim 23 | .. _kinobot: https://github.com/vitiko98/kinobot 24 | 25 | Why choose this library? 
26 | ------------------------ 27 | 28 | - Can parse many broken SRT files which other SRT libraries cannot, and fix them 29 | - Extremely lightweight, ~200 lines of code excluding docstrings 30 | - Simple, intuitive API 31 | - High quality test suite using Hypothesis_ 32 | - `100% test coverage`_ (including branches) 33 | - `Well documented API`_, at both a high and low level 34 | - `~30% faster than pysrt on typical workloads`_ 35 | - Full support for `PyPy`_ 36 | - No dependencies outside of the standard library 37 | - Tolerant of many common errors found in real-world SRT files 38 | - Support for Asian-style SRT formats (ie. "fullwidth" SRT format) 39 | - Completely Unicode compliant 40 | - Released under a highly permissive license (MIT) 41 | - Real world tested — used in production to process thousands of SRT files 42 | every day 43 | - Portable — runs on Linux, OSX, and Windows 44 | - Tools included — contains lightweight tools to perform generic tasks with the 45 | library 46 | 47 | .. _quickstart: http://srt.readthedocs.org/en/latest/quickstart.html 48 | .. _`SRT files`: https://en.wikipedia.org/wiki/SubRip#SubRip_text_file_format 49 | .. _Hypothesis: https://github.com/DRMacIver/hypothesis 50 | .. _`100% test coverage`: https://coveralls.io/github/cdown/srt?branch=develop 51 | .. _`Well documented API`: http://srt.readthedocs.org/en/latest/index.html 52 | .. _PyPy: http://pypy.org/ 53 | .. _`~30% faster than pysrt on typical workloads`: https://paste.pound-python.org/raw/8nQKbDW0ROWvS7bOeAb3/ 54 | 55 | Usage 56 | ----- 57 | 58 | Tools 59 | ===== 60 | 61 | There are a number of `tools shipped with the library`_ to manipulate, process, 62 | and fix SRT files. Here's an example using `hanzidentifier`_ to strip out 63 | non-Chinese lines: 64 | 65 | .. code:: 66 | 67 | $ cat pe.srt 68 | 1 69 | 00:00:33,843 --> 00:00:38,097 70 | Only 3% of the water on our planet is fresh. 
地球上只有3%的水是淡水 72 | 73 | 2 74 | 00:00:40,641 --> 00:00:44,687 75 | Yet, these precious waters are rich with surprise. 76 | 可是这些珍贵的淡水中却充满了惊奇 77 | 78 | $ srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese -i pe.srt 79 | 1 80 | 00:00:33,843 --> 00:00:38,097 81 | 地球上只有3%的水是淡水 82 | 83 | 2 84 | 00:00:40,641 --> 00:00:44,687 85 | 可是这些珍贵的淡水中却充满了惊奇 86 | 87 | 88 | These tools are easy to chain together, for example, say you have one subtitle 89 | with Chinese and English, and another with French, but you want Chinese and 90 | French only. Oh, and the Chinese one is 5 seconds later than it should be. 91 | That's easy enough to sort out: 92 | 93 | .. code:: 94 | 95 | $ srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese -i chs+eng.srt | 96 | > srt fixed-timeshift --seconds -5 | 97 | > srt mux --input - --input fra.srt 98 | 99 | See the srt_tools/ directory for more information. 100 | 101 | .. _hanzidentifier: https://github.com/tsroten/hanzidentifier 102 | 103 | Library 104 | ======= 105 | 106 | `Detailed API documentation`_ is available, but here are the basics. 107 | 108 | Here's how you convert SRT input to Subtitle objects which you can manipulate: 109 | 110 | .. code:: python 111 | 112 | >>> data = '''\ 113 | 1 114 | 00:00:33,843 --> 00:00:38,097 115 | 地球上只有3%的水是淡水 116 | 117 | 2 118 | 00:00:40,641 --> 00:00:44,687 119 | 可是这些珍贵的淡水中却充满了惊奇 120 | 121 | 3 122 | 00:00:57,908 --> 00:01:03,414 123 | 所有陆地生命归根结底都依赖於淡水 124 | 125 | ''' 126 | >>> for sub in srt.parse(data): 127 | ...
print(sub) 128 | Subtitle(index=1, start=datetime.timedelta(seconds=33, microseconds=843000), end=datetime.timedelta(seconds=38, microseconds=97000), content='地球上只有3%的水是淡水', proprietary='') 129 | Subtitle(index=2, start=datetime.timedelta(seconds=40, microseconds=641000), end=datetime.timedelta(seconds=44, microseconds=687000), content='可是这些珍贵的淡水中却充满了惊奇', proprietary='') 130 | Subtitle(index=3, start=datetime.timedelta(seconds=57, microseconds=908000), end=datetime.timedelta(seconds=63, microseconds=414000), content='所有陆地生命归根结底都依赖於淡水', proprietary='') 131 | 132 | And here's how you go back from Subtitle objects to SRT output: 133 | 134 | .. code:: python 135 | 136 | >>> subs = list(srt.parse(data)) 137 | >>> subs[1].content = "Changing subtitle data is easy!" 138 | >>> print(srt.compose(subs)) 139 | 1 140 | 00:00:33,843 --> 00:00:38,097 141 | 地球上只有3%的水是淡水 142 | 143 | 2 144 | 00:00:40,641 --> 00:00:44,687 145 | Changing subtitle data is easy! 146 | 147 | 3 148 | 00:00:57,908 --> 00:01:03,414 149 | 所有陆地生命归根结底都依赖於淡水 150 | 151 | Installation 152 | ------------ 153 | 154 | To install the latest stable version from PyPi: 155 | 156 | .. code:: 157 | 158 | pip install -U srt 159 | 160 | To install the latest development version directly from GitHub: 161 | 162 | .. code:: 163 | 164 | pip install -U git+https://github.com/cdown/srt.git@develop 165 | 166 | Testing 167 | ------- 168 | 169 | .. code:: 170 | 171 | tox 172 | 173 | .. _Tox: https://tox.readthedocs.org 174 | .. _`Detailed API documentation`: http://srt.readthedocs.org/en/latest/api.html 175 | .. 
def dash_to_stream(arg, arg_type):
    """
    Translate the conventional "-" argument into the matching byte stream.

    :param arg: The raw command line value for the option
    :param str arg_type: The key to look up in ``DASH_STREAM_MAP``
                         ("input" or "output")
    :returns: The mapped stream if ``arg`` is "-", otherwise ``arg`` itself,
              unchanged
    """
    return DASH_STREAM_MAP[arg_type] if arg == "-" else arg
| if multi_input: 72 | parser.add_argument( 73 | "--input", 74 | "-i", 75 | metavar="FILE", 76 | action="append", 77 | type=lambda arg: dash_to_stream(arg, "input"), 78 | help="the files to process", 79 | required=True, 80 | ) 81 | else: 82 | parser.add_argument( 83 | "--input", 84 | "-i", 85 | metavar="FILE", 86 | default=STDIN_BYTESTREAM, 87 | type=lambda arg: dash_to_stream(arg, "input"), 88 | help="the file to process (default: stdin)", 89 | ) 90 | 91 | if not no_output: 92 | parser.add_argument( 93 | "--output", 94 | "-o", 95 | metavar="FILE", 96 | default=STDOUT_BYTESTREAM, 97 | type=lambda arg: dash_to_stream(arg, "output"), 98 | help="the file to write to (default: stdout)", 99 | ) 100 | if not multi_input: 101 | parser.add_argument( 102 | "--inplace", 103 | "-p", 104 | action="store_true", 105 | help="modify file in place", 106 | ) 107 | 108 | shelp = "allow blank lines in output, your media player may explode" 109 | if hide_no_strict: 110 | shelp = argparse.SUPPRESS 111 | 112 | parser.add_argument("--no-strict", action="store_false", dest="strict", help=shelp) 113 | parser.add_argument( 114 | "--debug", 115 | action="store_const", 116 | dest="log_level", 117 | const=logging.DEBUG, 118 | default=logging.INFO, 119 | help="enable debug logging", 120 | ) 121 | 122 | parser.add_argument( 123 | "--ignore-parsing-errors", 124 | "-c", 125 | action="store_true", 126 | help="try to keep going, even if there are parsing errors", 127 | ) 128 | 129 | parser.add_argument( 130 | "--encoding", "-e", help="the encoding to read/write files in (default: utf8)" 131 | ) 132 | return parser 133 | 134 | 135 | def set_basic_args(args): 136 | # TODO: dedupe some of this 137 | if getattr(args, "inplace", None): 138 | if args.input == DASH_STREAM_MAP["input"]: 139 | raise ValueError("Cannot use --inplace on stdin") 140 | 141 | if args.output != DASH_STREAM_MAP["output"]: 142 | raise ValueError("Cannot use -o and -p together") 143 | 144 | args.output = args.input 145 | 146 | for 
stream_name in ("input", "output"): 147 | log.debug('Processing stream "%s"', stream_name) 148 | 149 | try: 150 | stream = getattr(args, stream_name) 151 | except AttributeError: 152 | # For example, in the case of no_output 153 | continue 154 | 155 | # We don't use system default encoding, because usually one runs this 156 | # on files they got from elsewhere. As such, be opinionated that these 157 | # files are probably UTF-8. Looking for the BOM on reading allows us to 158 | # be more liberal with what we accept, without adding BOMs on write. 159 | read_encoding = args.encoding or "utf-8-sig" 160 | write_encoding = args.encoding or "utf-8" 161 | 162 | r_enc = codecs.getreader(read_encoding) 163 | w_enc = codecs.getwriter(write_encoding) 164 | 165 | log.debug("Got %r as stream", stream) 166 | # We don't use encoding= option to open because we want to have the 167 | # same universal newlines behaviour as STD{IN,OUT}_BYTESTREAM 168 | if stream in DASH_STREAM_MAP.values(): 169 | log.debug("%s in DASH_STREAM_MAP", stream_name) 170 | if stream is args.input: 171 | args.input = srt.parse( 172 | r_enc(args.input).read(), ignore_errors=args.ignore_parsing_errors 173 | ) 174 | elif stream is args.output: 175 | # Since args.output is not in text mode (since we didn't 176 | # earlier know the encoding), we have no universal newline 177 | # support and need to do it ourselves 178 | args.output = w_enc(args.output) 179 | else: 180 | log.debug("%s not in DASH_STREAM_MAP", stream_name) 181 | if stream is args.input: 182 | if isinstance(args.input, MutableSequence): 183 | for i, input_fn in enumerate(args.input): 184 | if input_fn in DASH_STREAM_MAP.values(): 185 | if stream is args.input: 186 | args.input[i] = srt.parse( 187 | r_enc(input_fn).read(), 188 | ignore_errors=args.ignore_parsing_errors, 189 | ) 190 | else: 191 | f = r_enc(open(input_fn, "rb")) 192 | with f: 193 | args.input[i] = srt.parse( 194 | f.read(), ignore_errors=args.ignore_parsing_errors 195 | ) 196 | else: 
def sliding_window(seq, width=2, inclusive=True):
    """
    Yield successive overlapping windows over ``seq`` as tuples.

    The first window holds up to ``width`` leading elements; each following
    window drops the oldest element and appends the next one.

    If inclusive is True, we also include final elements where len(sliced) <
    width: once the input is exhausted, the last window keeps shrinking from
    the left until a single element remains.

    An empty ``seq`` produces no windows at all, so consumers can safely
    index into every window they receive.

    :param seq: Any iterable
    :param int width: The maximum number of elements in a window
    :param bool inclusive: Whether to also yield windows shorter than
                           ``width``
    :returns: The windows, in order
    :rtype: :term:`generator` of tuples
    """
    seq_iter = iter(seq)

    # Consume seq_iter up to width
    sliced = tuple(itertools.islice(seq_iter, width))

    # Nothing to slide over. Yielding an empty tuple here would make callers
    # that look at the first element of each window fail.
    if not sliced:
        return

    if not inclusive and len(sliced) != width:
        return

    yield sliced

    for elem in seq_iter:
        sliced = sliced[1:] + (elem,)
        yield sliced

    if inclusive:
        # Shrinking tail: skip offset 0, which was already yielded above
        for idx in range(1, len(sliced)):
            yield sliced[idx:]
19 | RGX_TIMESTAMP_MAGNITUDE_DELIM = r"[,.:,.。:]" 20 | RGX_TIMESTAMP_FIELD = r"[0-9]+" 21 | RGX_TIMESTAMP_FIELD_OPTIONAL = r"[0-9]*" 22 | RGX_TIMESTAMP = "".join( 23 | [ 24 | RGX_TIMESTAMP_MAGNITUDE_DELIM.join([RGX_TIMESTAMP_FIELD] * 3), 25 | RGX_TIMESTAMP_MAGNITUDE_DELIM, 26 | "?", 27 | RGX_TIMESTAMP_FIELD_OPTIONAL, 28 | ] 29 | ) 30 | RGX_TIMESTAMP_PARSEABLE = r"^{}$".format( 31 | "".join( 32 | [ 33 | RGX_TIMESTAMP_MAGNITUDE_DELIM.join(["(" + RGX_TIMESTAMP_FIELD + ")"] * 3), 34 | RGX_TIMESTAMP_MAGNITUDE_DELIM, 35 | "?", 36 | "(", 37 | RGX_TIMESTAMP_FIELD_OPTIONAL, 38 | ")", 39 | ] 40 | ) 41 | ) 42 | RGX_INDEX = r"-?[0-9]+\.?[0-9]*" 43 | RGX_PROPRIETARY = r"[^\r\n]*" 44 | RGX_CONTENT = r".*?" 45 | RGX_POSSIBLE_CRLF = r"\r?\n" 46 | 47 | TS_REGEX = re.compile(RGX_TIMESTAMP_PARSEABLE) 48 | MULTI_WS_REGEX = re.compile(r"\n\n+") 49 | SRT_REGEX = re.compile( 50 | r"\s*(?:({idx})\s*{eof})?({ts}) *-[ -] *> *({ts}) ?({proprietary})(?:{eof}|\Z)({content})" 51 | # Many sub editors don't add a blank line to the end, and many editors and 52 | # players accept that. We allow it to be missing in input. 53 | # 54 | # We also allow subs that are missing a double blank newline. This often 55 | # happens on subs which were first created as a mixed language subtitle, 56 | # for example chs/eng, and then were stripped using naive methods (such as 57 | # ed/sed) that don't understand newline preservation rules in SRT files. 58 | # 59 | # This means that when you are, say, only keeping chs, and the line only 60 | # contains english, you end up with not only no content, but also all of 61 | # the content lines are stripped instead of retaining a newline. 62 | r"(?:{eof}|\Z)(?:{eof}|\Z|(?=(?:{idx}\s*{eof}{ts})))" 63 | # Some SRT blocks, while this is technically invalid, have blank lines 64 | # inside the subtitle content. We look ahead a little to check that the 65 | # next lines look like an index and a timestamp as a best-effort 66 | # solution to work around these. 
@functools.total_ordering
class Subtitle(object):
    r"""
    The metadata relating to a single subtitle. Subtitles are sorted by start
    time by default, then by end time, then by index (a missing index sorts
    as 0). If no index was provided, index 0 will be used on writing an SRT
    block.

    :param index: The SRT index for this subtitle
    :type index: int or None
    :param start: The time that the subtitle should start being shown
    :type start: :py:class:`datetime.timedelta`
    :param end: The time that the subtitle should stop being shown
    :type end: :py:class:`datetime.timedelta`
    :param str proprietary: Proprietary metadata for this subtitle
    :param str content: The subtitle content. Should not contain OS-specific
                        line separators, only \\n. This is taken care of
                        already if you use :py:func:`srt.parse` to generate
                        Subtitle objects.
    """

    # pylint: disable=R0913
    def __init__(self, index, start, end, content, proprietary=""):
        self.index = index
        self.start = start
        self.end = end
        self.content = content
        self.proprietary = proprietary

    def __hash__(self):
        # Hash over every attribute, consistent with __eq__ below
        return hash(frozenset(vars(self).items()))

    def __eq__(self, other):
        return isinstance(other, Subtitle) and vars(self) == vars(other)

    def __lt__(self, other):
        # A missing index is ordered as 0, the same value to_srt() writes
        # out. Comparing None with an int directly raises TypeError on
        # Python 3, which would break sorting subtitles with equal times.
        return (self.start, self.end, self.index or 0) < (
            other.start,
            other.end,
            other.index or 0,
        )

    def __repr__(self):
        # Python 2/3 cross compatibility
        var_items = getattr(vars(self), "iteritems", getattr(vars(self), "items"))
        item_list = ", ".join("%s=%r" % (k, v) for k, v in var_items())
        return "%s(%s)" % (type(self).__name__, item_list)

    def to_srt(self, strict=True, eol="\n"):
        r"""
        Convert the current :py:class:`Subtitle` to an SRT block.

        :param bool strict: If disabled, will allow blank lines in the content
                            of the SRT block, which is a violation of the SRT
                            standard and may cause your media player to explode
        :param str eol: The end of line string to use (default "\\n")
        :returns: The metadata of the current :py:class:`Subtitle` object as an
                  SRT formatted subtitle block
        :rtype: str
        """
        output_content = self.content
        output_proprietary = self.proprietary

        if output_proprietary:
            # output_proprietary is output directly next to the timestamp, so
            # we need to add the space as a field delimiter.
            output_proprietary = " " + output_proprietary

        if strict:
            output_content = make_legal_content(output_content)

        if eol is None:
            eol = "\n"
        elif eol != "\n":
            # Content is stored with "\n" only, so convert it on the way out
            output_content = output_content.replace("\n", eol)

        template = "{idx}{eol}{start} --> {end}{prop}{eol}{content}{eol}{eol}"
        return template.format(
            idx=self.index or 0,
            start=timedelta_to_srt_timestamp(self.start),
            end=timedelta_to_srt_timestamp(self.end),
            prop=output_proprietary,
            content=output_content,
            eol=eol,
        )
def sort_and_reindex(subtitles, start_index=1, in_place=False, skip=True):
    """
    Sort subtitles and renumber them consecutively in that sorted order, so
    that the resulting SRT file plays as expected even after some subtitle
    times have been changed.

    With skip=True, subtitles that are considered not useful are dropped and
    do not consume an index. A subtitle is "not useful" when:

    - Content is empty, or only whitespace
    - The start time is negative
    - The start time is equal to or later than the end time

    .. doctest::

        >>> from datetime import timedelta
        >>> one = timedelta(seconds=1)
        >>> two = timedelta(seconds=2)
        >>> three = timedelta(seconds=3)
        >>> subs = [
        ...     Subtitle(index=999, start=one, end=two, content='1'),
        ...     Subtitle(index=0, start=two, end=three, content='2'),
        ... ]
        >>> list(sort_and_reindex(subs))  # doctest: +ELLIPSIS
        [Subtitle(...index=1...), Subtitle(...index=2...)]

    :param subtitles: :py:class:`Subtitle` objects in any order
    :param int start_index: The index to start from
    :param bool in_place: Whether to modify subs in-place for performance
                          (version <=1.0.0 behaviour)
    :param bool skip: Whether to skip subtitles considered not useful (see
                      above for rules)
    :returns: The sorted subtitles
    :rtype: :term:`generator` of :py:class:`Subtitle` objects
    """
    dropped = 0

    for position, entry in enumerate(sorted(subtitles), start=start_index):
        # Work on a copy unless the caller asked for in-place modification
        current = entry if in_place else Subtitle(**vars(entry))

        if skip:
            try:
                _should_skip_sub(current)
            except _ShouldSkipException as reason:
                dropped += 1
                if current.index is None:
                    LOG.info("Skipped subtitle with no index: %s", reason)
                else:
                    LOG.info(
                        "Skipped subtitle at index %d: %s", current.index, reason
                    )
                continue

        # Close the gaps left by any subtitles dropped so far
        current.index = position - dropped
        yield current
def parse(srt, ignore_errors=False):
    r'''
    Convert an SRT formatted string (in Python 2, a :class:`unicode` object) to
    a :term:`generator` of Subtitle objects.

    This function works around bugs present in many SRT files, most notably
    that it is designed to not bork when presented with a blank line as part of
    a subtitle's content.

    .. doctest::

        >>> subs = parse("""\
        ... 422
        ... 00:31:39,931 --> 00:31:41,931
        ... Using mainly spoons,
        ...
        ... 423
        ... 00:31:41,933 --> 00:31:43,435
        ... we dig a tunnel under the city and release it into the wild.
        ...
        ... """)
        >>> list(subs)  # doctest: +ELLIPSIS
        [Subtitle(...index=422...), Subtitle(...index=423...)]

    :param srt: Subtitles in SRT format
    :type srt: str or a file-like object
    :param ignore_errors: If True, garbled SRT data will be ignored, and we'll
                          continue trying to parse the rest of the file,
                          instead of raising :py:class:`SRTParseError` and
                          stopping execution.
    :returns: The subtitles contained in the SRT file as :py:class:`Subtitle`
              objects
    :rtype: :term:`generator` of :py:class:`Subtitle` objects
    :raises SRTParseError: If the matches are not contiguous and
                           ``ignore_errors`` is False.
    '''

    # Transparently read files -- the whole thing is needed for regex's
    # finditer
    if isinstance(srt, FILE_TYPES):
        srt = srt.read()

    # Offset at which the next block must begin for the input to be
    # gap-free.
    cursor = 0

    for block in SRT_REGEX.finditer(srt):
        _check_contiguity(srt, cursor, block.start(), ignore_errors)
        index, start, end, proprietary, content = block.groups()

        # pytype sees that this is Optional[str] and thus complains that they
        # can be None, but they can't realistically be None, since we're using
        # finditer and all match groups are mandatory in the regex.
        content = content.replace("\r\n", "\n")  # pytype: disable=attribute-error

        # A missing index stays None; it is dealt with when the subtitle is
        # rendered with to_srt.
        if index is not None:
            try:
                index = int(index)
            except ValueError:
                # Index 123.4: keep only the part before the dot. This is a
                # rare case, so it is kept off the common path.
                index = int(index.partition(".")[0])

        yield Subtitle(
            index=index,
            start=srt_timestamp_to_timedelta(start),
            end=srt_timestamp_to_timedelta(end),
            content=content,
            proprietary=proprietary,
        )

        cursor = block.end()

    # Anything left over after the final block is unparseable trailing data.
    _check_contiguity(srt, cursor, len(srt), ignore_errors)
def compose(
    subtitles, reindex=True, start_index=1, strict=True, eol=None, in_place=False
):
    r"""
    Render an iterator of :py:class:`Subtitle` objects as one string of joined
    SRT blocks.

    .. doctest::

        >>> from datetime import timedelta
        >>> start = timedelta(seconds=1)
        >>> end = timedelta(seconds=2)
        >>> subs = [
        ...     Subtitle(index=1, start=start, end=end, content='x'),
        ...     Subtitle(index=2, start=start, end=end, content='y'),
        ... ]
        >>> compose(subs)  # doctest: +ELLIPSIS
        '1\n00:00:01,000 --> 00:00:02,000\nx\n\n2\n00:00:01,000 --> ...'

    :param subtitles: The subtitles to convert to SRT blocks
    :type subtitles: :term:`iterator` of :py:class:`Subtitle` objects
    :param bool reindex: Whether to reindex subtitles based on start time
    :param int start_index: If reindexing, the index to start reindexing from
    :param bool strict: Whether to enable strict mode, see
                        :py:func:`Subtitle.to_srt` for more information
    :param str eol: The end of line string to use (default "\\n")
    :param bool in_place: Whether to reindex subs in-place for performance
                          (version <=1.0.0 behaviour)
    :returns: A single SRT formatted string, with each input
              :py:class:`Subtitle` represented as an SRT block
    :rtype: str
    """
    # Without reindexing the subtitles are rendered exactly as given, in the
    # order given.
    ordered = (
        sort_and_reindex(subtitles, start_index=start_index, in_place=in_place)
        if reindex
        else subtitles
    )
    blocks = [sub.to_srt(strict=strict, eol=eol) for sub in ordered]
    return "".join(blocks)
def is_strictly_legal_content(content):
    """
    Tell whether ``content`` would survive strict mode untouched. Content is
    rejected when any of the following holds:

    - It starts or ends with a newline (``\\r`` or ``\\n``)
    - It is empty, or only whitespace
    - It contains a blank line
    """
    has_edge_newline = content.strip("\r\n") != content
    is_blank = not content.strip()
    has_blank_line = "\n\n" in content
    return not (has_edge_newline or is_blank or has_blank_line)
def timedeltas(min_value=0, max_value=TIMEDELTA_MAX_DAYS):
    """
    A Hypothesis strategy to generate timedeltas.

    The same ``min_value``/``max_value`` bounds are applied independently to
    the hours, minutes and seconds passed to ``timedelta()``. That is not very
    customisable, but it is good enough for the current tests; add more
    parameters here if finer control is ever needed.
    """
    field = st.integers(min_value=min_value, max_value=max_value)
    return st.builds(timedelta, hours=field, minutes=field, seconds=field)
def subtitles(strict=True):
    """
    A Hypothesis strategy to generate Subtitle objects.

    With ``strict=True`` the generated content is additionally restricted to
    what ``is_strictly_legal_content`` accepts.
    """
    # The bounds are arbitrary, just low enough not to overflow
    # TIMEDELTA_MAX_DAYS. Start and end draw from disjoint ranges so that a
    # generated sub never has start >= end (such subs get skipped).
    starts = timedeltas(min_value=0, max_value=500000)
    ends = timedeltas(min_value=500001, max_value=999999)

    # \r is not legal inside Subtitle.content, it should have already been
    # normalised to \n.
    contents = st.text(min_size=1).filter(lambda text: "\r" not in text)
    if strict:
        contents = contents.filter(is_strictly_legal_content)

    # Proprietary data must not contain any line break.
    proprietaries = st.text().filter(
        lambda text: "\r" not in text and "\n" not in text
    )

    return st.builds(
        srt.Subtitle,
        index=st.integers(min_value=0),
        start=starts,
        end=ends,
        proprietary=proprietaries,
        content=contents,
    )
@given(st.lists(subtitles()))
def test_can_compose_without_eol_at_all(input_subs):
    """
    Input whose trailing line endings have all been stripped should still
    parse back to the original subtitles.
    """
    without_trailing_eol = srt.compose(input_subs, reindex=False).rstrip("\r\n")
    subs_eq(srt.parse(without_trailing_eol), input_subs)
@given(subtitles())
def test_subtitle_from_scratch_equality(subtitle):
    """
    Two subtitles parsed independently from the same SRT block compare equal
    and hash equal.
    """
    rendered = subtitle.to_srt()

    # Parse twice to get two totally new objects, so as not to affect the
    # hash comparison
    first, second = [list(srt.parse(rendered))[0] for _ in range(2)]

    subs_eq([first], [second])
    # In case subs_eq and eq disagree for some reason
    assert first == second
    assert hash(first) == hash(second)
@given(st.lists(subtitles()))
def test_parsing_negative_index(subs):
    """Negative indexes should round-trip through compose and parse."""
    for sub in subs:
        sub.index = -sub.index

    negative_index_block = srt.compose(subs, reindex=False, strict=False)
    subs_eq(srt.parse(negative_index_block), subs)
@given(st.lists(subtitles()))
def test_parsing_no_content(subs):
    """Subtitles with empty content should round-trip in non-strict mode."""
    for sub in subs:
        sub.content = ""

    composed = srt.compose(subs, reindex=False, strict=False)
    subs_eq(srt.parse(composed), subs)
@given(st.lists(subtitles(), min_size=1), st.integers(min_value=0))
def test_sort_and_reindex(input_subs, start_index):
    # Pin every sub to the same end time and index so that only the start
    # time decides the ordering. The pinned end (500001 days) is later than
    # any start generated by subtitles(), so no sub is skipped for having
    # start >= end.
    for sub in input_subs:
        sub.end = timedelta(500001)
        sub.index = 1

    reindexed = list(
        srt.sort_and_reindex(input_subs, start_index=start_index, in_place=True)
    )

    # The subtitles should be reindexed starting at start_index
    expected_indexes = list(range(start_index, start_index + len(input_subs)))
    assert [sub.index for sub in reindexed] == expected_indexes

    # The subtitles should be sorted by start time
    by_start = sorted(input_subs, key=lambda sub: sub.start)
    assert reindexed == by_start
@given(st.lists(subtitles()))
def test_sort_and_reindex_handles_no_index(input_subs):
    # Swap start and end so that every sub starts after it ends, and drop the
    # index. With the default skip=True each sub is then skipped, and skipping
    # a sub whose index is None must not crash.
    for sub in input_subs:
        sub.start, sub.end = sub.end, sub.start
        sub.index = None

    reindexed = list(srt.sort_and_reindex(input_subs))

    # Everything should have been skipped
    assert not reindexed
@given(st.lists(subtitles(), min_size=1), st.integers(min_value=0))
def test_sort_and_reindex_not_in_place_matches(input_subs, start_index):
    def clone_all(subs):
        return [srt.Subtitle(**vars(sub)) for sub in subs]

    # Give each sort_and_reindex call its own copies so that they can't
    # affect each other
    copying_input = clone_all(input_subs)
    mutating_input = clone_all(input_subs)

    copying_input_ids = {id(sub) for sub in copying_input}
    mutating_input_ids = {id(sub) for sub in mutating_input}

    copying_output = list(
        srt.sort_and_reindex(copying_input, start_index=start_index)
    )
    mutating_output = list(
        srt.sort_and_reindex(mutating_input, start_index=start_index, in_place=True)
    )

    # The results in each case should be the same
    subs_eq(copying_output, mutating_output)

    # Not in place sort_and_reindex should have created new subs
    assert not any(id(sub) in copying_input_ids for sub in copying_output)

    # In place sort_and_reindex should be reusing the same subs
    assert all(id(sub) in mutating_input_ids for sub in mutating_output)
@given(
    st.lists(subtitles(), min_size=1),
    st.integers(min_value=0),
    st.text(min_size=1),
    timedeltas(),
)
def test_parser_noncontiguous_ignore_errors(subs, fake_idx, garbage, fake_timedelta):
    """With ignore_errors=True, garbage between blocks must not raise."""
    fake_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta)
    # Vaguely SRT-looking junk, injected after every blank line.
    junk = "\n\n%d\n%s %s" % (fake_idx, fake_timestamp, garbage)
    corrupted = srt.compose(subs).replace("\n\n", junk)

    # Should not raise, we have ignore_errors
    list(srt.parse(corrupted, ignore_errors=True))
@given(
    st.lists(subtitles(), min_size=1),
    st.integers(min_value=0),
    st.text(min_size=1),
    timedeltas(),
)
def test_parser_didnt_match_to_end_raises(subs, fake_idx, garbage, fake_timedelta):
    """
    Unparseable data after the last block raises SRTParseError, and the error
    reports exactly which trailing span could not be matched.
    """
    fake_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta)
    trailing_junk = "\n\n%d\n%s %s" % (fake_idx, fake_timestamp, garbage)
    composed = "".join([sub.to_srt() for sub in subs] + [trailing_junk])

    with pytest.raises(srt.SRTParseError) as raised:
        list(srt.parse(composed))

    # Since we will consume as many \n as needed until we meet the lookahead
    # assertion, leading newlines in the junk will be stripped.
    unmatched = trailing_junk.lstrip("\n")
    error = raised.value

    assert unmatched == error.unmatched_content
    assert len(composed) - len(unmatched) == error.expected_start
    assert len(composed) == error.actual_start
@given(st.lists(subtitles()))
def test_parser_can_parse_with_fullwidth_delimiter(subs):
    """
    Timestamps using fullwidth delimiters should parse to the same subtitles
    as their ASCII equivalents.

    In each block's timestamp line, the first "," is replaced with a
    fullwidth comma (U+FF0C) and the first ":" with a fullwidth colon
    (U+FF1A).
    """
    # Written as escapes so the characters cannot be silently folded back to
    # their ASCII look-alikes, which would turn the replacements into no-ops
    # and leave the fullwidth code path untested.
    fullwidth_comma = "\uff0c"
    fullwidth_colon = "\uff1a"

    fullwidth_blocks = []

    for srt_block in (sub.to_srt() for sub in subs):
        srt_lines = srt_block.split("\n")
        # Line 0 is the index; line 1 is the timestamp line. Only the first
        # occurrence of each delimiter is replaced.
        srt_lines[1] = (
            srt_lines[1]
            .replace(",", fullwidth_comma, 1)
            .replace(":", fullwidth_colon, 1)
        )
        fullwidth_blocks.append("\n".join(srt_lines))

    composed_with_fullwidth = "".join(fullwidth_blocks)
    reparsed_subs = srt.parse(composed_with_fullwidth)
    subs_eq(reparsed_subs, subs)
@given(st.lists(subtitles()))
def test_compose_and_parse_strict_crlf(input_subs):
    """Output rewritten to use CRLF line endings should parse back the same."""
    crlf_composed = srt.compose(input_subs, reindex=False).replace("\n", "\r\n")
    reparsed = list(srt.parse(crlf_composed))

    # Compare using \n only, as Subtitle.content is expected to hold
    for parsed_sub in reparsed:
        parsed_sub.content = parsed_sub.content.replace("\r\n", "\n")

    subs_eq(reparsed, input_subs)
def test_compose_and_parse_strict_custom_eol(input_subs, eol):
    """Subtitles composed with the supplied ``eol`` must parse back equal
    (per ``subs_eq``) to the input."""
    composed = srt.compose(input_subs, reindex=False, eol=eol)
    reparsed_subs = srt.parse(composed)
    subs_eq(reparsed_subs, input_subs)


@given(equivalent_timestamps())
def test_equal_timestamps_despite_different_fields_parsed_as_equal(timestamps):
    """Both timestamps of a generated pair must parse to equal values."""
    ts1, ts2 = timestamps
    assert srt.srt_timestamp_to_timedelta(ts1) == srt.srt_timestamp_to_timedelta(ts2)


@given(timedeltas())
def test_bad_timestamp_format_raises(ts):
    """A rendered timestamp whose first ":" is replaced must fail to parse."""
    valid_timestamp = srt.timedelta_to_srt_timestamp(ts)
    # Only the first colon is swapped, so exactly one separator is invalid
    malformed_timestamp = valid_timestamp.replace(":", "t", 1)
    with pytest.raises(srt.TimestampParseError):
        srt.srt_timestamp_to_timedelta(malformed_timestamp)


@given(st.lists(subtitles()), st.lists(st.sampled_from(string.whitespace)))
def test_can_parse_index_trailing_ws(input_subs, whitespace):
    """Whitespace appended to the first line of each block must not change
    the parse result."""
    # The suffix is the same for every block, so build it once
    trailing = "".join(whitespace)
    blocks = []

    for sub in input_subs:
        lines = sub.to_srt().split("\n")
        lines[0] = lines[0] + trailing
        blocks.append("\n".join(lines))

    reparsed_subs = srt.parse("".join(blocks))
    subs_eq(reparsed_subs, input_subs)


@given(st.lists(subtitles()))
def test_can_parse_index_with_dot(input_subs):
    """A first line of the form ``<line>.<line>`` must still parse to the
    original subtitle."""
    # Seen in Battlestar Galactica subs
    blocks = []

    for sub in input_subs:
        lines = sub.to_srt().split("\n")
        lines[0] = lines[0] + "." + lines[0]
        blocks.append("\n".join(lines))

    reparsed_subs = srt.parse("".join(blocks))
    subs_eq(reparsed_subs, input_subs)


@given(st.lists(subtitles()), st.lists(st.just("0")))
def test_can_parse_index_leading_zeroes(input_subs, zeroes):
    """Zeroes prepended to the first line of each block must not change the
    parse result."""
    # The prefix is the same for every block, so build it once
    leading = "".join(zeroes)
    blocks = []

    for sub in input_subs:
        lines = sub.to_srt().split("\n")
        lines[0] = leading + lines[0]
        blocks.append("\n".join(lines))

    reparsed_subs = srt.parse("".join(blocks))
    subs_eq(reparsed_subs, input_subs)


@given(st.lists(subtitles(), min_size=1))
def test_parse_file_with_missing_index(input_subs):  # cf. issue #51
    """Blocks with no index line parse to ``index is None``, blocks with index
    0 parse to ``index == 0``, and both compose to identical output."""
    for sub in input_subs:
        try:
            int(sub.content.strip().split("\n")[-1])
        except ValueError:
            # Last content line is not a number: this example is usable
            pass
        else:
            # If the final line with actual content is a number, we'll parse it
            # as the index, so ignore that
            assume(False)

    blocks_without_index = []
    for sub in input_subs:
        block = sub.to_srt()
        # Drop everything up to and including the first newline (the first line)
        blocks_without_index.append(block[block.index("\n") + 1 :])
    out_no_index = "".join(blocks_without_index)

    # Work on copies so that the generated input is not mutated
    input_subs_copy = [srt.Subtitle(**vars(sub)) for sub in input_subs]
    for sub_copy in input_subs_copy:
        # A sub whose index is None will get rendered in to_srt as 0
        sub_copy.index = 0
    out_zero_index = "".join(sub_copy.to_srt() for sub_copy in input_subs_copy)

    subs_no_index = list(srt.parse(out_no_index))
    subs_zero_index = list(srt.parse(out_zero_index))

    # One should have index None, one should have index == 0...
    assert subs_no_index
    assert subs_zero_index
    assert all(sub.index is None for sub in subs_no_index)
    assert all(sub.index == 0 for sub in subs_zero_index)
    assert subs_no_index != subs_zero_index

    # ...but they should render the same...
    assert srt.compose(subs_no_index, reindex=False) == srt.compose(
        subs_zero_index, reindex=False
    )

    # ...and sort the same.
    assert srt.compose(subs_no_index) == srt.compose(subs_zero_index)