├── srt_tools
    ├── __init__.py
    ├── tests
    │   ├── __init__.py
    │   ├── files
    │   │   ├── gb2312.srt
    │   │   └── ascii.srt
    │   └── test_srt_tools.py
    ├── srt-normalise
    ├── srt-fixed-timeshift
    ├── srt
    ├── srt-play
    ├── srt-process
    ├── srt-lines-matching
    ├── README.rst
    ├── srt-deduplicate
    ├── srt-linear-timeshift
    ├── srt-mux
    └── utils.py
├── docs
    ├── requirements.txt
    ├── api.rst
    ├── index.rst
    ├── conf.py
    └── quickstart.rst
├── .coveragerc
├── MANIFEST.in
├── tests
    ├── requirements.txt
    └── test_srt.py
├── LICENSE
├── .github
    └── workflows
    │   └── ci.yml
├── tox.ini
├── setup.py
├── README.rst
└── srt.py


/srt_tools/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/srt_tools/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==3.*
2 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | relative_files = True
3 | 


--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | API documentation
2 | =================
3 | 
4 | .. automodule:: srt
5 |    :members:
6 | 


--------------------------------------------------------------------------------
/srt_tools/tests/files/gb2312.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cdown/srt/HEAD/srt_tools/tests/files/gb2312.srt


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include MANIFEST.in
3 | include README.rst
4 | recursive-include docs *
5 | recursive-include tests *
6 | 
7 | recursive-exclude * *.py[co]
8 | recursive-exclude * __pycache__
9 | 


--------------------------------------------------------------------------------
/srt_tools/tests/files/ascii.srt:
--------------------------------------------------------------------------------
 1 | 2
 2 | 00:00:27,000 --> 00:00:30,730
 3 | ascii
 4 | 
 5 | 4
 6 | 00:00:31,500 --> 00:00:34,100
 7 | oh look
 8 | 
 9 | 6
10 | 00:00:34,100 --> 00:00:36,570
11 | ascii everywhere
12 | 
13 | 


--------------------------------------------------------------------------------
/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest==4.*; python_version < '3.0'
2 | pytest==6.*; python_version >= '3.0'
3 | pytest-xdist==1.*; python_version < '3.0'
4 | pytest-xdist==2.*; python_version >= '3.0'
5 | pytest-cov==2.*
6 | hypothesis==4.*; python_version < '3.6'
7 | hypothesis==6.*; python_version >= '3.6'
8 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | `srt`: Parse SubRip files
 2 | =========================
 3 | 
 4 | srt_ is a tiny Python library for parsing, modifying, and composing SRT files.
 5 | 
 6 | .. _srt: https://github.com/cdown/srt
 7 | 
 8 | Documentation
 9 | =============
10 | 
11 | .. toctree::
12 |    :maxdepth: 2
13 | 
14 |    quickstart
15 |    api
16 | 
17 | Indices and tables
18 | ==================
19 | 
20 | * :ref:`genindex`
21 | * :ref:`search`
22 | 


--------------------------------------------------------------------------------
/srt_tools/srt-normalise:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """Takes a badly formatted SRT file and outputs a strictly valid one."""
 4 | 
 5 | import srt_tools.utils
 6 | import logging
 7 | 
 8 | log = logging.getLogger(__name__)
 9 | 
10 | 
def main():
    """Read the input subtitles and write them back out recomposed.

    Arguments come from the shared srt_tools parser; the composed text is
    written to ``args.output``.
    """
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples={"Normalise a subtitle": "srt normalise -i bad.srt -o good.srt"},
        hide_no_strict=True,
    )
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    normalised = srt_tools.utils.compose_suggest_on_fail(
        args.input, strict=args.strict
    )

    try:
        args.output.write(normalised)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(normalised.encode(args.encoding))
26 | 
27 | if __name__ == "__main__":  # pragma: no cover
28 |     main()
29 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | 
 4 | # srt.py is in the next directory up
 5 | sys.path.insert(0, os.path.abspath(".."))
 6 | 
 7 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.doctest"]
 8 | 
 9 | copyright = "Chris Down"
10 | exclude_patterns = ["_build"]
11 | master_doc = "index"
12 | project = "srt"
13 | pygments_style = "sphinx"
14 | source_suffix = ".rst"
15 | templates_path = ["_templates"]
16 | 
17 | version = "3.5.3"
18 | release = version
19 | 
20 | html_static_path = ["_static"]
21 | html_theme = "alabaster"
22 | htmlhelp_basename = "srtdoc"
23 | 
24 | latex_elements = {}
25 | latex_documents = [("index", "srt.tex", "srt Documentation", "Chris Down", "manual")]
26 | 
27 | man_pages = [("index", "srt", "srt Documentation", ["Chris Down"], 1)]
28 | 
29 | texinfo_documents = [
30 |     (
31 |         "index",
32 |         "srt",
33 |         "srt Documentation",
34 |         "Chris Down",
35 |         "srt",
36 |         "One line description of project.",
37 |         "Miscellaneous",
38 |     )
39 | ]
40 | 
41 | intersphinx_mapping = {"python": ("https://docs.python.org/3.8", None)}
42 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License
 2 | 
 3 | Copyright (c) 2014-present Christopher Down
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | jobs:
 2 |   build_and_test:
 3 |     name: CI
 4 |     strategy:
 5 |       matrix:
 6 |         # Pin to 20.04 for 3.6: https://github.com/actions/setup-python/issues/544
 7 |         os: [ubuntu-20.04, macos-latest, windows-latest]
 8 |         python-version: ['3.5', '3.6', '3.7', '3.8', '3.9', '3.10', '3.11']
 9 |     runs-on: ${{ matrix.os }}
10 | 
11 |     steps:
12 |       - uses: actions/checkout@v3
13 | 
14 |       - uses: actions/setup-python@v4
15 |         with:
16 |           python-version: ${{ matrix.python-version }}
17 |       - run: python --version
18 | 
19 |       - run: pip install -U pip
20 |       - run: pip install -U tox
21 | 
22 |       - if: matrix.python-version == '3.9' && startsWith(matrix.os, 'ubuntu-')
23 |         run: |
24 |           echo "TOXENV=doctest,black,pylint,pytype,bandit,coverage" >> "$GITHUB_ENV"
25 | 
26 |       - run: tox
27 |         env:
28 |           TOXENV: ${{ env.TOXENV }}
29 | 
30 |       - if: matrix.python-version == '3.9' && startsWith(matrix.os, 'ubuntu-')
31 |         uses: AndreMiras/coveralls-python-action@develop
32 | 
33 | on:
34 |   push:
35 |   pull_request:
36 |   workflow_dispatch:
37 | 


--------------------------------------------------------------------------------
/docs/quickstart.rst:
--------------------------------------------------------------------------------
 1 | Quickstart
 2 | ==========
 3 | 
 4 | Parse an SRT to Python objects
 5 | ------------------------------
 6 | 
 7 | .. code:: python
 8 | 
 9 |     >>> import srt
10 |     >>> subtitle_generator = srt.parse('''\
11 |     ... 1
12 |     ... 00:31:37,894 --> 00:31:39,928
13 |     ... OK, look, I think I have a plan here.
14 |     ...
15 |     ... 2
16 |     ... 00:31:39,931 --> 00:31:41,931
17 |     ... Using mainly spoons,
18 |     ...
19 |     ... 3
20 |     ... 00:31:41,933 --> 00:31:43,435
21 |     ... we dig a tunnel under the city and release it into the wild.
22 |     ...
23 |     ... ''')
24 |     >>> subtitles = list(subtitle_generator)
25 |     >>>
26 |     >>> subtitles[0].start
27 |     datetime.timedelta(0, 1897, 894000)
28 |     >>> subtitles[1].content
29 |     'Using mainly spoons,'
30 | 
31 | Compose an SRT from Python objects
32 | ----------------------------------
33 | 
34 | .. code:: python
35 | 
36 |     >>> print(srt.compose(subtitles))
37 |     1
38 |     00:31:37,894 --> 00:31:39,928
39 |     OK, look, I think I have a plan here.
40 |     <BLANKLINE>
41 |     2
42 |     00:31:39,931 --> 00:31:41,931
43 |     Using mainly spoons,
44 |     <BLANKLINE>
45 |     3
46 |     00:31:41,933 --> 00:31:43,435
47 |     we dig a tunnel under the city and release it into the wild.
48 |     <BLANKLINE>
49 |     <BLANKLINE>
50 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = python
 3 | 
 4 | [testenv]
 5 | deps =
 6 |     -rtests/requirements.txt
 7 | commands =
 8 |     {basepython} --version
 9 |     pytest -vv -n auto
10 | allowlist_externals =
11 |     {basepython}
12 |     pytest
13 | setenv=
14 |     release: HYPOTHESIS_PROFILE=release
15 | 
16 | [testenv:doctest]
17 | deps =
18 |     {[testenv]deps}
19 | commands =
20 |     pytest --doctest-modules
21 | 
22 | [testenv:coverage]
23 | passenv =
24 |     TRAVIS
25 |     TRAVIS_JOB_ID
26 |     TRAVIS_BRANCH
27 | deps =
28 |     {[testenv]deps}
29 |     coverage
30 | commands =
31 |     coverage erase
32 |     pytest -vv --cov=srt --cov-branch --cov-fail-under=100 --cov-report term-missing
33 | 
34 | [testenv:pylint]
35 | skipsdist = True
36 | deps =
37 |     {[testenv]deps}
38 |     pylint
39 | commands =
40 |     # C0330: https://github.com/psf/black/issues/1178
41 |     # R0913: These are intentional design decisions, so leave them.
42 |     # R0205, R1725, C0209: We still support py2.
43 |     pylint --disable=C0330,R0913,R0205,R1725,C0209 srt.py
44 | 
45 | [testenv:black]
46 | skipsdist = True
47 | allowlist_externals = sh
48 | deps =
49 |     black
50 | commands =
51 |     black --check .
52 |     sh -c 'exec black --check srt_tools/srt*'
53 | 
54 | [testenv:pytype]
55 | skipsdist = True
56 | deps =
57 |     {[testenv]deps}
58 |     pytype
59 | commands =
60 |     pytype .
61 | 
62 | [testenv:bandit]
63 | skipsdist = True
64 | deps =
65 |     {[testenv]deps}
66 |     bandit
67 | commands =
68 |     bandit srt.py
69 | 
70 | [testenv:pypy3]
71 | basepython = pypy3
72 | 


--------------------------------------------------------------------------------
/srt_tools/srt-fixed-timeshift:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """Shifts a subtitle by a fixed number of seconds."""
 4 | 
 5 | import datetime
 6 | import srt_tools.utils
 7 | import logging
 8 | 
 9 | log = logging.getLogger(__name__)
10 | 
11 | 
def parse_args():
    """Build the fixed-timeshift argument parser and return the parsed arguments.

    Adds a required ``--seconds`` float option on top of the shared srt_tools
    parser.
    """
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples={
            "Make all subtitles 5 seconds later": "srt fixed-timeshift --seconds 5",
            "Make all subtitles 5 seconds earlier": "srt fixed-timeshift --seconds -5",
        },
    )
    parser.add_argument(
        "--seconds",
        help="how many seconds to shift",
        required=True,
        type=float,
    )
    return parser.parse_args()
23 | 
24 | 
def scalar_correct_subs(subtitles, seconds_to_shift):
    """Lazily shift every subtitle's start and end by a fixed number of seconds.

    Each subtitle is modified in place and then yielded; a negative
    ``seconds_to_shift`` moves subtitles earlier.
    """
    shift = datetime.timedelta(seconds=seconds_to_shift)
    for sub in subtitles:
        sub.start = sub.start + shift
        sub.end = sub.end + shift
        yield sub
31 | 
32 | 
def main():
    """Shift all input subtitles by ``--seconds`` and write the composed result."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    shifted = scalar_correct_subs(args.input, args.seconds)
    composed = srt_tools.utils.compose_suggest_on_fail(shifted, strict=args.strict)

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(composed.encode(args.encoding))
44 | 
45 | 
46 | if __name__ == "__main__":  # pragma: no cover
47 |     main()
48 | 


--------------------------------------------------------------------------------
/srt_tools/srt:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import sys
 5 | import errno
 6 | 
 7 | 
 8 | SRT_BIN_PREFIX = "srt-"
 9 | 
10 | 
11 | def find_srt_commands_in_path():
12 |     paths = os.environ.get("PATH", "").split(os.pathsep)
13 | 
14 |     for path in paths:
15 |         try:
16 |             path_files = os.listdir(path)
17 |         except OSError as thrown_exc:
18 |             if thrown_exc.errno in (errno.ENOENT, errno.ENOTDIR):
19 |                 continue
20 |             else:
21 |                 raise
22 | 
23 |         for path_file in path_files:
24 |             if path_file.startswith(SRT_BIN_PREFIX):
25 |                 yield path_file[len(SRT_BIN_PREFIX) :]
26 | 
27 | 
def show_help():
    """Print a header followed by the sorted, unique srt subcommands on PATH."""
    header = (
        "Available commands "
        "(pass --help to a specific command for usage information):\n"
    )
    print(header)

    for command in sorted(set(find_srt_commands_in_path())):
        print("- {}".format(command))
36 | 
37 | 
def main():
    """Dispatch ``srt <command> [args ...]`` to the ``srt-<command>`` executable.

    With no command, or when the first argument looks like an option, the
    command list is printed and the process exits with status 0. An unknown
    command prints an error plus the command list and exits with status 1.
    Otherwise the current process is replaced by the real command.
    """
    argv = sys.argv

    if len(argv) < 2 or argv[1].startswith("-"):
        show_help()
        sys.exit(0)

    command = argv[1]

    if command not in find_srt_commands_in_path():
        print('Unknown command: "{}"\n'.format(command))
        show_help()
        sys.exit(1)

    real_command = SRT_BIN_PREFIX + command
    os.execvp(real_command, [real_command] + argv[2:])
54 | 
55 | 
56 | if __name__ == "__main__":  # pragma: no cover
57 |     main()
58 | 


--------------------------------------------------------------------------------
/srt_tools/srt-play:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """Play subtitles with correct timing to stdout."""
 4 | 
 5 | from __future__ import print_function
 6 | import logging
 7 | from threading import Timer, Lock
 8 | import srt_tools.utils
 9 | import sys
10 | import time
11 | 
12 | log = logging.getLogger(__name__)
13 | output_lock = Lock()
14 | 
15 | 
def print_sub(sub, encoding):
    """Write one subtitle's content, followed by a blank line, to stdout.

    Used as the callback of the timers created in ``schedule``; writes are
    serialised through ``output_lock``.
    """
    log.debug("Timer woke up to print %s", sub.content)

    with output_lock:
        out = sys.stdout
        try:
            out.write(sub.content + "\n\n")
        except UnicodeEncodeError:  # Python 2 fallback
            out.write(sub.content.encode(encoding) + "\n\n")
        out.flush()
25 | 
26 | 
def schedule(subs, encoding):
    """Print every subtitle at its start offset, blocking until all are done.

    One daemon ``Timer`` is created per subtitle, set to call ``print_sub``
    after ``sub.start`` seconds. All timers are started only once every
    subtitle has been queued, and the function then polls until none of them
    is alive any more.
    """
    log.debug("Scheduling subtitles")
    pending = set()

    for sub in subs:
        delay = sub.start.total_seconds()
        timer = Timer(delay, print_sub, [sub, encoding])
        timer.name = "%s:%s" % (sub.index, delay)
        timer.daemon = True
        log.debug('Adding "%s" to schedule queue', timer.name)
        pending.add(timer)

    for timer in pending:
        log.debug('Starting timer for "%s"', timer.name)
        timer.start()

    while True:
        if not any(timer.is_alive() for timer in pending):
            break
        time.sleep(0.5)
45 | 
46 | 
def main():
    """Parse the command line and play the input subtitles to stdout."""
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples={"Play a subtitle": "srt play -i foo.srt"},
        no_output=True,
    )
    args = parser.parse_args()

    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    schedule(args.input, args.encoding)
56 | 
57 | 
58 | if __name__ == "__main__":  # pragma: no cover
59 |     main()
60 | 


--------------------------------------------------------------------------------
/srt_tools/srt-process:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """Process subtitle text content using arbitrary Python code."""
 4 | 
 5 | import importlib
 6 | import srt_tools.utils
 7 | import logging
 8 | 
 9 | log = logging.getLogger(__name__)
10 | 
11 | 
def strip_to_matching_lines_only(subtitles, imports, func_str):
    """Lazily replace each subtitle's content with ``func(content)``.

    Despite its name, this function transforms content rather than filtering
    lines.

    :param subtitles: iterable of subtitle objects with a ``content`` attribute
    :param imports: module names to import and expose to ``func_str``
    :param str func_str: Python expression that evaluates to a callable taking
        the subtitle content and returning the new content
    :returns: generator yielding the same subtitle objects, modified in place
    """
    namespace = globals()
    for import_name in imports:
        namespace[import_name] = importlib.import_module(import_name)

        # For a dotted name such as "os.path" the expression refers to it via
        # the top-level package ("os"), so that name has to be bound as well.
        top_level = import_name.partition(".")[0]
        if top_level != import_name:
            namespace[top_level] = importlib.import_module(top_level)

    # NOTE: func_str is executed as arbitrary Python code. That is the purpose
    # of this tool, but it must only ever come from the invoking user.
    func = eval(func_str)  # pylint: disable=eval-used

    for subtitle in subtitles:
        subtitle.content = func(subtitle.content)
        yield subtitle
22 | 
23 | 
def parse_args():
    """Build the srt-process argument parser and return the parsed arguments.

    Adds a required ``-f/--func`` option and a repeatable ``-m/--module``
    option on top of the shared srt_tools parser.
    """
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples={
            "Strip HTML-like symbols from a subtitle": """srt process -m re -f 'lambda sub: re.sub("<[^<]+?>", "", sub)'"""
        },
    )
    parser.add_argument(
        "-f",
        "--func",
        required=True,
        help="a function to use to process lines",
    )
    parser.add_argument(
        "-m",
        "--module",
        action="append",
        default=[],
        help="modules to import in the function context",
    )
    return parser.parse_args()
41 | 
42 | 
def main():
    """Run the user-supplied function over every subtitle and write the result."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    composed = srt_tools.utils.compose_suggest_on_fail(
        strip_to_matching_lines_only(args.input, args.module, args.func),
        strict=args.strict,
    )

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(composed.encode(args.encoding))
54 | 
55 | 
56 | if __name__ == "__main__":  # pragma: no cover
57 |     main()
58 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import codecs
 4 | 
 5 | from setuptools import setup
 6 | 
 7 | with codecs.open("README.rst", encoding="utf8") as readme_f:
 8 |     README = readme_f.read()
 9 | 
10 | setup(
11 |     name="srt",
12 |     version="3.5.3",
13 |     python_requires=">=2.7",
14 |     description="A tiny library for parsing, modifying, and composing SRT files.",
15 |     long_description=README,
16 |     author="Chris Down",
17 |     author_email="chris@chrisdown.name",
18 |     url="https://github.com/cdown/srt",
19 |     py_modules=["srt", "srt_tools.utils"],
20 |     scripts=[
21 |         "srt_tools/srt",
22 |         "srt_tools/srt-deduplicate",
23 |         "srt_tools/srt-normalise",
24 |         "srt_tools/srt-fixed-timeshift",
25 |         "srt_tools/srt-linear-timeshift",
26 |         "srt_tools/srt-lines-matching",
27 |         "srt_tools/srt-mux",
28 |         "srt_tools/srt-play",
29 |         "srt_tools/srt-process",
30 |     ],
31 |     license="MIT",
32 |     keywords="srt",
33 |     classifiers=[
34 |         "Development Status :: 5 - Production/Stable",
35 |         "Intended Audience :: Developers",
36 |         "License :: OSI Approved :: MIT License",
37 |         "Operating System :: OS Independent",
38 |         "Programming Language :: Python :: 2",
39 |         "Programming Language :: Python :: 2.7",
40 |         "Programming Language :: Python :: 3",
41 |         "Programming Language :: Python :: 3.5",
42 |         "Programming Language :: Python :: 3.6",
43 |         "Programming Language :: Python :: 3.7",
44 |         "Programming Language :: Python :: 3.8",
45 |         "Programming Language :: Python :: 3.9",
46 |         "Programming Language :: Python :: 3.10",
47 |         "Programming Language :: Python :: 3.11",
48 |         "Topic :: Multimedia :: Video",
49 |         "Topic :: Software Development :: Libraries",
50 |         "Topic :: Text Processing",
51 |     ],
52 | )
53 | 


--------------------------------------------------------------------------------
/srt_tools/srt-lines-matching:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """Filter subtitles that match or don't match a particular pattern."""
 4 | 
 5 | import importlib
 6 | import srt_tools.utils
 7 | import logging
 8 | 
 9 | log = logging.getLogger(__name__)
10 | 
11 | 
def strip_to_matching_lines_only(subtitles, imports, func_str, invert, per_sub):
    """Lazily blank out subtitle text that does not match a predicate.

    :param subtitles: iterable of subtitle objects with a ``content`` attribute
    :param imports: module names to import and expose to ``func_str``
    :param str func_str: Python expression that evaluates to a callable taking
        a string and returning a truthy value for text to keep
    :param bool invert: keep text for which the callable returns a falsy value
        instead
    :param bool per_sub: test the whole content of each subtitle at once (and
        empty it on failure) rather than filtering line by line
    :returns: generator yielding the same subtitle objects, modified in place
    """
    namespace = globals()
    for import_name in imports:
        namespace[import_name] = importlib.import_module(import_name)

        # For a dotted name such as "os.path" the expression refers to it via
        # the top-level package ("os"), so that name has to be bound as well.
        top_level = import_name.partition(".")[0]
        if top_level != import_name:
            namespace[top_level] = importlib.import_module(top_level)

    # NOTE: func_str is executed as arbitrary Python code. That is the purpose
    # of this tool, but it must only ever come from the invoking user.
    raw_func = eval(func_str)  # pylint: disable=eval-used

    if invert:
        func = lambda line: not raw_func(line)
    else:
        func = raw_func

    for subtitle in subtitles:
        if per_sub:
            if not func(subtitle.content):
                subtitle.content = ""
        else:
            subtitle.content = "\n".join(
                line for line in subtitle.content.splitlines() if func(line)
            )

        yield subtitle
34 | 
35 | 
def parse_args():
    """Build the lines-matching argument parser and return the parsed arguments.

    Adds ``-f/--func`` (required), a repeatable ``-m/--module``, and the
    ``-s/--per-subtitle`` and ``-v/--invert`` switches on top of the shared
    srt_tools parser.
    """
    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples={
            "Only include Chinese lines": "srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese",
            "Exclude all lines which only contain numbers": "srt lines-matching -v -f 'lambda x: x.isdigit()'",
        },
    )
    parser.add_argument(
        "-f",
        "--func",
        required=True,
        help="a function to use to match lines",
    )
    parser.add_argument(
        "-m",
        "--module",
        action="append",
        default=[],
        help="modules to import in the function context",
    )

    # The remaining options are plain on/off switches.
    switches = (
        ("-s", "--per-subtitle", "match the content of each subtitle, not each line"),
        ("-v", "--invert", "invert matching -- only match lines returning False"),
    )
    for short_flag, long_flag, help_text in switches:
        parser.add_argument(short_flag, long_flag, help=help_text, action="store_true")

    return parser.parse_args()
65 | 
66 | 
def main():
    """Filter the input subtitles with the user's predicate and write the result."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    filtered = strip_to_matching_lines_only(
        args.input, args.module, args.func, args.invert, args.per_subtitle
    )
    composed = srt_tools.utils.compose_suggest_on_fail(filtered, strict=args.strict)

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(composed.encode(args.encoding))
82 | 
83 | 
84 | if __name__ == "__main__":  # pragma: no cover
85 |     main()
86 | 


--------------------------------------------------------------------------------
/srt_tools/README.rst:
--------------------------------------------------------------------------------
 1 | srt_tools contains utilities written to process SRT files. All utilities use
 2 | the Python srt_ library internally.
 3 | 
 4 | .. _srt: https://github.com/cdown/srt
 5 | 
 6 | Usage
 7 | -----
 8 | 
 9 | You can call ``srt`` directly to see a list of all available utilities.
10 | 
11 | .. code::
12 | 
13 |     srt [utility-name] [args ...]
14 | 
15 | Arbitrary things can be done with *srt process* and *srt lines-matching*, for
16 | example:
17 | 
18 | .. code::
19 | 
20 |     # Strip HTML
21 |     srt process -m re -f 'lambda sub: re.sub("<[^<]+?>", "", sub)'
22 | 
23 |     # Only keep Chinese subtitles
24 |     srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese
25 | 
26 | Utilities
27 | ---------
28 | 
29 | - *deduplicate* removes subtitles with duplicate content. If you have subtitles
30 |   which mistakenly repeat the same content in different subs at roughly the
31 |   same time, you can run this tool to remove them.
32 | - *fixed-timeshift* does fixed time correction. For example, if you have a
33 |   movie that is consistently out of sync by two seconds, you can run this tool
34 |   to shift the entire subtitle two seconds ahead or behind.
35 | - *linear-timeshift* does linear time correction. If you have a movie that
36 |   runs slower or faster than the subtitle that you have, it will repeatedly
37 |   lose sync. This tool can apply linear time corrections to all subtitles in
38 |   the SRT, resyncing it with the video.
39 | - *lines-matching* takes a function and removes lines for which it doesn't
40 |   return true. For example, you can keep only lines that contain Chinese
41 |   by installing the hanzidentifier_ package, and running
42 |   ``srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese < input``.
43 | - *mux* can mux_ multiple subtitles together into one. For example, if you
44 |   have a Chinese subtitle and an English subtitle, and you want to have one
45 |   subtitle file that contains both, this tool can do that for you. It also
46 |   supports clamping subtitles starting or ending at similar times to the same
47 |   time to avoid subtitles jumping around the screen.
48 | - *normalise* standardises and cleans up SRT files. For example, it removes
49 |   spurious newlines, normalises timestamps, and fixes subtitle indexing to a
50 |   format that all media players should accept, with no noncompliant data.
51 | - *play* plays subtitles in the terminal at the time they are scheduled to
52 |   display (note: it does not clear them from the screen afterwards). If you
53 |   need to fast-forward to some point, you can combine it with
54 |   *fixed-timeshift*.
55 | - *process* allows processing text freely. It takes a function, similarly to
56 |   *lines-matching*, and changes SRT content into the return value. For example,
57 |   you can naively strip some basic HTML-like markup with
58 |   ``srt process -m re -f 'lambda sub: re.sub("<[^<]+?>", "", sub)'``. HTML-like
59 |   syntax is especially prevalent in `SSA/ASS`_ subtitles that have been
60 |   directly converted to SRT.
61 | 
62 | .. _mux: https://en.wikipedia.org/wiki/Multiplexing
63 | .. _`SSA/ASS`: https://en.wikipedia.org/wiki/SubStation_Alpha
64 | .. _hanzidentifier: https://github.com/tsroten/hanzidentifier
65 | 


--------------------------------------------------------------------------------
/srt_tools/srt-deduplicate:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """Deduplicate repeated subtitles."""
 4 | 
 5 | import datetime
 6 | import srt_tools.utils
 7 | import logging
 8 | import operator
 9 | 
10 | log = logging.getLogger(__name__)
11 | 
# Compatibility shim: rebind ``range`` to ``xrange`` where that name exists.
try:  # Python 2
    range = xrange  # pytype: disable=name-error
except NameError:
    # ``xrange`` is not defined here, so ``range`` is left untouched.
    pass
16 | 
17 | 
def parse_args():
    """Build the deduplicate argument parser and return the parsed arguments.

    Adds ``-t/--ms`` on top of the shared srt_tools parser. The value is
    converted to a ``datetime.timedelta`` and defaults to 5000 milliseconds.
    """
    default_window = datetime.timedelta(milliseconds=5000)

    parser = srt_tools.utils.basic_parser(
        description=__doc__,
        examples={
            "Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt",
            "Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt",
            "Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt",
        },
    )
    parser.add_argument(
        "-t",
        "--ms",
        metavar="MILLISECONDS",
        default=default_window,
        # Kept as a lambda: argparse uses the callable's name in its
        # "invalid ... value" error message.
        type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
        help=(
            "how many milliseconds distance a subtitle start time must be "
            "within of another to be considered a duplicate "
            "(default: 5000ms)"
        ),
    )

    return parser.parse_args()
40 | 
41 | 
def deduplicate_subs(orig_subs, acceptable_diff):
    """Remove subtitles with duplicated content from ``orig_subs`` in place.

    Subtitles are ordered by ``(content, start)`` and each one is compared
    with its immediate predecessor in that order. It is a duplicate when the
    content is equal and either ``acceptable_diff`` is falsy (no time limit)
    or it starts no later than the predecessor's start plus
    ``acceptable_diff``. The later of the two is the one removed.

    :param list orig_subs: subtitles to deduplicate; modified in place
    :param acceptable_diff: largest start-time distance at which equal content
        still counts as a duplicate (the caller passes a
        ``datetime.timedelta``); a falsy value disables the time check
    :returns: None
    """
    indices_to_remove = []

    # If we only store the subtitle itself and compare that, it's possible that
    # we'll not only remove the duplicate, but also the _original_ subtitle if
    # they have the same sub index/times/etc.
    #
    # As such, we need to also store the index in the original subs list that
    # this entry belongs to for each subtitle prior to sorting.
    sorted_subs = sorted(
        enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)
    )

    for subs in srt_tools.utils.sliding_window(sorted_subs, width=2, inclusive=False):
        cur_idx, cur_sub = subs[0]
        next_idx, next_sub = subs[1]

        if cur_sub.content == next_sub.content and (
            not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start
        ):
            log.debug(
                "Marking l%d/s%d for removal, duplicate of l%d/s%d",
                next_idx,
                next_sub.index,
                cur_idx,
                cur_sub.index,
            )
            indices_to_remove.append(next_idx)

    # The indices were collected in (content, start) order, not list order.
    # Deleting from the highest index downwards keeps every remaining index
    # valid, so exactly the marked entries are removed.
    for idx in sorted(indices_to_remove, reverse=True):
        del orig_subs[idx]
76 | 
77 | 
def main():
    """Deduplicate the input subtitles and write the composed result."""
    args = parse_args()
    logging.basicConfig(level=args.log_level)
    srt_tools.utils.set_basic_args(args)

    # deduplicate_subs works in place on a list, so materialise the input first.
    remaining = list(args.input)
    deduplicate_subs(remaining, args.ms)

    composed = srt_tools.utils.compose_suggest_on_fail(remaining, strict=args.strict)

    try:
        args.output.write(composed)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        args.output.write(composed.encode(args.encoding))
93 | 
94 | 
95 | if __name__ == "__main__":  # pragma: no cover
96 |     main()
97 | 


--------------------------------------------------------------------------------
/srt_tools/srt-linear-timeshift:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """Perform linear time correction on a subtitle."""
  4 | 
  5 | from __future__ import division
  6 | 
  7 | import srt
  8 | import datetime
  9 | import srt_tools.utils
 10 | import logging
 11 | 
 12 | log = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | def timedelta_to_milliseconds(delta):
 16 |     return delta.days * 86400000 + delta.seconds * 1000 + delta.microseconds / 1000
 17 | 
 18 | 
 19 | def parse_args():
 20 |     def srt_timestamp_to_milliseconds(parser, arg):
 21 |         try:
 22 |             delta = srt.srt_timestamp_to_timedelta(arg)
 23 |         except ValueError:
 24 |             parser.error("not a valid SRT timestamp: %s" % arg)
 25 |         else:
 26 |             return timedelta_to_milliseconds(delta)
 27 | 
 28 |     examples = {
 29 |         "Stretch out a subtitle so that second 1 is 1, 2 is 3, 3 is 5, etc": "srt linear-timeshift --f1 00:00:01,000 --t1 00:00:01,000 --f2 00:00:02,000 --t2 00:00:03,000"
 30 |     }
 31 | 
 32 |     parser = srt_tools.utils.basic_parser(description=__doc__, examples=examples)
 33 |     parser.add_argument(
 34 |         "--from-start",
 35 |         "--f1",
 36 |         type=lambda arg: srt_timestamp_to_milliseconds(parser, arg),
 37 |         required=True,
 38 |         help="the first desynchronised timestamp",
 39 |     )
 40 |     parser.add_argument(
 41 |         "--to-start",
 42 |         "--t1",
 43 |         type=lambda arg: srt_timestamp_to_milliseconds(parser, arg),
 44 |         required=True,
 45 |         help="the first synchronised timestamp",
 46 |     )
 47 |     parser.add_argument(
 48 |         "--from-end",
 49 |         "--f2",
 50 |         type=lambda arg: srt_timestamp_to_milliseconds(parser, arg),
 51 |         required=True,
 52 |         help="the second desynchronised timestamp",
 53 |     )
 54 |     parser.add_argument(
 55 |         "--to-end",
 56 |         "--t2",
 57 |         type=lambda arg: srt_timestamp_to_milliseconds(parser, arg),
 58 |         required=True,
 59 |         help="the second synchronised timestamp",
 60 |     )
 61 |     return parser.parse_args()
 62 | 
 63 | 
 64 | def calc_correction(to_start, to_end, from_start, from_end):
 65 |     angular = (to_end - to_start) / (from_end - from_start)
 66 |     linear = to_end - angular * from_end
 67 |     return angular, linear
 68 | 
 69 | 
 70 | def correct_time(current_msecs, angular, linear):
 71 |     return round(current_msecs * angular + linear)
 72 | 
 73 | 
 74 | def correct_timedelta(bad_delta, angular, linear):
 75 |     bad_msecs = timedelta_to_milliseconds(bad_delta)
 76 |     good_msecs = correct_time(bad_msecs, angular, linear)
 77 |     good_delta = datetime.timedelta(milliseconds=good_msecs)
 78 |     return good_delta
 79 | 
 80 | 
 81 | def linear_correct_subs(subtitles, angular, linear):
 82 |     for subtitle in subtitles:
 83 |         subtitle.start = correct_timedelta(subtitle.start, angular, linear)
 84 |         subtitle.end = correct_timedelta(subtitle.end, angular, linear)
 85 |         yield subtitle
 86 | 
 87 | 
 88 | def main():
 89 |     args = parse_args()
 90 |     logging.basicConfig(level=args.log_level)
 91 |     angular, linear = calc_correction(
 92 |         args.to_start, args.to_end, args.from_start, args.from_end
 93 |     )
 94 |     srt_tools.utils.set_basic_args(args)
 95 |     corrected_subs = linear_correct_subs(args.input, angular, linear)
 96 |     output = srt_tools.utils.compose_suggest_on_fail(corrected_subs, strict=args.strict)
 97 | 
 98 |     try:
 99 |         args.output.write(output)
100 |     except (UnicodeEncodeError, TypeError):  # Python 2 fallback
101 |         args.output.write(output.encode(args.encoding))
102 | 
103 | 
104 | if __name__ == "__main__":  # pragma: no cover
105 |     main()
106 | 


--------------------------------------------------------------------------------
/srt_tools/srt-mux:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """Merge multiple subtitles together into one."""
  4 | 
  5 | import datetime
  6 | import srt_tools.utils
  7 | import logging
  8 | import operator
  9 | 
 10 | log = logging.getLogger(__name__)
 11 | 
 12 | TOP = r"{\an8}"
 13 | BOTTOM = r"{\an2}"
 14 | 
 15 | 
 16 | def parse_args():
 17 |     examples = {
 18 |         "Merge English and Chinese subtitles": "srt mux -i eng.srt -i chs.srt -o both.srt",
 19 |         "Merge subtitles, with one on top and one at the bottom": "srt mux -t -i eng.srt -i chs.srt -o both.srt",
 20 |     }
 21 |     parser = srt_tools.utils.basic_parser(
 22 |         description=__doc__, examples=examples, multi_input=True
 23 |     )
 24 |     parser.add_argument(
 25 |         "--ms",
 26 |         metavar="MILLISECONDS",
 27 |         default=datetime.timedelta(milliseconds=600),
 28 |         type=lambda ms: datetime.timedelta(milliseconds=int(ms)),
 29 |         help="if subs being muxed are within this number of milliseconds "
 30 |         "of each other, they will have their times matched (default: 600)",
 31 |     )
 32 |     parser.add_argument(
 33 |         "-w",
 34 |         "--width",
 35 |         default=5,
 36 |         type=int,
 37 |         help="how many subs to consider for time matching at once (default: %(default)s)",
 38 |     )
 39 |     parser.add_argument(
 40 |         "-t",
 41 |         "--top-and-bottom",
 42 |         action="store_true",
 43 |         help="use SSA-style tags to place files at the top and bottom, respectively. Turns off time matching",
 44 |     )
 45 |     parser.add_argument(
 46 |         "--no-time-matching",
 47 |         action="store_true",
 48 |         help="don't try to do time matching for close subtitles (see --ms)",
 49 |     )
 50 |     return parser.parse_args()
 51 | 
 52 | 
 53 | def merge_subs(subs, acceptable_diff, attr, width):
 54 |     """
 55 |     Merge subs with similar start/end times together. This prevents the
 56 |     subtitles jumping around the screen.
 57 | 
 58 |     The merge is done in-place.
 59 |     """
 60 |     sorted_subs = sorted(subs, key=operator.attrgetter(attr))
 61 | 
 62 |     for subs in srt_tools.utils.sliding_window(sorted_subs, width=width):
 63 |         current_sub = subs[0]
 64 |         future_subs = subs[1:]
 65 |         current_comp = getattr(current_sub, attr)
 66 | 
 67 |         for future_sub in future_subs:
 68 |             future_comp = getattr(future_sub, attr)
 69 |             if current_comp + acceptable_diff > future_comp:
 70 |                 log.debug(
 71 |                     "Merging %d's %s time into %d",
 72 |                     future_sub.index,
 73 |                     attr,
 74 |                     current_sub.index,
 75 |                 )
 76 |                 setattr(future_sub, attr, current_comp)
 77 |             else:
 78 |                 # Since these are sorted, and this one didn't match, we can be
 79 |                 # sure future ones won't match either.
 80 |                 break
 81 | 
 82 | 
 83 | def main():
 84 |     args = parse_args()
 85 |     logging.basicConfig(level=args.log_level)
 86 | 
 87 |     srt_tools.utils.set_basic_args(args)
 88 | 
 89 |     muxed_subs = []
 90 |     for idx, subs in enumerate(args.input):
 91 |         for sub in subs:
 92 |             if args.top_and_bottom:
 93 |                 if idx % 2 == 0:
 94 |                     sub.content = TOP + sub.content
 95 |                 else:
 96 |                     sub.content = BOTTOM + sub.content
 97 |             muxed_subs.append(sub)
 98 | 
 99 |     if args.no_time_matching or not args.top_and_bottom:
100 |         merge_subs(muxed_subs, args.ms, "start", args.width)
101 |         merge_subs(muxed_subs, args.ms, "end", args.width)
102 | 
103 |     output = srt_tools.utils.compose_suggest_on_fail(muxed_subs, strict=args.strict)
104 | 
105 |     try:
106 |         args.output.write(output)
107 |     except (UnicodeEncodeError, TypeError):  # Python 2 fallback
108 |         args.output.write(output.encode(args.encoding))
109 | 
110 | 
111 | if __name__ == "__main__":  # pragma: no cover
112 |     main()
113 | 


--------------------------------------------------------------------------------
/srt_tools/tests/test_srt_tools.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os
  4 | import subprocess
  5 | import sys
  6 | import tempfile
  7 | 
  8 | try:
  9 |     from shlex import quote
 10 | except ImportError:  # <3.3 fallback
 11 |     from pipes import quote
 12 | 
 13 | 
 14 | sample_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "files")
 15 | 
 16 | 
 17 | if os.name == "nt":
 18 |     # Sigh, shlex.quote quotes incorrectly on Windows
 19 |     quote = lambda x: windows_crappy_quote(x)
 20 | 
 21 | 
 22 | def run_srt_util(cmd, shell=False, encoding="utf-8-sig"):
 23 |     extra_env = {}
 24 | 
 25 |     env = {"PYTHONPATH": ".", "SystemRoot": r"C:\Windows"}
 26 |     env.update(extra_env)
 27 | 
 28 |     raw_out = subprocess.check_output(cmd, shell=shell, env=env)
 29 |     return raw_out.decode(encoding)
 30 | 
 31 | 
 32 | def windows_crappy_quote(data):
 33 |     """
 34 |     I'm 100% sure this isn't secure, please don't use it with untrusted code. I
 35 |     beg you.
 36 |     """
 37 |     data = data.replace('"', '""')
 38 |     return '"' + data + '"'
 39 | 
 40 | 
 41 | def assert_supports_all_io_methods(cmd, exclude_output=False, exclude_stdin=False):
 42 |     # TODO: pytype doesn't like the mixed types in the matrix, but this works
 43 |     # fine. Maybe it would be happier with a namedtuple?
 44 |     cmd[0] = "srt_tools/" + cmd[0]  # pytype: disable=unsupported-operands
 45 |     cmd.insert(0, sys.executable)  # pytype: disable=attribute-error
 46 |     in_file = os.path.join(sample_dir, "ascii.srt")
 47 |     in_file_gb = os.path.join(sample_dir, "gb2312.srt")
 48 |     fd, out_file = tempfile.mkstemp()
 49 | 
 50 |     # This is accessed by filename, not fd
 51 |     os.close(fd)
 52 | 
 53 |     outputs = []
 54 |     cmd_string = " ".join(quote(x) for x in cmd)
 55 | 
 56 |     try:
 57 |         outputs.append(run_srt_util(cmd + ["-i", in_file]))
 58 |         if not exclude_stdin:
 59 |             outputs.append(
 60 |                 run_srt_util("%s < %s" % (cmd_string, quote(in_file)), shell=True)
 61 |             )
 62 |         if not exclude_output:
 63 |             run_srt_util(cmd + ["-i", in_file, "-o", out_file])
 64 |             run_srt_util(
 65 |                 cmd + ["-i", in_file_gb, "-o", out_file, "-e", "gb2312"],
 66 |                 encoding="gb2312",
 67 |             )
 68 |             if not exclude_stdin:
 69 |                 run_srt_util(
 70 |                     "%s < %s > %s" % (cmd_string, quote(in_file), quote(out_file)),
 71 |                     shell=True,
 72 |                 )
 73 |                 run_srt_util(
 74 |                     "%s < %s > %s"
 75 |                     % (cmd_string + " -e gb2312", quote(in_file), quote(out_file)),
 76 |                     shell=True,
 77 |                     encoding="gb2312",
 78 |                 )
 79 |         assert len(set(outputs)) == 1, repr(outputs)
 80 | 
 81 |         if os.name == "nt":
 82 |             assert "\r\n" in outputs[0]
 83 |         else:
 84 |             assert "\r\n" not in outputs[0]
 85 |     finally:
 86 |         os.remove(out_file)
 87 | 
 88 | 
 89 | def test_tools_support():
 90 |     matrix = [
 91 |         (["srt-normalise"], False),
 92 |         (["srt-deduplicate"], False),
 93 |         (["srt-fixed-timeshift", "--seconds", "5"], False),
 94 |         (
 95 |             [
 96 |                 "srt-linear-timeshift",
 97 |                 "--f1",
 98 |                 "00:00:01,000",
 99 |                 "--f2",
100 |                 "00:00:02,000",
101 |                 "--t1",
102 |                 "00:00:03,000",
103 |                 "--t2",
104 |                 "00:00:04,000",
105 |             ],
106 |             False,
107 |         ),
108 |         (["srt-lines-matching", "-f", "lambda x: True"], False),
109 |         (["srt-process", "-f", "lambda x: x"], False),
110 |         (["srt-mux"], False, True),
111 |         (["srt-mux", "-t"], False, True),
112 |         # Need to sort out time/thread issues
113 |         # (('srt-play'), True),
114 |     ]
115 | 
116 |     for args in matrix:
117 |         assert_supports_all_io_methods(*args)
118 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | |ghactions| |coveralls|
  2 | 
  3 | .. |ghactions| image:: https://img.shields.io/github/actions/workflow/status/cdown/srt/ci.yml?branch=develop
  4 |   :target: https://github.com/cdown/srt/actions?query=branch%3Adevelop
  5 |   :alt: Tests
  6 | 
  7 | .. |coveralls| image:: https://img.shields.io/coveralls/cdown/srt/develop.svg?label=test%20coverage
  8 |   :target: https://coveralls.io/github/cdown/srt?branch=develop
  9 |   :alt: Coverage
 10 | 
 11 | srt is a tiny but featureful Python library for parsing, modifying, and
 12 | composing `SRT files`_. Take a look at the quickstart_ for a basic overview of
 13 | the library. `Detailed API documentation`_ is also available.
 14 | 
 15 | Want to see some examples of its use? Take a look at the `tools shipped with
 16 | the library`_. This library is also used internally by projects like
 17 | `subsync`_, `NVIDIA RAD-TTS`_, `manim`_, `kinobot`_, `bw_plex`_, and many more.
 18 | 
 19 | .. _subsync: https://github.com/smacke/subsync
 20 | .. _`NVIDIA RAD-TTS`: https://github.com/NVIDIA/radtts
 21 | .. _bw_plex: https://github.com/Hellowlol/bw_plex
 22 | .. _manim: https://github.com/ManimCommunity/manim
 23 | .. _kinobot: https://github.com/vitiko98/kinobot
 24 | 
 25 | Why choose this library?
 26 | ------------------------
 27 | 
 28 | - Can parse many broken SRT files which other SRT libraries cannot, and fix them
 29 | - Extremely lightweight, ~200 lines of code excluding docstrings
 30 | - Simple, intuitive API
 31 | - High quality test suite using Hypothesis_
 32 | - `100% test coverage`_ (including branches)
 33 | - `Well documented API`_, at both a high and low level
 34 | - `~30% faster than pysrt on typical workloads`_
 35 | - Full support for `PyPy`_
 36 | - No dependencies outside of the standard library
 37 | - Tolerant of many common errors found in real-world SRT files
 38 | - Support for Asian-style SRT formats (i.e. "fullwidth" SRT format)
 39 | - Completely Unicode compliant
 40 | - Released under a highly permissive license (MIT)
 41 | - Real world tested — used in production to process thousands of SRT files
 42 |   every day
 43 | - Portable — runs on Linux, OSX, and Windows
 44 | - Tools included — contains lightweight tools to perform generic tasks with the
 45 |   library
 46 | 
 47 | .. _quickstart: http://srt.readthedocs.org/en/latest/quickstart.html
 48 | .. _`SRT files`: https://en.wikipedia.org/wiki/SubRip#SubRip_text_file_format
 49 | .. _Hypothesis: https://github.com/DRMacIver/hypothesis
 50 | .. _`100% test coverage`: https://coveralls.io/github/cdown/srt?branch=develop
 51 | .. _`Well documented API`: http://srt.readthedocs.org/en/latest/index.html
 52 | .. _PyPy: http://pypy.org/
 53 | .. _`~30% faster than pysrt on typical workloads`: https://paste.pound-python.org/raw/8nQKbDW0ROWvS7bOeAb3/
 54 | 
 55 | Usage
 56 | -----
 57 | 
 58 | Tools
 59 | =====
 60 | 
 61 | There are a number of `tools shipped with the library`_ to manipulate, process,
 62 | and fix SRT files. Here's an example using `hanzidentifier`_ to strip out
 63 | non-Chinese lines:
 64 | 
 65 | .. code::
 66 | 
 67 |     $ cat pe.srt
 68 |     1
 69 |     00:00:33,843 --> 00:00:38,097
 70 |     Only 3% of the water on our planet is fresh.
 71 |     地球上只有3%的水是淡水
 72 | 
 73 |     2
 74 |     00:00:40,641 --> 00:00:44,687
 75 |     Yet, these precious waters are rich with surprise.
 76 |     可是这些珍贵的淡水中却充满了惊奇
 77 | 
 78 |     $ srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese -i pe.srt
 79 |     1
 80 |     00:00:33,843 --> 00:00:38,097
 81 |     地球上只有3%的水是淡水
 82 | 
 83 |     2
 84 |     00:00:40,641 --> 00:00:44,687
 85 |     可是这些珍贵的淡水中却充满了惊奇
 86 | 
 87 | 
 88 | These tools are easy to chain together, for example, say you have one subtitle
 89 | with Chinese and English, and another with French, but you want Chinese and
 90 | French only. Oh, and the Chinese one is 5 seconds later than it should be.
 91 | That's easy enough to sort out:
 92 | 
 93 | .. code::
 94 | 
 95 |    $ srt lines-matching -m hanzidentifier -f hanzidentifier.has_chinese -i chs+eng.srt |
 96 |    >     srt fixed-timeshift --seconds -5 |
 97 |    >     srt mux --input - --input fra.srt
 98 | 
 99 | See the srt_tools/ directory for more information.
100 | 
101 | .. _hanzidentifier: https://github.com/tsroten/hanzidentifier
102 | 
103 | Library
104 | =======
105 | 
106 | `Detailed API documentation`_ is available, but here are the basics.
107 | 
108 | Here's how you convert SRT input to Subtitle objects which you can manipulate:
109 | 
110 | .. code:: python
111 | 
112 |     >>> data = '''\
113 |     1
114 |     00:00:33,843 --> 00:00:38,097
115 |     地球上只有3%的水是淡水
116 | 
117 |     2
118 |     00:00:40,641 --> 00:00:44,687
119 |     可是这些珍贵的淡水中却充满了惊奇
120 | 
121 |     3
122 |     00:00:57,908 --> 00:01:03,414
123 |     所有陆地生命归根结底都依赖於淡水
124 | 
125 |     '''
126 |     >>> for sub in srt.parse(data):
127 |     ...     print(sub)
128 |     Subtitle(index=1, start=datetime.timedelta(seconds=33, microseconds=843000), end=datetime.timedelta(seconds=38, microseconds=97000), content='地球上只有3%的水是淡水', proprietary='')
129 |     Subtitle(index=2, start=datetime.timedelta(seconds=40, microseconds=641000), end=datetime.timedelta(seconds=44, microseconds=687000), content='可是这些珍贵的淡水中却充满了惊奇', proprietary='')
130 |     Subtitle(index=3, start=datetime.timedelta(seconds=57, microseconds=908000), end=datetime.timedelta(seconds=63, microseconds=414000), content='所有陆地生命归根结底都依赖於淡水', proprietary='')
131 | 
132 | And here's how you go back from Subtitle objects to SRT output:
133 | 
134 | .. code:: python
135 | 
136 |     >>> subs = list(srt.parse(data))
137 |     >>> subs[1].content = "Changing subtitle data is easy!"
138 |     >>> print(srt.compose(subs))
139 |     1
140 |     00:00:33,843 --> 00:00:38,097
141 |     地球上只有3%的水是淡水
142 | 
143 |     2
144 |     00:00:40,641 --> 00:00:44,687
145 |     Changing subtitle data is easy!
146 | 
147 |     3
148 |     00:00:57,908 --> 00:01:03,414
149 |     所有陆地生命归根结底都依赖於淡水
150 | 
151 | Installation
152 | ------------
153 | 
154 | To install the latest stable version from PyPI:
155 | 
156 | .. code::
157 | 
158 |     pip install -U srt
159 | 
160 | To install the latest development version directly from GitHub:
161 | 
162 | .. code::
163 | 
164 |     pip install -U git+https://github.com/cdown/srt.git@develop
165 | 
166 | Testing
167 | -------
168 | 
169 | .. code::
170 | 
171 |    tox
172 | 
173 | .. _Tox: https://tox.readthedocs.org
174 | .. _`Detailed API documentation`: http://srt.readthedocs.org/en/latest/api.html
175 | .. _`tools shipped with the library`: https://github.com/cdown/srt/tree/develop/srt_tools
176 | 


--------------------------------------------------------------------------------
/srt_tools/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import argparse
  4 | import codecs
  5 | import srt
  6 | import logging
  7 | import sys
  8 | import itertools
  9 | import os
 10 | 
 11 | try:
 12 |     from collections.abc import MutableSequence
 13 | except ImportError:
 14 |     from collections import MutableSequence
 15 | 
 16 | PROG_NAME = os.path.basename(sys.argv[0]).replace("-", " ", 1)
 17 | 
 18 | STDIN_BYTESTREAM = getattr(sys.stdin, "buffer", sys.stdin)
 19 | STDOUT_BYTESTREAM = getattr(sys.stdout, "buffer", sys.stdout)
 20 | 
 21 | DASH_STREAM_MAP = {"input": STDIN_BYTESTREAM, "output": STDOUT_BYTESTREAM}
 22 | 
 23 | try:  # Python 2
 24 |     range = xrange  # pytype: disable=name-error
 25 | except NameError:
 26 |     pass
 27 | 
 28 | log = logging.getLogger(__name__)
 29 | 
 30 | 
 31 | def noop(stream):
 32 |     """
 33 |     Used when we didn't explicitly specify a stream to avoid using
 34 |     codecs.get{reader,writer}
 35 |     """
 36 |     return stream
 37 | 
 38 | 
 39 | def dash_to_stream(arg, arg_type):
 40 |     if arg == "-":
 41 |         return DASH_STREAM_MAP[arg_type]
 42 |     return arg
 43 | 
 44 | 
 45 | def basic_parser(
 46 |     description=None,
 47 |     multi_input=False,
 48 |     no_output=False,
 49 |     examples=None,
 50 |     hide_no_strict=False,
 51 | ):
 52 |     example_lines = []
 53 | 
 54 |     if examples is not None:
 55 |         example_lines.append("examples:")
 56 | 
 57 |         for desc, code in examples.items():
 58 |             example_lines.append("  {}".format(desc))
 59 |             example_lines.append("    $ {}\n".format(code))
 60 | 
 61 |     parser = argparse.ArgumentParser(
 62 |         prog=PROG_NAME,
 63 |         description=description,
 64 |         epilog="\n".join(example_lines),
 65 |         formatter_class=argparse.RawDescriptionHelpFormatter,
 66 |     )
 67 | 
 68 |     # Cannot use argparse.FileType as we need to know the encoding from the
 69 |     # args
 70 | 
 71 |     if multi_input:
 72 |         parser.add_argument(
 73 |             "--input",
 74 |             "-i",
 75 |             metavar="FILE",
 76 |             action="append",
 77 |             type=lambda arg: dash_to_stream(arg, "input"),
 78 |             help="the files to process",
 79 |             required=True,
 80 |         )
 81 |     else:
 82 |         parser.add_argument(
 83 |             "--input",
 84 |             "-i",
 85 |             metavar="FILE",
 86 |             default=STDIN_BYTESTREAM,
 87 |             type=lambda arg: dash_to_stream(arg, "input"),
 88 |             help="the file to process (default: stdin)",
 89 |         )
 90 | 
 91 |     if not no_output:
 92 |         parser.add_argument(
 93 |             "--output",
 94 |             "-o",
 95 |             metavar="FILE",
 96 |             default=STDOUT_BYTESTREAM,
 97 |             type=lambda arg: dash_to_stream(arg, "output"),
 98 |             help="the file to write to (default: stdout)",
 99 |         )
100 |         if not multi_input:
101 |             parser.add_argument(
102 |                 "--inplace",
103 |                 "-p",
104 |                 action="store_true",
105 |                 help="modify file in place",
106 |             )
107 | 
108 |     shelp = "allow blank lines in output, your media player may explode"
109 |     if hide_no_strict:
110 |         shelp = argparse.SUPPRESS
111 | 
112 |     parser.add_argument("--no-strict", action="store_false", dest="strict", help=shelp)
113 |     parser.add_argument(
114 |         "--debug",
115 |         action="store_const",
116 |         dest="log_level",
117 |         const=logging.DEBUG,
118 |         default=logging.INFO,
119 |         help="enable debug logging",
120 |     )
121 | 
122 |     parser.add_argument(
123 |         "--ignore-parsing-errors",
124 |         "-c",
125 |         action="store_true",
126 |         help="try to keep going, even if there are parsing errors",
127 |     )
128 | 
129 |     parser.add_argument(
130 |         "--encoding", "-e", help="the encoding to read/write files in (default: utf8)"
131 |     )
132 |     return parser
133 | 
134 | 
def set_basic_args(args):
    """
    Turn the raw ``input``/``output`` values on `args` into usable objects,
    modifying `args` in place.

    After this call ``args.input`` holds the result of ``srt.parse`` on the
    decoded input (when it was a list, each element is replaced by such a
    result instead), and ``args.output``, if present, is a ``codecs`` stream
    writer wrapping either the stdout byte stream or the named file opened in
    binary write mode.

    :param args: The namespace returned by a parser from ``basic_parser``
    :raises ValueError: If ``inplace`` is set while the input is stdin, or
                        while the output is anything other than stdout
    """
    # TODO: dedupe some of this
    if getattr(args, "inplace", None):
        if args.input == DASH_STREAM_MAP["input"]:
            raise ValueError("Cannot use --inplace on stdin")

        if args.output != DASH_STREAM_MAP["output"]:
            raise ValueError("Cannot use -o and -p together")

        # Write back to the same file name that we read from
        args.output = args.input

    # "input" is handled first, so its content has been read by the time the
    # output is opened for writing
    for stream_name in ("input", "output"):
        log.debug('Processing stream "%s"', stream_name)

        try:
            stream = getattr(args, stream_name)
        except AttributeError:
            # For example, in the case of no_output
            continue

        # We don't use system default encoding, because usually one runs this
        # on files they got from elsewhere. As such, be opinionated that these
        # files are probably UTF-8. Looking for the BOM on reading allows us to
        # be more liberal with what we accept, without adding BOMs on write.
        read_encoding = args.encoding or "utf-8-sig"
        write_encoding = args.encoding or "utf-8"

        r_enc = codecs.getreader(read_encoding)
        w_enc = codecs.getwriter(write_encoding)

        log.debug("Got %r as stream", stream)
        # We don't use encoding= option to open because we want to have the
        # same universal newlines behaviour as STD{IN,OUT}_BYTESTREAM
        if stream in DASH_STREAM_MAP.values():
            log.debug("%s in DASH_STREAM_MAP", stream_name)
            if stream is args.input:
                args.input = srt.parse(
                    r_enc(args.input).read(), ignore_errors=args.ignore_parsing_errors
                )
            elif stream is args.output:
                # Since args.output is not in text mode (since we didn't
                # earlier know the encoding), we have no universal newline
                # support and need to do it ourselves
                args.output = w_enc(args.output)
        else:
            log.debug("%s not in DASH_STREAM_MAP", stream_name)
            if stream is args.input:
                if isinstance(args.input, MutableSequence):
                    # Multiple inputs: replace each entry of the list with its
                    # parsed subtitles
                    for i, input_fn in enumerate(args.input):
                        if input_fn in DASH_STREAM_MAP.values():
                            # NOTE: this check always holds here, as we are
                            # already inside the `stream is args.input` branch
                            if stream is args.input:
                                args.input[i] = srt.parse(
                                    r_enc(input_fn).read(),
                                    ignore_errors=args.ignore_parsing_errors,
                                )
                        else:
                            f = r_enc(open(input_fn, "rb"))
                            with f:
                                args.input[i] = srt.parse(
                                    f.read(), ignore_errors=args.ignore_parsing_errors
                                )
                else:
                    # Single input given as a file name
                    f = r_enc(open(stream, "rb"))
                    with f:
                        args.input = srt.parse(
                            f.read(), ignore_errors=args.ignore_parsing_errors
                        )
            else:
                # Output given as a file name; the writer is left open for the
                # caller to write to
                args.output = w_enc(open(args.output, "wb"))
204 | 
205 | 
def compose_suggest_on_fail(subs, strict=True):
    """
    Compose `subs` into SRT output using the platform's line separator.

    If parsing fails while composing, a hint about passing a different
    encoding is logged and the original exception is re-raised.

    :param subs: The subtitles to compose
    :param bool strict: Passed through to ``srt.compose``
    :returns: The composed SRT output
    :raises srt.SRTParseError: If parsing the subtitles fails
    """
    try:
        return srt.compose(subs, strict=strict, eol=os.linesep, in_place=True)
    except srt.SRTParseError:
        # `subs` is actually a generator, so parse errors in the input only
        # surface here, while it is being consumed.
        log.critical(
            "Parsing failed, maybe you need to pass a different encoding "
            "with --encoding?"
        )
        raise
216 | 
217 | 
def sliding_window(seq, width=2, inclusive=True):
    """
    Yield successive overlapping tuples of up to `width` items from `seq`.

    If inclusive is True, we also include final elements where len(sliced) <
    width, i.e. the progressively shorter windows at the end of the sequence.
    If it is False, only windows of exactly `width` items are produced.

    An empty window is never yielded, so an empty `seq` produces nothing.
    """
    seq_iter = iter(seq)

    # Consume seq_iter up to width
    sliced = tuple(itertools.islice(seq_iter, width))

    # Nothing to window over. Yielding an empty tuple here would break callers
    # that index into each window.
    if not sliced:
        return

    if not inclusive and len(sliced) != width:
        return

    yield sliced

    for elem in seq_iter:
        sliced = sliced[1:] + (elem,)
        yield sliced

    if inclusive:
        # Emit the shrinking tail windows, each dropping one more leading item
        for idx in range(1, len(sliced)):
            yield sliced[idx:]
241 | 


--------------------------------------------------------------------------------
/srt.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf8
  3 | 
  4 | """A tiny library for parsing, modifying, and composing SRT files."""
  5 | 
  6 | from __future__ import unicode_literals
  7 | import functools
  8 | import re
  9 | from datetime import timedelta
 10 | import logging
 11 | import io
 12 | 
 13 | 
 14 | LOG = logging.getLogger(__name__)
 15 | 
 16 | # "." is not technically valid as a delimiter, but many editors create SRT
 17 | # files with this delimiter for whatever reason. Many editors and players
 18 | # accept it, so we do too.
 19 | RGX_TIMESTAMP_MAGNITUDE_DELIM = r"[,.:，．。：]"
 20 | RGX_TIMESTAMP_FIELD = r"[0-9]+"
# Trailing timestamp field (the one srt_timestamp_to_timedelta treats as
# milliseconds). "*" rather than "+" so that it may be empty.
RGX_TIMESTAMP_FIELD_OPTIONAL = r"[0-9]*"
# Timestamp pattern without capture groups, for embedding in SRT_REGEX:
# three RGX_TIMESTAMP_FIELDs joined by RGX_TIMESTAMP_MAGNITUDE_DELIM, then
# the delimiter again followed by "?" (presumably making that last delimiter
# optional), then the optional trailing field.
RGX_TIMESTAMP = "".join(
    [
        RGX_TIMESTAMP_MAGNITUDE_DELIM.join([RGX_TIMESTAMP_FIELD] * 3),
        RGX_TIMESTAMP_MAGNITUDE_DELIM,
        "?",
        RGX_TIMESTAMP_FIELD_OPTIONAL,
    ]
)
# Same shape as RGX_TIMESTAMP, but anchored with ^...$ and with each of the
# four fields wrapped in its own capture group so that the individual values
# can be extracted.
RGX_TIMESTAMP_PARSEABLE = r"^{}$".format(
    "".join(
        [
            RGX_TIMESTAMP_MAGNITUDE_DELIM.join(["(" + RGX_TIMESTAMP_FIELD + ")"] * 3),
            RGX_TIMESTAMP_MAGNITUDE_DELIM,
            "?",
            "(",
            RGX_TIMESTAMP_FIELD_OPTIONAL,
            ")",
        ]
    )
)
# Subtitle index: optional minus sign, digits, and an optional decimal part
# (parse() truncates indexes such as "123.4" to their integer part).
RGX_INDEX = r"-?[0-9]+\.?[0-9]*"
# Anything following the timestamps on the same line.
RGX_PROPRIETARY = r"[^\r\n]*"
# Lazy match of anything; SRT_REGEX is compiled with re.DOTALL, so this also
# spans newlines and is bounded by the lookaheads that follow it.
RGX_CONTENT = r".*?"
# A line ending, with or without a preceding carriage return.
RGX_POSSIBLE_CRLF = r"\r?\n"

# Compiled form of the capturing timestamp pattern, used by
# srt_timestamp_to_timedelta.
TS_REGEX = re.compile(RGX_TIMESTAMP_PARSEABLE)
# Two or more consecutive newlines, i.e. one or more blank lines; used by
# make_legal_content to collapse them.
MULTI_WS_REGEX = re.compile(r"\n\n+")
# Matches one SRT block. Capture groups, in order: index (optional), start
# timestamp, end timestamp, proprietary data, content.
SRT_REGEX = re.compile(
    # Optional index line, then "start --> end" (the arrow may contain a
    # space in place of its second dash and may lack surrounding spaces),
    # then optional proprietary data, a line ending, and the content.
    r"\s*(?:({idx})\s*{eof})?({ts}) *-[ -] *> *({ts}) ?({proprietary})(?:{eof}|\Z)({content})"
    # Many sub editors don't add a blank line to the end, and many editors and
    # players accept that. We allow it to be missing in input.
    #
    # We also allow subs that are missing a double blank newline. This often
    # happens on subs which were first created as a mixed language subtitle,
    # for example chs/eng, and then were stripped using naive methods (such as
    # ed/sed) that don't understand newline preservation rules in SRT files.
    #
    # This means that when you are, say, only keeping chs, and the line only
    # contains english, you end up with not only no content, but also all of
    # the content lines are stripped instead of retaining a newline.
    r"(?:{eof}|\Z)(?:{eof}|\Z|(?=(?:{idx}\s*{eof}{ts})))"
    # Some SRT blocks, while this is technically invalid, have blank lines
    # inside the subtitle content. We look ahead a little to check that the
    # next lines look like an index and a timestamp as a best-effort
    # solution to work around these.
    r"(?=(?:(?:{idx}\s*{eof})?{ts}|\Z))".format(
        idx=RGX_INDEX,
        ts=RGX_TIMESTAMP,
        proprietary=RGX_PROPRIETARY,
        content=RGX_CONTENT,
        eof=RGX_POSSIBLE_CRLF,
    ),
    # DOTALL lets the content group span multiple lines.
    re.DOTALL,
)
 76 | 
# Zero-length duration; start times below this are considered invalid by
# SUBTITLE_SKIP_CONDITIONS.
ZERO_TIMEDELTA = timedelta(0)

# Pairs of (info message, predicate taking a Subtitle). A subtitle is skipped
# as soon as one predicate returns a truthy value; the conditions are checked
# in the order listed, so the first matching message is the one reported.
SUBTITLE_SKIP_CONDITIONS = (
    ("No content", lambda sub: not sub.content.strip()),
    ("Start time < 0 seconds", lambda sub: sub.start < ZERO_TIMEDELTA),
    ("Subtitle start time >= end time", lambda sub: sub.start >= sub.end),
)

# Unit conversion factors used when formatting timestamps.
SECONDS_IN_HOUR = 3600
SECONDS_IN_MINUTE = 60
HOURS_IN_DAY = 24
MICROSECONDS_IN_MILLISECOND = 1000

# Types that parse() treats as file-like objects and reads transparently.
try:
    FILE_TYPES = (file, io.IOBase)  # pytype: disable=name-error
except NameError:  # `file` doesn't exist in Python 3
    FILE_TYPES = (io.IOBase,)
 95 | 
 96 | 
 97 | @functools.total_ordering
 98 | class Subtitle(object):
 99 |     r"""
100 |     The metadata relating to a single subtitle. Subtitles are sorted by start
101 |     time by default. If no index was provided, index 0 will be used on writing
102 |     an SRT block.
103 | 
104 |     :param index: The SRT index for this subtitle
105 |     :type index: int or None
106 |     :param start: The time that the subtitle should start being shown
107 |     :type start: :py:class:`datetime.timedelta`
108 |     :param end: The time that the subtitle should stop being shown
109 |     :type end: :py:class:`datetime.timedelta`
110 |     :param str proprietary: Proprietary metadata for this subtitle
111 |     :param str content: The subtitle content. Should not contain OS-specific
112 |                         line separators, only \\n. This is taken care of
113 |                         already if you use :py:func:`srt.parse` to generate
114 |                         Subtitle objects.
115 |     """
116 | 
117 |     # pylint: disable=R0913
118 |     def __init__(self, index, start, end, content, proprietary=""):
119 |         self.index = index
120 |         self.start = start
121 |         self.end = end
122 |         self.content = content
123 |         self.proprietary = proprietary
124 | 
125 |     def __hash__(self):
126 |         return hash(frozenset(vars(self).items()))
127 | 
128 |     def __eq__(self, other):
129 |         return isinstance(other, Subtitle) and vars(self) == vars(other)
130 | 
131 |     def __lt__(self, other):
132 |         return (self.start, self.end, self.index) < (
133 |             other.start,
134 |             other.end,
135 |             other.index,
136 |         )
137 | 
138 |     def __repr__(self):
139 |         # Python 2/3 cross compatibility
140 |         var_items = getattr(vars(self), "iteritems", getattr(vars(self), "items"))
141 |         item_list = ", ".join("%s=%r" % (k, v) for k, v in var_items())
142 |         return "%s(%s)" % (type(self).__name__, item_list)
143 | 
144 |     def to_srt(self, strict=True, eol="\n"):
145 |         r"""
146 |         Convert the current :py:class:`Subtitle` to an SRT block.
147 | 
148 |         :param bool strict: If disabled, will allow blank lines in the content
149 |                             of the SRT block, which is a violation of the SRT
150 |                             standard and may cause your media player to explode
151 |         :param str eol: The end of line string to use (default "\\n")
152 |         :returns: The metadata of the current :py:class:`Subtitle` object as an
153 |                   SRT formatted subtitle block
154 |         :rtype: str
155 |         """
156 |         output_content = self.content
157 |         output_proprietary = self.proprietary
158 | 
159 |         if output_proprietary:
160 |             # output_proprietary is output directly next to the timestamp, so
161 |             # we need to add the space as a field delimiter.
162 |             output_proprietary = " " + output_proprietary
163 | 
164 |         if strict:
165 |             output_content = make_legal_content(output_content)
166 | 
167 |         if eol is None:
168 |             eol = "\n"
169 |         elif eol != "\n":
170 |             output_content = output_content.replace("\n", eol)
171 | 
172 |         template = "{idx}{eol}{start} --> {end}{prop}{eol}{content}{eol}{eol}"
173 |         return template.format(
174 |             idx=self.index or 0,
175 |             start=timedelta_to_srt_timestamp(self.start),
176 |             end=timedelta_to_srt_timestamp(self.end),
177 |             prop=output_proprietary,
178 |             content=output_content,
179 |             eol=eol,
180 |         )
181 | 
182 | 
def make_legal_content(content):
    r"""
    Remove illegal content from a content block. Illegal content includes:

    * Blank lines
    * Starting or ending with a blank line

    .. doctest::

        >>> make_legal_content('\nfoo\n\nbar\n')
        'foo\nbar'

    :param str content: The content to make legal
    :returns: The legalised content
    :rtype: str
    """
    # Optimisation: Usually the content we get is legally valid. Do a quick
    # check to see if we really need to do anything here. This saves time from
    # generating legal_content by about 50%.
    #
    # The trailing newline has to be checked too: content such as "foo\n"
    # contains no "\n\n" and does not start with a newline, but it would
    # still produce a blank line at the end of the block.
    if (
        content
        and content[0] != "\n"
        and content[-1] != "\n"
        and "\n\n" not in content
    ):
        return content

    # Drop leading/trailing newlines, then collapse runs of blank lines into
    # single line breaks.
    legal_content = MULTI_WS_REGEX.sub("\n", content.strip("\n"))
    LOG.info("Legalised content %r to %r", content, legal_content)
    return legal_content
208 | 
209 | 
def timedelta_to_srt_timestamp(timedelta_timestamp):
    r"""
    Render a :py:class:`~datetime.timedelta` as an SRT timestamp string.

    .. doctest::

        >>> import datetime
        >>> delta = datetime.timedelta(hours=1, minutes=23, seconds=4)
        >>> timedelta_to_srt_timestamp(delta)
        '01:23:04,000'

    :param datetime.timedelta timedelta_timestamp: The duration to render as
                                                   an SRT timestamp
    :returns: The timestamp in SRT format
    :rtype: str
    """

    # Split the sub-day seconds into hours / minutes / seconds.
    whole_hours, leftover_secs = divmod(timedelta_timestamp.seconds, SECONDS_IN_HOUR)
    minutes, seconds = divmod(leftover_secs, SECONDS_IN_MINUTE)

    # SRT has no day field, so whole days are folded into the hour count.
    total_hours = whole_hours + timedelta_timestamp.days * HOURS_IN_DAY
    milliseconds = timedelta_timestamp.microseconds // MICROSECONDS_IN_MILLISECOND

    return "%02d:%02d:%02d,%03d" % (total_hours, minutes, seconds, milliseconds)
232 | 
233 | 
def srt_timestamp_to_timedelta(timestamp):
    r"""
    Parse an SRT timestamp into a :py:class:`~datetime.timedelta`.

    .. doctest::

        >>> srt_timestamp_to_timedelta('01:23:04,000')
        datetime.timedelta(seconds=4984)

    :param str timestamp: A timestamp in SRT format
    :returns: The timestamp as a :py:class:`~datetime.timedelta`
    :rtype: datetime.timedelta
    :raises TimestampParseError: If the timestamp is not parseable
    """

    parsed = TS_REGEX.match(timestamp)
    if parsed is None:
        raise TimestampParseError("Unparseable timestamp: {}".format(timestamp))

    # An empty (or missing) field counts as zero.
    field_values = [int(field or 0) for field in parsed.groups()]
    hrs, mins, secs, msecs = field_values
    return timedelta(hours=hrs, minutes=mins, seconds=secs, milliseconds=msecs)
254 | 
255 | 
def sort_and_reindex(subtitles, start_index=1, in_place=False, skip=True):
    """
    Sort subtitles and renumber their indexes to follow the sorted order, so
    that the resulting SRT file plays as expected after, for example, some
    subtitle times were changed and the subtitles need to be resorted.

    With skip=True, subtitles that are considered not useful are dropped and
    do not consume an index. A subtitle is currently "not useful" when:

    - Content is empty, or only whitespace
    - The start time is negative
    - The start time is equal to or later than the end time

    .. doctest::

        >>> from datetime import timedelta
        >>> one = timedelta(seconds=1)
        >>> two = timedelta(seconds=2)
        >>> three = timedelta(seconds=3)
        >>> subs = [
        ...     Subtitle(index=999, start=one, end=two, content='1'),
        ...     Subtitle(index=0, start=two, end=three, content='2'),
        ... ]
        >>> list(sort_and_reindex(subs))  # doctest: +ELLIPSIS
        [Subtitle(...index=1...), Subtitle(...index=2...)]

    :param subtitles: :py:class:`Subtitle` objects in any order
    :param int start_index: The index to start from
    :param bool in_place: Whether to modify subs in-place for performance
                          (version <=1.0.0 behaviour)
    :param bool skip: Whether to skip subtitles considered not useful (see
                      above for rules)
    :returns: The sorted subtitles
    :rtype: :term:`generator` of :py:class:`Subtitle` objects
    """
    dropped = 0
    ordered = sorted(subtitles)

    for position, subtitle in enumerate(ordered, start=start_index):
        if not in_place:
            # Work on a copy so the caller's objects keep their old index.
            subtitle = Subtitle(**vars(subtitle))

        if skip:
            try:
                _should_skip_sub(subtitle)
            except _ShouldSkipException as skip_reason:
                if subtitle.index is not None:
                    LOG.info(
                        "Skipped subtitle at index %d: %s", subtitle.index, skip_reason
                    )
                else:
                    LOG.info("Skipped subtitle with no index: %s", skip_reason)
                dropped += 1
                continue

        # Skipped subtitles leave no gap in the numbering.
        subtitle.index = position - dropped

        yield subtitle
314 | 
315 | 
def _should_skip_sub(subtitle):
    """
    Raise if any rule in SUBTITLE_SKIP_CONDITIONS says that the given
    subtitle should be skipped. Rules are evaluated in order and evaluation
    stops at the first one that matches.

    :param subtitle: A :py:class:`Subtitle` to check whether to skip
    :raises _ShouldSkipException: If the subtitle should be skipped
    """
    first_reason = next(
        (
            info_msg
            for info_msg, sub_skipper in SUBTITLE_SKIP_CONDITIONS
            if sub_skipper(subtitle)
        ),
        None,
    )
    if first_reason is not None:
        raise _ShouldSkipException(first_reason)
327 | 
328 | 
def parse(srt, ignore_errors=False):
    r'''
    Turn an SRT formatted string (in Python 2, a :class:`unicode` object), or
    a file-like object containing one, into a :term:`generator` of Subtitle
    objects.

    This function works around bugs present in many SRT files, most notably
    that it is designed to not bork when presented with a blank line as part of
    a subtitle's content.

    .. doctest::

        >>> subs = parse("""\
        ... 422
        ... 00:31:39,931 --> 00:31:41,931
        ... Using mainly spoons,
        ...
        ... 423
        ... 00:31:41,933 --> 00:31:43,435
        ... we dig a tunnel under the city and release it into the wild.
        ...
        ... """)
        >>> list(subs)  # doctest: +ELLIPSIS
        [Subtitle(...index=422...), Subtitle(...index=423...)]

    :param srt: Subtitles in SRT format
    :type srt: str or a file-like object
    :param ignore_errors: If True, garbled SRT data will be ignored, and we'll
                          continue trying to parse the rest of the file,
                          instead of raising :py:class:`SRTParseError` and
                          stopping execution.
    :returns: The subtitles contained in the SRT file as :py:class:`Subtitle`
              objects
    :rtype: :term:`generator` of :py:class:`Subtitle` objects
    :raises SRTParseError: If the matches are not contiguous and
                           ``ignore_errors`` is False.
    '''

    # finditer needs the whole input at once, so file-like objects are read
    # in full up front.
    if isinstance(srt, FILE_TYPES):
        srt = srt.read()

    # Offset at which the next block must begin for the input to be
    # contiguous.
    next_expected = 0

    for block in SRT_REGEX.finditer(srt):
        _check_contiguity(srt, next_expected, block.start(), ignore_errors)
        raw_index, raw_start, raw_end, proprietary, content = block.groups()

        # pytype sees that this is Optional[str] and complains that it can be
        # None, but the content group always participates in a match.
        content = content.replace("\r\n", "\n")  # pytype: disable=attribute-error

        if raw_index is None:
            # No index line in this block; to_srt deals with that when the
            # subtitle is rendered.
            index = None
        else:
            try:
                index = int(raw_index)
            except ValueError:
                # Index 123.4: keep only the integer part. Handled in the
                # exceptional path since it is a rare case.
                index = int(raw_index.split(".")[0])

        yield Subtitle(
            index=index,
            start=srt_timestamp_to_timedelta(raw_start),
            end=srt_timestamp_to_timedelta(raw_end),
            content=content,
            proprietary=proprietary,
        )

        next_expected = block.end()

    # Anything left over after the final block is unparseable data too.
    _check_contiguity(srt, next_expected, len(srt), ignore_errors)
407 | 
408 | 
def _check_contiguity(srt, expected_start, actual_start, warn_only):
    """
    Verify that a match begins exactly where the previous one ended. When it
    does not, either raise :py:class:`SRTParseError` with diagnostic info
    (``warn_only`` False) or log a warning (``warn_only`` True).

    :param str srt: The data being matched
    :param int expected_start: The expected next start, as from the last
                               iteration's match.end()
    :param int actual_start: The actual start, as from this iteration's
                             match.start()
    :param bool warn_only: Whether to log a warning instead of raising
    :raises SRTParseError: If the matches are not contiguous and ``warn_only``
                           is False
    """
    if expected_start == actual_start:
        return

    gap = srt[expected_start:actual_start]

    # #50: Leading whitespace (or a byte order mark) at the very start of the
    # input has nowhere to be captured like in an intermediate subtitle, so
    # it is tolerated silently.
    at_input_start = expected_start == 0
    if at_input_start and (gap.isspace() or gap == "\ufeff"):
        return

    if not warn_only:
        raise SRTParseError(expected_start, actual_start, gap)

    LOG.warning("Skipped unparseable SRT data: %r", gap)
437 | 
438 | 
def compose(
    subtitles, reindex=True, start_index=1, strict=True, eol=None, in_place=False
):
    r"""
    Render an iterator of :py:class:`Subtitle` objects as a single string of
    concatenated SRT blocks.

    .. doctest::

        >>> from datetime import timedelta
        >>> start = timedelta(seconds=1)
        >>> end = timedelta(seconds=2)
        >>> subs = [
        ...     Subtitle(index=1, start=start, end=end, content='x'),
        ...     Subtitle(index=2, start=start, end=end, content='y'),
        ... ]
        >>> compose(subs)  # doctest: +ELLIPSIS
        '1\n00:00:01,000 --> 00:00:02,000\nx\n\n2\n00:00:01,000 --> ...'

    :param subtitles: The subtitles to convert to SRT blocks
    :type subtitles: :term:`iterator` of :py:class:`Subtitle` objects
    :param bool reindex: Whether to reindex subtitles based on start time
    :param int start_index: If reindexing, the index to start reindexing from
    :param bool strict: Whether to enable strict mode, see
                        :py:func:`Subtitle.to_srt` for more information
    :param str eol: The end of line string to use (default "\\n")
    :param bool in_place: Whether to reindex subs in-place for performance
                          (version <=1.0.0 behaviour)
    :returns: A single SRT formatted string, with each input
              :py:class:`Subtitle` represented as an SRT block
    :rtype: str
    """
    if reindex:
        subtitles = sort_and_reindex(
            subtitles, start_index=start_index, in_place=in_place
        )

    srt_blocks = [sub.to_srt(strict=strict, eol=eol) for sub in subtitles]
    return "".join(srt_blocks)
477 | 
478 | 
class SRTParseError(Exception):
    """
    Raised when part of an SRT block could not be parsed. The constructor
    arguments are also exposed as attributes of the same names.

    :param int expected_start: The expected contiguous start index
    :param int actual_start: The actual non-contiguous start index
    :param str unmatched_content: The content between the expected start index
                                  and the actual start index
    """

    _MESSAGE_TEMPLATE = (
        "Expected contiguous start of match or end of input at char %d, "
        "but started at char %d (unmatched content: %r)"
    )

    def __init__(self, expected_start, actual_start, unmatched_content):
        details = (expected_start, actual_start, unmatched_content)
        super(SRTParseError, self).__init__(self._MESSAGE_TEMPLATE % details)

        self.expected_start, self.actual_start, self.unmatched_content = details
500 | 
501 | 
class TimestampParseError(ValueError):
    """
    Signals that a string could not be parsed as an SRT timestamp. Being a
    :py:class:`ValueError` subclass, it is also caught by handlers for that.
    """
506 | 
507 | 
class _ShouldSkipException(Exception):
    """
    Internal signal that a subtitle matched a skip rule; the exception
    message carries the reason.
    """
512 | 


--------------------------------------------------------------------------------
/tests/test_srt.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf8
  3 | 
  4 | from __future__ import unicode_literals
  5 | from datetime import timedelta
  6 | import collections
  7 | import functools
  8 | import os
  9 | import re
 10 | import string
 11 | from io import StringIO
 12 | 
 13 | import pytest
 14 | from hypothesis import given, settings, HealthCheck, assume, example
 15 | import hypothesis.strategies as st
 16 | 
 17 | import srt
 18 | 
# Register a Hypothesis settings profile under ``name`` with the "too slow"
# health check suppressed and no per-example deadline; extra keyword
# arguments are passed through to the profile.
REGISTER_SETTINGS = lambda name, **kwargs: settings.register_profile(
    name, suppress_health_check=[HealthCheck.too_slow], deadline=None, **kwargs
)

# "base" uses the remaining defaults; "release" runs 1000 examples per test.
REGISTER_SETTINGS("base")
REGISTER_SETTINGS("release", max_examples=1000)

# The profile is chosen through the HYPOTHESIS_PROFILE environment variable,
# falling back to "base".
settings.load_profile(os.getenv("HYPOTHESIS_PROFILE", "base"))

HOURS_IN_DAY = 24
# Default upper bound for the integer strategies that feed timedelta fields.
TIMEDELTA_MAX_DAYS = 999999999
# Subtitle factory with index and times pre-filled; callers supply the content.
CONTENTLESS_SUB = functools.partial(
    srt.Subtitle, index=1, start=timedelta(seconds=1), end=timedelta(seconds=2)
)
 33 | 
 34 | 
 35 | def is_strictly_legal_content(content):
 36 |     """
 37 |     Filter out things that would violate strict mode. Illegal content
 38 |     includes:
 39 | 
 40 |     - A content section that starts or ends with a newline
 41 |     - A content section that contains blank lines
 42 |     """
 43 | 
 44 |     if content.strip("\r\n") != content:
 45 |         return False
 46 |     elif not content.strip():
 47 |         return False
 48 |     elif "\n\n" in content:
 49 |         return False
 50 |     else:
 51 |         return True
 52 | 
 53 | 
 54 | def subs_eq(got, expected, any_order=False):
 55 |     """
 56 |     Compare Subtitle objects using vars() so that differences are easy to
 57 |     identify.
 58 |     """
 59 |     got_vars = [frozenset(vars(sub).items()) for sub in got]
 60 |     expected_vars = [frozenset(vars(sub).items()) for sub in expected]
 61 |     if any_order:
 62 |         assert collections.Counter(got_vars) == collections.Counter(expected_vars)
 63 |     else:
 64 |         assert got_vars == expected_vars
 65 | 
 66 | 
 67 | def timedeltas(min_value=0, max_value=TIMEDELTA_MAX_DAYS):
 68 |     """
 69 |     A Hypothesis strategy to generate timedeltas.
 70 | 
 71 |     Right now {min,max}_value are shoved into multiple fields in timedelta(),
 72 |     which is not very customisable, but it's good enough for our current test
 73 |     purposes. If you need more precise control, you may need to add more
 74 |     parameters to this function to be able to customise more freely.
 75 |     """
 76 |     time_unit_strategy = st.integers(min_value=min_value, max_value=max_value)
 77 |     timestamp_strategy = st.builds(
 78 |         timedelta,
 79 |         hours=time_unit_strategy,
 80 |         minutes=time_unit_strategy,
 81 |         seconds=time_unit_strategy,
 82 |     )
 83 |     return timestamp_strategy
 84 | 
 85 | 
 86 | def equivalent_timestamps(min_value=0, max_value=TIMEDELTA_MAX_DAYS):
 87 |     def string_timestamp(hours, minutes, seconds, msecs, paddings):
 88 |         hours, minutes, seconds, msecs = map(
 89 |             lambda v_and_p: "0" * v_and_p[1] + str(v_and_p[0]),
 90 |             zip((hours, minutes, seconds, msecs), paddings),
 91 |         )
 92 |         return "{}:{}:{},{}".format(hours, minutes, seconds, msecs)
 93 | 
 94 |     def ts_field_value():
 95 |         return st.integers(min_value=min_value, max_value=max_value)
 96 | 
 97 |     def zero_padding():
 98 |         return st.integers(min_value=0, max_value=2)
 99 | 
100 |     @st.composite
101 |     def maybe_off_by_one_fields(draw):
102 |         field = draw(ts_field_value())
103 |         field_maybe_plus_one = draw(st.integers(min_value=field, max_value=field + 1))
104 |         return field_maybe_plus_one, field
105 | 
106 |     def get_equiv_timestamps(h, m, s, ms2, ts1paddings, ts2paddings):
107 |         h2, h1 = h
108 |         m2, m1 = m
109 |         s2, s1 = s
110 |         ms1 = (
111 |             (h2 - h1) * 60 * 60 * 1000 + (m2 - m1) * 60 * 1000 + (s2 - s1) * 1000 + ms2
112 |         )
113 |         return (
114 |             string_timestamp(h2, m2, s2, ms2, ts2paddings),
115 |             string_timestamp(h1, m1, s1, ms1, ts1paddings),
116 |         )
117 | 
118 |     return st.builds(
119 |         get_equiv_timestamps,
120 |         maybe_off_by_one_fields(),
121 |         maybe_off_by_one_fields(),
122 |         maybe_off_by_one_fields(),
123 |         ts_field_value(),
124 |         st.tuples(*[zero_padding() for _ in range(4)]),
125 |         st.tuples(*[zero_padding() for _ in range(4)]),
126 |     )
127 | 
128 | 
def subtitles(strict=True):
    """
    Hypothesis strategy producing Subtitle objects. With strict=True the
    generated content is additionally limited to strictly legal content.
    """

    def lacks_carriage_return(text):
        return "\r" not in text

    def is_single_line(text):
        return "\r" not in text and "\n" not in text

    # \r is not legal inside Subtitle.content, it should have already been
    # normalised to \n.
    content_strategy = st.text(min_size=1).filter(lacks_carriage_return)
    if strict:
        content_strategy = content_strategy.filter(is_strictly_legal_content)

    # Proprietary data sits on the timestamp line, so no line breaks at all.
    proprietary_strategy = st.text().filter(is_single_line)

    # Subs with start time >= end time are skipped, so starts and ends are
    # drawn from two non-overlapping ranges. The upper bounds are arbitrary
    # numbers low enough not to overflow TIMEDELTA_MAX_DAYS.
    return st.builds(
        srt.Subtitle,
        index=st.integers(min_value=0),
        start=timedeltas(min_value=0, max_value=500000),
        end=timedeltas(min_value=500001, max_value=999999),
        proprietary=proprietary_strategy,
        content=content_strategy,
    )
159 | 
160 | 
@given(st.lists(subtitles()))
def test_compose_and_parse_from_file(input_subs):
    """Subtitles composed into a file-like object parse back unchanged."""
    composed = srt.compose(input_subs, reindex=False)
    subs_eq(srt.parse(StringIO(composed)), input_subs)
166 | 
167 | 
@given(st.lists(subtitles()))
def test_compose_and_parse_from_file_bom(input_subs):
    """A byte order mark in front of the file contents is tolerated."""
    composed_with_bom = "\ufeff" + srt.compose(input_subs, reindex=False)
    subs_eq(srt.parse(StringIO(composed_with_bom)), input_subs)
173 | 
174 | 
@given(st.lists(subtitles()))
def test_compose_and_parse_strict(input_subs):
    """Composing and then parsing a string gives back the same subtitles."""
    srt_text = srt.compose(input_subs, reindex=False)
    subs_eq(srt.parse(srt_text), input_subs)
180 | 
181 | 
@given(st.lists(subtitles()))
def test_can_compose_without_ending_blank_line(input_subs):
    """
    Plenty of sub editors leave out the final blank line and plenty of
    editors accept that, so input lacking it must parse as well.
    """
    full_text = srt.compose(input_subs, reindex=False)
    # Drop only the very last character.
    truncated_text = full_text[:-1]
    subs_eq(srt.parse(truncated_text), input_subs)
192 | 
193 | 
@given(st.lists(subtitles()))
def test_can_compose_without_eol_at_all(input_subs):
    """Input with every trailing line ending stripped still parses."""
    full_text = srt.compose(input_subs, reindex=False)
    text_without_trailing_eol = full_text.rstrip("\r\n")
    subs_eq(srt.parse(text_without_trailing_eol), input_subs)
200 | 
201 | 
@given(st.text().filter(is_strictly_legal_content))
def test_compose_and_parse_strict_mode(content):
    """
    Strict mode removes leading/trailing newlines and blank lines from the
    content, while non-strict mode leaves the content as it is.
    """
    # sub.content should not have OS-specific line separators, only \n
    assume("\r" not in content)

    # Wrap the legal content in newlines and repeat it after a blank line so
    # that it breaks every strict mode rule.
    content = "\n".join(["", content, "", content, ""])
    sub = CONTENTLESS_SUB(content=content)

    strict_block = sub.to_srt()
    parsed_strict = list(srt.parse(strict_block))[0]
    unstrict_block = sub.to_srt(strict=False)
    parsed_unstrict = list(srt.parse(unstrict_block))[0]

    # Strict mode should remove blank lines in content, leading, and trailing
    # newlines.
    strict_content = parsed_strict.content
    assert not strict_content.startswith("\n")
    assert not strict_content.endswith("\n")
    assert "\n\n" not in strict_content

    # When strict mode is false, no processing should be applied to the
    # content (other than \r\n becoming \n).
    assert parsed_unstrict.content == sub.content.replace("\r\n", "\n")
222 | 
223 | 
@given(st.integers(min_value=1, max_value=TIMEDELTA_MAX_DAYS))
def test_timedelta_to_srt_timestamp_can_go_over_24_hours(days):
    """Whole days are expressed through an hour field larger than 23."""
    rendered = srt.timedelta_to_srt_timestamp(timedelta(days=days))
    hours_field = rendered.split(":")[0]
    assert int(hours_field) == days * HOURS_IN_DAY
229 | 
230 | 
@given(subtitles())
def test_subtitle_equality(sub_1):
    """A subtitle rebuilt from the same attributes compares equal."""
    clone = srt.Subtitle(**vars(sub_1))
    assert sub_1 == clone
235 | 
236 | 
@given(subtitles())
def test_subtitle_inequality(sub_1):
    """Subtitles differing in a single attribute compare unequal."""
    altered = srt.Subtitle(**vars(sub_1))
    altered.index = altered.index + 1
    assert sub_1 != altered
242 | 
243 | 
@given(subtitles())
def test_subtitle_inequality_to_non_matching_type(sub_1):
    """A subtitle never compares equal to objects of other types."""
    # The comparison with None deliberately uses != (not "is not") so that
    # Subtitle's own equality logic is what gets exercised.
    for unrelated in (None, 1):
        assert sub_1 != unrelated
248 | 
249 | 
@given(subtitles())
def test_subtitle_from_scratch_equality(subtitle):
    """Two independent parses of one block are equal and hash the same."""
    block_text = subtitle.to_srt()

    # Parse twice to get two totally new sets of objects, so as not to affect
    # the hash comparison
    first_parse = list(srt.parse(block_text))[0]
    second_parse = list(srt.parse(block_text))[0]

    subs_eq([first_parse], [second_parse])
    # In case subs_eq and eq disagree for some reason
    assert first_parse == second_parse
    assert hash(first_parse) == hash(second_parse)
263 | 
264 | 
@given(st.lists(subtitles()))
def test_parsing_spaced_arrow(subs):
    """An arrow written as "- >" is accepted between the timestamps."""
    composed = srt.compose(subs, reindex=False, strict=False)
    with_spaced_arrow = composed.replace("-->", "- >")
    subs_eq(srt.parse(with_spaced_arrow), subs)
270 | 
271 | 
@given(st.lists(subtitles()))
def test_parsing_spaced_ender_arrow(subs):
    """An arrow written as "-- >" is accepted between the timestamps."""
    # Seen in BSG subtitles
    composed = srt.compose(subs, reindex=False, strict=False)
    with_spaced_arrow = composed.replace("-->", "-- >")
    subs_eq(srt.parse(with_spaced_arrow), subs)
278 | 
279 | 
@given(st.lists(subtitles()))
def test_parsing_no_ws_arrow(subs):
    """An arrow without any surrounding whitespace is accepted."""
    composed = srt.compose(subs, reindex=False, strict=False)
    with_tight_arrow = composed.replace(" --> ", "-->")
    subs_eq(srt.parse(with_tight_arrow), subs)
287 | 
288 | 
@given(st.text(string.whitespace), st.lists(subtitles()))
def test_parsing_leading_whitespace(ws, subs):
    """Whitespace in front of the first block does not affect parsing."""
    composed = srt.compose(subs, reindex=False, strict=False)
    subs_eq(srt.parse(ws + composed), subs)
294 | 
295 | 
@given(st.lists(subtitles()))
def test_parsing_negative_index(subs):
    """Negative indexes survive a compose/parse round trip."""
    for sub in subs:
        sub.index = sub.index * -1

    composed = srt.compose(subs, reindex=False, strict=False)
    subs_eq(srt.parse(composed), subs)
303 | 
304 | 
@given(st.lists(subtitles()))
def test_parsing_content_with_blank_lines(subs):
    """Content containing a blank line is kept together as one subtitle."""
    for subtitle in subs:
        # Put a blank line in the middle so as to trigger the "special"
        # content parsing for erroneous SRT files that have blank lines.
        subtitle.content = "\n\n".join([subtitle.content, subtitle.content])

    composed = srt.compose(subs, reindex=False, strict=False)
    subs_eq(srt.parse(composed), subs)
314 | 
315 | 
@given(st.lists(subtitles()))
def test_parsing_no_content(subs):
    """Blocks with empty content survive a compose/parse round trip."""
    for subtitle in subs:
        subtitle.content = ""

    composed = srt.compose(subs, reindex=False, strict=False)
    subs_eq(srt.parse(composed), subs)
323 | 
324 | 
@given(st.lists(subtitles()), st.lists(subtitles()), st.text(alphabet="\n\r\t "))
def test_subs_missing_content_removed(content_subs, contentless_subs, contentless_text):
    """Whitespace-only subtitles are dropped and use up no index."""
    for sub in contentless_subs:
        sub.content = contentless_text

    mixed_subs = contentless_subs + content_subs
    composed_subs = list(srt.sort_and_reindex(mixed_subs, in_place=True))

    # Only the subs from content_subs should be left, as all
    # contentless_subs should have been stripped.
    subs_eq(composed_subs, content_subs, any_order=True)

    # The subtitles should be reindexed starting at start_index, excluding
    # contentless subs
    default_start_index = 1
    expected_indexes = list(
        range(default_start_index, default_start_index + len(composed_subs))
    )
    actual_indexes = [sub.index for sub in composed_subs]
    assert actual_indexes == expected_indexes
343 | 
344 | 
@given(
    st.lists(subtitles()),
    st.lists(subtitles()),
    timedeltas(min_value=-999, max_value=-1),
)
def test_subs_starts_before_zero_removed(positive_subs, negative_subs, negative_td):
    # Move every sub in the second group to a negative start time. The end is
    # set to the same value just to avoid tripping any start >= end errors.
    for early_sub in negative_subs:
        early_sub.start = early_sub.end = negative_td

    kept = list(
        srt.sort_and_reindex(positive_subs + negative_subs, in_place=True)
    )

    # None of the negative subs should be left
    subs_eq(kept, positive_subs, any_order=True)
360 | 
361 | 
@given(st.lists(subtitles(), min_size=1), st.integers(min_value=0))
def test_sort_and_reindex(input_subs, start_index):
    # Give every sub the same end time and index so that only the start time
    # can influence the ordering. The pinned end value is chosen with the
    # range of generated start times in mind; see how the start timestamp
    # strategy is defined.
    pinned_end = timedelta(500001)
    for sub in input_subs:
        sub.end = pinned_end
        sub.index = 1

    result = list(
        srt.sort_and_reindex(input_subs, start_index=start_index, in_place=True)
    )

    # Indexes should count up from start_index, one per input sub
    expected_indexes = list(range(start_index, start_index + len(input_subs)))
    assert [sub.index for sub in result] == expected_indexes

    # Ordering should follow the start time
    by_start = sorted(input_subs, key=lambda sub: sub.start)
    assert result == by_start
383 | 
384 | 
@given(st.lists(subtitles()))
def test_sort_and_reindex_no_skip(input_subs):
    # Swap start and end on every sub. With skip=False, sort_and_reindex must
    # still hand back every one of them.
    for sub in input_subs:
        sub.start, sub.end = sub.end, sub.start

    result = list(srt.sort_and_reindex(input_subs, skip=False))

    # Nothing should have been skipped
    assert len(input_subs) == len(result)
397 | 
398 | 
@given(st.lists(subtitles()))
def test_sort_and_reindex_handles_no_index(input_subs):
    # Swap start and end on every sub and also drop its index. With the
    # default skipping behaviour, sort_and_reindex is expected to cope with
    # the missing index and skip every sub.
    for sub in input_subs:
        sub.start, sub.end = sub.end, sub.start
        sub.index = None

    result = list(srt.sort_and_reindex(input_subs))

    # Everything should have been skipped
    assert not result
412 | 
413 | 
@given(st.lists(subtitles(), min_size=1))
def test_sort_and_reindex_same_start_time_uses_end(input_subs):
    # With start time and index pinned to the same values everywhere, the end
    # time is the only field left that can decide the order.
    shared_start = timedelta(1)
    for sub in input_subs:
        sub.start = shared_start
        sub.index = 1

    result = list(srt.sort_and_reindex(input_subs, in_place=True))

    # Ties on start time should be broken by end time
    by_end = sorted(input_subs, key=lambda sub: sub.end)
    assert result == by_end
427 | 
428 | 
@given(st.lists(subtitles(), min_size=1))
def test_sort_and_reindex_same_start_and_end_time_uses_index(input_subs):
    # With start and end pinned to the same values everywhere, the index is
    # the only field left that can decide the order.
    shared_start, shared_end = timedelta(1), timedelta(2)
    for sub in input_subs:
        sub.start = shared_start
        sub.end = shared_end

    result = list(srt.sort_and_reindex(input_subs, in_place=True))

    # Ties on both start and end time should be broken by index
    by_index = sorted(input_subs, key=lambda sub: sub.index)
    assert result == by_index
443 | 
444 | 
@given(st.lists(subtitles(), min_size=1), st.integers(min_value=0))
def test_sort_and_reindex_not_in_place_matches(input_subs, start_index):
    def clone_all(subs):
        # Build fresh Subtitle objects carrying the same attributes.
        return [srt.Subtitle(**vars(sub)) for sub in subs]

    # Each sort_and_reindex call gets its own copies so that the two calls
    # can't affect each other
    copied_subs = clone_all(input_subs)
    in_place_subs = clone_all(input_subs)

    copied_ids = {id(sub) for sub in copied_subs}
    in_place_ids = {id(sub) for sub in in_place_subs}

    copied_output = list(srt.sort_and_reindex(copied_subs, start_index=start_index))
    in_place_output = list(
        srt.sort_and_reindex(in_place_subs, start_index=start_index, in_place=True)
    )

    # Both modes should produce equal results
    subs_eq(copied_output, in_place_output)

    # Without in_place, none of the objects handed in may come back out
    assert all(id(sub) not in copied_ids for sub in copied_output)

    # With in_place, every object coming out must be one that was handed in
    assert not any(id(sub) not in in_place_ids for sub in in_place_output)
470 | 
471 | 
@given(
    st.lists(subtitles(), min_size=1),
    st.integers(min_value=0),
    st.text(min_size=1),
    timedeltas(),
)
def test_parser_noncontiguous(subs, fake_idx, garbage, fake_timedelta):
    clean_srt = srt.compose(subs)

    # Replace every blank-line separator with a separator followed by
    # garbage, which should trigger our failed parsing detection. Since the
    # parser does some magic to detect blank lines that don't really delimit
    # subtitles, the garbage has to look at least a little like an SRT block:
    # an index line, then something starting with a timestamp.
    fake_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta)
    garbage_block = "\n\n%d\n%s %s" % (fake_idx, fake_timestamp, garbage)
    corrupted_srt = clean_srt.replace("\n\n", garbage_block)

    with pytest.raises(srt.SRTParseError):
        list(srt.parse(corrupted_srt))
492 | 
493 | 
@given(
    st.lists(subtitles(), min_size=1),
    st.integers(min_value=0),
    st.text(min_size=1),
    timedeltas(),
)
def test_parser_noncontiguous_ignore_errors(subs, fake_idx, garbage, fake_timedelta):
    clean_srt = srt.compose(subs)

    # Inject a fake index line and a timestamp-prefixed garbage line after
    # every blank-line separator.
    fake_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta)
    garbage_block = "\n\n%d\n%s %s" % (fake_idx, fake_timestamp, garbage)
    corrupted_srt = clean_srt.replace("\n\n", garbage_block)

    # With ignore_errors set, consuming the parser must not raise
    list(srt.parse(corrupted_srt, ignore_errors=True))
508 | 
509 | 
510 | def _parseable_as_int(text):
511 |     try:
512 |         int(text)
513 |     except ValueError:
514 |         return False
515 |     return True
516 | 
517 | 
518 | def _parseable_as_float(text):
519 |     try:
520 |         float(text)
521 |     except ValueError:
522 |         return False
523 |     return True
524 | 
525 | 
@given(st.lists(subtitles()), st.text(min_size=1))
def test_parser_noncontiguous_leading(subs, garbage):
    # Issue #50 permits leading whitespace, see test_parsing_leading_whitespace
    assume(not garbage.isspace())

    # Issue #56 permits negative indexes, see test_parsing_negative_index.
    # The garbage also shouldn't just be a number, because then it would be
    # confused with the index of the first subtitle.
    stripped = garbage.strip()
    first_char = stripped[0]
    assume(first_char != ".")
    assume(first_char != "-")
    assume(not _parseable_as_int(stripped))
    assume(not _parseable_as_float(stripped))

    # Garbage in front of the first block should trip the noncontiguity
    # checks
    corrupted_srt = garbage + srt.compose(subs)

    with pytest.raises(srt.SRTParseError):
        list(srt.parse(corrupted_srt))
545 | 
546 | 
@given(
    st.lists(subtitles(), min_size=1),
    st.integers(min_value=0),
    st.text(min_size=1),
    timedeltas(),
)
def test_parser_didnt_match_to_end_raises(subs, fake_idx, garbage, fake_timedelta):
    blocks = [sub.to_srt() for sub in subs]

    # Append one trailing block made of a fake index, a single timestamp and
    # the garbage text.
    fake_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta)
    trailing_block = "\n\n%d\n%s %s" % (fake_idx, fake_timestamp, garbage)
    composed = "".join(blocks + [trailing_block])

    with pytest.raises(srt.SRTParseError) as thrown_exc:
        list(srt.parse(composed))
    error = thrown_exc.value

    # Since the parser consumes as many \n as needed until it meets the
    # lookahead assertion, leading newlines in the trailing block will have
    # been stripped from what is reported as unmatched.
    unmatched = trailing_block.lstrip("\n")

    assert unmatched == error.unmatched_content
    assert len(composed) - len(unmatched) == error.expected_start
    assert len(composed) == error.actual_start
570 | 
571 | 
@given(st.lists(subtitles()))
def test_parser_can_parse_with_dot_msec_delimiter(subs):
    dotted_blocks = []

    for sub in subs:
        lines = sub.to_srt().split("\n")
        # Only the first two commas on the timestamp line are swapped for
        # dots: further commas might belong to the proprietary metadata, and
        # touching those would make this test fail.
        lines[1] = lines[1].replace(",", ".", 2)
        dotted_blocks.append("\n".join(lines))

    reparsed = srt.parse("".join(dotted_blocks))
    subs_eq(reparsed, subs)
588 | 
589 | 
@given(st.lists(subtitles()))
def test_parser_can_parse_with_fullwidth_delimiter(subs):
    fullwidth_blocks = []

    for sub in subs:
        lines = sub.to_srt().split("\n")
        # Swap the first comma and the first colon on the timestamp line for
        # their fullwidth counterparts.
        timestamp_line = lines[1].replace(",", "，", 1)
        lines[1] = timestamp_line.replace(":", "：", 1)
        fullwidth_blocks.append("\n".join(lines))

    reparsed = srt.parse("".join(fullwidth_blocks))
    subs_eq(reparsed, subs)
604 | 
605 | 
@given(st.lists(subtitles()))
def test_parser_can_parse_with_no_msec(subs):
    # Timestamps that lack the ",<digits>" millisecond part should still be
    # parsed back into the original subtitles.
    srt_blocks = []

    for sub in subs:
        srt_lines = sub.to_srt().split("\n")
        # We should only do the first two, as it might also be in the
        # proprietary metadata, causing this test to fail. `count` is passed
        # by keyword because passing it positionally to re.sub is deprecated
        # since Python 3.13.
        srt_lines[1] = re.sub(r",[0-9]+", "", srt_lines[1], count=2)
        srt_blocks.append("\n".join(srt_lines))

    composed = "".join(srt_blocks)
    reparsed_subs = srt.parse(composed)
    subs_eq(reparsed_subs, subs)
621 | 
622 | 
@given(subtitles())
def test_repr_doesnt_crash(sub):
    # There is little to verify here beyond __repr__ not blowing up and the
    # output at least vaguely looking like what we want: it should mention
    # the class name and the index.
    rendered = repr(sub)
    assert "Subtitle" in rendered
    assert str(sub.index) in rendered
629 | 
630 | 
@given(subtitles(), subtitles())
def test_parser_accepts_final_no_newline_no_content(sub1, sub2):
    # Empty the last sub's content so we know how much to remove below
    sub2.content = ""
    expected = [sub1, sub2]

    # Slice off the final two characters so that no newlines remain at the
    # end. rstrip is not usable here since it might also eat other stuff that
    # gets matched in proprietary
    composed = srt.compose(expected, reindex=False)
    truncated = composed[:-2]

    subs_eq(srt.parse(truncated), expected)
643 | 
644 | 
@given(st.lists(subtitles()))
def test_parser_accepts_newline_no_content(subs):
    # Empty all content so we know how many lines to remove
    for emptied in subs:
        emptied.content = ""

    # Drop the final \n of every block so that only one is left
    shortened_blocks = [sub.to_srt()[:-1] for sub in subs]

    subs_eq(srt.parse("".join(shortened_blocks)), subs)
656 | 
657 | 
@given(st.lists(subtitles()))
def test_compose_and_parse_strict_crlf(input_subs):
    # Convert every line ending in the composed output to CRLF before parsing
    crlf_srt = srt.compose(input_subs, reindex=False).replace("\n", "\r\n")
    reparsed = list(srt.parse(crlf_srt))

    # Undo the CRLF conversion inside the parsed content so that it can be
    # compared with the input
    for parsed_sub in reparsed:
        parsed_sub.content = parsed_sub.content.replace("\r\n", "\n")

    subs_eq(reparsed, input_subs)
668 | 
669 | 
@given(st.lists(subtitles()), st.one_of(st.just("\n"), st.just("\r\n")))
def test_compose_and_parse_strict_custom_eol(input_subs, eol):
    # Output composed with either supported line ending should parse back to
    # the input subtitles.
    subs_eq(srt.parse(srt.compose(input_subs, reindex=False, eol=eol)), input_subs)
675 | 
676 | 
@given(equivalent_timestamps())
def test_equal_timestamps_despite_different_fields_parsed_as_equal(timestamps):
    # Both timestamps of the generated pair must convert to the same
    # timedelta.
    first, second = timestamps
    first_delta = srt.srt_timestamp_to_timedelta(first)
    second_delta = srt.srt_timestamp_to_timedelta(second)
    assert first_delta == second_delta
681 | 
682 | 
@given(timedeltas())
def test_bad_timestamp_format_raises(ts):
    # Corrupt a rendered timestamp by swapping its first colon for a "t";
    # converting it back must be rejected.
    broken = srt.timedelta_to_srt_timestamp(ts).replace(":", "t", 1)
    with pytest.raises(srt.TimestampParseError):
        srt.srt_timestamp_to_timedelta(broken)
689 | 
690 | 
@given(st.lists(subtitles()), st.lists(st.sampled_from(string.whitespace)))
def test_can_parse_index_trailing_ws(input_subs, whitespace):
    # Append the generated whitespace to the first (index) line of every
    # block; parsing should be unaffected.
    trailing = "".join(whitespace)
    blocks = []

    for sub in input_subs:
        index_line, newline, remainder = sub.to_srt().partition("\n")
        blocks.append(index_line + trailing + newline + remainder)

    subs_eq(srt.parse("".join(blocks)), input_subs)
702 | 
703 | 
@given(st.lists(subtitles()))
def test_can_parse_index_with_dot(input_subs):
    # Seen in Battlestar Galactica subs: the index line reads
    # "<index>.<index>" rather than just "<index>".
    blocks = []

    for sub in input_subs:
        index_line, newline, remainder = sub.to_srt().partition("\n")
        dotted_index = ".".join([index_line, index_line])
        blocks.append(dotted_index + newline + remainder)

    subs_eq(srt.parse("".join(blocks)), input_subs)
716 | 
717 | 
@given(st.lists(subtitles()), st.lists(st.just("0")))
def test_can_parse_index_leading_zeroes(input_subs, zeroes):
    # Put the generated run of "0" characters in front of the first (index)
    # line of every block; parsing should be unaffected.
    zero_prefix = "".join(zeroes)
    blocks = []

    for sub in input_subs:
        index_line, newline, remainder = sub.to_srt().partition("\n")
        blocks.append(zero_prefix + index_line + newline + remainder)

    subs_eq(srt.parse("".join(blocks)), input_subs)
729 | 
730 | 
@given(st.lists(subtitles(), min_size=1))
def test_parse_file_with_missing_index(input_subs):  # cf. issue #51
    # Blocks whose index line is missing should parse with index None, and
    # should compose and sort exactly like blocks with an explicit index 0.
    for sub in input_subs:
        try:
            int(sub.content.strip().split("\n")[-1])
        except ValueError:
            pass
        else:
            # If the final line with actual content is a number, we'll parse it
            # as the index, so ignore that
            assume(False)

    # Render every block with its first (index) line cut off.
    blocks_no_index = []
    for sub in input_subs:
        block = sub.to_srt()
        blocks_no_index.append(block[block.index("\n") + 1 :])
    out_no_index = "".join(blocks_no_index)

    # Render copies of the same subs with an explicit index of 0, since
    # sub.index being None will get rendered in to_srt as 0. Copies are used
    # so that input_subs itself is left untouched.
    input_subs_copy = [srt.Subtitle(**vars(sub)) for sub in input_subs]
    blocks_zero_index = []
    for sub in input_subs_copy:
        sub.index = 0
        blocks_zero_index.append(sub.to_srt())
    out_zero_index = "".join(blocks_zero_index)

    subs_no_index = list(srt.parse(out_no_index))
    subs_zero_index = list(srt.parse(out_zero_index))

    # One should have index None, one should have index 0...
    assert subs_no_index
    assert subs_zero_index
    assert all(sub.index is None for sub in subs_no_index)
    assert all(sub.index == 0 for sub in subs_zero_index)
    assert subs_no_index != subs_zero_index

    # ...but they should render the same...
    assert srt.compose(subs_no_index, reindex=False) == srt.compose(
        subs_zero_index, reindex=False
    )

    # ...and sort the same.
    assert srt.compose(subs_no_index) == srt.compose(subs_zero_index)
774 | 


--------------------------------------------------------------------------------