├── tests
    ├── __init__.py
    ├── fixtures
    │   ├── __init__.py
    │   ├── microdvd.py
    │   ├── srt.py
    │   ├── webvtt.py
    │   └── translated_scc.py
    ├── test_functions.py
    ├── test_microdvd_conversion.py
    ├── test_scc_translator.py
    ├── test_base.py
    ├── test_srt_conversion.py
    ├── test_microdvd.py
    ├── test_scc_conversion.py
    ├── test_srt.py
    ├── test_sami_conversion.py
    ├── test_webvtt_conversion.py
    ├── test_geometry.py
    ├── test_dfxp_extras.py
    ├── mixins.py
    ├── conftest.py
    ├── test_webvtt.py
    ├── test_sami.py
    └── test_dfxp.py
├── .github
    ├── CODEOWNERS
    └── workflows
    │   ├── create_github_release.yml
    │   ├── release.yml
    │   ├── release_test_pypi.yml
    │   └── unit_tests.yml
├── MANIFEST.in
├── docs
    ├── requirements.txt
    ├── extensibility.rst
    ├── index.rst
    ├── supported_formats.rst
    ├── Makefile
    ├── changelog.rst
    ├── conf.py
    └── introduction.rst
├── test_requirements.txt
├── pycaption
    ├── dfxp
    │   ├── __init__.py
    │   └── extras.py
    ├── utils.py
    ├── transcript.py
    ├── exceptions.py
    ├── scc
    │   ├── translator.py
    │   └── state_machines.py
    ├── __init__.py
    ├── microdvd.py
    ├── srt.py
    └── base.py
├── .readthedocs.yaml
├── .pre-commit-config.yaml
├── .gitignore
├── examples
    ├── example.scc
    └── example.sub
├── run_tests.sh
├── docker-compose.yml
├── README.rst
├── setup.py
└── LICENSE


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/fixtures/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @pbs/core-services
2 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include tests *
2 | include README.rst
3 | 


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==7.2.6
2 | sphinx_rtd_theme==1.3.0
3 | readthedocs-sphinx-search==0.3.2


--------------------------------------------------------------------------------
/test_requirements.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-cov
3 | beautifulsoup4>=4.12.1
4 | lxml>=4.9.1
5 | cssutils>=2.0.0


--------------------------------------------------------------------------------
/pycaption/dfxp/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import *  # noqa: F401, F403
2 | from .extras import SinglePositioningDFXPWriter, LegacyDFXPWriter  # noqa: F401
3 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | build:
 4 |   os: "ubuntu-22.04"
 5 |   tools:
 6 |     python: "3.11"
 7 | 
 8 | # Build from the docs/ directory with Sphinx
 9 | sphinx:
10 |   configuration: docs/conf.py
11 | 
12 | # Explicitly set the version of Python and its requirements
13 | python:
14 |   install:
15 |     - requirements: docs/requirements.txt


--------------------------------------------------------------------------------
/docs/extensibility.rst:
--------------------------------------------------------------------------------
 1 | Extensibility
 2 | =============
 3 | 
 4 | Different readers and writers are easy to add if you would like to
 5 | read/write a previously unsupported format, or read/write a supported
 6 | format in a different way (for example, with more styling).
 7 | 
 8 | Simply follow the format of a current Reader or Writer, and edit to your
 9 | heart's desire.
10 | 


--------------------------------------------------------------------------------
/pycaption/utils.py:
--------------------------------------------------------------------------------
 1 | def is_leaf(element):
 2 |     """
 3 |     Return True if the element is a leaf, False otherwise. The element is
 4 |     considered a leaf if it is either NavigableString or the "br" tag
 5 |     :param element: A BeautifulSoup tag or NavigableString
 6 |     """
 7 |     name = getattr(element, 'name', None)
 8 |     if not name or name == 'br':
 9 |         return True
10 |     return False
11 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | -   repo: git://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v4.0.1
 4 |     hooks:
 5 |     -   id: end-of-file-fixer
 6 |     -   id: trailing-whitespace
 7 |     -   id: debug-statements
 8 | 
 9 | -   repo: git://github.com/PyCQA/flake8
10 |     rev: 3.9.2
11 |     hooks:
12 |     -   id: flake8
13 |         args: [
14 |                 '--exclude=tests/fixtures*',
15 |                 '--ignore=W503,C901',
16 |                 '--max-line-length=80',
17 |         ]
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[co]
 2 | 
 3 | # Packages
 4 | *.egg
 5 | *.egg-info
 6 | dist
 7 | build
 8 | eggs
 9 | parts
10 | bin
11 | var
12 | sdist
13 | develop-eggs
14 | .installed.cfg
15 | .cache
16 | .eggs
17 | 
18 | # Installer logs
19 | pip-log.txt
20 | 
21 | # Unit test / coverage reports
22 | .coverage
23 | .tox
24 | coverage.xml
25 | 
26 | #Translations
27 | *.mo
28 | 
29 | #Mr Developer
30 | .mr.developer.cfg
31 | 
32 | # Sphinx documentation build
33 | docs/_build
34 | 
35 | # PyCharm files
36 | .idea/
37 | 
38 | # Environments
39 | env/
40 | venv/
41 | 
42 | # Pyenv files
43 | .python-version
44 | 


--------------------------------------------------------------------------------
/tests/test_functions.py:
--------------------------------------------------------------------------------
 1 | from pycaption import DFXPReader
 2 | from pycaption.base import merge_concurrent_captions
 3 | 
 4 | 
 5 | class TestFunctions:
 6 |     def test_merge_concurrent_captions(self, dfxp_with_concurrent_captions):
 7 |         initial_caption_set = DFXPReader().read(dfxp_with_concurrent_captions)
 8 |         initial_captions = initial_caption_set.get_captions('en-US')
 9 |         caption_set = merge_concurrent_captions(initial_caption_set)
10 |         captions = caption_set.get_captions('en-US')
11 | 
12 |         assert len(initial_captions) == 5
13 |         assert len(captions) == 3
14 | 


--------------------------------------------------------------------------------
/examples/example.scc:
--------------------------------------------------------------------------------
 1 | Scenarist_SCC V1.0
 2 | 
 3 | 00:00:00:00	9420 94d0 9723 4ce5 f4f2 6120 f4f2 6164 75e3 e964 6120 61ec 2045 7370 61fe efec 94f2 97a1 9137 20a1 92a7 d5ef ef79 e5a1 20a1 92a7 d62a 6def 6eef 73a1 2080 9137 9420 942c 942f 9420 94d0 97a1 9137 204c ef20 ece5 20ec ef20 ec61 e92c 20ec ef20 ec61 e920 ec61 e920
 4 | 
 5 | 00:00:05:08	942c 9420 9470 9723 544f c44f d3ba 20d3 5e2c 20e5 7320 e3e9 e5f2 f4ef 20c1 ec6d 61ae
 6 | 
 7 | 00:00:08:17	9420 942c 942f 9420 94d0 9723 cdc1 cd49 ba20 a180 92a7 d661 6def 7320 6120 64e9 76e5 f2f4 e9f2 6eef
 8 | 
 9 | 00:00:10:04	9420 942c 942f
10 | 
11 | 00:00:11:03	942c
12 | 


--------------------------------------------------------------------------------
/run_tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | DOCKER_CMD="docker compose -p pycaption"
 3 | 
 4 | SERVICE="test_py312"
 5 | 
 6 | if [ "$@" ]; then
 7 |   if [ "$1" == "test_py38" ]  || [ "$1" == "test_py39"  ] ||
 8 |   [ "$1" == "test_py310" ] || [ "$1" == "test_py311" ] || [ "$1" == "test_py312" ]; then
 9 |     SERVICE="$1"
10 |   fi
11 | fi
12 | 
13 | $DOCKER_CMD build "$SERVICE"
14 | 
15 | function cleanup {
16 |     echo "Cleaning up ..."
17 |     $DOCKER_CMD down && $DOCKER_CMD rm -fv
18 | }
19 | 
20 | $DOCKER_CMD run --rm "$SERVICE"
21 | 
22 | if [ $? != 0 ]; then
23 |   cleanup
24 |   exit 1
25 | else
26 |   cleanup
27 | fi
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. pycaption documentation master file, created by
 2 |    sphinx-quickstart on Thu Feb 12 12:18:37 2015.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Welcome to pycaption's documentation!
 7 | =====================================
 8 | 
 9 | ``pycaption`` is a python library for converting caption formats.
10 | 
11 | 
12 | Table of contents
13 | =================
14 | 
15 | .. toctree::
16 |    :maxdepth: 3
17 | 
18 |     Introduction </introduction>
19 |     Supported formats </supported_formats>
20 |     Extensibility </extensibility>
21 |     Changelog </changelog>
22 | 


--------------------------------------------------------------------------------
/tests/test_microdvd_conversion.py:
--------------------------------------------------------------------------------
 1 | from pycaption import MicroDVDReader, MicroDVDWriter, SAMIReader
 2 | 
 3 | from tests.mixins import MicroDVDTestingMixIn
 4 | 
 5 | 
 6 | class TestMicroDVDtoMicroDVD(MicroDVDTestingMixIn):
 7 |     def test_microdvd_to_microdvd_conversion(self, sample_microdvd):
 8 |         caption_set = MicroDVDReader().read(sample_microdvd)
 9 |         results = MicroDVDWriter().write(caption_set)
10 | 
11 |         assert isinstance(results, str)
12 |         self.assert_microdvd_equals(sample_microdvd, results)
13 | 
14 | 
class TestSAMItoMicroDVD(MicroDVDTestingMixIn):
    """Conversion of SAMI input into MicroDVD output."""

    def test_sami_to_micro_dvd_conversion(self, sample_microdvd_2, sample_sami):
        """The SAMI sample, written as MicroDVD, matches the expected text."""
        parsed = SAMIReader().read(sample_sami)
        output = MicroDVDWriter().write(parsed)

        assert isinstance(output, str)
        self.assert_microdvd_equals(sample_microdvd_2, output)
22 | 


--------------------------------------------------------------------------------
/examples/example.sub:
--------------------------------------------------------------------------------
 1 | {230}{307}( clock ticking )
 2 | {371}{433}MAN:|When we think|of E equals m c-squared,
 3 | {433}{468}we have this vision of Einstein
 4 | {468}{522}as an old, wrinkly man|with white hair.
 5 | {522}{669}MAN 2:|E equals m c-squared is|not about an old Einstein.
 6 | {669}{754}It's actually about|a young, energetic, dynamic,
 7 | {754}{805}even a sexy Einstein.
 8 | {841}{953}ACTOR AS EINSTEIN:|What would I see if I rode|on a beam of light?
 9 | {1087}{1137}MAN:|Perhaps some sort
10 | {1137}{1197}of electrical force is emanating
11 | {1197}{1224}outwards from|the wire.
12 | {1224}{1255}What?
13 | {1255}{1282}MAN:|Faraday, my dear boy,
14 | {1282}{1317}electricity flows|through a wire,
15 | {1317}{1373}not sideways to it.
16 | {1373}{1412}You see, John?
17 | {1412}{1431}You see?
18 | {1549}{1614}MAN:|It is my great ambition|to demonstrate
19 | {1614}{1713}that nature is a closed system;
20 | {1713}{1762}that in any transformation,
21 | {1762}{1919}no amount of matter, no mass,|is ever lost and none is gained.
22 | 


--------------------------------------------------------------------------------
/.github/workflows/create_github_release.yml:
--------------------------------------------------------------------------------
 1 | name: Release PyCaption on GitHub
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - "[0-9]+.[0-9]+.[0-9]+"
 7 | 
 8 | jobs:
 9 |   call-unit-tests-workflow:
10 |     name: Run unit tests
11 |     uses: pbs/pycaption/.github/workflows/unit_tests.yml@main
12 |     secrets:
13 |       SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
14 |       SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }}
15 | 
16 |   create-release:
17 |     name: Release PyCaption on GitHub
18 |     needs: call-unit-tests-workflow
19 |     runs-on: ubuntu-latest
20 |     steps:
21 |     - uses: actions/checkout@v2
22 | 
23 |     - name: Extract release notes for current version
24 |       env:
25 |         TAG: ${{ github.ref }}
26 |       run: |
27 |         sudo apt-get install -y --no-install-recommends pandoc
28 |         pandoc docs/changelog.rst -f rst -t gfm -o changelog.md
29 |         sed -n "/## ${TAG#refs/tags/}/,/## /p" changelog.md | sed -e "/## /d" >> notes.md
30 | 
31 |     - name: Create release on GitHub
32 |       run: gh release create ${{ github.ref }} --notes-file notes.md
33 |       env:
34 |         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
35 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Publish PyCaption to PyPI
 2 | 
 3 | on: workflow_dispatch
 4 | 
 5 | jobs:
 6 |   call-unit-tests-workflow:
 7 |     name: Run unit tests
 8 |     uses: pbs/pycaption/.github/workflows/unit_tests.yml@main
 9 |     secrets:
10 |       SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
11 |       SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }}
12 | 
13 |   build-n-publish:
14 |     name: Build and publish PyCaption to PyPI
15 |     needs: call-unit-tests-workflow
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |     - uses: actions/checkout@v2
19 | 
20 |     - name: Set up Python
21 |       uses: actions/setup-python@v2
22 |       with:
23 |         python-version: 3.9
24 | 
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install build
29 |         python -m pip install --upgrade twine
30 | 
31 |     - name: Build package
32 |       run: python -m build
33 |       timeout-minutes: 10
34 | 
35 |     - name: Publish package on PyPI
36 |       run: python -m twine upload --verbose dist/*
37 |       env:
38 |         TWINE_USERNAME: __token__
39 |         TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3.8'
 2 | 
 3 | services:
 4 |   test_py310:
 5 |     image: python:3.10-slim-bullseye
 6 |     command: sh -c "
 7 |       cd pycaption;
 8 |       pip install --upgrade pip;
 9 |       pip install -r test_requirements.txt;
10 |       pip install -e .;
11 |       pytest -vvvv --color=yes --junit-xml=junit.xml --cov=pycaption --cov-report xml:coverage.xml;
12 |       "
13 |     volumes:
14 |       - .:/pycaption
15 | 
16 |   test_py311:
17 |     image: python:3.11-slim-bullseye
18 |     command: sh -c "
19 |       cd pycaption;
20 |       pip install --upgrade pip;
21 |       pip install -r test_requirements.txt;
22 |       pip install -e .;
23 |       pytest -vvvv --color=yes --junit-xml=junit.xml --cov=pycaption --cov-report xml:coverage.xml;
24 |       "
25 |     volumes:
26 |       - .:/pycaption
27 | 
28 |   test_py312:
29 |     image: python:3.12-slim-bullseye
30 |     command: sh -c "
31 |       cd pycaption;
32 |       pip install --upgrade pip;
33 |       pip install -r test_requirements.txt;
34 |       pip install -e .;
35 |       pytest -vvvv --color=yes --junit-xml=junit.xml --cov=pycaption --cov-report xml:coverage.xml;
36 |       "
37 |     volumes:
38 |       - .:/pycaption


--------------------------------------------------------------------------------
/pycaption/transcript.py:
--------------------------------------------------------------------------------
 1 | from pycaption.base import BaseWriter, CaptionNode
 2 | 
 3 | 
 4 | class TranscriptWriter(BaseWriter):
 5 |     def __init__(self, *args, **kw):
 6 |         try:
 7 |             from nltk import PunktSentenceTokenizer
 8 | 
 9 |             self.tokenizer = PunktSentenceTokenizer()
10 |         except ModuleNotFoundError as exc:
11 |             raise ModuleNotFoundError(
12 |                 "Missing Dependency: You must install nltk"
13 |             ) from exc
14 | 
15 |     def write(self, captions):
16 |         transcripts = []
17 | 
18 |         for lang in captions.get_languages():
19 |             lang_transcript = ""
20 | 
21 |             for caption in captions.get_captions(lang):
22 |                 lang_transcript = self._strip_text(caption.nodes, lang_transcript)
23 | 
24 |             lang_transcript = "\n".join(self.tokenizer.tokenize(lang_transcript))
25 |             transcripts.append(lang_transcript)
26 | 
27 |         return "\n".join(transcripts)
28 | 
29 |     def _strip_text(self, elements, lang_transcript):
30 |         return " ".join(
31 |             [lang_transcript]
32 |             + [el.content for el in elements if el.type_ == CaptionNode.TEXT]
33 |         )
34 | 


--------------------------------------------------------------------------------
/pycaption/exceptions.py:
--------------------------------------------------------------------------------
 1 | class CaptionReadError(Exception):
 2 |     """
 3 |     Generic error raised when the reading of the caption file failed.
 4 |     """
 5 |     def __str__(self):
 6 |         return f'{self.__class__.__name__}({self.args[0]})'
 7 | 
 8 | 
 9 | class CaptionReadNoCaptions(CaptionReadError):
10 |     """
11 |     Error raised when the provided caption file was not containing any
12 |     actual captions.
13 |     """
14 | 
15 | 
class CaptionReadSyntaxError(CaptionReadError):
    """
    Raised when the provided caption file contains syntax errors and
    therefore cannot be parsed.
    """
21 | 
22 | 
class CaptionReadTimingError(CaptionReadError):
    """
    Raised when a Caption is initialized with invalid timings.
    """
27 | 
28 | 
class RelativizationError(Exception):
    """
    Raised when an absolute position cannot be converted to a percentage.
    """
34 | 
35 | 
class InvalidInputError(RuntimeError):
    """Raised when the input is invalid (i.e. a unicode string)."""
38 | 
39 | 
class CaptionLineLengthError(CaptionReadError):
    """
    Raised when a Caption contains a line longer than 32 characters.
    """
44 | 


--------------------------------------------------------------------------------
/.github/workflows/release_test_pypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish PyCaption to Test PyPI
 2 | 
 3 | on: workflow_dispatch
 4 | 
 5 | jobs:
 6 |   call-unit-tests-workflow:
 7 |     name: Run unit tests
 8 |     uses: ./.github/workflows/unit_tests.yml
 9 |     secrets:
10 |       SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
11 |       SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }}
12 | 
13 |   build-n-publish:
14 |     name: Build and publish PyCaption to Test PyPI
15 |     needs: call-unit-tests-workflow
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |     - uses: actions/checkout@v2
19 | 
20 |     - name: Set up Python
21 |       uses: actions/setup-python@v2
22 |       with:
23 |         python-version: 3.9
24 | 
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install build
29 |         python -m pip install --upgrade twine
30 | 
31 |     - name: Build package
32 |       run: python -m build
33 |       timeout-minutes: 10
34 | 
35 |     - name: Publish package on Test PyPI
36 |       run: python -m twine upload --verbose dist/*
37 |       env:
38 |         TWINE_USERNAME: __token__
39 |         TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }}
40 |         TWINE_REPOSITORY:  "testpypi"
41 | 


--------------------------------------------------------------------------------
/pycaption/scc/translator.py:
--------------------------------------------------------------------------------
 1 | from pycaption.scc.constants import ALL_CHARACTERS, COMMAND_LABELS
 2 | 
 3 | 
 4 | def translate_scc(scc_content, brackets='[]'):
 5 |     """
 6 |     Replaces hexadecimal words with their meaning
 7 | 
 8 |     In order to make SCC files more human-readable and easier to debug,
 9 |     this function is used to replace command codes with their labels and
10 |     character bytes with their actual characters
11 | 
12 |     :param scc_content: SCC captions to be translated
13 |     :type scc_content: str
14 |     :param brackets: Brackets to group the translated content of a command
15 |     :type brackets: str
16 |     :return: Translated SCC captions
17 |     :rtype: str
18 |     """
19 |     opening_bracket, closing_bracket = brackets if brackets else ('', '')
20 |     scc_elements = set(scc_content.split())
21 |     for elem in scc_elements:
22 |         name = COMMAND_LABELS.get(elem, ALL_CHARACTERS.get(elem))
23 |         # If a 2 byte command was not found, try retrieving 1 byte characters
24 |         if not name:
25 |             char1 = ALL_CHARACTERS.get(elem[:2])
26 |             char2 = ALL_CHARACTERS.get(elem[2:])
27 |             if char1 is not None and char2 is not None:
28 |                 name = f"{char1}{char2}"
29 |         if name:
30 |             scc_content = scc_content.replace(
31 |                 elem, f"{opening_bracket}{name}{closing_bracket}")
32 |     return scc_content
33 | 


--------------------------------------------------------------------------------
/tests/test_scc_translator.py:
--------------------------------------------------------------------------------
 1 | from pycaption.scc.translator import translate_scc
 2 | 
 3 | 
 4 | class TestSCCTranslator:
 5 | 
 6 |     def test_successful_translation(
 7 |             self, sample_scc_pop_on, sample_translated_scc_success):
 8 |         result = translate_scc(sample_scc_pop_on)
 9 | 
10 |         assert sample_translated_scc_success == result
11 | 
12 |     def test_no_brackets(
13 |             self, sample_scc_pop_on, sample_translated_scc_no_brackets):
14 |         result = translate_scc(sample_scc_pop_on, brackets=None)
15 | 
16 |         assert sample_translated_scc_no_brackets == result
17 | 
18 |     def test_custom_brackets(
19 |             self, sample_scc_pop_on, sample_translated_scc_custom_brackets):
20 |         result = translate_scc(sample_scc_pop_on, brackets="{}")
21 | 
22 |         assert sample_translated_scc_custom_brackets == result
23 | 
24 |     def test_commands_not_found(self, sample_scc_with_unknown_commands,
25 |                                 sample_translated_scc_commands_not_found):
26 |         result = translate_scc(sample_scc_with_unknown_commands)
27 | 
28 |         assert sample_translated_scc_commands_not_found == result
29 | 
30 |     def test_special_and_extended_characters(
31 |             self, sample_scc_special_and_extended_characters,
32 |             sample_translated_scc_special_and_extended_characters):
33 |         result = translate_scc(sample_scc_special_and_extended_characters)
34 | 
35 |         assert sample_translated_scc_special_and_extended_characters == result
36 | 


--------------------------------------------------------------------------------
/pycaption/__init__.py:
--------------------------------------------------------------------------------
 1 | from .base import (
 2 |     CaptionConverter, CaptionNode, Caption, CaptionList, CaptionSet,
 3 | )
 4 | from .dfxp import DFXPWriter, DFXPReader
 5 | from .microdvd import MicroDVDReader, MicroDVDWriter
 6 | from .sami import SAMIReader, SAMIWriter
 7 | from .srt import SRTReader, SRTWriter
 8 | from .scc import SCCReader, SCCWriter
 9 | from .scc.translator import translate_scc
10 | from .transcript import TranscriptWriter
11 | from .webvtt import WebVTTReader, WebVTTWriter
12 | from .exceptions import (
13 |     CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError, CaptionLineLengthError
14 | )
15 | 
16 | 
# Public API of the package. CaptionLineLengthError is imported above
# together with the other reader exceptions, so it is exported as well.
__all__ = [
    'CaptionConverter', 'DFXPReader', 'DFXPWriter', 'MicroDVDReader',
    'MicroDVDWriter', 'SAMIReader', 'SAMIWriter', 'SRTReader', 'SRTWriter',
    'SCCReader', 'SCCWriter', 'translate_scc', 'WebVTTReader', 'WebVTTWriter',
    'CaptionReadError', 'CaptionReadNoCaptions', 'CaptionReadSyntaxError',
    'CaptionLineLengthError',
    'detect_format', 'CaptionNode', 'Caption', 'CaptionList', 'CaptionSet',
    'TranscriptWriter'
]

# Reader classes tried, in this order, by detect_format().
SUPPORTED_READERS = (
    DFXPReader, MicroDVDReader, WebVTTReader, SAMIReader, SRTReader, SCCReader,
)
29 | 
30 | 
def detect_format(caps):
    """
    Find the reader that recognizes the provided caption string.

    Each class in SUPPORTED_READERS is instantiated in turn and asked to
    ``detect`` the content; the first one that accepts it is returned.

    :param caps: the caption content to inspect
    :returns: the reader class for the detected format, or None when no
        supported reader recognizes the content
    :raises CaptionReadNoCaptions: if ``caps`` is empty
    """
    if len(caps) == 0:
        raise CaptionReadNoCaptions("Empty caption file")

    return next(
        (reader for reader in SUPPORTED_READERS if reader().detect(caps)),
        None,
    )
45 | 


--------------------------------------------------------------------------------
/tests/fixtures/microdvd.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | 
 4 | @pytest.fixture(scope="session")
 5 | def sample_microdvd():
 6 |     return """{230}{307}( clock ticking )
 7 | {371}{425}MAN:|When we think|\u266a ...say bow, wow, \u266a
 8 | {425}{468}we have this vision of Einstein
 9 | {468}{522}as an old, wrinkly man|with white hair.
10 | {522}{669}MAN 2:|E equals m c-squared is|not about an old Einstein.
11 | {669}{805}MAN 2:|It's all about an eternal Einstein.
12 | {805}{905}<LAUGHING & WHOOPS!>
13 | """
14 | 
15 | 
@pytest.fixture(scope="session")
def sample_microdvd_2():
    """Seven-cue MicroDVD sample whose fourth cue text starts with '|'."""
    content = """{230}{307}( clock ticking )
{371}{425}MAN:|When we think|\u266a ...say bow, wow, \u266a
{425}{468}we have this vision of Einstein
{468}{522}|as an old, wrinkly man|with white hair.
{522}{669}MAN 2:|E equals m c-squared is|not about an old Einstein.
{669}{805}MAN 2:|It's all about an eternal Einstein.
{805}{905}<LAUGHING & WHOOPS!>
"""
    return content
26 | 
27 | 
@pytest.fixture(scope="session")
def sample_microdvd_invalid_format():
    """MicroDVD sample whose second line has a malformed frame prefix."""
    content = """{230}{307}( clock ticking )
{}{425}{567} MAN:|When we think|\u266a ...say bow, wow, \u266a
{425}{468}we have this vision of Einstein
"""
    return content
34 | 
35 | 
@pytest.fixture(scope="session")
def missing_fps_sample_microdvd():
    """Two-line MicroDVD sample whose second line is framed ``{0}{0}``."""
    content = """{301}{307}( clock ticking )
{0}{0} MAN:|When we think|\u266a ...say bow, wow, \u266a
"""
    return content
41 | 
42 | 
@pytest.fixture(scope="session")
def sample_microdvd_empty():
    """MicroDVD content consisting of a single newline."""
    content = """
"""
    return content
47 | 
48 | 
@pytest.fixture(scope="session")
def sample_microdvd_empty_cue_output():
    """Single MicroDVD cue, without a trailing newline."""
    content = """{30}{57}abc"""
    return content
52 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | py-caption
 2 | ==========
 3 | 
 4 | |Build Status| |Python Versions| |Pre-Commit| |Dependencies| |License|
 5 | 
 6 | ``pycaption`` is a caption reading/writing module. Use one of the given Readers
 7 | to read content into a CaptionSet object, and then use one of the Writers to
 8 | output the CaptionSet into captions of your desired format.
 9 | 
10 | Tested with Python versions 3.10, 3.11 and 3.12.
11 | (for Python 2 use pycaption < 1.0.0)
12 | 
13 | For details, see the `documentation <http://pycaption.readthedocs.org>`__.
14 | 
15 | License
16 | -------
17 | 
18 | This module is Copyright (c) 2012-2025 PBS.org and is available under the `Apache
19 | License, Version 2.0 <http://www.apache.org/licenses/LICENSE-2.0>`__.
20 | 
21 | .. |Build Status| image:: https://github.com/pbs/pycaption/actions/workflows/unit_tests.yml/badge.svg
22 |     :target: https://github.com/pbs/pycaption/actions/workflows/unit_tests.yml
23 |     :alt: Unit Tests
24 | 
25 | .. |Pre-Commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white
26 |    :target: https://github.com/pre-commit/pre-commit
27 |    :alt: pre-commit
28 | 
29 | .. |Dependencies| image:: https://img.shields.io/librariesio/release/pypi/pycaption
30 |     :target: https://libraries.io/pypi/pycaption
31 |     :alt: Dependencies
32 | 
33 | .. |Python Versions| image:: https://img.shields.io/pypi/pyversions/pycaption
34 |     :target: https://pypi.org/project/pycaption/
35 |     :alt: Python Versions
36 | 
37 | .. |License| image:: https://img.shields.io/github/license/pbs/pycaption
38 |     :target: https://github.com/pbs/pycaption/blob/main/LICENSE
39 |     :alt: License
40 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os
 3 | 
 4 | from setuptools import find_packages, setup
 5 | 
 6 | README_PATH = os.path.join(
 7 |     os.path.abspath(os.path.dirname(__file__)),
 8 |     "README.rst",
 9 | )
10 | 
11 | dependencies = [
12 |     "beautifulsoup4>=4.12.1",
13 |     "lxml>=4.9.1",
14 |     "cssutils>=2.0.0",
15 | ]
16 | 
17 | dev_dependencies = ["pytest", "pytest-lazy-fixture"]
18 | 
19 | transcript_dependencies = ["nltk==3.9.1"]
20 | 
21 | setup(
22 |     name="pycaption",
23 |     version="2.2.19",
24 |     description="Closed caption converter",
25 |     long_description=open(README_PATH).read(),
26 |     author="Joe Norton",
27 |     author_email="joey@nortoncrew.com",
28 |     project_urls={
29 |         "Source": "https://github.com/pbs/pycaption",
30 |         "Documentation": "https://pycaption.readthedocs.io/",
31 |         "Release notes": "https://pycaption.readthedocs.io" "/en/stable/changelog.html",
32 |     },
33 |     python_requires=">=3.10,<4.0",
34 |     install_requires=dependencies,
35 |     extras_require={"dev": dev_dependencies, "transcript": transcript_dependencies},
36 |     packages=find_packages(),
37 |     include_package_data=True,
38 |     classifiers=[
39 |         "Development Status :: 5 - Production/Stable",
40 |         "License :: OSI Approved :: Apache Software License",
41 |         "Operating System :: OS Independent",
42 |         "Programming Language :: Python",
43 |         "Programming Language :: Python :: 3",
44 |         "Programming Language :: Python :: 3.10",
45 |         "Programming Language :: Python :: 3.11",
46 |         "Programming Language :: Python :: 3.12",
47 |         "Topic :: Software Development :: Libraries",
48 |         "Topic :: Software Development :: Libraries :: Python Modules",
49 |         "Topic :: Multimedia :: Video",
50 |     ],
51 |     test_suite="tests",
52 | )
53 | 


--------------------------------------------------------------------------------
/tests/test_base.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from pycaption.base import CaptionList, Caption
 4 | 
 5 | 
 6 | class TestCaption:
 7 |     def setup_method(self):
 8 |         self.caption = Caption(0, 999999999999, ['test'])
 9 | 
10 |     def test_format_start(self):
11 |         assert self.caption.format_start() == '00:00:00.000'
12 | 
13 |     def test_format_end(self):
14 |         assert self.caption.format_end() == '13:46:39.999'
15 | 
16 | 
class TestCaptionList:
    """Checks that list operations on a ``CaptionList`` keep type and layout."""

    def setup_method(self):
        self.layout_info = "My Layout"
        self.caps = CaptionList([1, 2, 3], layout_info=self.layout_info)

    def _assert_keeps_layout(self, result):
        # Shared check: the result is still a CaptionList with our layout info.
        assert isinstance(result, CaptionList)
        assert result.layout_info == self.layout_info

    def test_splice(self):
        """Slicing returns a CaptionList with the same layout info."""
        self._assert_keeps_layout(self.caps[1:])

    def test_mul(self):
        """``caps * n`` returns a CaptionList with the same layout info."""
        self._assert_keeps_layout(self.caps * 2)

    def test_rmul(self):
        """``n * caps`` returns a CaptionList with the same layout info."""
        self._assert_keeps_layout(2 * self.caps)

    def test_add_list_to_caption_list(self):
        """Adding a plain list returns a CaptionList with the same layout."""
        self._assert_keeps_layout(self.caps + [9, 8, 7])

    def test_add_two_caption_lists(self):
        """Adding CaptionLists works unless their layouts conflict."""
        # The right-hand side has no layout info.
        self._assert_keeps_layout(
            self.caps + CaptionList([4], layout_info=None)
        )

        # The right-hand side has the same layout info.
        self._assert_keeps_layout(
            self.caps + CaptionList([4], layout_info=self.layout_info)
        )

        # A different layout info on the right-hand side is rejected.
        with pytest.raises(ValueError):
            _ = self.caps + CaptionList([4], layout_info="Other Layout")
59 | 


--------------------------------------------------------------------------------
/tests/test_srt_conversion.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from pycaption import (
 4 |     DFXPReader, SAMIReader, SRTReader, SRTWriter, WebVTTReader,
 5 | )
 6 | 
 7 | from tests.mixins import SRTTestingMixIn
 8 | 
 9 | 
class TestDFXPtoSRT(SRTTestingMixIn):
    """Conversion tests from DFXP input to SRT output."""

    @classmethod
    def setup_class(cls):
        # pytest calls setup_class once per class and hands it the class
        # object, so it is declared as a classmethod. The reader and writer
        # are stored on the class and reached through ``self`` in the tests.
        cls.reader = DFXPReader()
        cls.writer = SRTWriter()

    def test_dfxp_to_srt_conversion(self, sample_srt, sample_dfxp):
        """Reading the DFXP sample and writing SRT matches the SRT sample."""
        caption_set = self.reader.read(sample_dfxp)
        results = self.writer.write(caption_set)

        assert isinstance(results, str)
        self.assert_srt_equals(sample_srt, results)

    def test_dfxp_empty_cue_to_srt(self, sample_srt_empty_cue_output,
                                   sample_dfxp_empty_cue):
        """The DFXP empty-cue sample converts to the expected SRT output."""
        caption_set = self.reader.read(sample_dfxp_empty_cue)
        results = self.writer.write(caption_set)

        self.assert_srt_equals(sample_srt_empty_cue_output, results)
28 | 
29 | 
class TestSAMItoSRT(SRTTestingMixIn):
    """Conversion test from SAMI input to SRT output."""

    def test_sami_to_srt_conversion(self, sample_srt, sample_sami):
        """Reading the SAMI sample and writing SRT matches the SRT sample."""
        srt_output = SRTWriter().write(SAMIReader().read(sample_sami))

        assert isinstance(srt_output, str)
        self.assert_srt_equals(sample_srt, srt_output)
37 | 
38 | 
class TestSRTtoSRT(SRTTestingMixIn):
    """Round-trip tests that read SRT and write it back as SRT."""

    @classmethod
    def setup_class(cls):
        # pytest calls setup_class once per class and hands it the class
        # object, so it is declared as a classmethod. The reader and writer
        # are stored on the class and reached through ``self`` in the tests.
        cls.reader = SRTReader()
        cls.writer = SRTWriter()

    def test_srt_to_srt_conversion(self, sample_srt):
        """Reading and re-writing the SRT sample reproduces it."""
        caption_set = self.reader.read(sample_srt)
        results = self.writer.write(caption_set)

        assert isinstance(results, str)
        self.assert_srt_equals(sample_srt, results)

    def test_multiple_lines_for_one_sentence(self, samples_srt_same_time):
        """Output for the same-time sample has 3 blocks, the first 4 lines long."""
        caption_set = self.reader.read(samples_srt_same_time)
        results = self.writer.write(caption_set)
        # Split on the single-digit index lines that separate the blocks.
        sentences = re.split(r"\n\d\n", results)

        assert 3 == len(sentences)
        assert 4 == len(sentences[0].splitlines())
58 | 
59 | 
class TestWebVTTtoSRT(SRTTestingMixIn):
    """Conversion test from WebVTT input to SRT output."""

    def test_webvtt_to_srt_conversion(self, sample_srt, sample_webvtt):
        """Reading the WebVTT sample and writing SRT matches the SRT sample."""
        srt_output = SRTWriter().write(WebVTTReader().read(sample_webvtt))

        assert isinstance(srt_output, str)
        self.assert_srt_equals(sample_srt, srt_output)
67 | 


--------------------------------------------------------------------------------
/tests/test_microdvd.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from pycaption import MicroDVDReader, CaptionReadNoCaptions
 4 | from pycaption.exceptions import CaptionReadSyntaxError, CaptionReadTimingError
 5 | from pycaption.base import DEFAULT_LANGUAGE_CODE
 6 | from tests.mixins import ReaderTestingMixIn
 7 | 
 8 | 
 9 | class TestMicroDVDReader(ReaderTestingMixIn):
10 |     def setup_class(self):
11 |         self.reader = MicroDVDReader()
12 | 
13 |     def test_positive_answer_for_detection(self, sample_microdvd):
14 |         super().assert_positive_answer_for_detection(sample_microdvd)
15 | 
16 |     def test_negative_answer_for_detection_dfxp(self, sample_dfxp):
17 |         super().assert_negative_answer_for_detection(sample_dfxp)
18 | 
19 |     def test_negative_answer_for_detection_sami(self, sample_sami):
20 |         super().assert_negative_answer_for_detection(sample_sami)
21 | 
22 |     def test_negative_answer_for_detection_scc_pop_on(self, sample_scc_pop_on):
23 |         super().assert_negative_answer_for_detection(sample_scc_pop_on)
24 | 
25 |     def test_negative_answer_for_detection_srt(self, sample_srt):
26 |         super().assert_negative_answer_for_detection(sample_srt)
27 | 
28 |     def test_negative_answer_for_detection_webvtt(self, sample_webvtt):
29 |         super().assert_negative_answer_for_detection(sample_webvtt)
30 | 
31 |     def test_caption_length(self, sample_microdvd):
32 |         captions = MicroDVDReader().read(sample_microdvd)
33 | 
34 |         assert 7 == len(captions.get_captions(DEFAULT_LANGUAGE_CODE))
35 | 
36 |     def test_proper_timestamps(self, sample_microdvd):
37 |         captions = MicroDVDReader().read(sample_microdvd)
38 |         paragraph = captions.get_captions(DEFAULT_LANGUAGE_CODE)[2]
39 | 
40 |         # due to lossy nature of microsec -> frame# we check that
41 |         # conversion is within a second of expected value
42 |         # (fyi: timestamps in examples/ and tests/fixtures/ differ)
43 |         assert abs(17350000 - paragraph.start) < 10 ** 6
44 |         assert abs(18752000 - paragraph.end) < 10 ** 6
45 | 
46 |     def test_empty_file(self, sample_microdvd_empty):
47 |         with pytest.raises(CaptionReadNoCaptions):
48 |             MicroDVDReader().read(sample_microdvd_empty)
49 | 
50 |     def test_invalid_format(self, sample_microdvd_invalid_format):
51 |         with pytest.raises(CaptionReadSyntaxError):
52 |             MicroDVDReader().read(sample_microdvd_invalid_format)
53 | 
54 |     def test_no_fps_provided(self, missing_fps_sample_microdvd):
55 |         with pytest.raises(CaptionReadTimingError):
56 |             MicroDVDReader().read(missing_fps_sample_microdvd)
57 | 


--------------------------------------------------------------------------------
/tests/test_scc_conversion.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from pycaption import (
 4 |     DFXPWriter,
 5 |     SCCReader,
 6 |     SCCWriter,
 7 |     SRTReader,
 8 |     SRTWriter,
 9 |     WebVTTWriter,
10 | )
11 | from tests.mixins import CaptionSetTestingMixIn
12 | 
# Timing tolerance, in microseconds (600 ms), passed to
# assert_captionset_almost_equals when comparing caption sets after the
# SRT -> SCC -> SRT round trip. This is quite fuzzy at the moment.
TOLERANCE_MICROSECONDS = 600 * 1000
15 | 
16 | 
class TestSRTtoSCCtoSRT(CaptionSetTestingMixIn):
    """Round-trip test: SRT -> SCC -> SRT."""

    def _test_srt_to_scc_to_srt_conversion(self, srt_captions):
        # Write the SRT captions as SCC, read that SCC back, write it as SRT
        # again and re-read it; both caption sets must agree within tolerance.
        original = SRTReader().read(srt_captions)
        scc_text = SCCWriter().write(original)
        from_scc = SCCReader().read(scc_text)
        srt_text = SRTWriter().write(from_scc)
        round_tripped = SRTReader().read(srt_text)

        self.assert_captionset_almost_equals(
            original, round_tripped, TOLERANCE_MICROSECONDS
        )

    def test_srt_to_scc_to_srt_conversion(self, sample_srt_ascii):
        """The ASCII SRT sample survives the round trip within tolerance."""
        self._test_srt_to_scc_to_srt_conversion(sample_srt_ascii)
30 | 
31 | 
# TODO: the unicode round-trip test below is disabled because it currently
# fails -- possibly a bug in SCCReader.
#    def test_srt_to_srt_unicode_conversion(self):
#        self._test_srt_to_scc_to_srt_conversion(SAMPLE_SRT_UNICODE)
35 | 
36 | 
class TestSCCtoDFXP:
    """Conversion tests from SCC input to DFXP output."""

    def test_scc_to_dfxp(
        self, sample_dfxp_from_scc_output, sample_scc_multiple_positioning
    ):
        """SCC with multiple positioning converts to the expected DFXP."""
        writer = DFXPWriter(relativize=False, fit_to_screen=False)
        captions = SCCReader().read(sample_scc_multiple_positioning)

        assert sample_dfxp_from_scc_output == writer.write(captions)

    def test_dfxp_is_valid_xml_when_scc_source_has_weird_italic_commands(
        self,
        sample_dfxp_with_properly_closing_spans_output,
        sample_scc_created_dfxp_with_wrongly_closing_spans,
    ):
        """The wrongly-closing-spans SCC sample yields the expected DFXP."""
        captions = SCCReader().read(
            sample_scc_created_dfxp_with_wrongly_closing_spans
        )
        produced = DFXPWriter().write(captions)

        assert produced == sample_dfxp_with_properly_closing_spans_output

    def test_dfxp_is_valid_xml_when_scc_source_has_ampersand_character(
        self, sample_dfxp_with_ampersand_character, sample_scc_with_ampersand_character
    ):
        """The ampersand SCC sample yields the expected DFXP."""
        captions = SCCReader().read(sample_scc_with_ampersand_character)
        produced = DFXPWriter().write(captions)

        assert produced == sample_dfxp_with_ampersand_character
66 | 
67 | 
class TestSCCToWebVTT:
    """Conversion test from SCC input to WebVTT output."""

    def test_webvtt_newlines_are_properly_rendered(
        self,
        sample_webvtt_from_scc_properly_writes_newlines_output,
        scc_that_generates_webvtt_with_proper_newlines,
    ):
        """The SCC sample converts to the expected WebVTT text."""
        captions = SCCReader().read(scc_that_generates_webvtt_with_proper_newlines)
        produced = WebVTTWriter().write(captions)

        assert produced == sample_webvtt_from_scc_properly_writes_newlines_output
78 | 


--------------------------------------------------------------------------------
/tests/test_srt.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from pycaption import SRTReader, CaptionReadNoCaptions
 4 | from tests.mixins import ReaderTestingMixIn
 5 | 
 6 | 
 7 | class TestSRTReader(ReaderTestingMixIn):
 8 |     def setup_class(self):
 9 |         self.reader = SRTReader()
10 | 
11 |     def test_positive_answer_for_detection(self, sample_srt):
12 |         super().assert_positive_answer_for_detection(sample_srt)
13 | 
14 |     def test_negative_answer_for_detection_dfxp(self, sample_dfxp):
15 |         super().assert_negative_answer_for_detection(sample_dfxp)
16 | 
17 |     def test_negative_answer_for_detection_microdvd(self, sample_microdvd):
18 |         super().assert_negative_answer_for_detection(sample_microdvd)
19 | 
20 |     def test_negative_answer_for_detection_sami(self, sample_sami):
21 |         super().assert_negative_answer_for_detection(sample_sami)
22 | 
23 |     def test_negative_answer_for_detection_scc_pop_on(self, sample_scc_pop_on):
24 |         super().assert_negative_answer_for_detection(sample_scc_pop_on)
25 | 
26 |     def test_negative_answer_for_detection_webvtt(self, sample_webvtt):
27 |         super().assert_negative_answer_for_detection(sample_webvtt)
28 | 
29 |     def test_caption_length(self, sample_srt):
30 |         captions = self.reader.read(sample_srt)
31 | 
32 |         assert 7 == len(captions.get_captions("en-US"))
33 | 
34 |     def test_proper_timestamps(self, sample_srt):
35 |         captions = self.reader.read(sample_srt)
36 |         third_paragraph = captions.get_captions("en-US")[2]
37 | 
38 |         assert 17000000 == third_paragraph.start
39 |         assert 18752000 == third_paragraph.end
40 | 
41 |     def test_numeric_captions(self, sample_srt_numeric):
42 |         captions = self.reader.read(sample_srt_numeric)
43 |         paragraphs = captions.get_captions("en-US")
44 | 
45 |         assert 7 == len(captions.get_captions("en-US"))
46 |         assert paragraphs[-3].get_text() == "NUMBER  IS  662-429-84-77."
47 |         assert paragraphs[-1].get_text() == "3"
48 | 
49 |     def test_empty_file(self, sample_srt_empty):
50 |         with pytest.raises(CaptionReadNoCaptions) as exc_info:
51 |             self.reader.read(sample_srt_empty)
52 |         assert exc_info.value.args[0] == 'empty caption file'
53 | 
54 |     def test_extra_empty_line(self, sample_srt_blank_lines):
55 |         captions = self.reader.read(sample_srt_blank_lines)
56 |         paragraphs = captions.get_captions("en-US")
57 | 
58 |         assert 2 == len(paragraphs)
59 |         assert '\n' not in paragraphs[0].get_text()
60 |         assert '\n' not in paragraphs[1].get_text()
61 | 
62 |     def test_extra_trailing_empty_line(self, sample_srt_trailing_blanks):
63 |         captions = self.reader.read(sample_srt_trailing_blanks)
64 |         paragraphs = captions.get_captions("en-US")
65 | 
66 |         assert 2 == len(paragraphs)
67 |         assert '\n' not in paragraphs[0].get_text()
68 |         assert '\n' not in paragraphs[1].get_text()
69 | 
70 |     def test_timestamps_without_micro(
71 |             self, sample_srt_timestamps_without_microseconds):
72 |         captions = self.reader.read(sample_srt_timestamps_without_microseconds)
73 |         first_paragraph = captions.get_captions("en-US")[0]
74 | 
75 |         assert 13000000 == first_paragraph.start
76 |         assert 16000000 == first_paragraph.end
77 | 


--------------------------------------------------------------------------------
/.github/workflows/unit_tests.yml:
--------------------------------------------------------------------------------
# Runs the test suite for each entry in the matrix, uploads the junit and
# coverage reports, and posts the outcome to Slack.
name: Unit Tests

# Triggers: pushes and pull requests targeting main, calls from other
# workflows (which must pass both Slack secrets), and manual dispatch.
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_call:
    secrets:
      SLACK_BOT_TOKEN:
        required: true
      SLACK_CHANNEL_ID:
        required: true
  workflow_dispatch:

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        # Each value is passed to run_tests.sh as "test_<value>".
        python-version: ["py310", "py311", "py312"]

    steps:
      - uses: actions/checkout@v2

      # continue-on-error keeps the job going so the report and Slack steps
      # below still run; the failure is re-raised by "Mark test failure".
      - name: Run Test
        id: tests
        run: |
          ./run_tests.sh test_${{ matrix.python-version }} 
        continue-on-error: true

      - name: Archive production artifacts
        uses: actions/upload-artifact@v4
        with:
          name: test-report-${{ matrix.python-version }}
          path: junit.xml

      - name: Archive code coverage results
        uses: actions/upload-artifact@v4
        with:
          name: code-coverage-report-${{ matrix.python-version }}
          path: coverage.xml

      # Reads the coverage percentage and the failed/skipped/passed counts
      # from coverage.xml and junit.xml with xpath and exports COVERAGE,
      # FAILED_AMOUNT and PASSED_AMOUNT through GITHUB_ENV (the skipped count
      # is only used to compute PASSED_AMOUNT).
      # The contains(...) expression expands to the literal word true or false,
      # which the shell runs as a command: tag refs are reported as main, any
      # other ref uses the PR head ref or the pushed ref. The final echo
      # strips the leading "refs/<kind>/" prefix from BRANCH.
      - name: Add context info to env
        run: |
          sudo apt-get install -y --no-install-recommends libxml-xpath-perl
          COVERAGE=`xpath -q -e "floor(/coverage/@line-rate * 100)" coverage.xml`
          FAILED_AMOUNT=`xpath -q -e "number(/testsuites/testsuite/@failures)" junit.xml`
          SKIPPED_AMOUNT=`xpath -q -e "number(/testsuites/testsuite/@skipped)" junit.xml`
          PASSED_AMOUNT=`xpath -q -e "/testsuites/testsuite/@tests - $SKIPPED_AMOUNT - $FAILED_AMOUNT" junit.xml`
          echo "COVERAGE=$COVERAGE" >> $GITHUB_ENV
          echo "FAILED_AMOUNT=$FAILED_AMOUNT" >> $GITHUB_ENV
          echo "PASSED_AMOUNT=$PASSED_AMOUNT" >> $GITHUB_ENV
          ${{ contains(github.ref, 'tags/') }} \
            && BRANCH='refs/heads/main' \
            || BRANCH=${{ github.head_ref || github.ref }}
          echo "BRANCH=${BRANCH#refs/*/}" >> $GITHUB_ENV

      # Slack notifications are skipped for pull requests coming from forks.
      - name: Notify if test FAILED
        uses: archive/github-actions-slack@v2.0.0
        with:
          slack-bot-user-oauth-access-token: ${{ secrets.SLACK_BOT_TOKEN }}
          slack-channel: ${{ secrets.SLACK_CHANNEL_ID }}
          slack-text: ":boom: *${{ env.FAILED_AMOUNT }}* Pycaption test(s) failed for Python *${{ matrix.python-version }}* on the *${{ env.BRANCH }}* branch (triggered by *${{ github.actor }}*)"
        if: steps.tests.outcome == 'failure' && !github.event.pull_request.head.repo.fork

      # Fail the job now that the reports were uploaded and Slack was notified.
      - name: Mark test failure
        run: exit 1
        if: steps.tests.outcome == 'failure'

      - name: Slack notify tests PASSED
        uses: archive/github-actions-slack@v2.0.0
        with:
          slack-bot-user-oauth-access-token: ${{ secrets.SLACK_BOT_TOKEN }}
          slack-channel: ${{ secrets.SLACK_CHANNEL_ID }}
          slack-text: ":rocket: All (*${{ env.PASSED_AMOUNT }}*) Pycaption tests passed for Python *${{ matrix.python-version }}* covering *${{ env.COVERAGE }}%* of code on the *${{ env.BRANCH }}* branch (triggered by *${{ github.actor }}*)"
        if: steps.tests.outcome == 'success' && !github.event.pull_request.head.repo.fork
80 | 


--------------------------------------------------------------------------------
/tests/fixtures/srt.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | 
@pytest.fixture(scope="session")
def sample_srt():
    """Return a seven-cue SRT document.

    Cue 2 contains music-note escape sequences and cue 7 contains
    markup-like characters (angle brackets and an ampersand).
    """
    return """1
00:00:09,209 --> 00:00:12,312
( clock ticking )

2
00:00:14,848 --> 00:00:17,000
MAN:
When we think
\u266a ...say bow, wow, \u266a

3
00:00:17,000 --> 00:00:18,752
we have this vision of Einstein

4
00:00:18,752 --> 00:00:20,887
as an old, wrinkly man
with white hair.

5
00:00:20,887 --> 00:00:26,760
MAN 2:
E equals m c-squared is
not about an old Einstein.

6
00:00:26,760 --> 00:00:32,200
MAN 2:
It's all about an eternal Einstein.

7
00:00:32,200 --> 00:00:36,200
<LAUGHING & WHOOPS!>
"""
 40 | 
 41 | 
@pytest.fixture(scope="session")
def sample_srt_ascii():
    """Return an eight-cue SRT document written with ASCII characters only."""
    return """1
00:00:09,209 --> 00:00:12,312
( clock ticking )

2
00:00:14,848 --> 00:00:17,000
MAN:
When we think
of "E equals m c-squared",

3
00:00:17,000 --> 00:00:18,752
we have this vision of Einstein

4
00:00:18,752 --> 00:00:20,887
as an old, wrinkly man
with white hair.

5
00:00:20,887 --> 00:00:26,760
MAN 2:
E equals m c-squared is
not about an old Einstein.

6
00:00:26,760 --> 00:00:32,200
MAN 2:
It's all about an eternal Einstein.

7
00:00:32,200 --> 00:00:34,400
<LAUGHING & WHOOPS!>

8
00:00:34,400 --> 00:00:38,400
some more text
"""
 82 | 
 83 | 
@pytest.fixture(scope="session")
def sample_srt_numeric():
    """Return seven SRT cues (indexed 35-41) with digit-heavy text.

    The last cue's text is the single digit ``3``, which looks like a cue
    index.
    """
    return """35
00:00:32,290 --> 00:00:32,890
TO  FIND  HIM.            IF

36
00:00:32,990 --> 00:00:33,590
YOU  HAVE  ANY  INFORMATION

37
00:00:33,690 --> 00:00:34,290
THAT  CAN  HELP,  CALL  THE

38
00:00:34,390 --> 00:00:35,020
STOPPERS  LINE.          THAT

39
00:00:35,120 --> 00:00:35,760
NUMBER  IS  662-429-84-77.

40
00:00:35,860 --> 00:00:36,360
STD  OUT

41
00:00:36,460 --> 00:02:11,500
3
"""
114 | 
115 | 
@pytest.fixture(scope="session")
def sample_srt_empty():
    """Return an SRT document that contains nothing but a newline."""
    return """
"""
120 | 
121 | 
@pytest.fixture(scope="session")
def sample_srt_blank_lines():
    """Return two SRT cues where the first has blank lines instead of text."""
    return """35
00:00:32,290 --> 00:00:32,890


36
00:00:32,990 --> 00:00:33,590
YOU  HAVE  ANY  INFORMATION

"""
133 | 
134 | 
@pytest.fixture(scope="session")
def sample_srt_trailing_blanks():
    """Return two SRT cues, each followed by more than one blank line."""
    return """35
00:00:32,290 --> 00:00:32,890
HELP  I  SAY


36
00:00:32,990 --> 00:00:33,590
YOU  HAVE  ANY  INFORMATION



"""
149 | 
150 | 
@pytest.fixture(scope="session")
def samples_srt_same_time():
    """Return five SRT cues where cues 1-2 and cues 4-5 share identical timings."""
    return """1
00:00:05,213 --> 00:00:10,552
SO NO ONE TOLD YOU

2
00:00:05,213 --> 00:00:10,552
LIFE WAS GONNA BE THIS WAY

3
00:00:10,566 --> 00:00:10,580
YOUR JOB IS A JOKE, YOUR ARE BROKE

4
00:00:10,594 --> 00:00:10,600
IT IS LIKE YOU ARE ALWAYS STUCK

5
00:00:10,594 --> 00:00:10,600
IN A SECOND GEAR
"""
173 | 
174 | 
@pytest.fixture(scope="session")
def sample_srt_empty_cue_output():
    """Return the single-cue SRT text expected from the empty-cue DFXP sample."""
    return """\
1
00:00:01,209 --> 00:00:02,312
abc
"""
182 | 
183 | 
@pytest.fixture(scope="session")
def sample_srt_timestamps_without_microseconds():
    """Return two SRT cues whose timestamps have no fractional part."""
    return """\
1
00:00:13 --> 00:00:16
Guard this envelope.
If anything happens
to me

2
00:00:16 --> 00:00:18
see that it reaches
the hands of Mr
Sherlock Holmes
"""
199 | 


--------------------------------------------------------------------------------
/pycaption/microdvd.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from copy import deepcopy
  3 | 
  4 | from .base import (
  5 |     BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode,
  6 |     DEFAULT_LANGUAGE_CODE,
  7 | )
  8 | from .exceptions import (
  9 |     CaptionReadNoCaptions, CaptionReadSyntaxError, CaptionReadTimingError,
 10 |     InvalidInputError,
 11 | )
 12 | 
 13 | 
 14 | class MicroDVDReader(BaseReader):
 15 |     def detect(self, content):
 16 |         return re.match(r"{\d+}{\d+}", content) is not None
 17 | 
 18 |     def read(self, content, lang=DEFAULT_LANGUAGE_CODE):
 19 |         if not isinstance(content, str):
 20 |             raise InvalidInputError('The content is not a unicode string.')
 21 | 
 22 |         lines = content.splitlines()
 23 |         captions = CaptionList()
 24 |         fps = 25.0
 25 |         for line in lines:
 26 |             if not line:
 27 |                 continue
 28 | 
 29 |             m = re.match(r"{(\d+)}{(\d+)}(.*)", line)
 30 |             if not m:
 31 |                 raise CaptionReadSyntaxError(
 32 |                     "Line does not match expected format")
 33 | 
 34 |             start, end, txt = m.groups()
 35 | 
 36 |             if start == '0' and end == '0':
 37 |                 try:
 38 |                     fps = float(txt)
 39 |                     continue
 40 |                 except ValueError:
 41 |                     raise CaptionReadTimingError(
 42 |                         'FPS information is not provided')
 43 | 
 44 |             caption_start = self._framestomicro(int(start), fps)
 45 |             caption_end = self._framestomicro(int(end), fps)
 46 |             nodes = []
 47 | 
 48 |             for line in txt.split('|'):
 49 |                 # skip extra blank lines
 50 |                 if line != '':
 51 |                     nodes.append(CaptionNode.create_text(line))
 52 |                     nodes.append(CaptionNode.create_break())
 53 | 
 54 |             # remove last line break from end of caption list
 55 |             if len(nodes):
 56 |                 nodes.pop()
 57 | 
 58 |                 caption = Caption(caption_start, caption_end, nodes)
 59 |                 captions.append(caption)
 60 | 
 61 |         caption_set = CaptionSet({lang: captions})
 62 |         caption_set.set_captions(lang, captions)
 63 | 
 64 |         if caption_set.is_empty():
 65 |             raise CaptionReadNoCaptions("Empty caption file")
 66 | 
 67 |         return caption_set
 68 | 
 69 |     def _framestomicro(self, framenum, fps=25.0):
 70 |         return int(framenum / fps * (10 ** 6))
 71 | 
 72 | 
 73 | class MicroDVDWriter(BaseWriter):
 74 |     def write(self, caption_set):
 75 |         caption_set = deepcopy(caption_set)
 76 | 
 77 |         captions = []
 78 | 
 79 |         for lang in caption_set.get_languages():
 80 |             captions.append(
 81 |                 self._recreate_lang(caption_set.get_captions(lang))
 82 |             )
 83 | 
 84 |         return ''.join(captions)
 85 | 
 86 |     def _microtoframes(self, micro, fps=25.0):
 87 |         return int(micro * fps / (10 ** 6))
 88 | 
 89 |     def _recreate_lang(self, captions):
 90 |         sub = ''
 91 | 
 92 |         for caption in captions:
 93 |             start = self._microtoframes(caption.start)
 94 |             end = self._microtoframes(caption.end)
 95 |             sub += f'{{{start}}}{{{end}}}'
 96 | 
 97 |             new_content = ''
 98 |             for node in caption.nodes:
 99 |                 new_content = self._recreate_line(new_content, node)
100 | 
101 |             # Eliminate excessive line breaks
102 |             new_content = new_content.strip() + '\n'
103 |             while '\n\n' in new_content:
104 |                 new_content = new_content.replace('\n\n', '\n')
105 |             # Break unnecessary on last line
106 |             while '|\n' in new_content:
107 |                 new_content = new_content.replace('|\n', '\n')
108 | 
109 |             sub += new_content
110 | 
111 |         return sub
112 | 
113 |     def _recreate_line(self, sub, line):
114 |         if line.type_ == CaptionNode.TEXT:
115 |             return sub + line.content
116 |         elif line.type_ == CaptionNode.BREAK:
117 |             return sub + '|'
118 |         else:
119 |             return sub
120 | 


--------------------------------------------------------------------------------
/tests/test_sami_conversion.py:
--------------------------------------------------------------------------------
  1 | from pycaption import (
  2 |     DFXPReader, SAMIReader, SAMIWriter, SRTReader, WebVTTReader,
  3 | )
  4 | 
  5 | from .mixins import SAMITestingMixIn
  6 | 
  7 | # Arbitrary values used to test relativization
  8 | VIDEO_WIDTH = 640
  9 | VIDEO_HEIGHT = 360
 10 | 
 11 | 
 12 | class TestDFXPtoSAMI(SAMITestingMixIn):
 13 |     def setup_method(self):
 14 |         self.reader = DFXPReader()
 15 |         self.writer = SAMIWriter()
 16 | 
 17 |     def test_dfxp_to_sami_conversion(self, sample_sami, sample_dfxp):
 18 |         caption_set = self.reader.read(sample_dfxp)
 19 |         result = self.writer.write(caption_set)
 20 | 
 21 |         assert isinstance(result, str)
 22 |         self.assert_sami_captions_equal(sample_sami, result)
 23 | 
 24 |     def test_dfxp_to_sami_with_nested_spans(
 25 |             self, sample_sami_from_dfxp_with_nested_spans,
 26 |             sample_dfxp_with_nested_spans):
 27 |         caption_set = self.reader.read(sample_dfxp_with_nested_spans)
 28 |         result = self.writer.write(caption_set)
 29 | 
 30 |         assert isinstance(result, str)
 31 |         self.assert_sami_captions_equal(sample_sami_from_dfxp_with_nested_spans,
 32 |                                         result)
 33 | 
 34 |     def test_dfxp_to_sami_with_margins(
 35 |             self, sample_dfxp_from_sami_with_margins):
 36 |         caption_set = self.reader.read(sample_dfxp_from_sami_with_margins)
 37 |         result = SAMIWriter(video_width=VIDEO_WIDTH,
 38 |                             video_height=VIDEO_HEIGHT).write(caption_set)
 39 |         margins = ["margin-right: 6.04%;",
 40 |                    "margin-bottom: 0%;",
 41 |                    "margin-top: 0%;",
 42 |                    "margin-left: 6.04%;"]
 43 | 
 44 |         assert all(margin in result for margin in margins)
 45 | 
 46 |     def test_dfxp_empty_cue_to_sami(self, sample_sami_empty_cue_output,
 47 |                                     sample_dfxp_empty_cue):
 48 |         caption_set = self.reader.read(sample_dfxp_empty_cue)
 49 |         result = self.writer.write(caption_set)
 50 | 
 51 |         self.assert_sami_captions_equal(sample_sami_empty_cue_output, result)
 52 | 
 53 | 
 54 | class TestSRTtoSAMI(SAMITestingMixIn):
 55 |     def test_srt_to_sami_conversion(self, sample_sami, sample_srt):
 56 |         caption_set = SRTReader().read(sample_srt)
 57 |         result = SAMIWriter().write(caption_set)
 58 | 
 59 |         assert isinstance(result, str)
 60 |         self.assert_sami_captions_equal(sample_sami, result)
 61 | 
 62 | 
 63 | class TestSAMItoSAMI(SAMITestingMixIn):
 64 |     def setup_method(self):
 65 |         self.reader = SAMIReader()
 66 |         self.writer = SAMIWriter()
 67 | 
 68 |     def test_sami_to_sami_conversion(self, sample_sami):
 69 |         caption_set = self.reader.read(sample_sami)
 70 |         result = SAMIWriter(relativize=False,
 71 |                             fit_to_screen=False).write(caption_set)
 72 | 
 73 |         assert isinstance(result, str)
 74 |         self.assert_sami_captions_equal(sample_sami, result)
 75 | 
 76 |     def test_sami_with_multi_lang(self, sample_sami_with_separate_multi_lang):
 77 |         caption_set = self.reader.read(sample_sami_with_separate_multi_lang)
 78 |         result = self.writer.write(caption_set)
 79 | 
 80 |         assert isinstance(result, str)
 81 |         self.assert_sami_captions_equal(sample_sami_with_separate_multi_lang,
 82 |                                         result)
 83 | 
 84 |     def test_is_relativized(self, sample_sami_partial_margins_relativized,
 85 |                             sample_sami_partial_margins):
 86 |         # Absolute positioning settings (e.g. px) are converted to percentages
 87 |         caption_set = self.reader.read(sample_sami_partial_margins)
 88 |         result = SAMIWriter(
 89 |             video_width=VIDEO_WIDTH, video_height=VIDEO_HEIGHT
 90 |         ).write(caption_set)
 91 | 
 92 |         self.assert_sami_captions_equal(sample_sami_partial_margins_relativized,
 93 |                                         result)
 94 | 
 95 |     def test_missing_language_conversion(self, sample_sami_with_lang,
 96 |                                          sample_sami_no_lang):
 97 |         caption_set = self.reader.read(sample_sami_no_lang)
 98 |         result = self.writer.write(caption_set)
 99 | 
100 |         assert isinstance(result, str)
101 |         self.assert_sami_captions_equal(sample_sami_with_lang, result)
102 |         assert "lang: und;" in result
103 | 
104 | 
class TestWebVTTtoSAMI(SAMITestingMixIn):
    """Conversion test from WebVTT input to SAMI output."""

    def test_webvtt_to_sami_conversion(self, sample_sami, sample_webvtt):
        """Reading the WebVTT sample and writing SAMI matches the SAMI sample."""
        sami = SAMIWriter().write(WebVTTReader().read(sample_webvtt))

        assert isinstance(sami, str)
        self.assert_sami_captions_equal(sample_sami, sami)
112 | 


--------------------------------------------------------------------------------
/tests/test_webvtt_conversion.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | from pycaption import (
  4 |     SAMIReader, SRTReader, WebVTTReader, WebVTTWriter, DFXPWriter,
  5 |     MicroDVDWriter,
  6 | )
  7 | 
  8 | from tests.mixins import (
  9 |     WebVTTTestingMixIn, DFXPTestingMixIn, MicroDVDTestingMixIn,
 10 | )
 11 | 
 12 | 
 13 | class TestSAMItoWebVTT(WebVTTTestingMixIn):
 14 |     def test_conversion(self, sample_webvtt_from_sami, sample_sami):
 15 |         caption_set = SAMIReader().read(sample_sami)
 16 |         results = WebVTTWriter(
 17 |             video_width=640, video_height=360).write(caption_set)
 18 | 
 19 |         assert isinstance(results, str)
 20 |         self.assert_webvtt_equals(sample_webvtt_from_sami, results)
 21 | 
 22 |     def test_style_tags_conversion(self, sample_webvtt_from_sami_with_style,
 23 |                                    sample_sami_with_style_tags):
 24 |         caption_set = SAMIReader().read(sample_sami_with_style_tags)
 25 |         results = WebVTTWriter(
 26 |             video_width=640, video_height=360).write(caption_set)
 27 | 
 28 |         assert isinstance(results, str)
 29 |         self.assert_webvtt_equals(sample_webvtt_from_sami_with_style, results)
 30 | 
 31 |     def test_css_inline_style_conversion(
 32 |             self, sample_webvtt_from_sami_with_style,
 33 |             sample_sami_with_css_inline_style):
 34 |         caption_set = SAMIReader().read(sample_sami_with_css_inline_style)
 35 |         results = WebVTTWriter(
 36 |             video_width=640, video_height=360).write(caption_set)
 37 | 
 38 |         assert isinstance(results, str)
 39 |         self.assert_webvtt_equals(sample_webvtt_from_sami_with_style, results)
 40 | 
 41 |     def test_css_id_style_conversion(
 42 |             self, sample_webvtt_from_sami_with_id_style,
 43 |             sample_sami_with_css_id_style):
 44 |         caption_set = SAMIReader().read(sample_sami_with_css_id_style)
 45 |         results = WebVTTWriter(
 46 |             video_width=640, video_height=360).write(caption_set)
 47 | 
 48 |         assert isinstance(results, str)
 49 |         self.assert_webvtt_equals(sample_webvtt_from_sami_with_id_style,
 50 |                                   results)
 51 | 
 52 | 
 53 | class TestSRTtoWebVTT(WebVTTTestingMixIn):
 54 |     def test_srt_to_webvtt_conversion(self, sample_webvtt_from_srt, sample_srt):
 55 |         caption_set = SRTReader().read(sample_srt)
 56 |         results = WebVTTWriter().write(caption_set)
 57 | 
 58 |         assert isinstance(results, str)
 59 |         self.assert_webvtt_equals(sample_webvtt_from_srt, results)
 60 | 
 61 | 
 62 | class TestWebVTTtoWebVTT(WebVTTTestingMixIn):
 63 |     def test_webvtt_to_webvtt_conversion(self, sample_webvtt_from_webvtt,
 64 |                                          sample_webvtt):
 65 |         caption_set = WebVTTReader().read(sample_webvtt)
 66 |         results = WebVTTWriter().write(caption_set)
 67 | 
 68 |         assert isinstance(results, str)
 69 |         self.assert_webvtt_equals(sample_webvtt_from_webvtt, results)
 70 | 
 71 |     def test_cue_settings_are_kept(self, sample_webvtt_with_cue_settings):
 72 |         caption_set = WebVTTReader().read(sample_webvtt_with_cue_settings)
 73 | 
 74 |         webvtt = WebVTTWriter().write(caption_set)
 75 | 
 76 |         assert sample_webvtt_with_cue_settings == webvtt
 77 | 
 78 |     def test_positioning_is_kept(self,
 79 |                                  sample_webvtt_keeps_positioning):
 80 |         caption_set = WebVTTReader().read(
 81 |             sample_webvtt_keeps_positioning)
 82 |         results = WebVTTWriter().write(caption_set)
 83 | 
 84 |         assert sample_webvtt_keeps_positioning == results
 85 | 
 86 |     def test_output_timestamps(self, sample_webvtt_timestamps):
 87 |         expected_timestamp_line_pattern = re.compile(
 88 |             r'^(\d{2,}):(\d{2})(:\d{2})?\.(\d{3}) '
 89 |             r'--> (\d{2,}):(\d{2})(:\d{2})?\.(\d{3})')
 90 | 
 91 |         caption_set = WebVTTReader().read(sample_webvtt_timestamps)
 92 |         results = WebVTTWriter().write(caption_set).splitlines()
 93 | 
 94 |         assert re.match(expected_timestamp_line_pattern, results[2])
 95 |         assert re.match(expected_timestamp_line_pattern, results[5])
 96 | 
 97 | #     # TODO: Write a test that includes a WebVTT file with style tags
 98 | #     # That will fail because the styles used in the cues are not tracked.
 99 | 
100 | 
class TestWebVTTtoDFXP(DFXPTestingMixIn):
    """Conversion of WebVTT input to DFXP output."""

    def test_conversion(self, sample_dfxp, sample_webvtt):
        """The WebVTT sample written as DFXP equals the DFXP sample when
        styling and spans are ignored.
        """
        parsed = WebVTTReader().read(sample_webvtt)
        output = DFXPWriter().write(parsed)

        assert isinstance(output, str)
        self.assert_dfxp_equals(
            sample_dfxp, output, ignore_styling=True, ignore_spans=True)
110 | 
111 | 
class TestWebVTTtoMicroDVD(MicroDVDTestingMixIn):
    """Conversion of WebVTT input to MicroDVD output."""

    def test_webvtt_to_microdvd_conversion(self, sample_microdvd,
                                           sample_webvtt):
        """The WebVTT sample written as MicroDVD equals the MicroDVD sample."""
        parsed = WebVTTReader().read(sample_webvtt)
        output = MicroDVDWriter().write(parsed)

        assert isinstance(output, str)
        self.assert_microdvd_equals(sample_microdvd, output)
120 | 


--------------------------------------------------------------------------------
/tests/test_geometry.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from pycaption import CaptionReadSyntaxError
  4 | from pycaption.geometry import Size, Point, Stretch, Padding, UnitEnum, Layout
  5 | 
  6 | 
  7 | class TestIsValidGeometryObject:
  8 |     def test_size_is_valid(self):
  9 |         with pytest.raises(TypeError):
 10 |             Size()
 11 | 
 12 |         with pytest.raises(ValueError):
 13 |             Size(None, None)
 14 | 
 15 |     def test_point_is_valid(self):
 16 |         with pytest.raises(TypeError):
 17 |             Point()
 18 | 
 19 |         with pytest.raises(ValueError):
 20 |             Point(None, None)
 21 | 
 22 |     def test_stretch_is_valid(self):
 23 |         with pytest.raises(TypeError):
 24 |             Stretch()
 25 | 
 26 |         with pytest.raises(ValueError):
 27 |             Stretch(None, None)
 28 | 
 29 | 
 30 | class TestIsRelative:
 31 |     def test_size_is_relative(self):
 32 |         size_px = Size(30, UnitEnum.PIXEL)
 33 |         size_percent = Size(30, UnitEnum.PERCENT)
 34 | 
 35 |         assert not size_px.is_relative()
 36 |         assert size_percent.is_relative()
 37 | 
 38 |     def test_point_is_relative(self):
 39 |         size_px = Size(30, UnitEnum.PIXEL)
 40 |         size_px2 = Size(30, UnitEnum.PIXEL)
 41 | 
 42 |         size_percent = Size(30, UnitEnum.PERCENT)
 43 |         size_percent2 = Size(30, UnitEnum.PERCENT)
 44 | 
 45 |         point_abs = Point(size_px, size_px2)
 46 |         point_mix = Point(size_percent, size_px)
 47 |         point_rel = Point(size_percent, size_percent2)
 48 | 
 49 |         assert not point_abs.is_relative()
 50 |         assert not point_mix.is_relative()
 51 |         assert point_rel.is_relative()
 52 | 
 53 |     def test_stretch_is_relative(self):
 54 |         size_px = Size(30, UnitEnum.PIXEL)
 55 |         size_px2 = Size(30, UnitEnum.PIXEL)
 56 | 
 57 |         size_percent = Size(30, UnitEnum.PERCENT)
 58 |         size_percent2 = Size(30, UnitEnum.PERCENT)
 59 | 
 60 |         stretch_abs = Stretch(size_px, size_px2)
 61 |         stretch_mix = Stretch(size_percent, size_px)
 62 |         stretch_rel = Stretch(size_percent, size_percent2)
 63 | 
 64 |         assert not stretch_abs.is_relative()
 65 |         assert not stretch_mix.is_relative()
 66 |         assert stretch_rel.is_relative()
 67 | 
 68 |     def test_padding_is_relative(self):
 69 |         size_px = Size(30, UnitEnum.PIXEL)
 70 |         size_px2 = Size(30, UnitEnum.PIXEL)
 71 |         size_px3 = Size(30, UnitEnum.PIXEL)
 72 |         size_px4 = Size(30, UnitEnum.PIXEL)
 73 | 
 74 |         size_percent = Size(30, UnitEnum.PERCENT)
 75 |         size_percent2 = Size(30, UnitEnum.PERCENT)
 76 |         size_percent3 = Size(30, UnitEnum.PERCENT)
 77 |         size_percent4 = Size(30, UnitEnum.PERCENT)
 78 | 
 79 |         padding_abs = Padding(size_px, size_px2, size_px3, size_px4)
 80 |         padding_mix = Padding(size_px, size_px2, size_px3, size_percent)
 81 |         padding_rel = Padding(
 82 |             size_percent, size_percent2, size_percent3, size_percent4)
 83 | 
 84 |         assert not padding_abs.is_relative()
 85 |         assert not padding_mix.is_relative()
 86 |         assert padding_rel.is_relative()
 87 | 
 88 |     def test_layout_is_relative(self):
 89 |         empty_layout = Layout()
 90 | 
 91 |         size_px = Size(30, UnitEnum.PIXEL)
 92 |         size_px2 = Size(30, UnitEnum.PIXEL)
 93 | 
 94 |         size_percent = Size(30, UnitEnum.PERCENT)
 95 |         size_percent2 = Size(30, UnitEnum.PERCENT)
 96 | 
 97 |         point_abs = Point(size_px, size_px2)
 98 |         point_rel = Point(size_percent, size_percent2)
 99 | 
100 |         stretch_abs = Stretch(size_px, size_px2)
101 |         stretch_rel = Stretch(size_percent, size_percent2)
102 | 
103 |         layout_abs = Layout(
104 |             origin=point_abs,
105 |             extent=stretch_abs,
106 |             padding=None
107 |         )
108 | 
109 |         layout_mix = Layout(
110 |             origin=point_abs,
111 |             extent=stretch_rel,
112 |             padding=None
113 |         )
114 | 
115 |         layout_rel = Layout(
116 |             origin=point_rel,
117 |             extent=stretch_rel,
118 |             padding=None
119 |         )
120 | 
121 |         assert empty_layout.is_relative()
122 |         assert not layout_abs.is_relative()
123 |         assert not layout_mix.is_relative()
124 |         assert layout_rel.is_relative()
125 | 
126 | 
class TestSize:
    """Parsing of size strings with ``Size.from_string``."""

    @pytest.mark.parametrize('string, value, unit', [
        ('1px', 1.0, UnitEnum.PIXEL),
        ('2.3em', 2.3, UnitEnum.EM),
        ('12.34%', 12.34, UnitEnum.PERCENT),
        ('1.234c', 1.234, UnitEnum.CELL),
        ('10pt', 10.0, UnitEnum.PT),
        ('0', 0.0, UnitEnum.PIXEL),
    ])
    def test_valid_size_from_string(self, string, value, unit):
        """A well-formed size string yields the expected value and unit."""
        parsed = Size.from_string(string)

        assert parsed.value == value
        assert parsed.unit == unit

    @pytest.mark.parametrize('string', ['10', '11,1px', '12xx', '%', 'o1pt'])
    def test_invalid_size_from_string(self, string):
        """A malformed size string raises CaptionReadSyntaxError whose
        message starts by naming the offending string.
        """
        with pytest.raises(CaptionReadSyntaxError) as caught:
            Size.from_string(string)

        message = caught.value.args[0]
        assert message.startswith(f"Invalid size: {string}.")


--------------------------------------------------------------------------------
/pycaption/scc/state_machines.py:
--------------------------------------------------------------------------------
  1 | from ..exceptions import CaptionReadSyntaxError
  2 | 
  3 | 
  4 | class _PositioningTracker:
  5 |     """Helps determine the positioning of a node, having kept track of
  6 |     positioning-related commands.
  7 |     """
  8 | 
  9 |     def __init__(self, positioning=None):
 10 |         """
 11 |         :param positioning: positioning information (row, column)
 12 |         :type positioning: tuple[int]
 13 |         """
 14 |         self._positions = [positioning]
 15 |         self._break_required = False
 16 |         self._repositioning_required = False
 17 |         # Since the actual column is not applied when encountering a line break
 18 |         # this attribute is used to store it and determine by comparison if the
 19 |         # next positioning is actually a Tab Offset
 20 |         self._last_column = None
 21 | 
 22 |     def update_positioning(self, positioning):
 23 |         """Being notified of a position change, updates the internal state,
 24 |         to as to be able to tell if it was a trivial change (a simple line
 25 |         break) or not.
 26 | 
 27 |         :type positioning: tuple[int]
 28 |         :param positioning: a tuple (row, col)
 29 |         """
 30 |         current = self._positions[-1]
 31 | 
 32 |         if not current:
 33 |             if positioning:
 34 |                 # Set the positioning for the first time
 35 |                 self._positions = [positioning]
 36 |             return
 37 | 
 38 |         row, col = current
 39 |         if self._break_required:
 40 |             col = self._last_column
 41 |         new_row, new_col = positioning
 42 |         is_tab_offset = new_row == row and col + 1 <= new_col <= col + 3
 43 |         # One line below will be treated as line break, not repositioning
 44 |         if new_row == row + 1:
 45 |             self._positions.append((new_row, col))
 46 |             self._break_required = True
 47 |             self._last_column = new_col
 48 |         # Tab offsets after line breaks will be ignored to avoid repositioning
 49 |         elif self._break_required and is_tab_offset:
 50 |             return
 51 |         # force not to reposition on the same coordinates
 52 |         elif positioning == current:
 53 |             return
 54 |         else:
 55 |             # Reset the "current" position altogether.
 56 |             self._positions = [positioning]
 57 |             # Tab offsets are not interpreted as repositioning, but adjustments
 58 |             # to the previous PAC command
 59 |             if not is_tab_offset:
 60 |                 self._repositioning_required = True
 61 | 
 62 |     def get_current_position(self):
 63 |         """Returns the current usable position
 64 | 
 65 |         :rtype: tuple[int]
 66 | 
 67 |         :raise: CaptionReadSyntaxError
 68 |         """
 69 |         if not any(self._positions):
 70 |             raise CaptionReadSyntaxError("No Preamble Address Code [PAC] was provided")
 71 |         else:
 72 |             return self._positions[0]
 73 | 
 74 |     def is_repositioning_required(self):
 75 |         """Determines whether the current positioning has changed non-trivially
 76 | 
 77 |         Trivial would be mean that a line break should suffice.
 78 |         :rtype: bool
 79 |         """
 80 |         return self._repositioning_required
 81 | 
 82 |     def acknowledge_position_changed(self):
 83 |         """Acknowledge the position tracer that the position was changed"""
 84 |         self._repositioning_required = False
 85 | 
 86 |     def is_linebreak_required(self):
 87 |         """If the current position is simply one line below the previous.
 88 |         :rtype: bool
 89 |         """
 90 |         return self._break_required
 91 | 
 92 |     def acknowledge_linebreak_consumed(self):
 93 |         """Call to acknowledge that the line required was consumed"""
 94 |         self._break_required = False
 95 | 
 96 | 
 97 | class DefaultProvidingPositionTracker(_PositioningTracker):
 98 |     """A _PositioningTracker that provides if needed a default value (14, 0), or
 99 |     uses the last positioning value set anywhere in the document
100 |     """
101 | 
102 |     default = (14, 0)
103 | 
104 |     def __init__(self, positioning=None, default=None):
105 |         """
106 |         :type positioning: tuple[int]
107 |         :param positioning: a tuple of ints (row, column)
108 | 
109 |         :type default: tuple[int]
110 |         :param default: a tuple of ints (row, column) to use as fallback
111 |         """
112 |         super().__init__(positioning)
113 | 
114 |         if default:
115 |             self.default = default
116 | 
117 |     def get_current_position(self):
118 |         """Returns the currently tracked positioning, the last positioning that
119 |         was set (anywhere), or the default it was initiated with
120 | 
121 |         :rtype: tuple[int]
122 |         """
123 |         try:
124 |             return super().get_current_position()
125 |         except CaptionReadSyntaxError:
126 |             return self.default
127 | 
128 |     def update_positioning(self, positioning):
129 |         """If called, sets this positioning as the default, then delegates
130 |         to the super class.
131 | 
132 |         :param positioning: a tuple of ints (row, col)
133 |         :type positioning: tuple[int]
134 |         """
135 |         if positioning:
136 |             self.default = positioning
137 | 
138 |         super().update_positioning(positioning)
139 | 


--------------------------------------------------------------------------------
/pycaption/srt.py:
--------------------------------------------------------------------------------
  1 | from copy import deepcopy
  2 | 
  3 | from .base import (
  4 |     BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode,
  5 | )
  6 | from .exceptions import CaptionReadNoCaptions, InvalidInputError
  7 | 
  8 | 
  9 | class SRTReader(BaseReader):
 10 |     def detect(self, content):
 11 |         lines = content.splitlines()
 12 |         if lines[0].isdigit() and '-->' in lines[1]:
 13 |             return True
 14 |         else:
 15 |             return False
 16 | 
 17 |     def read(self, content, lang='en-US'):
 18 |         if not isinstance(content, str):
 19 |             raise InvalidInputError('The content is not a unicode string.')
 20 | 
 21 |         lines = content.splitlines()
 22 |         start_line = 0
 23 |         captions = CaptionList()
 24 | 
 25 |         while start_line < len(lines):
 26 |             if not lines[start_line].isdigit():
 27 |                 break
 28 | 
 29 |             end_line = self._find_text_line(start_line, lines)
 30 | 
 31 |             timing = lines[start_line + 1].split('-->')
 32 |             start = self._srttomicro(timing[0].strip(' \r\n'))
 33 |             end = self._srttomicro(timing[1].strip(' \r\n'))
 34 | 
 35 |             nodes = []
 36 | 
 37 |             for line in lines[start_line + 2:end_line - 1]:
 38 |                 # skip extra blank lines
 39 |                 if not nodes or line != '':
 40 |                     nodes.append(CaptionNode.create_text(line))
 41 |                     nodes.append(CaptionNode.create_break())
 42 | 
 43 |             if len(nodes):
 44 |                 # remove last line break from end of caption list
 45 |                 nodes.pop()
 46 |                 caption = Caption(start, end, nodes)
 47 |                 captions.append(caption)
 48 | 
 49 |             start_line = end_line
 50 | 
 51 |         caption_set = CaptionSet({lang: captions})
 52 | 
 53 |         if caption_set.is_empty():
 54 |             raise CaptionReadNoCaptions("empty caption file")
 55 | 
 56 |         return caption_set
 57 | 
 58 |     def _srttomicro(self, stamp):
 59 |         timesplit = stamp.split(':')
 60 |         if ',' not in timesplit[2]:
 61 |             timesplit[2] += ',000'
 62 |         secsplit = timesplit[2].split(',')
 63 |         microseconds = (int(timesplit[0]) * 3600000000
 64 |                         + int(timesplit[1]) * 60000000
 65 |                         + int(secsplit[0]) * 1000000
 66 |                         + int(secsplit[1]) * 1000)
 67 | 
 68 |         return microseconds
 69 | 
 70 |     def _find_text_line(self, start_line, lines):
 71 |         end_line = start_line
 72 | 
 73 |         found = False
 74 |         while end_line < len(lines):
 75 |             if lines[end_line].strip() == "":
 76 |                 found = True
 77 |             elif found is True:
 78 |                 end_line -= 1
 79 |                 break
 80 |             end_line += 1
 81 | 
 82 |         return end_line + 1
 83 | 
 84 | 
 85 | class SRTWriter(BaseWriter):
 86 |     def write(self, caption_set):
 87 |         caption_set = deepcopy(caption_set)
 88 | 
 89 |         srt_captions = []
 90 | 
 91 |         for lang in caption_set.get_languages():
 92 |             srt_captions.append(
 93 |                 self._recreate_lang(caption_set.get_captions(lang))
 94 |             )
 95 | 
 96 |         caption_content = 'MULTI-LANGUAGE SRT\n'.join(srt_captions)
 97 |         return caption_content
 98 | 
 99 |     def _recreate_lang(self, captions):
100 |         # Merge caption's that are on the exact same timestamp otherwise some
101 |         # players will play them in reversed order, libass specifically which is
102 |         # used quite a lot, including VLC and MPV.
103 | 
104 |         merged_captions = [captions[0]] if captions else []
105 | 
106 |         for caption in captions[1:]:
107 |             # Merge if the timestamp is the same as last caption
108 |             if (caption.start, caption.end) == (
109 |                     merged_captions[-1].start, merged_captions[-1].end):
110 |                 merged_captions[-1] = Caption(
111 |                     start=caption.start,
112 |                     end=caption.end,
113 |                     nodes=(merged_captions[-1].nodes
114 |                            + [CaptionNode.create_break()]
115 |                            + caption.nodes))
116 |             else:
117 |                 # Different timestamp, end of merging, append new caption
118 |                 merged_captions.append(caption)
119 |         captions = merged_captions
120 | 
121 |         srt = ''
122 |         count = 1
123 | 
124 |         for caption in captions:
125 |             srt += f'{count}\n'
126 | 
127 |             start = caption.format_start(msec_separator=',')
128 |             end = caption.format_end(msec_separator=',')
129 | 
130 |             srt += f'{start[:12]} --> {end[:12]}\n'
131 | 
132 |             new_content = ''
133 |             for node in caption.nodes:
134 |                 new_content = self._recreate_line(new_content, node)
135 | 
136 |             # Eliminate excessive line breaks
137 |             new_content = new_content.strip()
138 | 
139 |             srt += f"{new_content}\n\n"
140 |             count += 1
141 | 
142 |         return srt[:-1]  # remove unwanted newline at end of file
143 | 
144 |     def _recreate_line(self, srt, line):
145 |         if line.type_ == CaptionNode.TEXT:
146 |             return srt + f'{line.content} '
147 |         elif line.type_ == CaptionNode.BREAK:
148 |             return srt + '\n'
149 |         else:
150 |             return srt
151 | 


--------------------------------------------------------------------------------
/docs/supported_formats.rst:
--------------------------------------------------------------------------------
  1 | Supported formats
  2 | ==================
  3 | 
Read: DFXP/TTML, SAMI, SCC, SRT, WebVTT
  5 | 
Write: DFXP/TTML, SAMI, SRT, Transcript, WebVTT
  7 | 
  8 | See the `examples
  9 | folder <https://github.com/pbs/pycaption/tree/master/examples/>`__ for
 10 | example captions that currently can be read correctly.
 11 | 
 12 | SAMI Reader / Writer :: `spec <http://msdn.microsoft.com/en-us/library/ms971327.aspx>`__
 13 | ----------------------------------------------------------------------------------------
 14 | 
 15 | Microsoft Synchronized Accessible Media Interchange. Supports multiple
 16 | languages.
 17 | 
Supported Styling: text-align, italics, font-size, font-family, color
 20 | 
If the SAMI file is not valid XML (e.g. unclosed tags), the reader will
still attempt to read it.
 23 | 
DFXP/TTML Reader / Writer :: `spec <http://www.w3.org/TR/ttaf1-dfxp/>`__
------------------------------------------------------------------------
 26 | 
 27 | The W3 standard. Supports multiple languages.
 28 | 
Supported Styling: text-align, italics, font-size, font-family, color
 31 | 
 32 | SRT Reader / Writer :: `spec <http://matroska.org/technical/specs/subtitles/srt.html>`__
 33 | ----------------------------------------------------------------------------------------
 34 | 
SubRip captions. If given multiple languages to write, the writer outputs
all of them joined together by a 'MULTI-LANGUAGE SRT' line.
 37 | 
Supported Styling: None
 39 | 
Assumes the input language is English. To change:
 41 | 
 42 | ::
 43 | 
 44 |     pycaps = SRTReader().read(srt_content, lang='fr')
 45 | 
WebVTT Reader / Writer :: `spec <http://dev.w3.org/html5/webvtt/>`__
--------------------------------------------------------------------
 48 | 
 49 | **WebVTT** is a W3C standard for displaying timed text in HTML5. Its
 50 | specification is currently (as of February 2015) in draft stage and
 51 | therefore not all features are implemented by major players, the same
 52 | being true for ``pycaption``.
 53 | 
 54 | By default, the reader assumes the language is English and the writer
 55 | returns the first language it finds in the caption set. You can specify
 56 | a language using the ``lang`` parameter:
 57 | 
 58 | ::
 59 | 
 60 |     pycaps = WebVTTReader().read(content, lang='fr')
 61 | 
 62 | If you need to adjust all timestamps in a WebVTT, you can use the
 63 | ``time_shift_milliseconds`` parameter which moves the timestamps
 64 | forward (positive integer) or backward (negative integer) with
 65 | the specified amount:
 66 | 
 67 | ::
 68 | 
 69 |     pycaps = WebVTTReader(time_shift_milliseconds=1154).read(content)
 70 | 
 71 | Styling
 72 | ^^^^^^^
 73 | 
 74 | Styling in WebVTT can be done via inline tags (e.g. ``<b>``, ``<i>`` etc.) or external
 75 | CSS rules applied to text wrapped in class (``<c>``) or voice (``<v>``) tags.
 76 | 
 77 | ``pycaption`` currently only keeps *voice tags* on conversion.
 78 | 
 79 | Example:
 80 | 
 81 | ::
 82 | 
 83 |     <v Fred>Hi, my name is Fred
 84 | 
 85 | is converted to
 86 | 
 87 | ::
 88 | 
 89 |     Fred: Hi, my name is Fred
 90 | 
 91 | The following WebVTT supported tags are stripped off the cue text:
 92 | 
 93 |     - ``<c>``, ``<i>``, ``<b>``, ``<u>``, ``<ruby>``, ``<rt>``, ``<lang>`` and timestamp tags (``<h:mm:ss.sss>``)
 94 | 
 95 | Non-supported tags are left unchanged as a natural part of the cue text with no
 96 | special meaning.
 97 | 
 98 | Positioning
 99 | ^^^^^^^^^^^
100 | 
101 | The WebVTT specs allow customizing the position of cues by configuring a
102 | number of cue settings. ``pycaption`` currently only *maintains positioning
103 | information on writing*, in which case it supports the following settings:
104 | 
105 | -  A WebVTT line position cue setting.
106 | -  A WebVTT text position cue setting.
107 | -  A WebVTT size cue setting.
108 | -  A WebVTT alignment cue setting.
109 | 
110 | ``pycaption`` **does not** support:
111 | 
112 | -  A WebVTT vertical text cue setting.
113 | -  A WebVTT region cue setting.
114 | 
115 | Refer to the `official WebVTT specification`_ for details about the cue
116 | settings.
117 | 
118 | .. _official WebVTT specification: http://dev.w3.org/html5/webvtt/#webvtt-cue-settings
119 | 
120 | SCC Reader :: `spec <http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/SCC_FORMAT.HTML>`__
121 | -----------------------------------------------------------------------------------------------
122 | 
123 | Scenarist Closed Caption format. Assumes Channel 1 input.
124 | 
Supported Styling: italics
126 | 
127 | By default, the SCC Reader does not simulate roll-up captions. To enable
128 | roll-ups:
129 | 
130 | ::
131 | 
132 |     pycaps = SCCReader().read(scc_content, simulate_roll_up=True)
133 | 
Also, assumes the input language is English. To change:
135 | 
136 | ::
137 | 
138 |     pycaps = SCCReader().read(scc_content, lang='fr')
139 | 
The reader also accepts an offset (measured in seconds) for the
timestamps. For example, if the SCC file is 45 seconds ahead of the
video:
143 | 
144 | ::
145 | 
146 |     pycaps = SCCReader().read(scc_content, offset=45)
147 | 
148 | The SCC Reader handles both dropframe and non-dropframe captions, and
149 | will auto-detect which format the captions are in.
150 | 
For debugging purposes, the SCC captions can be translated into a
human-readable form as follows:

::
154 | 
155 |     translated_scc = translate_scc(scc_content, brackets="[]")
156 | 
157 | Square brackets are used by default, but they can be replaced with other
158 | brackets or None.
159 | 
160 | Transcript Writer
161 | -----------------
162 | 
163 | Text stripped of styling, arranged in sentences.
164 | 
Supported Styling: None
166 | 
167 | The transcript writer uses natural sentence boundary detection
168 | algorithms to create the transcript.
169 | 


--------------------------------------------------------------------------------
/tests/test_dfxp_extras.py:
--------------------------------------------------------------------------------
  1 | from copy import deepcopy
  2 | from bs4 import BeautifulSoup
  3 | 
  4 | from pycaption.dfxp.base import _create_internal_alignment
  5 | from pycaption.dfxp import (
  6 |     SinglePositioningDFXPWriter, DFXPReader, DFXP_DEFAULT_REGION,
  7 |     DFXP_DEFAULT_REGION_ID, LegacyDFXPWriter,
  8 | )
  9 | from pycaption.geometry import (
 10 |     HorizontalAlignmentEnum, VerticalAlignmentEnum, Layout, Alignment,
 11 | )
 12 | 
 13 | 
 14 | class TestSinglePositioningDFXPWRiter:
 15 |     def test_only_the_default_region_is_created(
 16 |             self, sample_dfxp_to_render_with_only_default_positioning_input):
 17 |         caption_set = DFXPReader().read(
 18 |             sample_dfxp_to_render_with_only_default_positioning_input
 19 |         )
 20 | 
 21 |         dfxp = SinglePositioningDFXPWriter().write(caption_set)
 22 |         layout = BeautifulSoup(dfxp, features='html.parser').findChild('layout')
 23 | 
 24 |         assert len(layout.findChildren('region')) == 1
 25 | 
 26 |     def test_only_the_default_region_is_referenced(
 27 |             self, sample_dfxp_to_render_with_only_default_positioning_input):
 28 |         caption_set = DFXPReader().read(
 29 |             sample_dfxp_to_render_with_only_default_positioning_input
 30 |         )
 31 | 
 32 |         dfxp = SinglePositioningDFXPWriter().write(caption_set)
 33 | 
 34 |         soup = BeautifulSoup(dfxp, features='html.parser')
 35 | 
 36 |         for elem in soup.findAll():
 37 |             if 'region' in elem.attrs:
 38 |                 assert elem['region'] == DFXP_DEFAULT_REGION_ID
 39 | 
 40 |     def test_only_the_custom_region_is_created(
 41 |             self, sample_dfxp_to_render_with_only_default_positioning_input):
 42 |         caption_set = DFXPReader().read(
 43 |             sample_dfxp_to_render_with_only_default_positioning_input
 44 |         )
 45 | 
 46 |         new_region = Layout(
 47 |             alignment=Alignment(
 48 |                 HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP
 49 |             )
 50 |         )
 51 | 
 52 |         dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
 53 |         # Using a different parser, because this preserves letter case
 54 |         # The output file is ok, but when parsing it, the "regular" parses
 55 |         # loses letter case.
 56 |         layout = BeautifulSoup(dfxp, features='xml').findChild('layout')
 57 | 
 58 |         region = layout.findChild('region')
 59 |         text_align = region['tts:textAlign']
 60 |         display_align = region['tts:displayAlign']
 61 | 
 62 |         internal_alignment = _create_internal_alignment(
 63 |             text_align, display_align
 64 |         )
 65 | 
 66 |         assert len(layout.findChildren('region')) == 1
 67 |         assert internal_alignment.horizontal == HorizontalAlignmentEnum.LEFT
 68 |         assert internal_alignment.vertical == VerticalAlignmentEnum.TOP
 69 | 
 70 |     def test_only_the_specified_custom_attributes_are_created_for_the_region(
 71 |         self, sample_dfxp_to_render_with_only_default_positioning_input
 72 |     ):
 73 |         caption_set = DFXPReader().read(
 74 |             sample_dfxp_to_render_with_only_default_positioning_input
 75 |         )
 76 | 
 77 |         new_region = Layout(
 78 |             alignment=Alignment(
 79 |                 HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP
 80 |             )
 81 |         )
 82 | 
 83 |         dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
 84 | 
 85 |         region = BeautifulSoup(dfxp, features='lxml').find('region')
 86 | 
 87 |         assert 'xml:id' in region.attrs
 88 |         assert region.attrs['xml:id'] != DFXP_DEFAULT_REGION_ID
 89 |         assert len(region.attrs) == 3
 90 | 
 91 |     def test_only_the_custom_region_is_referenced(
 92 |             self, sample_dfxp_to_render_with_only_default_positioning_input):
 93 |         caption_set = DFXPReader().read(
 94 |             sample_dfxp_to_render_with_only_default_positioning_input
 95 |         )
 96 | 
 97 |         # it's easier to copy this than create a new one
 98 |         new_region = deepcopy(DFXP_DEFAULT_REGION)
 99 |         new_region.alignment.horizontal = HorizontalAlignmentEnum.LEFT
100 |         new_region.alignment.vertical = VerticalAlignmentEnum.TOP
101 | 
102 |         dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
103 | 
104 |         soup = BeautifulSoup(dfxp, features='html.parser')
105 | 
106 |         # get the region_id created, and see it's the one referenced
107 |         created_region_id = soup.find('region')['xml:id']
108 | 
109 |         referenced_region_ids = set()
110 | 
111 |         for elem in soup.findAll():
112 |             if 'region' in elem.attrs:
113 |                 referenced_region_ids.add(elem.attrs['region'])
114 | 
115 |         assert len(referenced_region_ids) == 1
116 |         assert referenced_region_ids.pop() == created_region_id
117 | 
118 |     def test_styles_dont_contain_text_align_attribute(
119 |             self, sample_dfxp_to_render_with_only_default_positioning_input):
120 |         caption_set = DFXPReader().read(
121 |             sample_dfxp_to_render_with_only_default_positioning_input
122 |         )
123 | 
124 |         result = SinglePositioningDFXPWriter().write(caption_set)
125 | 
126 |         caption_set = DFXPReader().read(result)
127 | 
128 |         for _, style in caption_set.get_styles():
129 |             assert 'text-align' not in style
130 | 
131 | 
132 | class TestLegacyDFXPWriter:
133 |     def test_default_style_is_written_to_output_file(
134 |             self, sample_dfxp_with_templated_style):
135 |         caption_set = DFXPReader(read_invalid_positioning=True).read(
136 |             sample_dfxp_with_templated_style.format(
137 |                 style_name="foxy_the_squirrel"))
138 | 
139 |         result = LegacyDFXPWriter().write(caption_set)
140 | 
141 |         assert result.count('foxy_the_squirrel') == 2
142 | 


--------------------------------------------------------------------------------
/tests/mixins.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | import pytest
  4 | from bs4 import BeautifulSoup
  5 | 
  6 | from pycaption.exceptions import InvalidInputError
  7 | 
  8 | 
  9 | class ReaderTestingMixIn:
 10 |     """
 11 |     Provide test case capabilities for asserting common Reader functionalities.
 12 |     """
 13 | 
 14 |     def assert_positive_answer_for_detection(self, matching_sample):
 15 |         assert self.reader.detect(matching_sample) is True
 16 | 
 17 |     def assert_negative_answer_for_detection(self, different_sample):
 18 |         assert self.reader.detect(different_sample) is False
 19 | 
 20 |     def test_reader_only_supports_unicode_input(self):
 21 |         with pytest.raises(InvalidInputError) as exc_info:
 22 |             self.reader.read(b'')
 23 |         assert exc_info.value.args[0] == 'The content is not a unicode string.'
 24 | 
 25 | 
 26 | class WebVTTTestingMixIn:
 27 |     """
 28 |     Provide specialized test case capabilities for asserting on WebVTT content.
 29 |     """
 30 | 
 31 |     def _extract_webvtt_captions(self, content):
 32 |         return tuple(line.strip() for line in content.splitlines())
 33 | 
 34 |     def assert_webvtt_equals(self, first, second):
 35 |         """
 36 |         Assert that two WebVTT contents are equal.
 37 |         """
 38 |         first_items = self._extract_webvtt_captions(first)
 39 |         second_items = self._extract_webvtt_captions(second)
 40 | 
 41 |         assert first_items == second_items
 42 | 
 43 | 
 44 | class SRTTestingMixIn:
 45 |     """
 46 |     Provide specialized test case capabilities for asserting on SRT content.
 47 |     """
 48 | 
 49 |     def _extract_srt_captions(self, content):
 50 |         return tuple(line.strip() for line in content.splitlines())
 51 | 
 52 |     def assert_srt_equals(self, first, second):
 53 |         """
 54 |         Assert that two SRT contents are equal.
 55 |         """
 56 |         first_items = self._extract_srt_captions(first)
 57 |         second_items = self._extract_srt_captions(second)
 58 | 
 59 |         assert first_items == second_items
 60 | 
 61 | 
 62 | class CaptionSetTestingMixIn:
 63 |     def assert_captionset_almost_equals(self, first, second,
 64 |                                         tolerance_microseconds):
 65 |         """
 66 |         Assert that two caption sets have equal text except for newlines,
 67 |         and differences in timing that are less than tolerance_microseconds.
 68 |         """
 69 | 
 70 |         captions_1 = first.get_captions(list(first.get_languages())[0])
 71 |         captions_2 = second.get_captions(list(first.get_languages())[0])
 72 | 
 73 |         def get_text_for_caption(caption):
 74 |             text = caption.get_text()
 75 |             text = re.sub(r'\s+', ' ', text)
 76 | 
 77 |             return text
 78 | 
 79 |         text_1 = [get_text_for_caption(caption) for caption in captions_1]
 80 |         text_2 = [get_text_for_caption(caption) for caption in captions_2]
 81 | 
 82 |         def close_enough(ts1, ts2):
 83 |             return abs(ts1 - ts2) < tolerance_microseconds
 84 | 
 85 |         start_differences = [
 86 |             (caption_1.start, caption_2.start)
 87 |             for caption_1, caption_2 in zip(captions_1, captions_2)
 88 |             if not close_enough(caption_1.start, caption_2.start)
 89 |         ]
 90 | 
 91 |         end_differences = [
 92 |             (caption_1.end, caption_2.end)
 93 |             for caption_1, caption_2 in zip(captions_1, captions_2)
 94 |             if not close_enough(caption_1.end, caption_2.end)
 95 |         ]
 96 | 
 97 |         assert text_1 == text_2
 98 |         assert start_differences == []
 99 |         assert end_differences == []
100 | 
101 | 
102 | class DFXPTestingMixIn:
103 |     """
104 |     Provide specialized test case capabilities for asserting on DFXP content.
105 |     """
106 | 
107 |     def _remove_styling(self, soup):
108 |         for style in soup('styling'):
109 |             style.clear()
110 | 
111 |         for paragraph in soup('p'):
112 |             if 'style' in paragraph.attrs:
113 |                 del paragraph.attrs['style']
114 | 
115 |     def _remove_spans(self, soup):
116 |         for span in soup('span'):
117 |             span.unwrap()
118 | 
119 |     def _trim_text(self, soup):
120 |         for paragraph in soup('p'):
121 |             paragraph.string = paragraph.text.strip()
122 | 
123 |     def assert_dfxp_equals(self, first, second,
124 |                            ignore_styling=False,
125 |                            ignore_spans=False):
126 |         first_soup = BeautifulSoup(first, 'lxml')
127 |         second_soup = BeautifulSoup(second, 'lxml')
128 | 
129 |         if ignore_styling:
130 |             self._remove_styling(first_soup)
131 |             self._remove_styling(second_soup)
132 | 
133 |         if ignore_spans:
134 |             self._remove_spans(first_soup)
135 |             self._remove_spans(second_soup)
136 | 
137 |         self._trim_text(first_soup)
138 |         self._trim_text(second_soup)
139 | 
140 |         assert first_soup == second_soup
141 | 
142 | 
143 | class SAMITestingMixIn:
144 |     """
145 |     Provide specialized test case capabilities for asserting on SAMI content.
146 |     """
147 | 
148 |     def _extract_sami_captions(self, soup):
149 |         return tuple(
150 |             (caption.attrs['start'], caption.p.text.strip())
151 |             for caption in soup.select('sync')
152 |         )
153 | 
154 |     def assert_sami_captions_equal(self, first, second):
155 |         first_soup = BeautifulSoup(first, 'lxml')
156 |         second_soup = BeautifulSoup(second, 'lxml')
157 | 
158 |         first_items = self._extract_sami_captions(first_soup)
159 |         second_items = self._extract_sami_captions(second_soup)
160 | 
161 |         assert first_items == second_items
162 | 
163 | 
164 | class MicroDVDTestingMixIn:
165 |     """
166 |     Provide specialized test case capabilities for asserting on MicroDVD content.
167 |     """  # noqa
168 | 
169 |     def _extract_micro_dvd_captions(self, content):
170 |         return tuple(line.strip() for line in content.splitlines())
171 | 
172 |     def assert_microdvd_equals(self, first, second):
173 |         """
174 |         Assert that two MicroDVD contents are equal.
175 |         """
176 |         first_items = self._extract_micro_dvd_captions(first)
177 |         second_items = self._extract_micro_dvd_captions(second)
178 | 
179 |         assert first_items == second_items
180 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
  1 | from tests.fixtures.dfxp import (  # noqa: F401
  2 |     sample_dfxp, sample_dfxp_with_inline_style, sample_dfxp_with_defined_style,
  3 |     sample_dfxp_with_inherited_style, sample_dfxp_without_region_and_style,
  4 |     sample_dfxp_with_positioning, sample_dfxp_with_relativized_positioning,
  5 |     sample_dfxp_empty, sample_dfxp_syntax_error,
  6 |     sample_dfxp_from_sami_with_positioning,
  7 |     sample_dfxp_long_cue, sample_dfxp_long_cue_fit_to_screen,
  8 |     sample_dfxp_from_sami_with_margins, sample_dfxp_from_sami_with_lang_margins,
  9 |     sample_dfxp_from_sami_with_span, sample_dfxp_from_sami_with_bad_span_align,
 10 |     sample_dfxp_invalid_but_supported_positioning_input,
 11 |     sample_dfxp_invalid_but_supported_positioning_output,
 12 |     sample_dfxp_multiple_regions_input, sample_dfxp_multiple_regions_output,
 13 |     sample_dfxp_to_render_with_only_default_positioning_input,
 14 |     sample_dfxp_output, sample_dfxp_style_tag_with_no_xml_id_input,
 15 |     sample_dfxp_style_tag_with_no_xml_id_output, sample_dfxp_from_scc_output,
 16 |     sample_dfxp_with_properly_closing_spans_output,
 17 |     sample_dfxp_for_legacy_writer_input, sample_dfxp_for_legacy_writer_output,
 18 |     sample_dfxp_with_templated_style, sample_dfxp_with_escaped_apostrophe,
 19 |     sample_dfxp_with_alternative_timing_formats, sample_dfxp_empty_paragraph,
 20 |     sample_dfxp_only_spaces_paragraph, sample_dfxp_incorrect_time_format,
 21 |     sample_dfxp_missing_begin, sample_dfxp_missing_end_and_dur,
 22 |     sample_dfxp_with_frame_timing, sample_dfxp_empty_cue,
 23 |     sample_dfxp_empty_cue_output, sample_dfxp_default_styling_p_tags,
 24 |     sample_dfxp_invalid_positioning_value_template,
 25 |     sample_dfxp_multiple_captions_with_the_same_timing,
 26 |     sample_dfxp_with_ampersand_character, sample_dfxp_with_nested_spans,
 27 |     dfxp_style_region_align_conflict, dfxp_with_concurrent_captions,
 28 | )
 29 | from tests.fixtures.microdvd import (  # noqa: F401
 30 |     sample_microdvd, sample_microdvd_2,
 31 |     sample_microdvd_invalid_format, missing_fps_sample_microdvd,
 32 |     sample_microdvd_empty, sample_microdvd_empty_cue_output,
 33 | )
 34 | from tests.fixtures.sami import (  # noqa: F401
 35 |     sample_sami, sample_sami_with_style_tags,
 36 |     sample_sami_with_css_inline_style, sample_sami_with_css_id_style,
 37 |     sample_sami_empty, sample_sami_syntax_error,
 38 |     sample_sami_double_br, sample_sami_partial_margins,
 39 |     sample_sami_partial_margins_relativized, sample_sami_lang_margin,
 40 |     sample_sami_with_span, sample_sami_with_bad_span_align,
 41 |     sample_sami_with_bad_div_align, sample_sami_with_p_align,
 42 |     sample_sami_with_p_and_span_align, sample_sami_with_multiple_span_aligns,
 43 |     sample_sami_no_lang, sample_sami_with_lang, sample_sami_with_multi_lang,
 44 |     sample_sami_with_multiple_p, sample_sami_empty_cue_output,
 45 |     sample_sami_with_invalid_inline_style,
 46 |     sample_sami_including_hexadecimal_charref,
 47 |     sample_sami_including_decimal_charref,
 48 |     sample_sami_including_html5_entityref, sample_sami_with_unclosed_tag,
 49 |     sample_sami_with_inline_lang, sample_sami_from_dfxp_with_nested_spans,
 50 |     sample_sami_with_separate_multi_lang, sample_sami_missing_start
 51 | )
 52 | from tests.fixtures.scc import (  # noqa: F401
 53 |     sample_scc_created_dfxp_with_wrongly_closing_spans,
 54 |     scc_that_generates_webvtt_with_proper_newlines,
 55 |     sample_scc_produces_captions_with_start_and_end_time_the_same,
 56 |     sample_scc_pop_on, sample_scc_multiple_positioning, sample_scc_with_italics,
 57 |     sample_scc_empty, sample_scc_roll_up_ru2, sample_scc_roll_up_ru3,
 58 |     sample_no_positioning_at_all_scc, sample_scc_with_line_too_long,
 59 |     sample_scc_no_explicit_end_to_last_caption, sample_scc_flashing_cue,
 60 |     sample_scc_eoc_first_command, sample_scc_with_extended_characters,
 61 |     sample_scc_with_ampersand_character, sample_scc_multiple_formats,
 62 |     sample_scc_duplicate_tab_offset, sample_scc_duplicate_special_characters,
 63 |     sample_scc_tab_offset, sample_scc_with_unknown_commands,
 64 |     sample_scc_special_and_extended_characters, sample_scc_mid_row_before_text_pop,
 65 |     sample_scc_mid_row_before_text_roll, sample_scc_mid_row_before_text_paint,
 66 |     sample_scc_mid_row_following_text_no_text_before_italics_off_pop,
 67 |     sample_scc_mid_row_following_text_no_text_before_italics_off_roll,
 68 |     sample_scc_mid_row_following_text_no_text_before_italics_off_paint,
 69 |     sample_scc_mid_row_following_text_no_text_before_italics_on_pop,
 70 |     sample_scc_mid_row_following_text_no_text_before_italics_on_roll,
 71 |     sample_scc_mid_row_following_text_no_text_before_italics_on_paint,
 72 |     sample_scc_mid_row_with_space_before_pop,
 73 |     sample_scc_mid_row_with_space_before_roll,
 74 |     sample_scc_mid_row_with_space_before_paint,
 75 |     sample_scc_with_spaces_at_eol_pop,
 76 |     sample_scc_with_spaces_at_eol_roll,
 77 |     sample_scc_with_spaces_at_eol_paint,
 78 | )
 79 | from tests.fixtures.srt import (  # noqa: F401
 80 |     sample_srt, sample_srt_ascii, sample_srt_numeric, sample_srt_empty,
 81 |     sample_srt_blank_lines, sample_srt_trailing_blanks,
 82 |     samples_srt_same_time, sample_srt_empty_cue_output,
 83 |     sample_srt_timestamps_without_microseconds,
 84 | )
 85 | from tests.fixtures.translated_scc import (  # noqa: F401
 86 |     sample_translated_scc_custom_brackets, sample_translated_scc_success,
 87 |     sample_translated_scc_commands_not_found, sample_translated_scc_no_brackets,
 88 |     sample_translated_scc_special_and_extended_characters
 89 | )
 90 | from tests.fixtures.webvtt import (  # noqa: F401
 91 |     sample_webvtt, sample_webvtt_from_dfxp, sample_webvtt_from_sami,
 92 |     sample_webvtt_from_sami_with_style, sample_webvtt_from_sami_with_id_style,
 93 |     sample_webvtt_from_dfxp_with_style,
 94 |     sample_webvtt_keeps_positioning,
 95 |     sample_webvtt_from_dfxp_with_positioning_and_style,
 96 |     sample_webvtt_from_srt, sample_webvtt_from_webvtt,
 97 |     sample_webvtt_2, sample_webvtt_empty, sample_webvtt_double_br,
 98 |     sample_webvtt_output_long_cue, webvtt_from_dfxp_with_conflicting_align,
 99 |     sample_webvtt_with_cue_settings,
100 |     sample_webvtt_from_scc_properly_writes_newlines_output,
101 |     sample_webvtt_last_cue_zero_start, sample_webvtt_empty_cue,
102 |     sample_webvtt_multi_lang_en, sample_webvtt_multi_lang_de,
103 |     sample_webvtt_empty_cue_output, sample_webvtt_timestamps
104 | )
105 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | # Every build target below runs $(SPHINXBUILD) with a different builder (-b).
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 21 | 
 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info changes linkcheck doctest gettext xml pseudoxml
 23 | 
 24 | help:
 25 | 	@echo "Please use \`make <target>' where <target> is one of"
 26 | 	@echo "  html       to make standalone HTML files"
 27 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 28 | 	@echo "  singlehtml to make a single large HTML file"
 29 | 	@echo "  pickle     to make pickle files"
 30 | 	@echo "  json       to make JSON files"
 31 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 32 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 33 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 34 | 	@echo "  epub       to make an epub"
 35 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 36 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 37 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 38 | 	@echo "  text       to make text files"
 39 | 	@echo "  man        to make manual pages"
 40 | 	@echo "  texinfo    to make Texinfo files"
 41 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 42 | 	@echo "  gettext    to make PO message catalogs"
 43 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 44 | 	@echo "  xml        to make Docutils-native XML files"
 45 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 46 | 	@echo "  linkcheck  to check all external links for integrity"
 47 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 48 | 
 49 | clean:
 50 | 	rm -rf $(BUILDDIR)/*
 51 | 
 52 | html:
 53 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 54 | 	@echo
 55 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 56 | 
 57 | dirhtml:
 58 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 59 | 	@echo
 60 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 61 | 
 62 | singlehtml:
 63 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 64 | 	@echo
 65 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 66 | 
 67 | pickle:
 68 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 69 | 	@echo
 70 | 	@echo "Build finished; now you can process the pickle files."
 71 | 
 72 | json:
 73 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 74 | 	@echo
 75 | 	@echo "Build finished; now you can process the JSON files."
 76 | 
 77 | htmlhelp:
 78 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 79 | 	@echo
 80 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 81 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 82 | 
 83 | qthelp:
 84 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 85 | 	@echo
 86 | 	@echo "Build finished; now you can run \"qcollectiongenerator\" with the" \
 87 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 88 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pycaption.qhcp"
 89 | 	@echo "To view the help file:"
 90 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pycaption.qhc"
 91 | 
 92 | devhelp:
 93 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 94 | 	@echo
 95 | 	@echo "Build finished."
 96 | 	@echo "To view the help file:"
 97 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/pycaption"
 98 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pycaption"
 99 | 	@echo "# devhelp"
100 | 
101 | epub:
102 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
103 | 	@echo
104 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
105 | 
106 | latex:
107 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
108 | 	@echo
109 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
110 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
111 | 	      "(use \`make latexpdf' here to do that automatically)."
112 | 
113 | latexpdf:
114 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 | 	@echo "Running LaTeX files through pdflatex..."
116 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
117 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
118 | 
119 | latexpdfja:
120 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
121 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
122 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
123 | 	@echo "platex/dvipdfmx finished; the PDF files are in $(BUILDDIR)/latex."
124 | 
125 | text:
126 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
127 | 	@echo
128 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
129 | 
130 | man:
131 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
132 | 	@echo
133 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
134 | 
135 | texinfo:
136 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
137 | 	@echo
138 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
139 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
140 | 	      "(use \`make info' here to do that automatically)."
141 | 
142 | info:
143 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
144 | 	@echo "Running Texinfo files through makeinfo..."
145 | 	$(MAKE) -C $(BUILDDIR)/texinfo info
146 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
147 | 
148 | gettext:
149 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
150 | 	@echo
151 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
152 | 
153 | changes:
154 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
155 | 	@echo
156 | 	@echo "The overview file is in $(BUILDDIR)/changes."
157 | 
158 | linkcheck:
159 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
160 | 	@echo
161 | 	@echo "Link check complete; look for any errors in the above output" \
162 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
163 | 
164 | doctest:
165 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
166 | 	@echo "Testing of doctests in the sources finished, look at the" \
167 | 	      "results in $(BUILDDIR)/doctest/output.txt."
168 | 
169 | xml:
170 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
171 | 	@echo
172 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
173 | 
174 | pseudoxml:
175 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
176 | 	@echo
177 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
178 | 


--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
  1 | Changelog
  2 | ---------
  3 | 2.2.19
  4 | ^^^^^^
  5 | - Remove support for python 3.8 and 3.9.
  6 | 
  7 | 2.2.18
  8 | ^^^^^^
  9 | - Update changelog and new release tag.
 10 | 
 11 | 2.2.17
 12 | ^^^^^^
 13 | - Update nltk from 3.8.0 to 3.9.1.
 14 | 
 15 | 2.2.16
 16 | ^^^^^^
 17 | - Update copyright details.
 18 | 
 19 | 2.2.15
 20 | ^^^^^^
 21 | - Always skip doubled special characters, not just in case the cue starters are doubled.
 22 | 
 23 | 2.2.14
 24 | ^^^^^^
 25 | - Fix an issue with WebVTT writer text positioning on break inside a cue.
 26 | - Prevent creating a repositioning command to the same coordinates.
 27 | 
 28 | 2.2.13
 29 | ^^^^^^
 30 | - Mid-row codes add spaces only if there isn't one before.
 31 | - Mid-row codes add spaces only if they affect the text in the same row (not adding if it follows break or PACS).
 32 | - Remove spaces at the end of lines.
 33 | - Close italics on receiving another style setting command.
 34 | - Throw a CaptionReadNoCaptions error in case an empty input file is provided.
 35 | - Ignore repositioning commands which are not followed by any text before breaks.
 36 | - Mid-row codes will not add the space if it is in front of punctuation.
 37 | - Fix a bug with background codes when the InstructionNodeCreator collection is empty.
 38 | - Fix a bug with the WebVTT writer adding double line breaks.
 39 | 
 40 | 2.2.12
 41 | ^^^^^^
 42 | - Pinned nltk to 3.8.0
 43 | 
 44 | 2.2.11
 45 | ^^^^^^
 46 | - A space should not be placed before a mid row code if it follows a PAC command or a Tab Offset
 47 | - The backspace command should be treated like other commands and duplicates should be skipped if PAC commands are duplicated
 48 | - Prevent webvtt writer from creating a new cue in case of line break
 49 | - In case of style setting PAC which also breaks the line, we add the break first, then the style tag
 50 | 
 51 | 2.2.10
 52 | ^^^^^^
 53 | - Yanked.
 54 | 
 55 | 2.2.9
 56 | ^^^^^
 57 | - Yanked.
 58 | 
 59 | 2.2.8
 60 | ^^^^^
 61 | - Honor backspaces on captions in scc files
 62 | - Mid-row codes which are preceded by a PAC command don't add spaces
 63 | - Mid row codes which don't follow after a PAC and don't have a style reset command before will add a space to the end of the previous text node
 64 | - Mid row codes which don't follow after a PAC and have a style reset command before will add a space to the beginning of the next text node
 65 | - Background color codes to delete the space in front
 66 | 
 67 | 2.2.7
 68 | ^^^^^
 69 | - The cursor moves automatically one column to the right after each character or Mid-Row Code received.
 70 | 
 71 | 2.2.6
 72 | ^^^^^
 73 | - Pass the caption cue time with all error messages.
 74 | 
 75 | 2.2.5
 76 | ^^^^^
 77 | - Yanked.
 78 | 
 79 | 2.2.4
 80 | ^^^^^
 81 | - Skip duplicated extended characters.
 82 | 
 83 | 2.2.3
 84 | ^^^^^
 85 | - Add new substitute character to ignore before extended character in SCC input files
 86 | 
 87 | 2.2.2
 88 | ^^^^^
 89 | - Remove support for Python 3.6 & 3.7
 90 | - Restrict SCC source files to 31 characters per line (32 will throw an exception)
 91 | - Bump readthedocs-sphinx-search from 0.3.1 to 0.3.2
 92 | - Change Apache copyright licensing (ending) copyright year
 93 | 
 94 | 2.2.1
 95 | ^^^^^
 96 | - Ignore the substitute character that comes before the extended character in SCC files.
 97 | 
 98 | 2.2.0
 99 | ^^^^^
100 | - Added support for Python 3.11
101 | - Added support for Beautifulsoup 4.12.2
102 | - Remove support for Beautifulsoup < 4.12.1
103 | - DFXP captions now end consistently with a newline
104 | 
105 | 2.1.1
106 | ^^^^^
107 | - Added nltk as transcript dependency
108 | 
109 | 2.1.0
110 | ^^^^^
111 | - Remove upper limit for dependency versions to solve vulnerabilities
112 | 
113 | 2.0.9
114 | ^^^^^
115 | - Changed DFXPReader default horizontal alignment from 'center' to 'start'
116 | - Updated WebVTT horizontal alignment from 'middle' to 'center'
117 | 
118 | 2.0.8
119 | ^^^^^
120 | - Added support for Python 3.10
121 | - Added default start align to WebVTTWriter
122 | 
123 | 2.0.7
124 | ^^^^^
125 | - Implemented skipping duplicate special characters for SCCReader
126 | - Added support for beautifulsoup 4.10 and lxml 4.8
127 | - Added pytest and pytest-lazy-fixture as development dependencies
128 | 
129 | 2.0.6
130 | ^^^^^
131 | - Updated Size.from_string() to accept 0 size without measuring unit
132 | - Replaced ValueError with CaptionReadSyntaxError for invalid sizes passed to Size.from_string()
133 | - Updated DFXPReader timestamp validation according to TTML time expression specs
134 | - Updated flashing cues validation for SCCReader to raise a CaptionReadTimingError
135 | - Fixed SCC translator not recognising special and extended characters
136 | - Raise CaptionReadTimingError for missing 'start' on SAMIReader
137 | 
138 | 2.0.5
139 | ^^^^^
140 | - Updated DFXPReader to ignore paragraphs that only contain spaces, tabs or new lines
141 | - Added CaptionReadTimingError for invalid SCC timestamps
142 | - Added CaptionReadSyntaxError for invalid colors in SAMIReader
143 | - Raise CaptionReadTimingError when missing 'begin' or 'end' and 'dur' time on DFXPReader
144 | 
145 | 2.0.4
146 | ^^^^^
147 | - Updated the counting of frames to happen after processing SCC commands
148 | - Made all SCC-sourced captions which have a difference of up to 5 frames between them more fluid
149 | 
150 | 2.0.3
151 | ^^^^^
152 | - Implemented time shift for WebVTTReader
153 | - Removed WebVTTWriter 'start' position alignment
154 | - Updated the SCC Pop-On caption timing logic
155 | - Fixed the correction of end times for multiple last captions
156 | - Fixed bug when flushing implicit buffers and old key was None
157 | 
158 | 2.0.2
159 | ^^^^^
160 | - Implemented Tab Offset commands for SCCReader
161 | - Implemented caption safe area limits (80% horizontally and 90% vertically)
162 | - Implemented SCC translator
163 | 
164 | 2.0.1
165 | ^^^^^
166 | - Added newline between merged SRT captions with overlapping timestamps
167 | - Updated tests for SAMI format
168 | - Updated tests for SRT format
169 | - Added zero padding to 1-digit hours outputted by WebVTTWriter
170 | 
171 | 2.0.0
172 | ^^^^^
173 | - Dropped support for Python 3.5
174 | - Updated tests to run using pytest
175 | - Added pre-commit config
176 | 
177 | 1.0.7
178 | ^^^^^
179 | - Fixed issue with SCC paint-on buffer not being cleared after storing
180 | - Removed null DFXPReader captions from the resulting caption list
181 | - Updated SCCReader double command handling to include the positioning and tab offset case
182 | 
183 | 1.0.6
184 | ^^^^^
185 | - Added MicroDVD format
186 | - Fix for missing end times when reading multiple SAMI paragraphs inside a SYNC
187 | - Fix for wrong order when multiple SRT captions have the same timestamp
188 | - Fix for DFXP timestamps adding leading zeros to 2-digit hours
189 | - Added support for BeautifulSoup 4.9
190 | - Added tests for SCC to DFXP conversion when the source contains ampersands
191 | - Added support for Python 3.9
192 | 
193 | 1.0.5
194 | ^^^^^
195 | - Added language parameter to WebVTTWriter
196 | - Fix for TranscriptWriter merging words at caption boundary
197 | - Updated documentation with positioning information
198 | - Updated DFXP reader to fallback to the document's language if no language is present on individual <div>
199 | - Introduced PYCAPTION_DEFAULT_LANG environment variable and set it to default to 'und'
200 | - Fixed DFXPReader timestamp validation to accept frames, and fixed the conversion of frames to microseconds
201 | 
202 | 1.0.4
203 | ^^^^^
204 | - Included tests in PyPI tarball
205 | - Ignore WebVTT empty cues instead of raising an exception
206 | - Updated BeautifulSoup version to >=4.8.1,<4.9 and fixed failing tests
207 | - Handled index error when sending bad timestamp for DFXP format
208 | 
209 | 1.0.3
210 | ^^^^^
211 | - Fixed issue with SCC reader including both special characters and their potential substitute
212 | - Modified enum34 dependency to versions under Python 3.4
213 | - Removed Python 3.4 and added 3.6, 3.7 and 3.8 to Travis tests
214 | 
215 | 1.0.2
216 | ^^^^^
217 | - Fixed typos in SCC positioning codes
218 | - Added missing SCC positioning codes to positioning map
219 | 
220 | 1.0.0
221 | ^^^^^
222 | - Added Python 3 support
223 | 
224 | 0.5.x
225 | ^^^^^
226 | - Added positioning support
227 | - Created documentation
228 | 


--------------------------------------------------------------------------------
/tests/test_webvtt.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from pycaption import (
  4 |     WebVTTReader, WebVTTWriter, SAMIReader, DFXPReader,
  5 |     CaptionReadNoCaptions, CaptionReadError, CaptionReadSyntaxError,
  6 | )
  7 | from tests.mixins import ReaderTestingMixIn
  8 | 
  9 | 
 10 | class TestWebVTTReader(ReaderTestingMixIn):
 11 |     def setup_method(self):
 12 |         self.reader = WebVTTReader()
 13 | 
 14 |     def test_positive_answer_for_detection(self, sample_webvtt):
 15 |         super().assert_positive_answer_for_detection(sample_webvtt)
 16 | 
 17 |     def test_negative_answer_for_detection_dfxp(self, sample_dfxp):
 18 |         super().assert_negative_answer_for_detection(sample_dfxp)
 19 | 
 20 |     def test_negative_answer_for_detection_microdvd(self, sample_microdvd):
 21 |         super().assert_negative_answer_for_detection(sample_microdvd)
 22 | 
 23 |     def test_negative_answer_for_detection_sami(self, sample_sami):
 24 |         super().assert_negative_answer_for_detection(sample_sami)
 25 | 
 26 |     def test_negative_answer_for_detection_scc_pop_on(self, sample_scc_pop_on):
 27 |         super().assert_negative_answer_for_detection(sample_scc_pop_on)
 28 | 
 29 |     def test_negative_answer_for_detection_srt(self, sample_srt):
 30 |         super().assert_negative_answer_for_detection(sample_srt)
 31 | 
 32 |     def test_caption_length(self, sample_webvtt_2):
 33 |         captions = self.reader.read(sample_webvtt_2)
 34 | 
 35 |         assert len(captions.get_captions('en-US')) == 7
 36 | 
 37 |     def test_read_supports_multiple_languages(self, sample_webvtt):
 38 |         captions = self.reader.read(sample_webvtt, lang='es')
 39 | 
 40 |         assert captions.get_captions('es') is not None
 41 | 
 42 |     def test_proper_timestamps(self, sample_webvtt):
 43 |         captions = self.reader.read(sample_webvtt)
 44 |         cue = captions.get_captions('en-US')[2]
 45 | 
 46 |         assert cue.start == 17000000
 47 |         assert cue.end == 18752000
 48 | 
 49 |     def test_forward_time_shift(self, sample_webvtt):
 50 |         captions = WebVTTReader(time_shift_milliseconds=15).read(sample_webvtt)
 51 |         cue = captions.get_captions('en-US')[2]
 52 | 
 53 |         assert cue.start == 17015000
 54 |         assert cue.end == 18767000
 55 | 
 56 |     def test_backward_time_shift(self, sample_webvtt):
 57 |         captions = WebVTTReader(time_shift_milliseconds=-15).read(sample_webvtt)
 58 |         cue = captions.get_captions('en-US')[2]
 59 | 
 60 |         assert cue.start == 16985000
 61 |         assert cue.end == 18737000
 62 | 
 63 |     def test_webvtt_cue_components_removed_from_text(self):
 64 |         result = self.reader._remove_styles(
 65 |             "<c vIntro><b>Wikipedia</b> is a great adventure. <i>It may have "
 66 |             "its shortcomings</i>, but it is<u> the largest</u> collective "
 67 |             "knowledge construction endevour</c> <ruby>base text <rt>"
 68 |             "annotation</rt></ruby> <v Audry><b>Yes</b>, indeed!"
 69 |         )
 70 |         expected = (
 71 |             "Wikipedia is a great adventure. It may have "
 72 |             "its shortcomings, but it is the largest collective "
 73 |             "knowledge construction endevour base text annotation"
 74 |             " Audry: Yes, indeed!"
 75 |         )
 76 |         assert result == expected
 77 | 
 78 |     def test_empty_file(self, sample_webvtt_empty):
 79 |         with pytest.raises(CaptionReadNoCaptions):
 80 |             WebVTTReader().read(sample_webvtt_empty)
 81 | 
 82 |     def test_not_ignoring_timing_errors(self):
 83 |         # With ignore_timing_errors=False these must raise. TODO: parametrize.
 84 |         with pytest.raises(CaptionReadError):
 85 |             WebVTTReader(ignore_timing_errors=False).read(
 86 |                 "\n" "00:00:20.000 --> 00:00:10.000\n" "foo bar baz")
 87 | 
 88 |         with pytest.raises(CaptionReadError):
 89 |             WebVTTReader(ignore_timing_errors=False).read(
 90 |                 "00:00:20.000 --> 00:00:10.000\n"
 91 |                 "Start time is greater than end time.\n"
 92 |             )
 93 | 
 94 |         with pytest.raises(CaptionReadError):
 95 |             WebVTTReader(ignore_timing_errors=False).read(
 96 |                 "00:00:20.000 --> 00:00:30.000\n"
 97 |                 "Start times should be consecutive.\n"
 98 |                 "\n"
 99 |                 "00:00:10.000 --> 00:00:20.000\n"
100 |                 "This cue starts before the previous one.\n"
101 |             )
102 | 
103 |     def test_ignoring_timing_errors(self):
104 |         # Timing errors are ignored here, but a missing end timestamp must raise
105 |         with pytest.raises(CaptionReadSyntaxError):
106 |             WebVTTReader().read(
107 |                 "\nNOTE invalid cue stamp\n00:00:20.000 --> \nfoo bar baz\n")
108 | 
109 |         # So must a comma used as the decimal separator
110 |         with pytest.raises(CaptionReadSyntaxError):
111 |             WebVTTReader().read("\n00:00:20,000 --> 00:00:22,000\n"
112 |                                 "Note the comma instead of point.\n")
113 | 
114 |         # TODO: split here; the reads below must not raise CaptionReadError
115 |         try:
116 |             WebVTTReader().read(
117 |                 "\n"
118 |                 "00:00:20.000 --> 00:00:10.000\n"
119 |                 "Start time is greater than end time.\n"
120 |             )
121 |         except CaptionReadError:
122 |             pytest.fail("Shouldn't raise CaptionReadError")
123 | 
124 |         try:
125 |             WebVTTReader().read(
126 |                 "\n"
127 |                 "00:00:20.000 --> 00:00:30.000\n"
128 |                 "Start times should be consecutive.\n"
129 |                 "\n"
130 |                 "00:00:10.000 --> 00:00:20.000\n"
131 |                 "This cue starts before the previous one.\n"
132 |             )
133 |         except CaptionReadError:
134 |             pytest.fail("Shouldn't raise CaptionReadError")
135 | 
136 |     def test_invalid_files(self):
137 |         with pytest.raises(CaptionReadError):
138 |             WebVTTReader(ignore_timing_errors=False).read(
139 |                 "00:00:20.000 --> 00:00:10.000\n"
140 |                 "Start time is greater than end time.")
141 | 
142 |         with pytest.raises(CaptionReadError):
143 |             WebVTTReader(ignore_timing_errors=False).read(
144 |                 "00:00:20.000 --> 00:00:30.000\n"
145 |                 "Start times should be consecutive.\n"
146 |                 "\n"
147 |                 "00:00:10.000 --> 00:00:20.000\n"
148 |                 "This cue starts before the previous one.\n"
149 |             )
150 | 
151 |     def test_zero_start(self, sample_webvtt_last_cue_zero_start):
152 |         captions = self.reader.read(sample_webvtt_last_cue_zero_start)
153 |         cue = captions.get_captions('en-US')[0]
154 | 
155 |         assert cue.start == 0
156 | 
157 |     def test_webvtt_empty_cue(self, sample_webvtt_empty_cue):
158 |         assert 1 == len(self.reader.read(
159 |             sample_webvtt_empty_cue).get_captions('en-US'))
160 | 
161 | 
162 | class TestWebVTTWriter:
163 |     def setup_method(self):
164 |         self.writer = WebVTTWriter()
165 | 
166 |     def test_double_br(self, sample_webvtt_double_br, sample_sami_double_br):
167 |         caption_set = SAMIReader().read(sample_sami_double_br)
168 |         results = WebVTTWriter().write(caption_set)
169 | 
170 |         assert sample_webvtt_double_br == results
171 | 
172 |     def test_break_node_positioning_is_ignored(
173 |             self, webvtt_from_dfxp_with_conflicting_align,
174 |             dfxp_style_region_align_conflict):
175 |         caption_set = DFXPReader().read(dfxp_style_region_align_conflict)
176 |         results = WebVTTWriter().write(caption_set)
177 | 
178 |         assert webvtt_from_dfxp_with_conflicting_align == results
179 | 
180 |     def test_lang_option(self, sample_webvtt_multi_lang_en,
181 |                          sample_webvtt_multi_lang_de,
182 |                          sample_sami_with_multi_lang):
183 |         caption_set = SAMIReader().read(sample_sami_with_multi_lang)
184 |         results = WebVTTWriter().write(caption_set, 'de-DE')
185 | 
186 |         assert sample_webvtt_multi_lang_de == results
187 |         results = WebVTTWriter().write(caption_set, 'en-US')
188 |         assert sample_webvtt_multi_lang_en == results
189 | 


--------------------------------------------------------------------------------
/tests/test_sami.py:
--------------------------------------------------------------------------------
  1 | from copy import deepcopy
  2 | 
  3 | import pytest
  4 | 
  5 | from pycaption import SAMIReader, CaptionReadNoCaptions, CaptionReadSyntaxError
  6 | from pycaption.exceptions import CaptionReadTimingError
  7 | from pycaption.geometry import HorizontalAlignmentEnum, Size, UnitEnum  # noqa
  8 | from tests.mixins import ReaderTestingMixIn
  9 | 
 10 | 
 11 | class TestSAMIReader(ReaderTestingMixIn):
 12 |     def setup_method(self):
 13 |         self.reader = SAMIReader()
 14 | 
 15 |     def test_positive_answer_for_detection(self, sample_sami):
 16 |         super().assert_positive_answer_for_detection(sample_sami)
 17 | 
 18 |     def test_negative_answer_for_detection_dfxp(self, sample_dfxp):
 19 |         super().assert_negative_answer_for_detection(sample_dfxp)
 20 | 
 21 |     def test_negative_answer_for_detection_microdvd(self, sample_microdvd):
 22 |         super().assert_negative_answer_for_detection(sample_microdvd)
 23 | 
 24 |     def test_negative_answer_for_detection_scc_pop_on(self, sample_scc_pop_on):
 25 |         super().assert_negative_answer_for_detection(sample_scc_pop_on)
 26 | 
 27 |     def test_negative_answer_for_detection_srt(self, sample_srt):
 28 |         super().assert_negative_answer_for_detection(sample_srt)
 29 | 
 30 |     def test_negative_answer_for_detection_webvtt(self, sample_webvtt):
 31 |         super().assert_negative_answer_for_detection(sample_webvtt)
 32 | 
 33 |     def test_caption_length(self, sample_sami):
 34 |         caption_set = self.reader.read(sample_sami)
 35 | 
 36 |         assert 7 == len(caption_set.get_captions("en-US"))
 37 | 
 38 |     def test_proper_timestamps(self, sample_sami):
 39 |         caption_set = self.reader.read(sample_sami)
 40 |         paragraph = caption_set.get_captions("en-US")[2]
 41 | 
 42 |         assert 17000000 == paragraph.start
 43 |         assert 18752000 == paragraph.end
 44 | 
 45 |     def test_missing_start(self, sample_sami_missing_start):
 46 |         with pytest.raises(CaptionReadTimingError) as exc_info:
 47 |             self.reader.read(sample_sami_missing_start)
 48 | 
 49 |         assert exc_info.value.args[0].startswith(
 50 |             "Missing start time on the following line: ")
 51 | 
 52 |     def test_6digit_color_code_from_6digit_input(self, sample_sami):
 53 |         caption_set = self.reader.read(sample_sami)
 54 |         p_style = caption_set.get_style("p")
 55 | 
 56 |         assert "#ffeedd" == p_style['color']
 57 | 
 58 |     def test_6digit_color_code_from_3digit_input(self, sample_sami):
 59 |         sample_sami = deepcopy(sample_sami)  # NOTE(review): no-op for str
 60 |         caption_set = self.reader.read(sample_sami.replace("#ffeedd", "#fed"))
 61 |         p_style = caption_set.get_style("p")
 62 | 
 63 |         assert "#ffeedd" == p_style['color']
 64 | 
 65 |     def test_invalid_color_code(self, sample_sami):
 66 |         with pytest.raises(CaptionReadSyntaxError) as exc_info:
 67 |             self.reader.read(sample_sami.replace("#ffeedd", "ffffff"))
 68 |         assert exc_info.value.args[0] == \
 69 |                "Invalid color value: ffffff. Check for missing # before hex " \
 70 |                "values or misspelled color values."
 71 | 
 72 |     def test_empty_file(self, sample_sami_empty):
 73 |         with pytest.raises(CaptionReadNoCaptions):
 74 |             self.reader.read(sample_sami_empty)
 75 | 
 76 |     def test_invalid_markup_is_properly_handled(self, sample_sami_syntax_error):
 77 |         caption_set = self.reader.read(sample_sami_syntax_error)
 78 | 
 79 |         assert 2 == len(caption_set.get_captions("en-US"))
 80 | 
 81 |     def test_partial_margins(self, sample_sami_partial_margins):
 82 |         caption_set = self.reader.read(sample_sami_partial_margins)
 83 |         # Ensure that undefined margins are converted to explicitly nil padding
 84 |         # (i.e. "0%")
 85 | 
 86 |         assert caption_set.layout_info.padding.to_xml_attribute() == \
 87 |                '0% 29pt 0% 29pt'
 88 | 
 89 |     def test_sami_with_bad_span_align(self, sample_sami_with_bad_span_align):
 90 |         caption_set = self.reader.read(sample_sami_with_bad_span_align)
 91 |         caption = caption_set.get_captions('en-US')[0]
 92 | 
 93 |         assert caption.layout_info.alignment.horizontal == \
 94 |                HorizontalAlignmentEnum.RIGHT
 95 | 
 96 |     def test_sami_with_bad_div_align(self, sample_sami_with_bad_div_align):
 97 |         caption_set = self.reader.read(sample_sami_with_bad_div_align)
 98 |         caption = caption_set.get_captions('en-US')[0]
 99 | 
100 |         assert caption.layout_info.alignment.horizontal == \
101 |                HorizontalAlignmentEnum.RIGHT
102 | 
103 |     def test_sami_with_p_align(self, sample_sami_with_p_align):
104 |         caption_set = self.reader.read(sample_sami_with_p_align)
105 |         caption = caption_set.get_captions('en-US')[0]
106 | 
107 |         assert caption.layout_info.alignment.horizontal == \
108 |                HorizontalAlignmentEnum.RIGHT
109 | 
110 |     def test_sami_with_p_and_span_align(self,
111 |                                         sample_sami_with_p_and_span_align):
112 |         """<span> align DOES NOT override <p> align if it is specified inline.
113 |         """
114 |         caption_set = self.reader.read(sample_sami_with_p_and_span_align)
115 |         caption = caption_set.get_captions('en-US')[0]
116 | 
117 |         assert caption.layout_info.alignment.horizontal == \
118 |                HorizontalAlignmentEnum.RIGHT
119 | 
120 |     def test_sami_with_invalid_inline_style(
121 |             self, sample_sami_with_invalid_inline_style):
122 |         caption_set = self.reader.read(sample_sami_with_invalid_inline_style)
123 |         caption = caption_set.get_captions("en-US")[0]
124 | 
125 |         assert caption.layout_info.alignment is None
126 | 
127 |     def test_sami_including_hexadecimal_charref(
128 |             self, sample_sami_including_hexadecimal_charref):
129 |         caption_set = self.reader.read(
130 |             sample_sami_including_hexadecimal_charref)
131 |         paragraph = caption_set.get_captions("en-US")[0]
132 | 
133 |         assert '> >' == paragraph.get_text()
134 | 
135 |     def test_sami_including_decimal_charref(
136 |             self, sample_sami_including_decimal_charref):
137 |         caption_set = self.reader.read(sample_sami_including_decimal_charref)
138 |         paragraph = caption_set.get_captions("en-US")[0]
139 | 
140 |         assert '> >' == paragraph.get_text()
141 | 
142 |     def test_sami_including_html5_entityref(
143 |             self, sample_sami_including_html5_entityref):
144 |         caption_set = self.reader.read(sample_sami_including_html5_entityref)
145 |         paragraph = caption_set.get_captions("en-US")[0]
146 | 
147 |         assert '&starf_&starf' == paragraph.get_text()
148 | 
149 |     def test_html_file(self):
150 |         with pytest.raises(CaptionReadSyntaxError) as exc_info:
151 |             self.reader.read("<html><head></head><body></body></html>")
152 |         assert exc_info.value.args[0] == 'SAMI File seems to be an HTML file.'
153 | 
154 |     def test_no_cc_available(self):
155 |         no_cc = 'no closed captioning available'
156 |         with pytest.raises(CaptionReadSyntaxError) as exc_info:
157 |             self.reader.read(f"<SAMI>{no_cc}</SAMI>")
158 |         assert exc_info.value.args[0] == f'SAMI File contains "{no_cc}"'
159 | 
160 |     def test_sami_with_unclosed_tag(self, sample_sami_with_unclosed_tag):
161 |         caption_set = self.reader.read(sample_sami_with_unclosed_tag)
162 |         paragraph = caption_set.get_captions("en-US")[0]
163 | 
164 |         assert '.' == paragraph.get_text()
165 | 
166 |     def test_sami_with_inline_lang(self, sample_sami_with_inline_lang):
167 |         caption_set = self.reader.read(sample_sami_with_inline_lang)
168 |         paragraph = caption_set.get_captions("en")[0]
169 | 
170 |         assert 'Inlined.' == paragraph.get_text()
171 | 
172 |     def test_proper_with_timestamps_with_multiple_paragraph(
173 |             self, sample_sami_with_multiple_p):
174 |         captions = self.reader.read(sample_sami_with_multiple_p)
175 |         paragraph_1 = captions.get_captions("en-US")[0]
176 |         paragraph_2 = captions.get_captions("en-US")[1]
177 | 
178 |         assert paragraph_1.start == paragraph_2.start
179 |         assert paragraph_1.end == paragraph_2.end
180 | 


--------------------------------------------------------------------------------
/tests/fixtures/webvtt.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | 
  4 | @pytest.fixture(scope="session")
  5 | def sample_webvtt():
  6 |     return """WEBVTT
  7 | 
  8 | 00:09.209 --> 00:12.312
  9 | ( clock ticking )
 10 | 
 11 | 00:14.848 --> 00:17.000
 12 | MAN:
 13 | When we think
 14 | ♪ ...say bow, wow, ♪
 15 | 
 16 | 00:17.000 --> 00:18.752
 17 | we have this vision of Einstein
 18 | 
 19 | 00:18.752 --> 00:20.887
 20 | as an old, wrinkly man
 21 | with white hair.
 22 | 
 23 | 00:20.887 --> 00:26.760
 24 | MAN 2:
 25 | E equals m c-squared is
 26 | not about an old Einstein.
 27 | 
 28 | 00:26.760 --> 00:32.200
 29 | MAN 2:
 30 | It's all about an eternal Einstein.
 31 | 
 32 | 00:32.200 --> 00:36.200
 33 | <LAUGHING & WHOOPS!>
 34 | """
 35 | 
 36 | 
 37 | @pytest.fixture(scope="session")
 38 | def sample_webvtt_from_dfxp():
 39 |     return """WEBVTT
 40 | 
 41 | 00:09.209 --> 00:12.312 align:start
 42 | ( clock ticking )
 43 | 
 44 | 00:14.848 --> 00:17.000 align:start
 45 | MAN:
 46 | When we think
 47 | ♪ ...say bow, wow, ♪
 48 | 
 49 | 00:17.000 --> 00:18.752 align:right
 50 | we have this vision of Einstein
 51 | 
 52 | 00:18.752 --> 00:20.887 align:start
 53 | &nbsp;
 54 | as an old, wrinkly man
 55 | with white hair.
 56 | 
 57 | 00:20.887 --> 00:26.760 align:start
 58 | MAN 2:
 59 | E equals m c-squared is
 60 | not about an old Einstein.
 61 | 
 62 | 00:26.760 --> 00:32.200 align:start
 63 | MAN 2:
 64 | It's all about an eternal Einstein.
 65 | 
 66 | 00:32.200 --> 00:36.200 align:start
 67 | &lt;LAUGHING &amp; WHOOPS!>
 68 | """
 69 | 
 70 | 
 71 | @pytest.fixture(scope="session")
 72 | def sample_webvtt_from_sami():
 73 |     return """WEBVTT
 74 | 
 75 | 00:09.209 --> 00:12.312
 76 | ( clock ticking )
 77 | 
 78 | 00:14.848 --> 00:17.000
 79 | MAN:
 80 | When we think
 81 | ♪ ...say bow, wow, ♪
 82 | 
 83 | 00:17.000 --> 00:18.752 align:right
 84 | we have this vision of Einstein
 85 | 
 86 | 00:18.752 --> 00:20.887
 87 | &nbsp;
 88 | as an old, wrinkly man
 89 | with white hair.
 90 | 
 91 | 00:20.887 --> 00:26.760
 92 | MAN 2:
 93 | E equals m c-squared is
 94 | not about an old Einstein.
 95 | 
 96 | 00:26.760 --> 00:32.200
 97 | MAN 2:
 98 | It's all about an eternal Einstein.
 99 | 
100 | 00:32.200 --> 00:36.200
101 | &lt;LAUGHING &amp; WHOOPS!>
102 | """
103 | 
104 | 
105 | @pytest.fixture(scope="session")
106 | def sample_webvtt_from_sami_with_style():
107 |     return """WEBVTT
108 | 
109 | 00:09.209 --> 00:12.312
110 | I <b>do</b> <i>not</i> want to go <u>home</u>.
111 | I don't like it <i><u><b>there</b></u></i>.
112 | """
113 | 
114 | 
115 | @pytest.fixture(scope="session")
116 | def sample_webvtt_from_sami_with_id_style():
117 |     return """WEBVTT
118 | 
119 | 00:09.209 --> 00:12.312
120 | <i>This is in italics.</i>
121 | 
122 | 00:14.848 --> 00:17.000
123 | <u>This is underlined.</u>
124 | 
125 | 00:17.000 --> 00:18.752
126 | <b>This is bold.</b>
127 | 
128 | 00:20.887 --> 00:26.760
129 | <b><i><u>This is everything together.</u></i></b>
130 | """
131 | 
132 | 
133 | @pytest.fixture(scope="session")
134 | def sample_webvtt_from_dfxp_with_style():
135 |     return """WEBVTT
136 | 
137 | 00:09.209 --> 00:12.312
138 | This is <i>italic</i>, <b>bold</b>, <u>underline</u>, <i><u><b>everything together in one tag</b></u></i>, and <u><b><i>nested</i></b></u>.
139 | """
140 | 
141 | 
142 | @pytest.fixture(scope="session")
143 | def sample_webvtt_keeps_positioning():
144 |     return """WEBVTT
145 | 
146 | 00:01.000 --> 00:03.000 align:start position:25% line:25% size:50%
147 | You might not remember us. We are a typical transparent region with centered text that has an outline.
148 | 
149 | 00:03.500 --> 00:05.000 align:right position:25% line:25% size:50%
150 | had personality.
151 | 
152 | 00:05.500 --> 00:07.000 align:left position:50% line:50% size:25%
153 | Hello there, children! Have you seen any visitors?
154 | 
155 | 00:07.500 --> 00:09.000 align:right position:25% line:75% size:25%
156 | This is
157 | the last cue
158 | """
159 | 
160 | 
161 | @pytest.fixture(scope="session")
162 | def sample_webvtt_from_dfxp_with_positioning_and_style():
163 |     return """WEBVTT
164 | 
165 | 00:01.000 --> 00:03.000 position:25% line:25% size:50%
166 | You might not remember us. We are a typical transparent region with centered text that has an outline.
167 | 
168 | 00:03.500 --> 00:05.000 align:right position:25% line:25% size:50%
169 | had <u>personality.</u>
170 | 
171 | 00:05.500 --> 00:07.000 align:left position:50% line:50% size:25%
172 | Hello there, children! Have you seen any visitors?
173 | 
174 | 00:07.500 --> 00:09.000 align:right position:25% line:75% size:25%
175 | This is
176 | the last cue
177 | """
178 | 
179 | 
180 | @pytest.fixture(scope="session")
181 | def sample_webvtt_from_srt():
182 |     return """WEBVTT
183 | 
184 | 00:09.209 --> 00:12.312
185 | ( clock ticking )
186 | 
187 | 00:14.848 --> 00:17.000
188 | MAN:
189 | When we think
190 | ♪ ...say bow, wow, ♪
191 | 
192 | 00:17.000 --> 00:18.752
193 | we have this vision of Einstein
194 | 
195 | 00:18.752 --> 00:20.887
196 | as an old, wrinkly man
197 | with white hair.
198 | 
199 | 00:20.887 --> 00:26.760
200 | MAN 2:
201 | E equals m c-squared is
202 | not about an old Einstein.
203 | 
204 | 00:26.760 --> 00:32.200
205 | MAN 2:
206 | It's all about an eternal Einstein.
207 | 
208 | 00:32.200 --> 00:36.200
209 | &lt;LAUGHING &amp; WHOOPS!>
210 | """
211 | 
212 | 
213 | # This is not equal to the input because we accept unescaped illegal characters
214 | # when reading (because many players do so) but escape them when writing
215 | # in order to conform to the specification.
216 | @pytest.fixture(scope="session")
217 | def sample_webvtt_from_webvtt(sample_webvtt_from_srt):
218 |     return sample_webvtt_from_srt
219 | 
220 | 
221 | @pytest.fixture(scope="session")
222 | def sample_webvtt_2():
223 |     return """WEBVTT
224 | 
225 | 1
226 | 00:00:00.000 --> 00:00:43.000
227 | - HELLO WORLD!
228 | 
229 | 2
230 | 00:00:59.000 --> 00:01:30.000
231 | - LOOKING GOOOOD.
232 | 
233 | 3
234 | 00:01:40.000 --> 00:02:00.000
235 | - HA HA HA!
236 | 
237 | 4
238 | 00:02:05.105 --> 00:03:07.007
239 | - HI. WELCOME TO SESAME STREET.
240 | 
241 | 5
242 | 00:04:07.007 --> 00:05:38.441
243 | ON TONIGHT'S SHOW...
244 | 
245 | 6
246 | 00:05:58.441 --> 00:06:40.543
247 | - I'M NOT GOING TO WATCH THIS.
248 | 
249 | 7
250 | 00:07:10.543 --> 00:07:51.711
251 | HEY. WATCH THIS.
252 | """
253 | 
254 | 
255 | @pytest.fixture(scope="session")
256 | def sample_webvtt_empty():
257 |     return """WEBVTT
258 | """
259 | 
260 | 
261 | @pytest.fixture(scope="session")
262 | def sample_webvtt_double_br():
263 |     return """WEBVTT
264 | 
265 | 00:14.848 --> 00:18.848
266 | MAN:
267 | &nbsp;
268 | When we think
269 | of "E equals m c-squared",
270 | """
271 | 
272 | 
273 | @pytest.fixture(scope="session")
274 | def sample_webvtt_output_long_cue():
275 |     return """\
276 | WEBVTT
277 | 
278 | 00:01.000 --> 00:02.000 align:start
279 | NARRATOR:
280 | 
281 | 00:02.000 --> 00:03.000 position:25% line:25% size:65%
282 | They built the largest, most incredible, wildest, craziest,
283 | 
284 | 00:03.000 --> 00:04.000 align:start
285 | most complex machine in history.
286 | """
287 | 
288 | 
289 | @pytest.fixture(scope="session")
290 | def webvtt_from_dfxp_with_conflicting_align():
291 |     return """WEBVTT
292 | 
293 | 00:04.537 --> 00:07.841
294 | IT'S WORD GIRL♫
295 | 
296 | 00:08.537 --> 00:10.841
297 | ♫WORD UP,
298 | IT'S WORD GIRL♫
299 | """
300 | 
301 | 
302 | @pytest.fixture(scope="session")
303 | def sample_webvtt_with_cue_settings():
304 |     return """\
305 | WEBVTT
306 | 
307 | 00:01.000 --> 00:06.000 align:center position:37% line:74%
308 | 37% 74% - NARRATOR:
309 | 
310 | 00:01.000 --> 00:06.000 this is invalid, but will also be kept
311 | They built the largest,
312 | """
313 | 
314 | 
315 | @pytest.fixture(scope="session")
316 | def sample_webvtt_from_scc_properly_writes_newlines_output():
317 |     return """\
318 | WEBVTT
319 | 
320 | 21:30.000 --> 21:34.000 align:left position:20% line:83% size:70%
321 | aa
322 | bb
323 | """
324 | 
325 | 
326 | @pytest.fixture(scope="session")
327 | def sample_webvtt_last_cue_zero_start():
328 |     return """WEBVTT
329 | 
330 | 00:00.000 --> 00:12.312
331 | ( clock ticking )"""
332 | 
333 | 
334 | @pytest.fixture(scope="session")
335 | def sample_webvtt_empty_cue():
336 |     return """WEBVTT
337 | 
338 | 1
339 | 00:00.000 --> 00:02.000
340 | 
341 | 00:04.000 --> 00:05.000
342 | Transcribed by Celestials
343 | """
344 | 
345 | 
346 | @pytest.fixture(scope="session")
347 | def sample_webvtt_multi_lang_en():
348 |     return """WEBVTT
349 | 
350 | 00:14.848 --> 00:18.848
351 | Butterfly.
352 | """
353 | 
354 | 
355 | @pytest.fixture(scope="session")
356 | def sample_webvtt_multi_lang_de():
357 |     return """WEBVTT
358 | 
359 | 00:14.848 --> 00:18.848
360 | Schmetterling.
361 | """
362 | 
363 | 
364 | @pytest.fixture(scope="session")
365 | def sample_webvtt_empty_cue_output():
366 |     return """\
367 | WEBVTT
368 | 
369 | 00:01.209 --> 00:02.312 align:start position:10% line:10% size:80%
370 | abc
371 | """
372 | 
373 | 
374 | @pytest.fixture(scope="session")
375 | def sample_webvtt_timestamps():  # NOTE(review): 2nd cue does have hours
376 |     return """WEBVTT
377 | 
378 | 01:01.001 --> 10:10.100
379 | Test zero padded and two digit timestamps without hours
380 | 
381 | 01:01:01.001 --> 10:10:10.100
382 | Test zero padded and two digit timestamps without hours"""
383 | 


--------------------------------------------------------------------------------
/tests/test_dfxp.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from pycaption import DFXPReader, CaptionReadNoCaptions
  4 | from pycaption.exceptions import (
  5 |     CaptionReadSyntaxError, CaptionReadError, CaptionReadTimingError,
  6 | )
  7 | from pycaption.geometry import (
  8 |     UnitEnum, HorizontalAlignmentEnum, VerticalAlignmentEnum,
  9 | )
 10 | from tests.mixins import ReaderTestingMixIn
 11 | 
 12 | 
 13 | class TestDFXPReader(ReaderTestingMixIn):
 14 |     def setup_class(self):  # NOTE(review): siblings use setup_method; confirm
 15 |         self.reader = DFXPReader()
 16 | 
 17 |     def test_positive_answer_for_detection(self, sample_dfxp):
 18 |         super().assert_positive_answer_for_detection(sample_dfxp)
 19 | 
 20 |     def test_negative_answer_for_microdvd(self, sample_microdvd):
 21 |         super().assert_negative_answer_for_detection(sample_microdvd)
 22 | 
 23 |     def test_negative_answer_for_sami(self, sample_sami):
 24 |         super().assert_negative_answer_for_detection(sample_sami)
 25 | 
 26 |     def test_negative_answer_for_scc_on_pop_on(self, sample_scc_pop_on):
 27 |         super().assert_negative_answer_for_detection(sample_scc_pop_on)
 28 | 
 29 |     def test_negative_answer_for_srt(self, sample_srt):
 30 |         super().assert_negative_answer_for_detection(sample_srt)
 31 | 
 32 |     def test_negative_answer_for_webvtt(self, sample_webvtt):
 33 |         super().assert_negative_answer_for_detection(sample_webvtt)
 34 | 
 35 |     def test_caption_length(self, sample_dfxp):
 36 |         captions = DFXPReader().read(sample_dfxp)
 37 | 
 38 |         assert 7 == len(captions.get_captions("en-US"))
 39 | 
 40 |     def test_proper_timestamps(self, sample_dfxp):
 41 |         captions = DFXPReader().read(sample_dfxp)
 42 |         paragraph = captions.get_captions("en-US")[2]
 43 | 
 44 |         assert 17000000 == paragraph.start
 45 |         assert 18752000 == paragraph.end
 46 | 
 47 |     def test_incorrect_time_format(self, sample_dfxp_incorrect_time_format):
 48 |         with pytest.raises(CaptionReadTimingError) as exc_info:
 49 |             DFXPReader().read(sample_dfxp_incorrect_time_format)
 50 | 
 51 |         assert exc_info.value.args[0].startswith("Invalid timestamp: 0:05.")
 52 | 
 53 |     def test_missing_begin(self, sample_dfxp_missing_begin):
 54 |         with pytest.raises(CaptionReadTimingError) as exc_info:
 55 |             DFXPReader().read(sample_dfxp_missing_begin)
 56 |         assert exc_info.value.args[0].startswith('Missing begin time on line ')
 57 | 
 58 |     def test_missing_end_and_dur(self, sample_dfxp_missing_end_and_dur):
 59 |         with pytest.raises(CaptionReadTimingError) as exc_info:
 60 |             DFXPReader().read(sample_dfxp_missing_end_and_dur)
 61 |         assert exc_info.value.args[0].startswith(
 62 |             'Missing end time or duration on line ')
 63 | 
 64 |     def test_convert_timestamp_to_microseconds(self):
 65 |         reader = DFXPReader()
 66 | 
 67 |         assert 1 == reader._convert_timestamp_to_microseconds("0.001ms")
 68 |         assert 2000 == reader._convert_timestamp_to_microseconds("2ms")
 69 |         assert 1000000 == reader._convert_timestamp_to_microseconds("1s")
 70 |         assert 1234567 == reader._convert_timestamp_to_microseconds("1.234567s")
 71 |         assert 180000000 == reader._convert_timestamp_to_microseconds("3m")
 72 |         assert 14400000000 == reader._convert_timestamp_to_microseconds("4h")
 73 |         assert 53333 == reader._convert_timestamp_to_microseconds("1.6f")
 74 |         # Tick values are not supported
 75 |         with pytest.raises(NotImplementedError):
 76 |             reader._convert_timestamp_to_microseconds("2.3t")
 77 | 
 78 |     @pytest.mark.parametrize('timestamp, microseconds', [
 79 |         ('12:23:34', 44614000000), ('23:34:45:56', 84886866666),
 80 |         ('34:45:56.7', 125156700000), ('13:24:35.67', 48275670000),
 81 |         ('24:35:46.456', 88546456000), ('1:23:34', 5014000000)])
 82 |     def test_clock_time(self, timestamp, microseconds):
 83 |         assert DFXPReader()._convert_timestamp_to_microseconds(
 84 |             timestamp) == microseconds
 85 | 
 86 |     @pytest.mark.parametrize('timestamp', [
 87 |         '1:1:11', '1:11:1', '1:11:11:1', '11:11:11:11.11', '11:11:11,11',
 88 |         '11.11.11.11', '11:11:11.', 'o1:11:11'])
 89 |     def test_invalid_timestamp(self, timestamp):
 90 |         with pytest.raises(CaptionReadTimingError) as exc_info:
 91 |             DFXPReader()._convert_timestamp_to_microseconds(timestamp)
 92 | 
 93 |     def test_empty_file(self, sample_dfxp_empty):
 94 |         with pytest.raises(CaptionReadNoCaptions):  # reading must fail
 95 |             DFXPReader().read(sample_dfxp_empty)
 96 | 
 97 |     def test_invalid_markup_is_properly_handled(self, sample_dfxp_syntax_error):
 98 |         captions = DFXPReader().read(sample_dfxp_syntax_error)
 99 | 
100 |         assert 2 == len(captions.get_captions("en"))
101 | 
102 |     def test_caption_error_for_invalid_positioning_values(
103 |             self, sample_dfxp_invalid_positioning_value_template):
104 |         invalid_value_dfxp = (
105 |             sample_dfxp_invalid_positioning_value_template.
106 |                 format(origin="px 5px")
107 |         )
108 |         with pytest.raises(CaptionReadSyntaxError):
109 |             DFXPReader().read(invalid_value_dfxp)
110 | 
111 |     def test_caption_error_for_invalid_or_unsupported_positioning_units(
112 |             self, sample_dfxp_invalid_positioning_value_template):
113 |         invalid_dfxp = sample_dfxp_invalid_positioning_value_template.format(
114 |             origin="6foo 7bar"
115 |         )
116 |         with pytest.raises(CaptionReadSyntaxError):
117 |             DFXPReader().read(invalid_dfxp)
118 | 
119 |     def test_individual_timings_of_captions_with_matching_timespec_are_kept(
120 |             self, sample_dfxp_multiple_captions_with_the_same_timing
121 |     ):
122 |         captionset = DFXPReader().read(
123 |             sample_dfxp_multiple_captions_with_the_same_timing
124 |         )
125 |         expected_timings = [(9209000, 12312000)] * 3
126 |         actual_timings = [(c_.start, c_.end) for c_ in
127 |                           captionset.get_captions('en-US')]
128 | 
129 |         assert expected_timings == actual_timings
130 | 
131 |     def test_individual_texts_of_captions_with_matching_timespec_are_kept(
132 |             self, sample_dfxp_multiple_captions_with_the_same_timing):
133 |         captionset = DFXPReader().read(
134 |             sample_dfxp_multiple_captions_with_the_same_timing
135 |         )
136 | 
137 |         expected_texts = ['Some text here',
138 |                           'Some text there',
139 |                           'Caption texts are everywhere!']
140 |         actual_texts = [c_.nodes[0].content for c_ in
141 |                         captionset.get_captions("en-US")]
142 | 
143 |         assert expected_texts == actual_texts
144 | 
145 |     def test_individual_layouts_of_captions_with_matching_timespec_are_kept(
146 |             self, sample_dfxp_multiple_captions_with_the_same_timing
147 |     ):
148 |         captionset = DFXPReader().read(
149 |             sample_dfxp_multiple_captions_with_the_same_timing
150 |         )
151 |         expected_layouts = [
152 |             (((10, UnitEnum.PERCENT), (10, UnitEnum.PERCENT)), None, None,
153 |              (HorizontalAlignmentEnum.START, VerticalAlignmentEnum.BOTTOM)),
154 |             (((40, UnitEnum.PERCENT), (40, UnitEnum.PERCENT)), None, None,
155 |              (HorizontalAlignmentEnum.START, VerticalAlignmentEnum.BOTTOM)),
156 |             (((10, UnitEnum.PERCENT), (70, UnitEnum.PERCENT)), None, None,
157 |              (HorizontalAlignmentEnum.START, VerticalAlignmentEnum.BOTTOM))]
158 |         actual_layouts = [c_.layout_info.serialized() for c_ in
159 |                           captionset.get_captions('en-US')]
160 | 
161 |         assert expected_layouts == actual_layouts
162 | 
163 |     def test_properly_converts_timing(
164 |             self, sample_dfxp_with_alternative_timing_formats):
165 |         caption_set = DFXPReader().read(
166 |             sample_dfxp_with_alternative_timing_formats)
167 |         caps = caption_set.get_captions('en-US')
168 | 
169 |         assert caps[0].start == 1900000
170 |         assert caps[0].end == 3050000
171 |         assert caps[1].start == 4000000
172 |         assert caps[1].end == 5200000
173 | 
174 |     def test_empty_paragraph(self, sample_dfxp_empty_paragraph):
175 |         try:  # reading the sample must not raise a CaptionReadError
176 |             DFXPReader().read(sample_dfxp_empty_paragraph)
177 |         except CaptionReadError:
178 |             pytest.fail("Failing on empty paragraph")
179 | 
180 |     def test_only_spaces_paragraph(self, sample_dfxp_only_spaces_paragraph):
181 |         caption_set = DFXPReader().read(sample_dfxp_only_spaces_paragraph)
182 |         caps = caption_set.get_captions('en-US')
183 | 
184 |         assert len(caps) == 1
185 | 
186 |     def test_properly_converts_frametiming(self, sample_dfxp_with_frame_timing):
187 |         caption_set = DFXPReader().read(sample_dfxp_with_frame_timing)
188 |         caps = caption_set.get_captions('en-US')
189 | 
190 |         assert caps[0].end == 12233333
191 |         assert caps[0].start == 9666666
192 | 
193 |     def test_empty_cue(self, sample_dfxp_empty_cue):
194 |         caption_set = DFXPReader().read(sample_dfxp_empty_cue)
195 |         caps = caption_set.get_captions('en-US')
196 | 
197 |         assert len(caps) == 1
198 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # pycaption documentation build configuration file, created by
  3 | # sphinx-quickstart on Thu Feb 12 12:18:37 2015.
  4 | #
  5 | # This file is execfile()d with the current directory set to its
  6 | # containing dir.
  7 | #
  8 | # Note that not all possible configuration values are present in this
  9 | # autogenerated file.
 10 | #
 11 | # All configuration values have a default; values that are commented out
 12 | # serve to show the default.
 13 | 
 14 | import sphinx_rtd_theme
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | #
 20 | # import os
 21 | # import sys
 22 | # sys.path.insert(0, os.path.abspath('.'))
 23 | 
 24 | # -- General configuration ------------------------------------------------
 25 | 
 26 | # If your documentation needs a minimal Sphinx version, state it here.
 27 | # needs_sphinx = '1.0'
 28 | 
 29 | # Add any Sphinx extension module names here, as strings. They can be
 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 31 | # ones.
 32 | extensions = []
 33 | 
 34 | # Add any paths that contain templates here, relative to this directory.
 35 | templates_path = ['_templates']
 36 | 
 37 | # The suffix of source filenames.
 38 | source_suffix = '.rst'
 39 | 
 40 | # The encoding of source files.
 41 | # source_encoding = 'utf-8-sig'
 42 | 
 43 | # The master toctree document.
 44 | master_doc = 'index'
 45 | 
 46 | # General information about the project.
 47 | project = 'pycaption'
 48 | copyright = '2012-2025, PBS.org ' \
 49 |             '(available under the Apache License, Version 2.0)'
 50 | 
 51 | # The version info for the project you're documenting, acts as replacement for
 52 | # |version| and |release|, also used in various other places throughout the
 53 | # built documents.
 54 | #
 55 | # The short X.Y version.
 56 | version = '2.2.19'
 57 | # The full version, including alpha/beta/rc tags.
 58 | release = '2.2.19'
 59 | 
 60 | # The language for content autogenerated by Sphinx. Refer to documentation
 61 | # for a list of supported languages.
 62 | # language = None
 63 | 
 64 | # There are two options for replacing |today|: either, you set today to some
 65 | # non-false value, then it is used:
 66 | # today = ''
 67 | # Else, today_fmt is used as the format for a strftime call.
 68 | # today_fmt = '%B %d, %Y'
 69 | 
 70 | # List of patterns, relative to source directory, that match files and
 71 | # directories to ignore when looking for source files.
 72 | exclude_patterns = ['_build']
 73 | 
 74 | # The reST default role (used for this markup: `text`) to use for all
 75 | # documents.
 76 | # default_role = None
 77 | 
 78 | # If true, '()' will be appended to :func: etc. cross-reference text.
 79 | # add_function_parentheses = True
 80 | 
 81 | # If true, the current module name will be prepended to all description
 82 | # unit titles (such as .. function::).
 83 | # add_module_names = True
 84 | 
 85 | # If true, sectionauthor and moduleauthor directives will be shown in the
 86 | # output. They are ignored by default.
 87 | # show_authors = False
 88 | 
 89 | # The name of the Pygments (syntax highlighting) style to use.
 90 | pygments_style = 'sphinx'
 91 | 
 92 | # A list of ignored prefixes for module index sorting.
 93 | # modindex_common_prefix = []
 94 | 
 95 | # If true, keep warnings as "system message" paragraphs in the built documents.
 96 | # keep_warnings = False
 97 | 
 98 | 
 99 | # -- Options for HTML output ----------------------------------------------
100 | 
101 | # The theme to use for HTML and HTML Help pages.  See the documentation for
102 | # a list of builtin themes.
103 | # html_theme = 'sphinx_rtd_theme'
104 | html_theme = "sphinx_rtd_theme"
105 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
106 | 
107 | # Theme options are theme-specific and customize the look and feel of a theme
108 | # further.  For a list of options available for each theme, see the
109 | # documentation.
110 | # html_theme_options = {}
111 | 
112 | # Add any paths that contain custom themes here, relative to this directory.
113 | # html_theme_path = []
114 | 
115 | # The name for this set of Sphinx documents.  If None, it defaults to
116 | # "<project> v<release> documentation".
117 | # html_title = None
118 | 
119 | # A shorter title for the navigation bar.  Default is the same as html_title.
120 | # html_short_title = None
121 | 
122 | # The name of an image file (relative to this directory) to place at the top
123 | # of the sidebar.
124 | # html_logo = None
125 | 
126 | # The name of an image file (within the static path) to use as favicon of the
127 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
128 | # pixels large.
129 | # html_favicon = None
130 | 
131 | # Add any paths that contain custom static files (such as style sheets) here,
132 | # relative to this directory. They are copied after the builtin static files,
133 | # so a file named "default.css" will overwrite the builtin "default.css".
134 | html_static_path = ['_static']
135 | 
136 | # Add any extra paths that contain custom files (such as robots.txt or
137 | # .htaccess) here, relative to this directory. These files are copied
138 | # directly to the root of the documentation.
139 | # html_extra_path = []
140 | 
141 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
142 | # using the given strftime format.
143 | # html_last_updated_fmt = '%b %d, %Y'
144 | 
145 | # If true, SmartyPants will be used to convert quotes and dashes to
146 | # typographically correct entities.
147 | # html_use_smartypants = True
148 | 
149 | # Custom sidebar templates, maps document names to template names.
150 | # html_sidebars = {}
151 | 
152 | # Additional templates that should be rendered to pages, maps page names to
153 | # template names.
154 | # html_additional_pages = {}
155 | 
156 | # If false, no module index is generated.
157 | # html_domain_indices = True
158 | 
159 | # If false, no index is generated.
160 | # html_use_index = True
161 | 
162 | # If true, the index is split into individual pages for each letter.
163 | # html_split_index = False
164 | 
165 | # If true, links to the reST sources are added to the pages.
166 | # html_show_sourcelink = True
167 | 
168 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
169 | # html_show_sphinx = True
170 | 
171 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
172 | # html_show_copyright = True
173 | 
174 | # If true, an OpenSearch description file will be output, and all pages will
175 | # contain a <link> tag referring to it.  The value of this option must be the
176 | # base URL from which the finished HTML is served.
177 | # html_use_opensearch = ''
178 | 
179 | # This is the file name suffix for HTML files (e.g. ".xhtml").
180 | # html_file_suffix = None
181 | 
182 | # Output file base name for HTML help builder.
183 | htmlhelp_basename = 'pycaptiondoc'
184 | 
185 | 
186 | # -- Options for LaTeX output ---------------------------------------------
187 | 
188 | latex_elements = {
189 |     #  The paper size ('letterpaper' or 'a4paper').
190 |     # 'papersize': 'letterpaper',
191 | 
192 |     #  The font size ('10pt', '11pt' or '12pt').
193 |     # 'pointsize': '10pt',
194 | 
195 |     #  Additional stuff for the LaTeX preamble.
196 |     # 'preamble': '',
197 | }
198 | 
199 | # Grouping the document tree into LaTeX files. List of tuples
200 | # (source start file, target name, title,
201 | #  author, documentclass [howto, manual, or own class]).
202 | latex_documents = [
203 |     ('index', 'pycaption.tex', 'pycaption Documentation',
204 |      'PBS', 'manual'),
205 | ]
206 | 
207 | # The name of an image file (relative to this directory) to place at the top of
208 | # the title page.
209 | # latex_logo = None
210 | 
211 | # For "manual" documents, if this is true, then toplevel headings are parts,
212 | # not chapters.
213 | # latex_use_parts = False
214 | 
215 | # If true, show page references after internal links.
216 | # latex_show_pagerefs = False
217 | 
218 | # If true, show URL addresses after external links.
219 | # latex_show_urls = False
220 | 
221 | # Documents to append as an appendix to all manuals.
222 | # latex_appendices = []
223 | 
224 | # If false, no module index is generated.
225 | # latex_domain_indices = True
226 | 
227 | 
228 | # -- Options for manual page output ---------------------------------------
229 | 
230 | # One entry per manual page. List of tuples
231 | # (source start file, name, description, authors, manual section).
232 | man_pages = [
233 |     ('index', 'pycaption', 'pycaption Documentation',
234 |      ['PBS'], 1)
235 | ]
236 | 
237 | # If true, show URL addresses after external links.
238 | # man_show_urls = False
239 | 
240 | 
241 | # -- Options for Texinfo output -------------------------------------------
242 | 
243 | # Grouping the document tree into Texinfo files. List of tuples
244 | # (source start file, target name, title, author,
245 | #  dir menu entry, description, category)
246 | texinfo_documents = [
247 |     ('index', 'pycaption', 'pycaption Documentation',
248 |      'PBS', 'pycaption', 'One line description of project.',
249 |      'Miscellaneous'),
250 | ]
251 | 
252 | # Documents to append as an appendix to all manuals.
253 | # texinfo_appendices = []
254 | 
255 | # If false, no module index is generated.
256 | # texinfo_domain_indices = True
257 | 
258 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
259 | # texinfo_show_urls = 'footnote'
260 | 
261 | # If true, do not generate a @detailmenu in the "Top" node's menu.
262 | # texinfo_no_detailmenu = False
263 | 


--------------------------------------------------------------------------------
/pycaption/dfxp/extras.py:
--------------------------------------------------------------------------------
  1 | # We thought about making pycaption.base objects immutable. This would be nice
  2 | # in a lot of cases, but since the transformations on them could be quite
  3 | # complex, the deepcopy method is good enough sometimes.
  4 | from copy import deepcopy
  5 | from xml.sax.saxutils import escape
  6 | 
  7 | from bs4 import BeautifulSoup
  8 | 
  9 | from .base import DFXPWriter, DFXP_DEFAULT_REGION
 10 | from ..base import BaseWriter, CaptionNode, merge_concurrent_captions
 11 | 
 12 | LEGACY_DFXP_BASE_MARKUP = '''
 13 | <tt xmlns="http://www.w3.org/ns/ttml"
 14 |     xmlns:tts="http://www.w3.org/ns/ttml#styling">
 15 |     <head>
 16 |         <styling/>
 17 |         <layout/>
 18 |     </head>
 19 |     <body/>
 20 | </tt>
 21 | '''
 22 | # Style written under LEGACY_DFXP_DEFAULT_STYLE_ID when a set has no styles.
 23 | LEGACY_DFXP_DEFAULT_STYLE = {
 24 |     'color': 'white',
 25 |     'font-family': 'monospace',
 26 |     'font-size': '1c',
 27 | }
 28 | # xml:id values of the default style and of the single region that is used.
 29 | LEGACY_DFXP_DEFAULT_STYLE_ID = 'default'
 30 | LEGACY_DFXP_DEFAULT_REGION_ID = 'bottom'
 31 | # Attributes of the region that every caption is assigned to.
 32 | LEGACY_DFXP_DEFAULT_REGION = {
 33 |     'text-align': 'center',
 34 |     'display-align': 'after'
 35 | }
 36 | 
 37 | 
 38 | class SinglePositioningDFXPWriter(DFXPWriter):
 39 |     """
 40 |     A dfxp writer, that ignores all positioning, using a single provided value
 41 |     """
 42 |     def __init__(self, default_positioning=DFXP_DEFAULT_REGION,
 43 |                  *args, **kwargs):
 44 |         super().__init__(*args, **kwargs)
 45 |         self.default_positioning = default_positioning
 46 | 
 47 |     def write(self, captions_set, force=''):
 48 |         """Writes a DFXP file using the positioning provided in the initializer
 49 | 
 50 |         :type captions_set: pycaption.base.CaptionSet
 51 |         :param force: only write this language, if available in the CaptionSet
 52 |         :rtype: str
 53 |         """
 54 |         captions_set = self._create_single_positioning_caption_set(
 55 |             captions_set, self.default_positioning)
 56 | 
 57 |         return super().write(captions_set, force)  # noqa
 58 | 
 59 |     @staticmethod
 60 |     def _create_single_positioning_caption_set(caption_set, positioning):
 61 |         """Return a caption where all the positioning information was
 62 |         replaced from positioning
 63 | 
 64 |         :type caption_set: pycaption.base.CaptionSet
 65 |         :rtype: pycaption.base.CaptionSet
 66 |         """
 67 |         # If SinglePositioningDFXPWriter would modify the state of the caption
 68 |         # set, any writer using the same caption_set thereafter would be
 69 |         # affected. At the moment we know we don't use any other writers, but
 70 |         # this is important and mustn't be neglected
 71 |         caption_set = deepcopy(caption_set)
 72 |         caption_set = merge_concurrent_captions(caption_set)
 73 |         caption_set.layout_info = positioning
 74 | 
 75 |         for lang in caption_set.get_languages():
 76 |             caption_set.set_layout_info(lang, positioning)
 77 | 
 78 |             caption_list = caption_set.get_captions(lang)
 79 |             for caption in caption_list:
 80 |                 caption.layout_info = positioning
 81 | 
 82 |                 for node in caption.nodes:
 83 |                     if hasattr(node, 'layout_info'):
 84 |                         node.layout_info = positioning
 85 | 
 86 |         for _, style in caption_set.get_styles():
 87 |             if 'text-align' in style:
 88 |                 style.pop('text-align')
 89 | 
 90 |         return caption_set
 91 | 
 92 | 
 93 | class LegacyDFXPWriter(BaseWriter):
 94 |     """Ported the legacy DFXPWriter from 0.4.5"""
 95 |     def __init__(self, *args, **kw):
 96 |         self.p_style = False
 97 |         self.open_span = False
 98 | 
 99 |     def write(self, caption_set, force=''):
100 |         caption_set = deepcopy(caption_set)
101 |         caption_set = merge_concurrent_captions(caption_set)
102 | 
103 |         dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, 'lxml-xml')
104 |         dfxp.find('tt')['xml:lang'] = "en"
105 | 
106 |         for style_id, style in caption_set.get_styles():
107 |             if style != {}:
108 |                 dfxp = self._recreate_styling_tag(style_id, style, dfxp)
109 |         if not caption_set.get_styles():
110 |             dfxp = self._recreate_styling_tag(
111 |                 LEGACY_DFXP_DEFAULT_STYLE_ID, LEGACY_DFXP_DEFAULT_STYLE, dfxp)
112 | 
113 |         # XXX For now we will always use this default region. In the future if
114 |         # regions are provided, they will be kept
115 |         dfxp = self._recreate_region_tag(
116 |             LEGACY_DFXP_DEFAULT_REGION_ID, LEGACY_DFXP_DEFAULT_REGION, dfxp)
117 | 
118 |         body = dfxp.find('body')
119 | 
120 |         if force:
121 |             langs = [self._force_language(force, caption_set.get_languages())]
122 |         else:
123 |             langs = caption_set.get_languages()
124 | 
125 |         for lang in langs:
126 |             div = dfxp.new_tag('div')
127 |             div['xml:lang'] = lang
128 | 
129 |             for caption in caption_set.get_captions(lang):
130 |                 if caption.style:
131 |                     caption_style = caption.style
132 |                     caption_style.update(
133 |                         {'region': LEGACY_DFXP_DEFAULT_REGION_ID})
134 |                 else:
135 |                     caption_style = {'class': LEGACY_DFXP_DEFAULT_STYLE_ID,
136 |                                      'region': LEGACY_DFXP_DEFAULT_REGION_ID}
137 |                 p = self._recreate_p_tag(caption, caption_style, dfxp)
138 |                 div.append(p)
139 | 
140 |             body.append(div)
141 | 
142 |         caption_content = dfxp.prettify(formatter=None)
143 |         return caption_content
144 | 
145 |     # force the DFXP to only have one language, trying to match on "force"
146 |     def _force_language(self, force, langs):
147 |         for lang in langs:
148 |             if force == lang:
149 |                 return lang
150 | 
151 |         return langs[-1]
152 | 
153 |     def _recreate_region_tag(self, region_id, styling, dfxp):
154 |         dfxp_region = dfxp.new_tag('region')
155 |         dfxp_region.attrs.update({'xml:id': region_id})
156 | 
157 |         attributes = self._recreate_style(styling, dfxp)
158 |         dfxp_region.attrs.update(attributes)
159 | 
160 |         new_tag = dfxp.new_tag('region')
161 |         new_tag.attrs.update({'xml:id': region_id})
162 |         if dfxp_region != new_tag:
163 |             dfxp.find('layout').append(dfxp_region)
164 |         return dfxp
165 | 
166 |     def _recreate_styling_tag(self, style, content, dfxp):
167 |         dfxp_style = dfxp.new_tag('style')
168 |         dfxp_style.attrs.update({'xml:id': style})
169 | 
170 |         attributes = self._recreate_style(content, dfxp)
171 |         dfxp_style.attrs.update(attributes)
172 | 
173 |         new_tag = dfxp.new_tag('style')
174 |         new_tag.attrs.update({'xml:id': style})
175 |         if dfxp_style != new_tag:
176 |             dfxp.find('styling').append(dfxp_style)
177 | 
178 |         return dfxp
179 | 
180 |     def _recreate_p_tag(self, caption, caption_style, dfxp):
181 |         start = caption.format_start()
182 |         end = caption.format_end()
183 |         p = dfxp.new_tag("p", begin=start, end=end)
184 |         p.string = self._recreate_text(caption, dfxp)
185 | 
186 |         if dfxp.find("style", {"xml:id": "p"}):
187 |             p['style'] = 'p'
188 | 
189 |         p.attrs.update(self._recreate_style(caption_style, dfxp))
190 | 
191 |         return p
192 | 
193 |     def _recreate_text(self, caption, dfxp):
194 |         line = ''
195 | 
196 |         for node in caption.nodes:
197 |             if node.type_ == CaptionNode.TEXT:
198 |                 line += escape(node.content) + ' '
199 | 
200 |             elif node.type_ == CaptionNode.BREAK:
201 |                 line = line.rstrip() + '<br/>\n    '
202 | 
203 |             elif node.type_ == CaptionNode.STYLE:
204 |                 line = self._recreate_span(line, node, dfxp)
205 | 
206 |         return line.rstrip()
207 | 
208 |     def _recreate_span(self, line, node, dfxp):
209 |         if node.start:
210 |             styles = ''
211 | 
212 |             content_with_style = self._recreate_style(node.content, dfxp)
213 |             for style, value in list(content_with_style.items()):
214 |                 styles += f' {style}="{value}"'
215 | 
216 |             if styles:
217 |                 if self.open_span:
218 |                     line = line.rstrip() + '</span> '
219 |                 line += f'<span{styles}>'
220 |                 self.open_span = True
221 | 
222 |         elif self.open_span:
223 |             line = line.rstrip() + '</span> '
224 |             self.open_span = False
225 | 
226 |         return line
227 | 
228 |     def _recreate_style(self, content, dfxp):
229 |         dfxp_style = {}
230 | 
231 |         if 'region' in content:
232 |             if dfxp.find('region', {'xml:id': content['region']}):
233 |                 dfxp_style['region'] = content['region']
234 |         if 'class' in content:
235 |             if dfxp.find("style", {"xml:id": content['class']}):
236 |                 dfxp_style['style'] = content['class']
237 |         if 'text-align' in content:
238 |             dfxp_style['tts:textAlign'] = content['text-align']
239 |         if 'italics' in content:
240 |             dfxp_style['tts:fontStyle'] = 'italic'
241 |         if 'font-family' in content:
242 |             dfxp_style['tts:fontFamily'] = content['font-family']
243 |         if 'font-size' in content:
244 |             dfxp_style['tts:fontSize'] = content['font-size']
245 |         if 'color' in content:
246 |             dfxp_style['tts:color'] = content['color']
247 |         if 'display-align' in content:
248 |             dfxp_style['tts:displayAlign'] = content['display-align']
249 | 
250 |         return dfxp_style
251 | 


--------------------------------------------------------------------------------
/tests/fixtures/translated_scc.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | 
  4 | @pytest.fixture(scope="session")
  5 | def sample_translated_scc_success():  # every translated token is in [ ]
  6 |     return """Scenarist_SCC V1.0
  7 | 
  8 | 00:00:09:05 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [( ] [cl] [oc] [k ] [ti] [ck] [in] [g ] [)] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption]
  9 | 
 10 | 00:00:12:08 [Erase Displayed Memory] [Erase Displayed Memory]
 11 | 
 12 | 00:00:13:18 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 13, column 00, with plain white text.] [row 13, column 00, with plain white text.] [MA] [N:] [row 14, column 00, with plain white text.] [row 14, column 00, with plain white text.] [Wh] [en] [ w] [e ] [th] [in] [k] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [of] [ "] [E ] [eq] [ua] [ls] [ m] [ c] [-s] [qu] [ar] [ed] [",] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption]
 13 | 
 14 | 00:00:16:03 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [we] [ h] [av] [e ] [th] [is] [ v] [is] [io] [n ] [of] [ E] [in] [st] [ei] [n] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption]
 15 | 
 16 | 00:00:17:20 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 14, column 00, with plain white text.] [row 14, column 00, with plain white text.] [as] [ a] [n ] [ol] [d,] [ w] [ri] [nk] [ly] [ m] [an] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [wi] [th] [ w] [hi] [te] [ h] [ai] [r.] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption]
 17 | 
 18 | 00:00:19:13 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 13, column 00, with plain white text.] [row 13, column 00, with plain white text.] [MA] [N ] [2:] [row 14, column 00, with plain white text.] [row 14, column 00, with plain white text.] [E ] [eq] [ua] [ls] [ m] [ c] [-s] [qu] [ar] [ed] [ i] [s] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [no] [t ] [ab] [ou] [t ] [an] [ o] [ld] [ E] [in] [st] [ei] [n.] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption]
 19 | 
 20 | 00:00:25:16 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 13, column 00, with plain white text.] [row 13, column 00, with plain white text.] [MA] [N ] [2:] [row 14, column 00, with plain white text.] [row 14, column 00, with plain white text.] [It] ['s] [ a] [ll] [ a] [bo] [ut] [ a] [n ] [et] [er] [na] [l] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [Ei] [ns] [te] [in] [.] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption]
 21 | 
 22 | 00:00:31:15 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [<L] [AU] [GH] [IN] [G ] [& ] [WH] [OO] [PS] [!>] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption]
 23 | 
 24 | 00:00:36:04 [Erase Displayed Memory] [Erase Displayed Memory]
 25 | 
 26 | """
 27 | 
 28 | 
 29 | @pytest.fixture(scope="session")
 30 | def sample_translated_scc_no_brackets():  # tokens are not wrapped at all
 31 |     return """Scenarist_SCC V1.0
 32 | 
 33 | 00:00:09:05 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 15, column 00, with plain white text. row 15, column 00, with plain white text. (  cl oc k  ti ck in g  ) Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption
 34 | 
 35 | 00:00:12:08 Erase Displayed Memory Erase Displayed Memory
 36 | 
 37 | 00:00:13:18 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 13, column 00, with plain white text. row 13, column 00, with plain white text. MA N: row 14, column 00, with plain white text. row 14, column 00, with plain white text. Wh en  w e  th in k row 15, column 00, with plain white text. row 15, column 00, with plain white text. of  " E  eq ua ls  m  c -s qu ar ed ", Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption
 38 | 
 39 | 00:00:16:03 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 15, column 00, with plain white text. row 15, column 00, with plain white text. we  h av e  th is  v is io n  of  E in st ei n Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption
 40 | 
 41 | 00:00:17:20 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 14, column 00, with plain white text. row 14, column 00, with plain white text. as  a n  ol d,  w ri nk ly  m an row 15, column 00, with plain white text. row 15, column 00, with plain white text. wi th  w hi te  h ai r. Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption
 42 | 
 43 | 00:00:19:13 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 13, column 00, with plain white text. row 13, column 00, with plain white text. MA N  2: row 14, column 00, with plain white text. row 14, column 00, with plain white text. E  eq ua ls  m  c -s qu ar ed  i s row 15, column 00, with plain white text. row 15, column 00, with plain white text. no t  ab ou t  an  o ld  E in st ei n. Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption
 44 | 
 45 | 00:00:25:16 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 13, column 00, with plain white text. row 13, column 00, with plain white text. MA N  2: row 14, column 00, with plain white text. row 14, column 00, with plain white text. It 's  a ll  a bo ut  a n  et er na l row 15, column 00, with plain white text. row 15, column 00, with plain white text. Ei ns te in . Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption
 46 | 
 47 | 00:00:31:15 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 15, column 00, with plain white text. row 15, column 00, with plain white text. <L AU GH IN G  &  WH OO PS !> Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption
 48 | 
 49 | 00:00:36:04 Erase Displayed Memory Erase Displayed Memory
 50 | 
 51 | """
 52 | 
 53 | 
 54 | @pytest.fixture(scope="session")
 55 | def sample_translated_scc_commands_not_found():
 56 |     return """Scenarist_SCC V1.0
 57 | 
 58 | 00:04:36;06 942x 942x 942x 942x [row 01, column 12, with plain white text.] [MA] [Ä] 525x c8cx ba8x
 59 | """
 60 | 
 61 | 
@pytest.fixture(scope="session")
def sample_translated_scc_custom_brackets():
    """Translated SCC sample where every item is wrapped in ``{`` and ``}``.

    Each command description and each text chunk is enclosed in curly
    braces; whitespace inside the braces is part of the chunk.
    """
    return """Scenarist_SCC V1.0

00:00:09:05 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {( } {cl} {oc} {k } {ti} {ck} {in} {g } {)} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption}

00:00:12:08 {Erase Displayed Memory} {Erase Displayed Memory}

00:00:13:18 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 13, column 00, with plain white text.} {row 13, column 00, with plain white text.} {MA} {N:} {row 14, column 00, with plain white text.} {row 14, column 00, with plain white text.} {Wh} {en} { w} {e } {th} {in} {k} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {of} { "} {E } {eq} {ua} {ls} { m} { c} {-s} {qu} {ar} {ed} {",} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption}

00:00:16:03 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {we} { h} {av} {e } {th} {is} { v} {is} {io} {n } {of} { E} {in} {st} {ei} {n} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption}

00:00:17:20 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 14, column 00, with plain white text.} {row 14, column 00, with plain white text.} {as} { a} {n } {ol} {d,} { w} {ri} {nk} {ly} { m} {an} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {wi} {th} { w} {hi} {te} { h} {ai} {r.} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption}

00:00:19:13 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 13, column 00, with plain white text.} {row 13, column 00, with plain white text.} {MA} {N } {2:} {row 14, column 00, with plain white text.} {row 14, column 00, with plain white text.} {E } {eq} {ua} {ls} { m} { c} {-s} {qu} {ar} {ed} { i} {s} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {no} {t } {ab} {ou} {t } {an} { o} {ld} { E} {in} {st} {ei} {n.} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption}

00:00:25:16 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 13, column 00, with plain white text.} {row 13, column 00, with plain white text.} {MA} {N } {2:} {row 14, column 00, with plain white text.} {row 14, column 00, with plain white text.} {It} {'s} { a} {ll} { a} {bo} {ut} { a} {n } {et} {er} {na} {l} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {Ei} {ns} {te} {in} {.} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption}

00:00:31:15 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {<L} {AU} {GH} {IN} {G } {& } {WH} {OO} {PS} {!>} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption}

00:00:36:04 {Erase Displayed Memory} {Erase Displayed Memory}

"""
 85 | 
 86 | 
@pytest.fixture(scope="session")
def sample_translated_scc_special_and_extended_characters():
    """Translated SCC sample made up of bracketed non-ASCII/symbol characters.

    Every timestamped line lists single characters in square brackets.
    Some lines end with a trailing space, and the backslash entry is
    escaped in this literal; both must be kept as written.
    """
    return """Scenarist_SCC V1.0

00:00:16;29 [ ] [®] [°] [½] [¿] [™] [¢] [£] 

00:04:36;06 [♪] [à] [ ] [è] [â] [ê] [î] [ô] [û]

00:08:00;00 [É] [Ó] [Ú] [Ü] [ü] [‘] [¡] [*] [’] [—] [©]

00:12:00;23 [℠] [•] [“] [”] [À] [Â] [Ç] [È] [Ê] [Ë] [ë] [Î] [Ï] 

00:16:24;11 [ï] [Ô] [Ù] [ù] [Û] [«] [»] [Ã] [ã] [Í] [Ì] [ì] [Ò]

00:20:19;12 [ò] [Õ] [õ] [{] [}] [\\] [^] [_] [¦] [~] [Ä] [ä] [Ö]

00:24:39;28 [ö] [ß] [¥] [¤] [|] [Å] [å] [Ø] [ø] [┌] [┐] [└] [┘]
"""
105 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright (c) 2012-2025 PBS.org
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/docs/introduction.rst:
--------------------------------------------------------------------------------
  1 | Introduction
  2 | ============
  3 | 
  4 | ``pycaption`` is a caption reading/writing module. Use one of the given
  5 | Readers to read content into a CaptionSet object,
  6 | and then use one of the Writers to output the CaptionSet into
  7 | captions of your desired format.
  8 | 
  9 | Turn a caption into multiple caption outputs:
 10 | 
 11 | ::
 12 | 
 13 |     srt_caps = '''1
 14 |     00:00:09,209 --> 00:00:12,312
 15 |     This is an example SRT file,
 16 |     which, while extremely short,
 17 |     is still a valid SRT file.
 18 |     '''
 19 | 
 20 |     converter = CaptionConverter()
 21 |     converter.read(srt_caps, SRTReader())
 22 |     print(converter.write(SAMIWriter()))
 23 |     print(converter.write(DFXPWriter()))
 24 |     print(converter.write(pycaption.transcript.TranscriptWriter()))
 25 |     print(converter.write(MicroDVDWriter()))
 26 | 
 27 | Not sure what format the caption is in? Detect it:
 28 | 
 29 | ::
 30 | 
 31 |     from pycaption import detect_format
 32 | 
 33 |     caps = '''1
 34 |     00:00:01,500 --> 00:00:12,345
 35 |     Small caption'''
 36 | 
 37 |     reader = detect_format(caps)
 38 |     if reader:
 39 |         print(SAMIWriter().write(reader().read(caps)))
 40 | 
 41 | Or if you expect to have only a subset of the supported input formats:
 42 | 
 43 | ::
 44 | 
 45 |     caps = '''1
 46 |     00:00:01,500 --> 00:00:12,345
 47 |     Small caption'''
 48 | 
 49 |     if SRTReader().detect(caps):
 50 |         print(SAMIWriter().write(SRTReader().read(caps)))
 51 |     elif DFXPReader().detect(caps):
 52 |         print(SAMIWriter().write(DFXPReader().read(caps)))
 53 |     elif SCCReader().detect(caps):
 54 |         print(SAMIWriter().write(SCCReader().read(caps)))
 55 |     elif MicroDVDReader().detect(caps):
 56 |         print(SAMIWriter().write(MicroDVDReader().read(caps)))
 57 | 
 58 | Python Usage
 59 | ------------
 60 | 
 61 | Example: Convert from SAMI to DFXP
 62 | 
 63 | ::
 64 | 
 65 |     from pycaption import SAMIReader, DFXPWriter
 66 | 
 67 |     sami = '''<SAMI><HEAD><TITLE>NOVA3213</TITLE><STYLE TYPE="text/css">
 68 |     <!--
 69 |     P { margin-left:  1pt;
 70 |         margin-right: 1pt;
 71 |         margin-bottom: 2pt;
 72 |         margin-top: 2pt;
 73 |         text-align: center;
 74 |         font-size: 10pt;
 75 |         font-family: Arial;
 76 |         font-weight: normal;
 77 |         font-style: normal;
 78 |         color: #ffffff; }
 79 | 
 80 |     .ENCC {Name: English; lang: en-US; SAMI_Type: CC;}
 81 |     .FRCC {Name: French; lang: fr-cc; SAMI_Type: CC;}
 82 | 
 83 |     --></STYLE></HEAD><BODY>
 84 |     <SYNC start="9209"><P class="ENCC">
 85 |            ( clock ticking )
 86 |     </P><P class="FRCC">
 87 |            FRENCH LINE 1!
 88 |     </P></SYNC>
 89 |     <SYNC start="12312"><P class="ENCC">&nbsp;</P></SYNC>
 90 |     <SYNC start="14848"><P class="ENCC">
 91 |                   MAN:<br/>
 92 |              <span style="text-align:center;font-size:10">When <i>we</i> think</span><br/>
 93 |         of E equals m c-squared,
 94 |     </P><P class="FRCC">
 95 |            FRENCH LINE 2?
 96 |     </P></SYNC>'''
 97 | 
 98 |     print(DFXPWriter().write(SAMIReader().read(sami)))
 99 | 
100 | Which will output the following:
101 | 
102 | ::
103 | 
104 |     <?xml version="1.0" encoding="utf-8"?>
105 |     <tt xml:lang="en" xmlns="http://www.w3.org/ns/ttml" xmlns:tts="http://www.w3.org/ns/ttml#styling">
106 |      <head>
107 |       <styling>
108 |        <style id="p" tts:color="#fff" tts:fontfamily="Arial" tts:fontsize="10pt" tts:textAlign="center"/>
109 |       </styling>
110 |      </head>
111 |      <body>
112 |       <div xml:lang="fr-cc">
113 |        <p begin="00:00:09.209" end="00:00:14.848" style="p">
114 |         FRENCH LINE 1!
115 |        </p>
116 |        <p begin="00:00:14.848" end="00:00:18.848" style="p">
117 |         FRENCH LINE 2?
118 |        </p>
119 |       </div>
120 |       <div xml:lang="en-US">
121 |        <p begin="00:00:09.209" end="00:00:12.312" style="p">
122 |         ( clock ticking )
123 |        </p>
124 |        <p begin="00:00:14.848" end="00:00:18.848" style="p">
125 |         MAN:<br/>
126 |         <span tts:fontsize="10" tts:textAlign="center">When</span> <span tts:fontStyle="italic">we</span> think<br/>
127 |         of E equals m c-squared,
128 |        </p>
129 |       </div>
130 |      </body>
131 |     </tt>
132 | 
133 | Default Language
134 | ----------------
135 | 
136 | If the language cannot be detected, you can set a default one through an environment variable.
137 | If no default language is set, ``und`` is used (a special identifier for an undetermined language according to ISO 639-2).
138 | 
139 | ::
140 | 
141 |    PYCAPTION_DEFAULT_LANG = "en-US"
142 | 
143 | 
144 | 
145 | Positioning
146 | -----------
147 | 
148 | Some caption formats support positioning information and PyCaption tries to preserve it when possible. In the process, some adjustments are made. Some of these adjustments can be customized by properly initializing the Writer class.
149 | 
150 | .. py:class:: BaseWriter(relativize=True, video_width=None, video_height=None, fit_to_screen=True)
151 | 
152 |     :param relativize: If True (default), converts absolute positioning
153 |             values (e.g. px) to percentage. ATTENTION: WebVTT does not support
154 |             absolute positioning. If relativize is set to False and it finds
155 |             an absolute positioning parameter for a given caption, it will
156 |             ignore all positioning for that cue and show it in the default
157 |             position.
158 |     :param video_width: The width of the video for which the captions being
159 |             converted were made. This is necessary for relativization.
160 |     :param video_height: The height of the video for which the captions
161 |             being converted were made. This is necessary for relativization.
162 |     :param fit_to_screen: If extent is not set or if origin + extent > 100%,
163 |             (re)calculate it based on origin. It is a pycaption fix for caption
164 |             files that are technically valid but contain inconsistent settings
165 |             that may cause long captions to be cut out of the screen.
166 | 
167 | Examples
168 | ~~~~~~~~
169 | 
170 | * DFXP to WebVTT
171 | 
172 | ::
173 | 
174 |     from pycaption import DFXPReader, WebVTTWriter
175 |     dfxp = u"""<?xml version="1.0" encoding="utf-8"?>
176 |     <tt xml:lang="en-us"
177 |         xmlns="http://www.w3.org/ns/ttml"
178 |         xmlns:tts='http://www.w3.org/ns/ttml#styling'
179 |         >
180 |     <head>
181 |         <layout>
182 |             <region xml:id="fourthQuadrant" tts:textAlign='left' tts:origin='320px 180px' tts:extent='320px 180px'/>
183 |         </layout>
184 |     </head>
185 |     <body>
186 |         <div>
187 |             <p region="fourthQuadrant" begin='00:00:01.000' end='00:00:03.000'>
188 |             I'm in the fourth quadrant!
189 |             </p>
190 |         </div>
191 |     </body>
192 |     </tt>"""
193 |     caption_set = DFXPReader().read(dfxp)
194 |     print(WebVTTWriter(video_width=640, video_height=360).write(caption_set))
195 | 
196 | The code above should output:
197 | 
198 | ::
199 | 
200 |     WEBVTT
201 | 
202 |     00:01.000 --> 00:03.000 align:left position:50%,start line:50% size:50%
203 |     I'm in the fourth quadrant!
204 | 
205 | Note that px values were converted to percentages. This can only be done if
206 | a reference such as ``video_width`` or ``video_height`` is passed as a
207 | parameter, so that the relative values can be calculated. If the WebVTTWriter
208 | is initialized without them and the input file contains px values, calling
209 | the ``.write`` method raises ``RelativizationError``.
210 | 
211 | * DFXP to DFXP
212 | 
213 | ::
214 | 
215 |     from pycaption import DFXPReader, DFXPWriter
216 |     dfxp = u"""<?xml version="1.0" encoding="utf-8"?>
217 |     <tt xml:lang="en-us"
218 |         xmlns="http://www.w3.org/ns/ttml"
219 |         xmlns:tts='http://www.w3.org/ns/ttml#styling'
220 |         >
221 |     <head>
222 |         <layout>
223 |             <region xml:id="invalidRegion" tts:textAlign='left' tts:origin='360px 180px' tts:extent='420px 240px'/>
224 |         </layout>
225 |     </head>
226 |     <body>
227 |         <div>
228 |             <p region="invalidRegion" begin='00:00:01.000' end='00:00:03.000'>
229 |             I'm a long caption and I'm cropped by the right side of the screen.
230 |             </p>
231 |         </div>
232 |     </body>
233 |     </tt>"""
234 |     caption_set = DFXPReader().read(dfxp)
235 | 
236 | This input is syntactically valid but presents two problems:
237 | 
238 | #. Positioning relies on absolute values (px). In systems that ingest one video
239 |    and an associated caption file and output several formats for different
240 |    platforms, this is a problem. A caption shifted 960px to the left in a 1920x1080
241 |    video, for example, disappears in a 640x360 one.
242 | #. Assuming a 640x360 resolution, the positioning specified above results in an
243 |    overflowing cue box which in turn results in cropped content when the caption
244 |    text is long enough.
245 | 
246 | Here are some examples of Writer initialization:
247 | 
248 | ::
249 | 
250 |     >>> print(DFXPWriter().write(caption_set))
251 |     RelativizationError: At least one of video width or height must be given as a reference
252 | 
253 |     >>> print(DFXPWriter(relativize=False).write(caption_set))
254 |     ValueError: Units must be relativized before extent can be calculated based on origin.
255 | 
256 |     >>> print(DFXPWriter(relativize=False, fit_to_screen=False).write(caption_set))
257 |     <?xml version="1.0" encoding="utf-8"?>
258 |     <tt xml:lang="en" xmlns="http://www.w3.org/ns/ttml" xmlns:tts="http://www.w3.org/ns/ttml#styling">
259 |      <head>
260 |       <styling>
261 |        <style tts:color="white" tts:fontFamily="monospace" tts:fontSize="1c" xml:id="default"/>
262 |       </styling>
263 |       <layout>
264 |        <region tts:displayAlign="after" tts:extent="420px 240px" tts:origin="360px 180px" tts:textAlign="left" xml:id="r0"/>
265 |       </layout>
266 |      </head>
267 |      <body>
268 |       <div region="r0" xml:lang="en-US">
269 |        <p begin="00:00:01.000" end="00:00:03.000" region="r0" style="default">
270 |         I'm a long caption and I'm cropped by the right side of the screen.
271 |        </p>
272 |       </div>
273 |      </body>
274 |     </tt>
275 | 
276 |     >>> print(DFXPWriter(video_width=640, video_height=360, fit_to_screen=False).write(caption_set))
277 |     <?xml version="1.0" encoding="utf-8"?>
278 |     <tt xml:lang="en" xmlns="http://www.w3.org/ns/ttml" xmlns:tts="http://www.w3.org/ns/ttml#styling">
279 |      <head>
280 |       <styling>
281 |        <style tts:color="white" tts:fontFamily="monospace" tts:fontSize="1c" xml:id="default"/>
282 |       </styling>
283 |       <layout>
284 |        <region tts:displayAlign="after" tts:extent="420px 240px" tts:origin="360px 180px" tts:textAlign="left" xml:id="r0"/>
285 |        <region tts:displayAlign="after" tts:extent="65.63% 66.67%" tts:origin="56.25% 50%" tts:textAlign="left" xml:id="r1"/>
286 |       </layout>
287 |      </head>
288 |      <body>
289 |       <div region="r0" xml:lang="en-US">
290 |        <p begin="00:00:01.000" end="00:00:03.000" region="r1" style="default">
291 |         I'm a long caption and I'm cropped by the right side of the screen.
292 |        </p>
293 |       </div>
294 |      </body>
295 |     </tt>
296 | 
297 | In the last example the values are relativized but ``origin + extent > 100%``, which
298 | still results in the caption being cropped.
299 | 
300 | ::
301 | 
302 | 
303 |     >>> print(DFXPWriter(video_width=640, video_height=360).write(caption_set))
304 |     <?xml version="1.0" encoding="utf-8"?>
305 |     <tt xml:lang="en" xmlns="http://www.w3.org/ns/ttml" xmlns:tts="http://www.w3.org/ns/ttml#styling">
306 |      <head>
307 |       <styling>
308 |        <style tts:color="white" tts:fontFamily="monospace" tts:fontSize="1c" xml:id="default"/>
309 |       </styling>
310 |       <layout>
311 |        <region tts:displayAlign="after" tts:extent="420px 240px" tts:origin="360px 180px" tts:textAlign="left" xml:id="r0"/>
312 |        <region tts:displayAlign="after" tts:extent="43.75% 50%" tts:origin="56.25% 50%" tts:textAlign="left" xml:id="r1"/>
313 |       </layout>
314 |      </head>
315 |      <body>
316 |       <div region="r0" xml:lang="en-US">
317 |        <p begin="00:00:01.000" end="00:00:03.000" region="r1" style="default">
318 |         I'm a long caption and I'm cropped by the right side of the screen.
319 |        </p>
320 |       </div>
321 |      </body>
322 |     </tt>
323 | 
324 | Now the positioning is corrected and the caption is guaranteed to be within the
325 | visible region of the screen.
326 | 
327 | **NOTE**: The region ``r0`` is still defined using absolute values. This is a bug that
328 | should be fixed in the next release. In any case it is harmless because it is
329 | overwritten by the relative values in ``r1``.
330 | 


--------------------------------------------------------------------------------
/pycaption/base.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from collections import defaultdict
  3 | from datetime import timedelta
  4 | from numbers import Number
  5 | 
  6 | from .exceptions import CaptionReadError, CaptionReadTimingError
  7 | 
  8 | # `und` a special identifier for an undetermined language according to ISO 639-2
  9 | DEFAULT_LANGUAGE_CODE = os.getenv("PYCAPTION_DEFAULT_LANG", "und")
 10 | 
 11 | 
 12 | def force_byte_string(content):
 13 |     try:
 14 |         return content.encode("UTF-8")
 15 |     except UnicodeEncodeError:
 16 |         raise RuntimeError("Invalid content encoding")
 17 |     except UnicodeDecodeError:
 18 |         return content
 19 | 
 20 | 
 21 | class CaptionConverter:
 22 |     def __init__(self, captions=None):
 23 |         self.captions = captions if captions else []
 24 | 
 25 |     def read(self, content, caption_reader):
 26 |         try:
 27 |             self.captions = caption_reader.read(content)
 28 |         except AttributeError as e:
 29 |             raise Exception(e)
 30 |         return self
 31 | 
 32 |     def write(self, caption_writer):
 33 |         try:
 34 |             return caption_writer.write(self.captions)
 35 |         except AttributeError as e:
 36 |             raise Exception(e)
 37 | 
 38 | 
 39 | class BaseReader:
 40 |     def __init__(self, *args, **kwargs):
 41 |         pass
 42 | 
 43 |     def detect(self, content):
 44 |         if content:
 45 |             return True
 46 |         else:
 47 |             return False
 48 | 
 49 |     def read(self, content):
 50 |         return CaptionSet({DEFAULT_LANGUAGE_CODE: []})
 51 | 
 52 | 
 53 | class BaseWriter:
 54 |     def __init__(
 55 |         self, relativize=True, video_width=None, video_height=None, fit_to_screen=True
 56 |     ):
 57 |         """
 58 |         Initialize writer with the given parameters.
 59 | 
 60 |         :param relativize: If True (default), converts absolute positioning
 61 |             values (e.g. px) to percentage. ATTENTION: WebVTT does not support
 62 |             absolute positioning. If relativize is set to False and it finds
 63 |             an absolute positioning parameter for a given caption, it will
 64 |             ignore all positioning for that cue and show it in the default
 65 |             position.
 66 |         :param video_width: The width of the video for which the captions being
 67 |             converted were made. This is necessary for relativization.
 68 |         :param video_height: The height of the video for which the captions
 69 |             being converted were made. This is necessary for relativization.
 70 |         :param fit_to_screen: If extent is not set or
 71 |             if origin + extent > 100%, (re)calculate it based on origin.
 72 |             It is a pycaption fix for caption files that are technically valid
 73 |             but contains inconsistent settings that may cause long captions to
 74 |             be cut out of the screen.
 75 |         """
 76 |         self.relativize = relativize
 77 |         self.video_width = video_width
 78 |         self.video_height = video_height
 79 |         self.fit_to_screen = fit_to_screen
 80 | 
 81 |     def _relativize_and_fit_to_screen(self, layout_info):
 82 |         if layout_info:
 83 |             if self.relativize:
 84 |                 # Transform absolute values (e.g. px) into percentages
 85 |                 layout_info = layout_info.as_percentage_of(
 86 |                     self.video_width, self.video_height
 87 |                 )
 88 |             if self.fit_to_screen:
 89 |                 # Make sure origin + extent <= 100%
 90 |                 layout_info = layout_info.fit_to_screen()
 91 |         return layout_info
 92 | 
 93 |     def write(self, content):
 94 |         return content
 95 | 
 96 | 
 97 | class Style:
 98 |     def __init__(self):
 99 |         pass
100 | 
101 | 
class CaptionNode:
    """
    One node inside a caption: a piece of text, a style marker, or a
    line break.

    Rules:
        1. Every node should have the layout_info property set.
        A value of None specifically means that no positioning information
        should be specified. Each reader supplies its own default values
        (if necessary) when reading its format.
    """

    TEXT = 1
    # If this is ever extended, it may be better to make it a property of
    # the node rather than a node type of its own.
    STYLE = 2
    BREAK = 3

    def __init__(
        self, type_, layout_info=None, content=None, start=None, position=None
    ):
        """
        :type type_: int
        :type layout_info: Layout
        """
        self.type_ = type_
        self.layout_info = layout_info
        self.content = content
        # Boolean. Marks the beginning/ end of a Style node.
        self.start = start
        self.position = position

    def __repr__(self):
        """
        Return the repr of a short description of the node.

        :raises RuntimeError: If the node type is not one of the known types
        """
        kind = self.type_
        if kind == CaptionNode.TEXT:
            return repr(self.content)
        if kind == CaptionNode.BREAK:
            return repr("BREAK")
        if kind == CaptionNode.STYLE:
            return repr(f"STYLE: {self.start} {self.content}")
        raise RuntimeError(f"Unknown node type: {kind}")

    @staticmethod
    def create_text(text, layout_info=None, position=None):
        """Build a TEXT node whose content is ``text``."""
        return CaptionNode(
            CaptionNode.TEXT,
            layout_info=layout_info,
            content=text,
            position=position,
        )

    @staticmethod
    def create_style(start, content, layout_info=None):
        """Build a STYLE node; ``start`` marks the beginning/end of the style."""
        return CaptionNode(
            CaptionNode.STYLE,
            layout_info=layout_info,
            content=content,
            start=start,
        )

    @staticmethod
    def create_break(layout_info=None, content=None):
        """Build a BREAK node."""
        return CaptionNode(CaptionNode.BREAK, layout_info=layout_info, content=content)
170 | 
171 | 
class Caption:
    """
    A single caption, including the time and styling information
    for its display.
    """

    def __init__(self, start, end, nodes, style=None, layout_info=None):
        """
        Initialize the Caption object
        :param start: The start time in microseconds
        :type start: Number
        :param end: The end time in microseconds
        :type end: Number
        :param nodes: A non-empty list of CaptionNodes
        :type nodes: list
        :param style: A dictionary with CSS-like styling rules; when omitted
            each caption gets its own new empty dictionary
        :type style: dict
        :param layout_info: A Layout object with the necessary positioning
            information
        :type layout_info: Layout
        :raises CaptionReadTimingError: If start or end is not a number
        :raises CaptionReadError: If the node list is empty
        """
        if not isinstance(start, Number):
            raise CaptionReadTimingError(
                "Captions must be initialized with a valid start time"
            )
        if not isinstance(end, Number):
            raise CaptionReadTimingError(
                "Captions must be initialized with a valid end time"
            )
        if not nodes:
            raise CaptionReadError("Node list cannot be empty")
        self.start = start
        self.end = end
        self.nodes = nodes
        # A dict literal as the default would be shared by every caption
        # created without an explicit style, so build a new one per instance.
        self.style = {} if style is None else style
        self.layout_info = layout_info

    def is_empty(self):
        """Return True when the caption currently holds no nodes."""
        return len(self.nodes) == 0

    def format_start(self, msec_separator=None):
        """
        Format the start time value (stored in microseconds) into a string
        value suitable for some of the supported output formats (ex.
        SRT, DFXP).

        :param msec_separator: The separator placed before the milliseconds;
            defaults to "."
        """
        return self._format_timestamp(self.start, msec_separator)

    def format_end(self, msec_separator=None):
        """
        Format the end time value (stored in microseconds) into a string
        value suitable for some of the supported output formats (ex.
        SRT, DFXP).

        :param msec_separator: The separator placed before the milliseconds;
            defaults to "."
        """
        return self._format_timestamp(self.end, msec_separator)

    def __repr__(self):
        return repr(f"{self.format_start()} --> {self.format_end()}\n{self.get_text()}")

    def get_text_nodes(self):
        """
        Get the text of the caption as a list with one string per node:
        the content of text nodes, a newline for breaks and an empty string
        for any other node.
        """

        def get_text_for_node(node):
            if node.type_ == CaptionNode.TEXT:
                return node.content
            if node.type_ == CaptionNode.BREAK:
                return "\n"
            return ""

        return [get_text_for_node(node) for node in self.nodes]

    def get_text(self):
        """Return the caption text as one string, stripped of outer whitespace."""
        text_nodes = self.get_text_nodes()
        return "".join(text_nodes).strip()

    def _format_timestamp(self, microseconds, msec_separator=None):
        """
        Format a time in microseconds as ``HH:MM:SS<sep>mmm``.

        :param microseconds: The time value in microseconds
        :param msec_separator: The separator placed before the milliseconds;
            any falsy value means "."
        :rtype: str
        """
        duration = timedelta(microseconds=microseconds)
        # timedelta.seconds only covers the part below one day; add the days
        # so that times of 24 hours or more do not wrap around to 00.
        total_seconds = duration.days * 86400 + duration.seconds
        hours, rem = divmod(total_seconds, 3600)
        minutes, seconds = divmod(rem, 60)
        milliseconds = duration.microseconds // 1000
        return (
            f"{hours:02d}:{minutes:02d}:{seconds:02d}"
            f"{msec_separator or '.'}{milliseconds:03d}"
        )
258 | 
259 | 
class CaptionList(list):
    """A list of captions with a layout object attached to it"""

    def __init__(self, iterable=None, layout_info=None):
        """
        :param iterable: An iterator used to populate the caption list
        :param Layout layout_info: A Layout object with the positioning info
        """
        self.layout_info = layout_info
        args = [iterable] if iterable else []
        super().__init__(*args)

    def __getslice__(self, i, j):
        """
        Legacy slicing hook; Python 3 never calls it implicitly.

        Kept for code that calls it directly. It delegates to
        ``__getitem__`` because ``list`` has no ``__getslice__`` on Python 3.
        """
        return self[i:j]

    def __getitem__(self, y):
        """
        Return a single element for an index, or a new CaptionList carrying
        the same layout_info for a slice.
        """
        item = list.__getitem__(self, y)
        # Decide on the kind of key rather than on the type of the result, so
        # that elements of any type are returned as they are.
        if isinstance(y, slice):
            return CaptionList(item, layout_info=self.layout_info)
        return item

    def __add__(self, other):
        """
        Concatenate with ``other``, keeping this list's layout_info.

        :raises ValueError: If ``other`` carries a truthy layout_info that
            differs from this list's layout_info
        """
        add_is_safe = (
            not hasattr(other, "layout_info")
            or not other.layout_info
            or self.layout_info == other.layout_info
        )
        if add_is_safe:
            return CaptionList(list.__add__(self, other), layout_info=self.layout_info)
        else:
            raise ValueError(
                "Cannot add CaptionList objects with different layout_info"
            )

    def __mul__(self, other):
        """Repeat the list, keeping its layout_info."""
        return CaptionList(list.__mul__(self, other), layout_info=self.layout_info)

    __rmul__ = __mul__
298 | 
299 | 
class CaptionSet:
    """
    A set of captions in potentially multiple languages,
    all representing the same underlying content.

    The .layout_info attribute, keeps information that should be inherited
    by all the children.
    """

    def __init__(self, captions, styles=None, layout_info=None):
        """
        :param captions: A dictionary of the format {'language': CaptionList}
        :param styles: A dictionary with CSS-like styling rules; when omitted
            each caption set gets its own new empty dictionary
        :param Layout layout_info: A Layout object with the positioning info
        """
        self._captions = captions
        # A dict literal as the default would be shared by every caption set
        # created without styles, and add_style() would then leak rules
        # between unrelated sets.
        self._styles = {} if styles is None else styles
        self.layout_info = layout_info

    def set_captions(self, lang, captions):
        """Store ``captions`` as the caption list for ``lang``."""
        self._captions[lang] = captions

    def get_languages(self):
        """Return the list of languages that have captions stored."""
        return list(self._captions.keys())

    def get_captions(self, lang):
        """Return the captions for ``lang``, or an empty list if unknown."""
        return self._captions.get(lang, [])

    def add_style(self, selector, rules):
        """
        :param selector: The selector indicating the elements to which the
            rules should be applied.
        :param rules: A dictionary with CSS-like styling rules.
        """
        self._styles[selector] = rules

    def get_style(self, selector):
        """
        Returns a dictionary with CSS-like styling rules for a given selector.
        :param selector: The selector whose rules should be returned (e.g. an
            element or class name).
        """
        return self._styles.get(selector, {})

    def get_styles(self):
        """Return the (selector, rules) pairs in sorted order."""
        return sorted(self._styles.items())

    def set_styles(self, styles):
        """Replace the whole styles dictionary."""
        self._styles = styles

    def is_empty(self):
        """Return True when no language holds any caption."""
        return all(len(captions) == 0 for captions in self._captions.values())

    def set_layout_info(self, lang, layout_info):
        """Attach ``layout_info`` to the caption list stored for ``lang``."""
        self._captions[lang].layout_info = layout_info

    def get_layout_info(self, lang):
        """
        Return the layout_info of the caption list stored for ``lang``, or
        None when that list is missing or empty.
        """
        caption_list = self._captions.get(lang)
        if caption_list:
            return caption_list.layout_info
        return None

    def adjust_caption_timing(self, offset=0, rate_skew=1.0):
        """
        Adjust the timing according to offset and rate_skew.
        Skew is applied first, then offset.

        e.g. if skew == 1.1, and offset is 5, a caption originally
        displayed from 10-11 seconds would instead be at 16-17.1

        The offset is added directly to the stored start/end values, so it
        has to be expressed in the same unit as they are. Captions are
        updated in place, and those whose adjusted start is negative are
        dropped from the set.
        """
        for lang in self.get_languages():
            captions = self.get_captions(lang)
            # Carry the list-level layout over to the rebuilt list so that
            # adjusting the timing does not discard positioning information.
            out_captions = CaptionList(
                layout_info=getattr(captions, "layout_info", None)
            )
            for caption in captions:
                caption.start = caption.start * rate_skew + offset
                caption.end = caption.end * rate_skew + offset
                if caption.start >= 0:
                    out_captions.append(caption)
            self.set_captions(lang, out_captions)
379 | 
380 | 
381 | # Functions
def merge_concurrent_captions(caption_set):
    """
    Merge consecutive captions sharing the same start and end times.

    The caption set is updated in place, one language at a time, and is
    also returned. A language whose caption list is empty is left untouched.
    """
    for lang in caption_set.get_languages():
        merged = CaptionList()
        # Captions collected so far that share the timespan of `previous`
        group = CaptionList()
        previous = None

        for current in caption_set.get_captions(lang):
            if previous and (current.start, current.end) == (
                previous.start,
                previous.end,
            ):
                group.append(current)
            else:
                # The timespan changed: flush the finished group (if any)
                # and start a new one with the current caption.
                if previous:
                    merged.append(merge(group))
                group = [current]
            previous = current

        if group:
            merged.append(merge(group))
        if merged:
            caption_set.set_captions(lang, merged)
    return caption_set
407 | 
408 | 
def merge(captions):
    """
    Combine a list of captions into a single caption.

    The nodes of all captions are concatenated, separated by break nodes.
    The start time, end time and style are taken from the first caption.
    """
    combined_nodes = []
    for caption in captions:
        # Separate this caption's nodes from those gathered before it
        if combined_nodes:
            combined_nodes.append(CaptionNode.create_break())
        combined_nodes.extend(caption.nodes)
    return Caption(
        captions[0].start, captions[0].end, combined_nodes, captions[0].style
    )
422 | 


--------------------------------------------------------------------------------