├── tests ├── __init__.py ├── fixtures │ ├── __init__.py │ ├── microdvd.py │ ├── srt.py │ ├── webvtt.py │ └── translated_scc.py ├── test_functions.py ├── test_microdvd_conversion.py ├── test_scc_translator.py ├── test_base.py ├── test_srt_conversion.py ├── test_microdvd.py ├── test_scc_conversion.py ├── test_srt.py ├── test_sami_conversion.py ├── test_webvtt_conversion.py ├── test_geometry.py ├── test_dfxp_extras.py ├── mixins.py ├── conftest.py ├── test_webvtt.py ├── test_sami.py └── test_dfxp.py ├── .github ├── CODEOWNERS └── workflows │ ├── create_github_release.yml │ ├── release.yml │ ├── release_test_pypi.yml │ └── unit_tests.yml ├── MANIFEST.in ├── docs ├── requirements.txt ├── extensibility.rst ├── index.rst ├── supported_formats.rst ├── Makefile ├── changelog.rst ├── conf.py └── introduction.rst ├── test_requirements.txt ├── pycaption ├── dfxp │ ├── __init__.py │ └── extras.py ├── utils.py ├── transcript.py ├── exceptions.py ├── scc │ ├── translator.py │ └── state_machines.py ├── __init__.py ├── microdvd.py ├── srt.py └── base.py ├── .readthedocs.yaml ├── .pre-commit-config.yaml ├── .gitignore ├── examples ├── example.scc └── example.sub ├── run_tests.sh ├── docker-compose.yml ├── README.rst ├── setup.py └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/fixtures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @pbs/core-services 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tests * 2 | include README.rst 3 | 
-------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==7.2.6 2 | sphinx_rtd_theme==1.3.0 3 | readthedocs-sphinx-search==0.3.2 -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-cov 3 | beautifulsoup4>=4.12.1 4 | lxml>=4.9.1 5 | cssutils>=2.0.0 -------------------------------------------------------------------------------- /pycaption/dfxp/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * # noqa: F401, F403 2 | from .extras import SinglePositioningDFXPWriter, LegacyDFXPWriter # noqa: F401 3 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.11" 7 | 8 | # Build from the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Explicitly set the version of Python and its requirements 13 | python: 14 | install: 15 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /docs/extensibility.rst: -------------------------------------------------------------------------------- 1 | Extensibility 2 | ============= 3 | 4 | Different readers and writers are easy to add if you would like to: - 5 | Read/Write a previously unsupported format - Read/Write a supported 6 | format in a different way (more styling?) 7 | 8 | Simply follow the format of a current Reader or Writer, and edit to your 9 | heart's desire. 
10 | -------------------------------------------------------------------------------- /pycaption/utils.py: -------------------------------------------------------------------------------- 1 | def is_leaf(element): 2 | """ 3 | Return True if the element is a leaf, False otherwise. The element is 4 | considered a leaf if it is either NavigableString or the "br" tag 5 | :param element: A BeautifulSoup tag or NavigableString 6 | """ 7 | name = getattr(element, 'name', None) 8 | if not name or name == 'br': 9 | return True 10 | return False 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: git://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - id: debug-statements 8 | 9 | - repo: git://github.com/PyCQA/flake8 10 | rev: 3.9.2 11 | hooks: 12 | - id: flake8 13 | args: [ 14 | '--exclude=tests/fixtures*', 15 | '--ignore=W503,C901', 16 | '--max-line-length=80', 17 | ] 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | .cache 16 | .eggs 17 | 18 | # Installer logs 19 | pip-log.txt 20 | 21 | # Unit test / coverage reports 22 | .coverage 23 | .tox 24 | coverage.xml 25 | 26 | #Translations 27 | *.mo 28 | 29 | #Mr Developer 30 | .mr.developer.cfg 31 | 32 | # Sphinx documentation build 33 | docs/_build 34 | 35 | # PyCharm files 36 | .idea/ 37 | 38 | # Environments 39 | env/ 40 | venv/ 41 | 42 | # Pyenv files 43 | .python-version 44 | -------------------------------------------------------------------------------- /tests/test_functions.py: 
-------------------------------------------------------------------------------- 1 | from pycaption import DFXPReader 2 | from pycaption.base import merge_concurrent_captions 3 | 4 | 5 | class TestFunctions: 6 | def test_merge_concurrent_captions(self, dfxp_with_concurrent_captions): 7 | initial_caption_set = DFXPReader().read(dfxp_with_concurrent_captions) 8 | initial_captions = initial_caption_set.get_captions('en-US') 9 | caption_set = merge_concurrent_captions(initial_caption_set) 10 | captions = caption_set.get_captions('en-US') 11 | 12 | assert len(initial_captions) == 5 13 | assert len(captions) == 3 14 | -------------------------------------------------------------------------------- /examples/example.scc: -------------------------------------------------------------------------------- 1 | Scenarist_SCC V1.0 2 | 3 | 00:00:00:00 9420 94d0 9723 4ce5 f4f2 6120 f4f2 6164 75e3 e964 6120 61ec 2045 7370 61fe efec 94f2 97a1 9137 20a1 92a7 d5ef ef79 e5a1 20a1 92a7 d62a 6def 6eef 73a1 2080 9137 9420 942c 942f 9420 94d0 97a1 9137 204c ef20 ece5 20ec ef20 ec61 e92c 20ec ef20 ec61 e920 ec61 e920 4 | 5 | 00:00:05:08 942c 9420 9470 9723 544f c44f d3ba 20d3 5e2c 20e5 7320 e3e9 e5f2 f4ef 20c1 ec6d 61ae 6 | 7 | 00:00:08:17 9420 942c 942f 9420 94d0 9723 cdc1 cd49 ba20 a180 92a7 d661 6def 7320 6120 64e9 76e5 f2f4 e9f2 6eef 8 | 9 | 00:00:10:04 9420 942c 942f 10 | 11 | 00:00:11:03 942c 12 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DOCKER_CMD="docker compose -p pycaption" 3 | 4 | SERVICE="test_py312" 5 | 6 | if [ "$@" ]; then 7 | if [ "$1" == "test_py38" ] || [ "$1" == "test_py39" ] || 8 | [ "$1" == "test_py310" ] || [ "$1" == "test_py311" ] || [ "$1" == "test_py312" ]; then 9 | SERVICE="$1" 10 | fi 11 | fi 12 | 13 | $DOCKER_CMD build "$SERVICE" 14 | 15 | function cleanup { 16 | echo "Cleaning up ..." 
17 | $DOCKER_CMD down && $DOCKER_CMD rm -fv 18 | } 19 | 20 | $DOCKER_CMD run --rm "$SERVICE" 21 | 22 | if [ $? != 0 ]; then 23 | cleanup 24 | exit 1 25 | else 26 | cleanup 27 | fi 28 | 29 | 30 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pycaption documentation master file, created by 2 | sphinx-quickstart on Thu Feb 12 12:18:37 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pycaption's documentation! 7 | ===================================== 8 | 9 | ``pycaption`` is a python library for converting caption formats. 10 | 11 | 12 | Table of contents 13 | ================= 14 | 15 | .. toctree:: 16 | :maxdepth: 3 17 | 18 | Introduction <introduction> 19 | Supported formats <supported_formats> 20 | Extensibility <extensibility> 21 | Changelog <changelog> 22 | -------------------------------------------------------------------------------- /tests/test_microdvd_conversion.py: -------------------------------------------------------------------------------- 1 | from pycaption import MicroDVDReader, MicroDVDWriter, SAMIReader 2 | 3 | from tests.mixins import MicroDVDTestingMixIn 4 | 5 | 6 | class TestMicroDVDtoMicroDVD(MicroDVDTestingMixIn): 7 | def test_microdvd_to_microdvd_conversion(self, sample_microdvd): 8 | caption_set = MicroDVDReader().read(sample_microdvd) 9 | results = MicroDVDWriter().write(caption_set) 10 | 11 | assert isinstance(results, str) 12 | self.assert_microdvd_equals(sample_microdvd, results) 13 | 14 | 15 | class TestSAMItoMicroDVD(MicroDVDTestingMixIn): 16 | def test_sami_to_micro_dvd_conversion(self, sample_microdvd_2, sample_sami): 17 | caption_set = SAMIReader().read(sample_sami) 18 | results = MicroDVDWriter().write(caption_set) 19 | 20 | assert isinstance(results, str) 21 | self.assert_microdvd_equals(sample_microdvd_2, results) 22 | 
-------------------------------------------------------------------------------- /examples/example.sub: -------------------------------------------------------------------------------- 1 | {230}{307}( clock ticking ) 2 | {371}{433}MAN:|When we think|of E equals m c-squared, 3 | {433}{468}we have this vision of Einstein 4 | {468}{522}as an old, wrinkly man|with white hair. 5 | {522}{669}MAN 2:|E equals m c-squared is|not about an old Einstein. 6 | {669}{754}It's actually about|a young, energetic, dynamic, 7 | {754}{805}even a sexy Einstein. 8 | {841}{953}ACTOR AS EINSTEIN:|What would I see if I rode|on a beam of light? 9 | {1087}{1137}MAN:|Perhaps some sort 10 | {1137}{1197}of electrical force is emanating 11 | {1197}{1224}outwards from|the wire. 12 | {1224}{1255}What? 13 | {1255}{1282}MAN:|Faraday, my dear boy, 14 | {1282}{1317}electricity flows|through a wire, 15 | {1317}{1373}not sideways to it. 16 | {1373}{1412}You see, John? 17 | {1412}{1431}You see? 18 | {1549}{1614}MAN:|It is my great ambition|to demonstrate 19 | {1614}{1713}that nature is a closed system; 20 | {1713}{1762}that in any transformation, 21 | {1762}{1919}no amount of matter, no mass,|is ever lost and none is gained. 
22 | -------------------------------------------------------------------------------- /.github/workflows/create_github_release.yml: -------------------------------------------------------------------------------- 1 | name: Release PyCaption on GitHub 2 | 3 | on: 4 | push: 5 | tags: 6 | - "[0-9]+.[0-9]+.[0-9]+" 7 | 8 | jobs: 9 | call-unit-tests-workflow: 10 | name: Run unit tests 11 | uses: pbs/pycaption/.github/workflows/unit_tests.yml@main 12 | secrets: 13 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} 14 | SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }} 15 | 16 | create-release: 17 | name: Release PyCaption on GitHub 18 | needs: call-unit-tests-workflow 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v2 22 | 23 | - name: Extract release notes for current version 24 | env: 25 | TAG: ${{ github.ref }} 26 | run: | 27 | sudo apt-get install -y --no-install-recommends pandoc 28 | pandoc docs/changelog.rst -f rst -t gfm -o changelog.md 29 | sed -n "/## ${TAG#refs/tags/}/,/## /p" changelog.md | sed -e "/## /d" >> notes.md 30 | 31 | - name: Create release on GitHub 32 | run: gh release create ${{ github.ref }} --notes-file notes.md 33 | env: 34 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 35 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish PyCaption to PyPI 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | call-unit-tests-workflow: 7 | name: Run unit tests 8 | uses: pbs/pycaption/.github/workflows/unit_tests.yml@main 9 | secrets: 10 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} 11 | SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }} 12 | 13 | build-n-publish: 14 | name: Build and publish PyCaption to PyPI 15 | needs: call-unit-tests-workflow 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v2 22 
| with: 23 | python-version: 3.9 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install build 29 | python -m pip install --upgrade twine 30 | 31 | - name: Build package 32 | run: python -m build 33 | timeout-minutes: 10 34 | 35 | - name: Publish package on PyPI 36 | run: python -m twine upload --verbose dist/* 37 | env: 38 | TWINE_USERNAME: __token__ 39 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | test_py310: 5 | image: python:3.10-slim-bullseye 6 | command: sh -c " 7 | cd pycaption; 8 | pip install --upgrade pip; 9 | pip install -r test_requirements.txt; 10 | pip install -e .; 11 | pytest -vvvv --color=yes --junit-xml=junit.xml --cov=pycaption --cov-report xml:coverage.xml; 12 | " 13 | volumes: 14 | - .:/pycaption 15 | 16 | test_py311: 17 | image: python:3.11-slim-bullseye 18 | command: sh -c " 19 | cd pycaption; 20 | pip install --upgrade pip; 21 | pip install -r test_requirements.txt; 22 | pip install -e .; 23 | pytest -vvvv --color=yes --junit-xml=junit.xml --cov=pycaption --cov-report xml:coverage.xml; 24 | " 25 | volumes: 26 | - .:/pycaption 27 | 28 | test_py312: 29 | image: python:3.12-slim-bullseye 30 | command: sh -c " 31 | cd pycaption; 32 | pip install --upgrade pip; 33 | pip install -r test_requirements.txt; 34 | pip install -e .; 35 | pytest -vvvv --color=yes --junit-xml=junit.xml --cov=pycaption --cov-report xml:coverage.xml; 36 | " 37 | volumes: 38 | - .:/pycaption -------------------------------------------------------------------------------- /pycaption/transcript.py: -------------------------------------------------------------------------------- 1 | from pycaption.base import BaseWriter, CaptionNode 2 | 3 | 4 | class TranscriptWriter(BaseWriter): 5 | def 
__init__(self, *args, **kw): 6 | try: 7 | from nltk import PunktSentenceTokenizer 8 | 9 | self.tokenizer = PunktSentenceTokenizer() 10 | except ModuleNotFoundError as exc: 11 | raise ModuleNotFoundError( 12 | "Missing Dependency: You must install nltk" 13 | ) from exc 14 | 15 | def write(self, captions): 16 | transcripts = [] 17 | 18 | for lang in captions.get_languages(): 19 | lang_transcript = "" 20 | 21 | for caption in captions.get_captions(lang): 22 | lang_transcript = self._strip_text(caption.nodes, lang_transcript) 23 | 24 | lang_transcript = "\n".join(self.tokenizer.tokenize(lang_transcript)) 25 | transcripts.append(lang_transcript) 26 | 27 | return "\n".join(transcripts) 28 | 29 | def _strip_text(self, elements, lang_transcript): 30 | return " ".join( 31 | [lang_transcript] 32 | + [el.content for el in elements if el.type_ == CaptionNode.TEXT] 33 | ) 34 | -------------------------------------------------------------------------------- /pycaption/exceptions.py: -------------------------------------------------------------------------------- 1 | class CaptionReadError(Exception): 2 | """ 3 | Generic error raised when the reading of the caption file failed. 4 | """ 5 | def __str__(self): 6 | return f'{self.__class__.__name__}({self.args[0]})' 7 | 8 | 9 | class CaptionReadNoCaptions(CaptionReadError): 10 | """ 11 | Error raised when the provided caption file was not containing any 12 | actual captions. 13 | """ 14 | 15 | 16 | class CaptionReadSyntaxError(CaptionReadError): 17 | """ 18 | Error raised when the provided caption file has syntax errors and could 19 | not be parsed. 20 | """ 21 | 22 | 23 | class CaptionReadTimingError(CaptionReadError): 24 | """ 25 | Error raised when a Caption is initialized with invalid timings. 
26 | """ 27 | 28 | 29 | class RelativizationError(Exception): 30 | """ 31 | Error raised when absolute positioning cannot be converted to 32 | percentage 33 | """ 34 | 35 | 36 | class InvalidInputError(RuntimeError): 37 | """Error raised when the input is invalid (i.e. a unicode string)""" 38 | 39 | 40 | class CaptionLineLengthError(CaptionReadError): 41 | """ 42 | Error raised when a Caption has a line longer than 32 characters. 43 | """ 44 | -------------------------------------------------------------------------------- /.github/workflows/release_test_pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish PyCaption to Test PyPI 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | call-unit-tests-workflow: 7 | name: Run unit tests 8 | uses: ./.github/workflows/unit_tests.yml 9 | secrets: 10 | SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} 11 | SLACK_CHANNEL_ID: ${{ secrets.SLACK_CHANNEL_ID }} 12 | 13 | build-n-publish: 14 | name: Build and publish PyCaption to Test PyPI 15 | needs: call-unit-tests-workflow 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 3.9 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install build 29 | python -m pip install --upgrade twine 30 | 31 | - name: Build package 32 | run: python -m build 33 | timeout-minutes: 10 34 | 35 | - name: Publish package on Test PyPI 36 | run: python -m twine upload --verbose dist/* 37 | env: 38 | TWINE_USERNAME: __token__ 39 | TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }} 40 | TWINE_REPOSITORY: "testpypi" 41 | -------------------------------------------------------------------------------- /pycaption/scc/translator.py: -------------------------------------------------------------------------------- 1 | from pycaption.scc.constants import ALL_CHARACTERS, COMMAND_LABELS 2 | 3 | 4 | 
def translate_scc(scc_content, brackets='[]'): 5 | """ 6 | Replaces hexadecimal words with their meaning 7 | 8 | In order to make SCC files more human-readable and easier to debug, 9 | this function is used to replace command codes with their labels and 10 | character bytes with their actual characters 11 | 12 | :param scc_content: SCC captions to be translated 13 | :type scc_content: str 14 | :param brackets: Brackets to group the translated content of a command 15 | :type brackets: str 16 | :return: Translated SCC captions 17 | :rtype: str 18 | """ 19 | opening_bracket, closing_bracket = brackets if brackets else ('', '') 20 | scc_elements = set(scc_content.split()) 21 | for elem in scc_elements: 22 | name = COMMAND_LABELS.get(elem, ALL_CHARACTERS.get(elem)) 23 | # If a 2 byte command was not found, try retrieving 1 byte characters 24 | if not name: 25 | char1 = ALL_CHARACTERS.get(elem[:2]) 26 | char2 = ALL_CHARACTERS.get(elem[2:]) 27 | if char1 is not None and char2 is not None: 28 | name = f"{char1}{char2}" 29 | if name: 30 | scc_content = scc_content.replace( 31 | elem, f"{opening_bracket}{name}{closing_bracket}") 32 | return scc_content 33 | -------------------------------------------------------------------------------- /tests/test_scc_translator.py: -------------------------------------------------------------------------------- 1 | from pycaption.scc.translator import translate_scc 2 | 3 | 4 | class TestSCCTranslator: 5 | 6 | def test_successful_translation( 7 | self, sample_scc_pop_on, sample_translated_scc_success): 8 | result = translate_scc(sample_scc_pop_on) 9 | 10 | assert sample_translated_scc_success == result 11 | 12 | def test_no_brackets( 13 | self, sample_scc_pop_on, sample_translated_scc_no_brackets): 14 | result = translate_scc(sample_scc_pop_on, brackets=None) 15 | 16 | assert sample_translated_scc_no_brackets == result 17 | 18 | def test_custom_brackets( 19 | self, sample_scc_pop_on, sample_translated_scc_custom_brackets): 20 | result = 
translate_scc(sample_scc_pop_on, brackets="{}") 21 | 22 | assert sample_translated_scc_custom_brackets == result 23 | 24 | def test_commands_not_found(self, sample_scc_with_unknown_commands, 25 | sample_translated_scc_commands_not_found): 26 | result = translate_scc(sample_scc_with_unknown_commands) 27 | 28 | assert sample_translated_scc_commands_not_found == result 29 | 30 | def test_special_and_extended_characters( 31 | self, sample_scc_special_and_extended_characters, 32 | sample_translated_scc_special_and_extended_characters): 33 | result = translate_scc(sample_scc_special_and_extended_characters) 34 | 35 | assert sample_translated_scc_special_and_extended_characters == result 36 | -------------------------------------------------------------------------------- /pycaption/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import ( 2 | CaptionConverter, CaptionNode, Caption, CaptionList, CaptionSet, 3 | ) 4 | from .dfxp import DFXPWriter, DFXPReader 5 | from .microdvd import MicroDVDReader, MicroDVDWriter 6 | from .sami import SAMIReader, SAMIWriter 7 | from .srt import SRTReader, SRTWriter 8 | from .scc import SCCReader, SCCWriter 9 | from .scc.translator import translate_scc 10 | from .transcript import TranscriptWriter 11 | from .webvtt import WebVTTReader, WebVTTWriter 12 | from .exceptions import ( 13 | CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError, CaptionLineLengthError 14 | ) 15 | 16 | 17 | __all__ = [ 18 | 'CaptionConverter', 'DFXPReader', 'DFXPWriter', 'MicroDVDReader', 19 | 'MicroDVDWriter', 'SAMIReader', 'SAMIWriter', 'SRTReader', 'SRTWriter', 20 | 'SCCReader', 'SCCWriter', 'translate_scc', 'WebVTTReader', 'WebVTTWriter', 21 | 'CaptionReadError', 'CaptionReadNoCaptions', 'CaptionReadSyntaxError', 22 | 'detect_format', 'CaptionNode', 'Caption', 'CaptionList', 'CaptionSet', 23 | 'TranscriptWriter' 24 | ] 25 | 26 | SUPPORTED_READERS = ( 27 | DFXPReader, MicroDVDReader, 
WebVTTReader, SAMIReader, SRTReader, SCCReader, 28 | ) 29 | 30 | 31 | def detect_format(caps): 32 | """ 33 | Detect the format of the provided caption string. 34 | 35 | :returns: the reader class for the detected format. 36 | """ 37 | if not len(caps): 38 | raise CaptionReadNoCaptions("Empty caption file") 39 | 40 | for reader in SUPPORTED_READERS: 41 | if reader().detect(caps): 42 | return reader 43 | 44 | return None 45 | -------------------------------------------------------------------------------- /tests/fixtures/microdvd.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(scope="session") 5 | def sample_microdvd(): 6 | return """{230}{307}( clock ticking ) 7 | {371}{425}MAN:|When we think|\u266a ...say bow, wow, \u266a 8 | {425}{468}we have this vision of Einstein 9 | {468}{522}as an old, wrinkly man|with white hair. 10 | {522}{669}MAN 2:|E equals m c-squared is|not about an old Einstein. 11 | {669}{805}MAN 2:|It's all about an eternal Einstein. 12 | {805}{905} 13 | """ 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def sample_microdvd_2(): 18 | return """{230}{307}( clock ticking ) 19 | {371}{425}MAN:|When we think|\u266a ...say bow, wow, \u266a 20 | {425}{468}we have this vision of Einstein 21 | {468}{522}|as an old, wrinkly man|with white hair. 22 | {522}{669}MAN 2:|E equals m c-squared is|not about an old Einstein. 23 | {669}{805}MAN 2:|It's all about an eternal Einstein. 
24 | {805}{905} 25 | """ 26 | 27 | 28 | @pytest.fixture(scope="session") 29 | def sample_microdvd_invalid_format(): 30 | return """{230}{307}( clock ticking ) 31 | {}{425}{567} MAN:|When we think|\u266a ...say bow, wow, \u266a 32 | {425}{468}we have this vision of Einstein 33 | """ 34 | 35 | 36 | @pytest.fixture(scope="session") 37 | def missing_fps_sample_microdvd(): 38 | return """{301}{307}( clock ticking ) 39 | {0}{0} MAN:|When we think|\u266a ...say bow, wow, \u266a 40 | """ 41 | 42 | 43 | @pytest.fixture(scope="session") 44 | def sample_microdvd_empty(): 45 | return """ 46 | """ 47 | 48 | 49 | @pytest.fixture(scope="session") 50 | def sample_microdvd_empty_cue_output(): 51 | return """{30}{57}abc""" 52 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | py-caption 2 | ========== 3 | 4 | |Build Status| |Python Versions| |Pre-Commit| |Dependencies| |License| 5 | 6 | ``pycaption`` is a caption reading/writing module. Use one of the given Readers 7 | to read content into a CaptionSet object, and then use one of the Writers to 8 | output the CaptionSet into captions of your desired format. 9 | 10 | Tested with Python versions 3.10, 3.11 and 3.12. 11 | (for Python 2 use pycaption < 1.0.0) 12 | 13 | For details, see the `documentation <https://pycaption.readthedocs.io/>`__. 14 | 15 | License 16 | ------- 17 | 18 | This module is Copyright (c) 2012-2025 PBS.org and is available under the `Apache 19 | License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0>`__. 20 | 21 | .. |Build Status| image:: https://github.com/pbs/pycaption/actions/workflows/unit_tests.yml/badge.svg 22 | :target: https://github.com/pbs/pycaption/actions/workflows/unit_tests.yml 23 | :alt: Unit Tests 24 | 25 | .. |Pre-Commit| image:: https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white 26 | :target: https://github.com/pre-commit/pre-commit 27 | :alt: pre-commit 28 | 29 | .. 
|Dependencies| image:: https://img.shields.io/librariesio/release/pypi/pycaption 30 | :target: https://libraries.io/pypi/pycaption 31 | :alt: Dependencies 32 | 33 | .. |Python Versions| image:: https://img.shields.io/pypi/pyversions/pycaption 34 | :target: https://pypi.org/project/pycaption/ 35 | :alt: Python Versions 36 | 37 | .. |License| image:: https://img.shields.io/github/license/pbs/pycaption 38 | :target: https://github.com/pbs/pycaption/blob/main/LICENSE 39 | :alt: License 40 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | from setuptools import find_packages, setup 5 | 6 | README_PATH = os.path.join( 7 | os.path.abspath(os.path.dirname(__file__)), 8 | "README.rst", 9 | ) 10 | 11 | dependencies = [ 12 | "beautifulsoup4>=4.12.1", 13 | "lxml>=4.9.1", 14 | "cssutils>=2.0.0", 15 | ] 16 | 17 | dev_dependencies = ["pytest", "pytest-lazy-fixture"] 18 | 19 | transcript_dependencies = ["nltk==3.9.1"] 20 | 21 | setup( 22 | name="pycaption", 23 | version="2.2.19", 24 | description="Closed caption converter", 25 | long_description=open(README_PATH).read(), 26 | author="Joe Norton", 27 | author_email="joey@nortoncrew.com", 28 | project_urls={ 29 | "Source": "https://github.com/pbs/pycaption", 30 | "Documentation": "https://pycaption.readthedocs.io/", 31 | "Release notes": "https://pycaption.readthedocs.io" "/en/stable/changelog.html", 32 | }, 33 | python_requires=">=3.10,<4.0", 34 | install_requires=dependencies, 35 | extras_require={"dev": dev_dependencies, "transcript": transcript_dependencies}, 36 | packages=find_packages(), 37 | include_package_data=True, 38 | classifiers=[ 39 | "Development Status :: 5 - Production/Stable", 40 | "License :: OSI Approved :: Apache Software License", 41 | "Operating System :: OS Independent", 42 | "Programming Language :: Python", 43 | "Programming Language :: 
Python :: 3", 44 | "Programming Language :: Python :: 3.10", 45 | "Programming Language :: Python :: 3.11", 46 | "Programming Language :: Python :: 3.12", 47 | "Topic :: Software Development :: Libraries", 48 | "Topic :: Software Development :: Libraries :: Python Modules", 49 | "Topic :: Multimedia :: Video", 50 | ], 51 | test_suite="tests", 52 | ) 53 | -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pycaption.base import CaptionList, Caption 4 | 5 | 6 | class TestCaption: 7 | def setup_method(self): 8 | self.caption = Caption(0, 999999999999, ['test']) 9 | 10 | def test_format_start(self): 11 | assert self.caption.format_start() == '00:00:00.000' 12 | 13 | def test_format_end(self): 14 | assert self.caption.format_end() == '13:46:39.999' 15 | 16 | 17 | class TestCaptionList: 18 | def setup_method(self): 19 | self.layout_info = "My Layout" 20 | self.caps = CaptionList([1, 2, 3], layout_info=self.layout_info) 21 | 22 | def test_splice(self): 23 | newcaps = self.caps[1:] 24 | 25 | assert isinstance(newcaps, CaptionList) 26 | assert newcaps.layout_info == self.layout_info 27 | 28 | def test_mul(self): 29 | newcaps = self.caps * 2 30 | 31 | assert isinstance(newcaps, CaptionList) 32 | assert newcaps.layout_info == self.layout_info 33 | 34 | def test_rmul(self): 35 | newcaps = 2 * self.caps 36 | 37 | assert isinstance(newcaps, CaptionList) 38 | assert newcaps.layout_info == self.layout_info 39 | 40 | def test_add_list_to_caption_list(self): 41 | newcaps = self.caps + [9, 8, 7] 42 | 43 | assert isinstance(newcaps, CaptionList) 44 | assert newcaps.layout_info == self.layout_info 45 | 46 | def test_add_two_caption_lists(self): 47 | newcaps = self.caps + CaptionList([4], layout_info=None) 48 | 49 | assert isinstance(newcaps, CaptionList) 50 | assert newcaps.layout_info == self.layout_info 51 | 52 | newcaps = 
self.caps + CaptionList([4], layout_info=self.layout_info) 53 | 54 | assert isinstance(newcaps, CaptionList) 55 | assert newcaps.layout_info == self.layout_info 56 | 57 | with pytest.raises(ValueError): 58 | newcaps = self.caps + CaptionList([4], layout_info="Other Layout") 59 | -------------------------------------------------------------------------------- /tests/test_srt_conversion.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from pycaption import ( 4 | DFXPReader, SAMIReader, SRTReader, SRTWriter, WebVTTReader, 5 | ) 6 | 7 | from tests.mixins import SRTTestingMixIn 8 | 9 | 10 | class TestDFXPtoSRT(SRTTestingMixIn): 11 | def setup_class(self): 12 | self.reader = DFXPReader() 13 | self.writer = SRTWriter() 14 | 15 | def test_dfxp_to_srt_conversion(self, sample_srt, sample_dfxp): 16 | caption_set = self.reader.read(sample_dfxp) 17 | results = self.writer.write(caption_set) 18 | 19 | assert isinstance(results, str) 20 | self.assert_srt_equals(sample_srt, results) 21 | 22 | def test_dfxp_empty_cue_to_srt(self, sample_srt_empty_cue_output, 23 | sample_dfxp_empty_cue): 24 | caption_set = self.reader.read(sample_dfxp_empty_cue) 25 | results = self.writer.write(caption_set) 26 | 27 | self.assert_srt_equals(sample_srt_empty_cue_output, results) 28 | 29 | 30 | class TestSAMItoSRT(SRTTestingMixIn): 31 | def test_sami_to_srt_conversion(self, sample_srt, sample_sami): 32 | caption_set = SAMIReader().read(sample_sami) 33 | results = SRTWriter().write(caption_set) 34 | 35 | assert isinstance(results, str) 36 | self.assert_srt_equals(sample_srt, results) 37 | 38 | 39 | class TestSRTtoSRT(SRTTestingMixIn): 40 | def setup_class(self): 41 | self.reader = SRTReader() 42 | self.writer = SRTWriter() 43 | 44 | def test_srt_to_srt_conversion(self, sample_srt): 45 | caption_set = self.reader.read(sample_srt) 46 | results = self.writer.write(caption_set) 47 | 48 | assert isinstance(results, str) 49 | 
import pytest

from pycaption import MicroDVDReader, CaptionReadNoCaptions
from pycaption.exceptions import CaptionReadSyntaxError, CaptionReadTimingError
from pycaption.base import DEFAULT_LANGUAGE_CODE
from tests.mixins import ReaderTestingMixIn


class TestMicroDVDReader(ReaderTestingMixIn):
    """Format detection and parsing tests for ``MicroDVDReader``."""

    @classmethod
    def setup_class(cls):
        # pytest calls setup_class once per class and passes the class
        # object, so the shared reader lives on the class itself.
        cls.reader = MicroDVDReader()

    def test_positive_answer_for_detection(self, sample_microdvd):
        super().assert_positive_answer_for_detection(sample_microdvd)

    def test_negative_answer_for_detection_dfxp(self, sample_dfxp):
        super().assert_negative_answer_for_detection(sample_dfxp)

    def test_negative_answer_for_detection_sami(self, sample_sami):
        super().assert_negative_answer_for_detection(sample_sami)

    def test_negative_answer_for_detection_scc_pop_on(self, sample_scc_pop_on):
        super().assert_negative_answer_for_detection(sample_scc_pop_on)

    def test_negative_answer_for_detection_srt(self, sample_srt):
        super().assert_negative_answer_for_detection(sample_srt)

    def test_negative_answer_for_detection_webvtt(self, sample_webvtt):
        super().assert_negative_answer_for_detection(sample_webvtt)

    def test_caption_length(self, sample_microdvd):
        captions = self.reader.read(sample_microdvd)

        assert 7 == len(captions.get_captions(DEFAULT_LANGUAGE_CODE))

    def test_proper_timestamps(self, sample_microdvd):
        captions = self.reader.read(sample_microdvd)
        paragraph = captions.get_captions(DEFAULT_LANGUAGE_CODE)[2]

        # due to lossy nature of microsec -> frame# we check that
        # conversion is within a second of expected value
        # (fyi: timestamps in examples/ and tests/fixtures/ differ)
        one_second = 10 ** 6
        assert abs(17350000 - paragraph.start) < one_second
        assert abs(18752000 - paragraph.end) < one_second

    def test_empty_file(self, sample_microdvd_empty):
        with pytest.raises(CaptionReadNoCaptions):
            self.reader.read(sample_microdvd_empty)

    def test_invalid_format(self, sample_microdvd_invalid_format):
        with pytest.raises(CaptionReadSyntaxError):
            self.reader.read(sample_microdvd_invalid_format)

    def test_no_fps_provided(self, missing_fps_sample_microdvd):
        with pytest.raises(CaptionReadTimingError):
            self.reader.read(missing_fps_sample_microdvd)
TOLERANCE_MICROSECONDS = 600 * 1000


class TestSRTtoSCCtoSRT(CaptionSetTestingMixIn):
    """Round-trip SRT -> SCC -> SRT and compare the timings loosely."""

    def _test_srt_to_scc_to_srt_conversion(self, srt_captions):
        original_set = SRTReader().read(srt_captions)

        # SRT caption set -> SCC text -> caption set -> SRT text -> caption set
        scc_text = SCCWriter().write(original_set)
        set_from_scc = SCCReader().read(scc_text)
        srt_text = SRTWriter().write(set_from_scc)
        round_tripped_set = SRTReader().read(srt_text)

        self.assert_captionset_almost_equals(
            original_set, round_tripped_set, TOLERANCE_MICROSECONDS
        )

    def test_srt_to_scc_to_srt_conversion(self, sample_srt_ascii):
        self._test_srt_to_scc_to_srt_conversion(sample_srt_ascii)


# The following test fails -- maybe a bug with SCCReader
#    def test_srt_to_srt_unicode_conversion(self):
#        self._test_srt_to_scc_to_srt_conversion(SAMPLE_SRT_UNICODE)


class TestSCCtoDFXP:
    """SCC input rendered as DFXP must match the stored expected output."""

    def test_scc_to_dfxp(
        self, sample_dfxp_from_scc_output, sample_scc_multiple_positioning
    ):
        parsed = SCCReader().read(sample_scc_multiple_positioning)
        writer = DFXPWriter(relativize=False, fit_to_screen=False)
        dfxp = writer.write(parsed)

        assert sample_dfxp_from_scc_output == dfxp

    def test_dfxp_is_valid_xml_when_scc_source_has_weird_italic_commands(
        self,
        sample_dfxp_with_properly_closing_spans_output,
        sample_scc_created_dfxp_with_wrongly_closing_spans,
    ):
        parsed = SCCReader().read(
            sample_scc_created_dfxp_with_wrongly_closing_spans
        )
        dfxp = DFXPWriter().write(parsed)

        assert dfxp == sample_dfxp_with_properly_closing_spans_output

    def test_dfxp_is_valid_xml_when_scc_source_has_ampersand_character(
        self, sample_dfxp_with_ampersand_character, sample_scc_with_ampersand_character
    ):
        parsed = SCCReader().read(sample_scc_with_ampersand_character)
        dfxp = DFXPWriter().write(parsed)

        assert dfxp == sample_dfxp_with_ampersand_character


class TestSCCToWebVTT:
    """SCC input rendered as WebVTT."""

    def test_webvtt_newlines_are_properly_rendered(
        self,
        sample_webvtt_from_scc_properly_writes_newlines_output,
        scc_that_generates_webvtt_with_proper_newlines,
    ):
        parsed = SCCReader().read(scc_that_generates_webvtt_with_proper_newlines)
        webvtt = WebVTTWriter().write(parsed)

        assert webvtt == sample_webvtt_from_scc_properly_writes_newlines_output
captions.get_captions("en-US")[2] 37 | 38 | assert 17000000 == third_paragraph.start 39 | assert 18752000 == third_paragraph.end 40 | 41 | def test_numeric_captions(self, sample_srt_numeric): 42 | captions = self.reader.read(sample_srt_numeric) 43 | paragraphs = captions.get_captions("en-US") 44 | 45 | assert 7 == len(captions.get_captions("en-US")) 46 | assert paragraphs[-3].get_text() == "NUMBER IS 662-429-84-77." 47 | assert paragraphs[-1].get_text() == "3" 48 | 49 | def test_empty_file(self, sample_srt_empty): 50 | with pytest.raises(CaptionReadNoCaptions) as exc_info: 51 | self.reader.read(sample_srt_empty) 52 | assert exc_info.value.args[0] == 'empty caption file' 53 | 54 | def test_extra_empty_line(self, sample_srt_blank_lines): 55 | captions = self.reader.read(sample_srt_blank_lines) 56 | paragraphs = captions.get_captions("en-US") 57 | 58 | assert 2 == len(paragraphs) 59 | assert '\n' not in paragraphs[0].get_text() 60 | assert '\n' not in paragraphs[1].get_text() 61 | 62 | def test_extra_trailing_empty_line(self, sample_srt_trailing_blanks): 63 | captions = self.reader.read(sample_srt_trailing_blanks) 64 | paragraphs = captions.get_captions("en-US") 65 | 66 | assert 2 == len(paragraphs) 67 | assert '\n' not in paragraphs[0].get_text() 68 | assert '\n' not in paragraphs[1].get_text() 69 | 70 | def test_timestamps_without_micro( 71 | self, sample_srt_timestamps_without_microseconds): 72 | captions = self.reader.read(sample_srt_timestamps_without_microseconds) 73 | first_paragraph = captions.get_captions("en-US")[0] 74 | 75 | assert 13000000 == first_paragraph.start 76 | assert 16000000 == first_paragraph.end 77 | -------------------------------------------------------------------------------- /.github/workflows/unit_tests.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | workflow_call: 9 | secrets: 10 | 
SLACK_BOT_TOKEN: 11 | required: true 12 | SLACK_CHANNEL_ID: 13 | required: true 14 | workflow_dispatch: 15 | 16 | jobs: 17 | build: 18 | 19 | runs-on: ubuntu-latest 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | python-version: ["py310", "py311", "py312"] 24 | 25 | steps: 26 | - uses: actions/checkout@v2 27 | 28 | - name: Run Test 29 | id: tests 30 | run: | 31 | ./run_tests.sh test_${{ matrix.python-version }} 32 | continue-on-error: true 33 | 34 | - name: Archive production artifacts 35 | uses: actions/upload-artifact@v4 36 | with: 37 | name: test-report-${{ matrix.python-version }} 38 | path: junit.xml 39 | 40 | - name: Archive code coverage results 41 | uses: actions/upload-artifact@v4 42 | with: 43 | name: code-coverage-report-${{ matrix.python-version }} 44 | path: coverage.xml 45 | 46 | - name: Add context info to env 47 | run: | 48 | sudo apt-get install -y --no-install-recommends libxml-xpath-perl 49 | COVERAGE=`xpath -q -e "floor(/coverage/@line-rate * 100)" coverage.xml` 50 | FAILED_AMOUNT=`xpath -q -e "number(/testsuites/testsuite/@failures)" junit.xml` 51 | SKIPPED_AMOUNT=`xpath -q -e "number(/testsuites/testsuite/@skipped)" junit.xml` 52 | PASSED_AMOUNT=`xpath -q -e "/testsuites/testsuite/@tests - $SKIPPED_AMOUNT - $FAILED_AMOUNT" junit.xml` 53 | echo "COVERAGE=$COVERAGE" >> $GITHUB_ENV 54 | echo "FAILED_AMOUNT=$FAILED_AMOUNT" >> $GITHUB_ENV 55 | echo "PASSED_AMOUNT=$PASSED_AMOUNT" >> $GITHUB_ENV 56 | ${{ contains(github.ref, 'tags/') }} \ 57 | && BRANCH='refs/heads/main' \ 58 | || BRANCH=${{ github.head_ref || github.ref }} 59 | echo "BRANCH=${BRANCH#refs/*/}" >> $GITHUB_ENV 60 | 61 | - name: Notify if test FAILED 62 | uses: archive/github-actions-slack@v2.0.0 63 | with: 64 | slack-bot-user-oauth-access-token: ${{ secrets.SLACK_BOT_TOKEN }} 65 | slack-channel: ${{ secrets.SLACK_CHANNEL_ID }} 66 | slack-text: ":boom: *${{ env.FAILED_AMOUNT }}* Pycaption test(s) failed for Python *${{ matrix.python-version }}* on the *${{ env.BRANCH }}* branch 
(triggered by *${{ github.actor }}*)" 67 | if: steps.tests.outcome == 'failure' && !github.event.pull_request.head.repo.fork 68 | 69 | - name: Mark test failure 70 | run: exit 1 71 | if: steps.tests.outcome == 'failure' 72 | 73 | - name: Slack notify tests PASSED 74 | uses: archive/github-actions-slack@v2.0.0 75 | with: 76 | slack-bot-user-oauth-access-token: ${{ secrets.SLACK_BOT_TOKEN }} 77 | slack-channel: ${{ secrets.SLACK_CHANNEL_ID }} 78 | slack-text: ":rocket: All (*${{ env.PASSED_AMOUNT }}*) Pycaption tests passed for Python *${{ matrix.python-version }}* covering *${{ env.COVERAGE }}%* of code on the *${{ env.BRANCH }}* branch (triggered by *${{ github.actor }}*)" 79 | if: steps.tests.outcome == 'success' && !github.event.pull_request.head.repo.fork 80 | -------------------------------------------------------------------------------- /tests/fixtures/srt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(scope="session") 5 | def sample_srt(): 6 | return """1 7 | 00:00:09,209 --> 00:00:12,312 8 | ( clock ticking ) 9 | 10 | 2 11 | 00:00:14,848 --> 00:00:17,000 12 | MAN: 13 | When we think 14 | \u266a ...say bow, wow, \u266a 15 | 16 | 3 17 | 00:00:17,000 --> 00:00:18,752 18 | we have this vision of Einstein 19 | 20 | 4 21 | 00:00:18,752 --> 00:00:20,887 22 | as an old, wrinkly man 23 | with white hair. 24 | 25 | 5 26 | 00:00:20,887 --> 00:00:26,760 27 | MAN 2: 28 | E equals m c-squared is 29 | not about an old Einstein. 30 | 31 | 6 32 | 00:00:26,760 --> 00:00:32,200 33 | MAN 2: 34 | It's all about an eternal Einstein. 
35 | 36 | 7 37 | 00:00:32,200 --> 00:00:36,200 38 | 39 | """ 40 | 41 | 42 | @pytest.fixture(scope="session") 43 | def sample_srt_ascii(): 44 | return """1 45 | 00:00:09,209 --> 00:00:12,312 46 | ( clock ticking ) 47 | 48 | 2 49 | 00:00:14,848 --> 00:00:17,000 50 | MAN: 51 | When we think 52 | of "E equals m c-squared", 53 | 54 | 3 55 | 00:00:17,000 --> 00:00:18,752 56 | we have this vision of Einstein 57 | 58 | 4 59 | 00:00:18,752 --> 00:00:20,887 60 | as an old, wrinkly man 61 | with white hair. 62 | 63 | 5 64 | 00:00:20,887 --> 00:00:26,760 65 | MAN 2: 66 | E equals m c-squared is 67 | not about an old Einstein. 68 | 69 | 6 70 | 00:00:26,760 --> 00:00:32,200 71 | MAN 2: 72 | It's all about an eternal Einstein. 73 | 74 | 7 75 | 00:00:32,200 --> 00:00:34,400 76 | 77 | 78 | 8 79 | 00:00:34,400 --> 00:00:38,400 80 | some more text 81 | """ 82 | 83 | 84 | @pytest.fixture(scope="session") 85 | def sample_srt_numeric(): 86 | return """35 87 | 00:00:32,290 --> 00:00:32,890 88 | TO FIND HIM. IF 89 | 90 | 36 91 | 00:00:32,990 --> 00:00:33,590 92 | YOU HAVE ANY INFORMATION 93 | 94 | 37 95 | 00:00:33,690 --> 00:00:34,290 96 | THAT CAN HELP, CALL THE 97 | 98 | 38 99 | 00:00:34,390 --> 00:00:35,020 100 | STOPPERS LINE. THAT 101 | 102 | 39 103 | 00:00:35,120 --> 00:00:35,760 104 | NUMBER IS 662-429-84-77. 
105 | 106 | 40 107 | 00:00:35,860 --> 00:00:36,360 108 | STD OUT 109 | 110 | 41 111 | 00:00:36,460 --> 00:02:11,500 112 | 3 113 | """ 114 | 115 | 116 | @pytest.fixture(scope="session") 117 | def sample_srt_empty(): 118 | return """ 119 | """ 120 | 121 | 122 | @pytest.fixture(scope="session") 123 | def sample_srt_blank_lines(): 124 | return """35 125 | 00:00:32,290 --> 00:00:32,890 126 | 127 | 128 | 36 129 | 00:00:32,990 --> 00:00:33,590 130 | YOU HAVE ANY INFORMATION 131 | 132 | """ 133 | 134 | 135 | @pytest.fixture(scope="session") 136 | def sample_srt_trailing_blanks(): 137 | return """35 138 | 00:00:32,290 --> 00:00:32,890 139 | HELP I SAY 140 | 141 | 142 | 36 143 | 00:00:32,990 --> 00:00:33,590 144 | YOU HAVE ANY INFORMATION 145 | 146 | 147 | 148 | """ 149 | 150 | 151 | @pytest.fixture(scope="session") 152 | def samples_srt_same_time(): 153 | return """1 154 | 00:00:05,213 --> 00:00:10,552 155 | SO NO ONE TOLD YOU 156 | 157 | 2 158 | 00:00:05,213 --> 00:00:10,552 159 | LIFE WAS GONNA BE THIS WAY 160 | 161 | 3 162 | 00:00:10,566 --> 00:00:10,580 163 | YOUR JOB IS A JOKE, YOUR ARE BROKE 164 | 165 | 4 166 | 00:00:10,594 --> 00:00:10,600 167 | IT IS LIKE YOU ARE ALWAYS STUCK 168 | 169 | 5 170 | 00:00:10,594 --> 00:00:10,600 171 | IN A SECOND GEAR 172 | """ 173 | 174 | 175 | @pytest.fixture(scope="session") 176 | def sample_srt_empty_cue_output(): 177 | return """\ 178 | 1 179 | 00:00:01,209 --> 00:00:02,312 180 | abc 181 | """ 182 | 183 | 184 | @pytest.fixture(scope="session") 185 | def sample_srt_timestamps_without_microseconds(): 186 | return """\ 187 | 1 188 | 00:00:13 --> 00:00:16 189 | Guard this envelope. 
import re
from copy import deepcopy

from .base import (
    BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode,
    DEFAULT_LANGUAGE_CODE,
)
from .exceptions import (
    CaptionReadNoCaptions, CaptionReadSyntaxError, CaptionReadTimingError,
    InvalidInputError,
)


class MicroDVDReader(BaseReader):
    """Parse MicroDVD text, i.e. lines shaped like ``{start}{end}text``.

    ``start`` and ``end`` are frame numbers. They are converted to
    microseconds with the frame rate given by the most recent
    ``{0}{0}<fps>`` line, or with 25 fps while no such line has been read.
    """

    def detect(self, content):
        """Return True when ``content`` begins with ``{digits}{digits}``."""
        return re.match(r"{\d+}{\d+}", content) is not None

    def read(self, content, lang=DEFAULT_LANGUAGE_CODE):
        """Build a ``CaptionSet`` holding the captions found in ``content``.

        :param content: the MicroDVD text; must be a ``str``
        :param lang: language key under which the captions are stored
        :raises InvalidInputError: if ``content`` is not a ``str``
        :raises CaptionReadSyntaxError: if a non-empty line does not have
            the ``{start}{end}text`` shape
        :raises CaptionReadTimingError: if a ``{0}{0}`` line does not carry
            a positive, finite frame rate
        :raises CaptionReadNoCaptions: if the resulting set is empty
        """
        if not isinstance(content, str):
            raise InvalidInputError('The content is not a unicode string.')

        captions = CaptionList()
        fps = 25.0
        for line in content.splitlines():
            if not line:
                continue

            m = re.match(r"{(\d+)}{(\d+)}(.*)", line)
            if not m:
                raise CaptionReadSyntaxError(
                    "Line does not match expected format")

            start, end, txt = m.groups()

            # A {0}{0} line announces the frame rate instead of a caption.
            if start == '0' and end == '0':
                fps = self._parse_fps(txt)
                continue

            caption_start = self._framestomicro(int(start), fps)
            caption_end = self._framestomicro(int(end), fps)

            # '|' separates the display lines of one caption.
            nodes = []
            for segment in txt.split('|'):
                # skip extra blank lines
                if segment != '':
                    nodes.append(CaptionNode.create_text(segment))
                    nodes.append(CaptionNode.create_break())

            # remove last line break from end of caption list
            if nodes:
                nodes.pop()

            captions.append(Caption(caption_start, caption_end, nodes))

        caption_set = CaptionSet({lang: captions})
        # NOTE(review): looks redundant with the constructor argument above;
        # confirm against CaptionSet before removing.
        caption_set.set_captions(lang, captions)

        if caption_set.is_empty():
            raise CaptionReadNoCaptions("Empty caption file")

        return caption_set

    @staticmethod
    def _parse_fps(txt):
        """Turn the text of a ``{0}{0}`` line into a usable frame rate.

        :raises CaptionReadTimingError: if ``txt`` is not a number, or the
            number is zero, negative, infinite or NaN. Such values would
            otherwise fail later, inside the frame -> microsecond division.
        """
        try:
            fps = float(txt)
        except ValueError as err:
            raise CaptionReadTimingError(
                'FPS information is not provided') from err

        # The chained comparison is False for NaN as well as for +/-inf.
        if not 0 < fps < float('inf'):
            raise CaptionReadTimingError(
                'FPS value must be a positive finite number')

        return fps

    def _framestomicro(self, framenum, fps=25.0):
        """Convert a frame number to whole microseconds at ``fps``."""
        return int(framenum / fps * (10 ** 6))


class MicroDVDWriter(BaseWriter):
    """Render a caption set as MicroDVD text.

    Frame numbers are always computed with the default of 25 fps, and no
    ``{0}{0}`` frame-rate line is written.
    """

    def write(self, caption_set):
        """Return the MicroDVD text for every language of ``caption_set``,
        concatenated in the order given by ``get_languages()``.
        """
        # Work on a copy so the caller's caption set is left untouched.
        caption_set = deepcopy(caption_set)

        return ''.join(
            self._recreate_lang(caption_set.get_captions(lang))
            for lang in caption_set.get_languages()
        )

    def _microtoframes(self, micro, fps=25.0):
        """Convert microseconds to a whole frame number at ``fps``."""
        return int(micro * fps / (10 ** 6))

    def _recreate_lang(self, captions):
        """Return one ``{start}{end}text`` line per caption."""
        parts = []

        for caption in captions:
            start = self._microtoframes(caption.start)
            end = self._microtoframes(caption.end)
            parts.append(f'{{{start}}}{{{end}}}')

            new_content = ''
            for node in caption.nodes:
                new_content = self._recreate_line(new_content, node)

            # Eliminate excessive line breaks
            new_content = new_content.strip() + '\n'
            while '\n\n' in new_content:
                new_content = new_content.replace('\n\n', '\n')
            # Break unnecessary on last line
            while '|\n' in new_content:
                new_content = new_content.replace('|\n', '\n')

            parts.append(new_content)

        return ''.join(parts)

    def _recreate_line(self, sub, line):
        """Append the MicroDVD form of node ``line`` to ``sub``.

        Text nodes contribute their content and break nodes a ``|``; any
        other node type leaves ``sub`` unchanged.
        """
        if line.type_ == CaptionNode.TEXT:
            return sub + line.content
        elif line.type_ == CaptionNode.BREAK:
            return sub + '|'
        else:
            return sub
# Arbitrary values used to test relativization
VIDEO_WIDTH = 640
VIDEO_HEIGHT = 360


class TestDFXPtoSAMI(SAMITestingMixIn):
    """DFXP input rendered as SAMI."""

    def setup_method(self):
        self.reader = DFXPReader()
        self.writer = SAMIWriter()

    def _convert(self, dfxp_content):
        # Read with the shared reader, then write with the shared writer.
        parsed = self.reader.read(dfxp_content)
        return self.writer.write(parsed)

    def test_dfxp_to_sami_conversion(self, sample_sami, sample_dfxp):
        sami = self._convert(sample_dfxp)

        assert isinstance(sami, str)
        self.assert_sami_captions_equal(sample_sami, sami)

    def test_dfxp_to_sami_with_nested_spans(
            self, sample_sami_from_dfxp_with_nested_spans,
            sample_dfxp_with_nested_spans):
        sami = self._convert(sample_dfxp_with_nested_spans)

        assert isinstance(sami, str)
        self.assert_sami_captions_equal(
            sample_sami_from_dfxp_with_nested_spans, sami)

    def test_dfxp_to_sami_with_margins(
            self, sample_dfxp_from_sami_with_margins):
        parsed = self.reader.read(sample_dfxp_from_sami_with_margins)
        sized_writer = SAMIWriter(
            video_width=VIDEO_WIDTH, video_height=VIDEO_HEIGHT)
        sami = sized_writer.write(parsed)

        expected_margins = (
            "margin-right: 6.04%;",
            "margin-bottom: 0%;",
            "margin-top: 0%;",
            "margin-left: 6.04%;",
        )

        assert all(margin in sami for margin in expected_margins)

    def test_dfxp_empty_cue_to_sami(self, sample_sami_empty_cue_output,
                                    sample_dfxp_empty_cue):
        sami = self._convert(sample_dfxp_empty_cue)

        self.assert_sami_captions_equal(sample_sami_empty_cue_output, sami)


class TestSRTtoSAMI(SAMITestingMixIn):
    """SRT input rendered as SAMI."""

    def test_srt_to_sami_conversion(self, sample_sami, sample_srt):
        parsed = SRTReader().read(sample_srt)
        sami = SAMIWriter().write(parsed)

        assert isinstance(sami, str)
        self.assert_sami_captions_equal(sample_sami, sami)


class TestSAMItoSAMI(SAMITestingMixIn):
    """SAMI input written back out as SAMI."""

    def setup_method(self):
        self.reader = SAMIReader()
        self.writer = SAMIWriter()

    def test_sami_to_sami_conversion(self, sample_sami):
        parsed = self.reader.read(sample_sami)
        plain_writer = SAMIWriter(relativize=False, fit_to_screen=False)
        sami = plain_writer.write(parsed)

        assert isinstance(sami, str)
        self.assert_sami_captions_equal(sample_sami, sami)

    def test_sami_with_multi_lang(self, sample_sami_with_separate_multi_lang):
        parsed = self.reader.read(sample_sami_with_separate_multi_lang)
        sami = self.writer.write(parsed)

        assert isinstance(sami, str)
        self.assert_sami_captions_equal(
            sample_sami_with_separate_multi_lang, sami)

    def test_is_relativized(self, sample_sami_partial_margins_relativized,
                            sample_sami_partial_margins):
        # Absolute positioning settings (e.g. px) are converted to percentages
        parsed = self.reader.read(sample_sami_partial_margins)
        sized_writer = SAMIWriter(
            video_width=VIDEO_WIDTH, video_height=VIDEO_HEIGHT)
        sami = sized_writer.write(parsed)

        self.assert_sami_captions_equal(
            sample_sami_partial_margins_relativized, sami)

    def test_missing_language_conversion(self, sample_sami_with_lang,
                                         sample_sami_no_lang):
        parsed = self.reader.read(sample_sami_no_lang)
        sami = self.writer.write(parsed)

        assert isinstance(sami, str)
        self.assert_sami_captions_equal(sample_sami_with_lang, sami)
        assert "lang: und;" in sami


class TestWebVTTtoSAMI(SAMITestingMixIn):
    """WebVTT input rendered as SAMI."""

    def test_webvtt_to_sami_conversion(self, sample_sami, sample_webvtt):
        parsed = WebVTTReader().read(sample_webvtt)
        sami = SAMIWriter().write(parsed)

        assert isinstance(sami, str)
        self.assert_sami_captions_equal(sample_sami, sami)
import re

from pycaption import (
    SAMIReader, SRTReader, WebVTTReader, WebVTTWriter, DFXPWriter,
    MicroDVDWriter,
)

from tests.mixins import (
    WebVTTTestingMixIn, DFXPTestingMixIn, MicroDVDTestingMixIn,
)


class TestSAMItoWebVTT(WebVTTTestingMixIn):
    """SAMI input rendered as WebVTT for a 640x360 video."""

    def _sami_to_webvtt(self, sami_content):
        # Read first, then build the sized writer, as every test here does.
        parsed = SAMIReader().read(sami_content)
        sized_writer = WebVTTWriter(video_width=640, video_height=360)
        return sized_writer.write(parsed)

    def test_conversion(self, sample_webvtt_from_sami, sample_sami):
        webvtt = self._sami_to_webvtt(sample_sami)

        assert isinstance(webvtt, str)
        self.assert_webvtt_equals(sample_webvtt_from_sami, webvtt)

    def test_style_tags_conversion(self, sample_webvtt_from_sami_with_style,
                                   sample_sami_with_style_tags):
        webvtt = self._sami_to_webvtt(sample_sami_with_style_tags)

        assert isinstance(webvtt, str)
        self.assert_webvtt_equals(sample_webvtt_from_sami_with_style, webvtt)

    def test_css_inline_style_conversion(
            self, sample_webvtt_from_sami_with_style,
            sample_sami_with_css_inline_style):
        webvtt = self._sami_to_webvtt(sample_sami_with_css_inline_style)

        assert isinstance(webvtt, str)
        self.assert_webvtt_equals(sample_webvtt_from_sami_with_style, webvtt)

    def test_css_id_style_conversion(
            self, sample_webvtt_from_sami_with_id_style,
            sample_sami_with_css_id_style):
        webvtt = self._sami_to_webvtt(sample_sami_with_css_id_style)

        assert isinstance(webvtt, str)
        self.assert_webvtt_equals(
            sample_webvtt_from_sami_with_id_style, webvtt)


class TestSRTtoWebVTT(WebVTTTestingMixIn):
    """SRT input rendered as WebVTT."""

    def test_srt_to_webvtt_conversion(self, sample_webvtt_from_srt, sample_srt):
        parsed = SRTReader().read(sample_srt)
        webvtt = WebVTTWriter().write(parsed)

        assert isinstance(webvtt, str)
        self.assert_webvtt_equals(sample_webvtt_from_srt, webvtt)


class TestWebVTTtoWebVTT(WebVTTTestingMixIn):
    """WebVTT input written back out as WebVTT."""

    def test_webvtt_to_webvtt_conversion(self, sample_webvtt_from_webvtt,
                                         sample_webvtt):
        parsed = WebVTTReader().read(sample_webvtt)
        webvtt = WebVTTWriter().write(parsed)

        assert isinstance(webvtt, str)
        self.assert_webvtt_equals(sample_webvtt_from_webvtt, webvtt)

    def test_cue_settings_are_kept(self, sample_webvtt_with_cue_settings):
        parsed = WebVTTReader().read(sample_webvtt_with_cue_settings)
        webvtt = WebVTTWriter().write(parsed)

        assert sample_webvtt_with_cue_settings == webvtt

    def test_positioning_is_kept(self,
                                 sample_webvtt_keeps_positioning):
        parsed = WebVTTReader().read(sample_webvtt_keeps_positioning)
        webvtt = WebVTTWriter().write(parsed)

        assert sample_webvtt_keeps_positioning == webvtt

    def test_output_timestamps(self, sample_webvtt_timestamps):
        timing_line = re.compile(
            r'^(\d{2,}):(\d{2})(:\d{2})?\.(\d{3}) '
            r'--> (\d{2,}):(\d{2})(:\d{2})?\.(\d{3})')

        parsed = WebVTTReader().read(sample_webvtt_timestamps)
        output_lines = WebVTTWriter().write(parsed).splitlines()

        # The lines at index 2 and 5 of the output are checked against the
        # timing-line pattern.
        assert timing_line.match(output_lines[2])
        assert timing_line.match(output_lines[5])

    # # TODO: Write a test that includes a WebVTT file with style tags
    # # That will fail because the styles used in the cues are not tracked.
import pytest

from pycaption import CaptionReadSyntaxError
from pycaption.geometry import Size, Point, Stretch, Padding, UnitEnum, Layout


def _pixels():
    """Return a fresh absolute size of 30 pixels."""
    return Size(30, UnitEnum.PIXEL)


def _percent():
    """Return a fresh relative size of 30 percent."""
    return Size(30, UnitEnum.PERCENT)


class TestIsValidGeometryObject:
    """Geometry objects reject missing and ``None`` constructor arguments."""

    def test_size_is_valid(self):
        with pytest.raises(TypeError):
            Size()

        with pytest.raises(ValueError):
            Size(None, None)

    def test_point_is_valid(self):
        with pytest.raises(TypeError):
            Point()

        with pytest.raises(ValueError):
            Point(None, None)

    def test_stretch_is_valid(self):
        with pytest.raises(TypeError):
            Stretch()

        with pytest.raises(ValueError):
            Stretch(None, None)


class TestIsRelative:
    """``is_relative()`` holds only when every component is a percentage."""

    def test_size_is_relative(self):
        assert not _pixels().is_relative()
        assert _percent().is_relative()

    def test_point_is_relative(self):
        absolute = Point(_pixels(), _pixels())
        mixed = Point(_percent(), _pixels())
        relative = Point(_percent(), _percent())

        assert not absolute.is_relative()
        assert not mixed.is_relative()
        assert relative.is_relative()

    def test_stretch_is_relative(self):
        absolute = Stretch(_pixels(), _pixels())
        mixed = Stretch(_percent(), _pixels())
        relative = Stretch(_percent(), _percent())

        assert not absolute.is_relative()
        assert not mixed.is_relative()
        assert relative.is_relative()

    def test_padding_is_relative(self):
        absolute = Padding(_pixels(), _pixels(), _pixels(), _pixels())
        # A single percentage among pixel values is not enough.
        mixed = Padding(_pixels(), _pixels(), _pixels(), _percent())
        relative = Padding(_percent(), _percent(), _percent(), _percent())

        assert not absolute.is_relative()
        assert not mixed.is_relative()
        assert relative.is_relative()

    def test_layout_is_relative(self):
        empty_layout = Layout()

        absolute_layout = Layout(
            origin=Point(_pixels(), _pixels()),
            extent=Stretch(_pixels(), _pixels()),
            padding=None
        )
        mixed_layout = Layout(
            origin=Point(_pixels(), _pixels()),
            extent=Stretch(_percent(), _percent()),
            padding=None
        )
        relative_layout = Layout(
            origin=Point(_percent(), _percent()),
            extent=Stretch(_percent(), _percent()),
            padding=None
        )

        assert empty_layout.is_relative()
        assert not absolute_layout.is_relative()
        assert not mixed_layout.is_relative()
        assert relative_layout.is_relative()


class TestSize:
    """Parsing of size strings through ``Size.from_string``."""

    @pytest.mark.parametrize('string, value, unit', [
        ('1px', 1.0, UnitEnum.PIXEL), ('2.3em', 2.3, UnitEnum.EM),
        ('12.34%', 12.34, UnitEnum.PERCENT), ('1.234c', 1.234, UnitEnum.CELL),
        ('10pt', 10.0, UnitEnum.PT), ('0', 0.0, UnitEnum.PIXEL)])
    def test_valid_size_from_string(self, string, value, unit):
        parsed = Size.from_string(string)

        assert parsed.value == value
        assert parsed.unit == unit

    @pytest.mark.parametrize('string', ['10', '11,1px', '12xx', '%', 'o1pt'])
    def test_invalid_size_from_string(self, string):
        with pytest.raises(CaptionReadSyntaxError) as exc_info:
            Size.from_string(string)

        message = exc_info.value.args[0]
        assert message.startswith(f"Invalid size: {string}.")
7 | """ 8 | 9 | def __init__(self, positioning=None): 10 | """ 11 | :param positioning: positioning information (row, column) 12 | :type positioning: tuple[int] 13 | """ 14 | self._positions = [positioning] 15 | self._break_required = False 16 | self._repositioning_required = False 17 | # Since the actual column is not applied when encountering a line break 18 | # this attribute is used to store it and determine by comparison if the 19 | # next positioning is actually a Tab Offset 20 | self._last_column = None 21 | 22 | def update_positioning(self, positioning): 23 | """Being notified of a position change, updates the internal state, 24 | to as to be able to tell if it was a trivial change (a simple line 25 | break) or not. 26 | 27 | :type positioning: tuple[int] 28 | :param positioning: a tuple (row, col) 29 | """ 30 | current = self._positions[-1] 31 | 32 | if not current: 33 | if positioning: 34 | # Set the positioning for the first time 35 | self._positions = [positioning] 36 | return 37 | 38 | row, col = current 39 | if self._break_required: 40 | col = self._last_column 41 | new_row, new_col = positioning 42 | is_tab_offset = new_row == row and col + 1 <= new_col <= col + 3 43 | # One line below will be treated as line break, not repositioning 44 | if new_row == row + 1: 45 | self._positions.append((new_row, col)) 46 | self._break_required = True 47 | self._last_column = new_col 48 | # Tab offsets after line breaks will be ignored to avoid repositioning 49 | elif self._break_required and is_tab_offset: 50 | return 51 | # force not to reposition on the same coordinates 52 | elif positioning == current: 53 | return 54 | else: 55 | # Reset the "current" position altogether. 
56 | self._positions = [positioning] 57 | # Tab offsets are not interpreted as repositioning, but adjustments 58 | # to the previous PAC command 59 | if not is_tab_offset: 60 | self._repositioning_required = True 61 | 62 | def get_current_position(self): 63 | """Returns the current usable position 64 | 65 | :rtype: tuple[int] 66 | 67 | :raise: CaptionReadSyntaxError 68 | """ 69 | if not any(self._positions): 70 | raise CaptionReadSyntaxError("No Preamble Address Code [PAC] was provided") 71 | else: 72 | return self._positions[0] 73 | 74 | def is_repositioning_required(self): 75 | """Determines whether the current positioning has changed non-trivially 76 | 77 | Trivial would be mean that a line break should suffice. 78 | :rtype: bool 79 | """ 80 | return self._repositioning_required 81 | 82 | def acknowledge_position_changed(self): 83 | """Acknowledge the position tracer that the position was changed""" 84 | self._repositioning_required = False 85 | 86 | def is_linebreak_required(self): 87 | """If the current position is simply one line below the previous. 
class DefaultProvidingPositionTracker(_PositioningTracker):
    """A _PositioningTracker that always has a position to offer.

    When the base tracker has nothing usable, the fallback is returned: (14, 0)
    unless another default was given at construction, and after that the last
    truthy positioning handed to ``update_positioning``.
    """

    default = (14, 0)

    def __init__(self, positioning=None, default=None):
        """
        :type positioning: tuple[int]
        :param positioning: a tuple of ints (row, column)

        :type default: tuple[int]
        :param default: a tuple of ints (row, column) to use as fallback
        """
        super().__init__(positioning)

        # A falsy default leaves the class-level fallback in effect
        if not default:
            return
        self.default = default

    def get_current_position(self):
        """Returns the tracked positioning when there is one, otherwise the
        current fallback value

        :rtype: tuple[int]
        """
        try:
            position = super().get_current_position()
        except CaptionReadSyntaxError:
            position = self.default
        return position

    def update_positioning(self, positioning):
        """Remembers a truthy positioning as the new fallback, then lets the
        super class process the update.

        :param positioning: a tuple of ints (row, col)
        :type positioning: tuple[int]
        """
        if positioning:
            self.default = positioning

        super().update_positioning(positioning)
class SRTReader(BaseReader):
    """Reader for SubRip (SRT) caption content."""

    def detect(self, content):
        """Tell whether ``content`` looks like SRT.

        The check is structural only: the first line must consist of digits
        (the cue number) and the second line must contain the ``-->`` timing
        separator.

        :type content: str
        :rtype: bool
        """
        lines = content.splitlines()
        # Fewer than two lines (including empty content) cannot hold a cue
        # header; answer False instead of failing on the index lookups.
        if len(lines) < 2:
            return False
        return lines[0].isdigit() and '-->' in lines[1]

    def read(self, content, lang='en-US'):
        """Parse SRT ``content`` into a caption set.

        :type content: str
        :param lang: language key under which the captions are stored
        :raise InvalidInputError: if ``content`` is not a ``str``
        :raise CaptionReadNoCaptions: if the resulting caption set is empty
        """
        if not isinstance(content, str):
            raise InvalidInputError('The content is not a unicode string.')

        lines = content.splitlines()
        start_line = 0
        captions = CaptionList()

        while start_line < len(lines):
            # Every cue starts with its number; anything else ends parsing
            if not lines[start_line].isdigit():
                break

            end_line = self._find_text_line(start_line, lines)

            timing = lines[start_line + 1].split('-->')
            start = self._srttomicro(timing[0].strip(' \r\n'))
            end = self._srttomicro(timing[1].strip(' \r\n'))

            nodes = []

            for line in lines[start_line + 2:end_line - 1]:
                # skip extra blank lines
                if not nodes or line != '':
                    nodes.append(CaptionNode.create_text(line))
                    nodes.append(CaptionNode.create_break())

            if len(nodes):
                # remove last line break from end of caption list
                nodes.pop()
                caption = Caption(start, end, nodes)
                captions.append(caption)

            start_line = end_line

        caption_set = CaptionSet({lang: captions})

        if caption_set.is_empty():
            raise CaptionReadNoCaptions("empty caption file")

        return caption_set

    def _srttomicro(self, stamp):
        """Convert an ``HH:MM:SS,mmm`` timestamp to microseconds.

        A timestamp without the ``,mmm`` part is treated as ``,000``.

        :type stamp: str
        :rtype: int
        """
        timesplit = stamp.split(':')
        if ',' not in timesplit[2]:
            timesplit[2] += ',000'
        secsplit = timesplit[2].split(',')
        microseconds = (int(timesplit[0]) * 3600000000
                        + int(timesplit[1]) * 60000000
                        + int(secsplit[0]) * 1000000
                        + int(secsplit[1]) * 1000)

        return microseconds

    def _find_text_line(self, start_line, lines):
        """Locate where the cue following the one at ``start_line`` begins.

        Scans forward from ``start_line``; once at least one blank line was
        seen, the first non-blank line ends the scan and its index is
        returned. If the input ends first, ``len(lines) + 1`` is returned.

        :type start_line: int
        :type lines: list[str]
        :rtype: int
        """
        end_line = start_line

        found = False
        while end_line < len(lines):
            if lines[end_line].strip() == "":
                found = True
            elif found is True:
                # Step back so the "+ 1" below lands on this non-blank line
                end_line -= 1
                break
            end_line += 1

        return end_line + 1
class SRTWriter(BaseWriter):
    """Writer producing SubRip (SRT) text from a caption set."""

    def write(self, caption_set):
        """Render every language of ``caption_set`` as SRT.

        The per-language outputs are joined by a ``MULTI-LANGUAGE SRT`` line.

        :rtype: str
        """
        # Operate on a deep copy of the input
        caption_set = deepcopy(caption_set)

        srt_captions = [
            self._recreate_lang(caption_set.get_captions(lang))
            for lang in caption_set.get_languages()
        ]

        return 'MULTI-LANGUAGE SRT\n'.join(srt_captions)

    def _recreate_lang(self, captions):
        """Render the captions of one language as SRT text.

        :param captions: the captions of a single language
        :rtype: str
        """
        # Merge caption's that are on the exact same timestamp otherwise some
        # players will play them in reversed order, libass specifically which is
        # used quite a lot, including VLC and MPV.

        merged_captions = [captions[0]] if captions else []

        for caption in captions[1:]:
            previous = merged_captions[-1]
            # Merge if the timestamp is the same as last caption
            if (caption.start, caption.end) == (previous.start, previous.end):
                merged_captions[-1] = Caption(
                    start=caption.start,
                    end=caption.end,
                    nodes=(previous.nodes
                           + [CaptionNode.create_break()]
                           + caption.nodes))
            else:
                # Different timestamp, end of merging, append new caption
                merged_captions.append(caption)

        cues = []
        for count, caption in enumerate(merged_captions, start=1):
            start = caption.format_start(msec_separator=',')
            end = caption.format_end(msec_separator=',')

            new_content = ''
            for node in caption.nodes:
                new_content = self._recreate_line(new_content, node)

            # Eliminate excessive line breaks
            new_content = new_content.strip()

            cues.append(
                f'{count}\n{start[:12]} --> {end[:12]}\n{new_content}\n\n')

        # remove unwanted newline at end of file
        return ''.join(cues)[:-1]

    def _recreate_line(self, srt, line):
        """Append the textual form of one caption node to ``srt``.

        Text nodes add their content followed by a space, break nodes add a
        newline, and any other node type leaves ``srt`` unchanged.

        :rtype: str
        """
        if line.type_ == CaptionNode.TEXT:
            return srt + f'{line.content} '
        elif line.type_ == CaptionNode.BREAK:
            return srt + '\n'
        else:
            return srt
11 | 12 | SAMI Reader / Writer :: `spec `__ 13 | ---------------------------------------------------------------------------------------- 14 | 15 | Microsoft Synchronized Accessible Media Interchange. Supports multiple 16 | languages. 17 | 18 | Supported Styling: - text-align - italics - font-size - font-family - 19 | color 20 | 21 | If the SAMI file is not valid XML (e.g. unclosed tags), the reader will still 22 | attempt to read it. 23 | 24 | DFXP/TTML Reader / Writer :: `spec `__ 25 | ------------------------------------------------------------------- 26 | 27 | The W3C standard. Supports multiple languages. 28 | 29 | Supported Styling: - text-align - italics - font-size - font-family - 30 | color 31 | 32 | SRT Reader / Writer :: `spec `__ 33 | ---------------------------------------------------------------------------------------- 34 | 35 | SubRip captions. If given multiple languages to write, the writer outputs them all 36 | joined together by a 'MULTI-LANGUAGE SRT' line. 37 | 38 | Supported Styling: - None 39 | 40 | The reader assumes the input language is English. To change: 41 | 42 | :: 43 | 44 | pycaps = SRTReader().read(srt_content, lang='fr') 45 | 46 | WebVTT Reader / Writer :: `spec `__ 47 | ----------------------------------------------------------------- 48 | 49 | **WebVTT** is a W3C standard for displaying timed text in HTML5. Its 50 | specification is currently (as of February 2015) in draft stage and 51 | therefore not all features are implemented by major players, the same 52 | being true for ``pycaption``. 53 | 54 | By default, the reader assumes the language is English and the writer 55 | returns the first language it finds in the caption set. 
You can specify 56 | a language using the ``lang`` parameter: 57 | 58 | :: 59 | 60 | pycaps = WebVTTReader().read(content, lang='fr') 61 | 62 | If you need to adjust all timestamps in a WebVTT file, you can use the 63 | ``time_shift_milliseconds`` parameter which moves the timestamps 64 | forward (positive integer) or backward (negative integer) by 65 | the specified amount: 66 | 67 | :: 68 | 69 | pycaps = WebVTTReader(time_shift_milliseconds=1154).read(content) 70 | 71 | Styling 72 | ^^^^^^^ 73 | 74 | Styling in WebVTT can be done via inline tags (e.g. ``<b>``, ``<i>`` etc.) or external 75 | CSS rules applied to text wrapped in class (``<c>``) or voice (``<v>``) tags. 76 | 77 | ``pycaption`` currently only keeps *voice tags* on conversion. 78 | 79 | Example: 80 | 81 | :: 82 | 83 | <v Fred>Hi, my name is Fred 84 | 85 | is converted to 86 | 87 | :: 88 | 89 | Fred: Hi, my name is Fred 90 | 91 | The following WebVTT supported tags are stripped off the cue text: 92 | 93 | - ``<c>``, ``<i>``, ``<b>``, ``<u>``, ``<ruby>``, ``<rt>``, ``<lang>`` and timestamp tags (``<00:00:20.000>``) 94 | 95 | Non-supported tags are left unchanged as a natural part of the cue text with no 96 | special meaning. 97 | 98 | Positioning 99 | ^^^^^^^^^^^ 100 | 101 | The WebVTT specs allow customizing the position of cues by configuring a 102 | number of cue settings. ``pycaption`` currently only *maintains positioning 103 | information on writing*, in which case it supports the following settings: 104 | 105 | - A WebVTT line position cue setting. 106 | - A WebVTT text position cue setting. 107 | - A WebVTT size cue setting. 108 | - A WebVTT alignment cue setting. 109 | 110 | ``pycaption`` **does not** support: 111 | 112 | - A WebVTT vertical text cue setting. 113 | - A WebVTT region cue setting. 114 | 115 | Refer to the `official WebVTT specification`_ for details about the cue 116 | settings. 117 | 118 | .. 
_official WebVTT specification: http://dev.w3.org/html5/webvtt/#webvtt-cue-settings 119 | 120 | SCC Reader :: `spec `__ 121 | ----------------------------------------------------------------------------------------------- 122 | 123 | Scenarist Closed Caption format. Assumes Channel 1 input. 124 | 125 | Supported Styling: - italics 126 | 127 | By default, the SCC Reader does not simulate roll-up captions. To enable 128 | roll-ups: 129 | 130 | :: 131 | 132 | pycaps = SCCReader().read(scc_content, simulate_roll_up=True) 133 | 134 | The reader also assumes the input language is English. To change: 135 | 136 | :: 137 | 138 | pycaps = SCCReader().read(scc_content, lang='fr') 139 | 140 | You can also specify an offset (measured in seconds) for the 141 | timestamps. For example, if the SCC file is 45 seconds ahead of the 142 | video: 143 | 144 | :: 145 | 146 | pycaps = SCCReader().read(scc_content, offset=45) 147 | 148 | The SCC Reader handles both dropframe and non-dropframe captions, and 149 | will auto-detect which format the captions are in. 150 | 151 | For debugging purposes, the SCC captions can be translated into a human-readable 152 | form as follows: 153 | :: 154 | 155 | translated_scc = translate_scc(scc_content, brackets="[]") 156 | 157 | Square brackets are used by default, but they can be replaced with other 158 | brackets or None. 159 | 160 | Transcript Writer 161 | ----------------- 162 | 163 | Text stripped of styling, arranged in sentences. 164 | 165 | Supported Styling: - None 166 | 167 | The transcript writer uses natural sentence boundary detection 168 | algorithms to create the transcript. 
class TestSinglePositioningDFXPWRiter:
    """Tests for the DFXP writer that renders every caption in one region.

    Soup lookups use the snake_case Beautiful Soup API (``find`` /
    ``find_all``) rather than the deprecated camelCase aliases.
    """

    def test_only_the_default_region_is_created(
            self, sample_dfxp_to_render_with_only_default_positioning_input):
        """Without a custom region, the layout holds exactly one region."""
        caption_set = DFXPReader().read(
            sample_dfxp_to_render_with_only_default_positioning_input
        )

        dfxp = SinglePositioningDFXPWriter().write(caption_set)
        layout = BeautifulSoup(dfxp, features='html.parser').find('layout')

        assert len(layout.find_all('region')) == 1

    def test_only_the_default_region_is_referenced(
            self, sample_dfxp_to_render_with_only_default_positioning_input):
        """Every ``region`` attribute points at the default region id."""
        caption_set = DFXPReader().read(
            sample_dfxp_to_render_with_only_default_positioning_input
        )

        dfxp = SinglePositioningDFXPWriter().write(caption_set)

        soup = BeautifulSoup(dfxp, features='html.parser')

        for elem in soup.find_all():
            if 'region' in elem.attrs:
                assert elem['region'] == DFXP_DEFAULT_REGION_ID

    def test_only_the_custom_region_is_created(
            self, sample_dfxp_to_render_with_only_default_positioning_input):
        """A custom layout yields one region carrying its alignment."""
        caption_set = DFXPReader().read(
            sample_dfxp_to_render_with_only_default_positioning_input
        )

        new_region = Layout(
            alignment=Alignment(
                HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP
            )
        )

        dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
        # Using a different parser, because this preserves letter case
        # The output file is ok, but when parsing it, the "regular" parses
        # loses letter case.
        layout = BeautifulSoup(dfxp, features='xml').find('layout')

        region = layout.find('region')
        text_align = region['tts:textAlign']
        display_align = region['tts:displayAlign']

        internal_alignment = _create_internal_alignment(
            text_align, display_align
        )

        assert len(layout.find_all('region')) == 1
        assert internal_alignment.horizontal == HorizontalAlignmentEnum.LEFT
        assert internal_alignment.vertical == VerticalAlignmentEnum.TOP

    def test_only_the_specified_custom_attributes_are_created_for_the_region(
            self, sample_dfxp_to_render_with_only_default_positioning_input
    ):
        """The custom region has a non-default id and three attributes."""
        caption_set = DFXPReader().read(
            sample_dfxp_to_render_with_only_default_positioning_input
        )

        new_region = Layout(
            alignment=Alignment(
                HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP
            )
        )

        dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)

        region = BeautifulSoup(dfxp, features='lxml').find('region')

        assert 'xml:id' in region.attrs
        assert region.attrs['xml:id'] != DFXP_DEFAULT_REGION_ID
        assert len(region.attrs) == 3

    def test_only_the_custom_region_is_referenced(
            self, sample_dfxp_to_render_with_only_default_positioning_input):
        """All ``region`` references use the id of the created region."""
        caption_set = DFXPReader().read(
            sample_dfxp_to_render_with_only_default_positioning_input
        )

        # it's easier to copy this than create a new one
        new_region = deepcopy(DFXP_DEFAULT_REGION)
        new_region.alignment.horizontal = HorizontalAlignmentEnum.LEFT
        new_region.alignment.vertical = VerticalAlignmentEnum.TOP

        dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)

        soup = BeautifulSoup(dfxp, features='html.parser')

        # get the region_id created, and see it's the one referenced
        created_region_id = soup.find('region')['xml:id']

        referenced_region_ids = set()

        for elem in soup.find_all():
            if 'region' in elem.attrs:
                referenced_region_ids.add(elem.attrs['region'])

        assert len(referenced_region_ids) == 1
        assert referenced_region_ids.pop() == created_region_id

    def test_styles_dont_contain_text_align_attribute(
            self, sample_dfxp_to_render_with_only_default_positioning_input):
        """After a write/read round trip no style has ``text-align``."""
        caption_set = DFXPReader().read(
            sample_dfxp_to_render_with_only_default_positioning_input
        )

        result = SinglePositioningDFXPWriter().write(caption_set)

        caption_set = DFXPReader().read(result)

        for _, style in caption_set.get_styles():
            assert 'text-align' not in style
class TestLegacyDFXPWriter:
    def test_default_style_is_written_to_output_file(
            self, sample_dfxp_with_templated_style):
        """The templated style name must occur exactly twice in the output."""
        reader = DFXPReader(read_invalid_positioning=True)
        dfxp_input = sample_dfxp_with_templated_style.format(
            style_name="foxy_the_squirrel")
        caption_set = reader.read(dfxp_input)

        result = LegacyDFXPWriter().write(caption_set)

        occurrences = result.count('foxy_the_squirrel')
        assert occurrences == 2
class SRTTestingMixIn:
    """
    Provide specialized test case capabilities for asserting on SRT content.
    """

    def _extract_srt_captions(self, content):
        # Compare line by line, ignoring surrounding whitespace on each line
        stripped_lines = []
        for raw_line in content.splitlines():
            stripped_lines.append(raw_line.strip())
        return tuple(stripped_lines)

    def assert_srt_equals(self, first, second):
        """
        Assert that two SRT contents are equal once every line has been
        stripped of leading and trailing whitespace.
        """
        first_items, second_items = (
            self._extract_srt_captions(first),
            self._extract_srt_captions(second),
        )

        assert first_items == second_items
class CaptionSetTestingMixIn:
    def assert_captionset_almost_equals(self, first, second,
                                        tolerance_microseconds):
        """
        Assert that two caption sets have equal text once whitespace runs are
        collapsed, and that paired start/end times differ by less than
        tolerance_microseconds.

        Both sets are read using the first language reported by ``first``.
        """
        language = list(first.get_languages())[0]
        captions_1 = first.get_captions(language)
        captions_2 = second.get_captions(language)

        def normalized_text(caption):
            # Collapse every whitespace run (newlines included) to one space
            return re.sub(r'\s+', ' ', caption.get_text())

        def too_far_apart(ts1, ts2):
            return not abs(ts1 - ts2) < tolerance_microseconds

        text_1 = [normalized_text(caption) for caption in captions_1]
        text_2 = [normalized_text(caption) for caption in captions_2]

        start_differences = []
        end_differences = []
        for caption_1, caption_2 in zip(captions_1, captions_2):
            if too_far_apart(caption_1.start, caption_2.start):
                start_differences.append((caption_1.start, caption_2.start))
            if too_far_apart(caption_1.end, caption_2.end):
                end_differences.append((caption_1.end, caption_2.end))

        assert text_1 == text_2
        assert start_differences == []
        assert end_differences == []
class SAMITestingMixIn:
    """
    Provide specialized test case capabilities for asserting on SAMI content.
    """

    def _extract_sami_captions(self, soup):
        # One (start attribute, stripped paragraph text) pair per sync element
        pairs = []
        for sync in soup.select('sync'):
            pairs.append((sync.attrs['start'], sync.p.text.strip()))
        return tuple(pairs)

    def assert_sami_captions_equal(self, first, second):
        """
        Assert that two SAMI contents hold the same sequence of captions,
        comparing the start attribute and stripped text of each sync element.
        """
        first_soup, second_soup = (
            BeautifulSoup(first, 'lxml'),
            BeautifulSoup(second, 'lxml'),
        )

        first_items = self._extract_sami_captions(first_soup)
        second_items = self._extract_sami_captions(second_soup)

        assert first_items == second_items
class MicroDVDTestingMixIn:
    """
    Provide specialized test case capabilities for asserting on MicroDVD content.
    """  # noqa

    def _extract_micro_dvd_captions(self, content):
        # Split into lines and drop the whitespace surrounding each of them
        lines = content.splitlines()
        return tuple([line.strip() for line in lines])

    def assert_microdvd_equals(self, first, second):
        """
        Assert that two MicroDVD contents are equal, line by line, ignoring
        leading and trailing whitespace on every line.
        """
        expected_lines = self._extract_micro_dvd_captions(first)
        actual_lines = self._extract_micro_dvd_captions(second)

        assert expected_lines == actual_lines
sample_dfxp_incorrect_time_format, 21 | sample_dfxp_missing_begin, sample_dfxp_missing_end_and_dur, 22 | sample_dfxp_with_frame_timing, sample_dfxp_empty_cue, 23 | sample_dfxp_empty_cue_output, sample_dfxp_default_styling_p_tags, 24 | sample_dfxp_invalid_positioning_value_template, 25 | sample_dfxp_multiple_captions_with_the_same_timing, 26 | sample_dfxp_with_ampersand_character, sample_dfxp_with_nested_spans, 27 | dfxp_style_region_align_conflict, dfxp_with_concurrent_captions, 28 | ) 29 | from tests.fixtures.microdvd import ( # noqa: F401 30 | sample_microdvd, sample_microdvd_2, 31 | sample_microdvd_invalid_format, missing_fps_sample_microdvd, 32 | sample_microdvd_empty, sample_microdvd_empty_cue_output, 33 | ) 34 | from tests.fixtures.sami import ( # noqa: F401 35 | sample_sami, sample_sami_with_style_tags, 36 | sample_sami_with_css_inline_style, sample_sami_with_css_id_style, 37 | sample_sami_empty, sample_sami_syntax_error, 38 | sample_sami_double_br, sample_sami_partial_margins, 39 | sample_sami_partial_margins_relativized, sample_sami_lang_margin, 40 | sample_sami_with_span, sample_sami_with_bad_span_align, 41 | sample_sami_with_bad_div_align, sample_sami_with_p_align, 42 | sample_sami_with_p_and_span_align, sample_sami_with_multiple_span_aligns, 43 | sample_sami_no_lang, sample_sami_with_lang, sample_sami_with_multi_lang, 44 | sample_sami_with_multiple_p, sample_sami_empty_cue_output, 45 | sample_sami_with_invalid_inline_style, 46 | sample_sami_including_hexadecimal_charref, 47 | sample_sami_including_decimal_charref, 48 | sample_sami_including_html5_entityref, sample_sami_with_unclosed_tag, 49 | sample_sami_with_inline_lang, sample_sami_from_dfxp_with_nested_spans, 50 | sample_sami_with_separate_multi_lang, sample_sami_missing_start 51 | ) 52 | from tests.fixtures.scc import ( # noqa: F401 53 | sample_scc_created_dfxp_with_wrongly_closing_spans, 54 | scc_that_generates_webvtt_with_proper_newlines, 55 | 
sample_scc_produces_captions_with_start_and_end_time_the_same, 56 | sample_scc_pop_on, sample_scc_multiple_positioning, sample_scc_with_italics, 57 | sample_scc_empty, sample_scc_roll_up_ru2, sample_scc_roll_up_ru3, 58 | sample_no_positioning_at_all_scc, sample_scc_with_line_too_long, 59 | sample_scc_no_explicit_end_to_last_caption, sample_scc_flashing_cue, 60 | sample_scc_eoc_first_command, sample_scc_with_extended_characters, 61 | sample_scc_with_ampersand_character, sample_scc_multiple_formats, 62 | sample_scc_duplicate_tab_offset, sample_scc_duplicate_special_characters, 63 | sample_scc_tab_offset, sample_scc_with_unknown_commands, 64 | sample_scc_special_and_extended_characters, sample_scc_mid_row_before_text_pop, 65 | sample_scc_mid_row_before_text_roll, sample_scc_mid_row_before_text_paint, 66 | sample_scc_mid_row_following_text_no_text_before_italics_off_pop, 67 | sample_scc_mid_row_following_text_no_text_before_italics_off_roll, 68 | sample_scc_mid_row_following_text_no_text_before_italics_off_paint, 69 | sample_scc_mid_row_following_text_no_text_before_italics_on_pop, 70 | sample_scc_mid_row_following_text_no_text_before_italics_on_roll, 71 | sample_scc_mid_row_following_text_no_text_before_italics_on_paint, 72 | sample_scc_mid_row_with_space_before_pop, 73 | sample_scc_mid_row_with_space_before_roll, 74 | sample_scc_mid_row_with_space_before_paint, 75 | sample_scc_with_spaces_at_eol_pop, 76 | sample_scc_with_spaces_at_eol_roll, 77 | sample_scc_with_spaces_at_eol_paint, 78 | ) 79 | from tests.fixtures.srt import ( # noqa: F401 80 | sample_srt, sample_srt_ascii, sample_srt_numeric, sample_srt_empty, 81 | sample_srt_blank_lines, sample_srt_trailing_blanks, 82 | samples_srt_same_time, sample_srt_empty_cue_output, 83 | sample_srt_timestamps_without_microseconds, 84 | ) 85 | from tests.fixtures.translated_scc import ( # noqa: F401 86 | sample_translated_scc_custom_brackets, sample_translated_scc_success, 87 | sample_translated_scc_commands_not_found, 
sample_translated_scc_no_brackets, 88 | sample_translated_scc_special_and_extended_characters 89 | ) 90 | from tests.fixtures.webvtt import ( # noqa: F401 91 | sample_webvtt, sample_webvtt_from_dfxp, sample_webvtt_from_sami, 92 | sample_webvtt_from_sami_with_style, sample_webvtt_from_sami_with_id_style, 93 | sample_webvtt_from_dfxp_with_style, 94 | sample_webvtt_keeps_positioning, 95 | sample_webvtt_from_dfxp_with_positioning_and_style, 96 | sample_webvtt_from_srt, sample_webvtt_from_webvtt, 97 | sample_webvtt_2, sample_webvtt_empty, sample_webvtt_double_br, 98 | sample_webvtt_output_long_cue, webvtt_from_dfxp_with_conflicting_align, 99 | sample_webvtt_with_cue_settings, 100 | sample_webvtt_from_scc_properly_writes_newlines_output, 101 | sample_webvtt_last_cue_zero_start, sample_webvtt_empty_cue, 102 | sample_webvtt_multi_lang_en, sample_webvtt_multi_lang_de, 103 | sample_webvtt_empty_cue_output, sample_webvtt_timestamps 104 | ) 105 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 
16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build 
finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pycaption.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pycaption.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pycaption" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pycaption" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | --------- 3 | 2.2.19 4 | ^^^^^^ 5 | - Remove support for python 3.8 and 3.9. 6 | 7 | 2.2.18 8 | ^^^^^^ 9 | - Update changelog and new release tag. 10 | 11 | 2.2.17 12 | ^^^^^^ 13 | - Update nltk from 3.8.0 to 3.9.1. 14 | 15 | 2.2.16 16 | ^^^^^^ 17 | - Update copyright details. 18 | 19 | 2.2.15 20 | ^^^^^^ 21 | - Always skip doubled special characters, not just in case the cue starters are doubled. 22 | 23 | 2.2.14 24 | ^^^^^^ 25 | - Fix an issue with WebVTT writer text positioning on break inside a cue. 26 | - Prevent creating a repositioning command to the same coordinates. 27 | 28 | 2.2.13 29 | ^^^^^^ 30 | - Mid-row codes only add spaces only if there isn't one before. 31 | - Mid-row codes add spaces only if they affect the text in the same row (not adding if it follows break or PACS). 
32 | - Remove spaces at the end of the lines. 33 | - Close italics on receiving another style setting command. 34 | - Throw a CaptionReadNoCaptions error in case an empty input file is provided. 35 | - Ignore repositioning commands which are not followed by any text before breaks. 36 | - Mid-row codes will not add the space if it is in front of punctuation. 37 | - Fix a bug with background codes when the InstructionNodeCreator collection is empty. 38 | - Fix a bug with the WebVTT writer adding double line breaks. 39 | 40 | 2.2.12 41 | ^^^^^^ 42 | - Pinned nltk to 3.8.0 43 | 44 | 2.2.11 45 | ^^^^^^ 46 | - A space should not be placed before a mid row code if it follows a PAC command or a Tab Offset 47 | - The backspace command should be treated like other commands and duplicates should be skipped if PAC commands are duplicated 48 | - Prevent webvtt writer from creating a new cue in case of line break 49 | - In case of style setting PAC which also breaks the line, we add the break first, then the style tag 50 | 51 | 2.2.10 52 | ^^^^^^ 53 | - Yanked. 54 | 55 | 2.2.9 56 | ^^^^^ 57 | - Yanked. 58 | 59 | 2.2.8 60 | ^^^^^ 61 | - Honor backspaces on captions in scc files 62 | - Mid-row codes which are preceded by a PAC command don't add spaces 63 | - Mid row codes which don't follow after a PAC and don't have a style reset command before will add a space to the end of the previous text node 64 | - Mid row codes which don't follow after a PAC and have a style reset command before will add a space to the beginning of the next text node 65 | - Background color codes to delete the space in front 66 | 67 | 2.2.7 68 | ^^^^^ 69 | - The cursor moves automatically one column to the right after each character or Mid-Row Code received. 70 | 71 | 2.2.6 72 | ^^^^^ 73 | - Pass the caption cue time with all error messages. 74 | 75 | 2.2.5 76 | ^^^^^ 77 | - Yanked. 78 | 79 | 2.2.4 80 | ^^^^^ 81 | - Skip duplicated extended characters. 
82 | 83 | 2.2.3 84 | ^^^^^ 85 | - Add new substitute character to ignore before extended character in SCC input files 86 | 87 | 2.2.2 88 | ^^^^^ 89 | - Remove support for Python 3.6 & 3.7 90 | - Restrict SCC source files to 31 characters per line (32 will throw an exception) 91 | - Bump readthedocs-sphinx-search from 0.3.1 to 0.3.2 92 | - Change Apache copyright licensing (ending) copyright year 93 | 94 | 2.2.1 95 | ^^^^^ 96 | - Ignore the substitute character that comes before the extended character in SCC files. 97 | 98 | 2.2.0 99 | ^^^^^ 100 | - Added support for Python 3.11 101 | - Added support for BeautifulSoup 4.12.2 102 | - Remove support for BeautifulSoup < 4.12.1 103 | - DFXP captions now end consistently with a newline 104 | 105 | 2.1.1 106 | ^^^^^ 107 | - Added nltk as transcript dependency 108 | 109 | 2.1.0 110 | ^^^^^ 111 | - Remove upper limit for dependency versions to solve vulnerabilities 112 | 113 | 2.0.9 114 | ^^^^^ 115 | - Changed DFXPReader default horizontal alignment from 'center' to 'start' 116 | - Updated WebVTT horizontal alignment from 'middle' to 'center' 117 | 118 | 2.0.8 119 | ^^^^^ 120 | - Added support for Python 3.10 121 | - Added default start align to WebVTTWriter 122 | 123 | 2.0.7 124 | ^^^^^ 125 | - Implemented skipping duplicate special characters for SCCReader 126 | - Added support for BeautifulSoup 4.10 and lxml 4.8 127 | - Added pytest and pytest-lazy-fixture as development dependencies 128 | 129 | 2.0.6 130 | ^^^^^ 131 | - Updated Size.from_string() to accept 0 size without measuring unit 132 | - Replaced ValueError with CaptionReadSyntaxError for invalid sizes passed to Size.from_string() 133 | - Updated DFXPReader timestamp validation according to TTML time expression specs 134 | - Updated flashing cues validation for SCCReader to raise a CaptionReadTimingError 135 | - Fixed SCC translator not recognising special and extended characters 136 | - Raise CaptionReadTimingError for missing 'start' on SAMIReader 137 | 138 | 
2.0.5 139 | ^^^^^ 140 | - Updated DFXPReader to ignore paragraphs that only contain spaces, tabs or new lines 141 | - Added CaptionReadTimingError for invalid SCC timestamps 142 | - Added CaptionReadSyntaxError for invalid colors in SAMIReader 143 | - Raise CaptionReadTimingError when missing 'begin' or 'end' and 'dur' time on DFXPReader 144 | 145 | 2.0.4 146 | ^^^^^ 147 | - Updated the counting of frames to happen after processing SCC commands 148 | - Made all SCC-sourced captions which have a difference of up to 5 frames between them more fluid 149 | 150 | 2.0.3 151 | ^^^^^ 152 | - Implemented time shift for WebVTTReader 153 | - Removed WebVTTWriter 'start' position alignment 154 | - Updated the SCC Pop-On caption timing logic 155 | - Fixed the correction of end times for multiple last captions 156 | - Fixed bug when flushing implicit buffers and old key was None 157 | 158 | 2.0.2 159 | ^^^^^ 160 | - Implemented Tab Offset commands for SCCReader 161 | - Implemented caption safe area limits (80% horizontally and 90% vertically) 162 | - Implemented SCC translator 163 | 164 | 2.0.1 165 | ^^^^^ 166 | - Added newline between merged SRT captions with overlapping timestamps 167 | - Updated tests for SAMI format 168 | - Updated tests for SRT format 169 | - Added zero padding to 1-digit hours outputted by WebVTTWriter 170 | 171 | 2.0.0 172 | ^^^^^ 173 | - Dropped support for Python 3.5 174 | - Updated tests to run using pytest 175 | - Added pre-commit config 176 | 177 | 1.0.7 178 | ^^^^^ 179 | - Fixed issue with SCC paint-on buffer not being cleared after storing 180 | - Removed null DFXPReader captions from the resulting caption list 181 | - Updated SCCReader double command handling to include the positioning and tab offset case 182 | 183 | 1.0.6 184 | ^^^^^ 185 | - Added MicroDVD format 186 | - Fix for missing end times when reading multiple SAMI paragraphs inside a SYNC 187 | - Fix for wrong order when multiple SRT captions have the same timestamp 188 | - Fix for DFXP 
timestamps adding leading zeros to 2-digit hours 189 | - Added support for BeautifulSoup 4.9 190 | - Added tests for SCC to DFXP conversion when the source contains ampersands 191 | - Added support for Python 3.9 192 | 193 | 1.0.5 194 | ^^^^^ 195 | - Added language parameter to WebVTTWriter 196 | - Fix for TranscriptWriter merging words at caption boundary 197 | - Updated documentation with positioning information 198 | - Updated DFXP reader to fallback to the document's language if no language is present on individual
199 | - Introduced PYCAPTION_DEFAULT_LANG environment variable and set it to default to 'und' 200 | - Fixed DFXPReader timestamp validation to accept frames and frames conversion to microseconds 201 | 202 | 1.0.4 203 | ^^^^^ 204 | - Included tests in PyPI tarball 205 | - Ignore WebVTT empty cues instead of raising an exception 206 | - Updated BeautifulSoup version to >=4.8.1,<4.9 and fixed failing tests 207 | - Handled index error when sending bad timestamp for DFXP format 208 | 209 | 1.0.3 210 | ^^^^^ 211 | - Fixed issue with SCC reader including both special characters and their potential substitute 212 | - Modified enum34 dependency to versions under Python 3.4 213 | - Removed Python 3.4 and added 3.6, 3.7 and 3.8 to Travis tests 214 | 215 | 1.0.2 216 | ^^^^^ 217 | - Fixed typos in SCC positioning codes 218 | - Added missing SCC positioning codes to positioning map 219 | 220 | 1.0.0 221 | ^^^^^ 222 | - Added Python 3 support 223 | 224 | 0.5.x 225 | ^^^^^ 226 | - Added positioning support 227 | - Created documentation 228 | -------------------------------------------------------------------------------- /tests/test_webvtt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pycaption import ( 4 | WebVTTReader, WebVTTWriter, SAMIReader, DFXPReader, 5 | CaptionReadNoCaptions, CaptionReadError, CaptionReadSyntaxError, 6 | ) 7 | from tests.mixins import ReaderTestingMixIn 8 | 9 | 10 | class TestWebVTTReader(ReaderTestingMixIn): 11 | def setup_method(self): 12 | self.reader = WebVTTReader() 13 | 14 | def test_positive_answer_for_detection(self, sample_webvtt): 15 | super().assert_positive_answer_for_detection(sample_webvtt) 16 | 17 | def test_negative_answer_for_detection_dfxp(self, sample_dfxp): 18 | super().assert_negative_answer_for_detection(sample_dfxp) 19 | 20 | def test_negative_answer_for_detection_microdvd(self, sample_microdvd): 21 | super().assert_negative_answer_for_detection(sample_microdvd) 
22 | 23 | def test_negative_answer_for_detection_sami(self, sample_sami): 24 | super().assert_negative_answer_for_detection(sample_sami) 25 | 26 | def test_negative_answer_for_detection_scc_pop_on(self, sample_scc_pop_on): 27 | super().assert_negative_answer_for_detection(sample_scc_pop_on) 28 | 29 | def test_negative_answer_for_detection_srt(self, sample_srt): 30 | super().assert_negative_answer_for_detection(sample_srt) 31 | 32 | def test_caption_length(self, sample_webvtt_2): 33 | captions = self.reader.read(sample_webvtt_2) 34 | 35 | assert len(captions.get_captions('en-US')) == 7 36 | 37 | def test_read_supports_multiple_languages(self, sample_webvtt): 38 | captions = self.reader.read(sample_webvtt, lang='es') 39 | 40 | assert captions.get_captions('es') is not None 41 | 42 | def test_proper_timestamps(self, sample_webvtt): 43 | captions = self.reader.read(sample_webvtt) 44 | cue = captions.get_captions('en-US')[2] 45 | 46 | assert cue.start == 17000000 47 | assert cue.end == 18752000 48 | 49 | def test_forward_time_shift(self, sample_webvtt): 50 | captions = WebVTTReader(time_shift_milliseconds=15).read(sample_webvtt) 51 | cue = captions.get_captions('en-US')[2] 52 | 53 | assert cue.start == 17015000 54 | assert cue.end == 18767000 55 | 56 | def test_backward_time_shift(self, sample_webvtt): 57 | captions = WebVTTReader(time_shift_milliseconds=-15).read(sample_webvtt) 58 | cue = captions.get_captions('en-US')[2] 59 | 60 | assert cue.start == 16985000 61 | assert cue.end == 18737000 62 | 63 | def test_webvtt_cue_components_removed_from_text(self): 64 | result = self.reader._remove_styles( 65 | "Wikipedia is a great adventure. It may have " 66 | "its shortcomings, but it is the largest collective " 67 | "knowledge construction endevour base text " 68 | "annotation Yes, indeed!" 69 | ) 70 | expected = ( 71 | "Wikipedia is a great adventure. 
It may have " 72 | "its shortcomings, but it is the largest collective " 73 | "knowledge construction endevour base text annotation" 74 | " Audry: Yes, indeed!" 75 | ) 76 | assert result == expected 77 | 78 | def test_empty_file(self, sample_webvtt_empty): 79 | with pytest.raises(CaptionReadNoCaptions): 80 | WebVTTReader().read(sample_webvtt_empty) 81 | 82 | def test_not_ignoring_timing_errors(self): 83 | # todo: same assert w/ different arguments -> this can be parametrized; 84 | with pytest.raises(CaptionReadError): 85 | WebVTTReader(ignore_timing_errors=False).read( 86 | "\n" "00:00:20.000 --> 00:00:10.000\n" "foo bar baz") 87 | 88 | with pytest.raises(CaptionReadError): 89 | WebVTTReader(ignore_timing_errors=False).read( 90 | "00:00:20.000 --> 00:00:10.000\n" 91 | "Start time is greater than end time.\n" 92 | ) 93 | 94 | with pytest.raises(CaptionReadError): 95 | WebVTTReader(ignore_timing_errors=False).read( 96 | "00:00:20.000 --> 00:00:30.000\n" 97 | "Start times should be consecutive.\n" 98 | "\n" 99 | "00:00:10.000 --> 00:00:20.000\n" 100 | "This cue starts before the previous one.\n" 101 | ) 102 | 103 | def test_ignoring_timing_errors(self): 104 | # Even if timing errors are ignored, this has to raise an exception 105 | with pytest.raises(CaptionReadSyntaxError): 106 | WebVTTReader().read( 107 | "\nNOTE invalid cue stamp\n00:00:20.000 --> \nfoo bar baz\n") 108 | 109 | # And this too 110 | with pytest.raises(CaptionReadSyntaxError): 111 | WebVTTReader().read("\n00:00:20,000 --> 00:00:22,000\n" 112 | "Note the comma instead of point.\n") 113 | 114 | # todo: at this point it can be split into 2 separate tests 115 | try: 116 | WebVTTReader().read( 117 | "\n" 118 | "00:00:20.000 --> 00:00:10.000\n" 119 | "Start time is greater than end time.\n" 120 | ) 121 | except CaptionReadError: 122 | pytest.fail("Shouldn't raise CaptionReadError") 123 | 124 | try: 125 | WebVTTReader().read( 126 | "\n" 127 | "00:00:20.000 --> 00:00:30.000\n" 128 | "Start times should be 
consecutive.\n" 129 | "\n" 130 | "00:00:10.000 --> 00:00:20.000\n" 131 | "This cue starts before the previous one.\n" 132 | ) 133 | except CaptionReadError: 134 | pytest.fail("Shouldn't raise CaptionReadError") 135 | 136 | def test_invalid_files(self): 137 | with pytest.raises(CaptionReadError): 138 | WebVTTReader(ignore_timing_errors=False).read( 139 | "00:00:20.000 --> 00:00:10.000\n" 140 | "Start time is greater than end time.") 141 | 142 | with pytest.raises(CaptionReadError): 143 | WebVTTReader(ignore_timing_errors=False).read( 144 | "00:00:20.000 --> 00:00:30.000\n" 145 | "Start times should be consecutive.\n" 146 | "\n" 147 | "00:00:10.000 --> 00:00:20.000\n" 148 | "This cue starts before the previous one.\n" 149 | ) 150 | 151 | def test_zero_start(self, sample_webvtt_last_cue_zero_start): 152 | captions = self.reader.read(sample_webvtt_last_cue_zero_start) 153 | cue = captions.get_captions('en-US')[0] 154 | 155 | assert cue.start == 0 156 | 157 | def test_webvtt_empty_cue(self, sample_webvtt_empty_cue): 158 | assert 1 == len(self.reader.read( 159 | sample_webvtt_empty_cue).get_captions('en-US')) 160 | 161 | 162 | class TestWebVTTWriter: 163 | def setup_method(self): 164 | self.writer = WebVTTWriter() 165 | 166 | def test_double_br(self, sample_webvtt_double_br, sample_sami_double_br): 167 | caption_set = SAMIReader().read(sample_sami_double_br) 168 | results = WebVTTWriter().write(caption_set) 169 | 170 | assert sample_webvtt_double_br == results 171 | 172 | def test_break_node_positioning_is_ignored( 173 | self, webvtt_from_dfxp_with_conflicting_align, 174 | dfxp_style_region_align_conflict): 175 | caption_set = DFXPReader().read(dfxp_style_region_align_conflict) 176 | results = WebVTTWriter().write(caption_set) 177 | 178 | assert webvtt_from_dfxp_with_conflicting_align == results 179 | 180 | def test_lang_option(self, sample_webvtt_multi_lang_en, 181 | sample_webvtt_multi_lang_de, 182 | sample_sami_with_multi_lang): 183 | caption_set = 
SAMIReader().read(sample_sami_with_multi_lang) 184 | results = WebVTTWriter().write(caption_set, 'de-DE') 185 | 186 | assert sample_webvtt_multi_lang_de == results 187 | results = WebVTTWriter().write(caption_set, 'en-US') 188 | assert sample_webvtt_multi_lang_en == results 189 | -------------------------------------------------------------------------------- /tests/test_sami.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import pytest 4 | 5 | from pycaption import SAMIReader, CaptionReadNoCaptions, CaptionReadSyntaxError 6 | from pycaption.exceptions import CaptionReadTimingError 7 | from pycaption.geometry import HorizontalAlignmentEnum, Size, UnitEnum # noqa 8 | from tests.mixins import ReaderTestingMixIn 9 | 10 | 11 | class TestSAMIReader(ReaderTestingMixIn): 12 | def setup_method(self): 13 | self.reader = SAMIReader() 14 | 15 | def test_positive_answer_for_detection(self, sample_sami): 16 | super().assert_positive_answer_for_detection(sample_sami) 17 | 18 | def test_negative_answer_for_detection_dfxp(self, sample_dfxp): 19 | super().assert_negative_answer_for_detection(sample_dfxp) 20 | 21 | def test_negative_answer_for_detection_microdvd(self, sample_microdvd): 22 | super().assert_negative_answer_for_detection(sample_microdvd) 23 | 24 | def test_negative_answer_for_detection_scc_pop_on(self, sample_scc_pop_on): 25 | super().assert_negative_answer_for_detection(sample_scc_pop_on) 26 | 27 | def test_negative_answer_for_detection_srt(self, sample_srt): 28 | super().assert_negative_answer_for_detection(sample_srt) 29 | 30 | def test_negative_answer_for_detection_webvtt(self, sample_webvtt): 31 | super().assert_negative_answer_for_detection(sample_webvtt) 32 | 33 | def test_caption_length(self, sample_sami): 34 | caption_set = self.reader.read(sample_sami) 35 | 36 | assert 7 == len(caption_set.get_captions("en-US")) 37 | 38 | def test_proper_timestamps(self, sample_sami): 39 | caption_set 
= self.reader.read(sample_sami) 40 | paragraph = caption_set.get_captions("en-US")[2] 41 | 42 | assert 17000000 == paragraph.start 43 | assert 18752000 == paragraph.end 44 | 45 | def test_missing_start(self, sample_sami_missing_start): 46 | with pytest.raises(CaptionReadTimingError) as exc_info: 47 | self.reader.read(sample_sami_missing_start) 48 | 49 | assert exc_info.value.args[0].startswith( 50 | "Missing start time on the following line: ") 51 | 52 | def test_6digit_color_code_from_6digit_input(self, sample_sami): 53 | caption_set = self.reader.read(sample_sami) 54 | p_style = caption_set.get_style("p") 55 | 56 | assert "#ffeedd" == p_style['color'] 57 | 58 | def test_6digit_color_code_from_3digit_input(self, sample_sami): 59 | sample_sami = deepcopy(sample_sami) 60 | caption_set = self.reader.read(sample_sami.replace("#ffeedd", "#fed")) 61 | p_style = caption_set.get_style("p") 62 | 63 | assert "#ffeedd" == p_style['color'] 64 | 65 | def test_invalid_color_code(self, sample_sami): 66 | with pytest.raises(CaptionReadSyntaxError) as exc_info: 67 | self.reader.read(sample_sami.replace("#ffeedd", "ffffff")) 68 | assert exc_info.value.args[0] == \ 69 | "Invalid color value: ffffff. Check for missing # before hex " \ 70 | "values or misspelled color values." 71 | 72 | def test_empty_file(self, sample_sami_empty): 73 | with pytest.raises(CaptionReadNoCaptions): 74 | self.reader.read(sample_sami_empty) 75 | 76 | def test_invalid_markup_is_properly_handled(self, sample_sami_syntax_error): 77 | caption_set = self.reader.read(sample_sami_syntax_error) 78 | 79 | assert 2 == len(caption_set.get_captions("en-US")) 80 | 81 | def test_partial_margins(self, sample_sami_partial_margins): 82 | caption_set = self.reader.read(sample_sami_partial_margins) 83 | # Ensure that undefined margins are converted to explicitly nil padding 84 | # (i.e. 
"0%") 85 | 86 | assert caption_set.layout_info.padding.to_xml_attribute() == \ 87 | '0% 29pt 0% 29pt' 88 | 89 | def test_sami_with_bad_span_align(self, sample_sami_with_bad_span_align): 90 | caption_set = self.reader.read(sample_sami_with_bad_span_align) 91 | caption = caption_set.get_captions('en-US')[0] 92 | 93 | assert caption.layout_info.alignment.horizontal == \ 94 | HorizontalAlignmentEnum.RIGHT 95 | 96 | def test_sami_with_bad_div_align(self, sample_sami_with_bad_div_align): 97 | caption_set = self.reader.read(sample_sami_with_bad_div_align) 98 | caption = caption_set.get_captions('en-US')[0] 99 | 100 | assert caption.layout_info.alignment.horizontal == \ 101 | HorizontalAlignmentEnum.RIGHT 102 | 103 | def test_sami_with_p_align(self, sample_sami_with_p_align): 104 | caption_set = self.reader.read(sample_sami_with_p_align) 105 | caption = caption_set.get_captions('en-US')[0] 106 | 107 | assert caption.layout_info.alignment.horizontal == \ 108 | HorizontalAlignmentEnum.RIGHT 109 | 110 | def test_sami_with_p_and_span_align(self, 111 | sample_sami_with_p_and_span_align): 112 | """ align DOES NOT override

align if it is specified inline. 113 | """ 114 | caption_set = self.reader.read(sample_sami_with_p_and_span_align) 115 | caption = caption_set.get_captions('en-US')[0] 116 | 117 | assert caption.layout_info.alignment.horizontal == \ 118 | HorizontalAlignmentEnum.RIGHT 119 | 120 | def test_sami_with_invalid_inline_style( 121 | self, sample_sami_with_invalid_inline_style): 122 | caption_set = self.reader.read(sample_sami_with_invalid_inline_style) 123 | caption = caption_set.get_captions("en-US")[0] 124 | 125 | assert caption.layout_info.alignment is None 126 | 127 | def test_sami_including_hexadecimal_charref( 128 | self, sample_sami_including_hexadecimal_charref): 129 | caption_set = self.reader.read( 130 | sample_sami_including_hexadecimal_charref) 131 | paragraph = caption_set.get_captions("en-US")[0] 132 | 133 | assert '> >' == paragraph.get_text() 134 | 135 | def test_sami_including_decimal_charref( 136 | self, sample_sami_including_decimal_charref): 137 | caption_set = self.reader.read(sample_sami_including_decimal_charref) 138 | paragraph = caption_set.get_captions("en-US")[0] 139 | 140 | assert '> >' == paragraph.get_text() 141 | 142 | def test_sami_including_html5_entityref( 143 | self, sample_sami_including_html5_entityref): 144 | caption_set = self.reader.read(sample_sami_including_html5_entityref) 145 | paragraph = caption_set.get_captions("en-US")[0] 146 | 147 | assert '&starf_&starf' == paragraph.get_text() 148 | 149 | def test_html_file(self): 150 | with pytest.raises(CaptionReadSyntaxError) as exc_info: 151 | self.reader.read("") 152 | assert exc_info.value.args[0] == 'SAMI File seems to be an HTML file.' 
153 | 154 | def test_no_cc_available(self): 155 | no_cc = 'no closed captioning available' 156 | with pytest.raises(CaptionReadSyntaxError) as exc_info: 157 | self.reader.read(f"{no_cc}") 158 | assert exc_info.value.args[0] == f'SAMI File contains "{no_cc}"' 159 | 160 | def test_sami_with_unclosed_tag(self, sample_sami_with_unclosed_tag): 161 | caption_set = self.reader.read(sample_sami_with_unclosed_tag) 162 | paragraph = caption_set.get_captions("en-US")[0] 163 | 164 | assert '.' == paragraph.get_text() 165 | 166 | def test_sami_with_inline_lang(self, sample_sami_with_inline_lang): 167 | caption_set = self.reader.read(sample_sami_with_inline_lang) 168 | paragraph = caption_set.get_captions("en")[0] 169 | 170 | assert 'Inlined.' == paragraph.get_text() 171 | 172 | def test_proper_with_timestamps_with_multiple_paragraph( 173 | self, sample_sami_with_multiple_p): 174 | captions = self.reader.read(sample_sami_with_multiple_p) 175 | paragraph_1 = captions.get_captions("en-US")[0] 176 | paragraph_2 = captions.get_captions("en-US")[1] 177 | 178 | assert paragraph_1.start == paragraph_2.start 179 | assert paragraph_1.end == paragraph_2.end 180 | -------------------------------------------------------------------------------- /tests/fixtures/webvtt.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(scope="session") 5 | def sample_webvtt(): 6 | return """WEBVTT 7 | 8 | 00:09.209 --> 00:12.312 9 | ( clock ticking ) 10 | 11 | 00:14.848 --> 00:17.000 12 | MAN: 13 | When we think 14 | ♪ ...say bow, wow, ♪ 15 | 16 | 00:17.000 --> 00:18.752 17 | we have this vision of Einstein 18 | 19 | 00:18.752 --> 00:20.887 20 | as an old, wrinkly man 21 | with white hair. 22 | 23 | 00:20.887 --> 00:26.760 24 | MAN 2: 25 | E equals m c-squared is 26 | not about an old Einstein. 27 | 28 | 00:26.760 --> 00:32.200 29 | MAN 2: 30 | It's all about an eternal Einstein. 
31 | 32 | 00:32.200 --> 00:36.200 33 | 34 | """ 35 | 36 | 37 | @pytest.fixture(scope="session") 38 | def sample_webvtt_from_dfxp(): 39 | return """WEBVTT 40 | 41 | 00:09.209 --> 00:12.312 align:start 42 | ( clock ticking ) 43 | 44 | 00:14.848 --> 00:17.000 align:start 45 | MAN: 46 | When we think 47 | ♪ ...say bow, wow, ♪ 48 | 49 | 00:17.000 --> 00:18.752 align:right 50 | we have this vision of Einstein 51 | 52 | 00:18.752 --> 00:20.887 align:start 53 |   54 | as an old, wrinkly man 55 | with white hair. 56 | 57 | 00:20.887 --> 00:26.760 align:start 58 | MAN 2: 59 | E equals m c-squared is 60 | not about an old Einstein. 61 | 62 | 00:26.760 --> 00:32.200 align:start 63 | MAN 2: 64 | It's all about an eternal Einstein. 65 | 66 | 00:32.200 --> 00:36.200 align:start 67 | <LAUGHING & WHOOPS!> 68 | """ 69 | 70 | 71 | @pytest.fixture(scope="session") 72 | def sample_webvtt_from_sami(): 73 | return """WEBVTT 74 | 75 | 00:09.209 --> 00:12.312 76 | ( clock ticking ) 77 | 78 | 00:14.848 --> 00:17.000 79 | MAN: 80 | When we think 81 | ♪ ...say bow, wow, ♪ 82 | 83 | 00:17.000 --> 00:18.752 align:right 84 | we have this vision of Einstein 85 | 86 | 00:18.752 --> 00:20.887 87 |   88 | as an old, wrinkly man 89 | with white hair. 90 | 91 | 00:20.887 --> 00:26.760 92 | MAN 2: 93 | E equals m c-squared is 94 | not about an old Einstein. 95 | 96 | 00:26.760 --> 00:32.200 97 | MAN 2: 98 | It's all about an eternal Einstein. 99 | 100 | 00:32.200 --> 00:36.200 101 | <LAUGHING & WHOOPS!> 102 | """ 103 | 104 | 105 | @pytest.fixture(scope="session") 106 | def sample_webvtt_from_sami_with_style(): 107 | return """WEBVTT 108 | 109 | 00:09.209 --> 00:12.312 110 | I do not want to go home. 111 | I don't like it there. 112 | """ 113 | 114 | 115 | @pytest.fixture(scope="session") 116 | def sample_webvtt_from_sami_with_id_style(): 117 | return """WEBVTT 118 | 119 | 00:09.209 --> 00:12.312 120 | This is in italics. 121 | 122 | 00:14.848 --> 00:17.000 123 | This is underlined. 
124 | 125 | 00:17.000 --> 00:18.752 126 | This is bold. 127 | 128 | 00:20.887 --> 00:26.760 129 | This is everything together. 130 | """ 131 | 132 | 133 | @pytest.fixture(scope="session") 134 | def sample_webvtt_from_dfxp_with_style(): 135 | return """WEBVTT 136 | 137 | 00:09.209 --> 00:12.312 138 | This is italic, bold, underline, everything together in one tag, and nested. 139 | """ 140 | 141 | 142 | @pytest.fixture(scope="session") 143 | def sample_webvtt_keeps_positioning(): 144 | return """WEBVTT 145 | 146 | 00:01.000 --> 00:03.000 align:start position:25% line:25% size:50% 147 | You might not remember us. We are a typical transparent region with centered text that has an outline. 148 | 149 | 00:03.500 --> 00:05.000 align:right position:25% line:25% size:50% 150 | had personality. 151 | 152 | 00:05.500 --> 00:07.000 align:left position:50% line:50% size:25% 153 | Hello there, children! Have you seen any visitors? 154 | 155 | 00:07.500 --> 00:09.000 align:right position:25% line:75% size:25% 156 | This is 157 | the last cue 158 | """ 159 | 160 | 161 | @pytest.fixture(scope="session") 162 | def sample_webvtt_from_dfxp_with_positioning_and_style(): 163 | return """WEBVTT 164 | 165 | 00:01.000 --> 00:03.000 position:25% line:25% size:50% 166 | You might not remember us. We are a typical transparent region with centered text that has an outline. 167 | 168 | 00:03.500 --> 00:05.000 align:right position:25% line:25% size:50% 169 | had personality. 170 | 171 | 00:05.500 --> 00:07.000 align:left position:50% line:50% size:25% 172 | Hello there, children! Have you seen any visitors? 
173 | 174 | 00:07.500 --> 00:09.000 align:right position:25% line:75% size:25% 175 | This is 176 | the last cue 177 | """ 178 | 179 | 180 | @pytest.fixture(scope="session") 181 | def sample_webvtt_from_srt(): 182 | return """WEBVTT 183 | 184 | 00:09.209 --> 00:12.312 185 | ( clock ticking ) 186 | 187 | 00:14.848 --> 00:17.000 188 | MAN: 189 | When we think 190 | ♪ ...say bow, wow, ♪ 191 | 192 | 00:17.000 --> 00:18.752 193 | we have this vision of Einstein 194 | 195 | 00:18.752 --> 00:20.887 196 | as an old, wrinkly man 197 | with white hair. 198 | 199 | 00:20.887 --> 00:26.760 200 | MAN 2: 201 | E equals m c-squared is 202 | not about an old Einstein. 203 | 204 | 00:26.760 --> 00:32.200 205 | MAN 2: 206 | It's all about an eternal Einstein. 207 | 208 | 00:32.200 --> 00:36.200 209 | <LAUGHING & WHOOPS!> 210 | """ 211 | 212 | 213 | # This is not equal to the input because we accept unescaped illegal characters 214 | # when reading (because many players do so) but escape them when writing 215 | # in order to conform to the specification. 216 | @pytest.fixture(scope="session") 217 | def sample_webvtt_from_webvtt(sample_webvtt_from_srt): 218 | return sample_webvtt_from_srt 219 | 220 | 221 | @pytest.fixture(scope="session") 222 | def sample_webvtt_2(): 223 | return """WEBVTT 224 | 225 | 1 226 | 00:00:00.000 --> 00:00:43.000 227 | - HELLO WORLD! 228 | 229 | 2 230 | 00:00:59.000 --> 00:01:30.000 231 | - LOOKING GOOOOD. 232 | 233 | 3 234 | 00:01:40.000 --> 00:02:00.000 235 | - HA HA HA! 236 | 237 | 4 238 | 00:02:05.105 --> 00:03:07.007 239 | - HI. WELCOME TO SESAME STREET. 240 | 241 | 5 242 | 00:04:07.007 --> 00:05:38.441 243 | ON TONIGHT'S SHOW... 244 | 245 | 6 246 | 00:05:58.441 --> 00:06:40.543 247 | - I'M NOT GOING TO WATCH THIS. 248 | 249 | 7 250 | 00:07:10.543 --> 00:07:51.711 251 | HEY. WATCH THIS. 
252 | """ 253 | 254 | 255 | @pytest.fixture(scope="session") 256 | def sample_webvtt_empty(): 257 | return """WEBVTT 258 | """ 259 | 260 | 261 | @pytest.fixture(scope="session") 262 | def sample_webvtt_double_br(): 263 | return """WEBVTT 264 | 265 | 00:14.848 --> 00:18.848 266 | MAN: 267 |   268 | When we think 269 | of "E equals m c-squared", 270 | """ 271 | 272 | 273 | @pytest.fixture(scope="session") 274 | def sample_webvtt_output_long_cue(): 275 | return """\ 276 | WEBVTT 277 | 278 | 00:01.000 --> 00:02.000 align:start 279 | NARRATOR: 280 | 281 | 00:02.000 --> 00:03.000 position:25% line:25% size:65% 282 | They built the largest, most incredible, wildest, craziest, 283 | 284 | 00:03.000 --> 00:04.000 align:start 285 | most complex machine in history. 286 | """ 287 | 288 | 289 | @pytest.fixture(scope="session") 290 | def webvtt_from_dfxp_with_conflicting_align(): 291 | return """WEBVTT 292 | 293 | 00:04.537 --> 00:07.841 294 | IT'S WORD GIRL♫ 295 | 296 | 00:08.537 --> 00:10.841 297 | ♫WORD UP, 298 | IT'S WORD GIRL♫ 299 | """ 300 | 301 | 302 | @pytest.fixture(scope="session") 303 | def sample_webvtt_with_cue_settings(): 304 | return """\ 305 | WEBVTT 306 | 307 | 00:01.000 --> 00:06.000 align:center position:37% line:74% 308 | 37% 74% - NARRATOR: 309 | 310 | 00:01.000 --> 00:06.000 this is invalid, but will also be kept 311 | They built the largest, 312 | """ 313 | 314 | 315 | @pytest.fixture(scope="session") 316 | def sample_webvtt_from_scc_properly_writes_newlines_output(): 317 | return """\ 318 | WEBVTT 319 | 320 | 21:30.000 --> 21:34.000 align:left position:20% line:83% size:70% 321 | aa 322 | bb 323 | """ 324 | 325 | 326 | @pytest.fixture(scope="session") 327 | def sample_webvtt_last_cue_zero_start(): 328 | return """WEBVTT 329 | 330 | 00:00.000 --> 00:12.312 331 | ( clock ticking )""" 332 | 333 | 334 | @pytest.fixture(scope="session") 335 | def sample_webvtt_empty_cue(): 336 | return """WEBVTT 337 | 338 | 1 339 | 00:00.000 --> 00:02.000 340 | 341 | 00:04.000 
--> 00:05.000 342 | Transcribed by Celestials 343 | """ 344 | 345 | 346 | @pytest.fixture(scope="session") 347 | def sample_webvtt_multi_lang_en(): 348 | return """WEBVTT 349 | 350 | 00:14.848 --> 00:18.848 351 | Butterfly. 352 | """ 353 | 354 | 355 | @pytest.fixture(scope="session") 356 | def sample_webvtt_multi_lang_de(): 357 | return """WEBVTT 358 | 359 | 00:14.848 --> 00:18.848 360 | Schmetterling. 361 | """ 362 | 363 | 364 | @pytest.fixture(scope="session") 365 | def sample_webvtt_empty_cue_output(): 366 | return """\ 367 | WEBVTT 368 | 369 | 00:01.209 --> 00:02.312 align:start position:10% line:10% size:80% 370 | abc 371 | """ 372 | 373 | 374 | @pytest.fixture(scope="session") 375 | def sample_webvtt_timestamps(): 376 | return """WEBVTT 377 | 378 | 01:01.001 --> 10:10.100 379 | Test zero padded and two digit timestamps without hours 380 | 381 | 01:01:01.001 --> 10:10:10.100 382 | Test zero padded and two digit timestamps without hours""" 383 | -------------------------------------------------------------------------------- /tests/test_dfxp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pycaption import DFXPReader, CaptionReadNoCaptions 4 | from pycaption.exceptions import ( 5 | CaptionReadSyntaxError, CaptionReadError, CaptionReadTimingError, 6 | ) 7 | from pycaption.geometry import ( 8 | UnitEnum, HorizontalAlignmentEnum, VerticalAlignmentEnum, 9 | ) 10 | from tests.mixins import ReaderTestingMixIn 11 | 12 | 13 | class TestDFXPReader(ReaderTestingMixIn): 14 | def setup_class(self): 15 | self.reader = DFXPReader() 16 | 17 | def test_positive_answer_for_detection(self, sample_dfxp): 18 | super().assert_positive_answer_for_detection(sample_dfxp) 19 | 20 | def test_negative_answer_for_microdvd(self, sample_microdvd): 21 | super().assert_negative_answer_for_detection(sample_microdvd) 22 | 23 | def test_negative_answer_for_sami(self, sample_sami): 24 | 
super().assert_negative_answer_for_detection(sample_sami) 25 | 26 | def test_negative_answer_for_scc_on_pop_on(self, sample_scc_pop_on): 27 | super().assert_negative_answer_for_detection(sample_scc_pop_on) 28 | 29 | def test_negative_answer_for_srt(self, sample_srt): 30 | super().assert_negative_answer_for_detection(sample_srt) 31 | 32 | def test_negative_answer_for_webvtt(self, sample_webvtt): 33 | super().assert_negative_answer_for_detection(sample_webvtt) 34 | 35 | def test_caption_length(self, sample_dfxp): 36 | captions = DFXPReader().read(sample_dfxp) 37 | 38 | assert 7 == len(captions.get_captions("en-US")) 39 | 40 | def test_proper_timestamps(self, sample_dfxp): 41 | captions = DFXPReader().read(sample_dfxp) 42 | paragraph = captions.get_captions("en-US")[2] 43 | 44 | assert 17000000 == paragraph.start 45 | assert 18752000 == paragraph.end 46 | 47 | def test_incorrect_time_format(self, sample_dfxp_incorrect_time_format): 48 | with pytest.raises(CaptionReadTimingError) as exc_info: 49 | DFXPReader().read(sample_dfxp_incorrect_time_format) 50 | 51 | assert exc_info.value.args[0].startswith("Invalid timestamp: 0:05.") 52 | 53 | def test_missing_begin(self, sample_dfxp_missing_begin): 54 | with pytest.raises(CaptionReadTimingError) as exc_info: 55 | DFXPReader().read(sample_dfxp_missing_begin) 56 | assert exc_info.value.args[0].startswith('Missing begin time on line ') 57 | 58 | def test_missing_end_and_dur(self, sample_dfxp_missing_end_and_dur): 59 | with pytest.raises(CaptionReadTimingError) as exc_info: 60 | DFXPReader().read(sample_dfxp_missing_end_and_dur) 61 | assert exc_info.value.args[0].startswith( 62 | 'Missing end time or duration on line ') 63 | 64 | def test_convert_timestamp_to_microseconds(self): 65 | reader = DFXPReader() 66 | 67 | assert 1 == reader._convert_timestamp_to_microseconds("0.001ms") 68 | assert 2000 == reader._convert_timestamp_to_microseconds("2ms") 69 | assert 1000000 == reader._convert_timestamp_to_microseconds("1s") 70 | assert 
1234567 == reader._convert_timestamp_to_microseconds("1.234567s") 71 | assert 180000000 == reader._convert_timestamp_to_microseconds("3m") 72 | assert 14400000000 == reader._convert_timestamp_to_microseconds("4h") 73 | assert 53333 == reader._convert_timestamp_to_microseconds("1.6f") 74 | # Tick values are not supported 75 | with pytest.raises(NotImplementedError): 76 | reader._convert_timestamp_to_microseconds("2.3t") 77 | 78 | @pytest.mark.parametrize('timestamp, microseconds', [ 79 | ('12:23:34', 44614000000), ('23:34:45:56', 84886866666), 80 | ('34:45:56.7', 125156700000), ('13:24:35.67', 48275670000), 81 | ('24:35:46.456', 88546456000), ('1:23:34', 5014000000)]) 82 | def test_clock_time(self, timestamp, microseconds): 83 | assert DFXPReader()._convert_timestamp_to_microseconds( 84 | timestamp) == microseconds 85 | 86 | @pytest.mark.parametrize('timestamp', [ 87 | '1:1:11', '1:11:1', '1:11:11:1', '11:11:11:11.11', '11:11:11,11', 88 | '11.11.11.11', '11:11:11.', 'o1:11:11']) 89 | def test_invalid_timestamp(self, timestamp): 90 | with pytest.raises(CaptionReadTimingError) as exc_info: 91 | DFXPReader()._convert_timestamp_to_microseconds(timestamp) 92 | 93 | def test_empty_file(self, sample_dfxp_empty): 94 | with pytest.raises(CaptionReadNoCaptions): 95 | DFXPReader().read(sample_dfxp_empty) 96 | 97 | def test_invalid_markup_is_properly_handled(self, sample_dfxp_syntax_error): 98 | captions = DFXPReader().read(sample_dfxp_syntax_error) 99 | 100 | assert 2 == len(captions.get_captions("en")) 101 | 102 | def test_caption_error_for_invalid_positioning_values( 103 | self, sample_dfxp_invalid_positioning_value_template): 104 | invalid_value_dfxp = ( 105 | sample_dfxp_invalid_positioning_value_template. 
106 | format(origin="px 5px") 107 | ) 108 | with pytest.raises(CaptionReadSyntaxError): 109 | DFXPReader().read(invalid_value_dfxp) 110 | 111 | def test_caption_error_for_invalid_or_unsupported_positioning_units( 112 | self, sample_dfxp_invalid_positioning_value_template): 113 | invalid_dfxp = sample_dfxp_invalid_positioning_value_template.format( 114 | origin="6foo 7bar" 115 | ) 116 | with pytest.raises(CaptionReadSyntaxError): 117 | DFXPReader().read(invalid_dfxp) 118 | 119 | def test_individual_timings_of_captions_with_matching_timespec_are_kept( 120 | self, sample_dfxp_multiple_captions_with_the_same_timing 121 | ): 122 | captionset = DFXPReader().read( 123 | sample_dfxp_multiple_captions_with_the_same_timing 124 | ) 125 | expected_timings = [(9209000, 12312000)] * 3 126 | actual_timings = [(c_.start, c_.end) for c_ in 127 | captionset.get_captions('en-US')] 128 | 129 | assert expected_timings == actual_timings 130 | 131 | def test_individual_texts_of_captions_with_matching_timespec_are_kept( 132 | self, sample_dfxp_multiple_captions_with_the_same_timing): 133 | captionset = DFXPReader().read( 134 | sample_dfxp_multiple_captions_with_the_same_timing 135 | ) 136 | 137 | expected_texts = ['Some text here', 138 | 'Some text there', 139 | 'Caption texts are everywhere!'] 140 | actual_texts = [c_.nodes[0].content for c_ in 141 | captionset.get_captions("en-US")] 142 | 143 | assert expected_texts == actual_texts 144 | 145 | def test_individual_layouts_of_captions_with_matching_timespec_are_kept( 146 | self, sample_dfxp_multiple_captions_with_the_same_timing 147 | ): 148 | captionset = DFXPReader().read( 149 | sample_dfxp_multiple_captions_with_the_same_timing 150 | ) 151 | expected_layouts = [ 152 | (((10, UnitEnum.PERCENT), (10, UnitEnum.PERCENT)), None, None, 153 | (HorizontalAlignmentEnum.START, VerticalAlignmentEnum.BOTTOM)), 154 | (((40, UnitEnum.PERCENT), (40, UnitEnum.PERCENT)), None, None, 155 | (HorizontalAlignmentEnum.START, VerticalAlignmentEnum.BOTTOM)), 
156 | (((10, UnitEnum.PERCENT), (70, UnitEnum.PERCENT)), None, None, 157 | (HorizontalAlignmentEnum.START, VerticalAlignmentEnum.BOTTOM))] 158 | actual_layouts = [c_.layout_info.serialized() for c_ in 159 | captionset.get_captions('en-US')] 160 | 161 | assert expected_layouts == actual_layouts 162 | 163 | def test_properly_converts_timing( 164 | self, sample_dfxp_with_alternative_timing_formats): 165 | caption_set = DFXPReader().read( 166 | sample_dfxp_with_alternative_timing_formats) 167 | caps = caption_set.get_captions('en-US') 168 | 169 | assert caps[0].start == 1900000 170 | assert caps[0].end == 3050000 171 | assert caps[1].start == 4000000 172 | assert caps[1].end == 5200000 173 | 174 | def test_empty_paragraph(self, sample_dfxp_empty_paragraph): 175 | try: 176 | DFXPReader().read(sample_dfxp_empty_paragraph) 177 | except CaptionReadError: 178 | pytest.fail("Failing on empty paragraph") 179 | 180 | def test_only_spaces_paragraph(self, sample_dfxp_only_spaces_paragraph): 181 | caption_set = DFXPReader().read(sample_dfxp_only_spaces_paragraph) 182 | caps = caption_set.get_captions('en-US') 183 | 184 | assert len(caps) == 1 185 | 186 | def test_properly_converts_frametiming(self, sample_dfxp_with_frame_timing): 187 | caption_set = DFXPReader().read(sample_dfxp_with_frame_timing) 188 | caps = caption_set.get_captions('en-US') 189 | 190 | assert caps[0].end == 12233333 191 | assert caps[0].start == 9666666 192 | 193 | def test_empty_cue(self, sample_dfxp_empty_cue): 194 | caption_set = DFXPReader().read(sample_dfxp_empty_cue) 195 | caps = caption_set.get_captions('en-US') 196 | 197 | assert len(caps) == 1 198 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # 2 | # pycaption documentation build configuration file, created by 3 | # sphinx-quickstart on Thu Feb 12 12:18:37 2015. 
4 | # 5 | # This file is execfile()d with the current directory set to its 6 | # containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sphinx_rtd_theme 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [] 33 | 34 | # Add any paths that contain templates here, relative to this directory. 35 | templates_path = ['_templates'] 36 | 37 | # The suffix of source filenames. 38 | source_suffix = '.rst' 39 | 40 | # The encoding of source files. 41 | # source_encoding = 'utf-8-sig' 42 | 43 | # The master toctree document. 44 | master_doc = 'index' 45 | 46 | # General information about the project. 47 | project = 'pycaption' 48 | copyright = '2012-2025, PBS.org ' \ 49 | '(available under the Apache License, Version 2.0)' 50 | 51 | # The version info for the project you're documenting, acts as replacement for 52 | # |version| and |release|, also used in various other places throughout the 53 | # built documents. 54 | # 55 | # The short X.Y version. 56 | version = '2.2.19' 57 | # The full version, including alpha/beta/rc tags. 58 | release = '2.2.19' 59 | 60 | # The language for content autogenerated by Sphinx. 
Refer to documentation 61 | # for a list of supported languages. 62 | # language = None 63 | 64 | # There are two options for replacing |today|: either, you set today to some 65 | # non-false value, then it is used: 66 | # today = '' 67 | # Else, today_fmt is used as the format for a strftime call. 68 | # today_fmt = '%B %d, %Y' 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | exclude_patterns = ['_build'] 73 | 74 | # The reST default role (used for this markup: `text`) to use for all 75 | # documents. 76 | # default_role = None 77 | 78 | # If true, '()' will be appended to :func: etc. cross-reference text. 79 | # add_function_parentheses = True 80 | 81 | # If true, the current module name will be prepended to all description 82 | # unit titles (such as .. function::). 83 | # add_module_names = True 84 | 85 | # If true, sectionauthor and moduleauthor directives will be shown in the 86 | # output. They are ignored by default. 87 | # show_authors = False 88 | 89 | # The name of the Pygments (syntax highlighting) style to use. 90 | pygments_style = 'sphinx' 91 | 92 | # A list of ignored prefixes for module index sorting. 93 | # modindex_common_prefix = [] 94 | 95 | # If true, keep warnings as "system message" paragraphs in the built documents. 96 | # keep_warnings = False 97 | 98 | 99 | # -- Options for HTML output ---------------------------------------------- 100 | 101 | # The theme to use for HTML and HTML Help pages. See the documentation for 102 | # a list of builtin themes. 103 | # html_theme = 'sphinx_rtd_theme' 104 | html_theme = "sphinx_rtd_theme" 105 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 106 | 107 | # Theme options are theme-specific and customize the look and feel of a theme 108 | # further. For a list of options available for each theme, see the 109 | # documentation. 
110 | # html_theme_options = {} 111 | 112 | # Add any paths that contain custom themes here, relative to this directory. 113 | # html_theme_path = [] 114 | 115 | # The name for this set of Sphinx documents. If None, it defaults to 116 | # "<project> v<release> documentation". 117 | # html_title = None 118 | 119 | # A shorter title for the navigation bar. Default is the same as html_title. 120 | # html_short_title = None 121 | 122 | # The name of an image file (relative to this directory) to place at the top 123 | # of the sidebar. 124 | # html_logo = None 125 | 126 | # The name of an image file (within the static path) to use as favicon of the 127 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 128 | # pixels large. 129 | # html_favicon = None 130 | 131 | # Add any paths that contain custom static files (such as style sheets) here, 132 | # relative to this directory. They are copied after the builtin static files, 133 | # so a file named "default.css" will overwrite the builtin "default.css". 134 | html_static_path = ['_static'] 135 | 136 | # Add any extra paths that contain custom files (such as robots.txt or 137 | # .htaccess) here, relative to this directory. These files are copied 138 | # directly to the root of the documentation. 139 | # html_extra_path = [] 140 | 141 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 142 | # using the given strftime format. 143 | # html_last_updated_fmt = '%b %d, %Y' 144 | 145 | # If true, SmartyPants will be used to convert quotes and dashes to 146 | # typographically correct entities. 147 | # html_use_smartypants = True 148 | 149 | # Custom sidebar templates, maps document names to template names. 150 | # html_sidebars = {} 151 | 152 | # Additional templates that should be rendered to pages, maps page names to 153 | # template names. 154 | # html_additional_pages = {} 155 | 156 | # If false, no module index is generated.
157 | # html_domain_indices = True 158 | 159 | # If false, no index is generated. 160 | # html_use_index = True 161 | 162 | # If true, the index is split into individual pages for each letter. 163 | # html_split_index = False 164 | 165 | # If true, links to the reST sources are added to the pages. 166 | # html_show_sourcelink = True 167 | 168 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 169 | # html_show_sphinx = True 170 | 171 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 172 | # html_show_copyright = True 173 | 174 | # If true, an OpenSearch description file will be output, and all pages will 175 | # contain a <link> tag referring to it. The value of this option must be the 176 | # base URL from which the finished HTML is served. 177 | # html_use_opensearch = '' 178 | 179 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 180 | # html_file_suffix = None 181 | 182 | # Output file base name for HTML help builder. 183 | htmlhelp_basename = 'pycaptiondoc' 184 | 185 | 186 | # -- Options for LaTeX output --------------------------------------------- 187 | 188 | latex_elements = { 189 | # The paper size ('letterpaper' or 'a4paper'). 190 | # 'papersize': 'letterpaper', 191 | 192 | # The font size ('10pt', '11pt' or '12pt'). 193 | # 'pointsize': '10pt', 194 | 195 | # Additional stuff for the LaTeX preamble. 196 | # 'preamble': '', 197 | } 198 | 199 | # Grouping the document tree into LaTeX files. List of tuples 200 | # (source start file, target name, title, 201 | # author, documentclass [howto, manual, or own class]). 202 | latex_documents = [ 203 | ('index', 'pycaption.tex', 'pycaption Documentation', 204 | 'PBS', 'manual'), 205 | ] 206 | 207 | # The name of an image file (relative to this directory) to place at the top of 208 | # the title page. 209 | # latex_logo = None 210 | 211 | # For "manual" documents, if this is true, then toplevel headings are parts, 212 | # not chapters.
213 | # latex_use_parts = False 214 | 215 | # If true, show page references after internal links. 216 | # latex_show_pagerefs = False 217 | 218 | # If true, show URL addresses after external links. 219 | # latex_show_urls = False 220 | 221 | # Documents to append as an appendix to all manuals. 222 | # latex_appendices = [] 223 | 224 | # If false, no module index is generated. 225 | # latex_domain_indices = True 226 | 227 | 228 | # -- Options for manual page output --------------------------------------- 229 | 230 | # One entry per manual page. List of tuples 231 | # (source start file, name, description, authors, manual section). 232 | man_pages = [ 233 | ('index', 'pycaption', 'pycaption Documentation', 234 | ['PBS'], 1) 235 | ] 236 | 237 | # If true, show URL addresses after external links. 238 | # man_show_urls = False 239 | 240 | 241 | # -- Options for Texinfo output ------------------------------------------- 242 | 243 | # Grouping the document tree into Texinfo files. List of tuples 244 | # (source start file, target name, title, author, 245 | # dir menu entry, description, category) 246 | texinfo_documents = [ 247 | ('index', 'pycaption', 'pycaption Documentation', 248 | 'PBS', 'pycaption', 'One line description of project.', 249 | 'Miscellaneous'), 250 | ] 251 | 252 | # Documents to append as an appendix to all manuals. 253 | # texinfo_appendices = [] 254 | 255 | # If false, no module index is generated. 256 | # texinfo_domain_indices = True 257 | 258 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 259 | # texinfo_show_urls = 'footnote' 260 | 261 | # If true, do not generate a @detailmenu in the "Top" node's menu. 262 | # texinfo_no_detailmenu = False 263 | -------------------------------------------------------------------------------- /pycaption/dfxp/extras.py: -------------------------------------------------------------------------------- 1 | # We thought about making pycaption.base objects immutable. 
This would be nice 2 | # in a lot of cases, but since the transformations on them could be quite 3 | # complex, the deepcopy method is good enough sometimes. 4 | from copy import deepcopy 5 | from xml.sax.saxutils import escape 6 | 7 | from bs4 import BeautifulSoup 8 | 9 | from .base import DFXPWriter, DFXP_DEFAULT_REGION 10 | from ..base import BaseWriter, CaptionNode, merge_concurrent_captions 11 | 12 | LEGACY_DFXP_BASE_MARKUP = ''' 13 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | ''' 22 | 23 | LEGACY_DFXP_DEFAULT_STYLE = { 24 | 'color': 'white', 25 | 'font-family': 'monospace', 26 | 'font-size': '1c', 27 | } 28 | 29 | LEGACY_DFXP_DEFAULT_STYLE_ID = 'default' 30 | LEGACY_DFXP_DEFAULT_REGION_ID = 'bottom' 31 | 32 | LEGACY_DFXP_DEFAULT_REGION = { 33 | 'text-align': 'center', 34 | 'display-align': 'after' 35 | } 36 | 37 | 38 | class SinglePositioningDFXPWriter(DFXPWriter): 39 | """ 40 | A dfxp writer, that ignores all positioning, using a single provided value 41 | """ 42 | def __init__(self, default_positioning=DFXP_DEFAULT_REGION, 43 | *args, **kwargs): 44 | super().__init__(*args, **kwargs) 45 | self.default_positioning = default_positioning 46 | 47 | def write(self, captions_set, force=''): 48 | """Writes a DFXP file using the positioning provided in the initializer 49 | 50 | :type captions_set: pycaption.base.CaptionSet 51 | :param force: only write this language, if available in the CaptionSet 52 | :rtype: str 53 | """ 54 | captions_set = self._create_single_positioning_caption_set( 55 | captions_set, self.default_positioning) 56 | 57 | return super().write(captions_set, force) # noqa 58 | 59 | @staticmethod 60 | def _create_single_positioning_caption_set(caption_set, positioning): 61 | """Return a caption where all the positioning information was 62 | replaced from positioning 63 | 64 | :type caption_set: pycaption.base.CaptionSet 65 | :rtype: pycaption.base.CaptionSet 66 | """ 67 | # If SinglePositioningDFXPWriter would modify the state of the caption 68 | # set, any 
writer using the same caption_set thereafter would be 69 | # affected. At the moment we know we don't use any other writers, but 70 | # this is important and mustn't be neglected 71 | caption_set = deepcopy(caption_set) 72 | caption_set = merge_concurrent_captions(caption_set) 73 | caption_set.layout_info = positioning 74 | 75 | for lang in caption_set.get_languages(): 76 | caption_set.set_layout_info(lang, positioning) 77 | 78 | caption_list = caption_set.get_captions(lang) 79 | for caption in caption_list: 80 | caption.layout_info = positioning 81 | 82 | for node in caption.nodes: 83 | if hasattr(node, 'layout_info'): 84 | node.layout_info = positioning 85 | 86 | for _, style in caption_set.get_styles(): 87 | if 'text-align' in style: 88 | style.pop('text-align') 89 | 90 | return caption_set 91 | 92 | 93 | class LegacyDFXPWriter(BaseWriter): 94 | """Ported the legacy DFXPWriter from 0.4.5""" 95 | def __init__(self, *args, **kw): 96 | self.p_style = False 97 | self.open_span = False 98 | 99 | def write(self, caption_set, force=''): 100 | caption_set = deepcopy(caption_set) 101 | caption_set = merge_concurrent_captions(caption_set) 102 | 103 | dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, 'lxml-xml') 104 | dfxp.find('tt')['xml:lang'] = "en" 105 | 106 | for style_id, style in caption_set.get_styles(): 107 | if style != {}: 108 | dfxp = self._recreate_styling_tag(style_id, style, dfxp) 109 | if not caption_set.get_styles(): 110 | dfxp = self._recreate_styling_tag( 111 | LEGACY_DFXP_DEFAULT_STYLE_ID, LEGACY_DFXP_DEFAULT_STYLE, dfxp) 112 | 113 | # XXX For now we will always use this default region. 
In the future if 114 | # regions are provided, they will be kept 115 | dfxp = self._recreate_region_tag( 116 | LEGACY_DFXP_DEFAULT_REGION_ID, LEGACY_DFXP_DEFAULT_REGION, dfxp) 117 | 118 | body = dfxp.find('body') 119 | 120 | if force: 121 | langs = [self._force_language(force, caption_set.get_languages())] 122 | else: 123 | langs = caption_set.get_languages() 124 | 125 | for lang in langs: 126 | div = dfxp.new_tag('div') 127 | div['xml:lang'] = lang 128 | 129 | for caption in caption_set.get_captions(lang): 130 | if caption.style: 131 | caption_style = caption.style 132 | caption_style.update( 133 | {'region': LEGACY_DFXP_DEFAULT_REGION_ID}) 134 | else: 135 | caption_style = {'class': LEGACY_DFXP_DEFAULT_STYLE_ID, 136 | 'region': LEGACY_DFXP_DEFAULT_REGION_ID} 137 | p = self._recreate_p_tag(caption, caption_style, dfxp) 138 | div.append(p) 139 | 140 | body.append(div) 141 | 142 | caption_content = dfxp.prettify(formatter=None) 143 | return caption_content 144 | 145 | # force the DFXP to only have one language, trying to match on "force" 146 | def _force_language(self, force, langs): 147 | for lang in langs: 148 | if force == lang: 149 | return lang 150 | 151 | return langs[-1] 152 | 153 | def _recreate_region_tag(self, region_id, styling, dfxp): 154 | dfxp_region = dfxp.new_tag('region') 155 | dfxp_region.attrs.update({'xml:id': region_id}) 156 | 157 | attributes = self._recreate_style(styling, dfxp) 158 | dfxp_region.attrs.update(attributes) 159 | 160 | new_tag = dfxp.new_tag('region') 161 | new_tag.attrs.update({'xml:id': region_id}) 162 | if dfxp_region != new_tag: 163 | dfxp.find('layout').append(dfxp_region) 164 | return dfxp 165 | 166 | def _recreate_styling_tag(self, style, content, dfxp): 167 | dfxp_style = dfxp.new_tag('style') 168 | dfxp_style.attrs.update({'xml:id': style}) 169 | 170 | attributes = self._recreate_style(content, dfxp) 171 | dfxp_style.attrs.update(attributes) 172 | 173 | new_tag = dfxp.new_tag('style') 174 | 
new_tag.attrs.update({'xml:id': style}) 175 | if dfxp_style != new_tag: 176 | dfxp.find('styling').append(dfxp_style) 177 | 178 | return dfxp 179 | 180 | def _recreate_p_tag(self, caption, caption_style, dfxp): 181 | start = caption.format_start() 182 | end = caption.format_end() 183 | p = dfxp.new_tag("p", begin=start, end=end) 184 | p.string = self._recreate_text(caption, dfxp) 185 | 186 | if dfxp.find("style", {"xml:id": "p"}): 187 | p['style'] = 'p' 188 | 189 | p.attrs.update(self._recreate_style(caption_style, dfxp)) 190 | 191 | return p 192 | 193 | def _recreate_text(self, caption, dfxp): 194 | line = '' 195 | 196 | for node in caption.nodes: 197 | if node.type_ == CaptionNode.TEXT: 198 | line += escape(node.content) + ' ' 199 | 200 | elif node.type_ == CaptionNode.BREAK: 201 | line = line.rstrip() + '
\n ' 202 | 203 | elif node.type_ == CaptionNode.STYLE: 204 | line = self._recreate_span(line, node, dfxp) 205 | 206 | return line.rstrip() 207 | 208 | def _recreate_span(self, line, node, dfxp): 209 | if node.start: 210 | styles = '' 211 | 212 | content_with_style = self._recreate_style(node.content, dfxp) 213 | for style, value in list(content_with_style.items()): 214 | styles += f' {style}="{value}"' 215 | 216 | if styles: 217 | if self.open_span: 218 | line = line.rstrip() + ' ' 219 | line += f'' 220 | self.open_span = True 221 | 222 | elif self.open_span: 223 | line = line.rstrip() + ' ' 224 | self.open_span = False 225 | 226 | return line 227 | 228 | def _recreate_style(self, content, dfxp): 229 | dfxp_style = {} 230 | 231 | if 'region' in content: 232 | if dfxp.find('region', {'xml:id': content['region']}): 233 | dfxp_style['region'] = content['region'] 234 | if 'class' in content: 235 | if dfxp.find("style", {"xml:id": content['class']}): 236 | dfxp_style['style'] = content['class'] 237 | if 'text-align' in content: 238 | dfxp_style['tts:textAlign'] = content['text-align'] 239 | if 'italics' in content: 240 | dfxp_style['tts:fontStyle'] = 'italic' 241 | if 'font-family' in content: 242 | dfxp_style['tts:fontFamily'] = content['font-family'] 243 | if 'font-size' in content: 244 | dfxp_style['tts:fontSize'] = content['font-size'] 245 | if 'color' in content: 246 | dfxp_style['tts:color'] = content['color'] 247 | if 'display-align' in content: 248 | dfxp_style['tts:displayAlign'] = content['display-align'] 249 | 250 | return dfxp_style 251 | -------------------------------------------------------------------------------- /tests/fixtures/translated_scc.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(scope="session") 5 | def sample_translated_scc_success(): 6 | return """Scenarist_SCC V1.0 7 | 8 | 00:00:09:05 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption 
Loading] [Resume Caption Loading] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [( ] [cl] [oc] [k ] [ti] [ck] [in] [g ] [)] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption] 9 | 10 | 00:00:12:08 [Erase Displayed Memory] [Erase Displayed Memory] 11 | 12 | 00:00:13:18 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 13, column 00, with plain white text.] [row 13, column 00, with plain white text.] [MA] [N:] [row 14, column 00, with plain white text.] [row 14, column 00, with plain white text.] [Wh] [en] [ w] [e ] [th] [in] [k] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [of] [ "] [E ] [eq] [ua] [ls] [ m] [ c] [-s] [qu] [ar] [ed] [",] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption] 13 | 14 | 00:00:16:03 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [we] [ h] [av] [e ] [th] [is] [ v] [is] [io] [n ] [of] [ E] [in] [st] [ei] [n] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption] 15 | 16 | 00:00:17:20 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 14, column 00, with plain white text.] [row 14, column 00, with plain white text.] [as] [ a] [n ] [ol] [d,] [ w] [ri] [nk] [ly] [ m] [an] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [wi] [th] [ w] [hi] [te] [ h] [ai] [r.] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption] 17 | 18 | 00:00:19:13 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 13, column 00, with plain white text.] [row 13, column 00, with plain white text.] 
[MA] [N ] [2:] [row 14, column 00, with plain white text.] [row 14, column 00, with plain white text.] [E ] [eq] [ua] [ls] [ m] [ c] [-s] [qu] [ar] [ed] [ i] [s] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [no] [t ] [ab] [ou] [t ] [an] [ o] [ld] [ E] [in] [st] [ei] [n.] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption] 19 | 20 | 00:00:25:16 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 13, column 00, with plain white text.] [row 13, column 00, with plain white text.] [MA] [N ] [2:] [row 14, column 00, with plain white text.] [row 14, column 00, with plain white text.] [It] ['s] [ a] [ll] [ a] [bo] [ut] [ a] [n ] [et] [er] [na] [l] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [Ei] [ns] [te] [in] [.] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption] 21 | 22 | 00:00:31:15 [Erase Non-displayed Memory] [Erase Non-displayed Memory] [Resume Caption Loading] [Resume Caption Loading] [row 15, column 00, with plain white text.] [row 15, column 00, with plain white text.] [] [Erase Displayed Memory] [Erase Displayed Memory] [End Of Caption] [End Of Caption] 23 | 24 | 00:00:36:04 [Erase Displayed Memory] [Erase Displayed Memory] 25 | 26 | """ 27 | 28 | 29 | @pytest.fixture(scope="session") 30 | def sample_translated_scc_no_brackets(): 31 | return """Scenarist_SCC V1.0 32 | 33 | 00:00:09:05 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 15, column 00, with plain white text. row 15, column 00, with plain white text. 
( cl oc k ti ck in g ) Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption 34 | 35 | 00:00:12:08 Erase Displayed Memory Erase Displayed Memory 36 | 37 | 00:00:13:18 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 13, column 00, with plain white text. row 13, column 00, with plain white text. MA N: row 14, column 00, with plain white text. row 14, column 00, with plain white text. Wh en w e th in k row 15, column 00, with plain white text. row 15, column 00, with plain white text. of " E eq ua ls m c -s qu ar ed ", Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption 38 | 39 | 00:00:16:03 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 15, column 00, with plain white text. row 15, column 00, with plain white text. we h av e th is v is io n of E in st ei n Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption 40 | 41 | 00:00:17:20 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 14, column 00, with plain white text. row 14, column 00, with plain white text. as a n ol d, w ri nk ly m an row 15, column 00, with plain white text. row 15, column 00, with plain white text. wi th w hi te h ai r. Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption 42 | 43 | 00:00:19:13 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 13, column 00, with plain white text. row 13, column 00, with plain white text. MA N 2: row 14, column 00, with plain white text. row 14, column 00, with plain white text. E eq ua ls m c -s qu ar ed i s row 15, column 00, with plain white text. row 15, column 00, with plain white text. no t ab ou t an o ld E in st ei n. 
Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption 44 | 45 | 00:00:25:16 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 13, column 00, with plain white text. row 13, column 00, with plain white text. MA N 2: row 14, column 00, with plain white text. row 14, column 00, with plain white text. It 's a ll a bo ut a n et er na l row 15, column 00, with plain white text. row 15, column 00, with plain white text. Ei ns te in . Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption 46 | 47 | 00:00:31:15 Erase Non-displayed Memory Erase Non-displayed Memory Resume Caption Loading Resume Caption Loading row 15, column 00, with plain white text. row 15, column 00, with plain white text. Erase Displayed Memory Erase Displayed Memory End Of Caption End Of Caption 48 | 49 | 00:00:36:04 Erase Displayed Memory Erase Displayed Memory 50 | 51 | """ 52 | 53 | 54 | @pytest.fixture(scope="session") 55 | def sample_translated_scc_commands_not_found(): 56 | return """Scenarist_SCC V1.0 57 | 58 | 00:04:36;06 942x 942x 942x 942x [row 01, column 12, with plain white text.] 
[MA] [Ä] 525x c8cx ba8x 59 | """ 60 | 61 | 62 | @pytest.fixture(scope="session") 63 | def sample_translated_scc_custom_brackets(): 64 | return """Scenarist_SCC V1.0 65 | 66 | 00:00:09:05 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {( } {cl} {oc} {k } {ti} {ck} {in} {g } {)} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption} 67 | 68 | 00:00:12:08 {Erase Displayed Memory} {Erase Displayed Memory} 69 | 70 | 00:00:13:18 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 13, column 00, with plain white text.} {row 13, column 00, with plain white text.} {MA} {N:} {row 14, column 00, with plain white text.} {row 14, column 00, with plain white text.} {Wh} {en} { w} {e } {th} {in} {k} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {of} { "} {E } {eq} {ua} {ls} { m} { c} {-s} {qu} {ar} {ed} {",} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption} 71 | 72 | 00:00:16:03 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {we} { h} {av} {e } {th} {is} { v} {is} {io} {n } {of} { E} {in} {st} {ei} {n} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption} 73 | 74 | 00:00:17:20 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 14, column 00, with plain white text.} {row 14, column 00, with plain white text.} {as} { a} {n } {ol} {d,} { w} {ri} {nk} {ly} { m} {an} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {wi} {th} { w} {hi} {te} { h} {ai} {r.} {Erase Displayed Memory} {Erase Displayed 
Memory} {End Of Caption} {End Of Caption} 75 | 76 | 00:00:19:13 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 13, column 00, with plain white text.} {row 13, column 00, with plain white text.} {MA} {N } {2:} {row 14, column 00, with plain white text.} {row 14, column 00, with plain white text.} {E } {eq} {ua} {ls} { m} { c} {-s} {qu} {ar} {ed} { i} {s} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {no} {t } {ab} {ou} {t } {an} { o} {ld} { E} {in} {st} {ei} {n.} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption} 77 | 78 | 00:00:25:16 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 13, column 00, with plain white text.} {row 13, column 00, with plain white text.} {MA} {N } {2:} {row 14, column 00, with plain white text.} {row 14, column 00, with plain white text.} {It} {'s} { a} {ll} { a} {bo} {ut} { a} {n } {et} {er} {na} {l} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {Ei} {ns} {te} {in} {.} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption} 79 | 80 | 00:00:31:15 {Erase Non-displayed Memory} {Erase Non-displayed Memory} {Resume Caption Loading} {Resume Caption Loading} {row 15, column 00, with plain white text.} {row 15, column 00, with plain white text.} {} {Erase Displayed Memory} {Erase Displayed Memory} {End Of Caption} {End Of Caption} 81 | 82 | 00:00:36:04 {Erase Displayed Memory} {Erase Displayed Memory} 83 | 84 | """ 85 | 86 | 87 | @pytest.fixture(scope="session") 88 | def sample_translated_scc_special_and_extended_characters(): 89 | return """Scenarist_SCC V1.0 90 | 91 | 00:00:16;29 [ ] [®] [°] [½] [¿] [™] [¢] [£] 92 | 93 | 00:04:36;06 [♪] [à] [ ] [è] [â] [ê] [î] [ô] [û] 94 | 95 | 00:08:00;00 [É] [Ó] [Ú] [Ü] [ü] [‘] [¡] [*] [’] [—] [©] 96 | 97 | 00:12:00;23 [℠] [•] [“] [”] [À] 
[Â] [Ç] [È] [Ê] [Ë] [ë] [Î] [Ï] 98 | 99 | 00:16:24;11 [ï] [Ô] [Ù] [ù] [Û] [«] [»] [Ã] [ã] [Í] [Ì] [ì] [Ò] 100 | 101 | 00:20:19;12 [ò] [Õ] [õ] [{] [}] [\\] [^] [_] [¦] [~] [Ä] [ä] [Ö] 102 | 103 | 00:24:39;28 [ö] [ß] [¥] [¤] [|] [Å] [å] [Ø] [ø] [┌] [┐] [└] [┘] 104 | """ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright (c) 2012-2025 PBS.org 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | ``pycaption`` is a caption reading/writing module. Use one of the given 5 | Readers to read content into a CaptionSet object, 6 | and then use one of the Writers to output the CaptionSet into 7 | captions of your desired format. 8 | 9 | Turn a caption into multiple caption outputs: 10 | 11 | :: 12 | 13 | srt_caps = '''1 14 | 00:00:09,209 --> 00:00:12,312 15 | This is an example SRT file, 16 | which, while extremely short, 17 | is still a valid SRT file. 18 | ''' 19 | 20 | converter = CaptionConverter() 21 | converter.read(srt_caps, SRTReader()) 22 | print(converter.write(SAMIWriter())) 23 | print(converter.write(DFXPWriter())) 24 | print(converter.write(pycaption.transcript.TranscriptWriter())) 25 | print(converter.write(MicroDVDWriter())) 26 | 27 | Not sure what format the caption is in? 
Detect it: 28 | 29 | :: 30 | 31 | from pycaption import detect_format 32 | 33 | caps = '''1 34 | 00:00:01,500 --> 00:00:12,345 35 | Small caption''' 36 | 37 | reader = detect_format(caps) 38 | if reader: 39 | print(SAMIWriter().write(reader().read(caps))) 40 | 41 | Or if you expect to have only a subset of the supported input formats: 42 | 43 | :: 44 | 45 | caps = '''1 46 | 00:00:01,500 --> 00:00:12,345 47 | Small caption''' 48 | 49 | if SRTReader().detect(caps): 50 | print(SAMIWriter().write(SRTReader().read(caps))) 51 | elif DFXPReader().detect(caps): 52 | print(SAMIWriter().write(DFXPReader().read(caps))) 53 | elif SCCReader().detect(caps): 54 | print(SAMIWriter().write(SCCReader().read(caps))) 55 | elif MicroDVDReader().detect(caps): 56 | print(SAMIWriter().write(MicroDVDReader().read(caps))) 57 | 58 | Python Usage 59 | ------------ 60 | 61 | Example: Convert from SAMI to DFXP 62 | 63 | :: 64 | 65 | from pycaption import SAMIReader, DFXPWriter 66 | 67 | sami = '''NOVA3213 84 |

85 | ( clock ticking ) 86 |

87 | FRENCH LINE 1! 88 |

89 |

 

90 |

91 | MAN:
92 | When we think
93 | of E equals m c-squared, 94 |

95 | FRENCH LINE 2? 96 |

''' 97 | 98 | print(DFXPWriter().write(SAMIReader().read(sami))) 99 | 100 | Which will output the following: 101 | 102 | :: 103 | 104 | 105 | 106 | 107 | 108 |