├── tests
    ├── __init__.py
    ├── data
    │   ├── apostrphone_in_name_after.srt
    │   ├── subtitle_author_after.srt
    │   ├── subtitle_symbols_after.srt
    │   ├── apostrphone_in_name_before.srt
    │   ├── subtitle_italics_after.srt
    │   ├── hour_in_dialogue_after.srt
    │   ├── hour_in_dialogue_before.srt
    │   ├── subtitle_bom_after.srt
    │   ├── subtitle_bom_before.srt
    │   ├── subtitle_space_parsing_after.srt
    │   ├── subtitle_symbols_before.srt
    │   ├── subtitle_space_parsing_before.srt
    │   ├── subtitle_commas_after.srt
    │   ├── subtitle_commas_before.srt
    │   ├── subtitle_font_after.srt
    │   ├── subtitle_music_after.srt
    │   ├── subtitle_italics_before.srt
    │   ├── subtitle_font_before.srt
    │   ├── subtitle_angle_brackets_after.srt
    │   ├── subtitle_angle_brackets_before.srt
    │   ├── subtitle_sound_effects_after.srt
    │   ├── subtitle_example_after.srt
    │   ├── subtitle_author_before.srt
    │   ├── subtitle_names_after.srt
    │   ├── subtitle_music_before.srt
    │   ├── subtitle_sound_effects_before.srt
    │   ├── subtitle_names_before.srt
    │   └── subtitle_example_before.srt
    ├── io_test.py
    └── subtitles_test.py
├── subtitle_filter
    ├── libs
    │   ├── __init__.py
    │   └── subtitle.py
    ├── __init__.py
    └── bin
    │   └── filter-subtitles.py
├── .travis.yml
├── LICENSE
├── .github
    └── workflows
    │   └── python-publish.yml
├── setup.py
├── .gitignore
└── README.md


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/subtitle_filter/libs/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/subtitle_filter/__init__.py:
--------------------------------------------------------------------------------
1 | from subtitle_filter.libs.subtitle import Subtitles
2 | 


--------------------------------------------------------------------------------
/tests/data/apostrphone_in_name_after.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:10:50,567 --> 00:10:52,569
3 | Yo!
4 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_author_after.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:02:18,389 --> 00:02:19,929
3 | Hey, darling.
4 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_symbols_after.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:02:31,229 --> 00:02:33,939
3 | Morning, Maeve.
4 | 


--------------------------------------------------------------------------------
/tests/data/apostrphone_in_name_before.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:10:50,567 --> 00:10:52,569
3 | MOTHER'S MILK:
4 | Yo!
5 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_italics_after.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:02:22,048 --> 00:02:25,083
3 | <i>Now wait a minute, Elliot.</i>
4 | 


--------------------------------------------------------------------------------
/tests/data/hour_in_dialogue_after.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:45:52,000 --> 00:45:55,295
3 | Just be in Stillwell's office
4 | at 10:00.
5 | 


--------------------------------------------------------------------------------
/tests/data/hour_in_dialogue_before.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:45:52,000 --> 00:45:55,295
3 | Just be in Stillwell's office
4 | at 10:00.
5 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_bom_after.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:01:21,999 --> 00:01:23,292
3 | Gosh.
4 | 
5 | 2
6 | 00:02:03,666 --> 00:02:05,459
7 | How long until we get to his estate?
8 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_bom_before.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:01:21,999 --> 00:01:23,292
3 | Gosh.
4 | 
5 | 2
6 | 00:02:03,666 --> 00:02:05,459
7 | How long until we get to his estate?
8 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_space_parsing_after.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:00:06,507 --> 00:00:08,467
 3 | The world
 4 | is broken.
 5 | 
 6 | 2
 7 | 00:00:10,386 --> 00:00:12,555
 8 | men who were born
 9 | with great power believed
10 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_symbols_before.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:00:05,118 --> 00:00:15,118
 3 | ?? ## _
 4 | 
 5 | 1
 6 | 00:00:05,118 --> 00:00:15,118
 7 | - --- 
 8 | - ¶
 9 | 
10 | 4
11 | 00:02:31,229 --> 00:02:33,939
12 | Morning, Maeve.


--------------------------------------------------------------------------------
/tests/data/subtitle_space_parsing_before.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 
 3 | 00:00:06,507 --> 00:00:08,467
 4 | The world
 5 | is broken.
 6 | 
 7 | 
 8 | 2
 9 | 00:00:10,386 --> 00:00:12,555
10 | men who were born
11 | with great power believed
12 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_commas_after.srt:
--------------------------------------------------------------------------------
1 | 1
2 | 00:01:42,460 --> 00:01:47,420
3 | By decree, all persons
4 | found guilty of piracy,
5 | 
6 | 2
7 | 00:01:11,600 --> 00:01:14,730
8 | duly appointed representative
9 | of His Majesty, the king.


--------------------------------------------------------------------------------
/tests/data/subtitle_commas_before.srt:
--------------------------------------------------------------------------------
1 | 12
2 | 00:01:42,460 --> 00:01:47,420
3 | By decree,all persons
4 | found guilty of piracy,
5 | 
6 | 5
7 | 00:01:11,600 --> 00:01:14,730
8 | duly appointed representative
9 | of His Majesty , the king.


--------------------------------------------------------------------------------
/tests/data/subtitle_font_after.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:01:42,709 --> 00:01:46,210
 3 | Something's come alive.
 4 | 
 5 | 2
 6 | 00:01:42,709 --> 00:01:46,210
 7 | Again.
 8 | 
 9 | 3
10 | 00:02:45,238 --> 00:02:46,904
11 | Wait, wait, wait!
12 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_music_after.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:01:40,723 --> 00:01:42,707
 3 | What's up dawg.
 4 | 
 5 | 2
 6 | 00:01:19,519 --> 00:01:26,557
 7 | Is real
 8 | 
 9 | 3
10 | 00:00:57,939 --> 00:01:01,636
11 | <i>- They want to see the Ram Jam!</i>
12 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_italics_before.srt:
--------------------------------------------------------------------------------
 1 | 46
 2 | 00:01:43,719 --> 00:01:46,506
 3 | - <i>♪ Now that the day is over ♪</i>
 4 | - [beeps]
 5 | 
 6 | 5
 7 | 00:02:22,048 --> 00:02:25,083
 8 | <i>Now wait a minute, Elliot.</i>
 9 | 
10 | 1
11 | 00:01:19,519 --> 00:01:26,557
12 | <i>♪ 
13 | ♪</i>
14 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "3.5"
 4 |   - "3.6"
 5 |   - "3.7"
 6 |   - "3.8"
 7 | 
 8 | # command to install dependencies
 9 | install:
10 |   - pip install coverage coveralls .
11 | # command to run tests
12 | script: "coverage run -m unittest discover tests/ '*_test.py'"
13 | after_success: "coveralls"
14 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_font_before.srt:
--------------------------------------------------------------------------------
 1 | 3
 2 | 00:01:42,709 --> 00:01:46,210
 3 | <font color="#DF01D7">What</font> Something's come alive.
 4 | 
 5 | 5
 6 | 00:01:42,709 --> 00:01:46,210
 7 | <font color="#DF01D7">(CHUCKLES)
 8 | </font> Again.
 9 | 
10 | 6
11 | 00:02:45,238 --> 00:02:46,904
12 | - <font color="#DF01D7">(GUN COCKS)</font>
13 | - Wait, wait, wait!
14 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_angle_brackets_after.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:01:40,723 --> 00:01:42,707
 3 | <i> This stays <\i>
 4 | 
 5 | 2
 6 | 00:00:57,088 --> 00:00:58,788
 7 | This goes.
 8 | 
 9 | 3
10 | 00:45:39,418 --> 00:45:41,084
11 | <i> something something <\i>
12 | 
13 | 4
14 | 00:20:36,309 --> 00:20:39,277
15 | <i> - something some
16 | Listen here.<\i>
17 | 
18 | 5
19 | 00:11:31,995 --> 00:11:35,346
20 | <i> something
21 | Listen here.<\i>


--------------------------------------------------------------------------------
/tests/data/subtitle_angle_brackets_before.srt:
--------------------------------------------------------------------------------
 1 | 50
 2 | 00:01:40,723 --> 00:01:42,707
 3 | <i> This stays <\i>
 4 | 
 5 | 28
 6 | 00:00:57,088 --> 00:00:58,788
 7 | <him> This goes.
 8 | 
 9 | 740
10 | 00:45:39,418 --> 00:45:41,084
11 | <i> <hiim> something something <\i>
12 | 
13 | 398
14 | 00:20:36,309 --> 00:20:39,277
15 | <i> -<hiim> something some
16 | Listen here.<\i>
17 | 
18 | 262
19 | 00:11:31,995 --> 00:11:35,346
20 | <hiim> <i> something
21 | Listen here.<\i>


--------------------------------------------------------------------------------
/tests/data/subtitle_sound_effects_after.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:01:40,723 --> 00:01:42,707
 3 | Examples are tough.
 4 | 
 5 | 2
 6 | 00:00:57,088 --> 00:00:58,788
 7 | <i>And not me?</i>
 8 | 
 9 | 3
10 | 00:11:31,995 --> 00:11:35,346
11 | I'm raising a teenage boy.
12 | 
13 | 4
14 | 00:12:31,995 --> 00:12:35,346
15 | twice now.
16 | 
17 | 5
18 | 00:15:31,995 --> 00:51:35,346
19 | Third time.
20 | 
21 | 6
22 | 00:32:06,925 --> 00:32:09,057
23 | What do you got for me, Cabrera?
24 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_example_after.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:01:40,723 --> 00:01:42,707
 3 | - It's almost as if...
 4 | - Yes?
 5 | 
 6 | 2
 7 | 00:01:40,723 --> 00:01:42,707
 8 | Gooby pls...
 9 | 
10 | 3
11 | 00:01:42,709 --> 00:01:46,210
12 | Something's come alive.
13 | 
14 | 4
15 | 00:01:42,709 --> 00:01:46,210
16 | Again.
17 | 
18 | 5
19 | 00:02:08,501 --> 00:02:11,869
20 | Guh
21 | 
22 | 6
23 | 00:02:22,048 --> 00:02:25,083
24 | <i>Now wait a minute, Elliot.</i>
25 | 
26 | 7
27 | 00:02:45,238 --> 00:02:46,904
28 | Wait, wait, wait!
29 | 
30 | 8
31 | 00:00:57,088 --> 00:00:58,788
32 | <i>And not me?</i>
33 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_author_before.srt:
--------------------------------------------------------------------------------
 1 | 2
 2 | 00:00:15,142 --> 00:00:25,142
 3 | � ENCODED AND RELEASED BY <font color="#8080ff">Sharpysword</font> ?
 4 | 
 5 | 3
 6 | 00:02:18,389 --> 00:02:19,929
 7 | Hey, darling.
 8 | 
 9 | 3
10 | 00:00:00,062 --> 00:00:02,507
11 | Subtitles by explosiveskull
12 | www.OpenSubtitles.org
13 | 
14 | 3
15 | 00:00:00,062 --> 00:00:02,507
16 | Synced and Corrected by Your Mum.
17 | 
18 | 976
19 | 00:42:12,094 --> 00:42:15,054
20 | Captioning sponsored by
21 | CBS
22 | 
23 | 978
24 | 00:42:18,884 --> 00:42:21,060
25 | Captioned by
26 | Media Access Group at WGBH
27 | access.wgbh.org


--------------------------------------------------------------------------------
/tests/data/subtitle_names_after.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:01:40,723 --> 00:01:42,707
 3 | - It's almost as if...
 4 | - Yes?
 5 | 
 6 | 2
 7 | 00:01:40,723 --> 00:01:42,707
 8 | Gooby pls...
 9 | 
10 | 3
11 | 00:03:06,139 --> 00:03:07,639
12 | <i> You're gonna need</i>
13 | <i> most of that</i>
14 | 
15 | 4
16 | 00:00:05,377 --> 00:00:06,378
17 | No.
18 | 
19 | 5
20 | 00:03:32,296 --> 00:03:34,840
21 | Regular text.
22 | 
23 | 6
24 | 00:00:55,296 --> 00:00:58,931
25 | Cop cuties,
26 | cute and on duty
27 | 
28 | 7
29 | 00:07:40,362 --> 00:07:44,153
30 | Born in humble circumstances
31 | in Dundee, Scotland,
32 | 
33 | 8
34 | 00:16:06,299 --> 00:16:08,802
35 | See? Hebrews 13:4.
36 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_music_before.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:01:19,519 --> 00:01:26,557
 3 | <i>♪ ♪</i>
 4 | 
 5 | 2
 6 | 00:01:19,519 --> 00:01:26,557
 7 | <i>♪
 8 | ♪</i>
 9 | 
10 | 3
11 | 00:01:40,723 --> 00:01:42,707
12 | What's up dawg.
13 | 
14 | 4
15 | 00:01:19,519 --> 00:01:26,557
16 | ♪ This sound
17 | Is real
18 | 
19 | 5
20 | 00:01:19,519 --> 00:01:26,557
21 | ♪ This sound
22 | Is not ♪
23 | 
24 | 6
25 | 00:00:57,939 --> 00:01:01,636
26 | <i>- They want to see the Ram Jam!
27 | - # Well, I'm frustrated #</i>
28 | 
29 | 7
30 | 00:01:01,709 --> 00:01:04,007
31 | <i># And outdated #</i>
32 | 
33 | 8
34 | 00:01:33,125 --> 00:01:36,291
35 | ♪<i> 'Cause it sure looks to me</i>
36 | <i>Like them people</i>
37 | <i>Ain't playin'♪</i>
38 | 
39 | 


--------------------------------------------------------------------------------
/tests/io_test.py:
--------------------------------------------------------------------------------
 1 | '''I/O test cases'''
 2 | import copy
 3 | import unittest
 4 | import tempfile
 5 | 
 6 | from os.path import join, dirname
 7 | 
 8 | from subtitle_filter import Subtitles
 9 | 
10 | DATA_DIR = join(dirname(__file__), 'data')
11 | 
12 | class SubtitleFilterFontTestCase(unittest.TestCase):
13 |     '''Save/reload round trip. NOTE(review): name says "Font" but only save() is tested.'''
14 |     def setUp(self):
15 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_music_before.srt'))
16 | 
17 |     def test_subtitle_save(self):
18 |         subs = copy.deepcopy(self.subs_before)  # filter a copy so the fixture stays untouched
19 |         subs.filter(rm_music=False)  # NOTE(review): all other filter flags keep their defaults
20 |         with tempfile.TemporaryDirectory() as dirpath:
21 |             fpath = join(dirpath, 'test.srt')
22 |             subs.save(fpath)
23 |             subs_after = Subtitles(fpath)  # reload while the temporary directory still exists
24 |         self.assertEqual(self.subs_before, subs_after)  # compared with the unfiltered fixture


--------------------------------------------------------------------------------
/tests/data/subtitle_sound_effects_before.srt:
--------------------------------------------------------------------------------
 1 | 50
 2 | 00:01:40,723 --> 00:01:42,707
 3 | Examples are tough.
 4 | 
 5 | 28
 6 | 00:00:57,088 --> 00:00:58,788
 7 | - [gunshot]
 8 | - <i>[Shot] And not me?</i>
 9 | 
10 | 740
11 | 00:45:39,418 --> 00:45:41,084
12 | [Barenaked Ladies' "One Week" plays]
13 | 
14 | 398
15 | 00:20:36,309 --> 00:20:39,277
16 | [Roxette's "Listen to
17 | Your Heart" plays softly]
18 | 
19 | 262
20 | 00:11:31,995 --> 00:11:35,346
21 | [chuckles softly]:
22 | I'm raising a teenage boy.
23 | 
24 | 263
25 | 00:12:31,995 --> 00:12:35,346
26 | (chuckles softly):
27 | twice now.
28 | 
29 | 398
30 | 00:20:36,309 --> 00:20:39,277
31 | /BOOM/
32 | 
33 | 400
34 | 00:15:31,995 --> 00:51:35,346
35 | /here we go/:
36 | Third time.
37 | 
38 | 768
39 | 00:32:06,925 --> 00:32:09,057
40 | *Whistles*
41 | What do you got for me, Cabrera?
42 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_names_before.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:01:40,723 --> 00:01:42,707
 3 | ELLIOT: It's almost as if...
 4 | JOE: Yes?
 5 | 
 6 | 2
 7 | 00:01:40,723 --> 00:01:42,707
 8 | ELLIOT: Gooby pls...
 9 | 
10 | 17
11 | 00:03:06,139 --> 00:03:07,639
12 | Bartender:
13 | <i> You're gonna need</i>
14 | <i> most of that</i>
15 | 
16 | 5
17 | 00:00:05,377 --> 00:00:06,378
18 | THE FOOBAR: No.
19 | 
20 | 39
21 | 00:03:32,296 --> 00:03:34,840
22 | FOO BAR 4: (IN ENGLISH)
23 | Regular text.
24 | 
25 | 20
26 | 00:00:55,296 --> 00:00:58,931
27 | all: ♪ Cop cuties,
28 | cute and on duty ♪
29 | 
30 | 20
31 | 00:00:55,296 --> 00:00:58,931
32 | all: Cop cuties,
33 | cute and on duty
34 | 
35 | 152
36 | 00:07:40,362 --> 00:07:44,153
37 | Man on Video:
38 | Born in humble circumstances
39 | in Dundee, Scotland,
40 | 
41 | 1
42 | 00:16:06,299 --> 00:16:08,802
43 | See? Hebrews 13:4.


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Matt
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: Upload Python Package
 5 | 
 6 | on:
 7 |     release:
 8 |         types: [created]
 9 | 
10 | jobs:
11 |     build_and_deploy:
12 |         runs-on: ubuntu-latest
13 | 
14 |         steps:
15 |             - name: Checkout repository
16 |               uses: actions/checkout@v2
17 | 
18 |             - name: Set up Python
19 |               uses: actions/setup-python@v2
20 |               with:
21 |                   python-version: 3.x
22 | 
23 |             - name: Install dependencies
24 |               run: |
25 |                   python -m pip install --upgrade pip
26 |                   pip install setuptools wheel twine
27 | 
28 |             - name: Build package
29 |               run: python setup.py sdist bdist_wheel
30 | 
31 |             - name: Publish package to PyPI
32 |               uses: pypa/gh-action-pypi-publish@v1.4.2
33 |               with:
34 |                   user: __token__
35 |                   password: ${{ secrets.PYPI_API_TOKEN }}
36 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | '''setup.py, use this to install module'''
 3 | from os import path
 4 | from setuptools import setup
 5 | 
 6 | version = '1.5.0'
 7 | this_dir = path.abspath(path.dirname(__file__))
 8 | with open(path.join(this_dir, 'README.md'), encoding='utf-8') as f:
 9 |     long_description = f.read()
10 | 
11 | setup(
12 |     name='subtitle-filter',
13 |     version=version,
14 |     description='Filter SDH entries and more from .srt files',
15 |     author='Matt Lyon',
16 |     author_email='matthewlyon18@gmail.com',
17 |     url='https://github.com/mattlyon93/filter-subs',
18 |     download_url='https://github.com/mattlyon93/filter-subs/archive/v{}.tar.gz'.format(version),
19 |     long_description=long_description,
20 |     long_description_content_type='text/markdown',
21 |     python_requires='>=3.5',
22 |     license='MIT License',
23 |     packages=['subtitle_filter', 'subtitle_filter/libs'],
24 |     classifiers=[
25 |         'Programming Language :: Python',
26 |         'Operating System :: Unix',
27 |         'Operating System :: MacOS',
28 |         'Topic :: Text Processing :: Filters',
29 |         'Topic :: Multimedia :: Sound/Audio :: Speech',
30 |     ],
31 |     keywords=['subtitle', 'SDH', 'hard-of-hearing', 'filter', 'movie', 'tv'],
32 |     scripts=['subtitle_filter/bin/filter-subtitles.py'],
33 | )
34 | 


--------------------------------------------------------------------------------
/tests/data/subtitle_example_before.srt:
--------------------------------------------------------------------------------
 1 | 1
 2 | 00:01:19,519 --> 00:01:26,557
 3 | <i>♪ ♪</i>
 4 | 
 5 | 1
 6 | 00:01:19,519 --> 00:01:26,557
 7 | <i>♪ 
 8 | ♪</i>
 9 | 
10 | 2
11 | 00:01:40,723 --> 00:01:42,707
12 | ELLIOT: It's almost as if...
13 | JOE: Yes?
14 | 
15 | 2
16 | 00:01:40,723 --> 00:01:42,707
17 | ELLIOT: Gooby pls...
18 | 
19 | 3
20 | 00:01:42,709 --> 00:01:46,210
21 | <font color="#DF01D7">(CHUCCCKLEEES)</font> Something's come alive.
22 | 
23 | 3
24 | 00:01:42,709 --> 00:01:46,210
25 | <font color="#DF01D7">(CHUCKLES)
26 | </font> Again.
27 | 
28 | 4
29 | 00:02:08,501 --> 00:02:11,869
30 | (POPCORN MACHINE DOOR CLOSES) Guh
31 | 
32 | 5
33 | 00:02:22,048 --> 00:02:25,083
34 | <i>Now wait a minute, Elliot.</i>
35 | 
36 | 6
37 | 00:11:08,086 --> 00:11:15,984
38 | - <font color="#D81D1D">Synced and corrected by VitoSilans</font> -
39 | -- <font color="#138CE9">www.Addic7ed.com</font> --
40 | 
41 | 3
42 | 00:00:00,062 --> 00:00:02,507
43 | Subtitles by explosiveskull
44 | www.OpenSubtitles.org
45 | 
46 | 6
47 | 00:02:45,238 --> 00:02:46,904
48 | - <font color="#DF01D7">(GUN COCKS)</font>
49 | - Wait, wait, wait!
50 | 
51 | 46
52 | 00:01:43,719 --> 00:01:46,506
53 | - <i>♪ Now that the day is over ♪</i>
54 | - [beeps]
55 | 
56 | 28
57 | 00:00:57,088 --> 00:00:58,788
58 | - [gunshot]
59 | - <i>[Shot] And not me?</i>
60 | 
61 | 740
62 | 00:45:39,418 --> 00:45:41,084
63 | [Barenaked Ladies' "One Week" plays]
64 | 
65 | 398
66 | 00:20:36,309 --> 00:20:39,277
67 | [Roxette's "Listen to
68 | Your Heart" plays softly]
69 | 
70 | 584
71 | 00:37:43,795 --> 00:37:47,899
72 | *
73 | 
74 | 487
75 | 00:29:34,006 --> 00:29:37,944
76 | - * Send me an angel
77 | who flies from Montgomery *
78 | 
79 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # Visual Studio Code
132 | .vscode
133 | 
134 | # pylint
135 | .pylintrc
136 | 
137 | # OS X
138 | *.DS_Store


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # subtitle-filter
  2 | 
  3 | [![PyPI version](https://badge.fury.io/py/subtitle-filter.svg)](https://badge.fury.io/py/subtitle-filter)
  4 | 
  5 | Filter `.srt` subtitle files to remove SDH (Subtitles for the Deaf or Hard-of-Hearing) entries and other tags.
  6 | 
  7 | ## Installation
  8 | 
  9 | ```bash
 10 | pip install subtitle-filter
 11 | ```
 12 | 
 13 | ## Usage
 14 | 
 15 | `subtitle-filter` can be used either as a script or a module.
 16 | 
 17 | By default, this module filters the following (in order):
 18 | 
 19 | 1. Removes font tags and text contained within, e.g. `<font color="#DF01D7">Hey</font>` is removed.
 20 | 2. Removes subtitle entries containing only asterisks: `*`.
 21 | 3. Removes subtitle lines containing `♪` or `#`.
 22 | 4. Removes sound effects: text contained within and including parentheses `(BANG)`, asterisks `*whisper*`, brackets `[boom]`, and text contained within forward slashes `/POW/`.
 23 | 5. Replaces names with dashes, e.g. `GARY: Hey` or `Gary: Hey` to `- Hey`.
 24 | 6. Removes author tags e.g. `XoXo Subtitles by PwnedDude967 XoXo`.
 25 | 7. Fixes erroneous comma spacing, e.g. `Hey , what's up? Nothing,my man` to `Hey, what's up? Nothing, my man`.
 26 | 8. Removes lone symbols such as `?`, `-`, `#`, `_`.
 27 | 9. Removes leading dashes `-` if there is only one line present.
 28 | 
 29 | ### Script Usage
 30 | 
 31 | Bring up the help display:
 32 | 
 33 | ```bash
 34 | filter-subtitles.py -h
 35 | ```
 36 | 
 37 | Filter a subtitle in place (overwrites original subtitle) with default options.
 38 | 
 39 | ```bash
 40 | filter-subtitles.py -s /path/to/sub.srt
 41 | ```
 42 | 
 43 | Instead of saving to disk, print the output.
 44 | 
 45 | ```bash
 46 | filter-subtitles.py -s /path/to/sub.srt -p
 47 | ```
 48 | 
 49 | Save the output to a different filepath.
 50 | 
 51 | ```bash
 52 | filter-subtitles.py -s /path/to/sub.srt -o /path/to/outsub.srt
 53 | ```
 54 | 
 55 | Custom filter flags.
 56 | 
 57 | ```
 58 | --keep-fonts          Do not remove font tags and text contained within.
 59 | --keep-ast            Do not remove subtitles containing asterisks: (*).
 60 | --keep-music          Do not remove lines containing 1 or more "♪" symbols.
 61 | --keep-effects        Do not remove text between and including parenthesis
 62 |                         () or brackets []
 63 | --keep-names          Do not replace names in CAPITALS with "-" tags
 64 | --keep-author         Do not remove author tags, eg. Subtitles by some guy.
 65 | --keep-lone-dashes    Do not remove the dash if only one dashed line is present.
 66 | --keep-commas         Do not fix comma spacings.
 67 | ```
 68 | 
 69 | ### Module Usage
 70 | 
 71 | Filter a subtitle in place (overwrites original subtitle) with default options
 72 | 
 73 | ```python
 74 | from subtitle_filter import Subtitles
 75 | 
 76 | subs = Subtitles('/path/to/sub.srt')
 77 | subs.filter()
 78 | subs.save()
 79 | ```
 80 | 
 81 | Instead of saving to disk, print the output.
 82 | 
 83 | ```python
 84 | subs.print()
 85 | ```
 86 | 
 87 | Save the output to a different filepath.
 88 | 
 89 | ```python
 90 | subs.save('/path/to/newsub.srt')
 91 | ```
 92 | 
 93 | Use custom filter flags.
 94 | 
 95 | ```python
 96 | subs.filter(
 97 |     rm_fonts=True,
 98 |     rm_ast=False,
 99 |     rm_music=True,
100 |     rm_effects=True,
101 |     rm_names=False,
102 |     rm_author=False,
103 | )
104 | ```
105 | 
106 | ### Issues & Requests
107 | 
108 | If you spot any issues with the filtered subtitles, or would like to request new features, please create an issue on GitHub and provide examples.
109 | 


--------------------------------------------------------------------------------
/subtitle_filter/bin/filter-subtitles.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | '''Script to Filter SDH tags from subtitles'''
  3 | 
  4 | import argparse
  5 | 
  6 | from subtitle_filter.libs.subtitle import Subtitles
  7 | 
  8 | 
  9 | def run(args):
 10 |     '''Filter the subtitle file at args.sub_fpath, then either print or save the result.'''
 11 |     subs = Subtitles(args.sub_fpath)
 12 |     subs.filter(
 13 |         rm_fonts=args.rm_fonts,
 14 |         rm_ast=args.rm_ast,
 15 |         rm_music=args.rm_music,
 16 |         rm_effects=args.rm_effects,
 17 |         rm_names=args.rm_names,
 18 |         rm_author=args.rm_author,
 19 |         rm_lone_dashes=args.rm_lone_dashes,
 20 |         fix_commas=args.fix_commas,
 21 |     )
 22 | 
 23 |     if args.print:  # print-only mode: return before anything is saved
 24 |         subs.print()
 25 |         return
 26 | 
 27 |     subs.save(new_filepath=args.out_fpath)  # out_fpath is None unless -o/--output was given
 28 | 
 29 | 
 30 | if __name__ == '__main__':
 31 |     ap = argparse.ArgumentParser(
 32 |         description='Filter subtitles to remove various SDH (Deaf or Hard-of-Hearing) tags.'
 33 |     )
 34 | 
 35 |     ap.add_argument(
 36 |         '-s',
 37 |         '--subtitle',
 38 |         dest='sub_fpath',
 39 |         type=str,
 40 |         help='Subtitle file to filter',
 41 |         required=True,
 42 |     )
 43 |     ap.add_argument(
 44 |         '-o',
 45 |         '--output',
 46 |         dest='out_fpath',
 47 |         type=str,
 48 |         help='Path to save filtered subtitle, omit to save inplace',
 49 |         default=None,
 50 |     )
 51 |     ap.add_argument(
 52 |         '-p',
 53 |         '--print-only',
 54 |         dest='print',
 55 |         action='store_true',
 56 |         default=False,
 57 |         help='Print output subtitles instead of saving to disk.',
 58 |     )
 59 |     ap.add_argument(
 60 |         '--keep-fonts',
 61 |         dest='rm_fonts',
 62 |         default=True,
 63 |         action='store_false',
 64 |         help='Do not remove font tags from subtitles.',
 65 |     )
 66 |     ap.add_argument(
 67 |         '--keep-ast',
 68 |         dest='rm_ast',
 69 |         default=True,
 70 |         action='store_false',
 71 |         help='Do not remove subtitles containing asterisks: (*).',
 72 |     )
 73 |     ap.add_argument(
 74 |         '--keep-music',
 75 |         dest='rm_music',
 76 |         default=True,
 77 |         action='store_false',
 78 |         help='Do not remove "♪" symbols and text contained within two "♪" symbols.',
 79 |     )
 80 |     ap.add_argument(
 81 |         '--keep-effects',
 82 |         dest='rm_effects',
 83 |         default=True,
 84 |         action='store_false',
 85 |         help='Do not remove text between and including parenthesis () or brackets []',
 86 |     )
 87 |     ap.add_argument(
 88 |         '--keep-names',
 89 |         dest='rm_names',
 90 |         default=True,
 91 |         action='store_false',
 92 |         help='Do not replace names in CAPITALS with "-" tags',
 93 |     )
 94 |     ap.add_argument(
 95 |         '--keep-author',
 96 |         dest='rm_author',
 97 |         default=True,
 98 |         action='store_false',
 99 |         help='Do not remove author tags, eg. Subtitles by some guy.',
100 |     )
101 |     ap.add_argument(
102 |         '--keep-lone-dashes',
103 |         dest='rm_lone_dashes',
104 |         default=True,
105 |         action='store_false',
106 |         help='Do not remove lone dashes from subtitles.',
107 |     )
108 |     ap.add_argument(
109 |         '--keep-commas',
110 |         dest='fix_commas',
111 |         default=True,
112 |         action='store_false',
113 |         help='Do not fix comma spacings.',
114 |     )
115 | 
116 |     arguments = ap.parse_args()
117 | 
118 |     run(arguments)
119 | 


--------------------------------------------------------------------------------
/tests/subtitles_test.py:
--------------------------------------------------------------------------------
  1 | '''Subtitle test cases'''
  2 | 
  3 | import unittest
  4 | 
  5 | from os.path import join, dirname
  6 | 
  7 | from subtitle_filter import Subtitles
  8 | 
  9 | DATA_DIR = join(dirname(__file__), 'data')
 10 | 
 11 | 
 12 | class SubtitleFilterFontTestCase(unittest.TestCase):
 13 |     def setUp(self):
 14 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_font_before.srt'))
 15 |         self.subs_after = Subtitles(join(DATA_DIR, 'subtitle_font_after.srt'))
 16 | 
 17 |     def test_subtitle_font(self):
 18 |         self.subs_before.filter()
 19 |         self.assertEqual(self.subs_before, self.subs_after)
 20 | 
 21 | 
 22 | class SubtitleFilterMusicTestCase(unittest.TestCase):
 23 |     def setUp(self):
 24 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_music_before.srt'))
 25 |         self.subs_after = Subtitles(join(DATA_DIR, 'subtitle_music_after.srt'))
 26 | 
 27 |     def test_subtitle_music(self):
 28 |         self.subs_before.filter()
 29 |         self.assertEqual(self.subs_before, self.subs_after)
 30 | 
 31 | 
 32 | class SubtitleFilterSoundEffectsTestCase(unittest.TestCase):
 33 |     def setUp(self):
 34 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_sound_effects_before.srt'))
 35 |         self.subs_after = Subtitles(join(DATA_DIR, 'subtitle_sound_effects_after.srt'))
 36 | 
 37 |     def test_subtitle_sound_effects(self):
 38 |         self.subs_before.filter()
 39 |         self.assertEqual(self.subs_before, self.subs_after)
 40 | 
 41 | 
 42 | class SubtitleFilterItalicsTestCase(unittest.TestCase):
 43 |     def setUp(self):
 44 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_italics_before.srt'))
 45 |         self.subs_after = Subtitles(join(DATA_DIR, 'subtitle_italics_after.srt'))
 46 | 
 47 |     def test_subtitle_italics(self):
 48 |         self.subs_before.filter()
 49 |         self.assertEqual(self.subs_before, self.subs_after)
 50 | 
 51 | 
 52 | class SubtitleAllTestCase(unittest.TestCase):
 53 |     def setUp(self):
 54 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_example_before.srt'))
 55 |         self.subs_after = Subtitles(join(DATA_DIR, 'subtitle_example_after.srt'))
 56 | 
 57 |     def test_subtitle_all(self):
 58 |         self.subs_before.filter()
 59 |         self.assertEqual(self.subs_before, self.subs_after)
 60 | 
 61 | 
 62 | class SubtitleFilterNamesTestCase(unittest.TestCase):
 63 |     def setUp(self):
 64 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_names_before.srt'))
 65 |         self.subs_after = Subtitles(join(DATA_DIR, 'subtitle_names_after.srt'))
 66 | 
 67 |     def test_subtitle_names(self):
 68 |         self.subs_before.filter()
 69 |         self.assertEqual(self.subs_before, self.subs_after)
 70 | 
 71 | 
 72 | class SubtitleFilterSymbolsTestCase(unittest.TestCase):
 73 |     def setUp(self):
 74 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_symbols_before.srt'))
 75 |         self.subs_after = Subtitles(join(DATA_DIR, 'subtitle_symbols_after.srt'))
 76 | 
 77 |     def test_subtitle_symbols(self):
 78 |         self.subs_before.filter()
 79 |         self.assertEqual(self.subs_before, self.subs_after)
 80 | 
 81 | 
 82 | class SubtitleFilterAuthorTestCase(unittest.TestCase):
 83 |     def setUp(self):
 84 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_author_before.srt'))
 85 |         self.subs_after = Subtitles(join(DATA_DIR, 'subtitle_author_after.srt'))
 86 | 
 87 |     def test_subtitle_author(self):
 88 |         self.subs_before.filter()
 89 |         self.assertEqual(self.subs_before, self.subs_after)
 90 | 
 91 | 
 92 | class SubtitleFilterCommaTestCase(unittest.TestCase):
 93 |     def setUp(self):
 94 |         self.subs_before = Subtitles(join(DATA_DIR, 'subtitle_commas_before.srt'))
 95 |         self.subs_after = Subtitles(join(DATA_DIR, 'subtitle_commas_after.srt'))
 96 | 
 97 |     def test_subtitle_commas(self):
 98 |         self.subs_before.filter()
 99 |         self.assertEqual(self.subs_before, self.subs_after)
100 | 
101 | 
class SubtitleParseSpacingTestCase(unittest.TestCase):
    '''Filtering subtitle_space_parsing_before.srt must yield subtitle_space_parsing_after.srt'''

    def setUp(self):
        before_path = join(DATA_DIR, 'subtitle_space_parsing_before.srt')
        after_path = join(DATA_DIR, 'subtitle_space_parsing_after.srt')
        self.subs_before = Subtitles(before_path)
        self.subs_after = Subtitles(after_path)

    def test_space_parsing_commas(self):
        filtered = self.subs_before
        filtered.filter()
        self.assertEqual(filtered, self.subs_after)
110 | 
111 | 
class SubtitleBOMTestCase(unittest.TestCase):
    '''Filtering subtitle_bom_before.srt must yield subtitle_bom_after.srt'''

    def setUp(self):
        before_path = join(DATA_DIR, 'subtitle_bom_before.srt')
        after_path = join(DATA_DIR, 'subtitle_bom_after.srt')
        self.subs_before = Subtitles(before_path)
        self.subs_after = Subtitles(after_path)

    def test_bom(self):
        filtered = self.subs_before
        filtered.filter()
        self.assertEqual(filtered, self.subs_after)
120 | 
121 | 
class SubtitleHoursTestCase(unittest.TestCase):
    '''Filtering hour_in_dialogue_before.srt must yield hour_in_dialogue_after.srt'''

    def setUp(self):
        before_path = join(DATA_DIR, 'hour_in_dialogue_before.srt')
        after_path = join(DATA_DIR, 'hour_in_dialogue_after.srt')
        self.subs_before = Subtitles(before_path)
        self.subs_after = Subtitles(after_path)

    def test_hours(self):
        filtered = self.subs_before
        filtered.filter()
        self.assertEqual(filtered, self.subs_after)
130 | 
131 | 
class SubtitleApostrophe(unittest.TestCase):
    '''Filtering apostrphone_in_name_before.srt must yield apostrphone_in_name_after.srt'''

    def setUp(self):
        before_path = join(DATA_DIR, 'apostrphone_in_name_before.srt')
        after_path = join(DATA_DIR, 'apostrphone_in_name_after.srt')
        self.subs_before = Subtitles(before_path)
        self.subs_after = Subtitles(after_path)

    def test_thing(self):
        filtered = self.subs_before
        filtered.filter()
        self.assertEqual(filtered, self.subs_after)
140 | 


--------------------------------------------------------------------------------
/subtitle_filter/libs/subtitle.py:
--------------------------------------------------------------------------------
  1 | '''Module containing Subtitle and Subtitles classes'''
  2 | 
  3 | import os
  4 | import re
  5 | 
# Lower-case marker phrases for credit/advert entries. Subtitle.remove_author()
# looks for each one inside the lower-cased entry text and, on a hit, sets the
# entry index to 0.
AUTHOR_STRINGS = (
    'synced and corrected by',
    'sync and corrections by',
    'subtitles by',
    'encoded and released by',
    'opensubtitles.org',
    'please rate this subtitle',
    'captioning sponsored by',
    'captioned by',
)
 16 | 
 17 | 
 18 | def has_bom(filename):
 19 |     '''Tests whether file byte order marking'''
 20 |     with open(filename, 'rb') as file:
 21 |         bom_bytes = file.read(4)
 22 |         return bom_bytes.startswith(
 23 |             (b'\xef\xbb\xbf', b'\xff\xfe', b'\xfe\xff', b'\xff\xfe\x00\x00', b'\x00\x00\xfe\xff')
 24 |         )
 25 | 
 26 | 
 27 | class Subtitle:
 28 |     '''Subtitle contents object
 29 |     (invidual subtitle entry)
 30 |     '''
 31 | 
 32 |     def __init__(self):
 33 |         self._index = None
 34 |         self._contents = ''
 35 |         self.start = None
 36 |         self.end = None
 37 | 
 38 |     def __str__(self):
 39 |         return '{}\n{} --> {}\n{}\n'.format(self._index, self.start, self.end, self._contents)
 40 | 
 41 |     def __eq__(self, other):
 42 |         if self.__str__() == other.__str__():
 43 |             return True
 44 |         return False
 45 | 
 46 |     def _contents_to_list(self):
 47 |         if isinstance(self._contents, str):
 48 |             self._contents = self._contents.split('\n')
 49 | 
 50 |     def _contents_to_str(self):
 51 |         if isinstance(self._contents, list):
 52 |             self._contents = '\n'.join(self._contents)
 53 | 
 54 |     @property
 55 |     def index(self):
 56 |         '''Returns the index number for subtitle, or False if index is not assigned'''
 57 |         if self._index is None:
 58 |             return False
 59 |         return self._index
 60 | 
 61 |     @index.setter
 62 |     def index(self, index):
 63 |         self._index = int(index)
 64 | 
 65 |     @property
 66 |     def contents(self):
 67 |         '''Returns the contents lines for the subtitle'''
 68 |         return self._contents
 69 | 
 70 |     @contents.setter
 71 |     def contents(self, item):
 72 |         if self._contents:
 73 |             self._contents += '\n{}'.format(item)
 74 |         else:
 75 |             self._contents = '{}'.format(item)
 76 | 
 77 |     def _filter_empty(self):
 78 |         '''Removes empty quotes from contents list,
 79 |         Converts self.index to 0
 80 |         '''
 81 |         # Set index as 0 for later deletion
 82 |         if not self.contents:
 83 |             self.index = 0
 84 | 
 85 |     @property
 86 |     def lines(self):
 87 |         '''Subtitle entry as a newline separated list'''
 88 |         return [
 89 |             str(self._index),
 90 |             '{} --> {}'.format(self.start, self.end),
 91 |             *self._contents.split('\n'),
 92 |         ]
 93 | 
 94 |     @staticmethod
 95 |     def _remove_comma_space(matchobj):
 96 |         return matchobj.group(0).replace(' ,', ',')
 97 | 
 98 |     @staticmethod
 99 |     def _add_comma_space(matchobj):
100 |         return matchobj.group(0).replace(',', ', ')
101 | 
102 |     def fix_comma_spaces(self):
103 |         '''Fixes comma space seperation'''
104 |         for _ in re.findall(r'[A-Za-z]+\s+,', self._contents):
105 |             self._contents = re.sub(r'[A-Za-z]+\s+,', self._remove_comma_space, self._contents)
106 |         for _ in re.findall(r'[A-Za-z]+,[A-Za-z]+', self._contents):
107 |             self._contents = re.sub(r'[A-Za-z]+,[A-Za-z]+', self._add_comma_space, self._contents)
108 | 
109 |     def remove_font_colours(self):
110 |         '''Removes <font> tags from contents'''
111 |         self._contents = re.sub(r'\<font(.*)\>(.*)\</font\>', '', self._contents, flags=re.DOTALL)
112 |         self._filter_empty()
113 | 
114 |     def remove_music(self):
115 |         '''Removes music symbols from contents'''
116 |         # Remove music symbol behaving as parenthesis
117 |         self._contents = re.sub(r'♪(.*)♪', '', self._contents, flags=re.DOTALL)
118 |         # Remove behaving as inline
119 |         self._contents_to_list()
120 |         for idx, _ in enumerate(self._contents):
121 |             if any(symbol in self._contents[idx] for symbol in ['#', '♪']):
122 |                 self._contents[idx] = ''
123 |         self._contents_to_str()
124 |         self._filter_empty()
125 | 
126 |     def remove_sound_effects(self):
127 |         '''Removes text in between parenthesis, brackets, and forward slashes'''
128 |         # Remove single line brackets
129 |         self._contents_to_list()
130 |         for idx, _ in enumerate(self._contents):
131 |             # Have split this check into a for loop across the delimiters as providing
132 |             # them in one regex expression will yield errors for forward slash within italics
133 |             # tag when used with square brackets/parenthesis. e.g line 8 of
134 |             # subtitle_sound_effects_before.srt
135 |             for prefix, suffix in (('(', ')'), ('[', ']'), ('/', '/'), ('*', '*')):
136 |                 self._contents[idx] = re.sub(
137 |                     rf'[\{prefix}][\S ]*[\{suffix}][\s:]*', '', self._contents[idx]
138 |                 )
139 |         self._remove_lone_symbols()
140 |         self._contents_to_str()
141 |         # Remove multi-line brackets
142 |         self._contents = re.sub(r'[\*\(\[][\S\s]*[\*\)\]][\s:]*', '', self._contents)
143 |         self._filter_empty()
144 | 
145 |     def replace_names(self):
146 |         '''Replace names in all caps'''
147 |         # Care is taken here to preserve genuine sentences with a colon.
148 |         name_regex = r"((?=.*[A-Z])[A-Z0-9 ][A-Z0-9' ]*: *|[A-Z]{1}[a-zA-Z ]+ *: *|^[A-Za-z]+: *)"
149 |         names = re.findall(name_regex, self._contents)
150 |         # dialogues from different people preceeded with -
151 |         # TODO: does this cover the case where the names are the same?
152 |         replacement = '- ' if len(names) > 1 else ''
153 | 
154 |         def replace_if_not_hour(match):
155 |             # group0 = entire match
156 |             start, end = match.span(0)
157 |             original_match = match.string[start:end]
158 | 
159 |             def is_hour():
160 |                 hour_candidate = match.string[start : end + 2].strip()
161 |                 assert (
162 |                     ":" in hour_candidate
163 |                 ), "it has to have a ':' character because it was matched by a regexp"
164 |                 lhs, rhs = hour_candidate.split(":")
165 |                 return rhs and lhs and len(lhs) <= 2 and "".join([lhs, rhs]).isnumeric()
166 | 
167 |             return original_match if is_hour() else replacement
168 | 
169 |         self._contents = re.sub(name_regex, replace_if_not_hour, self._contents).lstrip()
170 |         # TODO: would it make sense to make a context manager and do this on exit and expose all the high level methods
171 |         # in said context manager?
172 |         self._filter_empty()
173 | 
174 |     def remove_author(self):
175 |         '''Removes "Subtitles by" subtitle entries etc'''
176 |         for author_str in AUTHOR_STRINGS:
177 |             if author_str in self._contents.lower():
178 |                 self.index = 0
179 |                 break
180 | 
181 |     def remove_asterisks(self):
182 |         '''Removes line if it contains only an asterisk and/or whitespace'''
183 |         self._contents = re.sub(r'^[\*\s]*$', '', self._contents)
184 |         self._filter_empty()
185 | 
186 |     def fix_italics(self):
187 |         '''Fixes lone <i> or </i> tags, and removes empty <i> tags, and empty dashes'''
188 |         if '<i>' in self._contents and '</i>' not in self._contents:
189 |             self._contents += '</i>'
190 |         if '</i>' in self._contents and '<i>' not in self._contents:
191 |             self._contents = '<i>' + self._contents
192 |         self._contents = re.sub(r'<i>[\_\-\‐\?#\s¶]*</i>', '', self._contents, flags=re.DOTALL)
193 |         self._remove_lone_symbols()
194 | 
195 |     def _remove_lone_symbols(self):
196 |         self._contents_to_list()
197 |         for idx, _ in enumerate(self._contents):
198 |             self._contents[idx] = re.sub(r'^[\_\-\‐\?#\s¶]*$', '', self._contents[idx])
199 |             self._contents[idx] = re.sub(
200 |                 r'^[\_\-\‐\?#\s¶]*<i>[\_\-\‐\?#\s¶]*$', '<i>', self._contents[idx]
201 |             )
202 |             self._contents[idx] = re.sub(
203 |                 r'^[\_\-\‐\?#\s¶]*</i>[\_\-\‐\?#\s¶]*$', '</i>', self._contents[idx]
204 |             )
205 |         # Removes empty strings
206 |         self._contents = list(filter(None, self._contents))
207 |         # Set index as 0 for later deletion
208 |         if len(self.contents) == 0:
209 |             self.index = 0
210 |         self._contents_to_str()
211 | 
212 |     def remove_single_dash(self):
213 |         '''Removes single dashes from contents'''
214 |         if re.match(r'^[^\n]*$', self._contents):
215 |             self._contents = re.sub(r'(?m)^\s*-\s*(.*)$', r'\1', self._contents)
216 |         self._filter_empty()
217 | 
218 | 
class Subtitles:
    '''Content filtering object for subtitles file

    Reads an .srt file, parses it into Subtitle entries (self.subtitles) and
    offers filter(), print() and save().
    '''

    EXTENSIONS = ['.srt']

    # Byte order marks and the codec that consumes each of them. The UTF-32 marks
    # come first because the UTF-32 LE mark begins with the UTF-16 LE mark.
    _BOM_ENCODINGS = (
        (b'\xff\xfe\x00\x00', 'utf-32'),
        (b'\x00\x00\xfe\xff', 'utf-32'),
        (b'\xef\xbb\xbf', 'utf-8-sig'),
        (b'\xff\xfe', 'utf-16'),
        (b'\xfe\xff', 'utf-16'),
    )

    def __init__(self, fpath):
        '''Loads and parses the subtitle file at fpath

        Raises IOError if the path does not exist, is not a file, or does not
        have one of the EXTENSIONS.
        '''
        if not os.path.exists(fpath):
            raise IOError('{} does not exist'.format(fpath))
        if not os.path.isfile(fpath):
            raise IOError('{} is not a file'.format(fpath))
        self._fullpath = fpath
        if self.ext not in self.EXTENSIONS:
            raise IOError('{} is not valid subtitle file: {}'.format(self._fullpath, self.ext))
        self._line_list = self._get_line_list()
        self.subtitles = self._parse_subs()

    def __repr__(self):
        return "".join(map(str, self.subtitles))

    def __eq__(self, other):
        # Equal when both hold the same entries in the same order
        if len(self.subtitles) != len(other.subtitles):
            return False
        for idx, _ in enumerate(self.subtitles):
            if self.subtitles[idx] != other.subtitles[idx]:
                return False
        return True

    @property
    def filepath(self):
        '''Filepath of mediafile'''
        return self._fullpath

    @property
    def ext(self):
        '''Extension of mediafile'''
        _, ext = os.path.splitext(self._fullpath)
        return ext

    def _detect_encoding(self):
        '''Returns the codec matching the file's byte order mark, or utf-8 without one'''
        with open(self.filepath, 'rb') as raw:
            head = raw.read(4)
        for bom, encoding in self._BOM_ENCODINGS:
            if head.startswith(bom):
                return encoding
        return 'utf-8'

    def _get_line_list(self):
        '''Returns the file's lines with trailing whitespace stripped

        The codec follows the byte order mark, so UTF-16/UTF-32 files are decoded
        with their own codec instead of failing as UTF-8.
        '''
        with open(self.filepath, 'r', encoding=self._detect_encoding()) as fdata:
            return [line.rstrip() for line in fdata]

    def _parse_subs(self):
        '''Builds the list of Subtitle entries from the stripped file lines'''
        sub_list = [Subtitle()]
        for line in self._line_list:
            # If the index has not yet been created in latest sublist item
            if not sub_list[-1].index:
                try:
                    sub_list[-1].index = int(line)
                except ValueError:
                    # Skip anything that is not an index line yet
                    continue
            # Time line
            elif sub_list[-1].start is None:
                if ' --> ' in line:
                    sub_list[-1].start, sub_list[-1].end = line.split(' --> ')
                else:
                    continue
            # New subtitle entry
            elif not line:
                sub_list.append(Subtitle())
            # Contents
            else:
                sub_list[-1].contents = line
        return sub_list

    def filter(self, **kw):
        '''Filters subtitles to remove SDH items

        Keyword switches (all default to True): rm_fonts, rm_ast, rm_music,
        rm_effects, rm_names, rm_author, fix_commas, rm_lone_dashes.
        Italics are always repaired. Entries left with a falsy index are dropped
        and the remaining ones are renumbered from 1.
        '''
        # (switch, Subtitle method) in the order the steps are applied
        optional_steps = (
            ('rm_fonts', 'remove_font_colours'),
            ('rm_ast', 'remove_asterisks'),
            ('rm_music', 'remove_music'),
            ('rm_effects', 'remove_sound_effects'),
            ('rm_names', 'replace_names'),
            ('rm_author', 'remove_author'),
            ('fix_commas', 'fix_comma_spaces'),
            ('rm_lone_dashes', 'remove_single_dash'),
        )
        # Filter contents
        for switch, method_name in optional_steps:
            if kw.get(switch, True):
                for sub in self.subtitles:
                    getattr(sub, method_name)()
        for sub in self.subtitles:
            sub.fix_italics()
        # Remove filtered items from list
        self.subtitles[:] = [sub for sub in self.subtitles if sub.index]
        # Reassign indices
        for idx, sub in enumerate(self.subtitles):
            sub.index = idx + 1

    def print(self):
        '''Prints all subtitle entries'''
        for sub in self.subtitles:
            print(sub)

    def save(self, new_filepath=None):
        '''Saves subtitle object to disk,
        omit new_filepath to save inplace
        '''
        if new_filepath is not None:
            self._fullpath = new_filepath
        # Output is always written as UTF-8 without a byte order mark
        with open(self._fullpath, 'w', encoding='utf-8') as fp:
            for sub in self.subtitles:
                fp.write(str(sub) + '\n')
330 | 


--------------------------------------------------------------------------------