├── libs ├── __init__.py ├── langdetect │ ├── tests │ │ ├── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── test_unicode_block.py │ │ │ └── test_lang_profile.py │ │ ├── test_language.py │ │ └── test_detector.py │ ├── utils │ │ ├── __init__.py │ │ ├── messages.py │ │ ├── lang_profile.py │ │ └── ngram.py │ ├── __init__.py │ ├── language.py │ ├── lang_detect_exception.py │ ├── detector_factory.py │ ├── detector.py │ └── profiles │ │ ├── gu │ │ ├── so │ │ └── sw └── subcleaner │ ├── __init__.py │ ├── languages │ ├── __init__.py │ └── languages.py │ ├── cleaner │ ├── __init__.py │ ├── detectors │ │ ├── __init__.py │ │ ├── chain.py │ │ └── wedged.py │ ├── punishers │ │ ├── __init__.py │ │ ├── time.py │ │ ├── regex.py │ │ ├── duplicate.py │ │ └── adjacency.py │ └── cleaner.py │ ├── settings │ ├── __init__.py │ ├── log_config.py │ ├── config.py │ └── args.py │ ├── report_generator.py │ ├── sub_block.py │ ├── regex_lists.py │ ├── main.py │ └── subtitle.py ├── regex_profiles ├── README.txt └── default │ ├── indonesian.conf │ ├── no_profile.conf │ ├── hebrew.conf │ ├── english.conf │ ├── svenska.conf │ ├── spanish.conf │ ├── global.conf │ ├── dutch.conf │ └── portuguese.conf ├── subcleaner.py ├── default_config └── subcleaner.conf ├── README.md └── .gitignore /libs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/langdetect/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/langdetect/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/subcleaner/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 
-------------------------------------------------------------------------------- /libs/langdetect/tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/subcleaner/languages/__init__.py: -------------------------------------------------------------------------------- 1 | from .languages import is_language, get_2letter_code 2 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/__init__.py: -------------------------------------------------------------------------------- 1 | from .cleaner import find_ads, remove_ads, fix_overlap, unscramble, reset 2 | -------------------------------------------------------------------------------- /libs/subcleaner/settings/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from . import args 3 | from . 
import log_config 4 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .wedged import detect_wedged 2 | from .chain import detect_chain 3 | -------------------------------------------------------------------------------- /libs/langdetect/__init__.py: -------------------------------------------------------------------------------- 1 | from .detector_factory import DetectorFactory, PROFILES_DIRECTORY, detect, detect_langs 2 | from .lang_detect_exception import LangDetectException 3 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/__init__.py: -------------------------------------------------------------------------------- 1 | from .adjacency import punish_ad_adjacency 2 | from .duplicate import punish_clone_blocks, move_duplicated, reset_duplicate 3 | from .regex import punish_regex_matches 4 | from .time import punish_quick_first_block, punish_short_duration 5 | -------------------------------------------------------------------------------- /libs/langdetect/language.py: -------------------------------------------------------------------------------- 1 | class Language(object): 2 | ''' 3 | Language is to store the detected language. 4 | Detector.get_probabilities() returns a list of Languages. 
5 | ''' 6 | 7 | def __init__(self, lang, prob): 8 | self.lang = lang 9 | self.prob = prob 10 | 11 | def __repr__(self): 12 | if self.lang is None: 13 | return '' 14 | return '%s:%s' % (self.lang, self.prob) 15 | 16 | def __lt__(self, other): 17 | return self.prob < other.prob 18 | -------------------------------------------------------------------------------- /libs/langdetect/lang_detect_exception.py: -------------------------------------------------------------------------------- 1 | _error_codes = { 2 | 'NoTextError': 0, 3 | 'FormatError': 1, 4 | 'FileLoadError': 2, 5 | 'DuplicateLangError': 3, 6 | 'NeedLoadProfileError': 4, 7 | 'CantDetectError': 5, 8 | 'CantOpenTrainData': 6, 9 | 'TrainDataFormatError': 7, 10 | 'InitParamError': 8, 11 | } 12 | 13 | ErrorCode = type('ErrorCode', (), _error_codes) 14 | 15 | 16 | class LangDetectException(Exception): 17 | def __init__(self, code, message): 18 | super(LangDetectException, self).__init__(message) 19 | self.code = code 20 | 21 | def get_code(self): 22 | return self.code 23 | -------------------------------------------------------------------------------- /libs/langdetect/utils/messages.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | 4 | class Messages(object): 5 | MESSAGES_FILENAME = path.join(path.dirname(__file__), 'messages.properties') 6 | 7 | def __init__(self): 8 | self.messages = {} 9 | with open(self.MESSAGES_FILENAME, 'r') as f: 10 | for line in f: 11 | key, _, value = line.strip().partition('=') 12 | self.messages[key] = value.encode().decode('unicode_escape') 13 | 14 | def get_string(self, key): 15 | return self.messages.get(key, '!%s!' 
% key) 16 | 17 | 18 | _messages = None 19 | def get_string(key): 20 | global _messages 21 | if _messages is None: 22 | _messages = Messages() 23 | return _messages.get_string(key) 24 | -------------------------------------------------------------------------------- /regex_profiles/README.txt: -------------------------------------------------------------------------------- 1 | Put files in this directory to add custom regex profiles beyond the included profiles. 2 | Any file put here will override identically named files in the default folder. 3 | 4 | Each profile checks its associated language codes individually. Multiple 5 | regex profiles can therefore run against the same subtitle if the same language is specified in the profiles. 6 | You can disable all default profiles in the subcleaner.conf file. 7 | 8 | Regex profiles need to have to a .conf extension. 9 | Profiles starting with a "." will be also be ignored. 10 | 11 | Use one of the default profiles as a template to avoid unwanted results. but make sure you go over all the 12 | purge regexes so that they don't contain any words that are real words in your language. 
13 | 14 | -------------------------------------------------------------------------------- /libs/langdetect/tests/test_language.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from libs.langdetect.language import Language 4 | 5 | 6 | class LanguageTest(unittest.TestCase): 7 | def test_language(self): 8 | lang = Language(None, 0) 9 | self.assertIsNone(lang.lang) 10 | self.assertEqual(lang.prob, 0.0, 0.0001) 11 | self.assertEqual(str(lang), '') 12 | 13 | lang2 = Language('en', 1.0) 14 | self.assertEqual(lang2.lang, 'en') 15 | self.assertEqual(lang2.prob, 1.0, 0.0001) 16 | self.assertEqual(str(lang2), 'en:1.0') 17 | 18 | def test_cmp(self): 19 | lang1 = Language('a', 0.1) 20 | lang2 = Language('b', 0.5) 21 | 22 | self.assertTrue(lang1 < lang2) 23 | self.assertFalse(lang1 == lang2) 24 | self.assertFalse(lang1 > lang1) 25 | -------------------------------------------------------------------------------- /subcleaner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from configparser import DuplicateOptionError 3 | 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | if __name__ == '__main__': 9 | try: 10 | from libs.subcleaner import main 11 | main.main() 12 | exit(0) 13 | except KeyboardInterrupt: 14 | logger.warning("subcleaner was interrupted.") 15 | exit(0) 16 | except PermissionError as e: 17 | logger.error("subcleaner ran into a permission error. 
Permission denied to: \"" + e.filename + "\"") 18 | exit(1) 19 | except DuplicateOptionError as e: 20 | logger.error("subcleaner was unable to read config file \"" + e.args[2].name + 21 | "\" because there are multiple keys with the same name:\n" 22 | "Option '" + e.option + "' already exists in section '" + e.section + "'") 23 | exit(1) 24 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/time.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from datetime import timedelta 3 | 4 | from libs.subcleaner.subtitle import Subtitle 5 | 6 | 7 | def punish_quick_first_block(subtitle: Subtitle) -> None: 8 | if not subtitle.blocks: 9 | return 10 | block = subtitle.blocks[0] 11 | if block.start_time < timedelta(seconds=1): 12 | block.regex_matches += 1 13 | block.hints.append("quick_start") 14 | 15 | 16 | def punish_short_duration(subtitle: Subtitle) -> None: 17 | for block in subtitle.blocks: 18 | if block.end_time - block.start_time < datetime.timedelta(milliseconds=8/30*1000): 19 | block.regex_matches += 1 20 | block.hints.append("short duration") 21 | 22 | if block.end_time - block.start_time < datetime.timedelta(milliseconds=3/30*1000): 23 | block.regex_matches += 1 24 | block.hints.append("very short duration") 25 | -------------------------------------------------------------------------------- /libs/subcleaner/settings/log_config.py: -------------------------------------------------------------------------------- 1 | import logging.handlers 2 | import sys 3 | from . 
import args, config 4 | 5 | # formatters 6 | time_formatter = logging.Formatter("{asctime} - {levelname:>8}: {message}", style="{", datefmt='%Y-%m-%d_%H:%M:%S') 7 | formatter = logging.Formatter("{levelname:>8}: {message}", style="{",) 8 | 9 | base_logger = logging.getLogger() 10 | base_logger.setLevel(logging.INFO) 11 | base_logger.handlers.clear() 12 | 13 | # file handler 14 | if not args.no_log: 15 | file_handler = logging.handlers.RotatingFileHandler(config.log_file, maxBytes=10_000_000, backupCount=10, encoding='utf8') 16 | file_handler.setFormatter(time_formatter) 17 | file_handler.setLevel(logging.INFO) 18 | if args.errors_only: 19 | file_handler.setLevel(logging.ERROR) 20 | base_logger.addHandler(file_handler) 21 | 22 | # stdout handler 23 | stout_handler = logging.StreamHandler(sys.stdout) 24 | stout_handler.setFormatter(formatter) 25 | stout_handler.setLevel(logging.INFO) 26 | if args.silent: 27 | stout_handler.setLevel(logging.WARNING) 28 | if args.errors_only: 29 | stout_handler.setLevel(logging.ERROR) 30 | if args.debug: 31 | stout_handler.setLevel(logging.DEBUG) 32 | base_logger.addHandler(stout_handler) 33 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Tuple, Pattern 3 | 4 | from libs.subcleaner import regex_lists 5 | from libs.subcleaner.sub_block import SubBlock 6 | from libs.subcleaner.subtitle import Subtitle 7 | 8 | 9 | def punish_regex_matches(subtitle: Subtitle) -> None: 10 | for block in subtitle.blocks: 11 | _run_regex_on_block(block, regex_lists.get_purge_regex(subtitle.language), 3) 12 | _run_regex_on_block(block, regex_lists.get_warning_regex(subtitle.language), 1) 13 | 14 | 15 | def _run_regex_on_block(block: SubBlock, regex_list: List[Tuple[str, Pattern]], punishment: int) -> None: 16 | clean_content = " 
".join(block.content.replace("-\n", "-").split()) 17 | for regex in regex_list: 18 | try: 19 | result = re.findall(regex[1], clean_content) 20 | if result and isinstance(result[0], str): 21 | result = [r.lower() for r in result] 22 | result = set(result) 23 | else: 24 | result = set([t[0].lower() for t in result]) 25 | 26 | except re.error as e: 27 | raise ValueError(f"regex {regex[0]} is miss configured: {e.msg}") 28 | if result: 29 | block.regex_matches += punishment * len(result) 30 | for i in range(0, len(result)): 31 | block.hints.append(regex[0]) 32 | -------------------------------------------------------------------------------- /default_config/subcleaner.conf: -------------------------------------------------------------------------------- 1 | [SETTINGS] 2 | # main config for subcleaner. 3 | # 4 | 5 | require_language_profile = true 6 | # Set "require_language_profile" to false if you wish to clean subtitles in languages that don't have at least one 7 | # language profile associated with it. 8 | # bool [default: true] 9 | # 10 | 11 | 12 | relative_path_base = . 13 | # The script will run relative paths from the "relative_path_base" directory instead of your working directory if it exist. 14 | # Recommended to point this to your library base for ease of use. i.e: "/storage/media/library" 15 | # string [default: .] 16 | # 17 | 18 | 19 | use_defaults = true 20 | # Set "use_defaults" to false if you wish to disable all default regex configs. 21 | # bool [default: true] 22 | # 23 | 24 | 25 | default_language = 26 | # Set which language code subtitles is considered to have if the script is called without specifying language. 27 | # leave empty to allow script to automatically detect language code. 28 | # string [default: ] 29 | # 30 | 31 | 32 | log_dir = logs/ 33 | # log path: 34 | # Relative paths are from location of script. 
35 | # string [default: logs/] 36 | # 37 | 38 | 39 | fix_overlaps = true 40 | # Subtitle overlap fixing: 41 | # As per subtitle formatting best practise, there should be at least 2 frames between each subtitle. 42 | # With this enabled it will move two subtitles that are too close to each other by moving the start/stop times 43 | # so they no longer overlap. 44 | # how much each subtitle is moved is weighted by how much text is in each subtitles. more text -> moved more. 45 | # bool [default: true] 46 | # 47 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/duplicate.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, List 3 | 4 | from libs.subcleaner.sub_block import SubBlock 5 | from libs.subcleaner.subtitle import Subtitle 6 | 7 | 8 | content_dict: Dict[str, List[SubBlock]] = {} 9 | content_dict_reverse: [SubBlock, str] = {} 10 | 11 | 12 | def punish_clone_blocks(subtitle: Subtitle) -> None: 13 | for block in subtitle.blocks: 14 | content = re.sub("[\\s.,:_-]", "", block.content) 15 | content_dict_reverse[block] = content 16 | if content not in content_dict: 17 | content_dict[content] = [] 18 | content_dict[content].append(block) 19 | 20 | for duplicate_list in content_dict.values(): 21 | if len(duplicate_list) <= 1: 22 | continue 23 | for block in duplicate_list: 24 | if "♪" in block.content: 25 | continue 26 | block.regex_matches += 1 27 | block.hints.append("similar_content") 28 | 29 | 30 | def move_duplicated(subtitle: Subtitle) -> None: 31 | for ad_block in subtitle.ad_blocks.copy(): 32 | if "similar_content" not in ad_block.hints: 33 | continue 34 | for block in content_dict[content_dict_reverse[ad_block]]: 35 | subtitle.ad(block) 36 | 37 | for warn_block in subtitle.warning_blocks.copy(): 38 | if "similar_content" not in warn_block.hints: 39 | continue 40 | for block in 
content_dict[content_dict_reverse[warn_block]]: 41 | subtitle.warn(block) 42 | 43 | 44 | def reset_duplicate(): 45 | content_dict.clear() 46 | content_dict_reverse.clear() 47 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/adjacency.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Set 3 | 4 | from libs.subcleaner.sub_block import SubBlock 5 | from libs.subcleaner.subtitle import Subtitle 6 | 7 | 8 | def punish_ad_adjacency(subtitle: Subtitle) -> None: 9 | nearby_blocks: Set[SubBlock] = set() 10 | for index in range(0, len(subtitle.blocks)): 11 | block = subtitle.blocks[index] 12 | if index < 3: 13 | nearby_blocks.add(block) 14 | block.hints.append("close_to_start") 15 | continue 16 | if index > len(subtitle.blocks) - 4: 17 | nearby_blocks.add(block) 18 | block.hints.append("close_to_end") 19 | continue 20 | for compare_block in subtitle.blocks[max(0, index - 15): min(index + 16, len(subtitle.blocks))]: 21 | if compare_block.regex_matches >= 3 and compare_block != block: 22 | nearby_blocks.add(block) 23 | block.hints.append("nearby_ad") 24 | break 25 | 26 | adjacent_blocks: Set[SubBlock] = set() 27 | for index in range(0, len(subtitle.blocks)): 28 | block = subtitle.blocks[index] 29 | for compare_block in subtitle.blocks[max(0, index - 1): min(index + 2, len(subtitle.blocks))]: 30 | if compare_block.regex_matches >= 2 and compare_block != block: 31 | if re.sub(" +", " ", block.content.replace("\n", " ").strip()).count(" ") <= 4: 32 | adjacent_blocks.add(block) 33 | break 34 | 35 | for block in nearby_blocks: 36 | block.regex_matches += 1 37 | 38 | for block in adjacent_blocks: 39 | block.regex_matches += 1 40 | block.hints.append("adjacent_ad") 41 | -------------------------------------------------------------------------------- /libs/subcleaner/languages/languages.py: 
import json
from pathlib import Path
from typing import Optional, List, Dict

languages_json_file = Path(__file__).parent.joinpath("languages.json")

# Populated once by load_language_data() at import time.
_languages: List[Dict[str, str]]
_language_names: List[str] = []    # display names exactly as in the JSON
_language_codes_2: List[str] = []  # 2-letter (alpha_2) codes
_language_codes_3: List[str] = []  # 3-letter (alpha_3) codes


def load_language_data() -> None:
    """Load languages.json and build the lookup lists.

    _language_names keeps the original display casing, while the "name"
    entries inside _languages are normalized to lower_snake_case for
    get_2letter_code() lookups. Not every entry carries both alpha_2 and
    alpha_3 keys.
    """
    with open(languages_json_file, encoding="UTF-8") as json_file:
        global _languages
        _languages = json.load(json_file)
        for language in _languages:
            _language_names.append(language["name"])
            language["name"] = language["name"].lower().replace(" ", "_")
            if "alpha_2" in language:
                _language_codes_2.append(language["alpha_2"])
            if "alpha_3" in language:
                _language_codes_3.append(language["alpha_3"])


def is_language(lang: str) -> bool:
    """Return True if lang is a known 2-letter code, 3-letter code or name.

    NOTE(review): the name lookup is case-sensitive against the original
    display name - confirm whether callers expect "swedish" == "Swedish".
    """
    if len(lang) == 2:
        return lang in _language_codes_2
    if len(lang) == 3:
        return lang in _language_codes_3
    return lang in _language_names


def get_2letter_code(lang: str) -> Optional[str]:
    """Translate a 3-letter code or a language name to the 2-letter code.

    Returns the input unchanged if it is already a valid 2-letter code,
    or None when no mapping exists.
    """
    if len(lang) == 2:
        if is_language(lang):
            return lang
        return None

    if len(lang) == 3:
        code_type = "alpha_3"
    else:
        code_type = "name"

    lang = lang.lower().replace(" ", "_")
    for language in _languages:
        # .get(): entries without an "alpha_3" key would otherwise raise
        # KeyError when looking up a 3-letter code.
        if language.get(code_type) == lang:
            if "alpha_2" in language:
                return language["alpha_2"]
            return None


load_language_data()
self.assertEqual(unicode_block.unicode_block(six.u('\u0065')), unicode_block.UNICODE_BASIC_LATIN) 11 | self.assertEqual(unicode_block.unicode_block(six.u('\u007F')), unicode_block.UNICODE_BASIC_LATIN) 12 | self.assertEqual(unicode_block.unicode_block(six.u('\u0080')), unicode_block.UNICODE_LATIN_1_SUPPLEMENT) 13 | self.assertEqual(unicode_block.unicode_block(six.u('\u21FF')), unicode_block.UNICODE_ARROWS) 14 | self.assertEqual(unicode_block.unicode_block(six.u('\u2200')), unicode_block.UNICODE_MATHEMATICAL_OPERATORS) 15 | self.assertEqual(unicode_block.unicode_block(six.u('\u2201')), unicode_block.UNICODE_MATHEMATICAL_OPERATORS) 16 | self.assertEqual(unicode_block.unicode_block(six.u('\u22FF')), unicode_block.UNICODE_MATHEMATICAL_OPERATORS) 17 | self.assertEqual(unicode_block.unicode_block(six.u('\u2300')), unicode_block.UNICODE_MISCELLANEOUS_TECHNICAL) 18 | # test only on wide builds (i.e. Python 3) 19 | if len(six.u('\U0010FFFF')) == 1: 20 | self.assertEqual(unicode_block.unicode_block(six.u('\U000F0000')), unicode_block.UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A) 21 | self.assertEqual(unicode_block.unicode_block(six.u('\U000FFFFF')), unicode_block.UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A) 22 | self.assertEqual(unicode_block.unicode_block(six.u('\U00100000')), unicode_block.UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B) 23 | self.assertEqual(unicode_block.unicode_block(six.u('\U0010FFFF')), unicode_block.UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B) 24 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/detectors/chain.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from typing import List 3 | 4 | from libs.subcleaner.sub_block import SubBlock 5 | from libs.subcleaner.subtitle import Subtitle 6 | 7 | 8 | def detect_chain(subtitle: Subtitle) -> None: 9 | chain: List[SubBlock] = [] 10 | identical_count = 0 11 | for i in range(1, 
len(subtitle.blocks)): 12 | block = subtitle.blocks[i] 13 | pre_block = subtitle.blocks[i - 1] 14 | 15 | link: bool = False 16 | 17 | if is_link(pre_block, block): 18 | if pre_block.equal_content(block): 19 | identical_count += 1 20 | link = True 21 | 22 | if link: 23 | if not chain: 24 | chain.append(pre_block) 25 | chain.append(block) 26 | continue 27 | 28 | if len(chain) > 2 + identical_count or any(block in subtitle.ad_blocks for block in chain): 29 | for chain_block in chain: 30 | subtitle.ad(chain_block) 31 | chain_block.hints.append("chain_block") 32 | 33 | chain.clear() 34 | identical_count = 0 35 | if len(chain) > 2 + identical_count or any(block in subtitle.ad_blocks for block in chain): 36 | for chain_block in chain: 37 | subtitle.ad(chain_block) 38 | chain_block.hints.append("chain_block") 39 | 40 | 41 | def is_link(block: SubBlock, post_block: SubBlock) -> bool: 42 | if block.start_time > post_block.start_time: 43 | block, post_block = post_block, block 44 | if post_block.start_time - block.end_time > timedelta(milliseconds=500): 45 | return False 46 | 47 | if len(block.content) < len(post_block.content) <= len(block.content) + 2: 48 | if post_block.content.startswith(block.content) or post_block.content.endswith(block.content): 49 | return True 50 | elif len(post_block.content) < len(block.content) <= len(post_block.content) + 2: 51 | if block.content.startswith(post_block.content) or block.content.endswith(post_block.content): 52 | return True 53 | elif block.content.strip() == post_block.content.strip(): 54 | return True 55 | 56 | return False 57 | -------------------------------------------------------------------------------- /libs/langdetect/tests/utils/test_lang_profile.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import six 4 | from six.moves import xrange 5 | 6 | from libs.langdetect.utils.lang_profile import LangProfile 7 | 8 | 9 | class LangProfileText(unittest.TestCase): 10 
| def test_lang_profile(self): 11 | profile = LangProfile() 12 | self.assertIsNone(profile.name) 13 | 14 | def test_lang_profile_string_int(self): 15 | profile = LangProfile('en') 16 | self.assertEqual(profile.name, 'en') 17 | 18 | def test_add(self): 19 | profile = LangProfile('en') 20 | profile.add('a') 21 | self.assertEqual(profile.freq.get('a'), 1) 22 | profile.add('a') 23 | self.assertEqual(profile.freq.get('a'), 2) 24 | profile.omit_less_freq() 25 | 26 | def test_add_illegally1(self): 27 | profile = LangProfile() 28 | profile.add('a') # ignore 29 | self.assertIsNone(profile.freq.get('a')) # ignored 30 | 31 | def test_add_illegally2(self): 32 | profile = LangProfile('en') 33 | profile.add('a') 34 | profile.add('') # Illegal (string's length of parameter must be between 1 and 3) but ignore 35 | profile.add('abcd') # as well 36 | self.assertEqual(profile.freq.get('a'), 1) 37 | self.assertIsNone(profile.freq.get('')) # ignored 38 | self.assertIsNone(profile.freq.get('abcd')) # ignored 39 | 40 | def test_omit_less_freq(self): 41 | profile = LangProfile('en') 42 | grams = six.u('a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f').split() 43 | for i in xrange(5): 44 | for g in grams: 45 | profile.add(g) 46 | profile.add(six.u('\u3050')) 47 | 48 | self.assertEqual(profile.freq.get('a'), 5) 49 | self.assertEqual(profile.freq.get(six.u('\u3042')), 5) 50 | self.assertEqual(profile.freq.get(six.u('\u3050')), 1) 51 | profile.omit_less_freq() 52 | self.assertIsNone(profile.freq.get('a')) # omitted 53 | self.assertEqual(profile.freq.get(six.u('\u3042')), 5) 54 | self.assertIsNone(profile.freq.get(six.u('\u3050'))) # omitted 55 | 56 | def test_omit_less_freq_illegally(self): 57 | profile = LangProfile() 58 | profile.omit_less_freq() # ignore 59 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/detectors/wedged.py: 
from datetime import timedelta

from libs.subcleaner.sub_block import SubBlock
from libs.subcleaner.subtitle import Subtitle


def detect_wedged(subtitle: Subtitle) -> None:
    """Flag blocks "wedged" against already-flagged ad blocks.

    A block directly before, after or between blocks that collected 3+
    regex matches is likely part of the same injected ad. Such a block is
    warned on first suspicion and promoted to an ad when it was already a
    warning, or when it sits tightly (under 1 s gap) between two ads.
    Skips subtitles with fewer than 3 blocks.
    """
    if len(subtitle.blocks) < 3:
        return
    for index in range(0, len(subtitle.blocks)):
        block: SubBlock = subtitle.blocks[index]

        if index == 0:
            # First block: only the following neighbour exists.
            post_block: SubBlock = subtitle.blocks[index + 1]
            if post_block.regex_matches >= 3:
                # Within 1 s of the ad: escalate warning -> ad; otherwise warn.
                if (post_block.start_time - block.end_time) < timedelta(seconds=1):
                    if block in subtitle.warning_blocks:
                        subtitle.ad(block)
                    else:
                        subtitle.warn(block)
                else:
                    subtitle.warn(block)
                block.hints.append("wedged_block")
            continue

        if index == len(subtitle.blocks) - 1:
            # Last block: only the preceding neighbour exists.
            pre_block: SubBlock = subtitle.blocks[index - 1]
            if pre_block.regex_matches < 3:
                continue
            block.hints.append("wedged_block")
            if (block.start_time - pre_block.end_time) > timedelta(seconds=1):
                # Too far from the ad to escalate; just warn.
                subtitle.warn(block)
                continue

            if block in subtitle.warning_blocks:
                subtitle.ad(block)
            else:
                subtitle.warn(block)
            continue

        # Interior block: consider both neighbours.
        pre_block: SubBlock = subtitle.blocks[index - 1]
        post_block: SubBlock = subtitle.blocks[index + 1]

        if pre_block.regex_matches >= 3 and post_block.regex_matches >= 3:
            if (post_block.start_time - block.end_time) < timedelta(seconds=1) and \
                    (block.start_time - pre_block.end_time) < timedelta(seconds=1):
                # Tightly sandwiched between two ads: treat as an ad outright.
                subtitle.ad(block)
                block.hints.append("wedged_block")
                continue
            if block.regex_matches == 2:
                # Already borderline suspicious on its own merits.
                subtitle.ad(block)
                block.hints.append("wedged_block")
                continue
            else:
                subtitle.warn(block)
                block.hints.append("wedged_block")
                continue
from collections import defaultdict
import re

import libs.six as six

from .ngram import NGram


class LangProfile(object):
    '''N-gram frequency table for one language, used for training/detection.'''

    # An n-gram must exceed this count to survive omit_less_freq().
    MINIMUM_FREQ = 2
    # The pruning threshold also scales with corpus size: n_words[0] // LESS_FREQ_RATIO.
    LESS_FREQ_RATIO = 100000

    # Single Latin letter / contains-a-Latin-letter patterns, used to strip
    # Latin-alphabet noise from profiles of predominantly non-Latin scripts.
    ROMAN_CHAR_RE = re.compile(r'^[A-Za-z]$')
    ROMAN_SUBSTR_RE = re.compile(r'.*[A-Za-z].*')

    def __init__(self, name=None, freq=None, n_words=None):
        '''Create a profile.

        name    -- language name/code; None marks an inert placeholder
                   (add() and omit_less_freq() become no-ops).
        freq    -- optional initial {ngram: count} mapping.
        n_words -- optional per-length gram totals (index 0 = 1-grams, ...).
        '''
        self.freq = defaultdict(int)
        if freq is not None:
            self.freq.update(freq)

        if n_words is None:
            n_words = [0] * NGram.N_GRAM

        self.name = name
        self.n_words = n_words

    def add(self, gram):
        '''Add n-gram to profile.'''
        if self.name is None or gram is None:  # Illegal
            return
        length = len(gram)
        if length < 1 or length > NGram.N_GRAM:  # Illegal
            return
        self.n_words[length - 1] += 1
        self.freq[gram] += 1

    def omit_less_freq(self):
        '''Eliminate below less frequency n-grams and noise Latin alphabets.'''
        if self.name is None:  # Illegal
            return
        threshold = max(self.n_words[0] // self.LESS_FREQ_RATIO, self.MINIMUM_FREQ)

        roman = 0
        # list() copy: entries are deleted from self.freq while iterating.
        for key, count in list(six.iteritems(self.freq)):
            if count <= threshold:
                self.n_words[len(key)-1] -= count
                del self.freq[key]
            elif self.ROMAN_CHAR_RE.match(key):
                roman += count

        # roman check: when Latin single letters are less than a third of the
        # surviving 1-grams, treat every Latin-containing n-gram as noise.
        if roman < self.n_words[0] // 3:
            for key, count in list(six.iteritems(self.freq)):
                if self.ROMAN_SUBSTR_RE.match(key):
                    self.n_words[len(key)-1] -= count
                    del self.freq[key]

    def update(self, text):
        '''Update the language profile with (fragmented) text.
        Extract n-grams from text and add their frequency into the profile.
        '''
        if text is None:
            return
        text = NGram.normalize_vi(text)
        gram = NGram()
        for ch in text:
            gram.add_char(ch)
            # Harvest every 1..N-gram ending at this character.
            for n in range(1, NGram.N_GRAM+1):
                self.add(gram.get(n))
not log_dir.is_dir(): 53 | raise ValueError(f"log directory: {log_dir} is not a directory") 54 | log_file = log_dir.joinpath("subcleaner.log") 55 | 56 | relative_base = Path(cfg['SETTINGS'].get("relative_path_base", ".")) 57 | if not relative_base.is_absolute(): 58 | relative_base = Path.cwd().joinpath(relative_base) 59 | relative_base = relative_base.resolve() 60 | 61 | fix_overlaps = cfg['SETTINGS'].getboolean("fix_overlaps", True) 62 | 63 | default_language = cfg['SETTINGS'].get("default_language", "") 64 | if default_language in ["blank", "Blank", "", "empty", "Empty"]: 65 | default_language = None 66 | if default_language: 67 | if not languages.is_language(default_language): 68 | logger.error("Config error: default language code must be a valid ISO:639 language. Exiting") 69 | exit(1) 70 | 71 | use_english_on_all = cfg['SETTINGS'].getboolean("use_english_on_all", False) 72 | require_language_profile = cfg['SETTINGS'].getboolean("require_language_profile", True) 73 | -------------------------------------------------------------------------------- /regex_profiles/default/indonesian.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # Indonesian default config. 3 | 4 | # Coma delimited list of language codes associated with this language profile. 5 | # The script will run on all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = id 8 | 9 | 10 | 11 | # Information about how to configure the REGEX sections, read at the bottom of the file. 12 | # All regexes are case insensitive! 13 | [WARNING_REGEX] 14 | 15 | id_warn1: \b\b(iklan|situs|judi|slot|togel|bandar|deposit|cashback|donasi|donatur|penerjemah|subtitle|(di)?terjemah(an|kan))\b\b. 16 | id_warn2: \b(BNI|BRI|BCA|OVO)\b. 17 | id_warn3: \b(pasang|pemasangan|oleh|by|pulsa|AN|SK|S&K)\b. 18 | id_warn4: \b(line|instagram|ig|twitter|tg|telegram)\b. 
19 | id_warn6: \.(id|my) 20 | #regex#: Regex goes here. 21 | 22 | 23 | [PURGE_REGEX] 24 | 25 | id_purge1: IDFL|Lebah\s?Ganteng|Pein\s?Akatsuki 26 | id_purge2: trakteer|saweria|GOPAY 27 | id_purge3: (skype|line|instagram|ig|twitter|wa|whatsapp|tg|telegram)\s*&\s*(skype|line|instagram|ig|twitter|wa|whatsapp|tg|telegram) 28 | id_purge4: Alif\s?Fikri\s?Aulia|paint_lapain|EveryAgent|faridusman|NANOsubs|GradyNanoNano|Jackandthewilee 29 | id_purge5: alih tempo|alih bahasa|takarir|subtitel 30 | #regex#: Regex goes here. 31 | 32 | 33 | 34 | # 35 | # -----------------------------------------GUIDE------------------------------------------------- 36 | # 37 | 38 | # This language profile contains two lists of regex that will look for patterns. 39 | # If you wish to modify or remove any regex, feel free to do so, 40 | # but files in the default folder will be overwritten when you update the script. 41 | # You can add and remove keys as long as you don't use the same key twice. 42 | 43 | # WARNING_REGEX: 44 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 45 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 46 | # 1 warning is ignored. 47 | # 2 warnings will print the block as a WARNING in the log. 48 | # 3 warnings or more will remove the entire block. 49 | 50 | # PURGE_REGEX: 51 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 52 | 53 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 54 | # literal character you'll need to escape it with '\' 55 | # for example: matching "www." would require a regex like: "www\." 56 | # you can test regexes online on a regex-tester tool like https://regex101.com/ 57 | 58 | # Feel free to ask me any question on github.
59 | -------------------------------------------------------------------------------- /libs/langdetect/tests/test_detector.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import six 4 | 5 | from libs.langdetect.detector_factory import DetectorFactory 6 | from libs.langdetect.utils.lang_profile import LangProfile 7 | 8 | 9 | class DetectorTest(unittest.TestCase): 10 | TRAINING_EN = 'a a a b b c c d e' 11 | TRAINING_FR = 'a b b c c c d d d' 12 | TRAINING_JA = six.u('\u3042 \u3042 \u3042 \u3044 \u3046 \u3048 \u3048') 13 | JSON_LANG1 = '{"freq":{"A":3,"B":6,"C":3,"AB":2,"BC":1,"ABC":2,"BBC":1,"CBA":1},"n_words":[12,3,4],"name":"lang1"}' 14 | JSON_LANG2 = '{"freq":{"A":6,"B":3,"C":3,"AA":3,"AB":2,"ABC":1,"ABA":1,"CAA":1},"n_words":[12,5,3],"name":"lang2"}' 15 | 16 | def setUp(self): 17 | self.factory = DetectorFactory() 18 | 19 | profile_en = LangProfile('en') 20 | for w in self.TRAINING_EN.split(): 21 | profile_en.add(w) 22 | self.factory.add_profile(profile_en, 0, 3) 23 | 24 | profile_fr = LangProfile('fr') 25 | for w in self.TRAINING_FR.split(): 26 | profile_fr.add(w) 27 | self.factory.add_profile(profile_fr, 1, 3) 28 | 29 | profile_ja = LangProfile('ja') 30 | for w in self.TRAINING_JA.split(): 31 | profile_ja.add(w) 32 | self.factory.add_profile(profile_ja, 2, 3) 33 | 34 | def test_detector1(self): 35 | detect = self.factory.create() 36 | detect.append('a') 37 | self.assertEqual(detect.detect(), 'en') 38 | 39 | def test_detector2(self): 40 | detect = self.factory.create() 41 | detect.append('b d') 42 | self.assertEqual(detect.detect(), 'fr') 43 | 44 | def test_detector3(self): 45 | detect = self.factory.create() 46 | detect.append('d e') 47 | self.assertEqual(detect.detect(), 'en') 48 | 49 | def test_detector4(self): 50 | detect = self.factory.create() 51 | detect.append(six.u('\u3042\u3042\u3042\u3042a')) 52 | self.assertEqual(detect.detect(), 'ja') 53 | 54 | def test_lang_list(self): 55 | langlist = 
self.factory.get_lang_list() 56 | self.assertEqual(len(langlist), 3) 57 | self.assertEqual(langlist[0], 'en') 58 | self.assertEqual(langlist[1], 'fr') 59 | self.assertEqual(langlist[2], 'ja') 60 | 61 | def test_factory_from_json_string(self): 62 | self.factory.clear() 63 | profiles = [self.JSON_LANG1, self.JSON_LANG2] 64 | self.factory.load_json_profile(profiles) 65 | langlist = self.factory.get_lang_list() 66 | self.assertEqual(len(langlist), 2) 67 | self.assertEqual(langlist[0], 'lang1') 68 | self.assertEqual(langlist[1], 'lang2') 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Subcleaner 2 | Subcleaner is a python3 script for removing ads from .srt subtitle files. 3 | The script is more sophisticated than a simple search and delete per line 4 | and uses different regex profiles for different languages. 5 | Once the script has identified ad-blocks they get removed and the remaining blocks 6 | get re-indexed. 7 | 8 | Can clean entire libraries in recursive mode and works well with [Bazarr](https://github.com/morpheus65535/bazarr) 9 | directly installed or as a container from the [linuxserver/bazarr](https://hub.docker.com/r/linuxserver/bazarr) image. 10 | 11 | # Installing 12 | Cloning and running with python3 should work. 13 | 14 | ```cd /opt``` 15 | 16 | ```git clone https://github.com/KBlixt/subcleaner.git``` 17 | 18 | ```cd subcleaner``` 19 | 20 | Install the default config simply by running the script once or copy the default config into 21 | the script root directory. 22 | 23 | ```python3 ./subcleaner.py -h``` 24 | 25 | With the subcleaner.conf file installed you can modify the settings within it. 26 | The config file contains instructions about what each of the settings does.
27 | 28 | ## Bazarr 29 | Unlock the script's full potential by running it after downloading a subtitle from 30 | [Bazarr](https://github.com/morpheus65535/bazarr). Enable custom post-processing and use 31 | the command: 32 | 33 | ```python3 /opt/subcleaner/subcleaner.py "{{subtitles}}" -s``` (note the quotation) 34 | 35 | It should work 36 | right out the gate provided the paths and permissions are set up correctly. 37 | 38 | In the Bazarr log it should confirm that the script ran successfully or give you 39 | an error message that tells you what's wrong. If nothing is output then you've probably 40 | set the script path wrong. 41 | 42 | ## Docker 43 | 44 | If you run Bazarr in a docker container, as you should, 45 | make sure the Bazarr container has access to the script directory. Either 46 | mount /opt/subcleaner directly into the container as a volume or install the script inside 47 | the Bazarr config directory. 48 | 49 | I have verified that this works on the [linuxserver/bazarr](https://hub.docker.com/r/linuxserver/bazarr) image. 50 | 51 | # Languages: 52 | The script has a few language profiles included by default: 53 | 54 | - English 55 | - Spanish 56 | - Portuguese 57 | - Dutch 58 | - Indonesian 59 | - Swedish 60 | 61 | If you want to run the script against any other language you'll have to either create a profile for it 62 | or disable the requirement in the subcleaner.conf file. It's recommended to create 63 | a language profile. Read the README in the regex_profiles directory for more info and guidance. 64 | 65 | ### If you make a useful regex profile for a non-default language, PLEASE let me know! 66 | I'll review it and add it to the included default profiles. And it'll help out others that use 67 | that language in the future! :) 68 | 69 | __________________ 70 | 71 | 72 | # Thank you :) 73 | Please, if you find any issues or have any questions feel free to 74 | open an issue or discussion.
75 | 76 | __________________ 77 | ###### Future (possibly): 78 | 79 | * Automatic subtitle deletion if language don't match label. 80 | 81 | * better ui for confirming/reverting deletion of ads. 82 | 83 | * ASS support? 84 | 85 | -------------------------------------------------------------------------------- /regex_profiles/default/no_profile.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # default config that applies to any language that are missing an language profile. 3 | 4 | language_codes = no_profile 5 | 6 | 7 | 8 | # Information about how to configure the REGEX sections, read at the bottom of the file. 9 | # All regexes are case insensitive! 10 | [WARNING_REGEX] 11 | 12 | nop_warn1: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits|episode)\b 13 | nop_warn2: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?)\W+(by|from)\b 14 | 15 | nop_warn3: \b(broadcasting|UNiTED\W*TEAM|admitme|ragbear|looklive|Camikaze|SourGrass|mstoll|alire2a)\b 16 | nop_warn4: \b(normita|EhLaNa|playships|metamorfose|sunmenghao|nessundorma|Arun|seriestele|DarKsh|vothaison)\b 17 | nop_warn5: \b(anana|cRosKy|Aramis|misshu|Xenzai|KKB|ydy|swsub|divx|empiremedia|La Fabrique|benj)\b 18 | nop_warn6: \b(dawaith|MoSub|snuif|Golgi|Linwelin|Malikay|Ricana|Sadgeezer|argenteam|tiobetonh|chebinhdan)\b 19 | 20 | 21 | [PURGE_REGEX] 22 | 23 | nop_purge1: 
\b(caption(s|ed)?|subtitl(e|ed|es|ing)|fixed(?!-)|(re-?)?synch?(?!-)(ed|ro(nized)?)?|rip(ped)?(?!-)|translat(e|ed|ion|ions)|correct(ions|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|subs|provided|encoded|edit(ed|s)?)\W*(by|from)?\W*(:|;).. 24 | 25 | nop_purge2: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 26 | nop_purge3: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clear\W*way\W*law) 27 | nop_purge4: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 28 | nop_purge5: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clearway\W*law) 29 | 30 | 31 | 32 | 33 | # 34 | # -----------------------------------------GUIDE------------------------------------------------- 35 | # 36 | 37 | # This language profile contains two lists of regex that will look for patterns. 38 | # if you wish to modify or remove any regex, feel free to do so 39 | # but files in the default folder will be overwritten when you update the script. 40 | # You can add and remove keys as long as two keys don't use the same key twice. 41 | 42 | # WARNING_REGEX: 43 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 44 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 45 | # 1 warning is ignored 46 | # 2 warnings will be print the block as a WARNING in the log. 47 | # 3 warnings or more will remove the entire block. 48 | 49 | # PURGE_REGEX: 50 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 51 | 52 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 53 | # literal character you'll need to escape it with '\' 54 | # for example: matching "www." would require a regex like: "www\." 55 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 56 | 57 | # Feel free to ask me any question on github. 
58 | -------------------------------------------------------------------------------- /regex_profiles/default/hebrew.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # hebrew default config. 3 | 4 | # Coma delimited list of language codes associated with this language profile. 5 | # The script will run against all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = he, heb, hebrew 8 | 9 | 10 | 11 | # Information about how to configure the REGEX sections, read at the bottom of the file. 12 | # All regexes are case insensitive! 13 | [WARNING_REGEX] 14 | 15 | # מילים שיכולות להופיע גם בפרסומות וגם בדיבור של הסרט 16 | he_warning1: \b(גדעון|צפריר|צפייה מהנה|צפייה נעימה|(?:ו)?נערך|אלמוני|(?:ו)?הפקת|תהנו|דונקי|התרגום|ותיקן|אנונימי|תרגום|עריכה|מורידים|שלד|קרן)\b 17 | 18 | 19 | [PURGE_REGEX] 20 | 21 | #אתרי הורדת כתוביות 22 | he_purge1: \b(Ktuvit|Wizdom|SuperSubtitles|YIFY|Podnapisi|OpenSubtitles|Torec|Extreme|qsubs|imax|IMAX|extremesubs)\b 23 | #קבוצות מתורגמנים 24 | he_purge2: \b(Addic7ed|AnarKey|NDG STUDIOS|Donkey-Cr3w|Extreme|FaLse MeMories|GallifreySubs|HDSubs|Hebits|iSub|Ktuvit|LH|Qsubs|Sdarot|Sub-Faw|sub-lala|Subs|SubsCraft|SubsIL|Taxiron|TLMC|Torec|אולפנים|דורי מדיה אות|אולפני אלרום|פיוזר)\b 25 | #שמות מתורגמנים נפוצים 26 | he_purge3: \b(yoav1610|FK|elia|Godfather|TheRejector|scodoo2|Twilight|Yorai1212|HighLander|soprgal|ItayG|Acamol|qwer90|SnoWhite|נעמה זוהר|קופיקו הבלש|אבישג רז|עומר גפן|פיפו|dvodvo123|epitaph|yuvalh|XmonWoW|DrSub|Afenla|אלכסנדר פן|lala123|Hazy7868|glfinish|עדי-בלי-בצל|ddror|hamima|~Moshe~|Limor EM|דיויד סוויפט|glbegin|foxi9|Shaked7|Tornado|Nunia|rodney_mckay|BA07|Ariel046|Amir|Mozzie|Orpheus|אריאל אפרתי|ZIPC|שירין|iToch|R_A_7|WorkBook|GreenScorpion|נ\.א\.ש|Nobody|שוביקס|Eran-s|סטארבק|אסף פרץ|Outwit|E\.M|erez058|SHR|TOXIN|Idoideas|Hentaiman|RAMIRAMI68|kikmastr|subbie|TerryGoodkind|gil_m|בוביקו)\b 27 | #מילים 
שמיוחסות לבלוקי כתוביות 28 | he_purge4: \b(?:(?:ו)?תורג[מם]|(?:ו)?סונכר[נן]|(?:ו)?סונכרנו|(?:(?:ו|ה)?סנכרו[נן])|(?:ו)?ס[י]נכר[נן]|ונערך|משמיעה|(?:ו)?הגהה|(?:ו)?קודד(?:ו)?|הקידוד|(?:[וה]?תרגמו)|ותרגום|(?:(?:וה)|(?:ו)|(?:ה))?כתוביות)\b 29 | # מקרי קצה של מילים שמסתיימות בתו : 30 | he_purge5: (?<=\bעברית)(?=:)|(?<=:)(?=עברית\b)|(?<=\bתרגום)(?=:)|(?<=:)(?=תרגום\b) 31 | 32 | 33 | # 34 | # -----------------------------------------GUIDE------------------------------------------------- 35 | # 36 | 37 | # This language profile contains two lists of regex that will look for patterns. 38 | # if you wish to modify or remove any regex, feel free to do so 39 | # but files in the default folder will be overwritten when you update the script. 40 | # You can add and remove keys as long as two keys don't use the same key twice. 41 | 42 | # WARNING_REGEX: 43 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 44 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 45 | # 1 warning is ignored 46 | # 2 warnings will be print the block as a WARNING in the log. 47 | # 3 warnings or more will remove the entire block. 48 | 49 | # PURGE_REGEX: 50 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 51 | 52 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 53 | # literal character you'll need to escape it with '\' 54 | # for example: matching "www." would require a regex like: "www\." 55 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 56 | 57 | # Feel free to ask me any question on github. 58 | -------------------------------------------------------------------------------- /regex_profiles/default/english.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # English default config. 
3 | 4 | # Coma delimited list of language codes associated with this language profile. 5 | # The script will run against all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = en, eng, english 8 | 9 | 10 | 11 | # Information about how to configure the REGEX sections, read at the bottom of the file. 12 | # All regexes are case insensitive! 13 | [WARNING_REGEX] 14 | 15 | en_warn1: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion)|conform(ed|ing)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits|episode)\b 16 | en_warn2: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion)|conform(ed|ing)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?)\W+(by|from)\b 17 | 18 | en_warn3: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 19 | en_warn4: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clear\W*way\W*law) 20 | en_warn5: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 21 | en_warn6: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clearway\W*law) 22 | en_warn7: \b(broadcasting|UNiTED\W*TEAM|admitme|ragbear|looklive|Camikaze|Aramis|Arun|SG)\b 23 | 24 | en_warn8: English - 25 | en_warn9: English - 26 | 27 | 28 | [PURGE_REGEX] 29 | 30 | en_purge1: \b(caption(s|ed)?|subtitl(e|ed|es|ing)|fixed(?!-)|(re-?)?synch?(?!-)(ed|ro(nized)?)?|rip(ped)?(?!-)|translat(e|ed|ion|ions)|correct(ions|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|subs|provided|encoded|edit(ed|s)?)\W*(by|from)?\W*(:|;).. 
31 | en_purge2: ^present(s|ing)?:$ 32 | en_purge3: \b(KKB|EhLaNa|ydy|swsub|divx|playships|empiremedia|metamorfose|sunmenghao|nessundorma|vothaison)\b 33 | en_purge4: \b(anana|cRosKy|misshu|seriestele|DarKsh|Xenzai|argenteam|tiobetonh|chebinhdan)\b 34 | en_purge5: \b(normita|dawaith|MoSub|snuif|Golgi|Linwelin|Malikay|Ricana|Sadgeezer|SourGrass|mstoll|alire2a)\b 35 | en_purge6: \b(admit1\.app|4kvod\.tv)\b 36 | #en_purge#: Regex goes here. 37 | 38 | 39 | 40 | # 41 | # -----------------------------------------GUIDE------------------------------------------------- 42 | # 43 | 44 | # This language profile contains two lists of regex that will look for patterns. 45 | # if you wish to modify or remove any regex, feel free to do so 46 | # but files in the default folder will be overwritten when you update the script. 47 | # You can add and remove keys as long as two keys don't use the same key twice. 48 | 49 | # WARNING_REGEX: 50 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 51 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 52 | # 1 warning is ignored 53 | # 2 warnings will be print the block as a WARNING in the log. 54 | # 3 warnings or more will remove the entire block. 55 | 56 | # PURGE_REGEX: 57 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 58 | 59 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 60 | # literal character you'll need to escape it with '\' 61 | # for example: matching "www." would require a regex like: "www\." 62 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 63 | 64 | # Feel free to ask me any question on github. 
65 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/cleaner.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import timedelta 3 | from pathlib import Path 4 | from typing import * 5 | from libs.subcleaner.subtitle import Subtitle 6 | from libs.subcleaner.settings import args 7 | 8 | from . import detectors, punishers 9 | from ..sub_block import SubBlock 10 | 11 | ad_blocks: Dict[SubBlock, Set[Path]] = {} 12 | warning_blocks: Dict[SubBlock, Set[Path]] = {} 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def find_ads(subtitle: Subtitle) -> None: 18 | punishers.punish_regex_matches(subtitle) 19 | 20 | for block in subtitle.blocks: 21 | if block.regex_matches == 0: 22 | block.regex_matches = -1 23 | 24 | punishers.punish_quick_first_block(subtitle) 25 | punishers.punish_ad_adjacency(subtitle) 26 | punishers.punish_clone_blocks(subtitle) 27 | 28 | for block in subtitle.blocks: 29 | if block.regex_matches >= 3: 30 | subtitle.ad(block) 31 | elif block.regex_matches == 2: 32 | subtitle.warn(block) 33 | 34 | detectors.detect_wedged(subtitle) 35 | punishers.move_duplicated(subtitle) 36 | detectors.detect_chain(subtitle) 37 | 38 | 39 | def reset(): 40 | punishers.reset_duplicate() 41 | 42 | 43 | def remove_ads(subtitle: Subtitle): 44 | if args.sensitive and len(subtitle.blocks) > 1: 45 | subtitle.warn(subtitle.blocks[0]) 46 | subtitle.warn(subtitle.blocks[-1]) 47 | 48 | for i in range(1, len(subtitle.blocks) - 1): 49 | prev_block = subtitle.blocks[i - 1] 50 | block = subtitle.blocks[i] 51 | next_block = subtitle.blocks[i + 1] 52 | if prev_block in subtitle.ad_blocks or next_block in subtitle.ad_blocks: 53 | subtitle.warn(block) 54 | 55 | for block in subtitle.ad_blocks: 56 | try: 57 | subtitle.blocks.remove(block) 58 | if "-->" in block.content: 59 | logger.warning(f"potential malformed subtitle blocks in removed block 
{block.original_index}.") 60 | except ValueError: 61 | pass 62 | for e_block in ad_blocks: 63 | if e_block.clean_content == block.clean_content: 64 | ad_blocks[e_block].add(subtitle.short_path) 65 | break 66 | else: 67 | ad_blocks[block] = {subtitle.short_path} 68 | 69 | for block in subtitle.warning_blocks: 70 | for e_block in warning_blocks: 71 | if e_block.clean_content == block.clean_content: 72 | warning_blocks[e_block].add(subtitle.short_path) 73 | break 74 | else: 75 | warning_blocks[block] = {subtitle.short_path} 76 | 77 | subtitle.reindex() 78 | 79 | 80 | def fix_overlap(subtitle: Subtitle): 81 | if len(subtitle.blocks) < 2: 82 | return False 83 | changes = False 84 | previous_block = subtitle.blocks[0] 85 | for block in subtitle.blocks[1:]: 86 | if not (previous_block.start_time < block.start_time and previous_block.end_time < block.end_time): 87 | previous_block = block 88 | continue 89 | 90 | overlap = previous_block.end_time - block.start_time + timedelta(seconds=3 / 30) 91 | if timedelta(milliseconds=3) < overlap and (len(block.content) + len(previous_block.content)) > 0: 92 | content_ratio = block.duration_seconds / (block.duration_seconds + previous_block.duration_seconds) 93 | block.start_time += content_ratio * overlap 94 | previous_block.end_time += (content_ratio - 1) * overlap 95 | changes = True 96 | 97 | previous_block = block 98 | return changes 99 | 100 | 101 | def unscramble(subtitle: Subtitle): 102 | subtitle.blocks.sort(key=lambda x: x.start_time) 103 | for block in subtitle.blocks.copy(): 104 | if block.duration_seconds <= 0: 105 | subtitle.ad(block) 106 | block.hints.append("negative_duration") 107 | subtitle.blocks.remove(block) 108 | subtitle.reindex() 109 | -------------------------------------------------------------------------------- /libs/subcleaner/report_generator.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import * 3 | 4 | from 
libs.subcleaner.cleaner import cleaner 5 | from libs.subcleaner.settings import args, config 6 | from libs.subcleaner.sub_block import SubBlock 7 | from libs.subcleaner.subtitle import Subtitle 8 | 9 | _report_base = " | " 10 | _report: str 11 | 12 | 13 | def generate_report(subtitle: Subtitle) -> str: 14 | _reset() 15 | _add(f"{len(subtitle.ad_blocks)} deleted blocks and {len(subtitle.warning_blocks)} warnings remaining.") 16 | 17 | if subtitle.ad_blocks: 18 | _add("") 19 | _add(_deleted_card(subtitle.ad_blocks), " " * 4) 20 | if subtitle.warning_blocks and not args.errors_only: 21 | _add("") 22 | _add(_warning_card(subtitle.warning_blocks), " " * 40) 23 | _add("") 24 | _add("To delete all remaining warnings run:") 25 | _add(f"python3 '{config.script_file}' '{subtitle.short_path}' --destroy {' '.join(subtitle.get_warning_indexes())}") 26 | 27 | return _report[1:] 28 | 29 | 30 | def generate_end_report() -> str: 31 | _reset() 32 | _add("") 33 | _add(_end_deleted_card(cleaner.ad_blocks), " " * 4) 34 | _add("") 35 | _add(_end_warning_card(cleaner.warning_blocks), " " * 40) 36 | _add("") 37 | return _report[1:] 38 | 39 | 40 | def _add(lines: str, spacer: str = "") -> None: 41 | lines = "\n" + lines 42 | 43 | global _report 44 | _report += lines.replace("\n", f"\n{_report_base}{spacer}") 45 | 46 | 47 | def _reset() -> None: 48 | global _report 49 | _report = "" 50 | 51 | 52 | def _deleted_card(ad_blocks: Set[SubBlock]) -> str: 53 | ad_blocks_list = list(ad_blocks) 54 | ad_blocks_list.sort(key=lambda b: b.original_index) 55 | card = "[---------Removed Blocks----------]\n" 56 | for block in ad_blocks_list: 57 | card += f"{block.original_index}\n" 58 | card += f"{block}\n" 59 | if args.explain: 60 | card += f"reasons: ({', '.join(block.hints)})\n" 61 | card += "\n" 62 | card = card[:-1] + "[---------------------------------]" 63 | return card 64 | 65 | 66 | def _warning_card(warning_blocks: Set[SubBlock]) -> str: 67 | warning_blocks_list = list(warning_blocks) 68 | 
warning_blocks_list.sort(key=lambda b: b.current_index) 69 | card = "[---------Warning Blocks----------]\n" 70 | for block in warning_blocks_list: 71 | card += f"{block.current_index}\n" 72 | card += f"{block}\n" 73 | if args.explain: 74 | card += f"reasons: ({', '.join(block.hints)})\n" 75 | card += "\n" 76 | card = card[:-1] + "[---------------------------------]" 77 | return card 78 | 79 | 80 | def _end_deleted_card(ad_blocks: Dict[SubBlock, Set[Path]]) -> str: 81 | 82 | ad_blocks_list = list((key, value) for key, value in ad_blocks.items()) 83 | ad_blocks_list.sort(key=lambda b: len(b[1])) 84 | card = "[---------All Removed Blocks----------]\n" 85 | for block in ad_blocks_list: 86 | if len(block[1]) > 4: 87 | continue 88 | if 0 == block[0].regex_matches or block[0].regex_matches > 9: 89 | continue 90 | 91 | card += f"{block[0].original_index}\n" 92 | card += f"{block[0]}\n" 93 | if args.explain: 94 | card += f"reasons: ({', '.join(block[0].hints)})\n" 95 | card += "subtitles: \n" + "\n".join(map(str, block[1])) + "\n" 96 | card += "\n" 97 | card = card[:-1] + "[---------------------------------]" 98 | return card 99 | 100 | 101 | def _end_warning_card(warning_blocks: Dict[SubBlock, Set[Path]]) -> str: 102 | ad_blocks_list = list((key, value) for key, value in warning_blocks.items()) 103 | ad_blocks_list.sort(key=lambda b: len(b[1]), reverse=True) 104 | card = "[---------All Warning Blocks----------]\n" 105 | for block in ad_blocks_list: 106 | if len(block[1]) < 2: 107 | continue 108 | card += f"{block[0].original_index}\n" 109 | card += f"{block[0]}\n" 110 | if args.explain: 111 | card += f"reasons: ({', '.join(block[0].hints)})\n" 112 | card += "subtitles: \n" + "\n".join(map(str, block[1])) + "\n" 113 | card += "\n" 114 | card = card[:-1] + "[---------------------------------]" 115 | return card 116 | -------------------------------------------------------------------------------- /regex_profiles/default/svenska.conf: 
-------------------------------------------------------------------------------- 1 | [META] 2 | # Swedish default config. 3 | 4 | # Coma delimited list of language codes associated with this language profile. 5 | # The script will run on all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = sv, sve, svenska 8 | 9 | 10 | 11 | # Information about how to configure the REGEX sections, read at the bottom of the file. 12 | # All regexes are case insensitive! 13 | [WARNING_REGEX] 14 | 15 | sv_warn1: \b(kompletterad|(under)?text(ning|er)?|sångtext(er)?|(om-?)?syn[ck](ning|ad)?|övers[aä]tt(ning)?|distribution|Ansvarig utgivare|rätt(ning|ad)|regi|Läppsynk|episode?)\b 16 | sv_warn2: \b(kompletterad|(under)?text(ning|er)?|sångtext(er)?|(om-?)?syn[ck](ning|ad)?|övers[aä]tt(ning)?|distribution|Ansvarig utgivare|rätt(ning|ad)|regi|Läppsynk)\W+(av|från)\b 17 | sv_warn3: \.(se|nu)\b 18 | 19 | sv_warn5: \b(anana|present)\b 20 | #regex#: Regex goes here. 
21 | sv_warn6: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)synch?(ed|ro(nized)?)?|(re-?)?synch?(ed|ro(nized)?)|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)|provided|supported|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits)\b 22 | sv_warn7: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)synch?(ed|ro(nized)?)?|(re-?)?synch?(ed|ro(nized)?)|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)|provided|supported|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits)\b 23 | 24 | sv_warn8: \b(Incubator|FRiEND|Swedish|TuX|eXz|Aramis|TAZ)\b 25 | sv_warn9: \b(Incubator|FRiEND|Swedish|TuX|eXz|TAZ)\b 26 | 27 | 28 | [PURGE_REGEX] 29 | sv_purge1: \b(kompletterad|(under)?text(ning|er)?|sångtext(er)?|(om-?)?syn[ck](ning|ad)?|övers[aä]tt(ning)?|distribution|Ansvarig utgivare|rätt(ning|ad)|regi|Läppsynk)\W*(av|från)?\W*(:|;).. 
30 | 31 | sv_purge2: \b(Annonsera din produkt|bli en VIP-medlem|de bästa undertexter)\b 32 | sv_purge3: \b(Svensk Medietext|NORDiC RETAiL!|Swesub|Pictures AB|Scandinavian Text|[oö]versattargruppen|Mediatextgruppen)\b 33 | sv_purge4: \b(Bubba67|Dream_Theater|nordicbits|undertexter.se|stoffinho17|simontax|Sweden AB)\b 34 | sv_purge5: \b(StoraStyggaVargen|sdi.?media)\b 35 | sv_purge6: \b(Team Wild Animais Only Relesed|SDI Media|jaymz007|queen-ingela|Iyuno-SDI|Imposter10)\b 36 | 37 | sv_purge7: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 38 | sv_purge8: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clear\W*way\W*law) 39 | sv_purge9: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 40 | sv_purge10: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clearway\W*law) 41 | sv_purge11: \b(broadcasting|UNiTED\W*TEAM|admitme|ragbear|looklive|Camikaze)\b 42 | sv_purge12: \b(KKB|EhLaNa|ydy|swsub|divx|playships|empiremedia|metamorfose|sunmenghao|nessundorma|vothaison)\b 43 | sv_purge13: \b(cRosKy|misshu|Arun|seriestele|Sadgeezer|taureane)\b 44 | sv_purge14: \b(normita|dawaith|MoSub|snuif|Golgi|Linwelin|Malikay|Ricana|DarKsh|Xenzai|argenteam|tiobetonh|chebinhdan)\b 45 | sv_purge15: s([äa]song)?\W*\d+[^,]\W*e(pisod)?\W*\d+[^,] 46 | #regex#: Regex goes here. 47 | 48 | 49 | # 50 | # -----------------------------------------GUIDE------------------------------------------------- 51 | # 52 | 53 | # This language profile contains two lists of regex that will look for patterns. 54 | # if you wish to modify or remove any regex, feel free to do so 55 | # but files in the default folder will be overwritten when you update the script. 56 | # You can add and remove keys as long as two keys don't use the same key twice. 57 | 58 | # WARNING_REGEX: 59 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 60 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 
class SubBlock:
    """A single .srt subtitle block: an index line, a timing header, and content lines."""

    original_index: int   # index as read from the file, or the positional fallback
    current_index: int    # set externally after renumbering; not assigned in __init__
    content: str          # raw text lines of the block, newline-joined
    clean_content: str    # content with whitespace/punctuation stripped, for comparisons
    start_time: datetime.timedelta
    end_time: datetime.timedelta
    regex_matches = 0     # class-level default; per-instance count of regex hits
    hints: List[str]      # detector/punisher annotations attached during cleaning

    def __init__(self, block_content: str, original_index_actual: int):
        """Parse one block of srt text.

        :param block_content: raw text of the block (index, timing header, content).
        :param original_index_actual: positional fallback used when the block carries
            no parseable numeric index line.
        :raises ParsingException: when the timing header is missing or unparseable.
        """
        lines = block_content.strip().split("\n")

        # Block starts directly with the timing header (index line missing):
        # prepend an empty index line so the parsing below stays uniform.
        if self.is_sub_block_header(lines[0]) and len(lines) > 1 and not self.is_sub_block_header(lines[1]):
            lines = [""] + lines

        if lines[0].isnumeric():
            self.original_index = int(lines[0])
        else:
            # Salvage a leading run of digits from a noisy index line (e.g. "12a").
            number = ""
            for character in lines[0]:
                if character.isnumeric():
                    number += character
                else:
                    break
            if number:
                self.original_index = int(number)
            else:
                self.original_index = original_index_actual

        if len(lines) < 2 or not self.is_sub_block_header(lines[1]):
            raise ParsingException(self.original_index, "incorrectly formatted subtitle block")

        times = lines[1].replace(" ", "").split("-->")
        try:
            self.start_time = time_string_to_timedelta(times[0])
            self.end_time = time_string_to_timedelta(times[1])
        except ValueError:
            # time fields out of range or not numeric
            raise ParsingException(self.original_index, "failed to parse timeframe.")
        except IndexError:
            # fewer than two "-->"-separated timestamps
            raise ParsingException(self.original_index, "failed to parse timeframe.")

        if len(lines) > 2:
            self.content = "\n".join(lines[2:]).strip()
        else:
            self.content = ""
        # NOTE(review): the original replaces a raw control character (it renders as a
        # line break in this dump, presumably a stray carriage return) with "\n" —
        # confirm the exact character against the repository source.
        self.content = self.content.replace("\r", "\n")
        self.clean_content = re.sub("[\\s.,:_-]", "", self.content)
        self.hints = []

    def equal_content(self, block: "SubBlock") -> bool:
        """True when both blocks carry the same text once whitespace and
        punctuation ([\\s.,:_-]) are removed."""
        t = re.sub("[\\s.,:_-]", "", self.content)
        o = re.sub("[\\s.,:_-]", "", block.content)
        return t == o

    def __str__(self) -> str:
        # Renders the block WITHOUT its index line; the owning Subtitle is
        # responsible for numbering on write-out.
        string = f"{timedelta_to_time_string(self.start_time)} --> {timedelta_to_time_string(self.end_time)}\n" \
                 f"{self.content}"
        return string

    @classmethod
    def is_sub_block_header(cls, line: str) -> bool:
        """True when the line looks like a "start --> end" srt timing header."""
        if "\n" in line:
            return False

        times = line.replace(" ", "").split("-->")
        if len(times) < 2:
            return False
        try:
            time_string_to_timedelta(times[0])
            time_string_to_timedelta(times[1])
        except ValueError:
            return False
        except IndexError:
            return False

        return True

    @property
    def duration_seconds(self) -> float:
        """On-screen duration of the block, in seconds."""
        return (self.end_time - self.start_time).total_seconds()
def time_string_to_timedelta(time_string: str) -> datetime.timedelta:
    """Convert an SRT-style timestamp ("HH:MM:SS,mmm") into a timedelta.

    Commas are treated as decimal points and spaces are ignored. Stray
    non-numeric characters in the seconds field are dropped (keeping only
    the first decimal point).

    :raises ValueError: when minutes/seconds are >= 60 or not parseable.
    :raises IndexError: when fewer than three ':'-separated fields exist.
    """
    fields = time_string.replace(",", ".").replace(" ", "").split(":")

    hours = float(fields[0])
    minutes = float(fields[1])
    raw_seconds = fields[2][:6]  # at most "SS.mmm" — extra precision is discarded

    # Keep digits and the first '.' only; anything else is noise.
    kept = []
    dot_seen = False
    for ch in raw_seconds:
        if ch.isnumeric():
            kept.append(ch)
        elif ch == "." and not dot_seen:
            dot_seen = True
            kept.append(ch)
    seconds = float("".join(kept))

    if seconds >= 60 or minutes >= 60:
        raise ValueError()

    return datetime.timedelta(hours=hours,
                              minutes=minutes,
                              seconds=seconds)


def timedelta_to_time_string(timedelta: datetime.timedelta) -> str:
    """Render a timedelta as an SRT timestamp ("HH:MM:SS,mmm")."""
    rendered = str(timedelta)  # "H:MM:SS" or "H:MM:SS.ffffff"
    if "." in rendered:
        # Truncate microseconds to milliseconds and swap the decimal separator.
        rendered = rendered[:-3].replace(".", ",")
    else:
        rendered = f"{rendered},000"
    # Left-pad so a single-digit hour becomes two digits ("0H:MM:SS,mmm").
    return rendered.zfill(12)
class DetectorFactory(object):
    '''
    Language Detector Factory Class.

    This class manages an initialization and constructions of Detector.

    Before using language detection library,
    load profiles with DetectorFactory.load_profile(str)
    and set initialization parameters.

    When the language detection,
    construct Detector instance via DetectorFactory.create().
    See also Detector's sample code.
    '''
    # Optional RNG seed shared by all detectors created from this factory.
    seed = None

    def __init__(self):
        # word n-gram -> list of per-language probabilities (parallel to self.langlist).
        self.word_lang_prob_map = {}
        # Language names, in the order their profiles were added.
        self.langlist = []

    def load_profile(self, profile_directory):
        """Load every language profile (one JSON file per language) from a directory."""
        list_files = os.listdir(profile_directory)
        if not list_files:
            raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Not found profile: ' + profile_directory)

        # NOTE(review): langsize counts ALL directory entries, including hidden or
        # non-file ones that are skipped below, so probability vectors may carry
        # unused trailing zeros — presumably harmless to Detector; confirm upstream.
        langsize, index = len(list_files), 0
        for filename in list_files:
            if filename.startswith('.'):
                continue
            filename = path.join(profile_directory, filename)
            if not path.isfile(filename):
                continue

            f = None
            try:
                if sys.version_info[0] < 3:
                    f = open(filename, 'r')
                else:
                    f = open(filename, 'r', encoding='utf-8')
                json_data = json.load(f)
                profile = LangProfile(**json_data)
                self.add_profile(profile, index, langsize)
                index += 1
            except IOError:
                raise LangDetectException(ErrorCode.FileLoadError, 'Cannot open "%s"' % filename)
            except:
                # Bare except kept as-is (vendored library code): any parse/constructor
                # failure is reported uniformly as a profile format error.
                raise LangDetectException(ErrorCode.FormatError, 'Profile format error in "%s"' % filename)
            finally:
                if f:
                    f.close()

    def load_json_profile(self, json_profiles):
        """Load profiles from a list of JSON strings instead of files."""
        langsize, index = len(json_profiles), 0
        if langsize < 2:
            raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need more than 2 profiles.')

        for json_profile in json_profiles:
            try:
                json_data = json.loads(json_profile)
                profile = LangProfile(**json_data)
                self.add_profile(profile, index, langsize)
                index += 1
            except:
                # Bare except kept as-is (vendored library code).
                raise LangDetectException(ErrorCode.FormatError, 'Profile format error.')

    def add_profile(self, profile, index, langsize):
        """Register one LangProfile at position `index` of the probability vectors."""
        lang = profile.name
        if lang in self.langlist:
            raise LangDetectException(ErrorCode.DuplicateLangError, 'Duplicate the same language profile.')
        self.langlist.append(lang)

        for word in profile.freq:
            if word not in self.word_lang_prob_map:
                self.word_lang_prob_map[word] = [0.0] * langsize
            length = len(word)
            # Only 1- to 3-gram frequencies participate in detection.
            if 1 <= length <= 3:
                # 1.0 * forces float division (this code also targets Python 2).
                prob = 1.0 * profile.freq.get(word) / profile.n_words[length - 1]
                self.word_lang_prob_map[word][index] = prob

    def clear(self):
        """Drop all loaded profiles and probabilities."""
        self.langlist = []
        self.word_lang_prob_map = {}

    def create(self, alpha=None):
        '''Construct Detector instance with smoothing parameter.'''
        detector = self._create_detector()
        if alpha is not None:
            detector.set_alpha(alpha)
        return detector

    def _create_detector(self):
        if not self.langlist:
            raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need to load profiles.')
        return Detector(self)

    def set_seed(self, seed):
        """Fix the RNG seed so detection results become reproducible."""
        self.seed = seed

    def get_lang_list(self):
        # Returns a copy so callers cannot mutate internal state.
        return list(self.langlist)


PROFILES_DIRECTORY = path.join(path.dirname(__file__), 'profiles')
# Lazily-created singleton behind the module-level detect()/detect_langs() helpers.
_factory = None

def init_factory():
    """Create and load the global factory on first use (idempotent)."""
    global _factory
    if _factory is None:
        _factory = DetectorFactory()
        _factory.load_profile(PROFILES_DIRECTORY)

def detect(text):
    """Return the single most likely language code for `text`."""
    init_factory()
    detector = _factory.create()
    detector.append(text)
    return detector.detect()


def detect_langs(text):
    """Return candidate languages for `text` with their probabilities."""
    init_factory()
    detector = _factory.create()
    detector.append(text)
    return detector.get_probabilities()
7 | language_codes = es, spa, spanish 8 | 9 | # Information about how to configure the REGEX sections, read at the bottom of the file. 10 | # All regexes are case insensitive! 11 | [WARNING_REGEX] 12 | 13 | es_warn1: \b(creado(s)?|subtitu(lo|los|lado|lada|lados)|subtítu(lo|los|lado|lada|lados)|descarg(ado|ar)|(re-?)?sinc(ed|ro(nizado|nizados|nizacion|nización)?)?|modific(ado|ados|ion|iones|ión|iónes)|traduc(e|ido|idos|tora|cion|ciones|ción|ciónes)|correcc(iones|ion|ión|iónes)|correg(ir|ido|idos)|transcri(bido|pcion|pciones|pción|pciónes)|mejor(ado|amientos)|adaptado|ripeo|arreglos)\b 14 | es_warn2: \b(creado(s)?|subtitu(lo|los|lado|lada|lados)|subtítu(lo|los|lado|lada|lados)|descarg(ado|ar)|(re-?)?sinc(ed|ro(nizado|nizados|nizacion|nización)?)?|modific(ado|ados|ion|iones|ión|iónes)|traduc(e|ido|idos|tora|cion|ciones|ción|ciónes)|correcc(iones|ion|ión|iónes)|correg(ir|ido|idos)|transcri(bido|pcion|pciones|pción|pciónes)|mejor(ado|amientos)|adaptado|ripeo|arreglos|subs|hecha)\W+(por|de|by)\b 15 | es_warn3: \b(traduc(e|ido|idos|tora|cion|ciones|ción|ciónes)|transcri(bido|pcion|pciones|pción|pciónes)|subtitu(lo|los|lado|lada|lados)|subtítu(lo|los|lado|lada|lados))\W+(al|en)\b 16 | 17 | es_warn4: spanish ?(-|]|\/) 18 | es_warn5: \b(spanish|latino|espanol|español|castilian|latin american|castellano)\b 19 | es_warn6: latin american ?(-|]|\/) 20 | es_warn7: castilian ?(-|]|\/) 21 | 22 | es_warn8:\b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits|episode)\b 23 | 
es_warn9:\b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits|episode)\b 24 | 25 | es_warn10: \b(Episódio|MKV|youtube|Facebook|Instagram|Twitter|Whatsapp|Tiktok)\b 26 | 27 | #Common Spanish Translator names - warnings as possibly could appear in a subtitle 28 | es_warn11: \b(Juan Vera|Juan Rico|George Denbrough|Giovanni Mion|Walter Leonard|Richard Bates|Francesc Aloy Bonet|Pilar González Dueñas|Mario Pérez|Paula Mariani|Philipp Schmidt|Hans Santos|Eric Escribano Barreiro)\b 29 | 30 | [PURGE_REGEX] 31 | 32 | es_purge1: \b(creado(s)?|subtitu(lo|los|lado|lada|lados)|subtítu(lo|los|lado|lada|lados)|descarg(ado|ar)|(re-?)?sinc(?!-)(ed|ro(nizado|nizados|nizacion|nización)?)?|modific(ado|ados|ion|iones|ión|iónes)|traduc(e|ido|idos|tora|cion|ciones|ción|ciónes)|correcc(iones|ion|ión|iónes)|correg(ir|ido|idos)|transcri(bido|pcion|pciones|pción|pciónes)|mejor(ado|amientos)|adaptado|ripeo|arreglos|subs|hecha)\W*(por|de|by)?\W*(:|;) 33 | 34 | #Spanish Sub websites 35 | es_purge2: \b(admitme|argenteam|finalanime|subtitulamos|substeam|subdivx|tusubtitulo|thesubfactory|Open Subtitles|miembro VIP|osdb\.link|TranslatorsInc|Translators, Inc|TranslatorslncSubs\.blogspot\.com\.es|Southparkspanish|SUBTITULOS\.es|SUBITULOS\.es|SouthParkNews\.net|subtitules\.es|ShooCat|YYeTs|TaMaBin|P@bs|gratispeliculas|SubAdictos|SerieCanal|playships\.eu|tusseries\.com|subswiki\.com|Subs-Team|SUBTÍTULOS\.ES|U\-Sub\.net)\b 36 | 37 | #Spanish translation websites / groups 38 | es_purge3: 
\b(Visiontext|Filmtrans|CARLISHIO|HGWizard|LASERFILM|Fhercho06|Cinesotano|jantoniot|Caichac|cemauli|Drakul|Scarlata|laloonda|japezoa|MarcusL|Kikeguate|KIKEGT|Zagon|KingCreole|Mothernatura|MaLTRaiN|FRH|GCas87|maryluzesp|Marenys|ByAlbis02|ana24horas|Fernando355|Zagonsubs|ikerslot|menoyos|Axel7902|vNaru|livinginthepast|patagonikus|Macias Group|EasyTechOficial|mlmlte|LiarsTeam|OnceUponATEAM)\b 39 | es_purge4: \b(juanchojb|shogun87|Rocio190889|darklin01|R@ul|Mabeas|akallabeth|NicoDipaolo|OsirisTSF|Lord Avestruz|LadyJenny|jeslil7|Giobatta SA|MementMori|la_bestia1962|Natuchia|JJ Porto|marchelo64|c\. oper|SHADOW84\Anfegopi|perroubuntero|Kumara|JosephPools|natycuac|ibvil|SwSub|DarKsh|ShalimarFox|R\[H\]ésus AB\+ Team|Mat Productions|S\. C\. Bananas|Bakugan|M-Rok|YYeTTs|robermgs)\b 40 | 41 | # 42 | # -----------------------------------------GUIDE------------------------------------------------- 43 | # 44 | 45 | # This language profile contains two lists of regex that will look for patterns. 46 | # if you wish to modify or remove any regex, feel free to do so 47 | # but files in the default folder will be overwritten when you update the script. 48 | # You can add and remove keys as long as two keys don't use the same key twice. 49 | 50 | # WARNING_REGEX: 51 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 52 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 53 | # 1 warning is ignored 54 | # 2 warnings will be print the block as a WARNING in the log. 55 | # 3 warnings or more will remove the entire block. 56 | 57 | # PURGE_REGEX: 58 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 59 | 60 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 61 | # literal character you'll need to escape it with '\' 62 | # for example: matching "www." 
would require a regex like: "www\." 63 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 64 | 65 | # Feel free to ask me any question on github. 66 | -------------------------------------------------------------------------------- /libs/subcleaner/regex_lists.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import re 3 | from pathlib import Path 4 | from typing import List, Dict, Tuple, Pattern 5 | 6 | from libs.subcleaner.settings import config 7 | import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | global_profiles: List["GlobalProfile"] = [] 12 | purge_regex: Dict[str, List[Tuple[str, Pattern]]] = {} 13 | warning_regex: Dict[str, List[Tuple[str, Pattern]]] = {} 14 | 15 | 16 | def language_has_profile(language: str): 17 | return language in purge_regex 18 | 19 | 20 | def get_purge_regex(language: str) -> List[Tuple[str, Pattern]]: 21 | if language in purge_regex: 22 | return purge_regex[language] 23 | return purge_regex["no_profile"] 24 | 25 | 26 | def get_warning_regex(language: str) -> List[Tuple[str, Pattern]]: 27 | if language in warning_regex: 28 | return warning_regex[language] 29 | return warning_regex["no_profile"] 30 | 31 | 32 | class GlobalProfile: 33 | excluded_languages: List[str] 34 | purge_regex_lines: List[Tuple[str, Pattern]] 35 | warning_regex_lines: List[Tuple[str, Pattern]] 36 | 37 | def __init__(self, parser, default: bool) -> None: 38 | self.purge_regex_lines = [] 39 | self.warning_regex_lines = [] 40 | 41 | for key, value in list(parser["PURGE_REGEX"].items()): 42 | if not default: 43 | key = key + "*" 44 | value = f"({value})" 45 | compiled_regex = re.compile(value, flags=re.IGNORECASE | re.UNICODE) 46 | self.purge_regex_lines.append((key, compiled_regex)) 47 | for key, value in list(parser["WARNING_REGEX"].items()): 48 | if not default: 49 | key = key + "*" 50 | value = f"({value})" 51 | compiled_regex = re.compile(value, 
flags=re.IGNORECASE | re.UNICODE) 52 | self.warning_regex_lines.append((key, compiled_regex)) 53 | 54 | self.excluded_languages = parser["META"].get("excluded_language_codes", "").replace(" ", "").split(",") 55 | for language in self.excluded_languages: 56 | if not language: 57 | self.excluded_languages.remove(language) 58 | 59 | for language in purge_regex: 60 | if any(language == excluded_language for excluded_language in self.excluded_languages): 61 | continue 62 | purge_regex[language] += self.purge_regex_lines 63 | warning_regex[language] += self.warning_regex_lines 64 | 65 | 66 | def _load_profile(profile_file: Path, default: bool = True) -> None: 67 | parser = configparser.ConfigParser() 68 | 69 | try: 70 | parser.read(profile_file, encoding="utf-8") 71 | 72 | languages = parser["META"].get("language_codes", "").replace(" ", "") 73 | 74 | if "excluded_language_codes" in parser["META"].keys() or not languages: 75 | global_profiles.append(GlobalProfile(parser, default)) 76 | return 77 | if config.use_english_on_all and default and profile_file.name == "english.conf": 78 | global_profiles.append(GlobalProfile(parser, default)) 79 | for language in languages.split(","): 80 | if language not in purge_regex: 81 | _create_language(language) 82 | return 83 | 84 | for language in languages.split(","): 85 | if language not in purge_regex: 86 | _create_language(language) 87 | for key, value in list(parser["PURGE_REGEX"].items()): 88 | if not default: 89 | key = key + "*" 90 | value = f"({value})" 91 | compiled_regex = re.compile(value, flags=re.IGNORECASE | re.UNICODE) 92 | purge_regex[language].append((key, compiled_regex)) 93 | for key, value in list(parser["WARNING_REGEX"].items()): 94 | if not default: 95 | key = key + "*" 96 | value = f"({value})" 97 | compiled_regex = re.compile(value, flags=re.IGNORECASE | re.UNICODE) 98 | warning_regex[language].append((key, compiled_regex)) 99 | 100 | except Exception: 101 | logger.error(f"Incorrectly configured regex language 
profile: {profile_file.name}") 102 | exit(1) 103 | 104 | 105 | def _create_language(language: str) -> None: 106 | purge_regex[language] = [] 107 | warning_regex[language] = [] 108 | 109 | for global_profile in global_profiles: 110 | if any(language == excluded_language for excluded_language in global_profile.excluded_languages): 111 | continue 112 | purge_regex[language] += global_profile.purge_regex_lines 113 | warning_regex[language] += global_profile.warning_regex_lines 114 | 115 | 116 | def _load_regex(): 117 | for default_profile_file in config.default_regex_dir.iterdir(): 118 | if default_profile_file.is_file() and not default_profile_file.name.startswith(".") and default_profile_file.suffix == ".conf": 119 | for profile_file in config.regex_dir.iterdir(): 120 | 121 | if default_profile_file.name == profile_file.name: 122 | _load_profile(profile_file) 123 | break 124 | else: 125 | _load_profile(default_profile_file, default=True) 126 | for profile_file in config.regex_dir.iterdir(): 127 | if profile_file.is_file() and not profile_file.name.startswith(".") and profile_file.suffix == ".conf": 128 | for default_profile_file in config.default_regex_dir.iterdir(): 129 | 130 | if default_profile_file.name == profile_file.name: 131 | break 132 | else: 133 | _load_profile(profile_file) 134 | 135 | 136 | _load_regex() 137 | -------------------------------------------------------------------------------- /regex_profiles/default/global.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # 3 | # -------------------------------------------------------------------------------------------- 4 | # This is a global language profile! It will run against all language codes unless they are excluded! 5 | # -------------------------------------------------------------------------------------------- 6 | # 7 | 8 | # Coma delimited list of language codes to not run the global regex config against. 
9 | # The script will also ignore any sub-labels like forced as long as they match the language code. 10 | excluded_language_codes = 11 | 12 | 13 | 14 | # Information about how to configure the REGEX sections, read at the bottom of the file. 15 | # All regexes are case insensitive! 16 | [WARNING_REGEX] 17 | 18 | global_warn1: www\.|https? |\\|@|\.(com|org|net|app|to|eu|to|io)\b 19 | global_warn2: (720|1080)p|HDTV|SHD|blu-?ray|DVD(?!-)|WEB\W?DL|23\.976|\b\d+\W*x\W*\d+\b|[xh]26[54]|™ 20 | global_warn3: (720|1080)p|HDTV|WEB\W?DL|23\.976|(^|e(pisode)?)\W?\d+\W*x\W*\d+$|[xh]26[54]|™ 21 | global_warn4: \b(CBS|deluxe|vitac?|Sartre|copyright|and TOYOTA|serverpartdeals)\b 22 | global_warn5: (_) 23 | global_warn6: (air date) 24 | global_warn7: ^(Teams?|the|subtitles)$ 25 | #global_warn#: Regex goes here. 26 | 27 | 28 | [PURGE_REGEX] 29 | 30 | global_purge1: ([^Ã]|^)©|==|>>|<<|★|=-|-=| ::| ::|\^\^ 31 | global_purge2: \.(tv|tk|xyz|sex|porn|xxx|link|ru)\b|https?\W 32 | global_purge3: \bs(eason)?\W*\d+[^,]\W*e(pisode)?\W*\d+[^,] 33 | 34 | global_purge4: \b(tvsubtitle|YTS|YIFY|opensub(titles)?|sub(scene|rip)|podnapisi|addic7ed|ragbear\W{,2}com|Point\.360)\b 35 | global_purge5: \b(bozxphd|sazu489|psagmeno|anoxmous|9unshofl|BLACKdoor|titlovi|Danishbits|acorn media|hound\W{,2}org|hunddawgs|iSubDB)\b 36 | global_purge6: \b(jodix|LESAIGNEUR|HighCode|explosiveskull|GoldenBeard|Fingal61|srjanapala|nadielostzilla|IESAIGNEUR|kdwluverz)\b 37 | global_purge7: \b(FilthyRichFutures|celebritysex|shareuniversity|AmericasCardroom|saveanilluminati|MCH2022|ALLIN1BOX|marocas62)\b 38 | global_purge8: \b(ClearwayLaw|SG-66|ShalimarFox|Icefre[@a]k|WGBH|KBS World|SweSUB|koreansubguy|R\[ésus|Barbie_on_Weed)\b 39 | global_purge9: \b(Aldi Arman|void_spell|LnlyHikikomori|wingyee|McEphie|robster38|dw817|zathras69|Thamyris|Dan4Jem|JustCosmin|moviesnipipay|delsxyz)\b 40 | global_purge10: \b(a\. b\. m\. 
j\.|Altyazı: Conan|SDI Media Group|HaruHaruSubs|@whyuandri|WahyuAndri|TheHeLL|RiKi66|KingJAIN|ADONI@|Jesslataree)\b 41 | global_purge11: \b(OrionDeBmk|TheChaosLegion|COLDFUSION \& BAARO|riri13|KOCOWA|@.?vii?ki|OnDemandKorea|MBC America|globosapien)\b 42 | global_purge12: \b(MSMOVIESBD|fightingfansubs|DLAznMovies|ancientmexicanwisdom|cookcountysheriff|MovieFull|300mbmovie|KoreanDramax)\b 43 | global_purge13: \b(extremesubs|3gpBluray|prijevodi-online|torrentgalaxy|Dramatorrent|torrent\.com|HQCINEMAS|WANNATALKAB[OA]UTIT|italiansubs|1000fr|1TamilMV|HDFREE)\b 44 | global_purge14: \b(chuanloon90|designer_pc|m_fouda97|Mr.Scudester|Shari_Kenzie|U-Sub.net|TCS Subtitling)\b 45 | 46 | global_purge15: \b(rate this subtitle|Subtitle(s)? extracted by|Sync(ed)? (&|and) Clean(ed)?|become VIP member|Subs OCR|the best subtitle(s)?|Timing and Subtitle(s)?|rate this subtitle|Free Online Movie(s)?|Subtitle(s)? Transcribed|Re-Sync \&|English Subtitles|Translation(s)? and adaptation:|Captions by Able|Subtitle Rip|Engsub By|Subtitles brought by|Translation \/ Subtitles)\b 47 | global_purge16: \b(Download MyTotal|itfc subtitles|Built Ford Proud|Captioning sponsored|brought to you by Ford|This is a free sub|Custom subtitle by|For more new Episodes visit|Watch Movies and Series|Advertise your product or brand here|Easy Subtitle(s)? Synchronizer|Watch more movies for free|Brought to you by MrsKorea and mily2|Media Access Group at WGBH|Subtitles brought to you by|UNE SÉRIE ORIGINALE NETFLIX|Brought to you by iRiS|Support us and become a VIP member|Advertise your product or brand here|Caption(s|ing)? made possible by|Visit Our Movie Site|Open Subtitle(s)? MKV Player|Translation(s)? and review by|Spell\-Check and Error\-Correction|Subtitles are brought to you|Translation\. 
Review by Angel\.|Captions by CSI Australia|Timing and Subs by|Subtitles by The World\Ws Finest Team|Watch and Download free|PLEASE DO NOT UPLOAD ANY OF OUR SUBS|Subtitle by CJ Entertainment)\b 48 | global_purge17: \b(Paramartha|Heavens Subbing Squad|DramaFever|Asian Cinema Encoders|Italian Scrubs Addicted|Kevin \& Tyno)\b 49 | global_purge18: \b(Viki\.com|dramafever\.com|GlowGaze\.Com|seriessub\.com|www\.telegram|d\-addicts\.com|NAPiSY\.info|cinetyp\.ch|lauzabo\.blogspot\.com|Laozhabor\.blogspot\.com|MARIO\.MK|captionmax\.com|firebit\.org|popbitch\.com|swsub\.com|sous-titres\.eu|forom\.\W?com|Csi\-teams\. Fr\. St|GreggBraden\.com|inmymelody\.wordpress\.com|serverpartdeals\.com) 50 | global_purge19: \b(Fansub(s)?|Hardsub(s)?|S u b|Sub Rip:|Terjemahan subtitle oleh) 51 | #global_purge#: Regex goes here. 52 | 53 | 54 | 55 | # 56 | # -----------------------------------------GUIDE------------------------------------------------- 57 | # 58 | 59 | # This language profile contains two lists of regex that will look for patterns. 60 | # if you wish to modify or remove any regex, feel free to do so 61 | # but files in the default folder will be overwritten when you update the script. 62 | # You can add and remove keys as long as two keys don't use the same key twice. 63 | 64 | # WARNING_REGEX: 65 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 66 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 67 | # 1 warning is ignored 68 | # 2 warnings will be print the block as a WARNING in the log. 69 | # 3 warnings or more will remove the entire block. 70 | 71 | # PURGE_REGEX: 72 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 
73 | 74 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 75 | # literal character you'll need to escape it with '\' 76 | # for example: matching "www." would require a regex like: "www\." 77 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 78 | 79 | # Feel free to ask me any question on github. 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python,windows,pycharm 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,windows,pycharm 4 | 5 | ### PyCharm ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # AWS User-specific 17 | .idea/**/aws.xml 18 | 19 | # Generated files 20 | .idea/**/contentModel.xml 21 | 22 | # Sensitive or high-churn files 23 | .idea/**/dataSources/ 24 | .idea/**/dataSources.ids 25 | .idea/**/dataSources.local.xml 26 | .idea/**/sqlDataSources.xml 27 | .idea/**/dynamic.xml 28 | .idea/**/uiDesigner.xml 29 | .idea/**/dbnavigator.xml 30 | 31 | # Gradle 32 | .idea/**/gradle.xml 33 | .idea/**/libraries 34 | 35 | # Gradle and Maven with auto-import 36 | # When using Gradle or Maven with auto-import, you should exclude module files, 37 | # since they will be recreated, and may cause churn. Uncomment if using 38 | # auto-import. 
39 | # .idea/artifacts 40 | # .idea/compiler.xml 41 | # .idea/jarRepositories.xml 42 | # .idea/modules.xml 43 | # .idea/*.iml 44 | # .idea/modules 45 | # *.iml 46 | # *.ipr 47 | 48 | # CMake 49 | cmake-build-*/ 50 | 51 | # Mongo Explorer plugin 52 | .idea/**/mongoSettings.xml 53 | 54 | # File-based project format 55 | *.iws 56 | 57 | # IntelliJ 58 | out/ 59 | 60 | # mpeltonen/sbt-idea plugin 61 | .idea_modules/ 62 | 63 | # JIRA plugin 64 | atlassian-ide-plugin.xml 65 | 66 | # Cursive Clojure plugin 67 | .idea/replstate.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | 75 | # Editor-based Rest Client 76 | .idea/httpRequests 77 | 78 | # Android studio 3.1+ serialized cache file 79 | .idea/caches/build_file_checksums.ser 80 | 81 | ### PyCharm Patch ### 82 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 83 | 84 | # *.iml 85 | # modules.xml 86 | # .idea/misc.xml 87 | # *.ipr 88 | 89 | # Sonarlint plugin 90 | # https://plugins.jetbrains.com/plugin/7973-sonarlint 91 | .idea/**/sonarlint/ 92 | 93 | # SonarQube Plugin 94 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin 95 | .idea/**/sonarIssues.xml 96 | 97 | # Markdown Navigator plugin 98 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced 99 | .idea/**/markdown-navigator.xml 100 | .idea/**/markdown-navigator-enh.xml 101 | .idea/**/markdown-navigator/ 102 | 103 | # Cache file creation bug 104 | # See https://youtrack.jetbrains.com/issue/JBR-2257 105 | .idea/$CACHE_FILE$ 106 | 107 | # CodeStream plugin 108 | # https://plugins.jetbrains.com/plugin/12206-codestream 109 | .idea/codestream.xml 110 | 111 | ### Python ### 112 | # Byte-compiled / optimized / DLL files 113 | __pycache__/ 114 | *.py[cod] 115 | *$py.class 116 | 117 | # C extensions 118 | *.so 119 | 120 | # Distribution / packaging 
121 | .Python 122 | build/ 123 | develop-eggs/ 124 | dist/ 125 | downloads/ 126 | eggs/ 127 | .eggs/ 128 | lib/ 129 | lib64/ 130 | parts/ 131 | sdist/ 132 | var/ 133 | wheels/ 134 | share/python-wheels/ 135 | *.egg-info/ 136 | .installed.cfg 137 | *.egg 138 | MANIFEST 139 | 140 | # PyInstaller 141 | # Usually these files are written by a python script from a template 142 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 143 | *.manifest 144 | *.spec 145 | 146 | # Installer logs 147 | pip-log.txt 148 | pip-delete-this-directory.txt 149 | 150 | # Unit test / coverage reports 151 | htmlcov/ 152 | .tox/ 153 | .nox/ 154 | .coverage 155 | .coverage.* 156 | .cache 157 | nosetests.xml 158 | coverage.xml 159 | *.cover 160 | *.py,cover 161 | .hypothesis/ 162 | .pytest_cache/ 163 | cover/ 164 | 165 | # Translations 166 | *.mo 167 | *.pot 168 | 169 | # Django stuff: 170 | *.log 171 | local_settings.py 172 | db.sqlite3 173 | db.sqlite3-journal 174 | 175 | # Flask stuff: 176 | instance/ 177 | .webassets-cache 178 | 179 | # Scrapy stuff: 180 | .scrapy 181 | 182 | # Sphinx documentation 183 | docs/_build/ 184 | 185 | # PyBuilder 186 | .pybuilder/ 187 | target/ 188 | 189 | # Jupyter Notebook 190 | .ipynb_checkpoints 191 | 192 | # IPython 193 | profile_default/ 194 | ipython_config.py 195 | 196 | # pyenv 197 | # For a library or package, you might want to ignore these files since the code is 198 | # intended to run in multiple environments; otherwise, check them in: 199 | # .python-version 200 | 201 | # pipenv 202 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 203 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 204 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 205 | # install all needed dependencies. 206 | #Pipfile.lock 207 | 208 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 209 | __pypackages__/ 210 | 211 | # Celery stuff 212 | celerybeat-schedule 213 | celerybeat.pid 214 | 215 | # SageMath parsed files 216 | *.sage.py 217 | 218 | # Environments 219 | .env 220 | .venv 221 | env/ 222 | venv/ 223 | ENV/ 224 | env.bak/ 225 | venv.bak/ 226 | 227 | # Spyder project settings 228 | .spyderproject 229 | .spyproject 230 | 231 | # Rope project settings 232 | .ropeproject 233 | 234 | # mkdocs documentation 235 | /site 236 | 237 | # mypy 238 | .mypy_cache/ 239 | .dmypy.json 240 | dmypy.json 241 | 242 | # Pyre type checker 243 | .pyre/ 244 | 245 | # pytype static type analyzer 246 | .pytype/ 247 | 248 | # Cython debug symbols 249 | cython_debug/ 250 | 251 | ### Windows ### 252 | # Windows thumbnail cache files 253 | Thumbs.db 254 | Thumbs.db:encryptable 255 | ehthumbs.db 256 | ehthumbs_vista.db 257 | 258 | # Dump file 259 | *.stackdump 260 | 261 | # Folder config file 262 | [Dd]esktop.ini 263 | 264 | # Recycle Bin used on file shares 265 | $RECYCLE.BIN/ 266 | 267 | # Windows Installer files 268 | *.cab 269 | *.msi 270 | *.msix 271 | *.msm 272 | *.msp 273 | 274 | # Windows shortcuts 275 | *.lnk 276 | 277 | # End of https://www.toptal.com/developers/gitignore/api/python,windows,pycharm 278 | 279 | /test-dir 280 | /removed.log 281 | /.idea 282 | /subcleaner.conf 283 | /log/ 284 | /logs/ 285 | *.log* 286 | /regex_profiles/ 287 | /databases/ 288 | -------------------------------------------------------------------------------- /libs/subcleaner/main.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import logging 3 | from typing import List, Dict 4 | from .subtitle import Subtitle, ParsingException, FileContentException 5 | from libs.subcleaner import cleaner, report_generator, languages, regex_lists 6 | from .settings import args, config 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | files_handled: List[str] = [] 11 | files_failed: Dict[str, str] 
def main():
    """Entry point: clean every subtitle/library given on the command line,
    then log (and, in silent/errors-only modes, print) a run summary.

    A KeyboardInterrupt aborts the cleaning loop but still produces a summary
    for whatever was processed up to that point.
    """
    try:
        for subtitle_file in args.subtitles:
            if subtitle_file.suffix == ".srt":
                logger.debug(f"cleaning file: {subtitle_file}")
                clean_file(subtitle_file)

        logger.debug(f"path libraries: {args.libraries}")
        for library_dir in args.libraries:
            logger.debug(f"cleaning library: {library_dir}")
            clean_directory(library_dir)
    except KeyboardInterrupt:
        logger.info("subcleaner aborted")

    cleaned = len(files_handled)
    failed = len(files_failed)

    # Nothing was cleaned at all: either everything failed or nothing matched.
    if not files_handled:
        if files_failed:
            message = f"subcleaner didn't successfully clean any files, failed to clean {failed} files."
            logger.error(message)
            if args.silent:
                print(message)
        else:
            message = "subcleaner didn't find any files to clean!"
            logger.error(message)
            if args.silent:
                print(message)
        return

    # Optional aggregate report (only meaningful for multi-file runs).
    if args.end_report and cleaned > 1:
        logger.info("end of run report: \n" + report_generator.generate_end_report())

    if not files_failed:
        message = f"subcleaner finished successfully. {cleaned} files cleaned."
        logger.info(message)
        if args.silent or args.errors_only:
            print(message)
    else:
        message = f"subcleaner finished successfully partly. {cleaned}/{cleaned + failed} files cleaned successfully."
        logger.info(message)
        logger.info("failed to clean following files:")
        for file_name, reason in files_failed.items():
            logger.info(f" - '{file_name}' reason: {reason}")
        if args.errors_only:
            print(message)
def clean_file(subtitle_file: Path) -> None:
    """Clean a single .srt file: load it, strip ad blocks, optionally fix
    overlaps, and write the result back (unless dry-running).

    Outcomes are recorded in the module-level ``files_handled`` /
    ``files_failed`` registries; a file already present in either is skipped.
    """
    if subtitle_file.name in files_handled or subtitle_file.name in files_failed:
        return
    logger.info("[---------------------------------------------------------------------------------]")

    # Prefer a path relative to the configured base for shorter log lines.
    try:
        short_file = subtitle_file.relative_to(config.relative_base)
    except ValueError:
        short_file = subtitle_file

    try:
        logger.info(f"loading subtitle: {short_file}")
        subtitle = Subtitle(subtitle_file)
    except (UnicodeDecodeError, ParsingException, FileContentException) as e:
        logger.error("subcleaner was unable to decode the file. reason:")
        logger.error(e)
        files_failed[subtitle_file.name] = f"subcleaner was unable to decode the file: {e}"
        return

    if not subtitle:
        logger.warning("Subtitle file is empty.")
        files_failed[subtitle_file.name] = "Subtitle file is empty."
        return

    if config.require_language_profile and not regex_lists.language_has_profile(subtitle.language):
        logger.warning(f"language '{subtitle.language}' have no regex profile associated with it.")
        logger.warning(f"either create a regex profile for it or disable require_language_profile in the config.")
        files_failed[subtitle_file.name] = f"language '{subtitle.language}' have no regex profile associated with it."
        return

    logger.info(f"now cleaning subtitle: {subtitle.short_path}")

    if not subtitle.language_is_correct():
        logger.warning(f"the language within the file does not match language: '{subtitle.language}'")

    changes = False
    cleaner.unscramble(subtitle)
    cleaner.find_ads(subtitle)
    if subtitle.ad_blocks:
        changes = True
        cleaner.remove_ads(subtitle)
    if config.fix_overlaps:
        # BUGFIX: call fix_overlap() first. The previous form
        # "changes = changes or cleaner.fix_overlap(subtitle)" short-circuited
        # the call away whenever ads had already been removed, so overlaps
        # were never fixed in exactly the files that needed rewriting.
        changes = cleaner.fix_overlap(subtitle) or changes
    cleaner.reset()

    if len(subtitle.blocks) == 0:
        # Everything would be deleted -- abort and report the hints shared by
        # all of the removed blocks.
        ad_blocks = list(subtitle.ad_blocks)
        # BUGFIX: the previous version iterated a list while removing from it
        # (which skips elements) and mutated the first block's hint list in
        # place. Build the common-hint list without mutating anything.
        reasons = []
        if ad_blocks:
            reasons = [hint for hint in ad_blocks[0].hints
                       if all(hint in block.hints for block in ad_blocks[1:])]

        logger.error("There might be an issue with the regex or the subtitle file, "
                     "because everything in the subtitle would have gotten deleted. "
                     "Nothing was altered.")
        if reasons:
            logger.error("all removed blocks had common reasons: " + ", ".join(reasons))
        files_failed[subtitle_file.name] = "aborted, removed all subtitles. all removed blocks had common reasons: " + ", ".join(reasons)
        return

    logger.info(f"Done. Cleaning report:\n{report_generator.generate_report(subtitle)}\n")
    files_handled.append(subtitle_file.name)
    # BUGFIX: the condition was inverted ("if changes:"), announcing
    # "no ads found" exactly when ads *had* been removed.
    if not changes:
        logger.info("no ads found")

    if args.dry_run:
        subtitle.to_content()
        logger.warning("dry run: nothing was altered.")
    else:
        if changes:
            with subtitle_file.open("w", encoding="UTF-8") as file:
                file.write(subtitle.to_content())
def clean_directory(directory: Path) -> None:
    """Recursively walk *directory* and clean every non-hidden .srt file.

    When a language filter is active (args.language), a file is only cleaned
    if one of its secondary suffixes (up to three, e.g. "movie.en.forced.srt")
    parses to that language code.
    """
    for entry in directory.iterdir():
        # Skip hidden files and directories.
        if entry.name.startswith("."):
            continue

        # Recurse into real sub-directories, but never follow symlinks.
        if entry.is_dir() and not entry.is_symlink():
            clean_directory(entry)

        if not entry.is_file() or entry.suffix != ".srt":
            continue

        if args.language:
            # Inspect at most the last three suffixes before ".srt".
            labels = entry.suffixes[max(-3, -len(entry.suffixes)):-1]
            for label in labels:
                # ".en-forced" / ".en_forced" / ".en:forced" -> "en"
                code = label.replace(":", "-").replace("_", "-").split("-")[0][1:]
                if languages.is_language(code) and args.language == code:
                    logger.debug(f"cleaning file: {entry}")
                    clean_file(entry)
        else:
            logger.debug(f"cleaning file: {entry}")
            clean_file(entry)
logger = logging.getLogger(__name__)

# Drives that have already passed (or been exempted from) the liveliness
# check. BUGFIX: set("C:") builds the set of *characters* {'C', ':'}, so the
# intended "always trust C:" pre-seed never matched disk.drive; use a set
# literal with a single "C:" element instead.
checked_disks = {"C:"}


def check_disk_liveliness(disk: Path) -> None:
    """Probe a (Windows) drive root once to make sure it is reachable.

    Each drive letter is checked at most once per run. The probe first tries
    to chdir into the drive and back; if that fails it falls back to touching
    and deleting a throw-away marker file. Access-denied style failures are
    treated as "alive"; only a drive that cannot be reached at all is logged
    as an error.
    """
    if disk.drive in checked_disks:
        return
    checked_disks.add(disk.drive)

    try:
        try:
            # Cheap probe: enter the drive and return to where we were.
            prev_cwd = Path.cwd()
            os.chdir(disk)
            os.chdir(prev_cwd)
            return
        except FileNotFoundError:
            # chdir failed -- try creating/removing a marker file instead.
            tmp_file = disk.joinpath(".subcleaner-disk-liveliness-checker.safe_to_delete")
            tmp_file.touch()
            tmp_file.unlink()
            return

    except (PermissionError, FileExistsError):
        # The drive answered, even if it refused us -- that counts as alive.
        return
    except FileNotFoundError:
        logger.error(f"The {disk} drive is currently inaccessible. please reconnect to the drive.")
" 58 | "code may contain :forced or other \"LANG:\" but these tags will be ignored") 59 | 60 | purge_list: List[int] 61 | parser.add_argument("--destroy", "-d", type=int, nargs="+", default=list(), 62 | help="original_index of blocks to remove from SUB, this option is not compatible with the " 63 | "library option. When this option is passed the script will mark the " 64 | "specified blocks as ads and then run normally. " 65 | "Example to destroy block 4 and 78: -d 4 78") 66 | 67 | dry_run: bool 68 | parser.add_argument("--dry-run", "-n", action="store_true", dest="dry_run", 69 | help="Dry run: No files are modified. (debug)") 70 | 71 | silent: bool 72 | parser.add_argument("--silent", "-s", action="store_true", dest="silent", 73 | help="Silent: Only print warnings or errors in stdout.") 74 | 75 | minimal: bool 76 | parser.add_argument("--minimal", "-m", action="store_true", dest="minimal", 77 | help=argparse.SUPPRESS) 78 | 79 | removed_only: bool 80 | parser.add_argument("--removed", "-a", action="store_true", dest="removed_only", 81 | help="Removed Only: Will only show removed blocks in cleaning report.") 82 | 83 | errors_only: bool 84 | parser.add_argument("--errors", "-e", action="store_true", dest="errors_only", 85 | help="Errors: Only print errors and will run in --dry-run mode.") 86 | 87 | no_log: bool 88 | parser.add_argument("--no-log", action="store_true", dest="no_log", 89 | help="No log: Nothing is logged to file.") 90 | 91 | sensitive: bool 92 | parser.add_argument("--sensitive", action="store_true", dest="sensitive", 93 | help="Sensitive: Log all blocks adjacent to ads as warnings (debug).") 94 | 95 | explain: bool 96 | parser.add_argument("--explain", action="store_true", dest="explain", 97 | help=argparse.SUPPRESS) 98 | 99 | no_explain: bool 100 | parser.add_argument("--no-explain", action="store_true", dest="no_explain", 101 | help="No explain: suppresses explanations for why blocks got removed or received warnings.") 102 | 103 | end_report: bool 
104 | parser.add_argument("--end-report", action="store_true", dest="end_report", 105 | help="End Report: shows a report at the end displaying unique removed/warning blocks in this run" 106 | "removed blocks with less than 9 warnings are sorted from fewest removed block with same content " 107 | "and warning is sorted from most warned blocks with the same content. (debug)") 108 | 109 | debug: bool 110 | parser.add_argument("--debug", action="store_true", dest="debug", 111 | help="Debug: argument collection that contains arguments: " 112 | "--dry-run, --sensitive and --end-report") 113 | 114 | args = parser.parse_args() 115 | # check usage: 116 | 117 | if len(args.subtitle) == 0 and len(args.library) == 0: 118 | parser.print_help() 119 | exit() 120 | 121 | debug = args.debug 122 | if debug: 123 | print("debug mode.") 124 | 125 | if debug: 126 | print(f"arg.library: {args.library}") 127 | 128 | libraries = [] 129 | for library_str in args.library: 130 | library: Path = Path(library_str) 131 | if not library.is_absolute(): 132 | if library_str[0:2] == "./": 133 | library = Path.cwd().joinpath(library) 134 | else: 135 | library = config.relative_base.joinpath(library) 136 | if isinstance(library, pathlib.WindowsPath): 137 | check_disk_liveliness(Path(library.drive + "/")) 138 | 139 | for item in glob.glob(glob.escape(str(library)).replace("[*]", "*")): 140 | item = Path(item).resolve() 141 | if item.is_dir(): 142 | libraries.append(item) 143 | 144 | if debug: 145 | print(f"arg.subtitle: {args.subtitle}") 146 | 147 | subtitles = [] 148 | for file_str in args.subtitle: 149 | file = Path(file_str) 150 | if not file.is_absolute(): 151 | if file_str[0:2] == "./": 152 | file = Path.cwd().joinpath(file) 153 | else: 154 | file = config.relative_base.joinpath(file) 155 | if isinstance(file, pathlib.WindowsPath): 156 | check_disk_liveliness(Path(file.drive + "/")) 157 | 158 | for item in glob.glob(glob.escape(str(file)).replace("[*]", "*")): 159 | item = Path(item).resolve() 160 
class Detector(object):
    '''
    Detector class is to detect language from specified text.
    Its instance is able to be constructed via the factory class DetectorFactory.

    After appending a target text to the Detector instance with .append(string),
    the detector provides the language detection results for target text via .detect() or .get_probabilities().

    .detect() method returns a single language name which has the highest probability.
    .get_probabilities() methods returns a list of multiple _languages and their probabilities.

    The detector has some parameters for language detection.
    See set_alpha(double), .set_max_text_length(int) .set_prior_map(dict).

    Example:

        from langdetect.detector_factory import DetectorFactory
        factory = DetectorFactory()
        factory.load_profile('/path/to/profile/directory')

        def detect(text):
            detector = factory.create()
            detector.append(text)
            return detector.detect()

        def detect_langs(text):
            detector = factory.create()
            detector.append(text)
            return detector.get_probabilities()
    '''

    # Smoothing parameter (additive weight per n-gram) and the width of the
    # gaussian jitter applied to it on every trial.
    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05

    ITERATION_LIMIT = 1000   # hard cap on sampling iterations per trial
    PROB_THRESHOLD = 0.1     # languages below this probability are dropped from results
    CONV_THRESHOLD = 0.99999 # convergence: stop once a single language dominates
    BASE_FREQ = 10000
    UNKNOWN_LANG = 'unknown'

    # Patterns used to blank out URLs and e-mail addresses before detection.
    URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
    MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')

    def __init__(self, factory):
        # Model data shared with (and owned by) the DetectorFactory.
        self.word_lang_prob_map = factory.word_lang_prob_map
        self.langlist = factory.langlist
        self.seed = factory.seed
        self.random = random.Random()
        self.text = ''
        self.langprob = None

        self.alpha = self.ALPHA_DEFAULT
        self.n_trial = 7
        self.max_text_length = 10000
        self.prior_map = None
        self.verbose = False

    def set_verbose(self):
        '''Enable verbose tracing of the detection trials (debug aid).'''
        self.verbose = True

    def set_alpha(self, alpha):
        '''Override the default smoothing parameter.'''
        self.alpha = alpha

    def set_prior_map(self, prior_map):
        '''Set prior information about language probabilities.

        *prior_map* maps language name -> non-negative weight; weights are
        normalized so they sum to 1. Languages missing from the map get a
        prior of 0. Raises LangDetectException on a negative weight or if
        every weight is zero.
        '''
        self.prior_map = [0.0] * len(self.langlist)
        sump = 0.0
        for i in range(len(self.prior_map)):
            lang = self.langlist[i]
            if lang in prior_map:
                p = prior_map[lang]
                if p < 0:
                    raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
                self.prior_map[i] = p
                sump += p
        if sump <= 0.0:
            raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')
        for i in range(len(self.prior_map)):
            self.prior_map[i] /= sump

    def set_max_text_length(self, max_text_length):
        '''Specify max size of target text to use for language detection.
        The default value is 10000(10KB).
        '''
        self.max_text_length = max_text_length

    def append(self, text):
        '''Append the target text for language detection.
        If the total size of target text exceeds the limit size specified by
        Detector.set_max_text_length(int), the rest is cut down.
        '''
        # Strip URLs/addresses and normalize Vietnamese diacritics first.
        text = self.URL_RE.sub(' ', text)
        text = self.MAIL_RE.sub(' ', text)
        text = NGram.normalize_vi(text)
        pre = 0
        # Copy up to max_text_length characters, collapsing runs of spaces.
        for i in range(min(len(text), self.max_text_length)):
            ch = text[i]
            if ch != ' ' or pre != ' ':
                self.text += ch
            pre = ch

    def cleaning_text(self):
        '''Cleaning text to detect
        (eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).
        '''
        latin_count, non_latin_count = 0, 0
        for ch in self.text:
            # NOTE: 'A' <= ch <= 'z' also matches '[', '\\', ']', '^', '_',
            # '`' (ASCII 91-96) -- quirk inherited from upstream langdetect.
            if 'A' <= ch <= 'z':
                latin_count += 1
            elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
                non_latin_count += 1

        # Mostly non-Latin text: drop the Latin characters entirely so stray
        # Latin fragments don't skew detection.
        if latin_count * 2 < non_latin_count:
            text_without_latin = ''
            for ch in self.text:
                if ch < 'A' or 'z' < ch:
                    text_without_latin += ch
            self.text = text_without_latin

    def detect(self):
        '''Detect language of the target text and return the language name
        which has the highest probability.
        '''
        probabilities = self.get_probabilities()
        if probabilities:
            return probabilities[0].lang
        return self.UNKNOWN_LANG

    def get_probabilities(self):
        '''Return the candidate languages sorted by descending probability,
        running the detection trials on first use (result is cached).'''
        if self.langprob is None:
            self._detect_block()
        return self._sort_probability(self.langprob)

    def _detect_block(self):
        '''Run n_trial randomized sampling trials and average the resulting
        per-language probabilities into self.langprob.'''
        self.cleaning_text()
        ngrams = self._extract_ngrams()
        if not ngrams:
            raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')

        self.langprob = [0.0] * len(self.langlist)

        # Seed once so repeated detections on the same text are deterministic.
        self.random.seed(self.seed)
        for t in range(self.n_trial):
            prob = self._init_probability()
            # Per-trial jitter of the smoothing parameter.
            alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH

            i = 0
            while True:
                # Update with a randomly sampled n-gram; check convergence
                # every 5th iteration (normalization is relatively costly).
                self._update_lang_prob(prob, self.random.choice(ngrams), alpha)
                if i % 5 == 0:
                    if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:
                        break
                    if self.verbose:
                        six.print_('>', self._sort_probability(prob))
                i += 1
            for j in range(len(self.langprob)):
                self.langprob[j] += prob[j] / self.n_trial
            if self.verbose:
                six.print_('==>', self._sort_probability(prob))

    def _init_probability(self):
        '''Initialize the map of language probabilities.
        If there is the specified prior map, use it as initial map.
        '''
        if self.prior_map is not None:
            return list(self.prior_map)
        else:
            # Uniform prior over all known languages.
            return [1.0 / len(self.langlist)] * len(self.langlist)

    def _extract_ngrams(self):
        '''Extract n-grams from target text.

        Only n-grams (n = 1..NGram.N_GRAM) that exist in the trained
        word_lang_prob_map are kept; all-caps words are skipped.
        '''
        RANGE = list(range(1, NGram.N_GRAM + 1))

        result = []
        ngram = NGram()
        for ch in self.text:
            ngram.add_char(ch)
            if ngram.capitalword:
                continue
            for n in RANGE:
                # optimized w = ngram.get(n)
                if len(ngram.grams) < n:
                    break
                w = ngram.grams[-n:]
                if w and w != ' ' and w in self.word_lang_prob_map:
                    result.append(w)
        return result

    def _update_lang_prob(self, prob, word, alpha):
        '''Update language probabilities with N-gram string(N=1,2,3).'''
        if word is None or word not in self.word_lang_prob_map:
            return False

        lang_prob_map = self.word_lang_prob_map[word]
        if self.verbose:
            six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))

        # Additive smoothing: every language gets at least alpha/BASE_FREQ.
        weight = alpha / self.BASE_FREQ
        for i in range(len(prob)):
            prob[i] *= weight + lang_prob_map[i]
        return True

    def _word_prob_to_string(self, prob):
        '''Format the non-negligible per-language probabilities (debug aid).'''
        result = ''
        for j in range(len(prob)):
            p = prob[j]
            if p >= 0.00001:
                result += ' %s:%.5f' % (self.langlist[j], p)
        return result

    def _normalize_prob(self, prob):
        '''Normalize probabilities and check convergence by the maximun probability.
        '''
        maxp, sump = 0.0, sum(prob)
        for i in range(len(prob)):
            p = prob[i] / sump
            if maxp < p:
                maxp = p
            prob[i] = p
        return maxp

    def _sort_probability(self, prob):
        '''Return Language objects above PROB_THRESHOLD, highest first.'''
        result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
        result.sort(reverse=True)
        return result

    def _unicode_encode(self, word):
        '''Render non-ASCII characters as \\uXXXX escapes (debug aid).'''
        buf = ''
        for ch in word:
            if ch >= six.u('\u0080'):
                st = hex(0x10000 + ord(ch))[2:]
                while len(st) < 4:
                    st = '0' + st
                buf += r'\u' + st[1:5]
            else:
                buf += ch
        return buf
13 | 14 | [WARNING_REGEX] 15 | 16 | nl_warn1: \b(ondertitel(s|d|ing)?|(bij-?)?vertaa?l(d|er|ing|ingen)|(na-?)?bewerk(t|ing|ingen)|(na)?(ge)?controle(erd)?|(ge)?modific(aties?|eerd)|aan(ge)?vull?(d|ende?|ing|ingen)|aan(ge)?(boden|pas(t|singen))|mogelijk gemaakt|creatief supervisor|correcties?|gecorrigeerd|nagekeken|(na)?gecheckt|(her)?(na)?(ge)?(re)?synch?t?(ronis(atie(correcties?)?|ering|e(er)?d))?|(ge)?transcribee?r(ing|en|d)|transcript(s|ies?)?|verbeter(d|ing|ingen)|herzien(ing|ingen)?|gedownload|geript|(ge)?presenteer(d|t)|aflevering(en)?|episodes?)\b 17 | nl_warn2: \b(ondertitel(s|d|ing)?|(bij-?)?vertaa?l(d|er|ing|ingen)|(na-?)?bewerk(t|ing|ingen)|(na)?(ge)?controle(erd)?|(ge)?modific(aties?|eerd)|aan(ge)?vull?(d|ende?|ing|ingen)|aan(ge)?(boden|pas(t|singen))|mogelijk gemaakt|creatief supervisor|correcties?|gecorrigeerd|nagekeken|(na)?gecheckt|(her)?(na)?(ge)?(re)?synch?t?(ronis(atie(correcties?)?|ering|e(er)?d))?|(ge)?transcribee?r(ing|en|d)|transcript(s|ies?)?|verbeter(d|ing|ingen)|herzien(ing|ingen)?|gedownload|geript)\W+(door|van)\b 18 | nl_warn3: \.(nl|nu)\b 19 | 20 | ### Short/common nicknames/phrases 21 | nl_warn4: \b(888|ac|acolieten|arri[eë]lla|assenza|deluxe|d[eé]sir[eé]e|dutch|eagle|ericsson|heksje|investigator|jolly|jm|lain|mmf|mvv|mvw|oym|orange|pvt|razor|relentless|releases|rq|scarlett|sheeba|simply|skinny|sk|slabak|thc|tokke|vision|vsi|hooky|kwibus|savales|gvdl|mandy|kathmandu|justme|mimir|codar|jeltje|phantom|juggernaut)\b 22 | nl_warn5: \b(888|ac|acolieten|arri[eë]lla|assenza|d[eé]sir[eé]e|ericsson|heksje|investigator|jm|lain|mmf|mvv|mvw|oym|pvt|relentless|releases|rq|scarlett|sheeba|sk|slabak|thc|tokke|vsi|hooky|savales|gvdl|justme|codar|jeltje)\b 23 | 24 | ### English 25 | nl_warn6: 
\b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)synch?(ed|ro(nized)?)?|(re-?)?synch?(ed|ro(nized)?)|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits)\b 26 | nl_warn7: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)synch?(ed|ro(nized)?)?|(re-?)?synch?(ed|ro(nized)?)|ripped|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|conformed|correct(ions?|ed)|transcri(be|bed|ption|ptions)|improve(d|ments)|sub(s|bed)|provided|supported|encoded|edit(ed|s)?|downloaded|present(s|ing|ed))\b 27 | 28 | ### From no_profile config 29 | nl_warn8: \b(broadcasting|metamorfose|Arun|Aramis|KKB|ydy|snuif)\b 30 | 31 | #nl_warn#: Regex goes here. 32 | 33 | [PURGE_REGEX] 34 | 35 | nl_purge1: \b(ondertitel(s|d|ing)?|(bij-?)?vertaa?l(d|er|ing|ingen)|(na-?)?bewerk(t|ing|ingen)|(na)?(ge)?controle(erd)?|(ge)?modific(aties?|eerd)|aan(ge)?vull?(d|ende?|ing|ingen)|aan(ge)?(boden|pas(t|singen))|mogelijk gemaakt|creatief supervisor|correcties?|gecorrigeerd|nagekeken|(na)?gecheckt|(her)?(na)?(ge)?(re)?synch?t?(ronis(atie(correcties?)?|ering|e(er)?d))?|(ge)?transcribee?r(ing|en|d)|transcript(s|ies?)?|verbeter(d|ing|ingen)|herzien(ing|ingen)?|gedownload|geript)\W*(door|van)?\W*(:|;).. 
36 | 37 | ### Advertisements 38 | nl_purge2: \b(tv ?-?(piraat|box|aanbod)|p\.j\.|allesin1box|gratisstreamen|goedkope ?webhosting|word vip ?-?member|(beoordeel|download) deze (ondertitel|subtitle)|promoot uw product|areslive|plz donate|streambox)\b 39 | 40 | ### Translation agencies 41 | nl_purge3: \b(invision|iyuno(mg)?|sdi (media|group)|bti studios|titrafilm|hoek & son[eé]pouse|p2p (ondertiteling|subtitling)|broadcast text international|odmedia|visiontext|amsterdams vertalers[ck]olle[ck]tief)\b 42 | 43 | ### Amateur/volunteer subtitler nicknames 44 | nl_purge4: \b(Goffini|Muzatte|Suurtje|Daboy|Delapluma|Depositair|Brown-Eyes|Copy2004AP|ED2K4U|Megamaker|SKVCD|pevi70|Nightfalls|WinchesterGirl|pinkGaai|ChillyWitch|meatlove100|apimpnamedslickback|vidioot|OliverKlozoff)\b 45 | nl_purge5: \b(Ren ?H[oö]k|FuBre|Skip77|Cks321|DevilsBackbone|Appie ?van ?de ?3e|Jamees|Cdrazar|SatCP|Johnny ?Lion|Janty|Pgmaker|Baseripper|L4Y|Flitskikker|WH1T3R0S3|Spookstah|MrTheoW|Thomilla|Zuiberknaf|VitoSilans)\b 46 | nl_purge6: \b(Cranedriver|Find[eé]k[aà]no|Stevo|AchtAchtAcht|Dweez|Rustroest|cjdijk|pvdc|One2Sub|Zero_1|NederSubs|Kiry|FLAK|eXtAsls|bdzzld|ropo64|fatlobster|DiscoRobert|Peter4871|Marc2008|Thai-?Tanic|Pid0ck|HaiHai)\b 47 | nl_purge7: \b(MrPallMall|BorisVictor|YouWontKnowWho|JohnP|DZJZMLU|Pielie|SmallBrother|Trilker|MartinH|Bas2003|ThaFlow|minouhse|kDragon|Converted007|D4RK4RMY|ddihzw|kranf|Jaloxaji|michelono|rotzooi1111|Biteme|DutchReality)\b 48 | 49 | ### Professional subtitler names 50 | nl_purge8: \b((Frank|Richard) B[oe]velander|Marjolein Meeuwisse|Frederik Haan|Brigitta Broeke|Annemarieke Schaap|Maria (Mohr|van Veldhuizen)|Peter (Bosma|van Loenhout)|(Amber|Charlotte|Gerrie|Sylvy|Jeanne) (Bi?(rugg)?|Not|Ti(mm|el))er?mans?|Jenneke Takens|Etienne Lantier|Birgit Leerling|Jos[eé] van de Kamp|Inge van Balgooij|Christiaan Tamerus|Emily Moorlach|Judith (IJpelaar|Schep)|Dirk Klinkenberg)\b 51 | nl_purge9: \b(Suzan Hogenbirk|Sanne (Derks|Egelmeers|van der Meij)|Tineke 
(Blokzijl|Haar)|Theresa van der Gruit|Femke Meijer|([JL]orien|Flor[iu]s) (Hakvoort|Franssen|van Rooijen)|Xander Purcell?|Sofie Janssen|Bart Heuvelmans|Mathias Van den Branden|Myl[eè]ne Delfos|Leen Schonken|Maartje van de[nr] (Brink|Zeijden)|Jake Dozier|Tom Steinbusch|Linda van der Logt|Shirley Delnoy|Allettie Bastiaansen)\b 52 | nl_purge10: \b(Marl(een|oes) (Kerssens|Bakker|Gimpel|Penders)|Lana Goossens|Geert (Spekken|van (den )?(Elzen|Bremen))|Alexander Eckhardt|(Brian|Catharine) Winter|Lars Altena|Sikko Bos|Mar[cky] (Ann Smit|de Jongh|de Klerk)|Edward (van Veen|Rekkers)|Michiel Nijenhuis|Ben Zuidema|Juli[eë]tte van Gurp|Jos Verberne|Stijn van Oosterbos|Mieke Vanhengel|Anke Elzinga|Sara Isabel Lette|Len Van Renterghem)\b 53 | nl_purge11: \b((Evan?|Owen) (Dorrestein|de Gans)|Sandra Vandenbussche|J\.J\. Ritzer|Karen Lagendijk|Ren[eé] van Vliet|Barbara Born|Roel Salden|Elly van der Meijden|Elize Preenen|Joost Poort|Chris (Freriks|Reuvers)|Diane Loogman|Amabile Keijer|Caroline Snijder|Elisabeth Barber|Harri[eë]t de Vette|Annemiek Krol|Jessica (van Doremalen|Rietveld)|Robert(de Ridder|Geurtsen)|Rico Nouromid|Carla Kloet)\b 54 | nl_purge12: \b((Rachel|Wietske|Pierre) (van )?(der )?Pol(man)?|Jolanda (Ursem|Jongedijk|van den Berg)|Martijn van Berkel|Mari[ej]k?e (Loonen|Schuring|Kok)|Esther (Daa?ms(teeg)?|van Opstal)|An[ns] (van Bogaert|Bouter)|Naomi Verheggen|Maxime van der Glas|Maaike van der Heijden|Laurence de Moor|Carmen Ket|Anja Stoop|Dennis Strik|Dani[eë]l Vos|Mireille Van Overmeiren|Bonnie Dekker|Jenny Mizrahi)\b 55 | nl_purge13: \b(Ric?k de (Laat|Best)|Wim Gerbecks|Jordi Schipper|Lieuwe Osinga|Trudy Kloet|Erik Brommeijer|Bianca van der Meulen|Muriel Bouillon|Leonie Klaassen|Noortje Ganzevles|Tessa (Kuijpers|van Schijndel)|Matthijs Dijkstra|Maurice Voogd|Arjan van Tuijl|Nikki van Leeuwen|Cora Sendon|Petra Swelsen|Wouter Groothuis|Cindy Hemink|Deirdre Malone|Martijn Beunk|Monique Houben|Michael Albers|Edina van Daalen)\b 56 | 57 | ### Subtitling/release groups 58 | 
nl_purge14: \b(een netflix\W(original\W)?(documentaire|film|serie)|netflix presenteert|QoQ|Quality over Quantity|(simply|pvt) releases|bierdopje\.com|nlondertitels|subtitlesnl|ondertitels?(\.cc|\.com)|nlsub|yifi|(het robot|srt) team|CustomNL|place2home|fmsubs|FTC-SubTeam)\b 59 | nl_purge15: \bs(eizoen)?\W*\d+[^,]\W*a(flevering)?\W*\d+[^,] 60 | 61 | ### From no_profile config 62 | nl_purge16: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 63 | nl_purge17: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clear\W*way\W*law) 64 | nl_purge18: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 65 | nl_purge19: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clearway\W*law) 66 | nl_purge20: \b(UNiTED\W*TEAM|admitme|ragbear|looklive|Camikaze|SourGrass|mstoll|alire2a)\b 67 | nl_purge21: \b(normita|EhLaNa|playships|sunmenghao|nessundorma|seriestele|DarKsh|vothaison)\b 68 | nl_purge22: \b(anana|cRosKy|misshu|Xenzai|swsub|divx|empiremedia|La Fabrique|benj)\b 69 | nl_purge23: \b(dawaith|MoSub|Golgi|Linwelin|Malikay|Ricana|Sadgeezer|argenteam|tiobetonh|chebinhdan)\b 70 | 71 | #nl_purge#: Regex goes here. 72 | 73 | 74 | 75 | # 76 | # -----------------------------------------GUIDE------------------------------------------------- 77 | # 78 | 79 | # This language profile contains two lists of regex that will look for patterns. 80 | # if you wish to modify or remove any regex, feel free to do so 81 | # but files in the default folder will be overwritten when you update the script. 82 | # You can add and remove keys as long as two keys don't use the same key twice. 83 | 84 | # WARNING_REGEX: 85 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 86 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 87 | # 1 warning is ignored 88 | # 2 warnings will print the block as a WARNING in the log. 89 | # 3 warnings or more will remove the entire block. 
'''Character n-gram buffer used by the language detector.

Part of the bundled langdetect port: characters are normalized per Unicode
block, then fed one at a time into a rolling buffer from which 1..N_GRAM
length grams are read back.
'''
import re

import libs.six as six

from . import messages
from .unicode_block import (
    unicode_block,
    UNICODE_BASIC_LATIN,
    UNICODE_LATIN_1_SUPPLEMENT,
    UNICODE_LATIN_EXTENDED_B,
    UNICODE_GENERAL_PUNCTUATION,
    UNICODE_ARABIC,
    UNICODE_LATIN_EXTENDED_ADDITIONAL,
    UNICODE_HIRAGANA,
    UNICODE_KATAKANA,
    UNICODE_BOPOMOFO,
    UNICODE_BOPOMOFO_EXTENDED,
    UNICODE_CJK_UNIFIED_IDEOGRAPHS,
    UNICODE_HANGUL_SYLLABLES,
)


class NGram(object):
    '''Rolling buffer holding the last N_GRAM normalized characters.'''

    # Latin-1 supplement characters treated as word separators
    # (string loaded from the messages resource file).
    LATIN1_EXCLUDED = messages.get_string('NGram.LATIN1_EXCLUDE')
    N_GRAM = 3  # maximum gram length kept in the buffer

    def __init__(self):
        self.grams = ' '  # buffer always starts with a space marker
        self.capitalword = False  # True while inside an all-caps word

    def add_char(self, ch):
        '''Append a character into ngram buffer.'''
        ch = self.normalize(ch)
        last_char = self.grams[-1]
        if last_char == ' ':
            # Previous character ended a word: restart the buffer.
            self.grams = ' '
            self.capitalword = False
            if ch == ' ':
                return
        elif len(self.grams) >= self.N_GRAM:
            # Buffer is full: drop the oldest character before appending.
            self.grams = self.grams[1:]
        self.grams += ch

        if ch.isupper():
            if last_char.isupper():
                self.capitalword = True
        else:
            self.capitalword = False

    def get(self, n):
        '''Get n-gram of length n, or None when none is available.

        Returns None for grams inside all-capital words, for n outside
        1..N_GRAM, and when the buffer does not yet hold n characters.
        '''
        if self.capitalword:
            return
        if n < 1 or n > self.N_GRAM or len(self.grams) < n:
            return
        if n == 1:
            ch = self.grams[-1]
            if ch == ' ':
                # A lone separator is not a useful unigram.
                return
            return ch
        else:
            return self.grams[-n:]

    @classmethod
    def normalize(cls, ch):
        '''Map ch to a representative character for its Unicode block.

        Characters that carry no language signal (punctuation, excluded
        Latin-1, non-alphabetic ASCII) collapse to a space separator;
        whole scripts (kana, bopomofo, hangul) collapse to one
        representative character each.
        '''
        block = unicode_block(ch)
        if block == UNICODE_BASIC_LATIN:
            if ch < 'A' or ('Z' < ch < 'a') or 'z' < ch:
                ch = ' '  # non-alphabetic ASCII becomes a separator
        elif block == UNICODE_LATIN_1_SUPPLEMENT:
            if cls.LATIN1_EXCLUDED.find(ch) >= 0:
                ch = ' '
        elif block == UNICODE_LATIN_EXTENDED_B:
            # normalization for Romanian
            if ch == six.u('\u0219'):  # Small S with comma below => with cedilla
                ch = six.u('\u015f')
            if ch == six.u('\u021b'):  # Small T with comma below => with cedilla
                ch = six.u('\u0163')
        elif block == UNICODE_GENERAL_PUNCTUATION:
            ch = ' '
        elif block == UNICODE_ARABIC:
            if ch == six.u('\u06cc'):
                ch = six.u('\u064a')  # Farsi yeh => Arabic yeh
        elif block == UNICODE_LATIN_EXTENDED_ADDITIONAL:
            if ch >= six.u('\u1ea0'):
                ch = six.u('\u1ec3')
        elif block == UNICODE_HIRAGANA:
            ch = six.u('\u3042')
        elif block == UNICODE_KATAKANA:
            ch = six.u('\u30a2')
        elif block in (UNICODE_BOPOMOFO, UNICODE_BOPOMOFO_EXTENDED):
            ch = six.u('\u3105')
        elif block == UNICODE_CJK_UNIFIED_IDEOGRAPHS:
            # Collapse each CJK ideograph to its class representative.
            ch = cls.CJK_MAP.get(ch, ch)
        elif block == UNICODE_HANGUL_SYLLABLES:
            ch = six.u('\uac00')
        return ch

    @classmethod
    def normalize_vi(cls, text):
        '''Normalizer for Vietnamese.
        Normalize Alphabet + Diacritical Mark(U+03xx) into U+1Exx.
        '''
        def repl(m):
            alphabet = cls.TO_NORMALIZE_VI_CHARS.find(m.group(1))
            dmark = cls.DMARK_CLASS.find(m.group(2))  # Diacritical Mark
            return cls.NORMALIZED_VI_CHARS[dmark][alphabet]
        return cls.ALPHABET_WITH_DMARK.sub(repl, text)

    # Pre-composed Vietnamese characters, one resource row per combining mark.
    NORMALIZED_VI_CHARS = [
        messages.get_string('NORMALIZED_VI_CHARS_0300'),
        messages.get_string('NORMALIZED_VI_CHARS_0301'),
        messages.get_string('NORMALIZED_VI_CHARS_0303'),
        messages.get_string('NORMALIZED_VI_CHARS_0309'),
        messages.get_string('NORMALIZED_VI_CHARS_0323')]
    TO_NORMALIZE_VI_CHARS = messages.get_string('TO_NORMALIZE_VI_CHARS')
    DMARK_CLASS = messages.get_string('DMARK_CLASS')
    ALPHABET_WITH_DMARK = re.compile(
        '([' + TO_NORMALIZE_VI_CHARS + '])([' + DMARK_CLASS + '])',
        re.UNICODE)

    # CJK Kanji Normalization Mapping
    # Each resource string is one equivalence class of ideographs.
    CJK_CLASS = [
        messages.get_string('NGram.KANJI_1_0'),
        messages.get_string('NGram.KANJI_1_2'),
        messages.get_string('NGram.KANJI_1_4'),
        messages.get_string('NGram.KANJI_1_8'),
        messages.get_string('NGram.KANJI_1_11'),
        messages.get_string('NGram.KANJI_1_12'),
        messages.get_string('NGram.KANJI_1_13'),
        messages.get_string('NGram.KANJI_1_14'),
        messages.get_string('NGram.KANJI_1_16'),
        messages.get_string('NGram.KANJI_1_18'),
        messages.get_string('NGram.KANJI_1_22'),
        messages.get_string('NGram.KANJI_1_27'),
        messages.get_string('NGram.KANJI_1_29'),
        messages.get_string('NGram.KANJI_1_31'),
        messages.get_string('NGram.KANJI_1_35'),
        messages.get_string('NGram.KANJI_2_0'),
        messages.get_string('NGram.KANJI_2_1'),
        messages.get_string('NGram.KANJI_2_4'),
        messages.get_string('NGram.KANJI_2_9'),
        messages.get_string('NGram.KANJI_2_10'),
        messages.get_string('NGram.KANJI_2_11'),
        messages.get_string('NGram.KANJI_2_12'),
        messages.get_string('NGram.KANJI_2_13'),
        messages.get_string('NGram.KANJI_2_15'),
        messages.get_string('NGram.KANJI_2_16'),
        messages.get_string('NGram.KANJI_2_18'),
        messages.get_string('NGram.KANJI_2_21'),
        messages.get_string('NGram.KANJI_2_22'),
        messages.get_string('NGram.KANJI_2_23'),
        messages.get_string('NGram.KANJI_2_28'),
        messages.get_string('NGram.KANJI_2_29'),
        messages.get_string('NGram.KANJI_2_30'),
        messages.get_string('NGram.KANJI_2_31'),
        messages.get_string('NGram.KANJI_2_32'),
        messages.get_string('NGram.KANJI_2_35'),
        messages.get_string('NGram.KANJI_2_36'),
        messages.get_string('NGram.KANJI_2_37'),
        messages.get_string('NGram.KANJI_2_38'),
        messages.get_string('NGram.KANJI_3_1'),
        messages.get_string('NGram.KANJI_3_2'),
        messages.get_string('NGram.KANJI_3_3'),
        messages.get_string('NGram.KANJI_3_4'),
        messages.get_string('NGram.KANJI_3_5'),
        messages.get_string('NGram.KANJI_3_8'),
        messages.get_string('NGram.KANJI_3_9'),
        messages.get_string('NGram.KANJI_3_11'),
        messages.get_string('NGram.KANJI_3_12'),
        messages.get_string('NGram.KANJI_3_13'),
        messages.get_string('NGram.KANJI_3_15'),
        messages.get_string('NGram.KANJI_3_16'),
        messages.get_string('NGram.KANJI_3_18'),
        messages.get_string('NGram.KANJI_3_19'),
        messages.get_string('NGram.KANJI_3_22'),
        messages.get_string('NGram.KANJI_3_23'),
        messages.get_string('NGram.KANJI_3_27'),
        messages.get_string('NGram.KANJI_3_29'),
        messages.get_string('NGram.KANJI_3_30'),
        messages.get_string('NGram.KANJI_3_31'),
        messages.get_string('NGram.KANJI_3_32'),
        messages.get_string('NGram.KANJI_3_35'),
        messages.get_string('NGram.KANJI_3_36'),
        messages.get_string('NGram.KANJI_3_37'),
        messages.get_string('NGram.KANJI_3_38'),
        messages.get_string('NGram.KANJI_4_0'),
        messages.get_string('NGram.KANJI_4_9'),
        messages.get_string('NGram.KANJI_4_10'),
        messages.get_string('NGram.KANJI_4_16'),
        messages.get_string('NGram.KANJI_4_17'),
        messages.get_string('NGram.KANJI_4_18'),
        messages.get_string('NGram.KANJI_4_22'),
        messages.get_string('NGram.KANJI_4_24'),
        messages.get_string('NGram.KANJI_4_28'),
        messages.get_string('NGram.KANJI_4_34'),
        messages.get_string('NGram.KANJI_4_39'),
        messages.get_string('NGram.KANJI_5_10'),
        messages.get_string('NGram.KANJI_5_11'),
        messages.get_string('NGram.KANJI_5_12'),
        messages.get_string('NGram.KANJI_5_13'),
        messages.get_string('NGram.KANJI_5_14'),
        messages.get_string('NGram.KANJI_5_18'),
        messages.get_string('NGram.KANJI_5_26'),
        messages.get_string('NGram.KANJI_5_29'),
        messages.get_string('NGram.KANJI_5_34'),
        messages.get_string('NGram.KANJI_5_39'),
        messages.get_string('NGram.KANJI_6_0'),
        messages.get_string('NGram.KANJI_6_3'),
        messages.get_string('NGram.KANJI_6_9'),
        messages.get_string('NGram.KANJI_6_10'),
        messages.get_string('NGram.KANJI_6_11'),
        messages.get_string('NGram.KANJI_6_12'),
        messages.get_string('NGram.KANJI_6_16'),
        messages.get_string('NGram.KANJI_6_18'),
        messages.get_string('NGram.KANJI_6_20'),
        messages.get_string('NGram.KANJI_6_21'),
        messages.get_string('NGram.KANJI_6_22'),
        messages.get_string('NGram.KANJI_6_23'),
        messages.get_string('NGram.KANJI_6_25'),
        messages.get_string('NGram.KANJI_6_28'),
        messages.get_string('NGram.KANJI_6_29'),
        messages.get_string('NGram.KANJI_6_30'),
        messages.get_string('NGram.KANJI_6_32'),
        messages.get_string('NGram.KANJI_6_34'),
        messages.get_string('NGram.KANJI_6_35'),
        messages.get_string('NGram.KANJI_6_37'),
        messages.get_string('NGram.KANJI_6_39'),
        messages.get_string('NGram.KANJI_7_0'),
        messages.get_string('NGram.KANJI_7_3'),
        messages.get_string('NGram.KANJI_7_6'),
        messages.get_string('NGram.KANJI_7_7'),
        messages.get_string('NGram.KANJI_7_9'),
        messages.get_string('NGram.KANJI_7_11'),
        messages.get_string('NGram.KANJI_7_12'),
        messages.get_string('NGram.KANJI_7_13'),
        messages.get_string('NGram.KANJI_7_16'),
        messages.get_string('NGram.KANJI_7_18'),
        messages.get_string('NGram.KANJI_7_19'),
        messages.get_string('NGram.KANJI_7_20'),
        messages.get_string('NGram.KANJI_7_21'),
        messages.get_string('NGram.KANJI_7_23'),
        messages.get_string('NGram.KANJI_7_25'),
        messages.get_string('NGram.KANJI_7_28'),
        messages.get_string('NGram.KANJI_7_29'),
        messages.get_string('NGram.KANJI_7_32'),
        messages.get_string('NGram.KANJI_7_33'),
        messages.get_string('NGram.KANJI_7_35'),
        messages.get_string('NGram.KANJI_7_37')]

    # Filled in by _init_cjk_map(): each CJK character -> representative
    # (first member) of its CJK_CLASS equivalence class.
    CJK_MAP = {}

    @classmethod
    def _init_cjk_map(cls):
        '''Build CJK_MAP from CJK_CLASS (runs once at import time).'''
        for cjk_list in cls.CJK_CLASS:
            representative = cjk_list[0]
            for ch in cjk_list:
                cls.CJK_MAP[ch] = representative

NGram._init_cjk_map()
class Subtitle:
    '''In-memory representation of a parsed subtitle (.srt) file.

    Parsing happens in the constructor: the file is read, split into
    SubBlock objects, assigned a language, and back-to-back duplicate
    blocks are merged. Blocks flagged as ads or warnings are tracked in
    the ad_blocks / warning_blocks sets.
    '''

    blocks: List[SubBlock]          # parsed blocks, in file order
    ad_blocks: Set[SubBlock]        # blocks marked as ads (to be removed)
    warning_blocks: Set[SubBlock]   # suspicious blocks (kept, but logged)
    language: str                   # language code, or "und" when unknown
    file: Path                      # absolute path of the subtitle file
    short_path: Path                # path relative to config.relative_base if possible
    pre_content_artifact: str = ""  # unparseable leading content, preserved verbatim

    def __init__(self, subtitle_file: Path) -> None:
        '''Read and parse subtitle_file.

        :raises FileContentException: if the file yields no non-empty blocks.
        '''
        self.file = subtitle_file
        self.blocks = []
        self.ad_blocks = set()
        self.warning_blocks = set()

        file_content = read_file(self.file)
        self._parse_file_content(file_content)

        for i in range(len(self.blocks)):
            self.blocks[i].current_index = i
        try:
            self.short_path = self.file.relative_to(config.relative_base)
        except ValueError:
            # File lives outside the relative base; keep the full path.
            self.short_path = self.file

        if not self:
            raise FileContentException(self.file)

        if args.language:
            self.language = args.language
        else:
            self.determine_language()

        if args.destroy_list:
            self.mark_blocks_for_deletion(args.destroy_list)

        # Merge consecutive blocks with identical content whose gap is
        # shorter than 1/31 s (presumably about one frame at ~30 fps --
        # TODO confirm intent).
        if len(self.blocks) > 1:
            prev_block = self.blocks[0]
            blocks_to_remove: Set[SubBlock] = set()
            for block in self.blocks[1:]:
                if block.content == prev_block.content and (block.start_time - prev_block.end_time).total_seconds() < 1/31:
                    prev_block.end_time = block.end_time
                    blocks_to_remove.add(block)
                    continue
                prev_block = block
            for block in blocks_to_remove:
                self.blocks.remove(block)

    def warn(self, block: SubBlock):
        '''Flag block as suspicious, unless it is already marked as an ad.'''
        if block not in self.ad_blocks:
            self.warning_blocks.add(block)

    def ad(self, block: SubBlock):
        '''Mark block as an ad; an ad supersedes any warning on the block.'''
        try:
            self.warning_blocks.remove(block)
        except KeyError:
            pass
        self.ad_blocks.add(block)

    def _parse_file_content(self, file_content: str) -> None:
        '''Normalize raw file text and split it into blocks.

        Builds a lookup from each timestamp line ("-->") to its original
        file line number so parsing errors can be reported precisely,
        then collapses blank lines and delegates to _breakup_block.

        :raises FileContentException: if the file has fewer than 2 lines.
        '''
        # Normalize em-dash arrows sometimes found in malformed files.
        file_content = file_content.replace("—>", "-->")
        current_line = 0
        line_lookup: Dict[str, int] = {}

        lines = file_content.split("\n")
        if len(lines) < 2:
            raise FileContentException(self.file)
        for line in lines:
            current_line += 1
            if "-->" in line:
                line_lookup[line] = current_line
        file_content = re.sub(r'\n\s*\n', '\n', file_content)
        file_content = file_content.strip()
        file_content_lines = file_content.split("\n")
        # Sentinel empty line so the final block is terminated.
        file_content_lines.append("")
        self._breakup_block(file_content_lines, line_lookup)

    def _breakup_block(self, lines: List[str], line_lookup: Dict[str, int]) -> None:
        '''Split blank-line-collapsed lines into SubBlock objects.

        Block starts are located via SubBlock.is_sub_block_header; a
        numeric line immediately before a header is treated as the block
        index line. Content before the first block is preserved in
        pre_content_artifact; unparseable chunks are appended to the
        previous block's content so no text is lost.
        '''
        last_break = 0
        start_index = 0
        # Find the first real block header (skip headers followed by
        # another header, which indicates a malformed block).
        for i in range(len(lines)):
            line = lines[i]
            if not SubBlock.is_sub_block_header(line) or i == len(lines)-1 or SubBlock.is_sub_block_header(lines[i+1]):
                continue
            start_index = i + 1
            if i == 0:
                last_break = i
                break

            previous_line = lines[i - 1]
            if previous_line[0].isnumeric():
                # Numeric line above the header is the block's index line.
                last_break = i - 1
            else:
                last_break = i
            break
        if last_break > 1:
            # There is content before the first block: report it.
            e = ParsingException(1, "incorrectly formatted subtitle block")
            e.subtitle_file = self.file
            e.file_line = line_lookup.get(lines[last_break], None)
            if not e.file_line:
                e.file_line = line_lookup.get(lines[last_break + 1], None)
            logger.warning(str(e))

        # Preserve leading unparsed content verbatim.
        for line in lines[:last_break]:
            if "-->" in line:
                line = line + "\n"
            self.pre_content_artifact += line + "\n"

        for i in range(start_index, len(lines)):
            line = lines[i]
            previous_line = lines[i-1]
            if not SubBlock.is_sub_block_header(line) or i == len(lines)-1 or SubBlock.is_sub_block_header(lines[i+1]):
                continue

            if previous_line[0].isnumeric():
                next_break = i - 1
            else:
                next_break = i

            try:
                block = SubBlock("\n".join(lines[last_break:next_break]), len(self.blocks) + 1)
            except ParsingException as e:
                e.subtitle_file = self.file
                e.file_line = line_lookup.get(lines[last_break], None)
                if not e.file_line:
                    e.file_line = line_lookup.get(lines[last_break+1], None)
                if not self.blocks:
                    # Nothing to attach the bad chunk to yet: keep it as artifact.
                    self.pre_content_artifact += "\n" + "\n".join(lines[last_break:next_break]) + "\n"
                logger.warning(e)
                # Append the unparseable chunk to the previous block.
                self.blocks[-1].content += "\n\n" + "\n".join(lines[last_break:next_break])
                continue

            if block.content:
                self.blocks.append(block)
                if "-->" in block.content:
                    # Timestamp arrow inside content: malformed block.
                    self.warn(block)
                    block.hints.append("malformed_block")
            last_break = next_break
        # Parse the trailing block after the last detected header.
        try:
            block = SubBlock("\n".join(lines[last_break:]), len(self.blocks) + 1)
        except ParsingException as e:
            e.subtitle_file = self.file
            e.file_line = line_lookup.get(lines[last_break], None)
            if not e.file_line:
                e.file_line = line_lookup.get(lines[last_break + 1], None)
            logger.warning(e)
            if not self.blocks:
                raise e
            self.blocks[-1].content += "\n\n" + "\n".join(lines[last_break:])
            return
        if block.content:
            self.blocks.append(block)
            if "-->" in block.content:
                self.warn(block)
                block.hints.append("malformed_block")

    def mark_blocks_for_deletion(self, purge_list: List[int]) -> None:
        '''Force-delete blocks by their original subtitle index.

        Matches blocks by original_index; when no block carries that
        index, falls back to positional indexing and logs the mismatch.
        Deletion works by setting regex_matches to 3 (the purge threshold).
        '''
        for index in purge_list:
            for block in self.blocks:
                if block.original_index == index:
                    block.regex_matches = 3
                    block.hints.append("destroyed by index")
                    break
            else:
                # No block had that original index: fall back to position.
                if index-1 >= len(self.blocks):
                    continue
                block = self.blocks[index - 1]
                if not block.original_index or block.original_index == index:
                    block.regex_matches = 3
                    block.hints.append("destroyed by index")
                logger.warning("indexing in subtitle does not match with parsed subtitle.")

    def language_is_correct(self) -> bool:
        '''Check the declared language against the detected one.

        Returns True when verification is impossible (unknown language
        code, fewer than 500 chars of content, or detector failure), or
        when the detector agrees with >0.8 probability.
        '''
        if self.language == "und":
            return True  # unknown language.
        language_code_2 = languages.get_2letter_code(self.language)

        if not language_code_2:
            return True  # unknown language.

        sub_content: str = ""
        for block in self.blocks:
            sub_content += block.content

        if len(sub_content) < 500:
            return True  # not enough content to estimate language.
        try:
            detected_language = langdetect.detect_langs(sub_content)[0]
        except LangDetectException:
            logger.warning(f"{self} can't be analyzed by language detector.")
            return True

        return detected_language.lang == language_code_2 and detected_language.prob > 0.8

    def determine_language(self) -> None:
        '''Resolve self.language from config, filename suffixes, or detection.

        Priority: configured default language; a language tag among the
        last (up to two) suffixes before the extension (e.g. "x.en.srt");
        "hi"/"sdh" markers; finally the language detector, accepted only
        above 0.9 probability. Falls back to "und".
        '''
        if config.default_language:
            self.language = config.default_language
            return

        self.language = "und"

        found_hi = False
        found_sdh = False
        # Scan suffixes nearest to the extension first.
        for suffix in reversed(self.file.suffixes[max(-3, -len(self.file.suffixes)): -1]):
            parsed_lang = suffix.replace(":", "-").replace("_", "-").split("-")[0][1:]
            if parsed_lang == "hi":
                found_hi = True
                continue
            if parsed_lang == "sdh":
                found_sdh = True
                continue

            if languages.is_language(parsed_lang):
                self.language = parsed_lang
                return
        if found_hi:
            self.language = "hi"
            return
        if found_sdh:
            self.language = "sdh"
            return
        # todo: parse hi and sdh properly

        sub_content: str = ""
        for block in self.blocks:
            sub_content += block.content
        if len(sub_content) < 500:
            # Too little text for a reliable detection; stay "und".
            return
        try:
            detected_language = langdetect.detect_langs(sub_content)[0]
        except LangDetectException:
            logger.warning(f"{self} can't be analyzed by language detector.")
            return

        if detected_language.prob > 0.9:
            self.language = detected_language.lang

    def to_content(self) -> str:
        '''Serialize the subtitle back to .srt text (without trailing newline).'''
        content = self.pre_content_artifact
        for block in self.blocks:
            content += f"{block.current_index}\n" \
                       f"{block}\n" \
                       f"\n"

            if "-->" in block.content:
                logger.warning(f"potential malformed subtitle blocks in block {block.current_index}.")
        return content[:-1]

    def get_warning_indexes(self) -> List[str]:
        '''Return the current indexes of warning blocks, sorted, as strings.'''
        l: List[int] = []
        for block in self.warning_blocks:
            l.append(int(block.current_index))
        l.sort()
        return [str(x) for x in l]

    def reindex(self):
        '''Renumber blocks from 1 and clear indexes of removed ad blocks.'''
        index = 1
        for block in self.blocks:
            block.current_index = index
            index += 1
        for block in self.ad_blocks:
            block.current_index = None

    def __str__(self) -> str:
        return str(self.file)

    def __len__(self) -> int:
        # Number of parsed blocks.
        return len(self.blocks)

    def __bool__(self) -> bool:
        # Truthy when at least one block has content.
        for block in self.blocks:
            if block.content:
                return True
        return False


class FileContentException(Exception):
    '''Raised when a subtitle file yields no usable content.'''

    subtitle_file: str  # path of the offending file

    def __init__(self, subtitle_file):
        self.subtitle_file = subtitle_file

    def __str__(self) -> str:
        return f"File {self.subtitle_file} is empty."
def read_file(file: Path) -> str:
    """Read a subtitle file, trying several encodings in a fixed order.

    Decoding order: utf-8 (BOM-tolerant via "utf-8-sig"), then cp1252 on
    failure. If the decoded text contains no "-->" timestamp arrow the
    file was probably mis-decoded, so utf-16 and plain utf-8 are tried as
    fallbacks; if those also fail, whatever decoded earlier is returned
    unchanged (best effort).

    :param file: path of the subtitle file to read.
    :return: the decoded file content.
    """
    # todo: maybe fix decoding to be more reliable?
    try:
        file_content = file.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        file_content = file.read_text(encoding="cp1252")
    if "-->" not in file_content:
        # Every srt contains "-->"; its absence hints at a wrong decode.
        try:
            file_content = file.read_text(encoding="utf-16")
        except UnicodeDecodeError:
            try:
                file_content = file.read_text(encoding="utf-8")
            except UnicodeDecodeError:
                pass  # keep the earlier best-effort decode

    return file_content
૨":679," ૫":978," ૪":492," ૭":700," ૯":551," ૮":625,"અગિ":1051," વ":14987," શ":3197," ર":16267," લ":5654," સ":11956," હ":3244," થ":1672," ત":31864," ધ":1718," દ":21808," ડ":1738," ઠ":222," ભ":34182," બ":4095," ય":383," મ":24848," ન":6795," ફ":765," પ":35455," છ":24245," ચ":2656," ઘ":628," ટ":479," ઝ":829," જ":21642," ઓ":682," ગ":30845," ખ":8068," ક":14981," ઉ":4757," એ":23366," આ":43205," ઇ":661," અ":6143,"આંગ":703,"્ચિ":11645,"્ટ્":549,"ોતર":377,"ોદર":1858,"ોનગ":236,"ોટા":473,"ોટી":225,"ોડા":794,"આઠ ":685,"ોની":800,"ોનો":2167,"ોરી":514,"ોળી":244,"ોલી":442,"ંવત":748,"ંબા":470,"ંબુ":281,"ંમત":254,"ંઠા":1406,"ંડવ":256,"ંદુ":455,"્ધ ":712,"ંદો":312,"ંધી":504,"ંતર":406,"ંચા":1418,"ંચમ":1337,"ંગા":221,"્ર ":966,"ોકો":3591,"્ય ":7092,"ંગણ":713,"ંખે":303,"ંગર":288,"્ષ ":789,"્વ ":2508,"એવા":6093,"્ષન":1137,"્ષિ":1509,"્વા":430,"્વે":772,"્વન":6820,"્વર":251,"્યન":12109,"્યત":720,"્યમ":432,"્યપ":428,"્યવ":2272,"્યા":2476,"્યુ":248,"્મદ":563,"્મા":375,"્લો":1068,"્લા":13052,"્રો":432,"્રે":602,"્રમ":649,"્રદ":871,"્રા":2175,"્રિ":476,"્રી":593,"્રહ":375,"્દ્":258,"્થા":288,"્તા":298,"્તી":856,"્તર":2535,"એક 
":15869,"ઉદે":246,"ઉપલ":606,"ઉપર":392,"ઉત્":2557,"ઉમર":329,"િત":853,"િણ":1494,"વિજ":359,"ીં":343,"િમ":11976,"િપ":550,"િન":1570,"વાય":500,"િવ":4622,"વાર":525,"િશ":322,"ીક":534,"વામ":999,"િલ":14752,"ીઓ":697,"િય":2671,"િર":803,"વાસ":1799,"ીજ":386,"િહ":232,"વિક":492,"િસ":578,"વાલ":357,"ીત":406,"ીદ":281,"ું":22062,"વાદ":862,"વાન":404,"ીય":2619,"ીમ":860,"ીન":8731,"વાડ":2612,"ીપ":459,"ુક":18441,"ીવ":394,"ુખ":4304,"ીર":426,"ીલ":229,"વાગ":247,"વાઘ":254,"ીસ":510,"ુચ":222,"ુજ":13015,"ાં":48849,"ાઉ":298,"ાઇ":474,"ાક":921,"ાઓ":6449,"ાઘ":283,"ાખ":279,"ાગ":19994,"ાજ":13842,"ાચ":245,"ાટ":1064,"ાડ":4936,"િં":1255,"ાણ":2500,"ાથ":1359,"ાત":14028,"ાદ":2250,"ાન":14000,"ાપ":2091,"ાબ":1765,"ામ":34603,"ાય":4603,"ાર":20818,"ાલ":24953,"ાળ":1774,"વિર":286,"િક":2870,"ાવ":3657,"ાષ":849,"ાસ":4564,"ાહ":1263,"િજ":517,"વિસ":266,"હત":7478,"સી":2165,"સુ":1962,"સે":1528,"સા":8757,"સિ":770,"હવ":567,"સો":764,"હર":240,"સ્":3025,"સૌ":277,"હુ":529,"સા ":522,"હે":3680,"હા":2824,"હિ":2043,"હી":349,"હો":1278,"હ્":385,"શ્":12458,"ષન":1142,"સગ":988,"સર 
":276,"શહ":485,"શિ":374,"શા":1696,"શુ":2253,"શી":262,"સં":2001,"ષ્":864,"સમ":828,"સન":417,"સવ":569,"સર":997,"ષા":458,"સદ":328,"સત":251,"સણ":439,"ષિ":1579,"વદ":549,"વન":7118,"વર":2659,"વલ":1055,"શક":498,"વગ":256,"વસા":2722,"વડ":3280,"વત":1294,"વણ":314,"વે":36512,"શન":12531,"વૈ":520,"શમ":627,"વ્":2839,"વસ":7405,"વી":2081,"વિ":2302,"વા":17902,"લો":6663,"લ્":14395,"લે":641,"લા":40018,"લિ":942,"લી":2736,"લુ":28591,"લસ":736,"લવ":315,"વસ્":826,"વસે":1125,"ળી":539,"વં":274,"ળા":1514,"રો":1975,"ર્":7275,"રુ":979,"રી":7304,"રૂ":908,"રે":1966,"રસ":718,"વાં":660,"રહ":1839,"રા":36128,"રિ":1193,"રવ":1135,"લબ":632,"લપ":735,"લય":580,"લન":2427,"લક":304,"લગ":215,"રક":1797,"રગ":229,"રખ":218,"યવ":2310,"રજ":562,"યા":7096,"રડ":414,"યુ":510,"રણ":534,"રત":14232,"રથ":239,"રદ":1032,"શમા":566,"યે":847,"રન":646,"રપ":682,"યો":569,"રબ":221,"રમ":2425,"મર":703,"મમ":2823,"મલ":319,"મહ":10705,"રં":397,"મી":561,"યડ":242,"મુ":6197,"મા":44661,"મિ":1415,"યત":1490,"યપ":480,"યન":12830,"મે":837,"યમ":548,"મ્":803,"મો":2623,"બ્":1122,"ભર":899,"મજ":2296,"મગ":222,"મખ":281,"મણ":336,"મત":581,"મથ":816,"ભા":32622,"ભિ":388,"મપ":589,"ભો":320,"મદ":1417,"મધ":3415,"મન":2890,"બર":1834,"બહ":293,"મં":287,"બી":424,"બુ":465,"બા":2768,"બિ":244,"બો":581,"બે":417,"પો":705,"પ્":4053,"બન":257,"પલ":826,"પહ":227,"પશ":13852,"પર":1594,"પૂ":2675,"પૈ":6312,"પે":306,"પુ":5227,"પી":742,"પિ":245,"પા":6287,"ન્":1623,"નો":5861,"પણ":494,"પત":282,"પટ":291,"પડ":321,"પછ":431,"નવ":1459,"નર":658,"ધ્":3586,"ધો":283,"નપ":579,"નન":237,"ને":4269,"નુ":10812,"પં":2789,"ની":3990,"નિ":691,"ના":45180,"નસ":606,"ધા":1073,"ધુ":315,"ધી":641,"દે":14545,"ધન":837,"દ્":1191,"દો":459,"ધર":916,"સી 
":1244,"નગ":2570,"દશ":215,"દસ":731,"દહ":218,"દા":3639,"દિ":5058,"દી":575,"દુ":1289,"દર":3345,"થવ":580,"વેલ":34942,"શના":12337,"થી":1041,"થા":882,"તો":531,"વૈદ":382,"તે":6014,"દક":1517,"થય":782,"ત્":12219,"થમ":1093,"થક":777,"તી":4432,"તુ":553,"તા":26724,"તિ":1123,"તન":712,"ણે":327,"તપ":511,"તર":3967,"તલ":356,"તમ":2718,"ણા":2366,"ણી":846,"ણવ":795,"તઘ":640,"ડો":2767,"ડુ":286,"ડે":1027,"ણં":571,"ડી":3535,"ડિ":410,"ડા":5615,"ડવ":598,"ડર":315,"ડભ":218,"ડબ":277,"ઠા":1831,"ટ્":673,"ટે":774,"વ્ય":2778,"ટિ":249,"સે ":1144,"ટી":761,"છે":23574,"જન":365,"છી":486,"ઝઘ":245,"જય":244,"છો":305,"જબ":1127,"જર":11973,"જે":2020,"જો":465,"જિ":13950,"જા":1399,"જુ":2555,"જી":794,"જ્":13119,"ઝર":271,"શુપ":2187,"ઝા":567,"સંવ":756,"ટક":225,"સંત":320,"સંખ":325,"ટા":840,"ટલ":348,"ગા":17980,"ગુ":12125,"ગિ":1160,"ઘડ":264,"ગી":427,"૯ ":627,"ગ્":891,"ગો":1143,"ઘર":948,"ઘો":548,"ચર":560,"ચમ":1383,"ચા":2321,"ચિ":12038,"ચી":259,"જં":215,"ચો":437,"ચ્":251,"જક":251,"શહે":470,"૫ ":1091,"કર":2446,"કમ":270,"કલ":513,"કપ":478,"ખં":251,"કડ":644,"ખલ":227,"ક્":3347,"કો":5466,"કે":1500,"૭ ":792,"કુ":6686,"કૃ":229,"કા":21625,"કી":6922,"કિ":350,"કહ":564,"કવ":583,"ગવ":1115,"ગલ":272,"ગર":2876,"ગમ":18397,"ખ્":4351,"૮ ":673,"ખે":6309,"ગન":575,"ગણ":1117,"ગઢ":779,"ખા":2071,"૧ ":1168,"શાળ":1055,"શાસ":252,"એવ":6158,"૨ ":320,"૩ ":1541,"૪ ":593,"ઓન":283,"એક":16184,"૦ ":810,"ઉદ":317,"ઉત":2595,"ઉપ":1216,"સગવ":904,"ઉમ":378,"આં":1102,"અગ":1187,"અં":436,"ઇડ":287,"ષના":751,"આહ":296,"ષનો":362,"આવ":35199,"આદ":1695,"આઠ":804,"આણ":440,"અમ":826,"અર":256,"અન":2071,"ંવ":912,"ંસ":486,"ંત":1429,"ંથ":310,"ંદ":2195,"ંધ":939,"ંબ":1038,"ંભ":281,"ંમ":287,"ંક":540,"શ્ચ":11655,"ંગ":2855,"ંખ":377,"ંચ":4016,"ંજ":464,"ંટ":564,"ંડ":982,"ંઠ":1433,"હે 
":1152,"શ્ર":320,"શ્વ":348,"૧૩":1467,"૧૧":1091,"૧૯":283,"૧૦":694,"વડો":2345,"ોટ":1277,"ોડ":1517,"ોજ":373,"વલી":267,"ોન":3584,"ોધ":322,"ોત":616,"ોદ":3809,"ોગ":284,"ોક":3826,"ોઇ":523,"્ટ":1147,"્ત":4385,"્ણ":325,"્દ":602,"્થ":651,"્ધ":1001,"્પ":336,"્બ":262,"વર્":2136,"્ક":609,"્ગ":267,"્ચ":11827,"ોમ":362,"ોલ":1612,"ોય":246,"ોર":2045,"ોવ":257,"વલસ":582,"ોળ":645,"્સ":302,"્ષ":3855,"્વ":11263,"્લ":14321,"્ર":8621,"્ય":26212,"્મ":1641,"ૂર":2771,"ુદ":699,"ુધ":815,"ુન":510,"ુણ":524,"ુત":213,"ુમ":369,"ુર":8763,"ુપ":2377,"ુવ":804,"ુસ":354,"ુલ":6351,"ૂચ":717,"વનો":376,"વનુ":6376,"ૃત":300,"ેક":249,"ેત":5179,"ેડ":2071,"ેટ":618,"ેઠ":219,"ેજ":326,"ેગ":653,"ેઘ":271,"ષા ":222,"વતા":228,"ેર":3345,"ેલ":37195,"ૈક":6315,"ેશ":13804,"ેવ":2024,"ેન":1094,"ેપ":540,"ેમ":3602,"ૈદ":389,"ેસ":990,"હિં":776,"હાલ":1485,"હાર":571,"હિન":919,"ઇ ":1018,"આ ":2702,"ાં ":41350,"ઓ ":7041,"હેર":525,"હેલ":239,"હેવ":536,"હેસ":730,"એ ":663,"ાઇ ":270,"હોદ":867,"હ્મ":326,"ાઓ ":6349,"ં ":62940,"ાગ ":214,"ાડ ":990,"ાદ ":1109,"ાણ ":231,"ાત ":12086,"ાન ":979,"ામ ":12051,"ાલ ":1653,"ાર ":2510,"ાય ":3046,"ાવ ":357,"િક ":1785,"ાસ ":1162,"ષિણ":1462,"ે ":34827,"ો ":11870,"સણા":264,"ષ ":917,"સ ":4627,"સમો":292,"વ ":3614,"શ ":598,"સરા":281,"ષ્ટ":602,"િ ":698,"ુ ":1292,"ી ":24520,"ા ":103799,"સવા":466,"સુર":1060,"દ ":4991,"થ ":242,"સીઓ":544,"ન ":4557,"સુદ":463,"ધ ":850,"સાડ":665,"સાત":801,"સાણ":973,"સાગ":213,"પ ":563,"સાય":2235,"સામ":236,"સાર":662,"સાવ":281,"સાબ":1406,"બ ":1292,"મ ":27791,"સોન":293,"ર ":16908,"ય ":11483,"લ ":10111,"હતા":268,"હત્":6808,"ળ ":723,"ક ":19636,"ગ ":1433,"સ્વ":268,"સ્થ":430,"ચ ":2010,"સ્ટ":235,"સ્ત":1316,"સ્ક":359,"જ ":4325,"ટ ":1263,"ડ ":2185,"ઠ ":958,"ઢ ":645,"હવે":238,"ણ ":3405,"હવા":327,"ત ":27700,"ૂચ ":710,"િત્":251,"ાસા":294,"ાસિ":270,"ાસી":1732,"ાહો":869,"ાષા":325,"ાસણ":337,"ુલ ":6018,"ાસર":237,"ાષ્":514,"ાલન":2232,"ાલપ":364,"ાલય":562,"ંગ ":904,"ારે":564,"ાર્":428,"ારો":350,"ારી":908,"ારા":1301,"ારત":13028,"ારમ":342,"ારડ":261,"ાયત":689,"ાયડ":229,"ામા":16128,"ુર 
":3220,"ાવી":500,"ાવા":1445,"ાવલ":274,"િકે":222,"િક્":471,"ંચ ":1075,"ાવત":219,"ાળા":1219,"ાલો":667,"ાલુ":18139,"ાલી":427,"ાલિ":223,"ાલા":242,"ાનો":1081,"ંટ ":317,"ાનપ":429,"ાના":6337,"ાનુ":3493,"ાની":787,"ંજ ":235,"ાદર":601,"ામપ":347,"ંત ":433,"ામન":2336,"ામમ":2770,"ાબર":1409,"ાપ્":387,"ુદ ":457,"ાપી":523,"ાપુ":503,"ાપા":321,"ંદ ":612,"ીદા":229,"ીનગ":468,"ીના":6429,"ીને":1070,"ીની":270,"ીનો":264,"િસ્":315,"િલ્":13910,"િલો":366,"િવસ":2730,"િવા":1650,"િનો":226,"િના":989,"િયા":2403,"ાંટ":436,"ાંઠ":1422,"ાંડ":461,"ાંગ":1094,"ાંચ":1295,"ાંત":590,"ાંધ":547,"ાંદ":419,"ાંસ":263,"ીઓ ":502,"િમ ":11653,"િપ ":367,"િત ":360,"િણ ":1447,"ાણા":1231,"ાણી":404,"ાતી":550,"ાત્":264,"ાથમ":992,"ાતે":354,"ાટી":295,"ાટે":280,"િંમ":251,"િંદ":527,"ાડી":1596,"ાડા":2060,"ાજક":237,"ાજી":257,"ાજ્":12491,"ીય ":442,"ાકી":371,"ાગમ":18270,"ાગન":475,"ાઉદ":238,"ું ":21442,"ૂર્":2560,"ેટ ":236,"ુણા":437,"ુજબ":1107,"ુજર":11840,"ુપા":2208,"ુધન":587,"ીયન":389,"ીમા":379,"ીયા":1731,"ીમખ":262,"ીસમ":292,"ુકા":17641,"ુકો":529,"ુખ્":4233,"ુરી":2300,"ુરુ":440,"ુરા":1358,"ુરત":829,"ુવા":696,"તઘર":640,"ણવા":740,"દસ ":512,"ણાવ":406,"દા ":938,"તપુ":465,"તનગ":252,"દી ":294,"તના":216,"દુ ":479,"તમજ":2184,"તમા":321,"તરી":269,"તરા":350,"તો ":423,"થા ":316,"થી ":996,"નવ ":475,"થવા":574,"ના ":41544,"ને ":3899,"ની ":3744,"નો ":5537,"દરા":2352,"દરમ":423,"તું":277,"તાલ":18057,"તાર":306,"તાપ":566,"તાન":302,"ધા ":271,"તેમ":3056,"તેર":1418,"તેન":315,"દક્":1484,"ત્ત":2727,"થમિ":987,"ત્વ":7552,"ત્ય":281,"ત્ર":1452,"થયે":642,"નપુ":541,"પી ":564,"ધીન":468,"ધાર":249,"ધાન":290,"નવસ":483,"નસવ":370,"ધ્ય":3481,"નર્":560,"દુધ":605,"દેપ":250,"દેશ":13431,"દેવ":535,"ધની":595,"દાવ":797,"દિક":394,"દાર":330,"દાદ":262,"દિવ":4303,"દાહ":863,"નગર":2214,"નગઢ":247,"પર ":286,"પણ ":420,"દોદ":293,"દ્વ":331,"દ્ર":524,"ધરા":484,"બા 
":381,"પટે":233,"પાવ":384,"પાર":273,"પાલ":2355,"પાટ":403,"પાડ":735,"પાં":1274,"પશ્":11653,"પશુ":2197,"પલબ":596,"પરા":655,"પંચ":2575,"નું":10559,"નાં":923,"નાર":240,"નામ":306,"નાન":1138,"પછી":428,"ન્ય":559,"ન્દ":395,"બહુ":235,"બાક":373,"બાર":800,"બાય":217,"રજ ":269,"મા ":715,"મી ":253,"યડ ":217,"યન ":455,"બરક":1386,"મો ":1343,"પૂર":2616,"પુર":4959,"પૈક":6303," આ ":2656,"પોર":340," એ ":485,"પ્ર":3459,"પ્ય":389,"માં":41157,"માટ":294,"માન":402,"માણ":276,"માત":359,"માલ":325,"માર":263,"મિક":1063,"મહત":6802,"મહા":1848,"મહિ":850,"મહુ":233,"મહે":917,"યત્":701,"મેઘ":271,"મુખ":4261,"મુજ":1125,"મુવ":353,"યતઘ":640,"મપુ":552,"રે ":767,"મમા":2808,"મધ્":3365,"મદા":1324,"રી ":4562,"મના":2419,"રો ":232,"મજુ":2185,"મખે":264,"રા ":5544,"મતન":247,"મથક":765,"ભિલ":318,"ભાર":12956,"ભાગ":18607,"ભાષ":335,"બોર":287,"રત ":13401,"યા ":2563,"રમ ":485,"યો ":254,"ભરૂ":710,"બ્ર":421,"બ્ધ":597,"ળા ":1138,"રેગ":384,"ળી ":385,"રોત":365,"રાં":597,"રાય":238,"રામ":591,"રાવ":421,"રિક":325,"રાષ":510,"રાડ":230,"રાણ":287,"રાત":11870,"રાથ":991,"રાપ":541,"રાજ":13365,"રીય":783,"રીન":986,"રું":396,"રીક":260,"રિય":414,"રૂચ":710,"રવા":911,"રહવ":219,"રહે":1153,"રહ્":327,"રપુ":361,"રમ્":382,"રમા":804,"લો ":1417,"લા ":25318,"રના":235,"યેલ":764,"લી ":1812,"રદે":816,"રડી":227,"યાલ":588,"યાર":1934,"યાન":576,"યાપ":305,"રકા":1582,"લય ":553,"યવસ":2216,"યપૂ":410,"મોડ":217,"મોટ":454,"યનો":326,"યના":11717,"મ્ય":415,"યમા":407,"લન ":2219,"મ્બ":232,"લ્લ":14131,"વે ":1140,"લોલ":465,"લોડ":396,"લોદ":449,"લોક":3642,"વા ":7780,"વી ":1602,"લુક":18002,"લીમ":321,"લુણ":389,"લિય":312,"લું":9913,"વસ ":2535,"લાન":6932,"લિપ":369,"લાસ":286,"લાવ":223,"લાલ":307,"લામ":5764,"લસા":677,"શક ":371,"વર ":236,"લબ્":597,"વદ ":505,"લપુ":412,"વત ":759,"વડ ":417,"ર્ષ":1963,"ર્વ":2626,"ર્ય":411,"ર્મ":900,"કી ":503,"કા ":605,"કે ":834,"કો ":1354," ૧૦":680,"૧૦ ":535,"૧૧ ":1036,"૧૩ ":1412,"ગઢ ":538," ૧૩":1456," ૧૧":1079," ૧૯":274,"ગર ":1959,"કડી":283,"કડા":244,"કરવ":314,"કરી":1162,"કવા":445," હો":303," હિ":944," હા":540," સો":475," સૌ":273," સ્":714," 
સિ":322," સા":3562," સુ":1717," હત":654," સમ":395," સર":370," સત":226,"કાલ":238,"કામ":9196,"કાર":676,"કીન":6304,"કુલ":6004," લુ":420," લી":386," લિ":414," લા":253,"કહે":527," લો":3728,"કાં":1539," રહ":1203," રા":13805,"કાન":2517,"કાઓ":6055," સં":1919," શા":1268," શિ":264," શહ":475," સગ":922,"ઘર ":662," શ્":290," વા":1551," વિ":1924," વસ":1926," વ્":2581," વૈ":515," વે":236," વડ":2142," વર":2148," શક":474," વલ":624," વદ":498," પછ":431," પટ":261," પણ":391," નો":313," પા":2926," પુ":759," પૂ":2113," પૈ":6312," પર":651," પશ":13847," પહ":226," ધા":386," દ્":305," દે":13156," નગ":894," ધર":567," ના":1491," નિ":408," નસ":380," ને":386," પં":2780," ધો":248," નવ":1299," નર":628," મધ":3389," ભિ":335," ભા":32144," મથ":749," ભર":862," મે":521," મો":1041," મા":2519," મુ":5993," મહ":9472," પ્":3289," પો":348," બો":476," બે":295," મં":217," બી":247," બા":1559," બહ":292," ડા":372,"કોન":2841," ડે":860,"કોળ":215,"કોટ":356,"ક્ષ":1887,"ક્ર":890," ત્":349," થય":778," દક":1478," તે":5306," થવ":379," દર":551," દુ":722," દિ":2858," દા":1294,"કેટ":221," દસ":704," તર":348," તિ":233," તા":24717," ૯ ":497," ગો":553," ગ્":552," ગુ":12029,"ખેડ":1595,"ખેત":4414," ગા":16841,"ગના":399," ૮ ":570," ખે":5659," ખા":1675," ગણ":237," ચર":392," ઘો":266," છો":266,"ગણવ":693," ઝઘ":243," છે":23548," ચો":383," જં":214," ચા":714," ચિ":300," ઝા":412," જ્":480," જુ":259," જા":631," જિ":13843," જો":332," જે":1921,"ખાસ":931," એવ":6158,"ખાન":217," એક":16184,"ખાત":404," ૫ ":907," ૪ ":431," કહ":562," કવ":319," કુ":6308," કા":1279," કો":831," ૭ ":640," કે":929," ક્":383," કડ":504," કપ":373," કલ":240," કર":2121," ઇડ":271," આહ":296," આવ":35196," આદ":1582," ઉત":2589," ઉપ":1212," ઉમ":372,"ગવડ":906,"ગરહ":219,"ગમા":18349,"ખ્ય":4323," અં":432," અગ":1187," આં":1036," અન":2069," અર":252," અમ":825," આઠ":804," આણ":440,"ગાં":635,"ગામ":16798,"ગિય":1057,"ગુજ":11824,"ઘડી":254,"ગોર":435,"ગોધ":222,"ગ્ર":743,"ઘરજ":242,"છી ":438,"ઘોડ":335," જ ":2898,"છે ":23415,"જબ ":1105,"ચરો":362,"ચાર":560,"ચાય":582,"ચિમ":11652,"ચાં":471,"ચાગ":369,"જી 
":285,"જા ":331,"ચમહ":1210,"જે ":397,"જકો":214,"ઝઘડ":245,"જિલ":13824,"જુર":2204,"છોટ":245,"જરા":11822,"ઝાલ":270,"ઠા ":1548,"ટી ":375,"ટા ":336,"જેવ":714,"જેત":374,"જ્ય":12917,"ટે ":225,"ડી ":2334,"ડા ":4375,"ડર ":279,"ટેલ":254,"ટાઉ":239,"ડો ":637,"ણી ":405,"ણા ":1630,"ટ્ર":578,"ડેર":603,"ડેડ":232,"તી ":3871,"ડોદ":1844,"તે ":714,"ડિય":272,"ડાસ":222,"ણંદ":569,"ડીય":740,"તિ ":370,"તા ":7106,"તર ":2912,"થક ":760,"ડાં":355,"ડબ્":254,"ણે ":229},"n_words":[2118540,2468202,1874859],"name":"gu"} -------------------------------------------------------------------------------- /regex_profiles/default/portuguese.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # Portuguese default config. 3 | 4 | # Comma delimited list of language codes associated with this language profile. 5 | # The script will run against all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = pt, por, portuguese 8 | 9 | 10 | # Information about how to configure the REGEX sections, read at the bottom of the file. 11 | # All regexes are case insensitive! 12 | [WARNING_REGEX] 13 | 14 | ### Some Keywords for Translating, Subtitling, Sync, etc... 
15 | pt_warn1: \b(Legend(a|e|ado|ar|as)?|(Res)Sincroni(a|zada|zado|zação|zações)(s)?|Tradu(za|zir|zido|zida|ir|ção|ções)(s)?)\b 16 | pt_warn2: \b(Rip(ada|ado|ped)(s)?|Corrig(ida|ido)(s)?|Corre(ção|ções)|Re(s|ss)ync|Revi(sar|sada|sado|são|sões)(s)?)\b 17 | 18 | ### Usual Keywords and Phrases 19 | pt_warn3: \b(CREIA EM DEUS SEMPRE|DESCOBRIDOR DE PLUTÃO|O FUTURO É AGORA|Junte-se a nós|QUER SE JUNTAR A NÓS|Visitem o site para mais informações)\b 20 | pt_warn4: \b(DIGA NÃO À CENSURA|TV NUNCA MAIS|FILME JÁ|POR UM MONTE DE NERD DOIDO|POR DOIS VELHOS GAGÁS|A p r e s e n t|Dica para download)\b 21 | pt_warn5: \b(anos fazendo Arte para você|Agradecimento(s))\b 22 | pt_warn6: \b(Siga nosso perfil|Siga-nos (no twitter|nas redes sociais))\b 23 | pt_warn7: \b(Qualidade é InSUBstituível|Quality is Everything|Quer legendar co(nosco|m a gente)|Quer legendas)\b 24 | pt_warn8: \b(Batmans|bielo|Bozano|Gamaia|Cassão|Chei|chicon|Chucky|CHULOS|Coco de Rato|Danielly|Darks|DarkSide|Darrow|Darwina|Davros|Deluxe|Duda)\b 25 | pt_warn9: \b(Enjoy|Team|Esmera|Fahrenhheit|GeekS|Ghost|Guerra_|Hirschen|Honoré|InSanos|InSUBs|JouJou|JVM|KiKo)\b 26 | pt_warn10: \b(Lalinha|LariS|League of Legends|Leonessa|Leooni|locke|Lunardelli|ManiacS|Marines|Marvetes|Mullr|Murrice|MUSKETEERS|NaNNa|Nava|NEXUS-(6|9)|Nova Prime)\b 27 | pt_warn11: \b(Optimus|Otoni|Patronnus|Patyy|Pirandello|Pirandelo|Pix|PT-BR|PT-Subs|Pumari|Rainbow|Reaper|Release|Renatinha|Renegados|Rezinha|Rouge)\b 28 | pt_warn12: \b(Salomao|Sardinha|Satsuki|Takehara|Tati|thaais|Vahainen|Vahainen²|wallop|Will Graham|Wuornos|Yang|Zeh)\b 29 | pt_warn13: \b(Episódio|ENGLISH|MKV|UNITED|XEROX|Deluxe|Facebook|Instagram|Twitter|PT.BR|Whatsapp|Tiktok|MARVEL STUDIOS)\b 30 | pt_warn14: \b(HBO((| )Max|GO)|Apple(| )TV|Disney+|Disney(| )Plus)\b 31 | 32 | #pt_warn#: Regex goes here. 
33 | 34 | 35 | [PURGE_REGEX] 36 | 37 | ### Temporada XX Episodio XX 38 | pt_purge0: \bt(emporada)?\W*\d+[^,]\W*e(pis(o|ó)dio)?\W*\d+[^,] 39 | 40 | ### Subtitler Nicknames / Membros de grupos de legenda 41 | pt_purge1: \b(0tavi0|1N73RC3P70R|3runo|@ndré Roch@) 42 | pt_purge2: \b(A.Valim|AdctdGrl|adrianrkt|Adrih87|afi25|Ahenius|AirtonSub|akitemostudo|AlanCristianoBr|AlbanioFPC|Albergi|AlbustigriS|alcobor|Alexandre(MT|Metal)) 43 | pt_purge3: \b(AlexMagno|AlineMarin|Alphankh|(Á|A)lvaroEJ|alxmota|Amand@|Anap9|anap²|anchorboy|Andrebavila|anoXmous|ARDiLOZO|AriadinaPrates|Artaquilus) 44 | pt_purge4: \b(arthurdenner|Artrixzera|arturfreire|Atchiman|athomas|Atlantes Eddy|Audio8|AugustCr|Austhra|azamba89) 45 | pt_purge5: \b(Baco Dionisio|bacontarin|BadWolf|Bakugan|batman.inc(00)|Baudrillard|BBorges|Bello_Brasil|Ben Reilly|Ben197|Bereuza|BethRockefeller|bgarland|Biamussolin|BigTasty) 46 | pt_purge6: \b(BINHOCV|BITCH|BLClaudio|BLuk|Bobdvd|borbabarba|bozxphd|BrandonMotif|brayanatsix|BRENYS|BruFeiden|brunastark|Brunnen-G|BrunoLoko|Brunowsk|Btarth|btsix|Buckley97) 47 | pt_purge7: \b(CacauDias|caconti|Cacstim46|Caio Kameda|Caio(15|albanezi|l|ski)|Capejuna|cbsgrillo|Celow|celso(drx|jp)|Cesart|cezarrezzo|CHaandde|Chacalbhz|ChaosCosmico) 48 | pt_purge8: \b(chereguedel|ChronoAlvein|CiCiNHA|cinefala|Clebertsf|Cond(e)Vla(d)|coriango4|Cotter CS|cricknick|crisvs|Cross65|Cumby|curiango4|Cybervicious|Cynthiam|c_wolff) 49 | pt_purge9: \b(D3QU1NH4|D4VR0S|D@nipbr|DanDee|danidc|DanielG|daninegredo|dani_nemo|DarkEagle|Darkway|DebCarda|Deberle|DedaGlima|deGroote|deiaoliveira|Denarians|derson78|DiabboVerdde|DianaP) 50 | pt_purge10: \b(Dicaoli|dinho1903|Diogodine|Diogo.vix|diogo(dasilva|matos)|Dolinsky|Doris_The_Man|Dougsan|DrCaio|DreamMetal|dreeh|Drope|DSergio|dtlagreca|DudSS|duh_sobieski|Durenkian|Dyxtendent) 51 | pt_purge11: \b(e.gomide|Eagle_1984BR|edmadness|EduCLJ|EIWoOdBIUeS|Elderfel|ElFrijole|eliasyss|elsubtitle|ElWoOdBlUeS|ericarockcity|ErosCohen|eryckcampista|Eryx|explosiveskull) 52 | 
pt_purge12: \b(fagmiranda|Fanuelbenne|Farnezi|Fefavrin|Felipemaximus|FellipeMarcel|Fernandoleao|Fe_Fratta|Finovsk|fish_n_chips) 53 | pt_purge13: \b(FLeCha³|Flechudo|Floomers|FormigosaJr|fotojrFoxxy|fox_sts|Fr0g|frankensubber|fscolari|Fulanapster|FxJeloka) 54 | pt_purge14: \b(GabeOKane|Gaboro|gabriel3color|GabyReis|galaksoda|Galassio|gameonbels|GBelds|General GeeK|Gerigato|germanabh) 55 | pt_purge15: \b(GFaria|GGoedert|GiRoberta|gkarnikow|GoianoDoido|gorecorpsed|GPMaus|Grego²|GuiZahn|gusss|gusx|Gybiru) 56 | pt_purge16: \b(HaloSouza|Hatter|Helder1965|hell_ena|Honoré legendou|Huoo) 57 | pt_purge17: \b(IagoM|IceBreaker|imaycon|imdavros|Insane Metal|IrioMk|Ironnerd|IsaacA|IsaMF|Ivandrofly|IvanHalen|Ivanz|Ivekiø|i_ravena) 58 | pt_purge18: \b(JadalSarduu|JAIGDeTITLES|JaspCardoso|JBarra_|Jehhuty|JennyB|JessyBrug|jfbruna|JhéFranchetti|Jluizsd|John2nitro|John93) 59 | pt_purge19: \b(JohnnyBoy|José Cesamildo|JotaKretli|ju.Alves|JuliusMarques|Juli_Ca|JuMascarenhas|Junio_Tk2|Just4Fun|jvFlores|JVMRL) 60 | pt_purge20: \b(KahGarcia|KahX|Kakko|Kalash|KarolusM|karynasb|KaylaSRP|Kayronrdm|Kcyre|kDragon|KenziG|Kesya_Lele|KetchSketch|KiLL3R|kindtwin|KnaveofHearts|Koelax|Konsquildo|Kuantou) 61 | pt_purge21: \b(L3MOS|Lady(.)Devon|LagerthaL|laiiss|LaisRosas|LAPUMiA|LayAires|LayHolmes|laylamot|ldegroote|Lecko_alx|LEECHER05|leeht|Legionario13|Leifáklärd) 62 | pt_purge22: \b(Lekaakel|LelaBastos|leo191|leojiu|leorutodb|Letirreis|LexJT|LeZzZaDo|LFeitosa|lhenrique|Li4rs|LikaPoetisa|Liporage|lLeandro|lletaif|lostlocke) 63 | pt_purge23: \b(LqRner|Lu (Colorada|Stoker)|luanmarzulo|LucasFB|LucaSkywalker|LucasScript|lucasvsriveiro|LucyLo|LuizSK|LukeWhosoever|luscafusca|l_lost) 64 | pt_purge24: \b(M.Esquivel|M4rzulo|macedo540|macflii|MadGirl|madhater|Mad Titan|MaKTaiL|MaLorencini|Malucat|Marcio_br|Marck93|MarcRip|mari.luz|MarianaR|Marinhojmc|MariTMS|MARK-ONE) 65 | pt_purge25: 
\b(marmotadebermudas|MasterHit|Mastther|MatheusBozetti|MatheusM|Matvix|mawricio58|Maxikd|MaximoPoder|MayAC|mayared|mazepo|mcaio|mctosco123|meggie40|mellodemenezes|Mhaser) 66 | pt_purge26: \b(MilleG(.)|MiltonGGJ|MissBia|MissG|Miss_Foster|MisterNauta|MiTaHD|MitanidaniJP|mmachado7|Monteiroide|Monybelle|Morbeck|Moviehash|MrRamonster|Mrs.CaT|mychael.ds) 67 | pt_purge27: \b(N.Honda|n0Te|Nandus|Nati_nina|NatLittleHand|NatSol|nattyck|NayCielo²|Nbkiller|NetLion|NGed|Nightcrawler|NikaBrasil|Noirgof|NoriegaRJ|NoSpoiler|NoT-XoR|nuganath|nytubi) 68 | pt_purge28: \b(OmiMau) 69 | pt_purge29: \b(pablo.cesar.90813|Padfoot|Pampbs|Paniago|Paranhosgomes|patinatiluft|PaulaCrespo|paulinhaM|paulostriker|Pedrorms|PedroSPJ|Peposo|Petrogui|Pichocho|Pinguim(.)SP) 70 | pt_purge30: \b(Pirata-Tuga|Pitombeira|Pointless|Ponomarenko|PowerPlay|Predator_Alpha|primoeerie|PsychoBrasco|PsycoWave|Psyhead|Pt-Mighters|puraserena|Purpleness) 71 | pt_purge31: \b(R.Zen|Rachmaninoff|RadTail|rafa1504|Rafael UPD|RafaMontagner|Ranko|ratcicle360|RCuestas|Re Guedes|Recov2.0|recrutacreepy|RedSoldierBR|RedTail|renatamm) 72 | pt_purge32: \b(RenatoCochrane|Reptarop|rezimm|rhuannalves|Rhuanpci|RicardoMica|richlips|rickSG|rmasaranha|rMonta|robfilho|RocketJao|Rodrigo880414|Rominho|RSQuint|rubenfmsilva|rushe) 73 | pt_purge33: \b(Saaresto|samhk222|Samuholmes|Sarabp|Saylorman|ScarNeedle|SenpaiBaka|Shockey|SilneiS|Sk@llTow|skoad|skøad|skØad|SlipknotPE|SLRipsPT|Snoopysoft|SOFTITLER) 74 | pt_purge34: \b(SongMade|Sonic( |)2099|SpoiledCat22|Sr(.)( )Loko|Stark²|StarManiacO|Studzes|sub.Trader|Subsfreak|subXpacio|SuB_VersioN|super_zed) 75 | pt_purge35: \b(Tati( )Saaresto|Tati_89|Tchodz|Tecsamp|tellos0|ThaySoul|The H@tter|The Pilgrim|Thedao|The_Tozz|Thiago Legionário|ThiagoW|ThuNderSubs|ToBe_AFM|Tranceman|trancero_ssa|Trecker1963|TuGAZx) 76 | pt_purge36: \b(UliPetit) 77 | pt_purge37: 
\b(valuuh|vanagamer|Vansgomes|Vegafloyd|VHanelli|VicodinTrip|victorcruel|Vikingbyheart|VIKT0R|vikyor1|Vinilator|ViniTimm|virtualnet|vitckari|Vitørr|vivisilusion|Voitek_|V¡¢¡öµ§|Vódinha) 78 | pt_purge38: \b(wal_ny|WesleyP|whataisa|willian_as|willy_br|WISHMAKER|Witchdoctor|wkiane|wribeiro) 79 | pt_purge39: \b(XandeAlves|xaplef|Xlima2003|YsoseriousM|Yuca|Yuca²|yuki_briza|Y_Lima|ZeitG3eist|Zetnos|ZeusRevoLTs|zicadora²|Zinho_1976|ZORAXbr) 80 | pt_purge40: \b(©yßë® V¡¢¡öµ§|©yßë®V¡¢¡öµ§) 81 | 82 | # Instagram/Twitter @ Profiles 83 | pt_purge41: \b(@)(avelarneco|b99bra|b99noice|b99noicesmort|briedanversx|cezarrezzo|citeiperalta|ddharis|detailsamberg|diogomatos_|drcaio|dres|helder1965(.)|iarasantos97)\b 84 | pt_purge42: \b(@)(IdiotasI|imaycon|JBarra_|jluizsd|Julhynha|K_G_B_Dublados|Lecko_alx|lm.samara|paimspring|renatamm|scherbatksy|silneisoaress|Turmagumela|Vahainen|whaIIow|__Fagundes)\b 85 | 86 | ### Additional small nicknames with trailing check, to avoid false positives 87 | pt_purge43: \b(Dres|Dres²|exande|FerM|Fél|GoDo|Ick|JesKa|Jubler|KBLO|LauraA|LeBraz|LeilaC|Lub's|LuFer|Mabu|mands|Repta|Tozz|Tozzi|VUno)\b 88 | 89 | # Known Portuguese translators' names, professional or not 90 | pt_purge44: \b(Andre Esteves|ADRIANO PEDROSO|Alan Carlos da Silva|Alysson Navarro|Ana Linhares|Carlos Eduardo Niemeyer Teixeira|Cecilia Bedin|Cinthia Alencar|Dilma Machado|Dina Almeida|Diogo José|DREI MARC|Eduardo Nakamura|Eduardo Penteado|Eric Raupp)\b 91 | pt_purge45: \b(Felipe Aguiar|Felipe Miranda|Filippe (Brandão|Vasconcellos)|Flávia Fusaro|Florinda Lopes|Gabriella Aly|Guilherme (Ferreira|Vasques)|Iara Regina Brazil|Iara Santos|Ibsertson Medeiros|Jairo de Paula|Juliana Gallo|Leandro Woyakoski|Lucas Perissê|Lúcia Leão|Marcela Almeida)\b 92 | pt_purge46: \b(Marina Baird|Marina Fragano Baird|Marisa Borgerth|Marya Bravo|Matheus Borba|Medeiros Rafael|Michael Lemos|Monika Pecegueiro do Amaral|Mário Menezes|Natasha Marques|Nicole Bracco|Paula Padilha|Paulo Frederico Costa)\b 93 | pt_purge47: 
\b(Paulo Frederico da Costa|Pedro Trindade|Pedro Verri|Priscilla Rother|Rafael Magiolino|Reinaldo S. Renzo|Renato Ximenes|Rita Macedo|Rodrigo Barros|Rodrigo Valois|Rodrigo Vieira|Rosana Cocink)\b 94 | pt_purge48: \b(Samuel Aiala|Selma Bertoncini|Sergio Cantu|Sylbeth Soriano|Sylvio Santiago Fortaleza|Tabita Carvalho|Thais Kitahara|Tiago Aquino|Valéria Egidio|Valmir Martins|Waldir Lopes|Walter Santos|Wilson Vieira)\b 95 | 96 | 97 | 98 | ### Subtitle Groups \ Equipes de legendas 99 | ### Instagram @ Profiles 100 | pt_purge50: \b(@)(aboutskins|AceSubsLegendas|CabronesTeam|ComicSubs|ConSubs|crimesubs|DarkLegendas|darklegenders|EnjoyTeam1|EquipeLi4rs|griotsteam)\b 101 | pt_purge51: \b(@)(inSanosTV|inSanosubs|InSUBs|lotsubs|ManiacSubs|NERDSubs|Queens_OfTheLab|renegados_subs|SuBMakerS|subsfiction|themarinesbr|UnitedTeam)\b 102 | 103 | pt_purge52: \b(4Elements|4ever.tv|aboutskinsbrasil|AceSubs|Alvinos Brasil|ANP® Rio|Art Subs|ArtSubs|Brooklyn 99 Brasil|BR_FILMES|Cabeças-de-Teia|cabronesteam|comicsubs|Companhia das Palavras|ConLegenders|CreepySubs|CrimeSubs)\b 104 | pt_purge53: \b(Dark Navy|Dark Squad|darklegendas|DarkLegenders|darksite|EnjoyTeam|EnjoyTeam(.)|Forom.com|gameofthronesbr|GeekSubs|GRIOTS|griotsteam|handmaidsbrasil|IdIoTaS.iNfErIoReS|IdiotasInferiores|inSanos.tv|inSanostv)\b 105 | pt_purge54: \b(inSanosubs|JDDigitalArt|Joldies Apresenta|legendas.tv|legendastv|legendasemserie|legendasfree|LegendeConosco|Legendei.com|Legenders|legseries|Li4rs|loschulosteam|lotsubs|ManiacSubs|ModerFokers|Máquina Tradutora Nacional)\b 106 | pt_purge55: \b(NoSpoiler|PT-Subs|P2MBRASIL|Perazza(.)|Queens Of The Lab|REALITYKINGS|RED SKY FILMES|Red WB Team|RedWheelBarrowTeam|Renegados Subs|RenegadosSubs|SceneLovers|ScoopVideos|SDI Media Group|SFSubs|SiNNERS|SKAsubs|SOSTeamLTV|SubMakers)\b 107 | pt_purge56: \b(SubsHeaven|SubsOTF|The Marines|the.marines|the.marinesbr|Time Pink|TimeLady|TusSeries|tvsubtitles|Underground(| )Subs|UNITED Team|united4ever|UnitedTeam|VIDEOLAR|Visiontext|Wonder(| 
)Subs|www.inSanos)\b 108 | 109 | ### OTHER KEYWORDS 110 | pt_purge57: \b(SRTEd|Subpack|subscene|UNRATED|DvDrip|Translation|EXTREME|BRAZILIAN|PORTUGUESE)\b 111 | 112 | 113 | # Common phrases / Used by Sub Groups 114 | pt_purge60: \b(Avalie esta legenda|Anuncie( | o )seu produto ou marca aqui|Apoiar-nos e tornar-se membro VIP|Contribua tornando-se um usuário VIP|remova todos os anúncios|Quality is Everything|Quality Is Everythig|Ajude outros usuários a escolher)\b 115 | pt_purge61: \b(Nerds Eager to Rock Doing Subtitles|Noobs fazendo Subs|TERRA DOS LATICÍNIOS|Making the Difference|MAKE A DIFFERENCE|KID BENGALA|Qualidade é InSUBstituível|Enjoy apresenta|sejaseupropriopastor|Sua melhor aposta em legendas|Your Last Hope)\b 116 | 117 | 118 | # Keywords that ends with colon ":" 119 | # Revis(ado|ada|ão) 120 | # Tradu(zido|zida|ção|ções) 121 | # Legend(a|as|ada|ado) 122 | # Sincroni(a|as|zada|zado|zação) 123 | # TODO: Add secondary word "por" before the colon ":" for example "Traduzido por:" 124 | # TODO: Add secondary word "de" before the colon ":" for example "Com Tradução de:" 125 | pt_purge71: \b(revis)(\S+\:) 126 | pt_purge72: \b(tradu)(\S+\:) 127 | pt_purge73: \b(legenda)(\S+\:) 128 | pt_purge74: \b(sincroni)(\S+\:) 129 | pt_purge75: \b(agradeciment)(\S+\:) 130 | #pt_purge76: \b(Episódio)(\S+\:) 131 | 132 | #pt_purge#: Regex goes here. 133 | 134 | 135 | # 136 | # -----------------------------------------GUIDE------------------------------------------------- 137 | # 138 | 139 | # This language profile contains two lists of regex that will look for patterns. 140 | # if you wish to modify or remove any regex, feel free to do so 141 | # but files in the default folder will be overwritten when you update the script. 142 | # You can add and remove keys as long as two keys don't use the same key twice. 143 | 144 | # WARNING_REGEX: 145 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 
146 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 147 | # 1 warning is ignored 148 | # 2 warnings will be print the block as a WARNING in the log. 149 | # 3 warnings or more will remove the entire block. 150 | 151 | # PURGE_REGEX: 152 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 153 | 154 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 155 | # literal character you'll need to escape it with '\' 156 | # for example: matching "www." would require a regex like: "www\." 157 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 158 | 159 | # Feel free to ask me any question on github. 160 | -------------------------------------------------------------------------------- /libs/langdetect/profiles/so: -------------------------------------------------------------------------------- 1 | {"freq":{"YO ":13,"jec":34,"jee":32,"D":313,"E":183,"F":66,"G":214,"A":673,"B":249,"C":240,"L":152,"M":367,"N":163,"O":122,"H":180,"I":236,"J":129,"K":173,"U":82,"T":107,"W":226,"V":11,"Q":76,"P":22,"S":486,"R":114,"Y":96,"X":120,"Z":10,"f":458,"g":2154,"d":5233,"e":4497,"b":2102,"c":900,"a":24510,"n":3878,"o":5982,"l":3786,"m":2460,"j":397,"k":2897,"h":3132,"i":6615,"w":2306,"v":27,"u":3829,"t":1545,"s":2871,"r":2895,"q":718,"p":77,"z":23,"y":3607,"x":1698,"jaa":13,"jab":16,"jar":10,"jam":12,"Xam":11,"joo":14,"Xas":10,"jis":14,"jir":95,"jii":13,"jid":17,"jo ":15,"Far":12,"isk":69,"ism":12,"isl":25,"iso":22,"isu":42,"ist":67,"ita":17,"is ":71,"ion":20,"ir ":84,"irs":56,"irt":28,"iro":22,"irk":32,"iri":56,"isi":32,"ish":96,"ise":18,"isb":17,"Wux":23,"isa":134,"ire":16,"ira":131,"iyi":10,"iyo":394,"iya":423,"iye":65,"ixi":16," l":598," m":880,"kii":161," n":189," o":537," h":365," i":795," j":267," k":1328," d":1214," e":328," f":95," g":401," a":1317," b":593," c":361," y":296," x":283," u":599," 
t":376," w":1834," q":291," p":20," s":807," r":112,"km ":14," J":125," K":142," H":119," I":161," N":93," O":34," L":81," M":322," B":217,"khd":24," C":229,"kha":11," A":275," F":59," G":169," D":236," E":41," Z":10," Y":40," X":90," S":438," R":66," Q":69," P":18," W":211," U":33," T":83,"kee":20,"key":11,"kh ":38,"Web":10,"Waa":56,"ku ":434,"kor":15,"Wax":40,"koo":94,"War":17,"XEE":11,"مد":16,"Gal":22,"و":25,"ي":76,"ف":13,"ق":12,"ل":77,"م":62,"ن":31,"ه":13,"د":46,"ح":26,"ب":37,"ة":21,"ا":98,"أ":11,"ع":29,"ش":21,"س":23,"ر":49,"kar":49,"kas":30,"kan":49,"kal":143,"kam":32,"kad":48,"kac":14,"kab":10,"kaa":81,"ka ":1268,"A ":83," Ga":53," Ge":18,"Da":59,"DU":11,"Cu":18,"Co":13,"DE":11," Fi":17,"Ce":13,"DH":15,"Ci":23," Ha":35,"Du":13,"EY":13," Go":61," Gu":12,"EG":11,"De":45,"EE":45,"EL":14,"Di":29,"Dh":36,"H ":16,"GA":19,"Fa":23," IY":15,"Er":12," Ho":29,"ha ":334," Hi":37,"Ge":18," Ji":25,"Ga":53,"حم":18,"HA":35,"I ":13," Ja":63," KA":16," Is":32," It":29,"GM":12," In":35,"Fi":17,"ham":43,"han":102," Ka":28,"hal":48,"haw":17,"hax":44,"haq":58," Ki":19,"har":45,"has":76," Kh":10," Ju":19,"hah":12,"hab":77,"haa":189,"had":144,"hac":36,"AS":15,"AR":23," MA":17,"AX":27," La":22,"AY":15,"BA":11," Li":11,"C ":10,"AD":43,"AA":51,"AB":14,"AG":11," Ko":23,"AH":23,"hay":333,"AL":37," Ku":26,"AM":13,"AN":35," Ma":180,"Ax":18,"Ar":12,"D ":22,"بن":10," Mi":27,"Ba":101,"CA":15,"Af":65,"بد":10,"he ":25,"Aa":22,"Ab":33,"Ad":10,"Am":17," Lu":25,"Al":38," Ne":14,"Bu":30," Na":32,"Ca":127,"DA":43,"E ":30,"Bi":19,"Be":25,"hda":27,"Bo":30,"Hin":18," Mu":78,"hel":22,"Ku":26,"hee":112,"Ko":23,"hey":26,"hex":72,"Li":11,"N ":26,"her":11,"MA":41,"La":22,"Lu":25,"hi ":27,"Mi":27,"NK":10,"ال":51,"O ":34,"NA":12,"Ma":180,"Mu":79,"Ne":14,"Na":32," Am":16," Al":38,"Nu":16," Af":65,"No":12,"OO":18," Ad":10," Aa":22," Ab":33," Ba":101," CA":12," Ax":18," Ar":12,"hig":23," Be":25,"hid":12," Bi":19,"hin":40,"Go":61,"him":17,"Gu":12," Bo":30,"hii":170," Bu":30,"his":24,"hir":31,"Ha":35," 
Ca":127,"Hi":37," Ce":13," DE":10," Ci":23,"IN":12,"Ho":29," DH":13,"IS":10," Co":12," Cu":18,"IY":20," Da":59," Di":29," Dh":36,"In":36," De":45,"Is":32,"It":30,"Ja":63,"KA":33," Du":13,"Ji":25," Er":12,"Ju":19,"LA":35,"Ka":28,"Kh":10,"ho ":53,"Har":14,"Ki":19,"LE":16," Fa":23,"gma":64,"go ":32," Xi":13," Xa":51,"UU":11,"yuu":26," Wu":23,"To":11,"Th":10," Wi":15," We":12,"Ta":37," Wa":133,"St":13,"Su":23,"Wu":23,"gob":97,"Wi":16,"Wa":133,"XA":19,"We":12,"XE":12,"Y ":18,"yst":29," Yu":14,"yso":15," Ya":10,"WA":26,"gmo":41,"ysa":93,"Qa":26,"Qo":17," م":12,"RA":10,"S ":18," ع":21," ا":48,"goo":52,"R ":20," ب":13,"gsa":14,"gu ":229,"Si":17,"Sh":86,"gsi":12,"So":180,"Ru":12,"U ":11,"Sa":70,"TA":13,"Re":13,"SH":11,"Ro":11,"yoo":24,"Qu":16,"SA":16,"Ra":20,"gud":22," Nu":16," No":12,"gta":43," Ra":20," Qu":16,"b ":130," Ro":11," Re":13,"guu":20,"gun":12,"a ":5909," Qo":17," Qa":26,"شي":10," Su":23," St":13," Ta":37,"Ya":10," Th":10,"Yu":14," To":11," Ru":12," Sa":70,"Xa":51,"YO":15," Sh":86," Si":17,"Xi":13," So":180," WA":20,"ري":12,"Gob":48," ja":60,"i ":853,"ye ":36,"ian":11," iy":365," ji":127,"ge":93," je":47,"ga":1135,"fk":16,"Ing":16," im":15," in":148," il":54," ii":23,"ic ":14,"fi":49,"fr":45,"fu":47,"ft":29,"fo":18," is":155," ka":688," kh":13,"hd":44,"he":286," ki":46," ke":11,"ha":1580,"gn":11,"gm":108," jo":14,"gl":15,"gi":72,"id ":171,"gu":305,"iba":32,"gt":52,"gs":27,"gr":15," ju":17,"go":196,"du":188,"dw":36,"dy":13,"g ":83," ha":190,"ea":16,"eb":72,"yee":61,"ec":51," he":28,"ed":360,"de":252,"dd":113,"di":494,"dh":632,"dk":189,"dl":33," go":117,"do":234,"dn":22," gu":55,"ia ":36,"ex":102,"ey":554,"fa":110,"h ":441," id":15,"fe":17,"eh":54,"ib ":32,"eg":202," hi":20,"ee":1263,"el":242,"ek":35," ho":120,"ei":12,"yey":26,"en":172,"em":31,"et":26,"es":93,"er":287,"ya ":266,"ca":427," ni":37,"e ":881," ne":15,"bs":21," na":54,"br":36,"bu":104,"bt":55,"bn":18,"bo":234,"bk":30,"bl":13," mu":48,"ig ":10,"bi":355,"bb":15,"bd":41,"be":201,"db":11,"da":2087," 
og":18,"f ":98,"cy":18," of":16,"cu":41,"ct":11,"cs":27,"co":62,"cm":24,"cn":13,"cl":19,"ci":73," nu":10,"ch":33," no":73,"ce":64,"cd":20,"yad":111,"yag":10," le":91,"c ":51,"yaa":287," la":334,"icm":22," ku":465,"ici":14," km":14,"ica":25," ko":88," me":49,"az":10,"ay":1458,"idu":13," mi":187,"ba":817,"d ":893,"at":134,"as":580,"yd ":29,"ido":43,"ar":1307,"aq":237," ma":590,"ax":1066,"aw":157,"idk":12,"yay":52," lu":25,"ak":76,"al":1647,"idi":35,"yaw":11,"idh":19,"ai":29,"aj":59,"yar":45,"am":590,"an":1951,"yaq":50,"yan":13,"ac":260,"ida":140,"ad":2243,"aa":4171," lo":138,"ab":630,"ag":664,"ah":1152,"yah":134,"af":128,"iib":15,"nu":38,"iic":11,"nt":263," af":45,"ns":59," ah":473," aa":208,"iig":13," ab":31,"iid":50,"no":160,"nn":18," ad":49,"q ":34," am":103," an":18,"iik":48,"iin":164,"ny":57,"yka":17,"iil":93," al":21,"iim":26,"iis":199,"iir":65,"of":78,"iiq":14,"oc":29," ax":10,"od":156," ar":26,"ob":291," aq":21," as":29,"om":340,"on":186," ba":344,"ok":16,"ol":273," ay":246,"og":129,"il ":80,"ot":41,"os":90," bi":107,"op":10,"oo":1738," be":63,"or":236,"oq":49,"yn ":105," bo":34,"r ":475,"ox":10,"ow":125,"oy":128," bu":35,"pa":14," ca":238,"im ":21,"ika":50,"lo":386,"ige":10,"lm":39,"ll":110,"ls":27,"iga":247,"ii ":339,"lw":14,"lu":48,"igi":31,"yo ":488,"ly":56,"igu":13,"igt":12,"o ":2012,"ma":1465,"mb":52,"mh":21,"me":199,"mk":39,"mi":333,"mp":19,"mo":102,"yna":98,"mu":85,"ihi":82,"yni":14,"na":851,"nb":30,"yne":30,"nc":10,"nd":137,"ne":107,"nf":30,"ng":58,"ynt":29,"ni":213,"nk":312,"nl":21,"imo":20,"ju":17,"jo":31," ee":295,"imi":21,"ki":203,"kh":95,"ke":48,"ind":29,"ina":80," fa":48,"yga":15,"ka":1778,"yi ":19,"m ":103," fu":10,"ino":13,"kt":20," fo":12,"ku":558,"int":102,"ins":10,"ko":130,"ine":14,"ing":16," fi":17,"ini":10,"km":16,"ink":82," ge":36,"li":577,"lk":332,"le":352," ga":186,"ld":23,"lg":22,"inu":15,"la":1306,"lb":52,"iny":13,"n ":1478," co":22,"ht":11,"hu":92,"ikh":54," ce":15,"hi":387,"hn":16,"ho":217," 
ci":36,"ila":160,"id":471,"ic":103,"yin":59,"ib":108,"ia":61,"ih":88,"in ":262,"ig":350," da":424,"if":21,"yih":49,"yig":21," cu":34,"hy":12,"k ":24,"iq":21," do":45,"ilo":13,"ir":438,"is":630,"it":49,"ill":18,"ilk":32,"ix":28,"ilm":12,"ii":1062,"ij":21,"ik":134," de":120,"ili":51,"il":385,"im":170,"in":663,"io":30," di":70,"yir":13," dh":511,"ima":76,"je":69,"ji":178,"iy":896," du":39,"l ":398,"ja":82,"xi":123,"xo":56,"xm":34,"xw":27,"xu":185,"xb":18,"xa":850,"xe":161,"xd":67,"wg":11,"wi":81,"how":15,"wl":60,"wo":26,"wu":102,"hog":13,"y ":1137,"wa":1722,"wd":13,"hoo":55,"we":185,"hor":60," yi":55," yu":13,"uy":12,"ux":164,"uw":34,"uu":720," ye":13,"ve":10," ya":211,"x ":140," xo":33,"uj":15,"uk":28,"ul":200,"uf":20," xi":90,"ug":210,"uh":16,"uq":90,"ur":259,"hna":12," xu":39,"us":114,"ut":54,"um":90,"un":214,"tu":47,"ub":104,"ua":11,"ud":145,"uc":17," xe":16,"w ":59," xa":103,"to":175,"hul":37,"tr":25,"te":120,"ti":246,"th":37,"ta":784,"su":111,"ss":19,"st":173,"sw":12,"sl":47,"sk":106,"sm":25,"so":371,"sr":10,"sc":17,"se":101,"sh":456,"ي ":20,"xme":19,"si":404,"xma":13,"u ":1296,"sa":722,"sb":21,"rr":20,"rs":115,"rt":160,"ru":77,"rw":11,"rx":11,"ry":27,"ro":144,"rn":40,"rm":32,"rl":22,"rk":200,"ri":397,"hu ":11,"rg":35,"re":258,"rd":49,"rc":12,"rb":25,"ra":754,"t ":51,"qu":35,"qs":10,"xoo":44,"qo":163,"IYO":15,"qi":33,"qe":23,"qa":334,"qd":61,"s ":240,"pu":15,"pr":14," ru":12," u ":194," sa":221," se":17," si":157," sh":112," so":259," qu":21,"xya":13," ra":48," re":33,"ن ":17," ro":11," qe":14," qa":168," qo":69," qi":18," oo":464," or":10,"huu":29," wa":1582," we":88," wo":12," wu":102," wi":39," uu":195,"xud":12,"xuu":133,"Hoo":12," tu":36," us":16," ur":10,"م ":11," um":12," un":11," ug":131,"yg":19," ta":231,"ye":133,"yd":48,"ya":998,"yb":27,"xwe":21,"xy":17," su":25,"yu":34,"ys":166," to":18," th":15," ti":62,"yo":522,"yn":280," 
te":11,"yk":19,"yi":189,"fee":11,"xey":58,"xee":54,"far":32,"fad":21,"faa":24,"Suu":12,"Axm":14,"xir":17,"xis":13,"xil":26,"xii":17,"xid":14,"xig":24,"Sta":10,"xa ":169,"eyb":17,"eya":63,"eys":74,"Tal":11,"eyn":163,"eyo":14,"eyk":10,"xda":51,"eyd":16,"eye":14,"exa":10,"exd":12,"exe":51,"xe ":46,"xar":38,"Ban":18,"Baa":14,"Bad":22,"xam":54,"xan":16,"Bar":23,"xay":166,"xba":16,"xaa":341,"xad":27,"xag":13,"wux":100,"Aas":11,"Shi":22,"She":12,"Sha":50,"ex ":21,"Af ":19,"ey ":159,"er ":103,"es ":21,"eri":33,"ere":30,"era":49,"Afr":32,"esh":28,"esa":10,"ers":11,"ern":14,"ekh":16,"en ":89,"ela":47,"ele":26,"eli":17,"ell":42,"elo":15,"emb":19,"ena":28,"wla":53,"eny":12,"egm":90,"ego":14,"egt":11,"Som":32,"Soo":136,"woq":10,"el ":65,"wda":13,"Buu":11,"Bur":11,"we ":12,"gir":17,"gii":26,"wey":124,"wee":27,"gey":15,"gee":44,"wi ":14,"wis":10,"wii":22,"Sal":11,"gab":12,"gac":45,"gad":26,"DA ":20,"gaa":436,"gar":35,"gay":21,"gal":70,"gan":69,"ga ":388,"San":27,"wa ":22,"Cab":27,"waq":26,"wan":30,"wal":39,"wax":715,"way":45,"Cal":18,"war":52,"was":18,"Car":40,"waa":581,"wad":168,"Bel":10,"fur":37,"Bis":12,"fri":39,"fii":15,"Boo":10,"fka":13,"da ":918,"de ":22,"dad":131,"daa":159,"dab":19,"dal":113,"WAX":16,"dag":65,"dah":101,"dar":51,"dan":291,"dam":39,"day":61,"dax":79,"daw":32,"Cum":10,"dda":74,"dde":11,"ddi":17,"cun":14,"EEY":13,"EEL":14,"EGM":11,"Deg":30,"cyo":15,"uxu":126,"Daa":22,"Dag":10,"Dal":10,"uxa":15,"uun":88,"uul":63,"uum":13,"uug":15,"uud":50,"uux":10,"ux ":12,"uus":29,"uur":74,"uuq":18,"uut":24,"uwa":28,"co ":26,"cma":23,"ush":13,"usi":11,"use":13,"uu ":316,"usu":26,"uso":11,"uti":16,"uta":19,"cod":10,"com":11,"uqa":33,"uqd":36,"ura":37,"ure":10,"uri":31,"urk":17,"urt":32,"uru":37,"ur ":39,"csi":14,"uma":56,"unt":32,"unk":27,"uni":11,"una":85,"cel":30,"uka":13,"cee":17,"uls":10,"ulo":20,"ull":14,"ulk":27,"uli":14,"ule":16,"ula":26,"un ":29,"che":12,"ul ":36,"ciy":12,"cii":28,"uga":40,"ugu":128,"ugs":11,"ed ":184,"ebi":20,"uf ":13,"uda":33,"udi":12,"eb 
":12,"udu":37,"ug ":18,"ega":53,"ub ":32,"eek":25,"een":99,"eel":138,"eem":18,"eeb":23,"eeg":65,"eed":229,"eey":113,"eh ":42,"ees":56,"eer":157,"edk":18,"edi":12,"ede":22,"eda":72,"uba":39,"ubb":11,"edu":15,"ud ":36,"edo":11,"ecl":12,"ece":25,"ee ":319,"dwe":25,"dwa":11,"duu":57,"tuu":22,"doo":96,"dow":37,"tri":10,"The":10,"dna":12,"to ":75,"Dhe":14,"Dhu":12,"dun":12,"dul":20,"dug":23,"too":69,"du ":45,"tii":59,"tig":10,"tir":66,"dha":335,"tio":16,"tic":26,"dhu":33,"dib":25,"dhi":112,"dhe":122,"dho":21,"der":19,"dex":18,"dey":16,"dee":48,"deg":96,"den":15,"di ":38,"dle":11,"dla":17,"tee":36,"dku":14,"dki":33,"do ":77,"ter":36,"diy":39,"din":26,"ti ":29,"dir":60,"dis":51,"dig":42,"dii":165,"dil":12,"dka":134,"the":16,"rga":14,"ri ":48,"rge":14,"rey":42,"ree":110,"rda":15,"rdh":16,"re ":77,"rco":10,"rax":25,"ray":99,"rar":15,"ras":44,"rat":10,"rba":11,"rah":41,"ran":54,"ram":17,"rak":12,"rab":82,"raa":165,"rad":87,"rs ":11,"roo":48,"rna":16,"rne":11,"rni":10,"ro ":63,"rma":23,"Nab":15,"rla":13,"rku":10,"rko":10,"rki":41,"rke":18,"rka":117,"riy":58,"ris":28,"rig":31,"rii":110,"rik":46,"rin":21,"ric":16,"rya":13,"rur":10,"run":18,"ruu":10,"ry ":11,"rsi":16,"rsa":63,"rsh":15,"rta":110,"rto":18,"rte":11,"rti":11,"rub":12,"saa":120,"sab":11,"sad":52,"sag":23,"sah":11,"sal":49,"sam":47,"sbi":14,"san":191,"sas":14,"sar":33,"say":43,"sa ":99,"sha":242,"sho":46,"she":41,"shi":83,"si ":68,"siy":42,"sid":91,"shu":10,"sil":13,"sim":38,"sii":82,"sig":32,"se ":61,"sh ":17,"see":14,"sow":16,"som":59,"soo":214,"soc":14,"su ":25,"sla":30,"sku":37,"ska":59,"so ":55,"sma":15,"حمد":15,"ste":15,"sta":66,"sto":28,"sti":41,"sub":11,"suf":12,"sug":13,"sul":11,"suu":22,"tal":42,"tag":10,"tah":87,"taa":194,"tad":13,"tay":60,"tar":33,"tan":31,"tam":13,"te ":13,"ta ":272,"bka":23,"biy":71,"bis":28,"bir":12,"bil":48,"bin":31,"big":38,"bii":37,"bo ":47,"bol":129,"bna":15,"boo":24,"bba":12,"be 
":19,"ban":61,"bal":43,"bah":27,"bad":232,"baa":96,"bab":12,"bay":35,"bax":34,"bas":10,"bar":156,"bdi":25,"bdu":11,"bi ":69,"bee":145,"ber":11,"bey":12,"ca ":55,"car":35,"cas":13,"can":24,"cay":13,"cab":20,"cad":53,"caa":145,"cal":33,"cag":16,"bri":13,"bra":15,"bsa":11,"bta":33,"bti":13,"bur":20,"bul":12,"buu":52,"aka":19,"am ":40,"aki":23,"aji":27,"ajo":16,"qa ":12,"al ":136,"ahi":41,"qar":20,"qay":16,"aho":10,"qad":44,"qab":47,"qaa":149,"ahd":20,"qan":14,"qal":17,"ahe":26,"aha":697,"agm":13,"agt":24,"agu":76,"ago":29,"aq ":22,"qdi":38,"qda":17,"any":23,"ano":51,"ann":10,"ant":70,"ans":32,"ane":21,"ang":10," ال":46,"ani":87,"ank":185,"ana":385,"anb":26,"and":92,"amu":23,"amo":10,"amk":32,"amh":19,"ami":82,"ame":93,"amb":16,"ama":257,"aly":20,"qey":14,"alo":160,"alm":17,"all":22,"alk":165,"alg":17,"ali":424,"ald":14,"ale":110,"ala":480,"alb":42,"an ":924,"aba":194,"abd":37,"abe":56,"abi":146,"abk":18,"abo":40,"abt":38,"abu":36,"aca":130,"aab":114,"aac":13,"aaa":15,"aaf":38,"aag":64,"aad":398,"aaj":28,"aak":21,"aah":75,"aan":742,"aal":743,"aam":113,"aas":211,"aar":259,"aaq":41,"aaw":32,"aat":37,"aay":89,"aax":19,"ad ":334,"qiy":15,"ac ":19,"aa ":1110,"qii":10,"ab ":33,"afr":11,"aft":15,"afi":18,"aga":458,"age":12,"ah ":325,"afa":38,"ado":85,"adl":23,"adk":153,"adn":12,"adh":26,"adi":223,"add":96,"ade":66,"ag ":29,"adw":22,"adu":44,"aci":16,"ace":10,"Qar":12,"acd":15,"ada":1138,"af ":19,"acy":15,"acs":19,"qor":48,"qoo":60,"qof":24,"axi":13,"axm":15,"axo":15,"axu":15,"axa":702,"axb":16,"axd":50,"axe":90,"ayi":11,"ayo":52,"ayn":115,"ays":84,"ayu":13,"axy":16,"axw":26,"ayb":10,"aya":151,"ayg":11,"ayd":32,"aye":26,"ba ":84,"qur":24,"at ":11,"arg":25,"are":96,"ard":30,"arb":14,"ara":357,"aro":72,"arn":19,"arm":17,"arl":10,"ark":135,"ari":153,"aru":20,"ars":39,"art":72,"asa":99,"ary":14,"asi":106,"ash":156,"ase":12,"aso":31,"ask":17,"ar ":198,"as ":80,"aqa":111,"aqi":13,"aqo":51,"ax ":98,"awe":20,"ay 
":932,"awa":46,"awl":31,"awi":33,"ata":37,"asu":12,"ast":33,"ato":18,"ate":17,"ra ":58,"ati":34,"ngi":20,"ni ":47,"Isl":11,"neh":11,"ng ":11,"nee":16,"nfu":25,"ney":14,"ne ":43,"ndh":18,"ndi":22,"nan":17,"nac":45,"nad":83,"nah":41,"nab":18,"naa":131,"Ito":28,"nbe":15,"nd ":69,"AXE":10,"AY ":10,"nba":11,"AXA":12,"nay":47,"nax":11,"na ":412,"Jab":13,"Jan":13,"Jam":22,"KA ":11,"KAL":10,"nya":38,"AAL":13,"ADA":25,"nuu":21,"nto":13,"nti":37,"nta":176,"nte":24,"nsi":15,"nsa":22,"AHA":14,"noo":67,"noq":18,"nna":11,"ALA":17,"nle":12,"no ":59,"nki":22,"nka":271,"AN ":16,"nii":13,"nih":11,"nig":39,"niy":10,"nis":15,"nim":17,"nin":39,"ogu":24,"oga":60,"Jub":11,"ol ":60,"oco":11,"odi":15,"of ":38,"oda":43,"ofe":10,"LA ":12,"د ":29,"oba":86,"od ":60,"obo":134,"obi":38,"ة ":21,"oyi":94,"oya":10,"owl":29,"ow ":45,"ost":14,"ota":10,"ose":28,"os ":15,"oon":114,"ool":98,"oom":198,"oof":13,"oog":60,"ood":123,"oob":124,"or ":39,"ooy":111,"oow":16,"oot":14,"oos":65,"oor":31,"Koo":13,"ore":44,"ori":14,"osa":11,"ort":21,"oqo":37,"oqd":11,"ora":61,"ola":52,"on ":52,"olk":99,"ole":20,"olo":14,"oly":10,"ona":28,"onf":25,"oni":16,"onk":11,"ons":12,"ont":14,"oma":298,"oo ":749,"omp":12,"la ":241,"le ":159,"laa":281,"lab":61,"lac":11,"lad":232,"laf":10,"lah":96,"lag":116,"lal":23,"lan":88,"lam":27,"las":21,"lay":70,"lba":15,"lbe":31,"kuw":22,"kuu":18,"kun":22,"kul":14,"kto":17,"MAD":13,"lom":11,"loo":176,"lmo":12,"lmi":13,"lma":10,"lsh":13,"Luu":11,"li ":92,"lga":16,"ley":29,"leh":35,"lee":98,"lo ":165,"lla":49,"lle":32,"lka":311,"lki":14,"lis":19,"lin":48,"lim":15,"liy":204,"lid":28,"lia":24,"lib":24,"lil":40,"lii":17,"lig":30,"ma ":133,"maa":361,"mac":36,"mah":24,"mad":229,"mag":226,"mar":193,"mas":14,"mal":133,"man":32,"may":23,"max":25,"mba":26,"mbe":10,"me ":19,"med":68,"mee":72,"mey":24,"luq":12,"luu":17,"مد ":15,"lya":33,"lyo":10,"Mar":22,"Mas":10,"Mag":51,"Mad":20,"Maa":17,"Max":25,"moo":35,"muq":17,"muu":16,"mul":10,"Mux":13,"mhu":20,"Muq":24,"Mud":14,"mi 
":19,"min":17,"mil":14,"mis":11,"miy":27,"mig":18,"mid":170,"mij":10,"mii":25,"mo ":60,"mka":33},"n_words":[94077,109135,83288],"name":"so"} -------------------------------------------------------------------------------- /libs/langdetect/profiles/sw: -------------------------------------------------------------------------------- 1 | {"freq":{"jer":348,"jen":305,"ji ":6234,"D":1805,"E":874,"F":1081,"G":1202,"A":4461,"B":2717,"C":2251,"L":1530,"M":12761,"N":2782,"O":860,"H":1677,"I":2605,"J":2641,"K":12188,"U":3120,"T":5185,"W":4730,"V":1116,"P":2090,"S":3343,"R":1632,"Y":517,"Z":395,"f":11048,"g":13829,"d":15034,"e":46694,"Feb":214,"b":19688,"c":9784,"a":289584,"n":90468,"o":57043,"l":42025,"m":53651,"j":21456,"k":76835,"h":32492,"i":164978,"w":60984,"v":3863,"u":57506,"t":40551,"s":35298,"r":27443,"p":13501,"z":18893,"y":38832,"x":501,"jar":185,"jan":137,"jaw":201,"é":167,"jim":1500,"jin":4267,"jil":163,"jij":492,"jia":221,"jib":3854,"ito":288,"itu":317,"itw":269,"isp":140,"ist":592,"ita":1061,"ite":213,"iti":334,"ivy":133,"iwa":2430,"ius":183,"ipo":224,"ipi":265,"is ":521,"ion":720,"iop":279,"ipa":165,"ipe":219,"iro":173,"iri":997,"isi":902,"ish":5756,"isa":694,"ire":164,"ira":314,"ja ":1529,"iyo":4644,"iye":227,"izo":242,"izi":413,"iza":568," l":8602,"kif":518," m":27935," n":19872," o":327,"kik":333," h":7652," i":9059,"kij":166,"kim":258," j":5212,"kil":389," k":27977," d":1010," e":802," f":914,"kia":390," g":257," a":6533," b":1252," c":2191,"kiw":279," y":17767," z":2257,"kin":442," u":4361,"kio":148," t":2402,"kip":379," w":34366," v":1482,"kis":520," p":2154,"kit":315," s":6097," r":837,"ki ":2193," J":2627," K":12017," H":1638," I":2128," N":2678," O":803," L":1487," M":12665," B":2646," C":2112," A":4277," F":1046," G":1172," D":1740," E":782," Z":375," Y":513,"и":142," S":3229," R":1588,"а":137," P":2015," W":4707," V":1031," U":3052," T":5117,"kea":156,"kem":150,"ke ":1988,"ku ":187,"kri":520,"kon":141,"koa":3734,"ko 
":1214,"ل":165,"ا":240,"juu":155,"jul":257,"jum":177,"kaz":5045,"kaw":137,"kat":14149,"kar":374,"kas":316,"kan":2795,"kao":197,"kal":354,"kam":1048,"kad":160,"kab":375,"ka ":19783," Ga":196,"Da":365," Ge":229,"Co":364," Fr":177,"Ch":770," Ha":622," He":218," Go":142,"Do":469," Gr":177," Gu":142,"De":497,"Di":169,"Fe":311," Id":148,"Fa":160," Hu":173," Ho":177," II":154,"ha ":2668," Hi":392,"Ge":229," Ji":535,"Ga":198," Je":286,"I ":397," Ja":792,"Fr":177," Ir":284," Is":141," It":181," In":316," Ik":143," Il":224,"ham":522,"han":444,"hap":154," Ka":2225,"hai":238,"haj":163,"hak":611,"hal":314," Ke":708," Ki":3568,"har":1714,"has":255,"hat":148," Jo":255,"II ":207," Ju":691,"hag":267,"hab":181,"had":740," La":231," Le":207," Li":441," Ko":414," Ku":695," Kw":4009,"Au":181," Ma":4258," Mb":461,"Ar":475,"As":222," Mk":3388,"Ba":771," Mi":685," Mj":478," Me":615,"Af":445,"he ":544,"Ag":372," Lo":213,"Am":241,"An":463,"Ap":290," Lu":315,"Al":840," Ne":518,"Bu":429,"Br":278," Na":464,"Ca":592," Ni":435,"Bi":308," Mt":420,"Be":362," Mp":146," Mo":643,"Bo":282," Mu":471," Mw":545,"Ku":695,"Kw":4009,"Ko":415,"hez":299,"Le":210,"Li":441,"hes":336,"her":275,"hen":226,"hem":395,"La":231,"Lu":315,"Lo":213,"Me":621,"hi ":3880,"Mi":690,"Mj":478,"Mk":3388,"Ma":4263,"Mb":461,"Mw":546,"Mu":475,"Mt":420,"Mp":146,"Mo":643,"Ni":437,"Ne":518,"Na":466," Ap":290," Am":240," An":463," Al":833,"Ny":247," Ag":372," Af":443,"No":466," Ba":766,"Ok":277," Au":181," As":222," Ar":474," Be":362," Bi":308,"hio":2603,"Gr":177,"Go":143,"hin":1991,"him":244,"hil":432,"Gu":142," Bo":282,"hii":230," Br":278," Bu":429,"his":266,"hir":394,"Ha":622," Ca":582,"hiy":239,"He":219,"II":286,"Hi":393," Ch":768,"Ho":179,"Hu":173," Co":362,"K ":152,"Id":148," Da":365," Di":167,"In":317," De":495,"Ik":143,"Il":226,"Is":141,"It":181," Do":469,"Ir":284,"Ja":792,"Ji":536,"Je":286,"Jo":255,"Ju":691,"Ka":2234,"Has":225,"ho ":334," Fe":311,"Ki":3577," Fa":159,"Ke":708,"Us":172,"Ut":325,"Ur":181,"go 
":920,"Un":355,"Uk":150,"Ul":189,"Ui":244,"Uj":249,"Uh":170,"Uf":251,"Uc":175,"Tu":237,"To":205,"Th":275,"Te":258," Wi":3377,"Ta":3841," We":188," Wa":1003,"St":260,"Su":178,"Wi":3380,"Wa":1003,"We":189," Zi":141," Za":152,"Vi":670," Yo":250,"Pr":150,"Pe":270,"goz":233,"Pa":858,"Po":195,"Pi":163,"gom":190,"gon":205,"gos":279,"gor":306,"Se":532,"gu ":424,"Si":424,"Sh":518,"So":239,"Ru":370,"Sa":668,"Re":188,"Ri":138,"Ro":385,"Ra":354," Po":195,"guj":253," Pi":163," Pe":270," Pa":857," Ny":247," No":466," Ok":277," Ra":354,"b ":211," Ro":385,"gwe":166," Re":188," Ri":138,"gwa":280,"guz":429," Pr":150,"a ":143240," Su":178," St":248," Ta":3838," Th":274,"Yo":250," Te":257," To":205," Ru":370," Sa":668," Sh":517," Si":421," Se":528," So":239," Vi":666," Tu":231,"Za":152,"Zi":141," Uc":175," Uf":251," Uh":170," Ui":243," Uj":249," Uk":150," Ul":189," Un":355," Ur":181," Us":172," Ut":325," ja":134,"iak":142,"i ":52347,"ian":874," ji":4522,"ias":364,"ge":1928,"iar":235," je":226,"ga":2900," im":145," in":3363," ik":274," il":4878,"fi":1075,"fr":504,"fu":1927,"fo":752,"ibl":142,"ibi":603," ka":16147,"gw":483," ki":3027,"he":2541,"ibu":4111,"ha":8898,"gl":145,"gi":1836,"gh":1233,"gu":1858,"iba":566," ju":300,"go":2336,"du":838,"dw":136,"g ":607," ha":1606,"ea":1091,"eb":539," he":144,"ec":251,"ed":686,"de":1841,"di":4816,"dh":617,"do":1639,"ia ":9119,"dr":203,"ew":912,"ex":163,"eu":261,"ev":332,"ey":739,"ez":1828,"fa":6104,"h ":704," id":219,"fe":174,"eh":737," hi":990,"eg":644,"ef":303,"ee":307,"el":2120,"ek":2577,"ej":155," ho":139,"ei":650,"ep":643,"eo":1165,"en":9965,"em":2423,"et":1296," hu":4749,"es":2258,"er":4147," nj":147,"ca":364," ni":9330,"e ":10467," ng":147," nd":690,"bw":843," nc":2455," na":6269,"br":408,"bu":5373,"bo":2905," mw":6857,"bl":321," mu":4335," mt":648," ms":331,"bi":2134," mp":280," mo":680," mn":1501,"be":1280," mm":157,"ifu":393,"da":3239,"f ":246,"ifo":606," of":164,"co":390," ny":523,"ck":301,"ci":283,"ch":7388,"ce":365,"ifa":585," 
le":184,"c ":192," li":859," la":7153," ku":5668,"ich":830," kw":2736," km":140,"ica":140," ko":150," me":184," mf":368,"az":6015,"ay":5308," mi":1257,"ba":6016," mj":5191," mk":1617,"d ":1205,"at":22079,"as":4908,"ar":9773," ma":3449," mb":469,"aw":1490," mc":155,"av":414,"au":1667," lu":341,"ak":14678,"al":8458,"idi":551,"ai":5267,"aj":1998,"ao":6210,"ap":5739,"ide":157,"am":9111,"an":29556,"ac":1224,"ad":3126,"ida":813,"aa":1773,"ab":2568,"ag":1596,"ah":1414,"ae":682,"af":1092,"nu":591,"nt":1270,"ns":4895,"no":1318,"nn":478," am":1335," an":488,"nz":5093," ai":153,"iin":242,"ny":7307," aj":134," ak":183," al":2589,"of":4380," au":941,"oc":308,"od":678,"oa":4118,"ob":631," at":195," as":220,"om":1846,"on":3853,"ok":2328," ba":679,"ol":1930,"oi":1488,"oj":1425,"og":855,"oh":360,"ija":140,"ot":1280," bi":222,"os":1066,"ov":580,"ou":534,"ije":137,"op":845,"oo":318,"or":2938,"iji":1232,"r ":1622,"ow":244,"oz":397,"oy":154,"pe":836,"pa":6921,"po":1264,"ph":151,"pi":2193,"ika":13864,"lo":1408,"lm":337,"Ida":135,"ll":791,"ls":182,"iga":224,"ii ":525,"lu":868,"lt":178,"igh":170,"igi":384,"ly":147,"o ":24303,"mc":173,"igo":169,"ma":8274,"mb":6660,"mh":261,"me":2630,"mf":564,"mk":1733,"ml":210,"mi":3477,"mj":5199,"mn":1546,"mm":321,"mp":578,"ihe":138,"mo":6079,"mr":140,"mt":753,"ms":447,"mu":6394,"mw":6988,"ihi":187,"p ":352,"na":23279,"nc":2788,"nd":5575,"ne":2353,"ng":6858,"ni":24361,"nj":567,"nk":135,"imo":196," es":141," en":369,"ju":713,"imf":161,"ime":354," el":223,"jo":133,"imi":180,"ki":6922,"kh":154,"ind":834,"ke":2748,"ina":8001," fa":353,"ka":45110,"imu":392,"m ":727," fu":177,"kw":3124,"ino":181,"ks":210,"kt":463,"ku":10532,"ins":133,"ko":5804,"ine":479,"ing":1959,"kr":669," fi":274,"ini":4598,"km":156,"li":17984,"le":2997,"ld":221,"lf":159,"la":14880,"lb":250,"iny":275,"n ":3144,"iko":612,"hw":492,"ht":198,"hu":6825,"iki":2488,"hi":11111," ch":2090,"hn":150,"ho":1180,"ila":4379,"id":1813,"ic":1403,"ib":5595,"ia":11251,"ih":490,"in ":378,"ig":1252," 
da":146,"if":1790,"ie":672,"iku":2496,"k ":628,"ilo":373,"ir":1982,"is":9376,"it":2904,"ill":288,"iu":466,"iv":385,"iw":2556,"ii":989,"ij":1580,"ik":19966," de":224,"ili":8251,"il":13887,"im":4832,"in":17333,"io":4395,"ile":321,"ip":1169,"ima":914,"je":934,"imb":2471,"io ":2960,"ji":17145,"iz":1362,"iy":4997," du":302,"l ":1018,"ja":2368,"z ":191,"wi":1773,"wo":202,"vy":671," za":1702,"y ":1239,"wa":56175," zi":456,"we":2203,"vi":1632,"vu":418,"vo":138,"uz":1451,"uw":2877,"uv":252,"uu":3068," ye":258,"ve":578," ya":17428,"va":328,"x ":213,"ui":563,"uj":4429,"uk":1643,"ul":2575,"ue":357,"uf":741,"ug":901,"uh":626,"ur":1919,"us":3274,"ut":2784,"um":5397,"un":5099,"uo":368,"up":1077,"ty":166,"tu":2287,"tt":391,"tw":473,"ub":1112,"ua":2111,"ud":534,"uc":476,"w ":435,"to":4407,"huk":345,"hul":146,"tl":220,"ts":343,"tr":455,"te":2280,"ti":12092,"th":999,"ta":14867,"su":644,"ss":500,"st":1842,"sw":308,"sl":142,"sk":865,"sm":139,"sp":289,"so":683,"sc":179,"se":5649,"sh":8151,"si":4764,"u ":13704,"sa":7736,"rr":220,"rs":467,"rt":620,"ru":2279,"ry":287,"ro":1786,"rn":619,"rm":257,"rl":223,"rk":320,"ri":8157,"rg":403,"re":3855,"rd":556,"rc":143,"rb":136,"ra":5018,"t ":1231,"s ":3025,"pt":348,"pu":357,"pw":193,"pr":381," sa":589," se":4480," si":369," sh":318," ra":432," ri":188,"hwa":473,"huo":175,"hum":2789,"hun":282,"hus":506,"hur":418,"huu":1333," pe":176," pa":632," pi":931," wa":33135," we":275," vy":396," wi":862," vi":1013," uc":144,"zi":8597,"ze":368,"za":8043," tu":189,"zw":257," us":165," ut":249," up":502," um":247,"zu":272," un":1571," uk":210,"zo":952," ul":573," uh":139," ta":1410,"ye":2395,"ya":24129,"yu":306," to":170," th":289,"yo":5888," te":201,"yi":4283,"Apr":266,"Asi":146,"Aru":195,"far":316,"fam":283,"fan":4203,"fal":292,"fa ":488,"eya":259,"Bah":237,"Bar":140,"eza":1136,"ezo":172,"ezi":237,"eta":229,"ete":154,"eti":253,"est":247,"ett":212,"ew ":355,"evi":165,"ewe":148,"ey ":361,"ewa":358,"er ":615,"epa":149,"es 
":640,"ept":299,"eri":650,"ere":660,"era":456,"Afr":406,"esh":359,"ese":306,"esa":279,"eru":498,"Ago":254,"ert":152,"ers":339,"eku":184,"en ":297,"ela":204,"ele":786,"eli":360,"ell":177,"eo ":852,"emb":1055,"ema":157,"eme":314,"emi":276,"emu":365,"ene":704,"eng":671,"ena":283,"end":498,"eno":221,"eni":486,"ens":4087,"ent":441,"eny":1803,"Ali":478,"ege":351,"Ame":158,"ehe":647,"Ana":176,"el ":260,"eke":267,"eka":1754,"giz":193,"gir":232,"gin":349,"gid":165,"ght":136,"gha":925,"gi ":572,"gen":204,"ger":781,"ge ":611,"gaz":140,"gar":155,"gan":693,"ga ":1334,"Cal":307,"fup":194,"Bib":137,"fua":317,"fum":143,"fun":167,"fri":445,"fu ":810,"for":356,"fo ":342,"fil":269,"fik":168,"fiz":146,"da ":1525,"de ":752,"dad":386,"dae":220,"dar":151,"dan":305,"dam":173,"Des":272,"Dar":167,"Chi":216,"Chu":136,"Cha":300,"ch ":165,"cha":2430,"chu":596,"ck ":143,"che":571,"chi":3152,"cho":370,"ed ":154,"ebr":313,"ea ":663,"ei ":346,"efu":197,"edi":297,"ee ":156,"don":150,"dom":308,"dol":151,"dog":335,"dun":335,"dha":302,"dia":330,"dhi":240,"der":146,"deg":261,"del":152,"di ":2661,"do ":429,"Dod":240,"diy":201,"din":291,"dis":387,"dik":302,"ri ":2373,"rez":420,"rea":148,"ref":154,"reh":266,"ren":163,"rek":1672,"re ":305,"rd ":213,"ras":256,"rat":173,"Ni ":218,"New":381,"rai":160,"ran":867,"ram":226,"rab":297,"rad":150,"ron":135,"rog":253,"rne":169,"rni":283,"ro ":593,"riw":166,"ris":508,"ril":300,"rik":1688,"rin":373,"ria":769,"rib":1011,"ric":160,"rk ":191,"ruf":262,"rum":452,"ruk":315,"rus":423,"ry ":194,"rse":228,"Nya":144,"rua":234,"rt ":160,"ru ":273,"sab":458,"sac":139,"san":482,"sas":180,"sa ":5643,"Nov":242,"sha":1745,"sho":271,"she":240,"shi":5099,"si ":1365,"siw":355,"sia":608,"shw":458,"shu":187,"sis":157,"sin":881,"sil":283,"sim":158,"sik":319,"sey":212,"ser":175,"set":147,"Okt":259,"seh":319,"sen":4083,"sem":335,"spa":151,"son":242,"su ":198,"st ":167,"sko":136,"ska":599,"so 
":134,"ssa":198,"ste":192,"sta":295,"sto":444,"sti":401,"str":197,"swa":181,"tai":280,"taj":233,"tak":462,"tal":339,"taa":220,"tab":242,"taw":344,"tat":292,"tar":668,"tao":3872,"tan":641,"tam":288,"te ":507,"ta ":6480,"pa ":765,"pat":4120,"pak":235,"pap":248,"pam":300,"pan":895,"pi ":233,"ped":156,"Pap":368,"pia":789,"pil":189,"pin":267,"pis":162,"pit":144,"po ":743,"pte":287,"pri":298,"pwa":189,"Rai":176,"ra ":1932,"ngo":958,"ngi":1065,"ngu":1084,"ngw":363,"ni ":18823,"Iri":209,"nge":937,"nga":1742,"Ita":147,"neo":505,"nes":161,"ng ":405,"nch":2504,"ne ":911,"ndu":263,"ndo":574,"ndi":1835,"nde":1085,"nda":1162,"nak":251,"nal":257,"nam":1855,"nan":221,"nao":1457,"nap":185,"nac":183,"nad":288,"naf":402,"nai":158,"naj":196,"nd ":409,"nat":353,"nas":439,"nay":454,"na ":15738,"Jan":271,"Jam":281,"nya":1379,"Jer":215,"nye":1338,"nyi":4239,"nus":133,"nua":282,"Jim":174,"Jin":277,"nti":403,"nta":151,"nte":177,"nsi":211,"nsa":4269,"nt ":232,"ns ":140,"nne":236,"no ":948,"nji":138,"nja":269,"Joh":134,"nia":4199,"nis":530,"ogo":593,"ois":1291,"oji":173,"oja":1149,"Jul":285,"Jun":259,"odo":288,"of ":150,"ofu":134,"ofa":3991,"oa ":3810,"oan":188,"oba":375,"nza":3817,"nzi":1111,"Kai":144,"Kag":175,"Kal":167,"Kan":354,"Kat":474,"Kas":372,"Kar":232,"Ken":632,"ozi":165,"Kis":329,"Kir":165,"Kit":204,"Kin":148,"Kib":138,"Kia":309,"ote":378,"Kik":287,"Kil":453,"Kim":202,"oto":331,"Kig":295,"Kii":249,"ost":309,"ota":195,"ove":320,"opo":325,"os ":178,"or ":161,"Kon":197,"orn":300,"oro":673,"ore":188,"ori":369,"ort":147,"ora":378,"ola":427,"on ":838,"oli":431,"ole":357,"olo":331,"oka":1580,"oke":163,"oko":236,"oku":141,"ona":230,"ond":383,"one":151,"ong":860,"oni":784,"oma":766,"omb":303,"omi":249,"omo":182,"op ":143,"la ":8089,"le 
":1011,"Kwa":3975,"laa":157,"lai":293,"lak":564,"lan":660,"lam":497,"lat":186,"lay":3727,"Kus":393,"lba":165,"kuz":236,"kuw":2713,"kuu":1305,"kut":1795,"kus":492,"kur":190,"kup":186,"kun":409,"kum":210,"kul":297,"kuj":187,"kwe":591,"kwa":2512,"kub":762,"kuf":233,"kuh":134,"kua":620,"kto":308,"lom":136,"loj":136,"lme":241,"Lin":225,"lug":350,"lu ":155,"li ":2787,"lez":192,"lew":193,"lev":140,"les":155,"leo":178,"lem":198,"len":254,"lek":133,"lo ":347,"lla":138,"lle":153,"lli":198,"ll ":147,"lit":241,"lis":337,"lip":257,"lio":738,"lin":627,"lim":922,"liz":411,"liy":4415,"liw":979,"lic":340,"lia":1497,"lik":2742,"lil":529,"lih":179,"lif":397,"ma ":2611,"mb ":139,"maa":449,"maj":397,"mak":522,"mad":206,"mae":140,"mag":342,"mar":439,"mas":613,"mal":159,"mam":161,"man":1055,"mat":406,"mba":3047,"mbi":361,"mbe":389,"mbo":2343,"me ":516,"mbu":267,"mch":170,"met":211,"mer":252,"men":492,"mfa":152,"mez":387,"mfu":373,"Mei":250,"Man":216,"Mar":1940,"Mas":472,"Mag":282,"Mak":206,"Mac":287,"Mbe":273,"mpi":142,"mon":163,"moj":1127,"mpa":160,"Mor":279,"mu ":1602,"mtu":175,"mto":226,"Mic":182,"Mis":147,"msh":144,"mta":228,"mwe":383,"mwi":345,"Mko":3178,"mwa":6205,"Mku":138,"Mji":464,"muj":3839,"muz":374,"mhu":232,"Mtw":147,"mi ":359,"mji":5175,"min":192,"mil":749,"Mwa":460,"mit":295,"mia":630,"mik":321,"mo ":4413,"mku":1038,"mko":539,"mna":1501,"mmo":145,"Wik":149,"Wil":3077,"Wan":148,"zwa":252,"zi ":5785,"zai":249,"zaj":254,"zam":177,"zan":3194,"zal":783,"zar":173,"zo ":612,"zia":533,"zin":815,"zil":197,"zik":548,"zis":240,"一":303,"yof":3874,"yot":286,"za ":2981,"ye ":1320,"yen":237,"ya ":21762,"yar":252,"yan":567,"yao":167,"yam":250,"yak":657,"yo ":973,"yin":213,"yik":3954,"一一":144,"Tan":3407,"Tab":164,"Shi":315,"Sin":201,"Sep":283,"we ":401,"wez":265,"wen":1037,"wim":286,"wil":741,"Sal":197,"vyo":257,"wa ":33121,"wap":4111,"wan":3901,"wal":617,"wam":169,"wak":9923,"way":141,"wat":368,"war":238,"was":172,"wai":2667,"wah":176,"vu 
":165,"vya":351,"vil":200,"vin":183,"vit":187,"vis":284,"Rom":180,"vem":244,"Vij":328,"uzi":743,"uza":470,"Uje":235,"uwa":2760,"uvu":174,"ush":417,"usi":1319,"use":183,"usa":176,"uu ":2892,"usu":216,"ust":207,"uso":141,"uti":211,"ute":137,"uta":560,"Uin":218,"utu":215,"uto":1436,"us ":536,"Ung":252,"ura":183,"ure":140,"uri":491,"uru":630,"unz":137,"Ula":150,"upa":554,"upi":311,"umu":162,"umi":484,"umo":2705,"uma":686,"umb":661,"ume":297,"uo ":238,"uni":940,"und":747,"una":1741,"ung":1193,"uku":302,"uko":457,"uki":429,"uka":247,"ulu":258,"uli":1405,"ule":192,"ula":478,"ukw":139,"uhu":267,"uji":4010,"uja":302,"Utu":261,"ugh":514,"ufu":352,"uhi":136,"ugu":137,"udi":174,"ubw":695,"uch":343,"ufa":176,"ufi":189,"ua ":369,"uat":317,"uar":494,"uan":690,"uba":185,"Uch":175,"ty ":146,"twa":450,"tur":369,"tun":270,"tum":424,"Ufa":219,"ts ":214,"tu ":896,"The":164,"tts":142,"to ":986,"tob":268,"tom":167,"ton":281,"tok":1553,"tol":482,"tor":246,"tik":8147,"tis":158,"tin":351,"tio":199,"thu":171,"tia":156,"tem":384,"ten":273,"tel":171,"th ":160,"ter":432,"ti ":2389,"the":225,"thi":213,"biw":209,"bis":191,"bil":315,"bin":256,"bo ":2326,"bli":173,"bor":262,"be ":229,"bam":230,"ban":516,"bal":619,"bah":147,"baa":227,"bab":179,"bay":333,"bar":432,"bao":277,"bi ":662,"ber":216,"bel":151,"bey":251,"bia":222,"ce ":176,"bu ":4649,"bru":221,"bur":149,"bun":177,"bwa":786,"aka":10583,"am ":337,"ake":1982,"aki":644,"aji":1355,"aju":170,"al ":304,"aja":293,"ain":393,"air":222,"ais":2933,"aif":267,"aid":437,"ahi":308,"aha":751,"agh":475,"agu":395,"aoi":1233,"anu":344,"anz":4756,"any":4453,"ano":638,"ann":141,"ant":323,"ans":490,"ane":261,"ang":1660,"ani":7747,"anj":260,"ana":4702,"anc":133,"and":2300,"amu":1047,"amo":1890,"amp":179,"amh":222,"ami":838,"ame":637,"amb":1658,"ama":1868,"ao ":4649,"alo":269,"alm":262,"all":133,"ali":5324,"ale":476,"ala":1026,"alb":152,"an ":1167,"akr":376,"aku":502,"ako":215,"aba":751,"abe":140,"abi":660,"abo":208,"abu":582,"ae 
":291,"aad":302,"aan":389,"aal":140,"aam":185,"aar":236,"aa ":361,"afi":303,"ai ":477,"aga":223,"age":227,"afu":225,"aen":162,"ael":172,"afa":411,"ado":269,"adh":288,"adi":1538,"ach":840,"ada":637,"azo":205,"azi":5401,"aza":186,"ayo":638,"aya":4140,"aye":284,"ba ":2178,"are":1998,"ard":317,"ara":2057,"aro":249,"ari":3153,"aru":316,"art":243,"au ":993,"asa":1084,"asi":1169,"ash":895,"ask":665,"ar ":568,"apa":4869,"api":162,"apo":406,"as ":271,"aut":148,"awa":1126,"awi":190,"ata":10070,"ast":167,"ass":197,"ato":634,"ate":225,"ati":9962,"ath":135,"atu":749},"n_words":[1316698,1560317,1165243],"name":"sw"} --------------------------------------------------------------------------------