├── libs ├── __init__.py ├── langdetect │ ├── tests │ │ ├── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ ├── test_unicode_block.py │ │ │ └── test_lang_profile.py │ │ ├── test_language.py │ │ └── test_detector.py │ ├── utils │ │ ├── __init__.py │ │ ├── messages.py │ │ ├── lang_profile.py │ │ └── ngram.py │ ├── __init__.py │ ├── language.py │ ├── lang_detect_exception.py │ ├── detector_factory.py │ ├── detector.py │ └── profiles │ │ ├── gu │ │ ├── so │ │ └── sw └── subcleaner │ ├── __init__.py │ ├── languages │ ├── __init__.py │ └── languages.py │ ├── cleaner │ ├── __init__.py │ ├── detectors │ │ ├── __init__.py │ │ ├── chain.py │ │ └── wedged.py │ ├── punishers │ │ ├── __init__.py │ │ ├── time.py │ │ ├── regex.py │ │ ├── duplicate.py │ │ └── adjacency.py │ └── cleaner.py │ ├── settings │ ├── __init__.py │ ├── log_config.py │ ├── config.py │ └── args.py │ ├── report_generator.py │ ├── sub_block.py │ ├── regex_lists.py │ ├── main.py │ └── subtitle.py ├── regex_profiles ├── README.txt └── default │ ├── indonesian.conf │ ├── no_profile.conf │ ├── hebrew.conf │ ├── english.conf │ ├── svenska.conf │ ├── spanish.conf │ ├── global.conf │ ├── dutch.conf │ └── portuguese.conf ├── subcleaner.py ├── default_config └── subcleaner.conf ├── README.md └── .gitignore /libs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/langdetect/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/langdetect/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/subcleaner/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 
-------------------------------------------------------------------------------- /libs/langdetect/tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/subcleaner/languages/__init__.py: -------------------------------------------------------------------------------- 1 | from .languages import is_language, get_2letter_code 2 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/__init__.py: -------------------------------------------------------------------------------- 1 | from .cleaner import find_ads, remove_ads, fix_overlap, unscramble, reset 2 | -------------------------------------------------------------------------------- /libs/subcleaner/settings/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from . import args 3 | from . 
import log_config 4 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .wedged import detect_wedged 2 | from .chain import detect_chain 3 | -------------------------------------------------------------------------------- /libs/langdetect/__init__.py: -------------------------------------------------------------------------------- 1 | from .detector_factory import DetectorFactory, PROFILES_DIRECTORY, detect, detect_langs 2 | from .lang_detect_exception import LangDetectException 3 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/__init__.py: -------------------------------------------------------------------------------- 1 | from .adjacency import punish_ad_adjacency 2 | from .duplicate import punish_clone_blocks, move_duplicated, reset_duplicate 3 | from .regex import punish_regex_matches 4 | from .time import punish_quick_first_block, punish_short_duration 5 | -------------------------------------------------------------------------------- /libs/langdetect/language.py: -------------------------------------------------------------------------------- 1 | class Language(object): 2 | ''' 3 | Language is to store the detected language. 4 | Detector.get_probabilities() returns a list of Languages. 
5 | ''' 6 | 7 | def __init__(self, lang, prob): 8 | self.lang = lang 9 | self.prob = prob 10 | 11 | def __repr__(self): 12 | if self.lang is None: 13 | return '' 14 | return '%s:%s' % (self.lang, self.prob) 15 | 16 | def __lt__(self, other): 17 | return self.prob < other.prob 18 | -------------------------------------------------------------------------------- /libs/langdetect/lang_detect_exception.py: -------------------------------------------------------------------------------- 1 | _error_codes = { 2 | 'NoTextError': 0, 3 | 'FormatError': 1, 4 | 'FileLoadError': 2, 5 | 'DuplicateLangError': 3, 6 | 'NeedLoadProfileError': 4, 7 | 'CantDetectError': 5, 8 | 'CantOpenTrainData': 6, 9 | 'TrainDataFormatError': 7, 10 | 'InitParamError': 8, 11 | } 12 | 13 | ErrorCode = type('ErrorCode', (), _error_codes) 14 | 15 | 16 | class LangDetectException(Exception): 17 | def __init__(self, code, message): 18 | super(LangDetectException, self).__init__(message) 19 | self.code = code 20 | 21 | def get_code(self): 22 | return self.code 23 | -------------------------------------------------------------------------------- /libs/langdetect/utils/messages.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | 4 | class Messages(object): 5 | MESSAGES_FILENAME = path.join(path.dirname(__file__), 'messages.properties') 6 | 7 | def __init__(self): 8 | self.messages = {} 9 | with open(self.MESSAGES_FILENAME, 'r') as f: 10 | for line in f: 11 | key, _, value = line.strip().partition('=') 12 | self.messages[key] = value.encode().decode('unicode_escape') 13 | 14 | def get_string(self, key): 15 | return self.messages.get(key, '!%s!' 
% key) 16 | 17 | 18 | _messages = None 19 | def get_string(key): 20 | global _messages 21 | if _messages is None: 22 | _messages = Messages() 23 | return _messages.get_string(key) 24 | -------------------------------------------------------------------------------- /regex_profiles/README.txt: -------------------------------------------------------------------------------- 1 | Put files in this directory to add custom regex profiles beyond the included profiles. 2 | Any file put here will override identically named files in the default folder. 3 | 4 | Each profile checks its associated language codes individually. Multiple 5 | regex profiles can therefore run against the same subtitle if the same language is specified in the profiles. 6 | You can disable all default profiles in the subcleaner.conf file. 7 | 8 | Regex profiles need to have to a .conf extension. 9 | Profiles starting with a "." will be also be ignored. 10 | 11 | Use one of the default profiles as a template to avoid unwanted results. but make sure you go over all the 12 | purge regexes so that they don't contain any words that are real words in your language. 
13 | 14 | -------------------------------------------------------------------------------- /libs/langdetect/tests/test_language.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from libs.langdetect.language import Language 4 | 5 | 6 | class LanguageTest(unittest.TestCase): 7 | def test_language(self): 8 | lang = Language(None, 0) 9 | self.assertIsNone(lang.lang) 10 | self.assertEqual(lang.prob, 0.0, 0.0001) 11 | self.assertEqual(str(lang), '') 12 | 13 | lang2 = Language('en', 1.0) 14 | self.assertEqual(lang2.lang, 'en') 15 | self.assertEqual(lang2.prob, 1.0, 0.0001) 16 | self.assertEqual(str(lang2), 'en:1.0') 17 | 18 | def test_cmp(self): 19 | lang1 = Language('a', 0.1) 20 | lang2 = Language('b', 0.5) 21 | 22 | self.assertTrue(lang1 < lang2) 23 | self.assertFalse(lang1 == lang2) 24 | self.assertFalse(lang1 > lang1) 25 | -------------------------------------------------------------------------------- /subcleaner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from configparser import DuplicateOptionError 3 | 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | if __name__ == '__main__': 9 | try: 10 | from libs.subcleaner import main 11 | main.main() 12 | exit(0) 13 | except KeyboardInterrupt: 14 | logger.warning("subcleaner was interrupted.") 15 | exit(0) 16 | except PermissionError as e: 17 | logger.error("subcleaner ran into a permission error. 
Permission denied to: \"" + e.filename + "\"") 18 | exit(1) 19 | except DuplicateOptionError as e: 20 | logger.error("subcleaner was unable to read config file \"" + e.args[2].name + 21 | "\" because there are multiple keys with the same name:\n" 22 | "Option '" + e.option + "' already exists in section '" + e.section + "'") 23 | exit(1) 24 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/time.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from datetime import timedelta 3 | 4 | from libs.subcleaner.subtitle import Subtitle 5 | 6 | 7 | def punish_quick_first_block(subtitle: Subtitle) -> None: 8 | if not subtitle.blocks: 9 | return 10 | block = subtitle.blocks[0] 11 | if block.start_time < timedelta(seconds=1): 12 | block.regex_matches += 1 13 | block.hints.append("quick_start") 14 | 15 | 16 | def punish_short_duration(subtitle: Subtitle) -> None: 17 | for block in subtitle.blocks: 18 | if block.end_time - block.start_time < datetime.timedelta(milliseconds=8/30*1000): 19 | block.regex_matches += 1 20 | block.hints.append("short duration") 21 | 22 | if block.end_time - block.start_time < datetime.timedelta(milliseconds=3/30*1000): 23 | block.regex_matches += 1 24 | block.hints.append("very short duration") 25 | -------------------------------------------------------------------------------- /libs/subcleaner/settings/log_config.py: -------------------------------------------------------------------------------- 1 | import logging.handlers 2 | import sys 3 | from . 
import args, config 4 | 5 | # formatters 6 | time_formatter = logging.Formatter("{asctime} - {levelname:>8}: {message}", style="{", datefmt='%Y-%m-%d_%H:%M:%S') 7 | formatter = logging.Formatter("{levelname:>8}: {message}", style="{",) 8 | 9 | base_logger = logging.getLogger() 10 | base_logger.setLevel(logging.INFO) 11 | base_logger.handlers.clear() 12 | 13 | # file handler 14 | if not args.no_log: 15 | file_handler = logging.handlers.RotatingFileHandler(config.log_file, maxBytes=10_000_000, backupCount=10, encoding='utf8') 16 | file_handler.setFormatter(time_formatter) 17 | file_handler.setLevel(logging.INFO) 18 | if args.errors_only: 19 | file_handler.setLevel(logging.ERROR) 20 | base_logger.addHandler(file_handler) 21 | 22 | # stdout handler 23 | stout_handler = logging.StreamHandler(sys.stdout) 24 | stout_handler.setFormatter(formatter) 25 | stout_handler.setLevel(logging.INFO) 26 | if args.silent: 27 | stout_handler.setLevel(logging.WARNING) 28 | if args.errors_only: 29 | stout_handler.setLevel(logging.ERROR) 30 | if args.debug: 31 | stout_handler.setLevel(logging.DEBUG) 32 | base_logger.addHandler(stout_handler) 33 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Tuple, Pattern 3 | 4 | from libs.subcleaner import regex_lists 5 | from libs.subcleaner.sub_block import SubBlock 6 | from libs.subcleaner.subtitle import Subtitle 7 | 8 | 9 | def punish_regex_matches(subtitle: Subtitle) -> None: 10 | for block in subtitle.blocks: 11 | _run_regex_on_block(block, regex_lists.get_purge_regex(subtitle.language), 3) 12 | _run_regex_on_block(block, regex_lists.get_warning_regex(subtitle.language), 1) 13 | 14 | 15 | def _run_regex_on_block(block: SubBlock, regex_list: List[Tuple[str, Pattern]], punishment: int) -> None: 16 | clean_content = " 
".join(block.content.replace("-\n", "-").split()) 17 | for regex in regex_list: 18 | try: 19 | result = re.findall(regex[1], clean_content) 20 | if result and isinstance(result[0], str): 21 | result = [r.lower() for r in result] 22 | result = set(result) 23 | else: 24 | result = set([t[0].lower() for t in result]) 25 | 26 | except re.error as e: 27 | raise ValueError(f"regex {regex[0]} is miss configured: {e.msg}") 28 | if result: 29 | block.regex_matches += punishment * len(result) 30 | for i in range(0, len(result)): 31 | block.hints.append(regex[0]) 32 | -------------------------------------------------------------------------------- /default_config/subcleaner.conf: -------------------------------------------------------------------------------- 1 | [SETTINGS] 2 | # main config for subcleaner. 3 | # 4 | 5 | require_language_profile = true 6 | # Set "require_language_profile" to false if you wish to clean subtitles in languages that don't have at least one 7 | # language profile associated with it. 8 | # bool [default: true] 9 | # 10 | 11 | 12 | relative_path_base = . 13 | # The script will run relative paths from the "relative_path_base" directory instead of your working directory if it exist. 14 | # Recommended to point this to your library base for ease of use. i.e: "/storage/media/library" 15 | # string [default: .] 16 | # 17 | 18 | 19 | use_defaults = true 20 | # Set "use_defaults" to false if you wish to disable all default regex configs. 21 | # bool [default: true] 22 | # 23 | 24 | 25 | default_language = 26 | # Set which language code subtitles is considered to have if the script is called without specifying language. 27 | # leave empty to allow script to automatically detect language code. 28 | # string [default: ] 29 | # 30 | 31 | 32 | log_dir = logs/ 33 | # log path: 34 | # Relative paths are from location of script. 
35 | # string [default: logs/] 36 | # 37 | 38 | 39 | fix_overlaps = true 40 | # Subtitle overlap fixing: 41 | # As per subtitle formatting best practise, there should be at least 2 frames between each subtitle. 42 | # With this enabled it will move two subtitles that are too close to each other by moving the start/stop times 43 | # so they no longer overlap. 44 | # how much each subtitle is moved is weighted by how much text is in each subtitles. more text -> moved more. 45 | # bool [default: true] 46 | # 47 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/duplicate.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, List 3 | 4 | from libs.subcleaner.sub_block import SubBlock 5 | from libs.subcleaner.subtitle import Subtitle 6 | 7 | 8 | content_dict: Dict[str, List[SubBlock]] = {} 9 | content_dict_reverse: [SubBlock, str] = {} 10 | 11 | 12 | def punish_clone_blocks(subtitle: Subtitle) -> None: 13 | for block in subtitle.blocks: 14 | content = re.sub("[\\s.,:_-]", "", block.content) 15 | content_dict_reverse[block] = content 16 | if content not in content_dict: 17 | content_dict[content] = [] 18 | content_dict[content].append(block) 19 | 20 | for duplicate_list in content_dict.values(): 21 | if len(duplicate_list) <= 1: 22 | continue 23 | for block in duplicate_list: 24 | if "♪" in block.content: 25 | continue 26 | block.regex_matches += 1 27 | block.hints.append("similar_content") 28 | 29 | 30 | def move_duplicated(subtitle: Subtitle) -> None: 31 | for ad_block in subtitle.ad_blocks.copy(): 32 | if "similar_content" not in ad_block.hints: 33 | continue 34 | for block in content_dict[content_dict_reverse[ad_block]]: 35 | subtitle.ad(block) 36 | 37 | for warn_block in subtitle.warning_blocks.copy(): 38 | if "similar_content" not in warn_block.hints: 39 | continue 40 | for block in 
content_dict[content_dict_reverse[warn_block]]: 41 | subtitle.warn(block) 42 | 43 | 44 | def reset_duplicate(): 45 | content_dict.clear() 46 | content_dict_reverse.clear() 47 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/punishers/adjacency.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Set 3 | 4 | from libs.subcleaner.sub_block import SubBlock 5 | from libs.subcleaner.subtitle import Subtitle 6 | 7 | 8 | def punish_ad_adjacency(subtitle: Subtitle) -> None: 9 | nearby_blocks: Set[SubBlock] = set() 10 | for index in range(0, len(subtitle.blocks)): 11 | block = subtitle.blocks[index] 12 | if index < 3: 13 | nearby_blocks.add(block) 14 | block.hints.append("close_to_start") 15 | continue 16 | if index > len(subtitle.blocks) - 4: 17 | nearby_blocks.add(block) 18 | block.hints.append("close_to_end") 19 | continue 20 | for compare_block in subtitle.blocks[max(0, index - 15): min(index + 16, len(subtitle.blocks))]: 21 | if compare_block.regex_matches >= 3 and compare_block != block: 22 | nearby_blocks.add(block) 23 | block.hints.append("nearby_ad") 24 | break 25 | 26 | adjacent_blocks: Set[SubBlock] = set() 27 | for index in range(0, len(subtitle.blocks)): 28 | block = subtitle.blocks[index] 29 | for compare_block in subtitle.blocks[max(0, index - 1): min(index + 2, len(subtitle.blocks))]: 30 | if compare_block.regex_matches >= 2 and compare_block != block: 31 | if re.sub(" +", " ", block.content.replace("\n", " ").strip()).count(" ") <= 4: 32 | adjacent_blocks.add(block) 33 | break 34 | 35 | for block in nearby_blocks: 36 | block.regex_matches += 1 37 | 38 | for block in adjacent_blocks: 39 | block.regex_matches += 1 40 | block.hints.append("adjacent_ad") 41 | -------------------------------------------------------------------------------- /libs/subcleaner/languages/languages.py: 
import json
from pathlib import Path
from typing import Optional, List, Dict

languages_json_file = Path(__file__).parent.joinpath("languages.json")

# Populated once by load_language_data() at import time.
_languages: List[Dict[str, str]]
_language_names: List[str] = []    # display names exactly as in the JSON
_language_codes_2: List[str] = []  # 2-letter (alpha_2) codes
_language_codes_3: List[str] = []  # 3-letter (alpha_3) codes


def load_language_data() -> None:
    """Load languages.json and build the lookup lists.

    _language_names keeps the original display casing, while the "name"
    entries inside _languages are normalized to lower_snake_case for
    get_2letter_code() lookups. Not every entry carries both alpha_2 and
    alpha_3 keys.
    """
    with open(languages_json_file, encoding="UTF-8") as json_file:
        global _languages
        _languages = json.load(json_file)
        for language in _languages:
            _language_names.append(language["name"])
            language["name"] = language["name"].lower().replace(" ", "_")
            if "alpha_2" in language:
                _language_codes_2.append(language["alpha_2"])
            if "alpha_3" in language:
                _language_codes_3.append(language["alpha_3"])


def is_language(lang: str) -> bool:
    """Return True if lang is a known 2-letter code, 3-letter code or name.

    NOTE(review): the name lookup is case-sensitive against the original
    display name - confirm whether callers expect "swedish" == "Swedish".
    """
    if len(lang) == 2:
        return lang in _language_codes_2
    if len(lang) == 3:
        return lang in _language_codes_3
    return lang in _language_names


def get_2letter_code(lang: str) -> Optional[str]:
    """Translate a 3-letter code or a language name to the 2-letter code.

    Returns the input unchanged if it is already a valid 2-letter code,
    or None when no mapping exists.
    """
    if len(lang) == 2:
        if is_language(lang):
            return lang
        return None

    if len(lang) == 3:
        code_type = "alpha_3"
    else:
        code_type = "name"

    lang = lang.lower().replace(" ", "_")
    for language in _languages:
        # .get(): entries without an "alpha_3" key would otherwise raise
        # KeyError when looking up a 3-letter code.
        if language.get(code_type) == lang:
            if "alpha_2" in language:
                return language["alpha_2"]
            return None


load_language_data()
self.assertEqual(unicode_block.unicode_block(six.u('\u0065')), unicode_block.UNICODE_BASIC_LATIN) 11 | self.assertEqual(unicode_block.unicode_block(six.u('\u007F')), unicode_block.UNICODE_BASIC_LATIN) 12 | self.assertEqual(unicode_block.unicode_block(six.u('\u0080')), unicode_block.UNICODE_LATIN_1_SUPPLEMENT) 13 | self.assertEqual(unicode_block.unicode_block(six.u('\u21FF')), unicode_block.UNICODE_ARROWS) 14 | self.assertEqual(unicode_block.unicode_block(six.u('\u2200')), unicode_block.UNICODE_MATHEMATICAL_OPERATORS) 15 | self.assertEqual(unicode_block.unicode_block(six.u('\u2201')), unicode_block.UNICODE_MATHEMATICAL_OPERATORS) 16 | self.assertEqual(unicode_block.unicode_block(six.u('\u22FF')), unicode_block.UNICODE_MATHEMATICAL_OPERATORS) 17 | self.assertEqual(unicode_block.unicode_block(six.u('\u2300')), unicode_block.UNICODE_MISCELLANEOUS_TECHNICAL) 18 | # test only on wide builds (i.e. Python 3) 19 | if len(six.u('\U0010FFFF')) == 1: 20 | self.assertEqual(unicode_block.unicode_block(six.u('\U000F0000')), unicode_block.UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A) 21 | self.assertEqual(unicode_block.unicode_block(six.u('\U000FFFFF')), unicode_block.UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_A) 22 | self.assertEqual(unicode_block.unicode_block(six.u('\U00100000')), unicode_block.UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B) 23 | self.assertEqual(unicode_block.unicode_block(six.u('\U0010FFFF')), unicode_block.UNICODE_SUPPLEMENTARY_PRIVATE_USE_AREA_B) 24 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/detectors/chain.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from typing import List 3 | 4 | from libs.subcleaner.sub_block import SubBlock 5 | from libs.subcleaner.subtitle import Subtitle 6 | 7 | 8 | def detect_chain(subtitle: Subtitle) -> None: 9 | chain: List[SubBlock] = [] 10 | identical_count = 0 11 | for i in range(1, 
len(subtitle.blocks)): 12 | block = subtitle.blocks[i] 13 | pre_block = subtitle.blocks[i - 1] 14 | 15 | link: bool = False 16 | 17 | if is_link(pre_block, block): 18 | if pre_block.equal_content(block): 19 | identical_count += 1 20 | link = True 21 | 22 | if link: 23 | if not chain: 24 | chain.append(pre_block) 25 | chain.append(block) 26 | continue 27 | 28 | if len(chain) > 2 + identical_count or any(block in subtitle.ad_blocks for block in chain): 29 | for chain_block in chain: 30 | subtitle.ad(chain_block) 31 | chain_block.hints.append("chain_block") 32 | 33 | chain.clear() 34 | identical_count = 0 35 | if len(chain) > 2 + identical_count or any(block in subtitle.ad_blocks for block in chain): 36 | for chain_block in chain: 37 | subtitle.ad(chain_block) 38 | chain_block.hints.append("chain_block") 39 | 40 | 41 | def is_link(block: SubBlock, post_block: SubBlock) -> bool: 42 | if block.start_time > post_block.start_time: 43 | block, post_block = post_block, block 44 | if post_block.start_time - block.end_time > timedelta(milliseconds=500): 45 | return False 46 | 47 | if len(block.content) < len(post_block.content) <= len(block.content) + 2: 48 | if post_block.content.startswith(block.content) or post_block.content.endswith(block.content): 49 | return True 50 | elif len(post_block.content) < len(block.content) <= len(post_block.content) + 2: 51 | if block.content.startswith(post_block.content) or block.content.endswith(post_block.content): 52 | return True 53 | elif block.content.strip() == post_block.content.strip(): 54 | return True 55 | 56 | return False 57 | -------------------------------------------------------------------------------- /libs/langdetect/tests/utils/test_lang_profile.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import six 4 | from six.moves import xrange 5 | 6 | from libs.langdetect.utils.lang_profile import LangProfile 7 | 8 | 9 | class LangProfileText(unittest.TestCase): 10 
| def test_lang_profile(self): 11 | profile = LangProfile() 12 | self.assertIsNone(profile.name) 13 | 14 | def test_lang_profile_string_int(self): 15 | profile = LangProfile('en') 16 | self.assertEqual(profile.name, 'en') 17 | 18 | def test_add(self): 19 | profile = LangProfile('en') 20 | profile.add('a') 21 | self.assertEqual(profile.freq.get('a'), 1) 22 | profile.add('a') 23 | self.assertEqual(profile.freq.get('a'), 2) 24 | profile.omit_less_freq() 25 | 26 | def test_add_illegally1(self): 27 | profile = LangProfile() 28 | profile.add('a') # ignore 29 | self.assertIsNone(profile.freq.get('a')) # ignored 30 | 31 | def test_add_illegally2(self): 32 | profile = LangProfile('en') 33 | profile.add('a') 34 | profile.add('') # Illegal (string's length of parameter must be between 1 and 3) but ignore 35 | profile.add('abcd') # as well 36 | self.assertEqual(profile.freq.get('a'), 1) 37 | self.assertIsNone(profile.freq.get('')) # ignored 38 | self.assertIsNone(profile.freq.get('abcd')) # ignored 39 | 40 | def test_omit_less_freq(self): 41 | profile = LangProfile('en') 42 | grams = six.u('a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f').split() 43 | for i in xrange(5): 44 | for g in grams: 45 | profile.add(g) 46 | profile.add(six.u('\u3050')) 47 | 48 | self.assertEqual(profile.freq.get('a'), 5) 49 | self.assertEqual(profile.freq.get(six.u('\u3042')), 5) 50 | self.assertEqual(profile.freq.get(six.u('\u3050')), 1) 51 | profile.omit_less_freq() 52 | self.assertIsNone(profile.freq.get('a')) # omitted 53 | self.assertEqual(profile.freq.get(six.u('\u3042')), 5) 54 | self.assertIsNone(profile.freq.get(six.u('\u3050'))) # omitted 55 | 56 | def test_omit_less_freq_illegally(self): 57 | profile = LangProfile() 58 | profile.omit_less_freq() # ignore 59 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/detectors/wedged.py: 
from datetime import timedelta

from libs.subcleaner.sub_block import SubBlock
from libs.subcleaner.subtitle import Subtitle


def detect_wedged(subtitle: Subtitle) -> None:
    """Flag blocks "wedged" against already-flagged ad blocks.

    A block directly before, after or between blocks that collected 3+
    regex matches is likely part of the same injected ad. Such a block is
    warned on first suspicion and promoted to an ad when it was already a
    warning, or when it sits tightly (under 1 s gap) between two ads.
    Skips subtitles with fewer than 3 blocks.
    """
    if len(subtitle.blocks) < 3:
        return
    for index in range(0, len(subtitle.blocks)):
        block: SubBlock = subtitle.blocks[index]

        if index == 0:
            # First block: only the following neighbour exists.
            post_block: SubBlock = subtitle.blocks[index + 1]
            if post_block.regex_matches >= 3:
                # Within 1 s of the ad: escalate warning -> ad; otherwise warn.
                if (post_block.start_time - block.end_time) < timedelta(seconds=1):
                    if block in subtitle.warning_blocks:
                        subtitle.ad(block)
                    else:
                        subtitle.warn(block)
                else:
                    subtitle.warn(block)
                block.hints.append("wedged_block")
            continue

        if index == len(subtitle.blocks) - 1:
            # Last block: only the preceding neighbour exists.
            pre_block: SubBlock = subtitle.blocks[index - 1]
            if pre_block.regex_matches < 3:
                continue
            block.hints.append("wedged_block")
            if (block.start_time - pre_block.end_time) > timedelta(seconds=1):
                # Too far from the ad to escalate; just warn.
                subtitle.warn(block)
                continue

            if block in subtitle.warning_blocks:
                subtitle.ad(block)
            else:
                subtitle.warn(block)
            continue

        # Interior block: consider both neighbours.
        pre_block: SubBlock = subtitle.blocks[index - 1]
        post_block: SubBlock = subtitle.blocks[index + 1]

        if pre_block.regex_matches >= 3 and post_block.regex_matches >= 3:
            if (post_block.start_time - block.end_time) < timedelta(seconds=1) and \
                    (block.start_time - pre_block.end_time) < timedelta(seconds=1):
                # Tightly sandwiched between two ads: treat as an ad outright.
                subtitle.ad(block)
                block.hints.append("wedged_block")
                continue
            if block.regex_matches == 2:
                # Already borderline suspicious on its own merits.
                subtitle.ad(block)
                block.hints.append("wedged_block")
                continue
            else:
                subtitle.warn(block)
                block.hints.append("wedged_block")
                continue
from collections import defaultdict
import re

import libs.six as six

from .ngram import NGram


class LangProfile(object):
    '''N-gram frequency table for one language, used for training/detection.'''

    # An n-gram must exceed this count to survive omit_less_freq().
    MINIMUM_FREQ = 2
    # The pruning threshold also scales with corpus size: n_words[0] // LESS_FREQ_RATIO.
    LESS_FREQ_RATIO = 100000

    # Single Latin letter / contains-a-Latin-letter patterns, used to strip
    # Latin-alphabet noise from profiles of predominantly non-Latin scripts.
    ROMAN_CHAR_RE = re.compile(r'^[A-Za-z]$')
    ROMAN_SUBSTR_RE = re.compile(r'.*[A-Za-z].*')

    def __init__(self, name=None, freq=None, n_words=None):
        '''Create a profile.

        name    -- language name/code; None marks an inert placeholder
                   (add() and omit_less_freq() become no-ops).
        freq    -- optional initial {ngram: count} mapping.
        n_words -- optional per-length gram totals (index 0 = 1-grams, ...).
        '''
        self.freq = defaultdict(int)
        if freq is not None:
            self.freq.update(freq)

        if n_words is None:
            n_words = [0] * NGram.N_GRAM

        self.name = name
        self.n_words = n_words

    def add(self, gram):
        '''Add n-gram to profile.'''
        if self.name is None or gram is None:  # Illegal
            return
        length = len(gram)
        if length < 1 or length > NGram.N_GRAM:  # Illegal
            return
        self.n_words[length - 1] += 1
        self.freq[gram] += 1

    def omit_less_freq(self):
        '''Eliminate below less frequency n-grams and noise Latin alphabets.'''
        if self.name is None:  # Illegal
            return
        threshold = max(self.n_words[0] // self.LESS_FREQ_RATIO, self.MINIMUM_FREQ)

        roman = 0
        # list() copy: entries are deleted from self.freq while iterating.
        for key, count in list(six.iteritems(self.freq)):
            if count <= threshold:
                self.n_words[len(key)-1] -= count
                del self.freq[key]
            elif self.ROMAN_CHAR_RE.match(key):
                roman += count

        # roman check: when Latin single letters are less than a third of the
        # surviving 1-grams, treat every Latin-containing n-gram as noise.
        if roman < self.n_words[0] // 3:
            for key, count in list(six.iteritems(self.freq)):
                if self.ROMAN_SUBSTR_RE.match(key):
                    self.n_words[len(key)-1] -= count
                    del self.freq[key]

    def update(self, text):
        '''Update the language profile with (fragmented) text.
        Extract n-grams from text and add their frequency into the profile.
        '''
        if text is None:
            return
        text = NGram.normalize_vi(text)
        gram = NGram()
        for ch in text:
            gram.add_char(ch)
            # Harvest every 1..N-gram ending at this character.
            for n in range(1, NGram.N_GRAM+1):
                self.add(gram.get(n))
not log_dir.is_dir(): 53 | raise ValueError(f"log directory: {log_dir} is not a directory") 54 | log_file = log_dir.joinpath("subcleaner.log") 55 | 56 | relative_base = Path(cfg['SETTINGS'].get("relative_path_base", ".")) 57 | if not relative_base.is_absolute(): 58 | relative_base = Path.cwd().joinpath(relative_base) 59 | relative_base = relative_base.resolve() 60 | 61 | fix_overlaps = cfg['SETTINGS'].getboolean("fix_overlaps", True) 62 | 63 | default_language = cfg['SETTINGS'].get("default_language", "") 64 | if default_language in ["blank", "Blank", "", "empty", "Empty"]: 65 | default_language = None 66 | if default_language: 67 | if not languages.is_language(default_language): 68 | logger.error("Config error: default language code must be a valid ISO:639 language. Exiting") 69 | exit(1) 70 | 71 | use_english_on_all = cfg['SETTINGS'].getboolean("use_english_on_all", False) 72 | require_language_profile = cfg['SETTINGS'].getboolean("require_language_profile", True) 73 | -------------------------------------------------------------------------------- /regex_profiles/default/indonesian.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # Indonesian default config. 3 | 4 | # Coma delimited list of language codes associated with this language profile. 5 | # The script will run on all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = id 8 | 9 | 10 | 11 | # Information about how to configure the REGEX sections, read at the bottom of the file. 12 | # All regexes are case insensitive! 13 | [WARNING_REGEX] 14 | 15 | id_warn1: \b\b(iklan|situs|judi|slot|togel|bandar|deposit|cashback|donasi|donatur|penerjemah|subtitle|(di)?terjemah(an|kan))\b\b. 16 | id_warn2: \b(BNI|BRI|BCA|OVO)\b. 17 | id_warn3: \b(pasang|pemasangan|oleh|by|pulsa|AN|SK|S&K)\b. 18 | id_warn4: \b(line|instagram|ig|twitter|tg|telegram)\b. 
19 | id_warn6: \.(id|my) 20 | #regex#: Regex goes here. 21 | 22 | 23 | [PURGE_REGEX] 24 | 25 | id_purge1: IDFL|Lebah\s?Ganteng|Pein\s?Akatsuki 26 | id_purge2: trakteer|saweria|GOPAY 27 | id_purge3: (skype|line|instagram|ig|twitter|wa|whatsapp|tg|telegram)\s*&\s*(skype|line|instagram|ig|twitter|wa|whatsapp|tg|telegram) 28 | id_purge4: Alif\s?Fikri\s?Aulia|paint_lapain|EveryAgent|faridusman|NANOsubs|GradyNanoNano|Jackandthewilee 29 | id_purge5: alih tempo|alih bahasa|takarir|subtitel 30 | #regex#: Regex goes here. 31 | 32 | 33 | 34 | # 35 | # -----------------------------------------GUIDE------------------------------------------------- 36 | # 37 | 38 | # This language profile contains two lists of regex that will look for patterns. 39 | # If you wish to modify or remove any regex, feel free to do so, 40 | # but files in the default folder will be overwritten when you update the script. 41 | # You can add and remove keys as long as you don't use the same key twice. 42 | 43 | # WARNING_REGEX: 44 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 45 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 46 | # 1 warning is ignored. 47 | # 2 warnings will print the block as a WARNING in the log. 48 | # 3 warnings or more will remove the entire block. 49 | 50 | # PURGE_REGEX: 51 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 52 | 53 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 54 | # literal character you'll need to escape it with '\' 55 | # for example: matching "www." would require a regex like: "www\." 56 | # you can test regexes online on a regex-tester tool like https://regex101.com/ 57 | 58 | # Feel free to ask me any question on github.
59 | -------------------------------------------------------------------------------- /libs/langdetect/tests/test_detector.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import six 4 | 5 | from libs.langdetect.detector_factory import DetectorFactory 6 | from libs.langdetect.utils.lang_profile import LangProfile 7 | 8 | 9 | class DetectorTest(unittest.TestCase): 10 | TRAINING_EN = 'a a a b b c c d e' 11 | TRAINING_FR = 'a b b c c c d d d' 12 | TRAINING_JA = six.u('\u3042 \u3042 \u3042 \u3044 \u3046 \u3048 \u3048') 13 | JSON_LANG1 = '{"freq":{"A":3,"B":6,"C":3,"AB":2,"BC":1,"ABC":2,"BBC":1,"CBA":1},"n_words":[12,3,4],"name":"lang1"}' 14 | JSON_LANG2 = '{"freq":{"A":6,"B":3,"C":3,"AA":3,"AB":2,"ABC":1,"ABA":1,"CAA":1},"n_words":[12,5,3],"name":"lang2"}' 15 | 16 | def setUp(self): 17 | self.factory = DetectorFactory() 18 | 19 | profile_en = LangProfile('en') 20 | for w in self.TRAINING_EN.split(): 21 | profile_en.add(w) 22 | self.factory.add_profile(profile_en, 0, 3) 23 | 24 | profile_fr = LangProfile('fr') 25 | for w in self.TRAINING_FR.split(): 26 | profile_fr.add(w) 27 | self.factory.add_profile(profile_fr, 1, 3) 28 | 29 | profile_ja = LangProfile('ja') 30 | for w in self.TRAINING_JA.split(): 31 | profile_ja.add(w) 32 | self.factory.add_profile(profile_ja, 2, 3) 33 | 34 | def test_detector1(self): 35 | detect = self.factory.create() 36 | detect.append('a') 37 | self.assertEqual(detect.detect(), 'en') 38 | 39 | def test_detector2(self): 40 | detect = self.factory.create() 41 | detect.append('b d') 42 | self.assertEqual(detect.detect(), 'fr') 43 | 44 | def test_detector3(self): 45 | detect = self.factory.create() 46 | detect.append('d e') 47 | self.assertEqual(detect.detect(), 'en') 48 | 49 | def test_detector4(self): 50 | detect = self.factory.create() 51 | detect.append(six.u('\u3042\u3042\u3042\u3042a')) 52 | self.assertEqual(detect.detect(), 'ja') 53 | 54 | def test_lang_list(self): 55 | langlist = 
self.factory.get_lang_list() 56 | self.assertEqual(len(langlist), 3) 57 | self.assertEqual(langlist[0], 'en') 58 | self.assertEqual(langlist[1], 'fr') 59 | self.assertEqual(langlist[2], 'ja') 60 | 61 | def test_factory_from_json_string(self): 62 | self.factory.clear() 63 | profiles = [self.JSON_LANG1, self.JSON_LANG2] 64 | self.factory.load_json_profile(profiles) 65 | langlist = self.factory.get_lang_list() 66 | self.assertEqual(len(langlist), 2) 67 | self.assertEqual(langlist[0], 'lang1') 68 | self.assertEqual(langlist[1], 'lang2') 69 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Subcleaner 2 | Subcleaner is a python3 script for removing ads from .srt subtitle files. 3 | The script is more sophisticated than a simple search and delete per line 4 | and uses different regex profiles for different languages. 5 | Once the script has identified ad-blocks they get removed and the remaining blocks 6 | get re-indexed. 7 | 8 | Can clean entire libraries in recursive mode and works well with [Bazarr](https://github.com/morpheus65535/bazarr) 9 | directly installed or as a container from the [linuxserver/bazarr](https://hub.docker.com/r/linuxserver/bazarr) image. 10 | 11 | # Installing 12 | Cloning and running with python3 should work. 13 | 14 | ```cd /opt``` 15 | 16 | ```git clone https://github.com/KBlixt/subcleaner.git``` 17 | 18 | ```cd subcleaner``` 19 | 20 | Install the default config simply by running the script once or copy the default config into 21 | the script root directory. 22 | 23 | ```python3 ./subcleaner.py -h``` 24 | 25 | With the subcleaner.conf file installed you can modify the settings within it. 26 | The config file contains instructions about what each of the settings does.
27 | 28 | ## Bazarr 29 | Unlock the script's full potential by running it after downloading a subtitle from 30 | [Bazarr](https://github.com/morpheus65535/bazarr). Enable custom post-processing and use 31 | the command: 32 | 33 | ```python3 /opt/subcleaner/subcleaner.py "{{subtitles}}" -s``` (note the quotation) 34 | 35 | It should work 36 | right out the gate provided the paths and permissions are set up correctly. 37 | 38 | In the Bazarr log it should confirm that the script ran successfully or give you 39 | an error message that tells you what's wrong. If nothing is output then you've probably 40 | set the script path wrong. 41 | 42 | ## Docker 43 | 44 | If you run Bazarr in a docker container, as you should, 45 | make sure the Bazarr container has access to the script directory. Either 46 | mount /opt/subcleaner directly into the container as a volume or install the script inside 47 | the Bazarr config directory. 48 | 49 | I have verified that this works on the [linuxserver/bazarr](https://hub.docker.com/r/linuxserver/bazarr) image. 50 | 51 | # Languages: 52 | The script has a few language profiles included by default: 53 | 54 | - English 55 | - Spanish 56 | - Portuguese 57 | - Dutch 58 | - Indonesian 59 | - Swedish 60 | 61 | If you want to run the script against any other language you'll have to either create a profile for it 62 | or disable the requirement in the subcleaner.conf file. It's recommended to create 63 | a language profile. Read the README in the regex_profiles directory for more info and guidance. 64 | 65 | ### If you make a useful regex profile for a non-default language, PLEASE let me know! 66 | I'll review it and add it to the included default profiles. And it'll help out others that use 67 | that language in the future! :) 68 | 69 | __________________ 70 | 71 | 72 | # Thank you :) 73 | Please, if you find any issues or have any questions feel free to 74 | open an issue or discussion.
75 | 76 | __________________ 77 | ###### Future (possibly): 78 | 79 | * Automatic subtitle deletion if language don't match label. 80 | 81 | * better ui for confirming/reverting deletion of ads. 82 | 83 | * ASS support? 84 | 85 | -------------------------------------------------------------------------------- /regex_profiles/default/no_profile.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # default config that applies to any language that are missing an language profile. 3 | 4 | language_codes = no_profile 5 | 6 | 7 | 8 | # Information about how to configure the REGEX sections, read at the bottom of the file. 9 | # All regexes are case insensitive! 10 | [WARNING_REGEX] 11 | 12 | nop_warn1: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits|episode)\b 13 | nop_warn2: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?)\W+(by|from)\b 14 | 15 | nop_warn3: \b(broadcasting|UNiTED\W*TEAM|admitme|ragbear|looklive|Camikaze|SourGrass|mstoll|alire2a)\b 16 | nop_warn4: \b(normita|EhLaNa|playships|metamorfose|sunmenghao|nessundorma|Arun|seriestele|DarKsh|vothaison)\b 17 | nop_warn5: \b(anana|cRosKy|Aramis|misshu|Xenzai|KKB|ydy|swsub|divx|empiremedia|La Fabrique|benj)\b 18 | nop_warn6: \b(dawaith|MoSub|snuif|Golgi|Linwelin|Malikay|Ricana|Sadgeezer|argenteam|tiobetonh|chebinhdan)\b 19 | 20 | 21 | [PURGE_REGEX] 22 | 23 | nop_purge1: 
\b(caption(s|ed)?|subtitl(e|ed|es|ing)|fixed(?!-)|(re-?)?synch?(?!-)(ed|ro(nized)?)?|rip(ped)?(?!-)|translat(e|ed|ion|ions)|correct(ions|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|subs|provided|encoded|edit(ed|s)?)\W*(by|from)?\W*(:|;).. 24 | 25 | nop_purge2: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 26 | nop_purge3: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clear\W*way\W*law) 27 | nop_purge4: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 28 | nop_purge5: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clearway\W*law) 29 | 30 | 31 | 32 | 33 | # 34 | # -----------------------------------------GUIDE------------------------------------------------- 35 | # 36 | 37 | # This language profile contains two lists of regex that will look for patterns. 38 | # if you wish to modify or remove any regex, feel free to do so 39 | # but files in the default folder will be overwritten when you update the script. 40 | # You can add and remove keys as long as two keys don't use the same key twice. 41 | 42 | # WARNING_REGEX: 43 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 44 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 45 | # 1 warning is ignored 46 | # 2 warnings will be print the block as a WARNING in the log. 47 | # 3 warnings or more will remove the entire block. 48 | 49 | # PURGE_REGEX: 50 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 51 | 52 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 53 | # literal character you'll need to escape it with '\' 54 | # for example: matching "www." would require a regex like: "www\." 55 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 56 | 57 | # Feel free to ask me any question on github. 
58 | -------------------------------------------------------------------------------- /regex_profiles/default/hebrew.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # hebrew default config. 3 | 4 | # Coma delimited list of language codes associated with this language profile. 5 | # The script will run against all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = he, heb, hebrew 8 | 9 | 10 | 11 | # Information about how to configure the REGEX sections, read at the bottom of the file. 12 | # All regexes are case insensitive! 13 | [WARNING_REGEX] 14 | 15 | # מילים שיכולות להופיע גם בפרסומות וגם בדיבור של הסרט 16 | he_warning1: \b(גדעון|צפריר|צפייה מהנה|צפייה נעימה|(?:ו)?נערך|אלמוני|(?:ו)?הפקת|תהנו|דונקי|התרגום|ותיקן|אנונימי|תרגום|עריכה|מורידים|שלד|קרן)\b 17 | 18 | 19 | [PURGE_REGEX] 20 | 21 | #אתרי הורדת כתוביות 22 | he_purge1: \b(Ktuvit|Wizdom|SuperSubtitles|YIFY|Podnapisi|OpenSubtitles|Torec|Extreme|qsubs|imax|IMAX|extremesubs)\b 23 | #קבוצות מתורגמנים 24 | he_purge2: \b(Addic7ed|AnarKey|NDG STUDIOS|Donkey-Cr3w|Extreme|FaLse MeMories|GallifreySubs|HDSubs|Hebits|iSub|Ktuvit|LH|Qsubs|Sdarot|Sub-Faw|sub-lala|Subs|SubsCraft|SubsIL|Taxiron|TLMC|Torec|אולפנים|דורי מדיה אות|אולפני אלרום|פיוזר)\b 25 | #שמות מתורגמנים נפוצים 26 | he_purge3: \b(yoav1610|FK|elia|Godfather|TheRejector|scodoo2|Twilight|Yorai1212|HighLander|soprgal|ItayG|Acamol|qwer90|SnoWhite|נעמה זוהר|קופיקו הבלש|אבישג רז|עומר גפן|פיפו|dvodvo123|epitaph|yuvalh|XmonWoW|DrSub|Afenla|אלכסנדר פן|lala123|Hazy7868|glfinish|עדי-בלי-בצל|ddror|hamima|~Moshe~|Limor EM|דיויד סוויפט|glbegin|foxi9|Shaked7|Tornado|Nunia|rodney_mckay|BA07|Ariel046|Amir|Mozzie|Orpheus|אריאל אפרתי|ZIPC|שירין|iToch|R_A_7|WorkBook|GreenScorpion|נ\.א\.ש|Nobody|שוביקס|Eran-s|סטארבק|אסף פרץ|Outwit|E\.M|erez058|SHR|TOXIN|Idoideas|Hentaiman|RAMIRAMI68|kikmastr|subbie|TerryGoodkind|gil_m|בוביקו)\b 27 | #מילים 
שמיוחסות לבלוקי כתוביות 28 | he_purge4: \b(?:(?:ו)?תורג[מם]|(?:ו)?סונכר[נן]|(?:ו)?סונכרנו|(?:(?:ו|ה)?סנכרו[נן])|(?:ו)?ס[י]נכר[נן]|ונערך|משמיעה|(?:ו)?הגהה|(?:ו)?קודד(?:ו)?|הקידוד|(?:[וה]?תרגמו)|ותרגום|(?:(?:וה)|(?:ו)|(?:ה))?כתוביות)\b 29 | # מקרי קצה של מילים שמסתיימות בתו : 30 | he_purge5: (?<=\bעברית)(?=:)|(?<=:)(?=עברית\b)|(?<=\bתרגום)(?=:)|(?<=:)(?=תרגום\b) 31 | 32 | 33 | # 34 | # -----------------------------------------GUIDE------------------------------------------------- 35 | # 36 | 37 | # This language profile contains two lists of regex that will look for patterns. 38 | # if you wish to modify or remove any regex, feel free to do so 39 | # but files in the default folder will be overwritten when you update the script. 40 | # You can add and remove keys as long as two keys don't use the same key twice. 41 | 42 | # WARNING_REGEX: 43 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 44 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 45 | # 1 warning is ignored 46 | # 2 warnings will be print the block as a WARNING in the log. 47 | # 3 warnings or more will remove the entire block. 48 | 49 | # PURGE_REGEX: 50 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 51 | 52 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 53 | # literal character you'll need to escape it with '\' 54 | # for example: matching "www." would require a regex like: "www\." 55 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 56 | 57 | # Feel free to ask me any question on github. 58 | -------------------------------------------------------------------------------- /regex_profiles/default/english.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # English default config. 
3 | 4 | # Coma delimited list of language codes associated with this language profile. 5 | # The script will run against all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = en, eng, english 8 | 9 | 10 | 11 | # Information about how to configure the REGEX sections, read at the bottom of the file. 12 | # All regexes are case insensitive! 13 | [WARNING_REGEX] 14 | 15 | en_warn1: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion)|conform(ed|ing)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits|episode)\b 16 | en_warn2: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion)|conform(ed|ing)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?)\W+(by|from)\b 17 | 18 | en_warn3: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 19 | en_warn4: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clear\W*way\W*law) 20 | en_warn5: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 21 | en_warn6: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clearway\W*law) 22 | en_warn7: \b(broadcasting|UNiTED\W*TEAM|admitme|ragbear|looklive|Camikaze|Aramis|Arun|SG)\b 23 | 24 | en_warn8: English - 25 | en_warn9: English - 26 | 27 | 28 | [PURGE_REGEX] 29 | 30 | en_purge1: \b(caption(s|ed)?|subtitl(e|ed|es|ing)|fixed(?!-)|(re-?)?synch?(?!-)(ed|ro(nized)?)?|rip(ped)?(?!-)|translat(e|ed|ion|ions)|correct(ions|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|subs|provided|encoded|edit(ed|s)?)\W*(by|from)?\W*(:|;).. 
31 | en_purge2: ^present(s|ing)?:$ 32 | en_purge3: \b(KKB|EhLaNa|ydy|swsub|divx|playships|empiremedia|metamorfose|sunmenghao|nessundorma|vothaison)\b 33 | en_purge4: \b(anana|cRosKy|misshu|seriestele|DarKsh|Xenzai|argenteam|tiobetonh|chebinhdan)\b 34 | en_purge5: \b(normita|dawaith|MoSub|snuif|Golgi|Linwelin|Malikay|Ricana|Sadgeezer|SourGrass|mstoll|alire2a)\b 35 | en_purge6: \b(admit1\.app|4kvod\.tv)\b 36 | #en_purge#: Regex goes here. 37 | 38 | 39 | 40 | # 41 | # -----------------------------------------GUIDE------------------------------------------------- 42 | # 43 | 44 | # This language profile contains two lists of regex that will look for patterns. 45 | # if you wish to modify or remove any regex, feel free to do so 46 | # but files in the default folder will be overwritten when you update the script. 47 | # You can add and remove keys as long as two keys don't use the same key twice. 48 | 49 | # WARNING_REGEX: 50 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 51 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 52 | # 1 warning is ignored 53 | # 2 warnings will be print the block as a WARNING in the log. 54 | # 3 warnings or more will remove the entire block. 55 | 56 | # PURGE_REGEX: 57 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 58 | 59 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 60 | # literal character you'll need to escape it with '\' 61 | # for example: matching "www." would require a regex like: "www\." 62 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 63 | 64 | # Feel free to ask me any question on github. 
65 | -------------------------------------------------------------------------------- /libs/subcleaner/cleaner/cleaner.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import timedelta 3 | from pathlib import Path 4 | from typing import * 5 | from libs.subcleaner.subtitle import Subtitle 6 | from libs.subcleaner.settings import args 7 | 8 | from . import detectors, punishers 9 | from ..sub_block import SubBlock 10 | 11 | ad_blocks: Dict[SubBlock, Set[Path]] = {} 12 | warning_blocks: Dict[SubBlock, Set[Path]] = {} 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def find_ads(subtitle: Subtitle) -> None: 18 | punishers.punish_regex_matches(subtitle) 19 | 20 | for block in subtitle.blocks: 21 | if block.regex_matches == 0: 22 | block.regex_matches = -1 23 | 24 | punishers.punish_quick_first_block(subtitle) 25 | punishers.punish_ad_adjacency(subtitle) 26 | punishers.punish_clone_blocks(subtitle) 27 | 28 | for block in subtitle.blocks: 29 | if block.regex_matches >= 3: 30 | subtitle.ad(block) 31 | elif block.regex_matches == 2: 32 | subtitle.warn(block) 33 | 34 | detectors.detect_wedged(subtitle) 35 | punishers.move_duplicated(subtitle) 36 | detectors.detect_chain(subtitle) 37 | 38 | 39 | def reset(): 40 | punishers.reset_duplicate() 41 | 42 | 43 | def remove_ads(subtitle: Subtitle): 44 | if args.sensitive and len(subtitle.blocks) > 1: 45 | subtitle.warn(subtitle.blocks[0]) 46 | subtitle.warn(subtitle.blocks[-1]) 47 | 48 | for i in range(1, len(subtitle.blocks) - 1): 49 | prev_block = subtitle.blocks[i - 1] 50 | block = subtitle.blocks[i] 51 | next_block = subtitle.blocks[i + 1] 52 | if prev_block in subtitle.ad_blocks or next_block in subtitle.ad_blocks: 53 | subtitle.warn(block) 54 | 55 | for block in subtitle.ad_blocks: 56 | try: 57 | subtitle.blocks.remove(block) 58 | if "-->" in block.content: 59 | logger.warning(f"potential malformed subtitle blocks in removed block 
{block.original_index}.") 60 | except ValueError: 61 | pass 62 | for e_block in ad_blocks: 63 | if e_block.clean_content == block.clean_content: 64 | ad_blocks[e_block].add(subtitle.short_path) 65 | break 66 | else: 67 | ad_blocks[block] = {subtitle.short_path} 68 | 69 | for block in subtitle.warning_blocks: 70 | for e_block in warning_blocks: 71 | if e_block.clean_content == block.clean_content: 72 | warning_blocks[e_block].add(subtitle.short_path) 73 | break 74 | else: 75 | warning_blocks[block] = {subtitle.short_path} 76 | 77 | subtitle.reindex() 78 | 79 | 80 | def fix_overlap(subtitle: Subtitle): 81 | if len(subtitle.blocks) < 2: 82 | return False 83 | changes = False 84 | previous_block = subtitle.blocks[0] 85 | for block in subtitle.blocks[1:]: 86 | if not (previous_block.start_time < block.start_time and previous_block.end_time < block.end_time): 87 | previous_block = block 88 | continue 89 | 90 | overlap = previous_block.end_time - block.start_time + timedelta(seconds=3 / 30) 91 | if timedelta(milliseconds=3) < overlap and (len(block.content) + len(previous_block.content)) > 0: 92 | content_ratio = block.duration_seconds / (block.duration_seconds + previous_block.duration_seconds) 93 | block.start_time += content_ratio * overlap 94 | previous_block.end_time += (content_ratio - 1) * overlap 95 | changes = True 96 | 97 | previous_block = block 98 | return changes 99 | 100 | 101 | def unscramble(subtitle: Subtitle): 102 | subtitle.blocks.sort(key=lambda x: x.start_time) 103 | for block in subtitle.blocks.copy(): 104 | if block.duration_seconds <= 0: 105 | subtitle.ad(block) 106 | block.hints.append("negative_duration") 107 | subtitle.blocks.remove(block) 108 | subtitle.reindex() 109 | -------------------------------------------------------------------------------- /libs/subcleaner/report_generator.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import * 3 | 4 | from 
libs.subcleaner.cleaner import cleaner 5 | from libs.subcleaner.settings import args, config 6 | from libs.subcleaner.sub_block import SubBlock 7 | from libs.subcleaner.subtitle import Subtitle 8 | 9 | _report_base = " | " 10 | _report: str 11 | 12 | 13 | def generate_report(subtitle: Subtitle) -> str: 14 | _reset() 15 | _add(f"{len(subtitle.ad_blocks)} deleted blocks and {len(subtitle.warning_blocks)} warnings remaining.") 16 | 17 | if subtitle.ad_blocks: 18 | _add("") 19 | _add(_deleted_card(subtitle.ad_blocks), " " * 4) 20 | if subtitle.warning_blocks and not args.errors_only: 21 | _add("") 22 | _add(_warning_card(subtitle.warning_blocks), " " * 40) 23 | _add("") 24 | _add("To delete all remaining warnings run:") 25 | _add(f"python3 '{config.script_file}' '{subtitle.short_path}' --destroy {' '.join(subtitle.get_warning_indexes())}") 26 | 27 | return _report[1:] 28 | 29 | 30 | def generate_end_report() -> str: 31 | _reset() 32 | _add("") 33 | _add(_end_deleted_card(cleaner.ad_blocks), " " * 4) 34 | _add("") 35 | _add(_end_warning_card(cleaner.warning_blocks), " " * 40) 36 | _add("") 37 | return _report[1:] 38 | 39 | 40 | def _add(lines: str, spacer: str = "") -> None: 41 | lines = "\n" + lines 42 | 43 | global _report 44 | _report += lines.replace("\n", f"\n{_report_base}{spacer}") 45 | 46 | 47 | def _reset() -> None: 48 | global _report 49 | _report = "" 50 | 51 | 52 | def _deleted_card(ad_blocks: Set[SubBlock]) -> str: 53 | ad_blocks_list = list(ad_blocks) 54 | ad_blocks_list.sort(key=lambda b: b.original_index) 55 | card = "[---------Removed Blocks----------]\n" 56 | for block in ad_blocks_list: 57 | card += f"{block.original_index}\n" 58 | card += f"{block}\n" 59 | if args.explain: 60 | card += f"reasons: ({', '.join(block.hints)})\n" 61 | card += "\n" 62 | card = card[:-1] + "[---------------------------------]" 63 | return card 64 | 65 | 66 | def _warning_card(warning_blocks: Set[SubBlock]) -> str: 67 | warning_blocks_list = list(warning_blocks) 68 | 
warning_blocks_list.sort(key=lambda b: b.current_index) 69 | card = "[---------Warning Blocks----------]\n" 70 | for block in warning_blocks_list: 71 | card += f"{block.current_index}\n" 72 | card += f"{block}\n" 73 | if args.explain: 74 | card += f"reasons: ({', '.join(block.hints)})\n" 75 | card += "\n" 76 | card = card[:-1] + "[---------------------------------]" 77 | return card 78 | 79 | 80 | def _end_deleted_card(ad_blocks: Dict[SubBlock, Set[Path]]) -> str: 81 | 82 | ad_blocks_list = list((key, value) for key, value in ad_blocks.items()) 83 | ad_blocks_list.sort(key=lambda b: len(b[1])) 84 | card = "[---------All Removed Blocks----------]\n" 85 | for block in ad_blocks_list: 86 | if len(block[1]) > 4: 87 | continue 88 | if 0 == block[0].regex_matches or block[0].regex_matches > 9: 89 | continue 90 | 91 | card += f"{block[0].original_index}\n" 92 | card += f"{block[0]}\n" 93 | if args.explain: 94 | card += f"reasons: ({', '.join(block[0].hints)})\n" 95 | card += "subtitles: \n" + "\n".join(map(str, block[1])) + "\n" 96 | card += "\n" 97 | card = card[:-1] + "[---------------------------------]" 98 | return card 99 | 100 | 101 | def _end_warning_card(warning_blocks: Dict[SubBlock, Set[Path]]) -> str: 102 | ad_blocks_list = list((key, value) for key, value in warning_blocks.items()) 103 | ad_blocks_list.sort(key=lambda b: len(b[1]), reverse=True) 104 | card = "[---------All Warning Blocks----------]\n" 105 | for block in ad_blocks_list: 106 | if len(block[1]) < 2: 107 | continue 108 | card += f"{block[0].original_index}\n" 109 | card += f"{block[0]}\n" 110 | if args.explain: 111 | card += f"reasons: ({', '.join(block[0].hints)})\n" 112 | card += "subtitles: \n" + "\n".join(map(str, block[1])) + "\n" 113 | card += "\n" 114 | card = card[:-1] + "[---------------------------------]" 115 | return card 116 | -------------------------------------------------------------------------------- /regex_profiles/default/svenska.conf: 
-------------------------------------------------------------------------------- 1 | [META] 2 | # Swedish default config. 3 | 4 | # Coma delimited list of language codes associated with this language profile. 5 | # The script will run on all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = sv, sve, svenska 8 | 9 | 10 | 11 | # Information about how to configure the REGEX sections, read at the bottom of the file. 12 | # All regexes are case insensitive! 13 | [WARNING_REGEX] 14 | 15 | sv_warn1: \b(kompletterad|(under)?text(ning|er)?|sångtext(er)?|(om-?)?syn[ck](ning|ad)?|övers[aä]tt(ning)?|distribution|Ansvarig utgivare|rätt(ning|ad)|regi|Läppsynk|episode?)\b 16 | sv_warn2: \b(kompletterad|(under)?text(ning|er)?|sångtext(er)?|(om-?)?syn[ck](ning|ad)?|övers[aä]tt(ning)?|distribution|Ansvarig utgivare|rätt(ning|ad)|regi|Läppsynk)\W+(av|från)\b 17 | sv_warn3: \.(se|nu)\b 18 | 19 | sv_warn5: \b(anana|present)\b 20 | #regex#: Regex goes here. 
21 | sv_warn6: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)synch?(ed|ro(nized)?)?|(re-?)?synch?(ed|ro(nized)?)|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)|provided|supported|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits)\b 22 | sv_warn7: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)synch?(ed|ro(nized)?)?|(re-?)?synch?(ed|ro(nized)?)|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|sub(s|bed)|provided|supported|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits)\b 23 | 24 | sv_warn8: \b(Incubator|FRiEND|Swedish|TuX|eXz|Aramis|TAZ)\b 25 | sv_warn9: \b(Incubator|FRiEND|Swedish|TuX|eXz|TAZ)\b 26 | 27 | 28 | [PURGE_REGEX] 29 | sv_purge1: \b(kompletterad|(under)?text(ning|er)?|sångtext(er)?|(om-?)?syn[ck](ning|ad)?|övers[aä]tt(ning)?|distribution|Ansvarig utgivare|rätt(ning|ad)|regi|Läppsynk)\W*(av|från)?\W*(:|;).. 
30 | 31 | sv_purge2: \b(Annonsera din produkt|bli en VIP-medlem|de bästa undertexter)\b 32 | sv_purge3: \b(Svensk Medietext|NORDiC RETAiL!|Swesub|Pictures AB|Scandinavian Text|[oö]versattargruppen|Mediatextgruppen)\b 33 | sv_purge4: \b(Bubba67|Dream_Theater|nordicbits|undertexter.se|stoffinho17|simontax|Sweden AB)\b 34 | sv_purge5: \b(StoraStyggaVargen|sdi.?media)\b 35 | sv_purge6: \b(Team Wild Animais Only Relesed|SDI Media|jaymz007|queen-ingela|Iyuno-SDI|Imposter10)\b 36 | 37 | sv_purge7: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 38 | sv_purge8: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clear\W*way\W*law) 39 | sv_purge9: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 40 | sv_purge10: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clearway\W*law) 41 | sv_purge11: \b(broadcasting|UNiTED\W*TEAM|admitme|ragbear|looklive|Camikaze)\b 42 | sv_purge12: \b(KKB|EhLaNa|ydy|swsub|divx|playships|empiremedia|metamorfose|sunmenghao|nessundorma|vothaison)\b 43 | sv_purge13: \b(cRosKy|misshu|Arun|seriestele|Sadgeezer|taureane)\b 44 | sv_purge14: \b(normita|dawaith|MoSub|snuif|Golgi|Linwelin|Malikay|Ricana|DarKsh|Xenzai|argenteam|tiobetonh|chebinhdan)\b 45 | sv_purge15: s([äa]song)?\W*\d+[^,]\W*e(pisod)?\W*\d+[^,] 46 | #regex#: Regex goes here. 47 | 48 | 49 | # 50 | # -----------------------------------------GUIDE------------------------------------------------- 51 | # 52 | 53 | # This language profile contains two lists of regex that will look for patterns. 54 | # if you wish to modify or remove any regex, feel free to do so 55 | # but files in the default folder will be overwritten when you update the script. 56 | # You can add and remove keys as long as two keys don't use the same key twice. 57 | 58 | # WARNING_REGEX: 59 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 60 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 
class SubBlock:
    """A single .srt subtitle block: an index line, a timing header, and content lines."""

    original_index: int   # index as read from the file, or the positional fallback
    current_index: int    # set externally after renumbering; not assigned in __init__
    content: str          # raw text lines of the block, newline-joined
    clean_content: str    # content with whitespace/punctuation stripped, for comparisons
    start_time: datetime.timedelta
    end_time: datetime.timedelta
    regex_matches = 0     # class-level default; per-instance count of regex hits
    hints: List[str]      # detector/punisher annotations attached during cleaning

    def __init__(self, block_content: str, original_index_actual: int):
        """Parse one block of srt text.

        :param block_content: raw text of the block (index, timing header, content).
        :param original_index_actual: positional fallback used when the block carries
            no parseable numeric index line.
        :raises ParsingException: when the timing header is missing or unparseable.
        """
        lines = block_content.strip().split("\n")

        # Block starts directly with the timing header (index line missing):
        # prepend an empty index line so the parsing below stays uniform.
        if self.is_sub_block_header(lines[0]) and len(lines) > 1 and not self.is_sub_block_header(lines[1]):
            lines = [""] + lines

        if lines[0].isnumeric():
            self.original_index = int(lines[0])
        else:
            # Salvage a leading run of digits from a noisy index line (e.g. "12a").
            number = ""
            for character in lines[0]:
                if character.isnumeric():
                    number += character
                else:
                    break
            if number:
                self.original_index = int(number)
            else:
                self.original_index = original_index_actual

        if len(lines) < 2 or not self.is_sub_block_header(lines[1]):
            raise ParsingException(self.original_index, "incorrectly formatted subtitle block")

        times = lines[1].replace(" ", "").split("-->")
        try:
            self.start_time = time_string_to_timedelta(times[0])
            self.end_time = time_string_to_timedelta(times[1])
        except ValueError:
            # time fields out of range or not numeric
            raise ParsingException(self.original_index, "failed to parse timeframe.")
        except IndexError:
            # fewer than two "-->"-separated timestamps
            raise ParsingException(self.original_index, "failed to parse timeframe.")

        if len(lines) > 2:
            self.content = "\n".join(lines[2:]).strip()
        else:
            self.content = ""
        # NOTE(review): the original replaces a raw control character (it renders as a
        # line break in this dump, presumably a stray carriage return) with "\n" —
        # confirm the exact character against the repository source.
        self.content = self.content.replace("\r", "\n")
        self.clean_content = re.sub("[\\s.,:_-]", "", self.content)
        self.hints = []

    def equal_content(self, block: "SubBlock") -> bool:
        """True when both blocks carry the same text once whitespace and
        punctuation ([\\s.,:_-]) are removed."""
        t = re.sub("[\\s.,:_-]", "", self.content)
        o = re.sub("[\\s.,:_-]", "", block.content)
        return t == o

    def __str__(self) -> str:
        # Renders the block WITHOUT its index line; the owning Subtitle is
        # responsible for numbering on write-out.
        string = f"{timedelta_to_time_string(self.start_time)} --> {timedelta_to_time_string(self.end_time)}\n" \
                 f"{self.content}"
        return string

    @classmethod
    def is_sub_block_header(cls, line: str) -> bool:
        """True when the line looks like a "start --> end" srt timing header."""
        if "\n" in line:
            return False

        times = line.replace(" ", "").split("-->")
        if len(times) < 2:
            return False
        try:
            time_string_to_timedelta(times[0])
            time_string_to_timedelta(times[1])
        except ValueError:
            return False
        except IndexError:
            return False

        return True

    @property
    def duration_seconds(self) -> float:
        """On-screen duration of the block, in seconds."""
        return (self.end_time - self.start_time).total_seconds()
def time_string_to_timedelta(time_string: str) -> datetime.timedelta:
    """Convert an SRT-style timestamp ("HH:MM:SS,mmm") into a timedelta.

    Commas are treated as decimal points and spaces are ignored. Stray
    non-numeric characters in the seconds field are dropped (keeping only
    the first decimal point).

    :raises ValueError: when minutes/seconds are >= 60 or not parseable.
    :raises IndexError: when fewer than three ':'-separated fields exist.
    """
    fields = time_string.replace(",", ".").replace(" ", "").split(":")

    hours = float(fields[0])
    minutes = float(fields[1])
    raw_seconds = fields[2][:6]  # at most "SS.mmm" — extra precision is discarded

    # Keep digits and the first '.' only; anything else is noise.
    kept = []
    dot_seen = False
    for ch in raw_seconds:
        if ch.isnumeric():
            kept.append(ch)
        elif ch == "." and not dot_seen:
            dot_seen = True
            kept.append(ch)
    seconds = float("".join(kept))

    if seconds >= 60 or minutes >= 60:
        raise ValueError()

    return datetime.timedelta(hours=hours,
                              minutes=minutes,
                              seconds=seconds)


def timedelta_to_time_string(timedelta: datetime.timedelta) -> str:
    """Render a timedelta as an SRT timestamp ("HH:MM:SS,mmm")."""
    rendered = str(timedelta)  # "H:MM:SS" or "H:MM:SS.ffffff"
    if "." in rendered:
        # Truncate microseconds to milliseconds and swap the decimal separator.
        rendered = rendered[:-3].replace(".", ",")
    else:
        rendered = f"{rendered},000"
    # Left-pad so a single-digit hour becomes two digits ("0H:MM:SS,mmm").
    return rendered.zfill(12)
class DetectorFactory(object):
    '''
    Language Detector Factory Class.

    This class manages an initialization and constructions of Detector.

    Before using language detection library,
    load profiles with DetectorFactory.load_profile(str)
    and set initialization parameters.

    When the language detection,
    construct Detector instance via DetectorFactory.create().
    See also Detector's sample code.
    '''
    # Optional RNG seed shared by all detectors created from this factory.
    seed = None

    def __init__(self):
        # word n-gram -> list of per-language probabilities (parallel to self.langlist).
        self.word_lang_prob_map = {}
        # Language names, in the order their profiles were added.
        self.langlist = []

    def load_profile(self, profile_directory):
        """Load every language profile (one JSON file per language) from a directory."""
        list_files = os.listdir(profile_directory)
        if not list_files:
            raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Not found profile: ' + profile_directory)

        # NOTE(review): langsize counts ALL directory entries, including hidden or
        # non-file ones that are skipped below, so probability vectors may carry
        # unused trailing zeros — presumably harmless to Detector; confirm upstream.
        langsize, index = len(list_files), 0
        for filename in list_files:
            if filename.startswith('.'):
                continue
            filename = path.join(profile_directory, filename)
            if not path.isfile(filename):
                continue

            f = None
            try:
                if sys.version_info[0] < 3:
                    f = open(filename, 'r')
                else:
                    f = open(filename, 'r', encoding='utf-8')
                json_data = json.load(f)
                profile = LangProfile(**json_data)
                self.add_profile(profile, index, langsize)
                index += 1
            except IOError:
                raise LangDetectException(ErrorCode.FileLoadError, 'Cannot open "%s"' % filename)
            except:
                # Bare except kept as-is (vendored library code): any parse/constructor
                # failure is reported uniformly as a profile format error.
                raise LangDetectException(ErrorCode.FormatError, 'Profile format error in "%s"' % filename)
            finally:
                if f:
                    f.close()

    def load_json_profile(self, json_profiles):
        """Load profiles from a list of JSON strings instead of files."""
        langsize, index = len(json_profiles), 0
        if langsize < 2:
            raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need more than 2 profiles.')

        for json_profile in json_profiles:
            try:
                json_data = json.loads(json_profile)
                profile = LangProfile(**json_data)
                self.add_profile(profile, index, langsize)
                index += 1
            except:
                # Bare except kept as-is (vendored library code).
                raise LangDetectException(ErrorCode.FormatError, 'Profile format error.')

    def add_profile(self, profile, index, langsize):
        """Register one LangProfile at position `index` of the probability vectors."""
        lang = profile.name
        if lang in self.langlist:
            raise LangDetectException(ErrorCode.DuplicateLangError, 'Duplicate the same language profile.')
        self.langlist.append(lang)

        for word in profile.freq:
            if word not in self.word_lang_prob_map:
                self.word_lang_prob_map[word] = [0.0] * langsize
            length = len(word)
            # Only 1- to 3-gram frequencies participate in detection.
            if 1 <= length <= 3:
                # 1.0 * forces float division (this code also targets Python 2).
                prob = 1.0 * profile.freq.get(word) / profile.n_words[length - 1]
                self.word_lang_prob_map[word][index] = prob

    def clear(self):
        """Drop all loaded profiles and probabilities."""
        self.langlist = []
        self.word_lang_prob_map = {}

    def create(self, alpha=None):
        '''Construct Detector instance with smoothing parameter.'''
        detector = self._create_detector()
        if alpha is not None:
            detector.set_alpha(alpha)
        return detector

    def _create_detector(self):
        if not self.langlist:
            raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need to load profiles.')
        return Detector(self)

    def set_seed(self, seed):
        """Fix the RNG seed so detection results become reproducible."""
        self.seed = seed

    def get_lang_list(self):
        # Returns a copy so callers cannot mutate internal state.
        return list(self.langlist)


PROFILES_DIRECTORY = path.join(path.dirname(__file__), 'profiles')
# Lazily-created singleton behind the module-level detect()/detect_langs() helpers.
_factory = None

def init_factory():
    """Create and load the global factory on first use (idempotent)."""
    global _factory
    if _factory is None:
        _factory = DetectorFactory()
        _factory.load_profile(PROFILES_DIRECTORY)

def detect(text):
    """Return the single most likely language code for `text`."""
    init_factory()
    detector = _factory.create()
    detector.append(text)
    return detector.detect()


def detect_langs(text):
    """Return candidate languages for `text` with their probabilities."""
    init_factory()
    detector = _factory.create()
    detector.append(text)
    return detector.get_probabilities()
7 | language_codes = es, spa, spanish 8 | 9 | # Information about how to configure the REGEX sections, read at the bottom of the file. 10 | # All regexes are case insensitive! 11 | [WARNING_REGEX] 12 | 13 | es_warn1: \b(creado(s)?|subtitu(lo|los|lado|lada|lados)|subtítu(lo|los|lado|lada|lados)|descarg(ado|ar)|(re-?)?sinc(ed|ro(nizado|nizados|nizacion|nización)?)?|modific(ado|ados|ion|iones|ión|iónes)|traduc(e|ido|idos|tora|cion|ciones|ción|ciónes)|correcc(iones|ion|ión|iónes)|correg(ir|ido|idos)|transcri(bido|pcion|pciones|pción|pciónes)|mejor(ado|amientos)|adaptado|ripeo|arreglos)\b 14 | es_warn2: \b(creado(s)?|subtitu(lo|los|lado|lada|lados)|subtítu(lo|los|lado|lada|lados)|descarg(ado|ar)|(re-?)?sinc(ed|ro(nizado|nizados|nizacion|nización)?)?|modific(ado|ados|ion|iones|ión|iónes)|traduc(e|ido|idos|tora|cion|ciones|ción|ciónes)|correcc(iones|ion|ión|iónes)|correg(ir|ido|idos)|transcri(bido|pcion|pciones|pción|pciónes)|mejor(ado|amientos)|adaptado|ripeo|arreglos|subs|hecha)\W+(por|de|by)\b 15 | es_warn3: \b(traduc(e|ido|idos|tora|cion|ciones|ción|ciónes)|transcri(bido|pcion|pciones|pción|pciónes)|subtitu(lo|los|lado|lada|lados)|subtítu(lo|los|lado|lada|lados))\W+(al|en)\b 16 | 17 | es_warn4: spanish ?(-|]|\/) 18 | es_warn5: \b(spanish|latino|espanol|español|castilian|latin american|castellano)\b 19 | es_warn6: latin american ?(-|]|\/) 20 | es_warn7: castilian ?(-|]|\/) 21 | 22 | es_warn8:\b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits|episode)\b 23 | 
es_warn9:\b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)?synch?(ed|ro(nized)?)?|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|correct(ions?|ed)|transcri(be|bed|pt|ption|ptions)|improve(d|ments)|provided|supported|tim(ing|ed)|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits|episode)\b 24 | 25 | es_warn10: \b(Episódio|MKV|youtube|Facebook|Instagram|Twitter|Whatsapp|Tiktok)\b 26 | 27 | #Common Spanish Translator names - warnings as possibly could appear in a subtitle 28 | es_warn11: \b(Juan Vera|Juan Rico|George Denbrough|Giovanni Mion|Walter Leonard|Richard Bates|Francesc Aloy Bonet|Pilar González Dueñas|Mario Pérez|Paula Mariani|Philipp Schmidt|Hans Santos|Eric Escribano Barreiro)\b 29 | 30 | [PURGE_REGEX] 31 | 32 | es_purge1: \b(creado(s)?|subtitu(lo|los|lado|lada|lados)|subtítu(lo|los|lado|lada|lados)|descarg(ado|ar)|(re-?)?sinc(?!-)(ed|ro(nizado|nizados|nizacion|nización)?)?|modific(ado|ados|ion|iones|ión|iónes)|traduc(e|ido|idos|tora|cion|ciones|ción|ciónes)|correcc(iones|ion|ión|iónes)|correg(ir|ido|idos)|transcri(bido|pcion|pciones|pción|pciónes)|mejor(ado|amientos)|adaptado|ripeo|arreglos|subs|hecha)\W*(por|de|by)?\W*(:|;) 33 | 34 | #Spanish Sub websites 35 | es_purge2: \b(admitme|argenteam|finalanime|subtitulamos|substeam|subdivx|tusubtitulo|thesubfactory|Open Subtitles|miembro VIP|osdb\.link|TranslatorsInc|Translators, Inc|TranslatorslncSubs\.blogspot\.com\.es|Southparkspanish|SUBTITULOS\.es|SUBITULOS\.es|SouthParkNews\.net|subtitules\.es|ShooCat|YYeTs|TaMaBin|P@bs|gratispeliculas|SubAdictos|SerieCanal|playships\.eu|tusseries\.com|subswiki\.com|Subs-Team|SUBTÍTULOS\.ES|U\-Sub\.net)\b 36 | 37 | #Spanish translation websites / groups 38 | es_purge3: 
\b(Visiontext|Filmtrans|CARLISHIO|HGWizard|LASERFILM|Fhercho06|Cinesotano|jantoniot|Caichac|cemauli|Drakul|Scarlata|laloonda|japezoa|MarcusL|Kikeguate|KIKEGT|Zagon|KingCreole|Mothernatura|MaLTRaiN|FRH|GCas87|maryluzesp|Marenys|ByAlbis02|ana24horas|Fernando355|Zagonsubs|ikerslot|menoyos|Axel7902|vNaru|livinginthepast|patagonikus|Macias Group|EasyTechOficial|mlmlte|LiarsTeam|OnceUponATEAM)\b 39 | es_purge4: \b(juanchojb|shogun87|Rocio190889|darklin01|R@ul|Mabeas|akallabeth|NicoDipaolo|OsirisTSF|Lord Avestruz|LadyJenny|jeslil7|Giobatta SA|MementMori|la_bestia1962|Natuchia|JJ Porto|marchelo64|c\. oper|SHADOW84\Anfegopi|perroubuntero|Kumara|JosephPools|natycuac|ibvil|SwSub|DarKsh|ShalimarFox|R\[H\]ésus AB\+ Team|Mat Productions|S\. C\. Bananas|Bakugan|M-Rok|YYeTTs|robermgs)\b 40 | 41 | # 42 | # -----------------------------------------GUIDE------------------------------------------------- 43 | # 44 | 45 | # This language profile contains two lists of regex that will look for patterns. 46 | # if you wish to modify or remove any regex, feel free to do so 47 | # but files in the default folder will be overwritten when you update the script. 48 | # You can add and remove keys as long as two keys don't use the same key twice. 49 | 50 | # WARNING_REGEX: 51 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 52 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 53 | # 1 warning is ignored 54 | # 2 warnings will be print the block as a WARNING in the log. 55 | # 3 warnings or more will remove the entire block. 56 | 57 | # PURGE_REGEX: 58 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 59 | 60 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 61 | # literal character you'll need to escape it with '\' 62 | # for example: matching "www." 
would require a regex like: "www\." 63 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 64 | 65 | # Feel free to ask me any question on github. 66 | -------------------------------------------------------------------------------- /libs/subcleaner/regex_lists.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import re 3 | from pathlib import Path 4 | from typing import List, Dict, Tuple, Pattern 5 | 6 | from libs.subcleaner.settings import config 7 | import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | global_profiles: List["GlobalProfile"] = [] 12 | purge_regex: Dict[str, List[Tuple[str, Pattern]]] = {} 13 | warning_regex: Dict[str, List[Tuple[str, Pattern]]] = {} 14 | 15 | 16 | def language_has_profile(language: str): 17 | return language in purge_regex 18 | 19 | 20 | def get_purge_regex(language: str) -> List[Tuple[str, Pattern]]: 21 | if language in purge_regex: 22 | return purge_regex[language] 23 | return purge_regex["no_profile"] 24 | 25 | 26 | def get_warning_regex(language: str) -> List[Tuple[str, Pattern]]: 27 | if language in warning_regex: 28 | return warning_regex[language] 29 | return warning_regex["no_profile"] 30 | 31 | 32 | class GlobalProfile: 33 | excluded_languages: List[str] 34 | purge_regex_lines: List[Tuple[str, Pattern]] 35 | warning_regex_lines: List[Tuple[str, Pattern]] 36 | 37 | def __init__(self, parser, default: bool) -> None: 38 | self.purge_regex_lines = [] 39 | self.warning_regex_lines = [] 40 | 41 | for key, value in list(parser["PURGE_REGEX"].items()): 42 | if not default: 43 | key = key + "*" 44 | value = f"({value})" 45 | compiled_regex = re.compile(value, flags=re.IGNORECASE | re.UNICODE) 46 | self.purge_regex_lines.append((key, compiled_regex)) 47 | for key, value in list(parser["WARNING_REGEX"].items()): 48 | if not default: 49 | key = key + "*" 50 | value = f"({value})" 51 | compiled_regex = re.compile(value, 
flags=re.IGNORECASE | re.UNICODE) 52 | self.warning_regex_lines.append((key, compiled_regex)) 53 | 54 | self.excluded_languages = parser["META"].get("excluded_language_codes", "").replace(" ", "").split(",") 55 | for language in self.excluded_languages: 56 | if not language: 57 | self.excluded_languages.remove(language) 58 | 59 | for language in purge_regex: 60 | if any(language == excluded_language for excluded_language in self.excluded_languages): 61 | continue 62 | purge_regex[language] += self.purge_regex_lines 63 | warning_regex[language] += self.warning_regex_lines 64 | 65 | 66 | def _load_profile(profile_file: Path, default: bool = True) -> None: 67 | parser = configparser.ConfigParser() 68 | 69 | try: 70 | parser.read(profile_file, encoding="utf-8") 71 | 72 | languages = parser["META"].get("language_codes", "").replace(" ", "") 73 | 74 | if "excluded_language_codes" in parser["META"].keys() or not languages: 75 | global_profiles.append(GlobalProfile(parser, default)) 76 | return 77 | if config.use_english_on_all and default and profile_file.name == "english.conf": 78 | global_profiles.append(GlobalProfile(parser, default)) 79 | for language in languages.split(","): 80 | if language not in purge_regex: 81 | _create_language(language) 82 | return 83 | 84 | for language in languages.split(","): 85 | if language not in purge_regex: 86 | _create_language(language) 87 | for key, value in list(parser["PURGE_REGEX"].items()): 88 | if not default: 89 | key = key + "*" 90 | value = f"({value})" 91 | compiled_regex = re.compile(value, flags=re.IGNORECASE | re.UNICODE) 92 | purge_regex[language].append((key, compiled_regex)) 93 | for key, value in list(parser["WARNING_REGEX"].items()): 94 | if not default: 95 | key = key + "*" 96 | value = f"({value})" 97 | compiled_regex = re.compile(value, flags=re.IGNORECASE | re.UNICODE) 98 | warning_regex[language].append((key, compiled_regex)) 99 | 100 | except Exception: 101 | logger.error(f"Incorrectly configured regex language 
profile: {profile_file.name}") 102 | exit(1) 103 | 104 | 105 | def _create_language(language: str) -> None: 106 | purge_regex[language] = [] 107 | warning_regex[language] = [] 108 | 109 | for global_profile in global_profiles: 110 | if any(language == excluded_language for excluded_language in global_profile.excluded_languages): 111 | continue 112 | purge_regex[language] += global_profile.purge_regex_lines 113 | warning_regex[language] += global_profile.warning_regex_lines 114 | 115 | 116 | def _load_regex(): 117 | for default_profile_file in config.default_regex_dir.iterdir(): 118 | if default_profile_file.is_file() and not default_profile_file.name.startswith(".") and default_profile_file.suffix == ".conf": 119 | for profile_file in config.regex_dir.iterdir(): 120 | 121 | if default_profile_file.name == profile_file.name: 122 | _load_profile(profile_file) 123 | break 124 | else: 125 | _load_profile(default_profile_file, default=True) 126 | for profile_file in config.regex_dir.iterdir(): 127 | if profile_file.is_file() and not profile_file.name.startswith(".") and profile_file.suffix == ".conf": 128 | for default_profile_file in config.default_regex_dir.iterdir(): 129 | 130 | if default_profile_file.name == profile_file.name: 131 | break 132 | else: 133 | _load_profile(profile_file) 134 | 135 | 136 | _load_regex() 137 | -------------------------------------------------------------------------------- /regex_profiles/default/global.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # 3 | # -------------------------------------------------------------------------------------------- 4 | # This is a global language profile! It will run against all language codes unless they are excluded! 5 | # -------------------------------------------------------------------------------------------- 6 | # 7 | 8 | # Coma delimited list of language codes to not run the global regex config against. 
9 | # The script will also ignore any sub-labels like forced as long as they match the language code. 10 | excluded_language_codes = 11 | 12 | 13 | 14 | # Information about how to configure the REGEX sections, read at the bottom of the file. 15 | # All regexes are case insensitive! 16 | [WARNING_REGEX] 17 | 18 | global_warn1: www\.|https? |\\|@|\.(com|org|net|app|to|eu|to|io)\b 19 | global_warn2: (720|1080)p|HDTV|SHD|blu-?ray|DVD(?!-)|WEB\W?DL|23\.976|\b\d+\W*x\W*\d+\b|[xh]26[54]|™ 20 | global_warn3: (720|1080)p|HDTV|WEB\W?DL|23\.976|(^|e(pisode)?)\W?\d+\W*x\W*\d+$|[xh]26[54]|™ 21 | global_warn4: \b(CBS|deluxe|vitac?|Sartre|copyright|and TOYOTA|serverpartdeals)\b 22 | global_warn5: (_) 23 | global_warn6: (air date) 24 | global_warn7: ^(Teams?|the|subtitles)$ 25 | #global_warn#: Regex goes here. 26 | 27 | 28 | [PURGE_REGEX] 29 | 30 | global_purge1: ([^Ã]|^)©|==|>>|<<|★|=-|-=| ::| ::|\^\^ 31 | global_purge2: \.(tv|tk|xyz|sex|porn|xxx|link|ru)\b|https?\W 32 | global_purge3: \bs(eason)?\W*\d+[^,]\W*e(pisode)?\W*\d+[^,] 33 | 34 | global_purge4: \b(tvsubtitle|YTS|YIFY|opensub(titles)?|sub(scene|rip)|podnapisi|addic7ed|ragbear\W{,2}com|Point\.360)\b 35 | global_purge5: \b(bozxphd|sazu489|psagmeno|anoxmous|9unshofl|BLACKdoor|titlovi|Danishbits|acorn media|hound\W{,2}org|hunddawgs|iSubDB)\b 36 | global_purge6: \b(jodix|LESAIGNEUR|HighCode|explosiveskull|GoldenBeard|Fingal61|srjanapala|nadielostzilla|IESAIGNEUR|kdwluverz)\b 37 | global_purge7: \b(FilthyRichFutures|celebritysex|shareuniversity|AmericasCardroom|saveanilluminati|MCH2022|ALLIN1BOX|marocas62)\b 38 | global_purge8: \b(ClearwayLaw|SG-66|ShalimarFox|Icefre[@a]k|WGBH|KBS World|SweSUB|koreansubguy|R\[ésus|Barbie_on_Weed)\b 39 | global_purge9: \b(Aldi Arman|void_spell|LnlyHikikomori|wingyee|McEphie|robster38|dw817|zathras69|Thamyris|Dan4Jem|JustCosmin|moviesnipipay|delsxyz)\b 40 | global_purge10: \b(a\. b\. m\. 
j\.|Altyazı: Conan|SDI Media Group|HaruHaruSubs|@whyuandri|WahyuAndri|TheHeLL|RiKi66|KingJAIN|ADONI@|Jesslataree)\b 41 | global_purge11: \b(OrionDeBmk|TheChaosLegion|COLDFUSION \& BAARO|riri13|KOCOWA|@.?vii?ki|OnDemandKorea|MBC America|globosapien)\b 42 | global_purge12: \b(MSMOVIESBD|fightingfansubs|DLAznMovies|ancientmexicanwisdom|cookcountysheriff|MovieFull|300mbmovie|KoreanDramax)\b 43 | global_purge13: \b(extremesubs|3gpBluray|prijevodi-online|torrentgalaxy|Dramatorrent|torrent\.com|HQCINEMAS|WANNATALKAB[OA]UTIT|italiansubs|1000fr|1TamilMV|HDFREE)\b 44 | global_purge14: \b(chuanloon90|designer_pc|m_fouda97|Mr.Scudester|Shari_Kenzie|U-Sub.net|TCS Subtitling)\b 45 | 46 | global_purge15: \b(rate this subtitle|Subtitle(s)? extracted by|Sync(ed)? (&|and) Clean(ed)?|become VIP member|Subs OCR|the best subtitle(s)?|Timing and Subtitle(s)?|rate this subtitle|Free Online Movie(s)?|Subtitle(s)? Transcribed|Re-Sync \&|English Subtitles|Translation(s)? and adaptation:|Captions by Able|Subtitle Rip|Engsub By|Subtitles brought by|Translation \/ Subtitles)\b 47 | global_purge16: \b(Download MyTotal|itfc subtitles|Built Ford Proud|Captioning sponsored|brought to you by Ford|This is a free sub|Custom subtitle by|For more new Episodes visit|Watch Movies and Series|Advertise your product or brand here|Easy Subtitle(s)? Synchronizer|Watch more movies for free|Brought to you by MrsKorea and mily2|Media Access Group at WGBH|Subtitles brought to you by|UNE SÉRIE ORIGINALE NETFLIX|Brought to you by iRiS|Support us and become a VIP member|Advertise your product or brand here|Caption(s|ing)? made possible by|Visit Our Movie Site|Open Subtitle(s)? MKV Player|Translation(s)? and review by|Spell\-Check and Error\-Correction|Subtitles are brought to you|Translation\. 
Review by Angel\.|Captions by CSI Australia|Timing and Subs by|Subtitles by The World\Ws Finest Team|Watch and Download free|PLEASE DO NOT UPLOAD ANY OF OUR SUBS|Subtitle by CJ Entertainment)\b 48 | global_purge17: \b(Paramartha|Heavens Subbing Squad|DramaFever|Asian Cinema Encoders|Italian Scrubs Addicted|Kevin \& Tyno)\b 49 | global_purge18: \b(Viki\.com|dramafever\.com|GlowGaze\.Com|seriessub\.com|www\.telegram|d\-addicts\.com|NAPiSY\.info|cinetyp\.ch|lauzabo\.blogspot\.com|Laozhabor\.blogspot\.com|MARIO\.MK|captionmax\.com|firebit\.org|popbitch\.com|swsub\.com|sous-titres\.eu|forom\.\W?com|Csi\-teams\. Fr\. St|GreggBraden\.com|inmymelody\.wordpress\.com|serverpartdeals\.com) 50 | global_purge19: \b(Fansub(s)?|Hardsub(s)?|S u b|Sub Rip:|Terjemahan subtitle oleh) 51 | #global_purge#: Regex goes here. 52 | 53 | 54 | 55 | # 56 | # -----------------------------------------GUIDE------------------------------------------------- 57 | # 58 | 59 | # This language profile contains two lists of regex that will look for patterns. 60 | # if you wish to modify or remove any regex, feel free to do so 61 | # but files in the default folder will be overwritten when you update the script. 62 | # You can add and remove keys as long as two keys don't use the same key twice. 63 | 64 | # WARNING_REGEX: 65 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 66 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 67 | # 1 warning is ignored 68 | # 2 warnings will be print the block as a WARNING in the log. 69 | # 3 warnings or more will remove the entire block. 70 | 71 | # PURGE_REGEX: 72 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 
73 | 74 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 75 | # literal character you'll need to escape it with '\' 76 | # for example: matching "www." would require a regex like: "www\." 77 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 78 | 79 | # Feel free to ask me any question on github. 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python,windows,pycharm 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,windows,pycharm 4 | 5 | ### PyCharm ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # AWS User-specific 17 | .idea/**/aws.xml 18 | 19 | # Generated files 20 | .idea/**/contentModel.xml 21 | 22 | # Sensitive or high-churn files 23 | .idea/**/dataSources/ 24 | .idea/**/dataSources.ids 25 | .idea/**/dataSources.local.xml 26 | .idea/**/sqlDataSources.xml 27 | .idea/**/dynamic.xml 28 | .idea/**/uiDesigner.xml 29 | .idea/**/dbnavigator.xml 30 | 31 | # Gradle 32 | .idea/**/gradle.xml 33 | .idea/**/libraries 34 | 35 | # Gradle and Maven with auto-import 36 | # When using Gradle or Maven with auto-import, you should exclude module files, 37 | # since they will be recreated, and may cause churn. Uncomment if using 38 | # auto-import. 
39 | # .idea/artifacts 40 | # .idea/compiler.xml 41 | # .idea/jarRepositories.xml 42 | # .idea/modules.xml 43 | # .idea/*.iml 44 | # .idea/modules 45 | # *.iml 46 | # *.ipr 47 | 48 | # CMake 49 | cmake-build-*/ 50 | 51 | # Mongo Explorer plugin 52 | .idea/**/mongoSettings.xml 53 | 54 | # File-based project format 55 | *.iws 56 | 57 | # IntelliJ 58 | out/ 59 | 60 | # mpeltonen/sbt-idea plugin 61 | .idea_modules/ 62 | 63 | # JIRA plugin 64 | atlassian-ide-plugin.xml 65 | 66 | # Cursive Clojure plugin 67 | .idea/replstate.xml 68 | 69 | # Crashlytics plugin (for Android Studio and IntelliJ) 70 | com_crashlytics_export_strings.xml 71 | crashlytics.properties 72 | crashlytics-build.properties 73 | fabric.properties 74 | 75 | # Editor-based Rest Client 76 | .idea/httpRequests 77 | 78 | # Android studio 3.1+ serialized cache file 79 | .idea/caches/build_file_checksums.ser 80 | 81 | ### PyCharm Patch ### 82 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 83 | 84 | # *.iml 85 | # modules.xml 86 | # .idea/misc.xml 87 | # *.ipr 88 | 89 | # Sonarlint plugin 90 | # https://plugins.jetbrains.com/plugin/7973-sonarlint 91 | .idea/**/sonarlint/ 92 | 93 | # SonarQube Plugin 94 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin 95 | .idea/**/sonarIssues.xml 96 | 97 | # Markdown Navigator plugin 98 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced 99 | .idea/**/markdown-navigator.xml 100 | .idea/**/markdown-navigator-enh.xml 101 | .idea/**/markdown-navigator/ 102 | 103 | # Cache file creation bug 104 | # See https://youtrack.jetbrains.com/issue/JBR-2257 105 | .idea/$CACHE_FILE$ 106 | 107 | # CodeStream plugin 108 | # https://plugins.jetbrains.com/plugin/12206-codestream 109 | .idea/codestream.xml 110 | 111 | ### Python ### 112 | # Byte-compiled / optimized / DLL files 113 | __pycache__/ 114 | *.py[cod] 115 | *$py.class 116 | 117 | # C extensions 118 | *.so 119 | 120 | # Distribution / packaging 
121 | .Python 122 | build/ 123 | develop-eggs/ 124 | dist/ 125 | downloads/ 126 | eggs/ 127 | .eggs/ 128 | lib/ 129 | lib64/ 130 | parts/ 131 | sdist/ 132 | var/ 133 | wheels/ 134 | share/python-wheels/ 135 | *.egg-info/ 136 | .installed.cfg 137 | *.egg 138 | MANIFEST 139 | 140 | # PyInstaller 141 | # Usually these files are written by a python script from a template 142 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 143 | *.manifest 144 | *.spec 145 | 146 | # Installer logs 147 | pip-log.txt 148 | pip-delete-this-directory.txt 149 | 150 | # Unit test / coverage reports 151 | htmlcov/ 152 | .tox/ 153 | .nox/ 154 | .coverage 155 | .coverage.* 156 | .cache 157 | nosetests.xml 158 | coverage.xml 159 | *.cover 160 | *.py,cover 161 | .hypothesis/ 162 | .pytest_cache/ 163 | cover/ 164 | 165 | # Translations 166 | *.mo 167 | *.pot 168 | 169 | # Django stuff: 170 | *.log 171 | local_settings.py 172 | db.sqlite3 173 | db.sqlite3-journal 174 | 175 | # Flask stuff: 176 | instance/ 177 | .webassets-cache 178 | 179 | # Scrapy stuff: 180 | .scrapy 181 | 182 | # Sphinx documentation 183 | docs/_build/ 184 | 185 | # PyBuilder 186 | .pybuilder/ 187 | target/ 188 | 189 | # Jupyter Notebook 190 | .ipynb_checkpoints 191 | 192 | # IPython 193 | profile_default/ 194 | ipython_config.py 195 | 196 | # pyenv 197 | # For a library or package, you might want to ignore these files since the code is 198 | # intended to run in multiple environments; otherwise, check them in: 199 | # .python-version 200 | 201 | # pipenv 202 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 203 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 204 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 205 | # install all needed dependencies. 206 | #Pipfile.lock 207 | 208 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 209 | __pypackages__/ 210 | 211 | # Celery stuff 212 | celerybeat-schedule 213 | celerybeat.pid 214 | 215 | # SageMath parsed files 216 | *.sage.py 217 | 218 | # Environments 219 | .env 220 | .venv 221 | env/ 222 | venv/ 223 | ENV/ 224 | env.bak/ 225 | venv.bak/ 226 | 227 | # Spyder project settings 228 | .spyderproject 229 | .spyproject 230 | 231 | # Rope project settings 232 | .ropeproject 233 | 234 | # mkdocs documentation 235 | /site 236 | 237 | # mypy 238 | .mypy_cache/ 239 | .dmypy.json 240 | dmypy.json 241 | 242 | # Pyre type checker 243 | .pyre/ 244 | 245 | # pytype static type analyzer 246 | .pytype/ 247 | 248 | # Cython debug symbols 249 | cython_debug/ 250 | 251 | ### Windows ### 252 | # Windows thumbnail cache files 253 | Thumbs.db 254 | Thumbs.db:encryptable 255 | ehthumbs.db 256 | ehthumbs_vista.db 257 | 258 | # Dump file 259 | *.stackdump 260 | 261 | # Folder config file 262 | [Dd]esktop.ini 263 | 264 | # Recycle Bin used on file shares 265 | $RECYCLE.BIN/ 266 | 267 | # Windows Installer files 268 | *.cab 269 | *.msi 270 | *.msix 271 | *.msm 272 | *.msp 273 | 274 | # Windows shortcuts 275 | *.lnk 276 | 277 | # End of https://www.toptal.com/developers/gitignore/api/python,windows,pycharm 278 | 279 | /test-dir 280 | /removed.log 281 | /.idea 282 | /subcleaner.conf 283 | /log/ 284 | /logs/ 285 | *.log* 286 | /regex_profiles/ 287 | /databases/ 288 | -------------------------------------------------------------------------------- /libs/subcleaner/main.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import logging 3 | from typing import List, Dict 4 | from .subtitle import Subtitle, ParsingException, FileContentException 5 | from libs.subcleaner import cleaner, report_generator, languages, regex_lists 6 | from .settings import args, config 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | files_handled: List[str] = [] 11 | files_failed: Dict[str, str] 
def main():
    """Entry point: clean every subtitle/library given on the command line,
    then log (and, in silent/errors-only modes, print) a run summary.

    A KeyboardInterrupt aborts the cleaning loop but still produces a summary
    for whatever was processed up to that point.
    """
    try:
        for subtitle_file in args.subtitles:
            if subtitle_file.suffix == ".srt":
                logger.debug(f"cleaning file: {subtitle_file}")
                clean_file(subtitle_file)

        logger.debug(f"path libraries: {args.libraries}")
        for library_dir in args.libraries:
            logger.debug(f"cleaning library: {library_dir}")
            clean_directory(library_dir)
    except KeyboardInterrupt:
        logger.info("subcleaner aborted")

    cleaned = len(files_handled)
    failed = len(files_failed)

    # Nothing was cleaned at all: either everything failed or nothing matched.
    if not files_handled:
        if files_failed:
            message = f"subcleaner didn't successfully clean any files, failed to clean {failed} files."
            logger.error(message)
            if args.silent:
                print(message)
        else:
            message = "subcleaner didn't find any files to clean!"
            logger.error(message)
            if args.silent:
                print(message)
        return

    # Optional aggregate report (only meaningful for multi-file runs).
    if args.end_report and cleaned > 1:
        logger.info("end of run report: \n" + report_generator.generate_end_report())

    if not files_failed:
        message = f"subcleaner finished successfully. {cleaned} files cleaned."
        logger.info(message)
        if args.silent or args.errors_only:
            print(message)
    else:
        message = f"subcleaner finished successfully partly. {cleaned}/{cleaned + failed} files cleaned successfully."
        logger.info(message)
        logger.info("failed to clean following files:")
        for file_name, reason in files_failed.items():
            logger.info(f" - '{file_name}' reason: {reason}")
        if args.errors_only:
            print(message)
def clean_file(subtitle_file: Path) -> None:
    """Clean a single .srt file: load it, strip ad blocks, optionally fix
    overlaps, and write the result back (unless dry-running).

    Outcomes are recorded in the module-level ``files_handled`` /
    ``files_failed`` registries; a file already present in either is skipped.
    """
    if subtitle_file.name in files_handled or subtitle_file.name in files_failed:
        return
    logger.info("[---------------------------------------------------------------------------------]")

    # Prefer a path relative to the configured base for shorter log lines.
    try:
        short_file = subtitle_file.relative_to(config.relative_base)
    except ValueError:
        short_file = subtitle_file

    try:
        logger.info(f"loading subtitle: {short_file}")
        subtitle = Subtitle(subtitle_file)
    except (UnicodeDecodeError, ParsingException, FileContentException) as e:
        logger.error("subcleaner was unable to decode the file. reason:")
        logger.error(e)
        files_failed[subtitle_file.name] = f"subcleaner was unable to decode the file: {e}"
        return

    if not subtitle:
        logger.warning("Subtitle file is empty.")
        files_failed[subtitle_file.name] = "Subtitle file is empty."
        return

    if config.require_language_profile and not regex_lists.language_has_profile(subtitle.language):
        logger.warning(f"language '{subtitle.language}' have no regex profile associated with it.")
        logger.warning(f"either create a regex profile for it or disable require_language_profile in the config.")
        files_failed[subtitle_file.name] = f"language '{subtitle.language}' have no regex profile associated with it."
        return

    logger.info(f"now cleaning subtitle: {subtitle.short_path}")

    if not subtitle.language_is_correct():
        logger.warning(f"the language within the file does not match language: '{subtitle.language}'")

    changes = False
    cleaner.unscramble(subtitle)
    cleaner.find_ads(subtitle)
    if subtitle.ad_blocks:
        changes = True
        cleaner.remove_ads(subtitle)
    if config.fix_overlaps:
        # BUGFIX: call fix_overlap() first. The previous form
        # "changes = changes or cleaner.fix_overlap(subtitle)" short-circuited
        # the call away whenever ads had already been removed, so overlaps
        # were never fixed in exactly the files that needed rewriting.
        changes = cleaner.fix_overlap(subtitle) or changes
    cleaner.reset()

    if len(subtitle.blocks) == 0:
        # Everything would be deleted -- abort and report the hints shared by
        # all of the removed blocks.
        ad_blocks = list(subtitle.ad_blocks)
        # BUGFIX: the previous version iterated a list while removing from it
        # (which skips elements) and mutated the first block's hint list in
        # place. Build the common-hint list without mutating anything.
        reasons = []
        if ad_blocks:
            reasons = [hint for hint in ad_blocks[0].hints
                       if all(hint in block.hints for block in ad_blocks[1:])]

        logger.error("There might be an issue with the regex or the subtitle file, "
                     "because everything in the subtitle would have gotten deleted. "
                     "Nothing was altered.")
        if reasons:
            logger.error("all removed blocks had common reasons: " + ", ".join(reasons))
        files_failed[subtitle_file.name] = "aborted, removed all subtitles. all removed blocks had common reasons: " + ", ".join(reasons)
        return

    logger.info(f"Done. Cleaning report:\n{report_generator.generate_report(subtitle)}\n")
    files_handled.append(subtitle_file.name)
    # BUGFIX: the condition was inverted ("if changes:"), announcing
    # "no ads found" exactly when ads *had* been removed.
    if not changes:
        logger.info("no ads found")

    if args.dry_run:
        subtitle.to_content()
        logger.warning("dry run: nothing was altered.")
    else:
        if changes:
            with subtitle_file.open("w", encoding="UTF-8") as file:
                file.write(subtitle.to_content())
def clean_directory(directory: Path) -> None:
    """Recursively walk *directory* and clean every non-hidden .srt file.

    When a language filter is active (args.language), a file is only cleaned
    if one of its secondary suffixes (up to three, e.g. "movie.en.forced.srt")
    parses to that language code.
    """
    for entry in directory.iterdir():
        # Skip hidden files and directories.
        if entry.name.startswith("."):
            continue

        # Recurse into real sub-directories, but never follow symlinks.
        if entry.is_dir() and not entry.is_symlink():
            clean_directory(entry)

        if not entry.is_file() or entry.suffix != ".srt":
            continue

        if args.language:
            # Inspect at most the last three suffixes before ".srt".
            labels = entry.suffixes[max(-3, -len(entry.suffixes)):-1]
            for label in labels:
                # ".en-forced" / ".en_forced" / ".en:forced" -> "en"
                code = label.replace(":", "-").replace("_", "-").split("-")[0][1:]
                if languages.is_language(code) and args.language == code:
                    logger.debug(f"cleaning file: {entry}")
                    clean_file(entry)
        else:
            logger.debug(f"cleaning file: {entry}")
            clean_file(entry)
logger = logging.getLogger(__name__)

# Drives that have already passed (or been exempted from) the liveliness
# check. BUGFIX: set("C:") builds the set of *characters* {'C', ':'}, so the
# intended "always trust C:" pre-seed never matched disk.drive; use a set
# literal with a single "C:" element instead.
checked_disks = {"C:"}


def check_disk_liveliness(disk: Path) -> None:
    """Probe a (Windows) drive root once to make sure it is reachable.

    Each drive letter is checked at most once per run. The probe first tries
    to chdir into the drive and back; if that fails it falls back to touching
    and deleting a throw-away marker file. Access-denied style failures are
    treated as "alive"; only a drive that cannot be reached at all is logged
    as an error.
    """
    if disk.drive in checked_disks:
        return
    checked_disks.add(disk.drive)

    try:
        try:
            # Cheap probe: enter the drive and return to where we were.
            prev_cwd = Path.cwd()
            os.chdir(disk)
            os.chdir(prev_cwd)
            return
        except FileNotFoundError:
            # chdir failed -- try creating/removing a marker file instead.
            tmp_file = disk.joinpath(".subcleaner-disk-liveliness-checker.safe_to_delete")
            tmp_file.touch()
            tmp_file.unlink()
            return

    except (PermissionError, FileExistsError):
        # The drive answered, even if it refused us -- that counts as alive.
        return
    except FileNotFoundError:
        logger.error(f"The {disk} drive is currently inaccessible. please reconnect to the drive.")
" 58 | "code may contain :forced or other \"LANG:\" but these tags will be ignored") 59 | 60 | purge_list: List[int] 61 | parser.add_argument("--destroy", "-d", type=int, nargs="+", default=list(), 62 | help="original_index of blocks to remove from SUB, this option is not compatible with the " 63 | "library option. When this option is passed the script will mark the " 64 | "specified blocks as ads and then run normally. " 65 | "Example to destroy block 4 and 78: -d 4 78") 66 | 67 | dry_run: bool 68 | parser.add_argument("--dry-run", "-n", action="store_true", dest="dry_run", 69 | help="Dry run: No files are modified. (debug)") 70 | 71 | silent: bool 72 | parser.add_argument("--silent", "-s", action="store_true", dest="silent", 73 | help="Silent: Only print warnings or errors in stdout.") 74 | 75 | minimal: bool 76 | parser.add_argument("--minimal", "-m", action="store_true", dest="minimal", 77 | help=argparse.SUPPRESS) 78 | 79 | removed_only: bool 80 | parser.add_argument("--removed", "-a", action="store_true", dest="removed_only", 81 | help="Removed Only: Will only show removed blocks in cleaning report.") 82 | 83 | errors_only: bool 84 | parser.add_argument("--errors", "-e", action="store_true", dest="errors_only", 85 | help="Errors: Only print errors and will run in --dry-run mode.") 86 | 87 | no_log: bool 88 | parser.add_argument("--no-log", action="store_true", dest="no_log", 89 | help="No log: Nothing is logged to file.") 90 | 91 | sensitive: bool 92 | parser.add_argument("--sensitive", action="store_true", dest="sensitive", 93 | help="Sensitive: Log all blocks adjacent to ads as warnings (debug).") 94 | 95 | explain: bool 96 | parser.add_argument("--explain", action="store_true", dest="explain", 97 | help=argparse.SUPPRESS) 98 | 99 | no_explain: bool 100 | parser.add_argument("--no-explain", action="store_true", dest="no_explain", 101 | help="No explain: suppresses explanations for why blocks got removed or received warnings.") 102 | 103 | end_report: bool 
104 | parser.add_argument("--end-report", action="store_true", dest="end_report", 105 | help="End Report: shows a report at the end displaying unique removed/warning blocks in this run" 106 | "removed blocks with less than 9 warnings are sorted from fewest removed block with same content " 107 | "and warning is sorted from most warned blocks with the same content. (debug)") 108 | 109 | debug: bool 110 | parser.add_argument("--debug", action="store_true", dest="debug", 111 | help="Debug: argument collection that contains arguments: " 112 | "--dry-run, --sensitive and --end-report") 113 | 114 | args = parser.parse_args() 115 | # check usage: 116 | 117 | if len(args.subtitle) == 0 and len(args.library) == 0: 118 | parser.print_help() 119 | exit() 120 | 121 | debug = args.debug 122 | if debug: 123 | print("debug mode.") 124 | 125 | if debug: 126 | print(f"arg.library: {args.library}") 127 | 128 | libraries = [] 129 | for library_str in args.library: 130 | library: Path = Path(library_str) 131 | if not library.is_absolute(): 132 | if library_str[0:2] == "./": 133 | library = Path.cwd().joinpath(library) 134 | else: 135 | library = config.relative_base.joinpath(library) 136 | if isinstance(library, pathlib.WindowsPath): 137 | check_disk_liveliness(Path(library.drive + "/")) 138 | 139 | for item in glob.glob(glob.escape(str(library)).replace("[*]", "*")): 140 | item = Path(item).resolve() 141 | if item.is_dir(): 142 | libraries.append(item) 143 | 144 | if debug: 145 | print(f"arg.subtitle: {args.subtitle}") 146 | 147 | subtitles = [] 148 | for file_str in args.subtitle: 149 | file = Path(file_str) 150 | if not file.is_absolute(): 151 | if file_str[0:2] == "./": 152 | file = Path.cwd().joinpath(file) 153 | else: 154 | file = config.relative_base.joinpath(file) 155 | if isinstance(file, pathlib.WindowsPath): 156 | check_disk_liveliness(Path(file.drive + "/")) 157 | 158 | for item in glob.glob(glob.escape(str(file)).replace("[*]", "*")): 159 | item = Path(item).resolve() 160 
class Detector(object):
    '''
    Detector class is to detect language from specified text.
    Its instance is able to be constructed via the factory class DetectorFactory.

    After appending a target text to the Detector instance with .append(string),
    the detector provides the language detection results for target text via .detect() or .get_probabilities().

    .detect() method returns a single language name which has the highest probability.
    .get_probabilities() methods returns a list of multiple _languages and their probabilities.

    The detector has some parameters for language detection.
    See set_alpha(double), .set_max_text_length(int) .set_prior_map(dict).

    Example:

        from langdetect.detector_factory import DetectorFactory
        factory = DetectorFactory()
        factory.load_profile('/path/to/profile/directory')

        def detect(text):
            detector = factory.create()
            detector.append(text)
            return detector.detect()

        def detect_langs(text):
            detector = factory.create()
            detector.append(text)
            return detector.get_probabilities()
    '''

    # Smoothing parameter (additive weight per n-gram) and the width of the
    # gaussian jitter applied to it on every trial.
    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05

    ITERATION_LIMIT = 1000   # hard cap on sampling iterations per trial
    PROB_THRESHOLD = 0.1     # languages below this probability are dropped from results
    CONV_THRESHOLD = 0.99999 # convergence: stop once a single language dominates
    BASE_FREQ = 10000
    UNKNOWN_LANG = 'unknown'

    # Patterns used to blank out URLs and e-mail addresses before detection.
    URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
    MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')

    def __init__(self, factory):
        # Model data shared with (and owned by) the DetectorFactory.
        self.word_lang_prob_map = factory.word_lang_prob_map
        self.langlist = factory.langlist
        self.seed = factory.seed
        self.random = random.Random()
        self.text = ''
        self.langprob = None

        self.alpha = self.ALPHA_DEFAULT
        self.n_trial = 7
        self.max_text_length = 10000
        self.prior_map = None
        self.verbose = False

    def set_verbose(self):
        '''Enable verbose tracing of the detection trials (debug aid).'''
        self.verbose = True

    def set_alpha(self, alpha):
        '''Override the default smoothing parameter.'''
        self.alpha = alpha

    def set_prior_map(self, prior_map):
        '''Set prior information about language probabilities.

        *prior_map* maps language name -> non-negative weight; weights are
        normalized so they sum to 1. Languages missing from the map get a
        prior of 0. Raises LangDetectException on a negative weight or if
        every weight is zero.
        '''
        self.prior_map = [0.0] * len(self.langlist)
        sump = 0.0
        for i in range(len(self.prior_map)):
            lang = self.langlist[i]
            if lang in prior_map:
                p = prior_map[lang]
                if p < 0:
                    raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
                self.prior_map[i] = p
                sump += p
        if sump <= 0.0:
            raise LangDetectException(ErrorCode.InitParamError, 'More one of prior probability must be non-zero.')
        for i in range(len(self.prior_map)):
            self.prior_map[i] /= sump

    def set_max_text_length(self, max_text_length):
        '''Specify max size of target text to use for language detection.
        The default value is 10000(10KB).
        '''
        self.max_text_length = max_text_length

    def append(self, text):
        '''Append the target text for language detection.
        If the total size of target text exceeds the limit size specified by
        Detector.set_max_text_length(int), the rest is cut down.
        '''
        # Strip URLs/addresses and normalize Vietnamese diacritics first.
        text = self.URL_RE.sub(' ', text)
        text = self.MAIL_RE.sub(' ', text)
        text = NGram.normalize_vi(text)
        pre = 0
        # Copy up to max_text_length characters, collapsing runs of spaces.
        for i in range(min(len(text), self.max_text_length)):
            ch = text[i]
            if ch != ' ' or pre != ' ':
                self.text += ch
            pre = ch

    def cleaning_text(self):
        '''Cleaning text to detect
        (eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).
        '''
        latin_count, non_latin_count = 0, 0
        for ch in self.text:
            # NOTE: 'A' <= ch <= 'z' also matches '[', '\\', ']', '^', '_',
            # '`' (ASCII 91-96) -- quirk inherited from upstream langdetect.
            if 'A' <= ch <= 'z':
                latin_count += 1
            elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
                non_latin_count += 1

        # Mostly non-Latin text: drop the Latin characters entirely so stray
        # Latin fragments don't skew detection.
        if latin_count * 2 < non_latin_count:
            text_without_latin = ''
            for ch in self.text:
                if ch < 'A' or 'z' < ch:
                    text_without_latin += ch
            self.text = text_without_latin

    def detect(self):
        '''Detect language of the target text and return the language name
        which has the highest probability.
        '''
        probabilities = self.get_probabilities()
        if probabilities:
            return probabilities[0].lang
        return self.UNKNOWN_LANG

    def get_probabilities(self):
        '''Return the candidate languages sorted by descending probability,
        running the detection trials on first use (result is cached).'''
        if self.langprob is None:
            self._detect_block()
        return self._sort_probability(self.langprob)

    def _detect_block(self):
        '''Run n_trial randomized sampling trials and average the resulting
        per-language probabilities into self.langprob.'''
        self.cleaning_text()
        ngrams = self._extract_ngrams()
        if not ngrams:
            raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')

        self.langprob = [0.0] * len(self.langlist)

        # Seed once so repeated detections on the same text are deterministic.
        self.random.seed(self.seed)
        for t in range(self.n_trial):
            prob = self._init_probability()
            # Per-trial jitter of the smoothing parameter.
            alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH

            i = 0
            while True:
                # Update with a randomly sampled n-gram; check convergence
                # every 5th iteration (normalization is relatively costly).
                self._update_lang_prob(prob, self.random.choice(ngrams), alpha)
                if i % 5 == 0:
                    if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:
                        break
                    if self.verbose:
                        six.print_('>', self._sort_probability(prob))
                i += 1
            for j in range(len(self.langprob)):
                self.langprob[j] += prob[j] / self.n_trial
            if self.verbose:
                six.print_('==>', self._sort_probability(prob))

    def _init_probability(self):
        '''Initialize the map of language probabilities.
        If there is the specified prior map, use it as initial map.
        '''
        if self.prior_map is not None:
            return list(self.prior_map)
        else:
            # Uniform prior over all known languages.
            return [1.0 / len(self.langlist)] * len(self.langlist)

    def _extract_ngrams(self):
        '''Extract n-grams from target text.

        Only n-grams (n = 1..NGram.N_GRAM) that exist in the trained
        word_lang_prob_map are kept; all-caps words are skipped.
        '''
        RANGE = list(range(1, NGram.N_GRAM + 1))

        result = []
        ngram = NGram()
        for ch in self.text:
            ngram.add_char(ch)
            if ngram.capitalword:
                continue
            for n in RANGE:
                # optimized w = ngram.get(n)
                if len(ngram.grams) < n:
                    break
                w = ngram.grams[-n:]
                if w and w != ' ' and w in self.word_lang_prob_map:
                    result.append(w)
        return result

    def _update_lang_prob(self, prob, word, alpha):
        '''Update language probabilities with N-gram string(N=1,2,3).'''
        if word is None or word not in self.word_lang_prob_map:
            return False

        lang_prob_map = self.word_lang_prob_map[word]
        if self.verbose:
            six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))

        # Additive smoothing: every language gets at least alpha/BASE_FREQ.
        weight = alpha / self.BASE_FREQ
        for i in range(len(prob)):
            prob[i] *= weight + lang_prob_map[i]
        return True

    def _word_prob_to_string(self, prob):
        '''Format the non-negligible per-language probabilities (debug aid).'''
        result = ''
        for j in range(len(prob)):
            p = prob[j]
            if p >= 0.00001:
                result += ' %s:%.5f' % (self.langlist[j], p)
        return result

    def _normalize_prob(self, prob):
        '''Normalize probabilities and check convergence by the maximun probability.
        '''
        maxp, sump = 0.0, sum(prob)
        for i in range(len(prob)):
            p = prob[i] / sump
            if maxp < p:
                maxp = p
            prob[i] = p
        return maxp

    def _sort_probability(self, prob):
        '''Return Language objects above PROB_THRESHOLD, highest first.'''
        result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
        result.sort(reverse=True)
        return result

    def _unicode_encode(self, word):
        '''Render non-ASCII characters as \\uXXXX escapes (debug aid).'''
        buf = ''
        for ch in word:
            if ch >= six.u('\u0080'):
                st = hex(0x10000 + ord(ch))[2:]
                while len(st) < 4:
                    st = '0' + st
                buf += r'\u' + st[1:5]
            else:
                buf += ch
        return buf
13 | 14 | [WARNING_REGEX] 15 | 16 | nl_warn1: \b(ondertitel(s|d|ing)?|(bij-?)?vertaa?l(d|er|ing|ingen)|(na-?)?bewerk(t|ing|ingen)|(na)?(ge)?controle(erd)?|(ge)?modific(aties?|eerd)|aan(ge)?vull?(d|ende?|ing|ingen)|aan(ge)?(boden|pas(t|singen))|mogelijk gemaakt|creatief supervisor|correcties?|gecorrigeerd|nagekeken|(na)?gecheckt|(her)?(na)?(ge)?(re)?synch?t?(ronis(atie(correcties?)?|ering|e(er)?d))?|(ge)?transcribee?r(ing|en|d)|transcript(s|ies?)?|verbeter(d|ing|ingen)|herzien(ing|ingen)?|gedownload|geript|(ge)?presenteer(d|t)|aflevering(en)?|episodes?)\b 17 | nl_warn2: \b(ondertitel(s|d|ing)?|(bij-?)?vertaa?l(d|er|ing|ingen)|(na-?)?bewerk(t|ing|ingen)|(na)?(ge)?controle(erd)?|(ge)?modific(aties?|eerd)|aan(ge)?vull?(d|ende?|ing|ingen)|aan(ge)?(boden|pas(t|singen))|mogelijk gemaakt|creatief supervisor|correcties?|gecorrigeerd|nagekeken|(na)?gecheckt|(her)?(na)?(ge)?(re)?synch?t?(ronis(atie(correcties?)?|ering|e(er)?d))?|(ge)?transcribee?r(ing|en|d)|transcript(s|ies?)?|verbeter(d|ing|ingen)|herzien(ing|ingen)?|gedownload|geript)\W+(door|van)\b 18 | nl_warn3: \.(nl|nu)\b 19 | 20 | ### Short/common nicknames/phrases 21 | nl_warn4: \b(888|ac|acolieten|arri[eë]lla|assenza|deluxe|d[eé]sir[eé]e|dutch|eagle|ericsson|heksje|investigator|jolly|jm|lain|mmf|mvv|mvw|oym|orange|pvt|razor|relentless|releases|rq|scarlett|sheeba|simply|skinny|sk|slabak|thc|tokke|vision|vsi|hooky|kwibus|savales|gvdl|mandy|kathmandu|justme|mimir|codar|jeltje|phantom|juggernaut)\b 22 | nl_warn5: \b(888|ac|acolieten|arri[eë]lla|assenza|d[eé]sir[eé]e|ericsson|heksje|investigator|jm|lain|mmf|mvv|mvw|oym|pvt|relentless|releases|rq|scarlett|sheeba|sk|slabak|thc|tokke|vsi|hooky|savales|gvdl|justme|codar|jeltje)\b 23 | 24 | ### English 25 | nl_warn6: 
\b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)synch?(ed|ro(nized)?)?|(re-?)?synch?(ed|ro(nized)?)|rip(ped)?|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|confor(m|med)|correct(ions?|ed)|transcri(be|bed|ption|ptions)|improve(d|ments)|sub(s|bed)?|provided|supported|encoded|edit(ed|s)?|download(ed)?|present(s|ing|ed)|credits)\b 26 | nl_warn7: \b(caption(s|ed|ing)?|subtitl(e|ed|es|ing)|fixed|(re-?)synch?(ed|ro(nized)?)?|(re-?)?synch?(ed|ro(nized)?)|ripped|modified|translat(e|ed|ion|ions)|creat(ed|ion|ions)|conformed|correct(ions?|ed)|transcri(be|bed|ption|ptions)|improve(d|ments)|sub(s|bed)|provided|supported|encoded|edit(ed|s)?|downloaded|present(s|ing|ed))\b 27 | 28 | ### From no_profile config 29 | nl_warn8: \b(broadcasting|metamorfose|Arun|Aramis|KKB|ydy|snuif)\b 30 | 31 | #nl_warn#: Regex goes here. 32 | 33 | [PURGE_REGEX] 34 | 35 | nl_purge1: \b(ondertitel(s|d|ing)?|(bij-?)?vertaa?l(d|er|ing|ingen)|(na-?)?bewerk(t|ing|ingen)|(na)?(ge)?controle(erd)?|(ge)?modific(aties?|eerd)|aan(ge)?vull?(d|ende?|ing|ingen)|aan(ge)?(boden|pas(t|singen))|mogelijk gemaakt|creatief supervisor|correcties?|gecorrigeerd|nagekeken|(na)?gecheckt|(her)?(na)?(ge)?(re)?synch?t?(ronis(atie(correcties?)?|ering|e(er)?d))?|(ge)?transcribee?r(ing|en|d)|transcript(s|ies?)?|verbeter(d|ing|ingen)|herzien(ing|ingen)?|gedownload|geript)\W*(door|van)?\W*(:|;).. 
36 | 37 | ### Advertisements 38 | nl_purge2: \b(tv ?-?(piraat|box|aanbod)|p\.j\.|allesin1box|gratisstreamen|goedkope ?webhosting|word vip ?-?member|(beoordeel|download) deze (ondertitel|subtitle)|promoot uw product|areslive|plz donate|streambox)\b 39 | 40 | ### Translation agencies 41 | nl_purge3: \b(invision|iyuno(mg)?|sdi (media|group)|bti studios|titrafilm|hoek & son[eé]pouse|p2p (ondertiteling|subtitling)|broadcast text international|odmedia|visiontext|amsterdams vertalers[ck]olle[ck]tief)\b 42 | 43 | ### Amateur/volunteer subtitler nicknames 44 | nl_purge4: \b(Goffini|Muzatte|Suurtje|Daboy|Delapluma|Depositair|Brown-Eyes|Copy2004AP|ED2K4U|Megamaker|SKVCD|pevi70|Nightfalls|WinchesterGirl|pinkGaai|ChillyWitch|meatlove100|apimpnamedslickback|vidioot|OliverKlozoff)\b 45 | nl_purge5: \b(Ren ?H[oö]k|FuBre|Skip77|Cks321|DevilsBackbone|Appie ?van ?de ?3e|Jamees|Cdrazar|SatCP|Johnny ?Lion|Janty|Pgmaker|Baseripper|L4Y|Flitskikker|WH1T3R0S3|Spookstah|MrTheoW|Thomilla|Zuiberknaf|VitoSilans)\b 46 | nl_purge6: \b(Cranedriver|Find[eé]k[aà]no|Stevo|AchtAchtAcht|Dweez|Rustroest|cjdijk|pvdc|One2Sub|Zero_1|NederSubs|Kiry|FLAK|eXtAsls|bdzzld|ropo64|fatlobster|DiscoRobert|Peter4871|Marc2008|Thai-?Tanic|Pid0ck|HaiHai)\b 47 | nl_purge7: \b(MrPallMall|BorisVictor|YouWontKnowWho|JohnP|DZJZMLU|Pielie|SmallBrother|Trilker|MartinH|Bas2003|ThaFlow|minouhse|kDragon|Converted007|D4RK4RMY|ddihzw|kranf|Jaloxaji|michelono|rotzooi1111|Biteme|DutchReality)\b 48 | 49 | ### Professional subtitler names 50 | nl_purge8: \b((Frank|Richard) B[oe]velander|Marjolein Meeuwisse|Frederik Haan|Brigitta Broeke|Annemarieke Schaap|Maria (Mohr|van Veldhuizen)|Peter (Bosma|van Loenhout)|(Amber|Charlotte|Gerrie|Sylvy|Jeanne) (Bi?(rugg)?|Not|Ti(mm|el))er?mans?|Jenneke Takens|Etienne Lantier|Birgit Leerling|Jos[eé] van de Kamp|Inge van Balgooij|Christiaan Tamerus|Emily Moorlach|Judith (IJpelaar|Schep)|Dirk Klinkenberg)\b 51 | nl_purge9: \b(Suzan Hogenbirk|Sanne (Derks|Egelmeers|van der Meij)|Tineke 
(Blokzijl|Haar)|Theresa van der Gruit|Femke Meijer|([JL]orien|Flor[iu]s) (Hakvoort|Franssen|van Rooijen)|Xander Purcell?|Sofie Janssen|Bart Heuvelmans|Mathias Van den Branden|Myl[eè]ne Delfos|Leen Schonken|Maartje van de[nr] (Brink|Zeijden)|Jake Dozier|Tom Steinbusch|Linda van der Logt|Shirley Delnoy|Allettie Bastiaansen)\b 52 | nl_purge10: \b(Marl(een|oes) (Kerssens|Bakker|Gimpel|Penders)|Lana Goossens|Geert (Spekken|van (den )?(Elzen|Bremen))|Alexander Eckhardt|(Brian|Catharine) Winter|Lars Altena|Sikko Bos|Mar[cky] (Ann Smit|de Jongh|de Klerk)|Edward (van Veen|Rekkers)|Michiel Nijenhuis|Ben Zuidema|Juli[eë]tte van Gurp|Jos Verberne|Stijn van Oosterbos|Mieke Vanhengel|Anke Elzinga|Sara Isabel Lette|Len Van Renterghem)\b 53 | nl_purge11: \b((Evan?|Owen) (Dorrestein|de Gans)|Sandra Vandenbussche|J\.J\. Ritzer|Karen Lagendijk|Ren[eé] van Vliet|Barbara Born|Roel Salden|Elly van der Meijden|Elize Preenen|Joost Poort|Chris (Freriks|Reuvers)|Diane Loogman|Amabile Keijer|Caroline Snijder|Elisabeth Barber|Harri[eë]t de Vette|Annemiek Krol|Jessica (van Doremalen|Rietveld)|Robert(de Ridder|Geurtsen)|Rico Nouromid|Carla Kloet)\b 54 | nl_purge12: \b((Rachel|Wietske|Pierre) (van )?(der )?Pol(man)?|Jolanda (Ursem|Jongedijk|van den Berg)|Martijn van Berkel|Mari[ej]k?e (Loonen|Schuring|Kok)|Esther (Daa?ms(teeg)?|van Opstal)|An[ns] (van Bogaert|Bouter)|Naomi Verheggen|Maxime van der Glas|Maaike van der Heijden|Laurence de Moor|Carmen Ket|Anja Stoop|Dennis Strik|Dani[eë]l Vos|Mireille Van Overmeiren|Bonnie Dekker|Jenny Mizrahi)\b 55 | nl_purge13: \b(Ric?k de (Laat|Best)|Wim Gerbecks|Jordi Schipper|Lieuwe Osinga|Trudy Kloet|Erik Brommeijer|Bianca van der Meulen|Muriel Bouillon|Leonie Klaassen|Noortje Ganzevles|Tessa (Kuijpers|van Schijndel)|Matthijs Dijkstra|Maurice Voogd|Arjan van Tuijl|Nikki van Leeuwen|Cora Sendon|Petra Swelsen|Wouter Groothuis|Cindy Hemink|Deirdre Malone|Martijn Beunk|Monique Houben|Michael Albers|Edina van Daalen)\b 56 | 57 | ### Subtitling/release groups 58 | 
nl_purge14: \b(een netflix\W(original\W)?(documentaire|film|serie)|netflix presenteert|QoQ|Quality over Quantity|(simply|pvt) releases|bierdopje\.com|nlondertitels|subtitlesnl|ondertitels?(\.cc|\.com)|nlsub|yifi|(het robot|srt) team|CustomNL|place2home|fmsubs|FTC-SubTeam)\b 59 | nl_purge15: \bs(eizoen)?\W*\d+[^,]\W*a(flevering)?\W*\d+[^,] 60 | 61 | ### From no_profile config 62 | nl_purge16: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 63 | nl_purge17: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clear\W*way\W*law) 64 | nl_purge18: \b(Filthy\W*Rich\W*Futures|celebrity\W*sex|share\W*university) 65 | nl_purge19: \b(Americas\W*Cardroom|save\W*an\W*illuminati|Clearway\W*law) 66 | nl_purge20: \b(UNiTED\W*TEAM|admitme|ragbear|looklive|Camikaze|SourGrass|mstoll|alire2a)\b 67 | nl_purge21: \b(normita|EhLaNa|playships|sunmenghao|nessundorma|seriestele|DarKsh|vothaison)\b 68 | nl_purge22: \b(anana|cRosKy|misshu|Xenzai|swsub|divx|empiremedia|La Fabrique|benj)\b 69 | nl_purge23: \b(dawaith|MoSub|Golgi|Linwelin|Malikay|Ricana|Sadgeezer|argenteam|tiobetonh|chebinhdan)\b 70 | 71 | #nl_purge#: Regex goes here. 72 | 73 | 74 | 75 | # 76 | # -----------------------------------------GUIDE------------------------------------------------- 77 | # 78 | 79 | # This language profile contains two lists of regex that will look for patterns. 80 | # if you wish to modify or remove any regex, feel free to do so 81 | # but files in the default folder will be overwritten when you update the script. 82 | # You can add and remove keys as long as two keys don't use the same key twice. 83 | 84 | # WARNING_REGEX: 85 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 86 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 87 | # 1 warning is ignored 88 | # 2 warnings will print the block as a WARNING in the log. 89 | # 3 warnings or more will remove the entire block. 
'''Character n-gram buffer used by the language detector.

Part of the bundled langdetect port: characters are normalized per Unicode
block, then fed one at a time into a rolling buffer from which 1..N_GRAM
length grams are read back.
'''
import re

import libs.six as six

from . import messages
from .unicode_block import (
    unicode_block,
    UNICODE_BASIC_LATIN,
    UNICODE_LATIN_1_SUPPLEMENT,
    UNICODE_LATIN_EXTENDED_B,
    UNICODE_GENERAL_PUNCTUATION,
    UNICODE_ARABIC,
    UNICODE_LATIN_EXTENDED_ADDITIONAL,
    UNICODE_HIRAGANA,
    UNICODE_KATAKANA,
    UNICODE_BOPOMOFO,
    UNICODE_BOPOMOFO_EXTENDED,
    UNICODE_CJK_UNIFIED_IDEOGRAPHS,
    UNICODE_HANGUL_SYLLABLES,
)


class NGram(object):
    '''Rolling buffer holding the last N_GRAM normalized characters.'''

    # Latin-1 supplement characters treated as word separators
    # (string loaded from the messages resource file).
    LATIN1_EXCLUDED = messages.get_string('NGram.LATIN1_EXCLUDE')
    N_GRAM = 3  # maximum gram length kept in the buffer

    def __init__(self):
        self.grams = ' '  # buffer always starts with a space marker
        self.capitalword = False  # True while inside an all-caps word

    def add_char(self, ch):
        '''Append a character into ngram buffer.'''
        ch = self.normalize(ch)
        last_char = self.grams[-1]
        if last_char == ' ':
            # Previous character ended a word: restart the buffer.
            self.grams = ' '
            self.capitalword = False
            if ch == ' ':
                return
        elif len(self.grams) >= self.N_GRAM:
            # Buffer is full: drop the oldest character before appending.
            self.grams = self.grams[1:]
        self.grams += ch

        if ch.isupper():
            if last_char.isupper():
                self.capitalword = True
        else:
            self.capitalword = False

    def get(self, n):
        '''Get n-gram of length n, or None when none is available.

        Returns None for grams inside all-capital words, for n outside
        1..N_GRAM, and when the buffer does not yet hold n characters.
        '''
        if self.capitalword:
            return
        if n < 1 or n > self.N_GRAM or len(self.grams) < n:
            return
        if n == 1:
            ch = self.grams[-1]
            if ch == ' ':
                # A lone separator is not a useful unigram.
                return
            return ch
        else:
            return self.grams[-n:]

    @classmethod
    def normalize(cls, ch):
        '''Map ch to a representative character for its Unicode block.

        Characters that carry no language signal (punctuation, excluded
        Latin-1, non-alphabetic ASCII) collapse to a space separator;
        whole scripts (kana, bopomofo, hangul) collapse to one
        representative character each.
        '''
        block = unicode_block(ch)
        if block == UNICODE_BASIC_LATIN:
            if ch < 'A' or ('Z' < ch < 'a') or 'z' < ch:
                ch = ' '  # non-alphabetic ASCII becomes a separator
        elif block == UNICODE_LATIN_1_SUPPLEMENT:
            if cls.LATIN1_EXCLUDED.find(ch) >= 0:
                ch = ' '
        elif block == UNICODE_LATIN_EXTENDED_B:
            # normalization for Romanian
            if ch == six.u('\u0219'):  # Small S with comma below => with cedilla
                ch = six.u('\u015f')
            if ch == six.u('\u021b'):  # Small T with comma below => with cedilla
                ch = six.u('\u0163')
        elif block == UNICODE_GENERAL_PUNCTUATION:
            ch = ' '
        elif block == UNICODE_ARABIC:
            if ch == six.u('\u06cc'):
                ch = six.u('\u064a')  # Farsi yeh => Arabic yeh
        elif block == UNICODE_LATIN_EXTENDED_ADDITIONAL:
            if ch >= six.u('\u1ea0'):
                ch = six.u('\u1ec3')
        elif block == UNICODE_HIRAGANA:
            ch = six.u('\u3042')
        elif block == UNICODE_KATAKANA:
            ch = six.u('\u30a2')
        elif block in (UNICODE_BOPOMOFO, UNICODE_BOPOMOFO_EXTENDED):
            ch = six.u('\u3105')
        elif block == UNICODE_CJK_UNIFIED_IDEOGRAPHS:
            # Collapse each CJK ideograph to its class representative.
            ch = cls.CJK_MAP.get(ch, ch)
        elif block == UNICODE_HANGUL_SYLLABLES:
            ch = six.u('\uac00')
        return ch

    @classmethod
    def normalize_vi(cls, text):
        '''Normalizer for Vietnamese.
        Normalize Alphabet + Diacritical Mark(U+03xx) into U+1Exx.
        '''
        def repl(m):
            alphabet = cls.TO_NORMALIZE_VI_CHARS.find(m.group(1))
            dmark = cls.DMARK_CLASS.find(m.group(2))  # Diacritical Mark
            return cls.NORMALIZED_VI_CHARS[dmark][alphabet]
        return cls.ALPHABET_WITH_DMARK.sub(repl, text)

    # Pre-composed Vietnamese characters, one resource row per combining mark.
    NORMALIZED_VI_CHARS = [
        messages.get_string('NORMALIZED_VI_CHARS_0300'),
        messages.get_string('NORMALIZED_VI_CHARS_0301'),
        messages.get_string('NORMALIZED_VI_CHARS_0303'),
        messages.get_string('NORMALIZED_VI_CHARS_0309'),
        messages.get_string('NORMALIZED_VI_CHARS_0323')]
    TO_NORMALIZE_VI_CHARS = messages.get_string('TO_NORMALIZE_VI_CHARS')
    DMARK_CLASS = messages.get_string('DMARK_CLASS')
    ALPHABET_WITH_DMARK = re.compile(
        '([' + TO_NORMALIZE_VI_CHARS + '])([' + DMARK_CLASS + '])',
        re.UNICODE)

    # CJK Kanji Normalization Mapping
    # Each resource string is one equivalence class of ideographs.
    CJK_CLASS = [
        messages.get_string('NGram.KANJI_1_0'),
        messages.get_string('NGram.KANJI_1_2'),
        messages.get_string('NGram.KANJI_1_4'),
        messages.get_string('NGram.KANJI_1_8'),
        messages.get_string('NGram.KANJI_1_11'),
        messages.get_string('NGram.KANJI_1_12'),
        messages.get_string('NGram.KANJI_1_13'),
        messages.get_string('NGram.KANJI_1_14'),
        messages.get_string('NGram.KANJI_1_16'),
        messages.get_string('NGram.KANJI_1_18'),
        messages.get_string('NGram.KANJI_1_22'),
        messages.get_string('NGram.KANJI_1_27'),
        messages.get_string('NGram.KANJI_1_29'),
        messages.get_string('NGram.KANJI_1_31'),
        messages.get_string('NGram.KANJI_1_35'),
        messages.get_string('NGram.KANJI_2_0'),
        messages.get_string('NGram.KANJI_2_1'),
        messages.get_string('NGram.KANJI_2_4'),
        messages.get_string('NGram.KANJI_2_9'),
        messages.get_string('NGram.KANJI_2_10'),
        messages.get_string('NGram.KANJI_2_11'),
        messages.get_string('NGram.KANJI_2_12'),
        messages.get_string('NGram.KANJI_2_13'),
        messages.get_string('NGram.KANJI_2_15'),
        messages.get_string('NGram.KANJI_2_16'),
        messages.get_string('NGram.KANJI_2_18'),
        messages.get_string('NGram.KANJI_2_21'),
        messages.get_string('NGram.KANJI_2_22'),
        messages.get_string('NGram.KANJI_2_23'),
        messages.get_string('NGram.KANJI_2_28'),
        messages.get_string('NGram.KANJI_2_29'),
        messages.get_string('NGram.KANJI_2_30'),
        messages.get_string('NGram.KANJI_2_31'),
        messages.get_string('NGram.KANJI_2_32'),
        messages.get_string('NGram.KANJI_2_35'),
        messages.get_string('NGram.KANJI_2_36'),
        messages.get_string('NGram.KANJI_2_37'),
        messages.get_string('NGram.KANJI_2_38'),
        messages.get_string('NGram.KANJI_3_1'),
        messages.get_string('NGram.KANJI_3_2'),
        messages.get_string('NGram.KANJI_3_3'),
        messages.get_string('NGram.KANJI_3_4'),
        messages.get_string('NGram.KANJI_3_5'),
        messages.get_string('NGram.KANJI_3_8'),
        messages.get_string('NGram.KANJI_3_9'),
        messages.get_string('NGram.KANJI_3_11'),
        messages.get_string('NGram.KANJI_3_12'),
        messages.get_string('NGram.KANJI_3_13'),
        messages.get_string('NGram.KANJI_3_15'),
        messages.get_string('NGram.KANJI_3_16'),
        messages.get_string('NGram.KANJI_3_18'),
        messages.get_string('NGram.KANJI_3_19'),
        messages.get_string('NGram.KANJI_3_22'),
        messages.get_string('NGram.KANJI_3_23'),
        messages.get_string('NGram.KANJI_3_27'),
        messages.get_string('NGram.KANJI_3_29'),
        messages.get_string('NGram.KANJI_3_30'),
        messages.get_string('NGram.KANJI_3_31'),
        messages.get_string('NGram.KANJI_3_32'),
        messages.get_string('NGram.KANJI_3_35'),
        messages.get_string('NGram.KANJI_3_36'),
        messages.get_string('NGram.KANJI_3_37'),
        messages.get_string('NGram.KANJI_3_38'),
        messages.get_string('NGram.KANJI_4_0'),
        messages.get_string('NGram.KANJI_4_9'),
        messages.get_string('NGram.KANJI_4_10'),
        messages.get_string('NGram.KANJI_4_16'),
        messages.get_string('NGram.KANJI_4_17'),
        messages.get_string('NGram.KANJI_4_18'),
        messages.get_string('NGram.KANJI_4_22'),
        messages.get_string('NGram.KANJI_4_24'),
        messages.get_string('NGram.KANJI_4_28'),
        messages.get_string('NGram.KANJI_4_34'),
        messages.get_string('NGram.KANJI_4_39'),
        messages.get_string('NGram.KANJI_5_10'),
        messages.get_string('NGram.KANJI_5_11'),
        messages.get_string('NGram.KANJI_5_12'),
        messages.get_string('NGram.KANJI_5_13'),
        messages.get_string('NGram.KANJI_5_14'),
        messages.get_string('NGram.KANJI_5_18'),
        messages.get_string('NGram.KANJI_5_26'),
        messages.get_string('NGram.KANJI_5_29'),
        messages.get_string('NGram.KANJI_5_34'),
        messages.get_string('NGram.KANJI_5_39'),
        messages.get_string('NGram.KANJI_6_0'),
        messages.get_string('NGram.KANJI_6_3'),
        messages.get_string('NGram.KANJI_6_9'),
        messages.get_string('NGram.KANJI_6_10'),
        messages.get_string('NGram.KANJI_6_11'),
        messages.get_string('NGram.KANJI_6_12'),
        messages.get_string('NGram.KANJI_6_16'),
        messages.get_string('NGram.KANJI_6_18'),
        messages.get_string('NGram.KANJI_6_20'),
        messages.get_string('NGram.KANJI_6_21'),
        messages.get_string('NGram.KANJI_6_22'),
        messages.get_string('NGram.KANJI_6_23'),
        messages.get_string('NGram.KANJI_6_25'),
        messages.get_string('NGram.KANJI_6_28'),
        messages.get_string('NGram.KANJI_6_29'),
        messages.get_string('NGram.KANJI_6_30'),
        messages.get_string('NGram.KANJI_6_32'),
        messages.get_string('NGram.KANJI_6_34'),
        messages.get_string('NGram.KANJI_6_35'),
        messages.get_string('NGram.KANJI_6_37'),
        messages.get_string('NGram.KANJI_6_39'),
        messages.get_string('NGram.KANJI_7_0'),
        messages.get_string('NGram.KANJI_7_3'),
        messages.get_string('NGram.KANJI_7_6'),
        messages.get_string('NGram.KANJI_7_7'),
        messages.get_string('NGram.KANJI_7_9'),
        messages.get_string('NGram.KANJI_7_11'),
        messages.get_string('NGram.KANJI_7_12'),
        messages.get_string('NGram.KANJI_7_13'),
        messages.get_string('NGram.KANJI_7_16'),
        messages.get_string('NGram.KANJI_7_18'),
        messages.get_string('NGram.KANJI_7_19'),
        messages.get_string('NGram.KANJI_7_20'),
        messages.get_string('NGram.KANJI_7_21'),
        messages.get_string('NGram.KANJI_7_23'),
        messages.get_string('NGram.KANJI_7_25'),
        messages.get_string('NGram.KANJI_7_28'),
        messages.get_string('NGram.KANJI_7_29'),
        messages.get_string('NGram.KANJI_7_32'),
        messages.get_string('NGram.KANJI_7_33'),
        messages.get_string('NGram.KANJI_7_35'),
        messages.get_string('NGram.KANJI_7_37')]

    # Filled in by _init_cjk_map(): each CJK character -> representative
    # (first member) of its CJK_CLASS equivalence class.
    CJK_MAP = {}

    @classmethod
    def _init_cjk_map(cls):
        '''Build CJK_MAP from CJK_CLASS (runs once at import time).'''
        for cjk_list in cls.CJK_CLASS:
            representative = cjk_list[0]
            for ch in cjk_list:
                cls.CJK_MAP[ch] = representative

NGram._init_cjk_map()
class Subtitle:
    '''In-memory representation of a parsed subtitle (.srt) file.

    Parsing happens in the constructor: the file is read, split into
    SubBlock objects, assigned a language, and back-to-back duplicate
    blocks are merged. Blocks flagged as ads or warnings are tracked in
    the ad_blocks / warning_blocks sets.
    '''

    blocks: List[SubBlock]          # parsed blocks, in file order
    ad_blocks: Set[SubBlock]        # blocks marked as ads (to be removed)
    warning_blocks: Set[SubBlock]   # suspicious blocks (kept, but logged)
    language: str                   # language code, or "und" when unknown
    file: Path                      # absolute path of the subtitle file
    short_path: Path                # path relative to config.relative_base if possible
    pre_content_artifact: str = ""  # unparseable leading content, preserved verbatim

    def __init__(self, subtitle_file: Path) -> None:
        '''Read and parse subtitle_file.

        :raises FileContentException: if the file yields no non-empty blocks.
        '''
        self.file = subtitle_file
        self.blocks = []
        self.ad_blocks = set()
        self.warning_blocks = set()

        file_content = read_file(self.file)
        self._parse_file_content(file_content)

        for i in range(len(self.blocks)):
            self.blocks[i].current_index = i
        try:
            self.short_path = self.file.relative_to(config.relative_base)
        except ValueError:
            # File lives outside the relative base; keep the full path.
            self.short_path = self.file

        if not self:
            raise FileContentException(self.file)

        if args.language:
            self.language = args.language
        else:
            self.determine_language()

        if args.destroy_list:
            self.mark_blocks_for_deletion(args.destroy_list)

        # Merge consecutive blocks with identical content whose gap is
        # shorter than 1/31 s (presumably about one frame at ~30 fps --
        # TODO confirm intent).
        if len(self.blocks) > 1:
            prev_block = self.blocks[0]
            blocks_to_remove: Set[SubBlock] = set()
            for block in self.blocks[1:]:
                if block.content == prev_block.content and (block.start_time - prev_block.end_time).total_seconds() < 1/31:
                    prev_block.end_time = block.end_time
                    blocks_to_remove.add(block)
                    continue
                prev_block = block
            for block in blocks_to_remove:
                self.blocks.remove(block)

    def warn(self, block: SubBlock):
        '''Flag block as suspicious, unless it is already marked as an ad.'''
        if block not in self.ad_blocks:
            self.warning_blocks.add(block)

    def ad(self, block: SubBlock):
        '''Mark block as an ad; an ad supersedes any warning on the block.'''
        try:
            self.warning_blocks.remove(block)
        except KeyError:
            pass
        self.ad_blocks.add(block)

    def _parse_file_content(self, file_content: str) -> None:
        '''Normalize raw file text and split it into blocks.

        Builds a lookup from each timestamp line ("-->") to its original
        file line number so parsing errors can be reported precisely,
        then collapses blank lines and delegates to _breakup_block.

        :raises FileContentException: if the file has fewer than 2 lines.
        '''
        # Normalize em-dash arrows sometimes found in malformed files.
        file_content = file_content.replace("—>", "-->")
        current_line = 0
        line_lookup: Dict[str, int] = {}

        lines = file_content.split("\n")
        if len(lines) < 2:
            raise FileContentException(self.file)
        for line in lines:
            current_line += 1
            if "-->" in line:
                line_lookup[line] = current_line
        file_content = re.sub(r'\n\s*\n', '\n', file_content)
        file_content = file_content.strip()
        file_content_lines = file_content.split("\n")
        # Sentinel empty line so the final block is terminated.
        file_content_lines.append("")
        self._breakup_block(file_content_lines, line_lookup)

    def _breakup_block(self, lines: List[str], line_lookup: Dict[str, int]) -> None:
        '''Split blank-line-collapsed lines into SubBlock objects.

        Block starts are located via SubBlock.is_sub_block_header; a
        numeric line immediately before a header is treated as the block
        index line. Content before the first block is preserved in
        pre_content_artifact; unparseable chunks are appended to the
        previous block's content so no text is lost.
        '''
        last_break = 0
        start_index = 0
        # Find the first real block header (skip headers followed by
        # another header, which indicates a malformed block).
        for i in range(len(lines)):
            line = lines[i]
            if not SubBlock.is_sub_block_header(line) or i == len(lines)-1 or SubBlock.is_sub_block_header(lines[i+1]):
                continue
            start_index = i + 1
            if i == 0:
                last_break = i
                break

            previous_line = lines[i - 1]
            if previous_line[0].isnumeric():
                # Numeric line above the header is the block's index line.
                last_break = i - 1
            else:
                last_break = i
            break
        if last_break > 1:
            # There is content before the first block: report it.
            e = ParsingException(1, "incorrectly formatted subtitle block")
            e.subtitle_file = self.file
            e.file_line = line_lookup.get(lines[last_break], None)
            if not e.file_line:
                e.file_line = line_lookup.get(lines[last_break + 1], None)
            logger.warning(str(e))

        # Preserve leading unparsed content verbatim.
        for line in lines[:last_break]:
            if "-->" in line:
                line = line + "\n"
            self.pre_content_artifact += line + "\n"

        for i in range(start_index, len(lines)):
            line = lines[i]
            previous_line = lines[i-1]
            if not SubBlock.is_sub_block_header(line) or i == len(lines)-1 or SubBlock.is_sub_block_header(lines[i+1]):
                continue

            if previous_line[0].isnumeric():
                next_break = i - 1
            else:
                next_break = i

            try:
                block = SubBlock("\n".join(lines[last_break:next_break]), len(self.blocks) + 1)
            except ParsingException as e:
                e.subtitle_file = self.file
                e.file_line = line_lookup.get(lines[last_break], None)
                if not e.file_line:
                    e.file_line = line_lookup.get(lines[last_break+1], None)
                if not self.blocks:
                    # Nothing to attach the bad chunk to yet: keep it as artifact.
                    self.pre_content_artifact += "\n" + "\n".join(lines[last_break:next_break]) + "\n"
                logger.warning(e)
                # Append the unparseable chunk to the previous block.
                self.blocks[-1].content += "\n\n" + "\n".join(lines[last_break:next_break])
                continue

            if block.content:
                self.blocks.append(block)
                if "-->" in block.content:
                    # Timestamp arrow inside content: malformed block.
                    self.warn(block)
                    block.hints.append("malformed_block")
            last_break = next_break
        # Parse the trailing block after the last detected header.
        try:
            block = SubBlock("\n".join(lines[last_break:]), len(self.blocks) + 1)
        except ParsingException as e:
            e.subtitle_file = self.file
            e.file_line = line_lookup.get(lines[last_break], None)
            if not e.file_line:
                e.file_line = line_lookup.get(lines[last_break + 1], None)
            logger.warning(e)
            if not self.blocks:
                raise e
            self.blocks[-1].content += "\n\n" + "\n".join(lines[last_break:])
            return
        if block.content:
            self.blocks.append(block)
            if "-->" in block.content:
                self.warn(block)
                block.hints.append("malformed_block")

    def mark_blocks_for_deletion(self, purge_list: List[int]) -> None:
        '''Force-delete blocks by their original subtitle index.

        Matches blocks by original_index; when no block carries that
        index, falls back to positional indexing and logs the mismatch.
        Deletion works by setting regex_matches to 3 (the purge threshold).
        '''
        for index in purge_list:
            for block in self.blocks:
                if block.original_index == index:
                    block.regex_matches = 3
                    block.hints.append("destroyed by index")
                    break
            else:
                # No block had that original index: fall back to position.
                if index-1 >= len(self.blocks):
                    continue
                block = self.blocks[index - 1]
                if not block.original_index or block.original_index == index:
                    block.regex_matches = 3
                    block.hints.append("destroyed by index")
                logger.warning("indexing in subtitle does not match with parsed subtitle.")

    def language_is_correct(self) -> bool:
        '''Check the declared language against the detected one.

        Returns True when verification is impossible (unknown language
        code, fewer than 500 chars of content, or detector failure), or
        when the detector agrees with >0.8 probability.
        '''
        if self.language == "und":
            return True  # unknown language.
        language_code_2 = languages.get_2letter_code(self.language)

        if not language_code_2:
            return True  # unknown language.

        sub_content: str = ""
        for block in self.blocks:
            sub_content += block.content

        if len(sub_content) < 500:
            return True  # not enough content to estimate language.
        try:
            detected_language = langdetect.detect_langs(sub_content)[0]
        except LangDetectException:
            logger.warning(f"{self} can't be analyzed by language detector.")
            return True

        return detected_language.lang == language_code_2 and detected_language.prob > 0.8

    def determine_language(self) -> None:
        '''Resolve self.language from config, filename suffixes, or detection.

        Priority: configured default language; a language tag among the
        last (up to two) suffixes before the extension (e.g. "x.en.srt");
        "hi"/"sdh" markers; finally the language detector, accepted only
        above 0.9 probability. Falls back to "und".
        '''
        if config.default_language:
            self.language = config.default_language
            return

        self.language = "und"

        found_hi = False
        found_sdh = False
        # Scan suffixes nearest to the extension first.
        for suffix in reversed(self.file.suffixes[max(-3, -len(self.file.suffixes)): -1]):
            parsed_lang = suffix.replace(":", "-").replace("_", "-").split("-")[0][1:]
            if parsed_lang == "hi":
                found_hi = True
                continue
            if parsed_lang == "sdh":
                found_sdh = True
                continue

            if languages.is_language(parsed_lang):
                self.language = parsed_lang
                return
        if found_hi:
            self.language = "hi"
            return
        if found_sdh:
            self.language = "sdh"
            return
        # todo: parse hi and sdh properly

        sub_content: str = ""
        for block in self.blocks:
            sub_content += block.content
        if len(sub_content) < 500:
            # Too little text for a reliable detection; stay "und".
            return
        try:
            detected_language = langdetect.detect_langs(sub_content)[0]
        except LangDetectException:
            logger.warning(f"{self} can't be analyzed by language detector.")
            return

        if detected_language.prob > 0.9:
            self.language = detected_language.lang

    def to_content(self) -> str:
        '''Serialize the subtitle back to .srt text (without trailing newline).'''
        content = self.pre_content_artifact
        for block in self.blocks:
            content += f"{block.current_index}\n" \
                       f"{block}\n" \
                       f"\n"

            if "-->" in block.content:
                logger.warning(f"potential malformed subtitle blocks in block {block.current_index}.")
        return content[:-1]

    def get_warning_indexes(self) -> List[str]:
        '''Return the current indexes of warning blocks, sorted, as strings.'''
        l: List[int] = []
        for block in self.warning_blocks:
            l.append(int(block.current_index))
        l.sort()
        return [str(x) for x in l]

    def reindex(self):
        '''Renumber blocks from 1 and clear indexes of removed ad blocks.'''
        index = 1
        for block in self.blocks:
            block.current_index = index
            index += 1
        for block in self.ad_blocks:
            block.current_index = None

    def __str__(self) -> str:
        return str(self.file)

    def __len__(self) -> int:
        # Number of parsed blocks.
        return len(self.blocks)

    def __bool__(self) -> bool:
        # Truthy when at least one block has content.
        for block in self.blocks:
            if block.content:
                return True
        return False


class FileContentException(Exception):
    '''Raised when a subtitle file yields no usable content.'''

    subtitle_file: str  # path of the offending file

    def __init__(self, subtitle_file):
        self.subtitle_file = subtitle_file

    def __str__(self) -> str:
        return f"File {self.subtitle_file} is empty."
def read_file(file: Path) -> str:
    """Read a subtitle file, trying several encodings in a fixed order.

    Decoding order: utf-8 (BOM-tolerant via "utf-8-sig"), then cp1252 on
    failure. If the decoded text contains no "-->" timestamp arrow the
    file was probably mis-decoded, so utf-16 and plain utf-8 are tried as
    fallbacks; if those also fail, whatever decoded earlier is returned
    unchanged (best effort).

    :param file: path of the subtitle file to read.
    :return: the decoded file content.
    """
    # todo: maybe fix decoding to be more reliable?
    try:
        file_content = file.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        file_content = file.read_text(encoding="cp1252")
    if "-->" not in file_content:
        # Every srt contains "-->"; its absence hints at a wrong decode.
        try:
            file_content = file.read_text(encoding="utf-16")
        except UnicodeDecodeError:
            try:
                file_content = file.read_text(encoding="utf-8")
            except UnicodeDecodeError:
                pass  # keep the earlier best-effort decode

    return file_content
૨":679," ૫":978," ૪":492," ૭":700," ૯":551," ૮":625,"અગિ":1051," વ":14987," શ":3197," ર":16267," લ":5654," સ":11956," હ":3244," થ":1672," ત":31864," ધ":1718," દ":21808," ડ":1738," ઠ":222," ભ":34182," બ":4095," ય":383," મ":24848," ન":6795," ફ":765," પ":35455," છ":24245," ચ":2656," ઘ":628," ટ":479," ઝ":829," જ":21642," ઓ":682," ગ":30845," ખ":8068," ક":14981," ઉ":4757," એ":23366," આ":43205," ઇ":661," અ":6143,"આંગ":703,"્ચિ":11645,"્ટ્":549,"ોતર":377,"ોદર":1858,"ોનગ":236,"ોટા":473,"ોટી":225,"ોડા":794,"આઠ ":685,"ોની":800,"ોનો":2167,"ોરી":514,"ોળી":244,"ોલી":442,"ંવત":748,"ંબા":470,"ંબુ":281,"ંમત":254,"ંઠા":1406,"ંડવ":256,"ંદુ":455,"્ધ ":712,"ંદો":312,"ંધી":504,"ંતર":406,"ંચા":1418,"ંચમ":1337,"ંગા":221,"્ર ":966,"ોકો":3591,"્ય ":7092,"ંગણ":713,"ંખે":303,"ંગર":288,"્ષ ":789,"્વ ":2508,"એવા":6093,"્ષન":1137,"્ષિ":1509,"્વા":430,"્વે":772,"્વન":6820,"્વર":251,"્યન":12109,"્યત":720,"્યમ":432,"્યપ":428,"્યવ":2272,"્યા":2476,"્યુ":248,"્મદ":563,"્મા":375,"્લો":1068,"્લા":13052,"્રો":432,"્રે":602,"્રમ":649,"્રદ":871,"્રા":2175,"્રિ":476,"્રી":593,"્રહ":375,"્દ્":258,"્થા":288,"્તા":298,"્તી":856,"્તર":2535,"એક 
":15869,"ઉદે":246,"ઉપલ":606,"ઉપર":392,"ઉત્":2557,"ઉમર":329,"િત":853,"િણ":1494,"વિજ":359,"ીં":343,"િમ":11976,"િપ":550,"િન":1570,"વાય":500,"િવ":4622,"વાર":525,"િશ":322,"ીક":534,"વામ":999,"િલ":14752,"ીઓ":697,"િય":2671,"િર":803,"વાસ":1799,"ીજ":386,"િહ":232,"વિક":492,"િસ":578,"વાલ":357,"ીત":406,"ીદ":281,"ું":22062,"વાદ":862,"વાન":404,"ીય":2619,"ીમ":860,"ીન":8731,"વાડ":2612,"ીપ":459,"ુક":18441,"ીવ":394,"ુખ":4304,"ીર":426,"ીલ":229,"વાગ":247,"વાઘ":254,"ીસ":510,"ુચ":222,"ુજ":13015,"ાં":48849,"ાઉ":298,"ાઇ":474,"ાક":921,"ાઓ":6449,"ાઘ":283,"ાખ":279,"ાગ":19994,"ાજ":13842,"ાચ":245,"ાટ":1064,"ાડ":4936,"િં":1255,"ાણ":2500,"ાથ":1359,"ાત":14028,"ાદ":2250,"ાન":14000,"ાપ":2091,"ાબ":1765,"ામ":34603,"ાય":4603,"ાર":20818,"ાલ":24953,"ાળ":1774,"વિર":286,"િક":2870,"ાવ":3657,"ાષ":849,"ાસ":4564,"ાહ":1263,"િજ":517,"વિસ":266,"હત":7478,"સી":2165,"સુ":1962,"સે":1528,"સા":8757,"સિ":770,"હવ":567,"સો":764,"હર":240,"સ્":3025,"સૌ":277,"હુ":529,"સા ":522,"હે":3680,"હા":2824,"હિ":2043,"હી":349,"હો":1278,"હ્":385,"શ્":12458,"ષન":1142,"સગ":988,"સર 
":276,"શહ":485,"શિ":374,"શા":1696,"શુ":2253,"શી":262,"સં":2001,"ષ્":864,"સમ":828,"સન":417,"સવ":569,"સર":997,"ષા":458,"સદ":328,"સત":251,"સણ":439,"ષિ":1579,"વદ":549,"વન":7118,"વર":2659,"વલ":1055,"શક":498,"વગ":256,"વસા":2722,"વડ":3280,"વત":1294,"વણ":314,"વે":36512,"શન":12531,"વૈ":520,"શમ":627,"વ્":2839,"વસ":7405,"વી":2081,"વિ":2302,"વા":17902,"લો":6663,"લ્":14395,"લે":641,"લા":40018,"લિ":942,"લી":2736,"લુ":28591,"લસ":736,"લવ":315,"વસ્":826,"વસે":1125,"ળી":539,"વં":274,"ળા":1514,"રો":1975,"ર્":7275,"રુ":979,"રી":7304,"રૂ":908,"રે":1966,"રસ":718,"વાં":660,"રહ":1839,"રા":36128,"રિ":1193,"રવ":1135,"લબ":632,"લપ":735,"લય":580,"લન":2427,"લક":304,"લગ":215,"રક":1797,"રગ":229,"રખ":218,"યવ":2310,"રજ":562,"યા":7096,"રડ":414,"યુ":510,"રણ":534,"રત":14232,"રથ":239,"રદ":1032,"શમા":566,"યે":847,"રન":646,"રપ":682,"યો":569,"રબ":221,"રમ":2425,"મર":703,"મમ":2823,"મલ":319,"મહ":10705,"રં":397,"મી":561,"યડ":242,"મુ":6197,"મા":44661,"મિ":1415,"યત":1490,"યપ":480,"યન":12830,"મે":837,"યમ":548,"મ્":803,"મો":2623,"બ્":1122,"ભર":899,"મજ":2296,"મગ":222,"મખ":281,"મણ":336,"મત":581,"મથ":816,"ભા":32622,"ભિ":388,"મપ":589,"ભો":320,"મદ":1417,"મધ":3415,"મન":2890,"બર":1834,"બહ":293,"મં":287,"બી":424,"બુ":465,"બા":2768,"બિ":244,"બો":581,"બે":417,"પો":705,"પ્":4053,"બન":257,"પલ":826,"પહ":227,"પશ":13852,"પર":1594,"પૂ":2675,"પૈ":6312,"પે":306,"પુ":5227,"પી":742,"પિ":245,"પા":6287,"ન્":1623,"નો":5861,"પણ":494,"પત":282,"પટ":291,"પડ":321,"પછ":431,"નવ":1459,"નર":658,"ધ્":3586,"ધો":283,"નપ":579,"નન":237,"ને":4269,"નુ":10812,"પં":2789,"ની":3990,"નિ":691,"ના":45180,"નસ":606,"ધા":1073,"ધુ":315,"ધી":641,"દે":14545,"ધન":837,"દ્":1191,"દો":459,"ધર":916,"સી 
":1244,"નગ":2570,"દશ":215,"દસ":731,"દહ":218,"દા":3639,"દિ":5058,"દી":575,"દુ":1289,"દર":3345,"થવ":580,"વેલ":34942,"શના":12337,"થી":1041,"થા":882,"તો":531,"વૈદ":382,"તે":6014,"દક":1517,"થય":782,"ત્":12219,"થમ":1093,"થક":777,"તી":4432,"તુ":553,"તા":26724,"તિ":1123,"તન":712,"ણે":327,"તપ":511,"તર":3967,"તલ":356,"તમ":2718,"ણા":2366,"ણી":846,"ણવ":795,"તઘ":640,"ડો":2767,"ડુ":286,"ડે":1027,"ણં":571,"ડી":3535,"ડિ":410,"ડા":5615,"ડવ":598,"ડર":315,"ડભ":218,"ડબ":277,"ઠા":1831,"ટ્":673,"ટે":774,"વ્ય":2778,"ટિ":249,"સે ":1144,"ટી":761,"છે":23574,"જન":365,"છી":486,"ઝઘ":245,"જય":244,"છો":305,"જબ":1127,"જર":11973,"જે":2020,"જો":465,"જિ":13950,"જા":1399,"જુ":2555,"જી":794,"જ્":13119,"ઝર":271,"શુપ":2187,"ઝા":567,"સંવ":756,"ટક":225,"સંત":320,"સંખ":325,"ટા":840,"ટલ":348,"ગા":17980,"ગુ":12125,"ગિ":1160,"ઘડ":264,"ગી":427,"૯ ":627,"ગ્":891,"ગો":1143,"ઘર":948,"ઘો":548,"ચર":560,"ચમ":1383,"ચા":2321,"ચિ":12038,"ચી":259,"જં":215,"ચો":437,"ચ્":251,"જક":251,"શહે":470,"૫ ":1091,"કર":2446,"કમ":270,"કલ":513,"કપ":478,"ખં":251,"કડ":644,"ખલ":227,"ક્":3347,"કો":5466,"કે":1500,"૭ ":792,"કુ":6686,"કૃ":229,"કા":21625,"કી":6922,"કિ":350,"કહ":564,"કવ":583,"ગવ":1115,"ગલ":272,"ગર":2876,"ગમ":18397,"ખ્":4351,"૮ ":673,"ખે":6309,"ગન":575,"ગણ":1117,"ગઢ":779,"ખા":2071,"૧ ":1168,"શાળ":1055,"શાસ":252,"એવ":6158,"૨ ":320,"૩ ":1541,"૪ ":593,"ઓન":283,"એક":16184,"૦ ":810,"ઉદ":317,"ઉત":2595,"ઉપ":1216,"સગવ":904,"ઉમ":378,"આં":1102,"અગ":1187,"અં":436,"ઇડ":287,"ષના":751,"આહ":296,"ષનો":362,"આવ":35199,"આદ":1695,"આઠ":804,"આણ":440,"અમ":826,"અર":256,"અન":2071,"ંવ":912,"ંસ":486,"ંત":1429,"ંથ":310,"ંદ":2195,"ંધ":939,"ંબ":1038,"ંભ":281,"ંમ":287,"ંક":540,"શ્ચ":11655,"ંગ":2855,"ંખ":377,"ંચ":4016,"ંજ":464,"ંટ":564,"ંડ":982,"ંઠ":1433,"હે 
":1152,"શ્ર":320,"શ્વ":348,"૧૩":1467,"૧૧":1091,"૧૯":283,"૧૦":694,"વડો":2345,"ોટ":1277,"ોડ":1517,"ોજ":373,"વલી":267,"ોન":3584,"ોધ":322,"ોત":616,"ોદ":3809,"ોગ":284,"ોક":3826,"ોઇ":523,"્ટ":1147,"્ત":4385,"્ણ":325,"્દ":602,"્થ":651,"્ધ":1001,"્પ":336,"્બ":262,"વર્":2136,"્ક":609,"્ગ":267,"્ચ":11827,"ોમ":362,"ોલ":1612,"ોય":246,"ોર":2045,"ોવ":257,"વલસ":582,"ોળ":645,"્સ":302,"્ષ":3855,"્વ":11263,"્લ":14321,"્ર":8621,"્ય":26212,"્મ":1641,"ૂર":2771,"ુદ":699,"ુધ":815,"ુન":510,"ુણ":524,"ુત":213,"ુમ":369,"ુર":8763,"ુપ":2377,"ુવ":804,"ુસ":354,"ુલ":6351,"ૂચ":717,"વનો":376,"વનુ":6376,"ૃત":300,"ેક":249,"ેત":5179,"ેડ":2071,"ેટ":618,"ેઠ":219,"ેજ":326,"ેગ":653,"ેઘ":271,"ષા ":222,"વતા":228,"ેર":3345,"ેલ":37195,"ૈક":6315,"ેશ":13804,"ેવ":2024,"ેન":1094,"ેપ":540,"ેમ":3602,"ૈદ":389,"ેસ":990,"હિં":776,"હાલ":1485,"હાર":571,"હિન":919,"ઇ ":1018,"આ ":2702,"ાં ":41350,"ઓ ":7041,"હેર":525,"હેલ":239,"હેવ":536,"હેસ":730,"એ ":663,"ાઇ ":270,"હોદ":867,"હ્મ":326,"ાઓ ":6349,"ં ":62940,"ાગ ":214,"ાડ ":990,"ાદ ":1109,"ાણ ":231,"ાત ":12086,"ાન ":979,"ામ ":12051,"ાલ ":1653,"ાર ":2510,"ાય ":3046,"ાવ ":357,"િક ":1785,"ાસ ":1162,"ષિણ":1462,"ે ":34827,"ો ":11870,"સણા":264,"ષ ":917,"સ ":4627,"સમો":292,"વ ":3614,"શ ":598,"સરા":281,"ષ્ટ":602,"િ ":698,"ુ ":1292,"ી ":24520,"ા ":103799,"સવા":466,"સુર":1060,"દ ":4991,"થ ":242,"સીઓ":544,"ન ":4557,"સુદ":463,"ધ ":850,"સાડ":665,"સાત":801,"સાણ":973,"સાગ":213,"પ ":563,"સાય":2235,"સામ":236,"સાર":662,"સાવ":281,"સાબ":1406,"બ ":1292,"મ ":27791,"સોન":293,"ર ":16908,"ય ":11483,"લ ":10111,"હતા":268,"હત્":6808,"ળ ":723,"ક ":19636,"ગ ":1433,"સ્વ":268,"સ્થ":430,"ચ ":2010,"સ્ટ":235,"સ્ત":1316,"સ્ક":359,"જ ":4325,"ટ ":1263,"ડ ":2185,"ઠ ":958,"ઢ ":645,"હવે":238,"ણ ":3405,"હવા":327,"ત ":27700,"ૂચ ":710,"િત્":251,"ાસા":294,"ાસિ":270,"ાસી":1732,"ાહો":869,"ાષા":325,"ાસણ":337,"ુલ ":6018,"ાસર":237,"ાષ્":514,"ાલન":2232,"ાલપ":364,"ાલય":562,"ંગ ":904,"ારે":564,"ાર્":428,"ારો":350,"ારી":908,"ારા":1301,"ારત":13028,"ારમ":342,"ારડ":261,"ાયત":689,"ાયડ":229,"ામા":16128,"ુર 
":3220,"ાવી":500,"ાવા":1445,"ાવલ":274,"િકે":222,"િક્":471,"ંચ ":1075,"ાવત":219,"ાળા":1219,"ાલો":667,"ાલુ":18139,"ાલી":427,"ાલિ":223,"ાલા":242,"ાનો":1081,"ંટ ":317,"ાનપ":429,"ાના":6337,"ાનુ":3493,"ાની":787,"ંજ ":235,"ાદર":601,"ામપ":347,"ંત ":433,"ામન":2336,"ામમ":2770,"ાબર":1409,"ાપ્":387,"ુદ ":457,"ાપી":523,"ાપુ":503,"ાપા":321,"ંદ ":612,"ીદા":229,"ીનગ":468,"ીના":6429,"ીને":1070,"ીની":270,"ીનો":264,"િસ્":315,"િલ્":13910,"િલો":366,"િવસ":2730,"િવા":1650,"િનો":226,"િના":989,"િયા":2403,"ાંટ":436,"ાંઠ":1422,"ાંડ":461,"ાંગ":1094,"ાંચ":1295,"ાંત":590,"ાંધ":547,"ાંદ":419,"ાંસ":263,"ીઓ ":502,"િમ ":11653,"િપ ":367,"િત ":360,"િણ ":1447,"ાણા":1231,"ાણી":404,"ાતી":550,"ાત્":264,"ાથમ":992,"ાતે":354,"ાટી":295,"ાટે":280,"િંમ":251,"િંદ":527,"ાડી":1596,"ાડા":2060,"ાજક":237,"ાજી":257,"ાજ્":12491,"ીય ":442,"ાકી":371,"ાગમ":18270,"ાગન":475,"ાઉદ":238,"ું ":21442,"ૂર્":2560,"ેટ ":236,"ુણા":437,"ુજબ":1107,"ુજર":11840,"ુપા":2208,"ુધન":587,"ીયન":389,"ીમા":379,"ીયા":1731,"ીમખ":262,"ીસમ":292,"ુકા":17641,"ુકો":529,"ુખ્":4233,"ુરી":2300,"ુરુ":440,"ુરા":1358,"ુરત":829,"ુવા":696,"તઘર":640,"ણવા":740,"દસ ":512,"ણાવ":406,"દા ":938,"તપુ":465,"તનગ":252,"દી ":294,"તના":216,"દુ ":479,"તમજ":2184,"તમા":321,"તરી":269,"તરા":350,"તો ":423,"થા ":316,"થી ":996,"નવ ":475,"થવા":574,"ના ":41544,"ને ":3899,"ની ":3744,"નો ":5537,"દરા":2352,"દરમ":423,"તું":277,"તાલ":18057,"તાર":306,"તાપ":566,"તાન":302,"ધા ":271,"તેમ":3056,"તેર":1418,"તેન":315,"દક્":1484,"ત્ત":2727,"થમિ":987,"ત્વ":7552,"ત્ય":281,"ત્ર":1452,"થયે":642,"નપુ":541,"પી ":564,"ધીન":468,"ધાર":249,"ધાન":290,"નવસ":483,"નસવ":370,"ધ્ય":3481,"નર્":560,"દુધ":605,"દેપ":250,"દેશ":13431,"દેવ":535,"ધની":595,"દાવ":797,"દિક":394,"દાર":330,"દાદ":262,"દિવ":4303,"દાહ":863,"નગર":2214,"નગઢ":247,"પર ":286,"પણ ":420,"દોદ":293,"દ્વ":331,"દ્ર":524,"ધરા":484,"બા 
":381,"પટે":233,"પાવ":384,"પાર":273,"પાલ":2355,"પાટ":403,"પાડ":735,"પાં":1274,"પશ્":11653,"પશુ":2197,"પલબ":596,"પરા":655,"પંચ":2575,"નું":10559,"નાં":923,"નાર":240,"નામ":306,"નાન":1138,"પછી":428,"ન્ય":559,"ન્દ":395,"બહુ":235,"બાક":373,"બાર":800,"બાય":217,"રજ ":269,"મા ":715,"મી ":253,"યડ ":217,"યન ":455,"બરક":1386,"મો ":1343,"પૂર":2616,"પુર":4959,"પૈક":6303," આ ":2656,"પોર":340," એ ":485,"પ્ર":3459,"પ્ય":389,"માં":41157,"માટ":294,"માન":402,"માણ":276,"માત":359,"માલ":325,"માર":263,"મિક":1063,"મહત":6802,"મહા":1848,"મહિ":850,"મહુ":233,"મહે":917,"યત્":701,"મેઘ":271,"મુખ":4261,"મુજ":1125,"મુવ":353,"યતઘ":640,"મપુ":552,"રે ":767,"મમા":2808,"મધ્":3365,"મદા":1324,"રી ":4562,"મના":2419,"રો ":232,"મજુ":2185,"મખે":264,"રા ":5544,"મતન":247,"મથક":765,"ભિલ":318,"ભાર":12956,"ભાગ":18607,"ભાષ":335,"બોર":287,"રત ":13401,"યા ":2563,"રમ ":485,"યો ":254,"ભરૂ":710,"બ્ર":421,"બ્ધ":597,"ળા ":1138,"રેગ":384,"ળી ":385,"રોત":365,"રાં":597,"રાય":238,"રામ":591,"રાવ":421,"રિક":325,"રાષ":510,"રાડ":230,"રાણ":287,"રાત":11870,"રાથ":991,"રાપ":541,"રાજ":13365,"રીય":783,"રીન":986,"રું":396,"રીક":260,"રિય":414,"રૂચ":710,"રવા":911,"રહવ":219,"રહે":1153,"રહ્":327,"રપુ":361,"રમ્":382,"રમા":804,"લો ":1417,"લા ":25318,"રના":235,"યેલ":764,"લી ":1812,"રદે":816,"રડી":227,"યાલ":588,"યાર":1934,"યાન":576,"યાપ":305,"રકા":1582,"લય ":553,"યવસ":2216,"યપૂ":410,"મોડ":217,"મોટ":454,"યનો":326,"યના":11717,"મ્ય":415,"યમા":407,"લન ":2219,"મ્બ":232,"લ્લ":14131,"વે ":1140,"લોલ":465,"લોડ":396,"લોદ":449,"લોક":3642,"વા ":7780,"વી ":1602,"લુક":18002,"લીમ":321,"લુણ":389,"લિય":312,"લું":9913,"વસ ":2535,"લાન":6932,"લિપ":369,"લાસ":286,"લાવ":223,"લાલ":307,"લામ":5764,"લસા":677,"શક ":371,"વર ":236,"લબ્":597,"વદ ":505,"લપુ":412,"વત ":759,"વડ ":417,"ર્ષ":1963,"ર્વ":2626,"ર્ય":411,"ર્મ":900,"કી ":503,"કા ":605,"કે ":834,"કો ":1354," ૧૦":680,"૧૦ ":535,"૧૧ ":1036,"૧૩ ":1412,"ગઢ ":538," ૧૩":1456," ૧૧":1079," ૧૯":274,"ગર ":1959,"કડી":283,"કડા":244,"કરવ":314,"કરી":1162,"કવા":445," હો":303," હિ":944," હા":540," સો":475," સૌ":273," સ્":714," 
સિ":322," સા":3562," સુ":1717," હત":654," સમ":395," સર":370," સત":226,"કાલ":238,"કામ":9196,"કાર":676,"કીન":6304,"કુલ":6004," લુ":420," લી":386," લિ":414," લા":253,"કહે":527," લો":3728,"કાં":1539," રહ":1203," રા":13805,"કાન":2517,"કાઓ":6055," સં":1919," શા":1268," શિ":264," શહ":475," સગ":922,"ઘર ":662," શ્":290," વા":1551," વિ":1924," વસ":1926," વ્":2581," વૈ":515," વે":236," વડ":2142," વર":2148," શક":474," વલ":624," વદ":498," પછ":431," પટ":261," પણ":391," નો":313," પા":2926," પુ":759," પૂ":2113," પૈ":6312," પર":651," પશ":13847," પહ":226," ધા":386," દ્":305," દે":13156," નગ":894," ધર":567," ના":1491," નિ":408," નસ":380," ને":386," પં":2780," ધો":248," નવ":1299," નર":628," મધ":3389," ભિ":335," ભા":32144," મથ":749," ભર":862," મે":521," મો":1041," મા":2519," મુ":5993," મહ":9472," પ્":3289," પો":348," બો":476," બે":295," મં":217," બી":247," બા":1559," બહ":292," ડા":372,"કોન":2841," ડે":860,"કોળ":215,"કોટ":356,"ક્ષ":1887,"ક્ર":890," ત્":349," થય":778," દક":1478," તે":5306," થવ":379," દર":551," દુ":722," દિ":2858," દા":1294,"કેટ":221," દસ":704," તર":348," તિ":233," તા":24717," ૯ ":497," ગો":553," ગ્":552," ગુ":12029,"ખેડ":1595,"ખેત":4414," ગા":16841,"ગના":399," ૮ ":570," ખે":5659," ખા":1675," ગણ":237," ચર":392," ઘો":266," છો":266,"ગણવ":693," ઝઘ":243," છે":23548," ચો":383," જં":214," ચા":714," ચિ":300," ઝા":412," જ્":480," જુ":259," જા":631," જિ":13843," જો":332," જે":1921,"ખાસ":931," એવ":6158,"ખાન":217," એક":16184,"ખાત":404," ૫ ":907," ૪ ":431," કહ":562," કવ":319," કુ":6308," કા":1279," કો":831," ૭ ":640," કે":929," ક્":383," કડ":504," કપ":373," કલ":240," કર":2121," ઇડ":271," આહ":296," આવ":35196," આદ":1582," ઉત":2589," ઉપ":1212," ઉમ":372,"ગવડ":906,"ગરહ":219,"ગમા":18349,"ખ્ય":4323," અં":432," અગ":1187," આં":1036," અન":2069," અર":252," અમ":825," આઠ":804," આણ":440,"ગાં":635,"ગામ":16798,"ગિય":1057,"ગુજ":11824,"ઘડી":254,"ગોર":435,"ગોધ":222,"ગ્ર":743,"ઘરજ":242,"છી ":438,"ઘોડ":335," જ ":2898,"છે ":23415,"જબ ":1105,"ચરો":362,"ચાર":560,"ચાય":582,"ચિમ":11652,"ચાં":471,"ચાગ":369,"જી 
":285,"જા ":331,"ચમહ":1210,"જે ":397,"જકો":214,"ઝઘડ":245,"જિલ":13824,"જુર":2204,"છોટ":245,"જરા":11822,"ઝાલ":270,"ઠા ":1548,"ટી ":375,"ટા ":336,"જેવ":714,"જેત":374,"જ્ય":12917,"ટે ":225,"ડી ":2334,"ડા ":4375,"ડર ":279,"ટેલ":254,"ટાઉ":239,"ડો ":637,"ણી ":405,"ણા ":1630,"ટ્ર":578,"ડેર":603,"ડેડ":232,"તી ":3871,"ડોદ":1844,"તે ":714,"ડિય":272,"ડાસ":222,"ણંદ":569,"ડીય":740,"તિ ":370,"તા ":7106,"તર ":2912,"થક ":760,"ડાં":355,"ડબ્":254,"ણે ":229},"n_words":[2118540,2468202,1874859],"name":"gu"} -------------------------------------------------------------------------------- /regex_profiles/default/portuguese.conf: -------------------------------------------------------------------------------- 1 | [META] 2 | # Portuguese default config. 3 | 4 | # Comma delimited list of language codes associated with this language profile. 5 | # The script will run against all sub-labels like ":forced" as long as they match the language code. 6 | # leave empty to apply to all language codes. 7 | language_codes = pt, por, portuguese 8 | 9 | 10 | # Information about how to configure the REGEX sections, read at the bottom of the file. 11 | # All regexes are case insensitive! 12 | [WARNING_REGEX] 13 | 14 | ### Some Keywords for Translating, Subtitling, Sync, etc... 
15 | pt_warn1: \b(Legend(a|e|ado|ar|as)?|(Res)Sincroni(a|zada|zado|zação|zações)(s)?|Tradu(za|zir|zido|zida|ir|ção|ções)(s)?)\b 16 | pt_warn2: \b(Rip(ada|ado|ped)(s)?|Corrig(ida|ido)(s)?|Corre(ção|ções)|Re(s|ss)ync|Revi(sar|sada|sado|são|sões)(s)?)\b 17 | 18 | ### Usual Keywords and Phrases 19 | pt_warn3: \b(CREIA EM DEUS SEMPRE|DESCOBRIDOR DE PLUTÃO|O FUTURO É AGORA|Junte-se a nós|QUER SE JUNTAR A NÓS|Visitem o site para mais informações)\b 20 | pt_warn4: \b(DIGA NÃO À CENSURA|TV NUNCA MAIS|FILME JÁ|POR UM MONTE DE NERD DOIDO|POR DOIS VELHOS GAGÁS|A p r e s e n t|Dica para download)\b 21 | pt_warn5: \b(anos fazendo Arte para você|Agradecimento(s))\b 22 | pt_warn6: \b(Siga nosso perfil|Siga-nos (no twitter|nas redes sociais))\b 23 | pt_warn7: \b(Qualidade é InSUBstituível|Quality is Everything|Quer legendar co(nosco|m a gente)|Quer legendas)\b 24 | pt_warn8: \b(Batmans|bielo|Bozano|Gamaia|Cassão|Chei|chicon|Chucky|CHULOS|Coco de Rato|Danielly|Darks|DarkSide|Darrow|Darwina|Davros|Deluxe|Duda)\b 25 | pt_warn9: \b(Enjoy|Team|Esmera|Fahrenhheit|GeekS|Ghost|Guerra_|Hirschen|Honoré|InSanos|InSUBs|JouJou|JVM|KiKo)\b 26 | pt_warn10: \b(Lalinha|LariS|League of Legends|Leonessa|Leooni|locke|Lunardelli|ManiacS|Marines|Marvetes|Mullr|Murrice|MUSKETEERS|NaNNa|Nava|NEXUS-(6|9)|Nova Prime)\b 27 | pt_warn11: \b(Optimus|Otoni|Patronnus|Patyy|Pirandello|Pirandelo|Pix|PT-BR|PT-Subs|Pumari|Rainbow|Reaper|Release|Renatinha|Renegados|Rezinha|Rouge)\b 28 | pt_warn12: \b(Salomao|Sardinha|Satsuki|Takehara|Tati|thaais|Vahainen|Vahainen²|wallop|Will Graham|Wuornos|Yang|Zeh)\b 29 | pt_warn13: \b(Episódio|ENGLISH|MKV|UNITED|XEROX|Deluxe|Facebook|Instagram|Twitter|PT.BR|Whatsapp|Tiktok|MARVEL STUDIOS)\b 30 | pt_warn14: \b(HBO((| )Max|GO)|Apple(| )TV|Disney+|Disney(| )Plus)\b 31 | 32 | #pt_warn#: Regex goes here. 
33 | 34 | 35 | [PURGE_REGEX] 36 | 37 | ### Temporada XX Episodio XX 38 | pt_purge0: \bt(emporada)?\W*\d+[^,]\W*e(pis(o|ó)dio)?\W*\d+[^,] 39 | 40 | ### Subtitler Nicknames / Membros de grupos de legenda 41 | pt_purge1: \b(0tavi0|1N73RC3P70R|3runo|@ndré Roch@) 42 | pt_purge2: \b(A.Valim|AdctdGrl|adrianrkt|Adrih87|afi25|Ahenius|AirtonSub|akitemostudo|AlanCristianoBr|AlbanioFPC|Albergi|AlbustigriS|alcobor|Alexandre(MT|Metal)) 43 | pt_purge3: \b(AlexMagno|AlineMarin|Alphankh|(Á|A)lvaroEJ|alxmota|Amand@|Anap9|anap²|anchorboy|Andrebavila|anoXmous|ARDiLOZO|AriadinaPrates|Artaquilus) 44 | pt_purge4: \b(arthurdenner|Artrixzera|arturfreire|Atchiman|athomas|Atlantes Eddy|Audio8|AugustCr|Austhra|azamba89) 45 | pt_purge5: \b(Baco Dionisio|bacontarin|BadWolf|Bakugan|batman.inc(00)|Baudrillard|BBorges|Bello_Brasil|Ben Reilly|Ben197|Bereuza|BethRockefeller|bgarland|Biamussolin|BigTasty) 46 | pt_purge6: \b(BINHOCV|BITCH|BLClaudio|BLuk|Bobdvd|borbabarba|bozxphd|BrandonMotif|brayanatsix|BRENYS|BruFeiden|brunastark|Brunnen-G|BrunoLoko|Brunowsk|Btarth|btsix|Buckley97) 47 | pt_purge7: \b(CacauDias|caconti|Cacstim46|Caio Kameda|Caio(15|albanezi|l|ski)|Capejuna|cbsgrillo|Celow|celso(drx|jp)|Cesart|cezarrezzo|CHaandde|Chacalbhz|ChaosCosmico) 48 | pt_purge8: \b(chereguedel|ChronoAlvein|CiCiNHA|cinefala|Clebertsf|Cond(e)Vla(d)|coriango4|Cotter CS|cricknick|crisvs|Cross65|Cumby|curiango4|Cybervicious|Cynthiam|c_wolff) 49 | pt_purge9: \b(D3QU1NH4|D4VR0S|D@nipbr|DanDee|danidc|DanielG|daninegredo|dani_nemo|DarkEagle|Darkway|DebCarda|Deberle|DedaGlima|deGroote|deiaoliveira|Denarians|derson78|DiabboVerdde|DianaP) 50 | pt_purge10: \b(Dicaoli|dinho1903|Diogodine|Diogo.vix|diogo(dasilva|matos)|Dolinsky|Doris_The_Man|Dougsan|DrCaio|DreamMetal|dreeh|Drope|DSergio|dtlagreca|DudSS|duh_sobieski|Durenkian|Dyxtendent) 51 | pt_purge11: \b(e.gomide|Eagle_1984BR|edmadness|EduCLJ|EIWoOdBIUeS|Elderfel|ElFrijole|eliasyss|elsubtitle|ElWoOdBlUeS|ericarockcity|ErosCohen|eryckcampista|Eryx|explosiveskull) 52 | 
pt_purge12: \b(fagmiranda|Fanuelbenne|Farnezi|Fefavrin|Felipemaximus|FellipeMarcel|Fernandoleao|Fe_Fratta|Finovsk|fish_n_chips) 53 | pt_purge13: \b(FLeCha³|Flechudo|Floomers|FormigosaJr|fotojrFoxxy|fox_sts|Fr0g|frankensubber|fscolari|Fulanapster|FxJeloka) 54 | pt_purge14: \b(GabeOKane|Gaboro|gabriel3color|GabyReis|galaksoda|Galassio|gameonbels|GBelds|General GeeK|Gerigato|germanabh) 55 | pt_purge15: \b(GFaria|GGoedert|GiRoberta|gkarnikow|GoianoDoido|gorecorpsed|GPMaus|Grego²|GuiZahn|gusss|gusx|Gybiru) 56 | pt_purge16: \b(HaloSouza|Hatter|Helder1965|hell_ena|Honoré legendou|Huoo) 57 | pt_purge17: \b(IagoM|IceBreaker|imaycon|imdavros|Insane Metal|IrioMk|Ironnerd|IsaacA|IsaMF|Ivandrofly|IvanHalen|Ivanz|Ivekiø|i_ravena) 58 | pt_purge18: \b(JadalSarduu|JAIGDeTITLES|JaspCardoso|JBarra_|Jehhuty|JennyB|JessyBrug|jfbruna|JhéFranchetti|Jluizsd|John2nitro|John93) 59 | pt_purge19: \b(JohnnyBoy|José Cesamildo|JotaKretli|ju.Alves|JuliusMarques|Juli_Ca|JuMascarenhas|Junio_Tk2|Just4Fun|jvFlores|JVMRL) 60 | pt_purge20: \b(KahGarcia|KahX|Kakko|Kalash|KarolusM|karynasb|KaylaSRP|Kayronrdm|Kcyre|kDragon|KenziG|Kesya_Lele|KetchSketch|KiLL3R|kindtwin|KnaveofHearts|Koelax|Konsquildo|Kuantou) 61 | pt_purge21: \b(L3MOS|Lady(.)Devon|LagerthaL|laiiss|LaisRosas|LAPUMiA|LayAires|LayHolmes|laylamot|ldegroote|Lecko_alx|LEECHER05|leeht|Legionario13|Leifáklärd) 62 | pt_purge22: \b(Lekaakel|LelaBastos|leo191|leojiu|leorutodb|Letirreis|LexJT|LeZzZaDo|LFeitosa|lhenrique|Li4rs|LikaPoetisa|Liporage|lLeandro|lletaif|lostlocke) 63 | pt_purge23: \b(LqRner|Lu (Colorada|Stoker)|luanmarzulo|LucasFB|LucaSkywalker|LucasScript|lucasvsriveiro|LucyLo|LuizSK|LukeWhosoever|luscafusca|l_lost) 64 | pt_purge24: \b(M.Esquivel|M4rzulo|macedo540|macflii|MadGirl|madhater|Mad Titan|MaKTaiL|MaLorencini|Malucat|Marcio_br|Marck93|MarcRip|mari.luz|MarianaR|Marinhojmc|MariTMS|MARK-ONE) 65 | pt_purge25: 
\b(marmotadebermudas|MasterHit|Mastther|MatheusBozetti|MatheusM|Matvix|mawricio58|Maxikd|MaximoPoder|MayAC|mayared|mazepo|mcaio|mctosco123|meggie40|mellodemenezes|Mhaser) 66 | pt_purge26: \b(MilleG(.)|MiltonGGJ|MissBia|MissG|Miss_Foster|MisterNauta|MiTaHD|MitanidaniJP|mmachado7|Monteiroide|Monybelle|Morbeck|Moviehash|MrRamonster|Mrs.CaT|mychael.ds) 67 | pt_purge27: \b(N.Honda|n0Te|Nandus|Nati_nina|NatLittleHand|NatSol|nattyck|NayCielo²|Nbkiller|NetLion|NGed|Nightcrawler|NikaBrasil|Noirgof|NoriegaRJ|NoSpoiler|NoT-XoR|nuganath|nytubi) 68 | pt_purge28: \b(OmiMau) 69 | pt_purge29: \b(pablo.cesar.90813|Padfoot|Pampbs|Paniago|Paranhosgomes|patinatiluft|PaulaCrespo|paulinhaM|paulostriker|Pedrorms|PedroSPJ|Peposo|Petrogui|Pichocho|Pinguim(.)SP) 70 | pt_purge30: \b(Pirata-Tuga|Pitombeira|Pointless|Ponomarenko|PowerPlay|Predator_Alpha|primoeerie|PsychoBrasco|PsycoWave|Psyhead|Pt-Mighters|puraserena|Purpleness) 71 | pt_purge31: \b(R.Zen|Rachmaninoff|RadTail|rafa1504|Rafael UPD|RafaMontagner|Ranko|ratcicle360|RCuestas|Re Guedes|Recov2.0|recrutacreepy|RedSoldierBR|RedTail|renatamm) 72 | pt_purge32: \b(RenatoCochrane|Reptarop|rezimm|rhuannalves|Rhuanpci|RicardoMica|richlips|rickSG|rmasaranha|rMonta|robfilho|RocketJao|Rodrigo880414|Rominho|RSQuint|rubenfmsilva|rushe) 73 | pt_purge33: \b(Saaresto|samhk222|Samuholmes|Sarabp|Saylorman|ScarNeedle|SenpaiBaka|Shockey|SilneiS|Sk@llTow|skoad|skøad|skØad|SlipknotPE|SLRipsPT|Snoopysoft|SOFTITLER) 74 | pt_purge34: \b(SongMade|Sonic( |)2099|SpoiledCat22|Sr(.)( )Loko|Stark²|StarManiacO|Studzes|sub.Trader|Subsfreak|subXpacio|SuB_VersioN|super_zed) 75 | pt_purge35: \b(Tati( )Saaresto|Tati_89|Tchodz|Tecsamp|tellos0|ThaySoul|The H@tter|The Pilgrim|Thedao|The_Tozz|Thiago Legionário|ThiagoW|ThuNderSubs|ToBe_AFM|Tranceman|trancero_ssa|Trecker1963|TuGAZx) 76 | pt_purge36: \b(UliPetit) 77 | pt_purge37: 
\b(valuuh|vanagamer|Vansgomes|Vegafloyd|VHanelli|VicodinTrip|victorcruel|Vikingbyheart|VIKT0R|vikyor1|Vinilator|ViniTimm|virtualnet|vitckari|Vitørr|vivisilusion|Voitek_|V¡¢¡öµ§|Vódinha) 78 | pt_purge38: \b(wal_ny|WesleyP|whataisa|willian_as|willy_br|WISHMAKER|Witchdoctor|wkiane|wribeiro) 79 | pt_purge39: \b(XandeAlves|xaplef|Xlima2003|YsoseriousM|Yuca|Yuca²|yuki_briza|Y_Lima|ZeitG3eist|Zetnos|ZeusRevoLTs|zicadora²|Zinho_1976|ZORAXbr) 80 | pt_purge40: \b(©yßë® V¡¢¡öµ§|©yßë®V¡¢¡öµ§) 81 | 82 | # Instagram/Twitter @ Profiles 83 | pt_purge41: \b(@)(avelarneco|b99bra|b99noice|b99noicesmort|briedanversx|cezarrezzo|citeiperalta|ddharis|detailsamberg|diogomatos_|drcaio|dres|helder1965(.)|iarasantos97)\b 84 | pt_purge42: \b(@)(IdiotasI|imaycon|JBarra_|jluizsd|Julhynha|K_G_B_Dublados|Lecko_alx|lm.samara|paimspring|renatamm|scherbatksy|silneisoaress|Turmagumela|Vahainen|whaIIow|__Fagundes)\b 85 | 86 | ### Additional small nicknames with trailing check, to avoid false positives 87 | pt_purge43: \b(Dres|Dres²|exande|FerM|Fél|GoDo|Ick|JesKa|Jubler|KBLO|LauraA|LeBraz|LeilaC|Lub's|LuFer|Mabu|mands|Repta|Tozz|Tozzi|VUno)\b 88 | 89 | # Known Portuguese translators' names, professional or not 90 | pt_purge44: \b(Andre Esteves|ADRIANO PEDROSO|Alan Carlos da Silva|Alysson Navarro|Ana Linhares|Carlos Eduardo Niemeyer Teixeira|Cecilia Bedin|Cinthia Alencar|Dilma Machado|Dina Almeida|Diogo José|DREI MARC|Eduardo Nakamura|Eduardo Penteado|Eric Raupp)\b 91 | pt_purge45: \b(Felipe Aguiar|Felipe Miranda|Filippe (Brandão|Vasconcellos)|Flávia Fusaro|Florinda Lopes|Gabriella Aly|Guilherme (Ferreira|Vasques)|Iara Regina Brazil|Iara Santos|Ibsertson Medeiros|Jairo de Paula|Juliana Gallo|Leandro Woyakoski|Lucas Perissê|Lúcia Leão|Marcela Almeida)\b 92 | pt_purge46: \b(Marina Baird|Marina Fragano Baird|Marisa Borgerth|Marya Bravo|Matheus Borba|Medeiros Rafael|Michael Lemos|Monika Pecegueiro do Amaral|Mário Menezes|Natasha Marques|Nicole Bracco|Paula Padilha|Paulo Frederico Costa)\b 93 | pt_purge47: 
\b(Paulo Frederico da Costa|Pedro Trindade|Pedro Verri|Priscilla Rother|Rafael Magiolino|Reinaldo S. Renzo|Renato Ximenes|Rita Macedo|Rodrigo Barros|Rodrigo Valois|Rodrigo Vieira|Rosana Cocink)\b 94 | pt_purge48: \b(Samuel Aiala|Selma Bertoncini|Sergio Cantu|Sylbeth Soriano|Sylvio Santiago Fortaleza|Tabita Carvalho|Thais Kitahara|Tiago Aquino|Valéria Egidio|Valmir Martins|Waldir Lopes|Walter Santos|Wilson Vieira)\b 95 | 96 | 97 | 98 | ### Subtitle Groups \ Equipes de legendas 99 | ### Instagram @ Profiles 100 | pt_purge50: \b(@)(aboutskins|AceSubsLegendas|CabronesTeam|ComicSubs|ConSubs|crimesubs|DarkLegendas|darklegenders|EnjoyTeam1|EquipeLi4rs|griotsteam)\b 101 | pt_purge51: \b(@)(inSanosTV|inSanosubs|InSUBs|lotsubs|ManiacSubs|NERDSubs|Queens_OfTheLab|renegados_subs|SuBMakerS|subsfiction|themarinesbr|UnitedTeam)\b 102 | 103 | pt_purge52: \b(4Elements|4ever.tv|aboutskinsbrasil|AceSubs|Alvinos Brasil|ANP® Rio|Art Subs|ArtSubs|Brooklyn 99 Brasil|BR_FILMES|Cabeças-de-Teia|cabronesteam|comicsubs|Companhia das Palavras|ConLegenders|CreepySubs|CrimeSubs)\b 104 | pt_purge53: \b(Dark Navy|Dark Squad|darklegendas|DarkLegenders|darksite|EnjoyTeam|EnjoyTeam(.)|Forom.com|gameofthronesbr|GeekSubs|GRIOTS|griotsteam|handmaidsbrasil|IdIoTaS.iNfErIoReS|IdiotasInferiores|inSanos.tv|inSanostv)\b 105 | pt_purge54: \b(inSanosubs|JDDigitalArt|Joldies Apresenta|legendas.tv|legendastv|legendasemserie|legendasfree|LegendeConosco|Legendei.com|Legenders|legseries|Li4rs|loschulosteam|lotsubs|ManiacSubs|ModerFokers|Máquina Tradutora Nacional)\b 106 | pt_purge55: \b(NoSpoiler|PT-Subs|P2MBRASIL|Perazza(.)|Queens Of The Lab|REALITYKINGS|RED SKY FILMES|Red WB Team|RedWheelBarrowTeam|Renegados Subs|RenegadosSubs|SceneLovers|ScoopVideos|SDI Media Group|SFSubs|SiNNERS|SKAsubs|SOSTeamLTV|SubMakers)\b 107 | pt_purge56: \b(SubsHeaven|SubsOTF|The Marines|the.marines|the.marinesbr|Time Pink|TimeLady|TusSeries|tvsubtitles|Underground(| )Subs|UNITED Team|united4ever|UnitedTeam|VIDEOLAR|Visiontext|Wonder(| 
)Subs|www.inSanos)\b 108 | 109 | ### OTHER KEYWORDS 110 | pt_purge57: \b(SRTEd|Subpack|subscene|UNRATED|DvDrip|Translation|EXTREME|BRAZILIAN|PORTUGUESE)\b 111 | 112 | 113 | # Common phrases / Used by Sub Groups 114 | pt_purge60: \b(Avalie esta legenda|Anuncie( | o )seu produto ou marca aqui|Apoiar-nos e tornar-se membro VIP|Contribua tornando-se um usuário VIP|remova todos os anúncios|Quality is Everything|Quality Is Everythig|Ajude outros usuários a escolher)\b 115 | pt_purge61: \b(Nerds Eager to Rock Doing Subtitles|Noobs fazendo Subs|TERRA DOS LATICÍNIOS|Making the Difference|MAKE A DIFFERENCE|KID BENGALA|Qualidade é InSUBstituível|Enjoy apresenta|sejaseupropriopastor|Sua melhor aposta em legendas|Your Last Hope)\b 116 | 117 | 118 | # Keywords that ends with colon ":" 119 | # Revis(ado|ada|ão) 120 | # Tradu(zido|zida|ção|ções) 121 | # Legend(a|as|ada|ado) 122 | # Sincroni(a|as|zada|zado|zação) 123 | # TODO: Add secondary word "por" before the colon ":" for example "Traduzido por:" 124 | # TODO: Add secondary word "de" before the colon ":" for example "Com Tradução de:" 125 | pt_purge71: \b(revis)(\S+\:) 126 | pt_purge72: \b(tradu)(\S+\:) 127 | pt_purge73: \b(legenda)(\S+\:) 128 | pt_purge74: \b(sincroni)(\S+\:) 129 | pt_purge75: \b(agradeciment)(\S+\:) 130 | #pt_purge76: \b(Episódio)(\S+\:) 131 | 132 | #pt_purge#: Regex goes here. 133 | 134 | 135 | # 136 | # -----------------------------------------GUIDE------------------------------------------------- 137 | # 138 | 139 | # This language profile contains two lists of regex that will look for patterns. 140 | # if you wish to modify or remove any regex, feel free to do so 141 | # but files in the default folder will be overwritten when you update the script. 142 | # You can add and remove keys as long as two keys don't use the same key twice. 143 | 144 | # WARNING_REGEX: 145 | # In the WARNING_REGEX section each individual match from each regex gives one warning to the subtitle block. 
146 | # Blocks also receive additional warnings if they are adjacent to other blocks that contain ads. 147 | # 1 warning is ignored 148 | # 2 warnings will be print the block as a WARNING in the log. 149 | # 3 warnings or more will remove the entire block. 150 | 151 | # PURGE_REGEX: 152 | # Any match against the regexes in the PURGE_REGEX section will remove the entire subtitle block. 153 | 154 | # Remember that regex symbols like \^$.|?*+([{ have special meaning in regex and if you want to test for the 155 | # literal character you'll need to escape it with '\' 156 | # for example: matching "www." would require a regex like: "www\." 157 | # you can test regexes online on an regex-tester tool like https://regex101.com/ 158 | 159 | # Feel free to ask me any question on github. 160 | -------------------------------------------------------------------------------- /libs/langdetect/profiles/so: -------------------------------------------------------------------------------- 1 | {"freq":{"YO ":13,"jec":34,"jee":32,"D":313,"E":183,"F":66,"G":214,"A":673,"B":249,"C":240,"L":152,"M":367,"N":163,"O":122,"H":180,"I":236,"J":129,"K":173,"U":82,"T":107,"W":226,"V":11,"Q":76,"P":22,"S":486,"R":114,"Y":96,"X":120,"Z":10,"f":458,"g":2154,"d":5233,"e":4497,"b":2102,"c":900,"a":24510,"n":3878,"o":5982,"l":3786,"m":2460,"j":397,"k":2897,"h":3132,"i":6615,"w":2306,"v":27,"u":3829,"t":1545,"s":2871,"r":2895,"q":718,"p":77,"z":23,"y":3607,"x":1698,"jaa":13,"jab":16,"jar":10,"jam":12,"Xam":11,"joo":14,"Xas":10,"jis":14,"jir":95,"jii":13,"jid":17,"jo ":15,"Far":12,"isk":69,"ism":12,"isl":25,"iso":22,"isu":42,"ist":67,"ita":17,"is ":71,"ion":20,"ir ":84,"irs":56,"irt":28,"iro":22,"irk":32,"iri":56,"isi":32,"ish":96,"ise":18,"isb":17,"Wux":23,"isa":134,"ire":16,"ira":131,"iyi":10,"iyo":394,"iya":423,"iye":65,"ixi":16," l":598," m":880,"kii":161," n":189," o":537," h":365," i":795," j":267," k":1328," d":1214," e":328," f":95," g":401," a":1317," b":593," c":361," y":296," x":283," u":599," 
t":376," w":1834," q":291," p":20," s":807," r":112,"km ":14," J":125," K":142," H":119," I":161," N":93," O":34," L":81," M":322," B":217,"khd":24," C":229,"kha":11," A":275," F":59," G":169," D":236," E":41," Z":10," Y":40," X":90," S":438," R":66," Q":69," P":18," W":211," U":33," T":83,"kee":20,"key":11,"kh ":38,"Web":10,"Waa":56,"ku ":434,"kor":15,"Wax":40,"koo":94,"War":17,"XEE":11,"مد":16,"Gal":22,"و":25,"ي":76,"ف":13,"ق":12,"ل":77,"م":62,"ن":31,"ه":13,"د":46,"ح":26,"ب":37,"ة":21,"ا":98,"أ":11,"ع":29,"ش":21,"س":23,"ر":49,"kar":49,"kas":30,"kan":49,"kal":143,"kam":32,"kad":48,"kac":14,"kab":10,"kaa":81,"ka ":1268,"A ":83," Ga":53," Ge":18,"Da":59,"DU":11,"Cu":18,"Co":13,"DE":11," Fi":17,"Ce":13,"DH":15,"Ci":23," Ha":35,"Du":13,"EY":13," Go":61," Gu":12,"EG":11,"De":45,"EE":45,"EL":14,"Di":29,"Dh":36,"H ":16,"GA":19,"Fa":23," IY":15,"Er":12," Ho":29,"ha ":334," Hi":37,"Ge":18," Ji":25,"Ga":53,"حم":18,"HA":35,"I ":13," Ja":63," KA":16," Is":32," It":29,"GM":12," In":35,"Fi":17,"ham":43,"han":102," Ka":28,"hal":48,"haw":17,"hax":44,"haq":58," Ki":19,"har":45,"has":76," Kh":10," Ju":19,"hah":12,"hab":77,"haa":189,"had":144,"hac":36,"AS":15,"AR":23," MA":17,"AX":27," La":22,"AY":15,"BA":11," Li":11,"C ":10,"AD":43,"AA":51,"AB":14,"AG":11," Ko":23,"AH":23,"hay":333,"AL":37," Ku":26,"AM":13,"AN":35," Ma":180,"Ax":18,"Ar":12,"D ":22,"بن":10," Mi":27,"Ba":101,"CA":15,"Af":65,"بد":10,"he ":25,"Aa":22,"Ab":33,"Ad":10,"Am":17," Lu":25,"Al":38," Ne":14,"Bu":30," Na":32,"Ca":127,"DA":43,"E ":30,"Bi":19,"Be":25,"hda":27,"Bo":30,"Hin":18," Mu":78,"hel":22,"Ku":26,"hee":112,"Ko":23,"hey":26,"hex":72,"Li":11,"N ":26,"her":11,"MA":41,"La":22,"Lu":25,"hi ":27,"Mi":27,"NK":10,"ال":51,"O ":34,"NA":12,"Ma":180,"Mu":79,"Ne":14,"Na":32," Am":16," Al":38,"Nu":16," Af":65,"No":12,"OO":18," Ad":10," Aa":22," Ab":33," Ba":101," CA":12," Ax":18," Ar":12,"hig":23," Be":25,"hid":12," Bi":19,"hin":40,"Go":61,"him":17,"Gu":12," Bo":30,"hii":170," Bu":30,"his":24,"hir":31,"Ha":35," 
Ca":127,"Hi":37," Ce":13," DE":10," Ci":23,"IN":12,"Ho":29," DH":13,"IS":10," Co":12," Cu":18,"IY":20," Da":59," Di":29," Dh":36,"In":36," De":45,"Is":32,"It":30,"Ja":63,"KA":33," Du":13,"Ji":25," Er":12,"Ju":19,"LA":35,"Ka":28,"Kh":10,"ho ":53,"Har":14,"Ki":19,"LE":16," Fa":23,"gma":64,"go ":32," Xi":13," Xa":51,"UU":11,"yuu":26," Wu":23,"To":11,"Th":10," Wi":15," We":12,"Ta":37," Wa":133,"St":13,"Su":23,"Wu":23,"gob":97,"Wi":16,"Wa":133,"XA":19,"We":12,"XE":12,"Y ":18,"yst":29," Yu":14,"yso":15," Ya":10,"WA":26,"gmo":41,"ysa":93,"Qa":26,"Qo":17," م":12,"RA":10,"S ":18," ع":21," ا":48,"goo":52,"R ":20," ب":13,"gsa":14,"gu ":229,"Si":17,"Sh":86,"gsi":12,"So":180,"Ru":12,"U ":11,"Sa":70,"TA":13,"Re":13,"SH":11,"Ro":11,"yoo":24,"Qu":16,"SA":16,"Ra":20,"gud":22," Nu":16," No":12,"gta":43," Ra":20," Qu":16,"b ":130," Ro":11," Re":13,"guu":20,"gun":12,"a ":5909," Qo":17," Qa":26,"شي":10," Su":23," St":13," Ta":37,"Ya":10," Th":10,"Yu":14," To":11," Ru":12," Sa":70,"Xa":51,"YO":15," Sh":86," Si":17,"Xi":13," So":180," WA":20,"ري":12,"Gob":48," ja":60,"i ":853,"ye ":36,"ian":11," iy":365," ji":127,"ge":93," je":47,"ga":1135,"fk":16,"Ing":16," im":15," in":148," il":54," ii":23,"ic ":14,"fi":49,"fr":45,"fu":47,"ft":29,"fo":18," is":155," ka":688," kh":13,"hd":44,"he":286," ki":46," ke":11,"ha":1580,"gn":11,"gm":108," jo":14,"gl":15,"gi":72,"id ":171,"gu":305,"iba":32,"gt":52,"gs":27,"gr":15," ju":17,"go":196,"du":188,"dw":36,"dy":13,"g ":83," ha":190,"ea":16,"eb":72,"yee":61,"ec":51," he":28,"ed":360,"de":252,"dd":113,"di":494,"dh":632,"dk":189,"dl":33," go":117,"do":234,"dn":22," gu":55,"ia ":36,"ex":102,"ey":554,"fa":110,"h ":441," id":15,"fe":17,"eh":54,"ib ":32,"eg":202," hi":20,"ee":1263,"el":242,"ek":35," ho":120,"ei":12,"yey":26,"en":172,"em":31,"et":26,"es":93,"er":287,"ya ":266,"ca":427," ni":37,"e ":881," ne":15,"bs":21," na":54,"br":36,"bu":104,"bt":55,"bn":18,"bo":234,"bk":30,"bl":13," mu":48,"ig ":10,"bi":355,"bb":15,"bd":41,"be":201,"db":11,"da":2087," 
og":18,"f ":98,"cy":18," of":16,"cu":41,"ct":11,"cs":27,"co":62,"cm":24,"cn":13,"cl":19,"ci":73," nu":10,"ch":33," no":73,"ce":64,"cd":20,"yad":111,"yag":10," le":91,"c ":51,"yaa":287," la":334,"icm":22," ku":465,"ici":14," km":14,"ica":25," ko":88," me":49,"az":10,"ay":1458,"idu":13," mi":187,"ba":817,"d ":893,"at":134,"as":580,"yd ":29,"ido":43,"ar":1307,"aq":237," ma":590,"ax":1066,"aw":157,"idk":12,"yay":52," lu":25,"ak":76,"al":1647,"idi":35,"yaw":11,"idh":19,"ai":29,"aj":59,"yar":45,"am":590,"an":1951,"yaq":50,"yan":13,"ac":260,"ida":140,"ad":2243,"aa":4171," lo":138,"ab":630,"ag":664,"ah":1152,"yah":134,"af":128,"iib":15,"nu":38,"iic":11,"nt":263," af":45,"ns":59," ah":473," aa":208,"iig":13," ab":31,"iid":50,"no":160,"nn":18," ad":49,"q ":34," am":103," an":18,"iik":48,"iin":164,"ny":57,"yka":17,"iil":93," al":21,"iim":26,"iis":199,"iir":65,"of":78,"iiq":14,"oc":29," ax":10,"od":156," ar":26,"ob":291," aq":21," as":29,"om":340,"on":186," ba":344,"ok":16,"ol":273," ay":246,"og":129,"il ":80,"ot":41,"os":90," bi":107,"op":10,"oo":1738," be":63,"or":236,"oq":49,"yn ":105," bo":34,"r ":475,"ox":10,"ow":125,"oy":128," bu":35,"pa":14," ca":238,"im ":21,"ika":50,"lo":386,"ige":10,"lm":39,"ll":110,"ls":27,"iga":247,"ii ":339,"lw":14,"lu":48,"igi":31,"yo ":488,"ly":56,"igu":13,"igt":12,"o ":2012,"ma":1465,"mb":52,"mh":21,"me":199,"mk":39,"mi":333,"mp":19,"mo":102,"yna":98,"mu":85,"ihi":82,"yni":14,"na":851,"nb":30,"yne":30,"nc":10,"nd":137,"ne":107,"nf":30,"ng":58,"ynt":29,"ni":213,"nk":312,"nl":21,"imo":20,"ju":17,"jo":31," ee":295,"imi":21,"ki":203,"kh":95,"ke":48,"ind":29,"ina":80," fa":48,"yga":15,"ka":1778,"yi ":19,"m ":103," fu":10,"ino":13,"kt":20," fo":12,"ku":558,"int":102,"ins":10,"ko":130,"ine":14,"ing":16," fi":17,"ini":10,"km":16,"ink":82," ge":36,"li":577,"lk":332,"le":352," ga":186,"ld":23,"lg":22,"inu":15,"la":1306,"lb":52,"iny":13,"n ":1478," co":22,"ht":11,"hu":92,"ikh":54," ce":15,"hi":387,"hn":16,"ho":217," 
ci":36,"ila":160,"id":471,"ic":103,"yin":59,"ib":108,"ia":61,"ih":88,"in ":262,"ig":350," da":424,"if":21,"yih":49,"yig":21," cu":34,"hy":12,"k ":24,"iq":21," do":45,"ilo":13,"ir":438,"is":630,"it":49,"ill":18,"ilk":32,"ix":28,"ilm":12,"ii":1062,"ij":21,"ik":134," de":120,"ili":51,"il":385,"im":170,"in":663,"io":30," di":70,"yir":13," dh":511,"ima":76,"je":69,"ji":178,"iy":896," du":39,"l ":398,"ja":82,"xi":123,"xo":56,"xm":34,"xw":27,"xu":185,"xb":18,"xa":850,"xe":161,"xd":67,"wg":11,"wi":81,"how":15,"wl":60,"wo":26,"wu":102,"hog":13,"y ":1137,"wa":1722,"wd":13,"hoo":55,"we":185,"hor":60," yi":55," yu":13,"uy":12,"ux":164,"uw":34,"uu":720," ye":13,"ve":10," ya":211,"x ":140," xo":33,"uj":15,"uk":28,"ul":200,"uf":20," xi":90,"ug":210,"uh":16,"uq":90,"ur":259,"hna":12," xu":39,"us":114,"ut":54,"um":90,"un":214,"tu":47,"ub":104,"ua":11,"ud":145,"uc":17," xe":16,"w ":59," xa":103,"to":175,"hul":37,"tr":25,"te":120,"ti":246,"th":37,"ta":784,"su":111,"ss":19,"st":173,"sw":12,"sl":47,"sk":106,"sm":25,"so":371,"sr":10,"sc":17,"se":101,"sh":456,"ي ":20,"xme":19,"si":404,"xma":13,"u ":1296,"sa":722,"sb":21,"rr":20,"rs":115,"rt":160,"ru":77,"rw":11,"rx":11,"ry":27,"ro":144,"rn":40,"rm":32,"rl":22,"rk":200,"ri":397,"hu ":11,"rg":35,"re":258,"rd":49,"rc":12,"rb":25,"ra":754,"t ":51,"qu":35,"qs":10,"xoo":44,"qo":163,"IYO":15,"qi":33,"qe":23,"qa":334,"qd":61,"s ":240,"pu":15,"pr":14," ru":12," u ":194," sa":221," se":17," si":157," sh":112," so":259," qu":21,"xya":13," ra":48," re":33,"ن ":17," ro":11," qe":14," qa":168," qo":69," qi":18," oo":464," or":10,"huu":29," wa":1582," we":88," wo":12," wu":102," wi":39," uu":195,"xud":12,"xuu":133,"Hoo":12," tu":36," us":16," ur":10,"م ":11," um":12," un":11," ug":131,"yg":19," ta":231,"ye":133,"yd":48,"ya":998,"yb":27,"xwe":21,"xy":17," su":25,"yu":34,"ys":166," to":18," th":15," ti":62,"yo":522,"yn":280," 
te":11,"yk":19,"yi":189,"fee":11,"xey":58,"xee":54,"far":32,"fad":21,"faa":24,"Suu":12,"Axm":14,"xir":17,"xis":13,"xil":26,"xii":17,"xid":14,"xig":24,"Sta":10,"xa ":169,"eyb":17,"eya":63,"eys":74,"Tal":11,"eyn":163,"eyo":14,"eyk":10,"xda":51,"eyd":16,"eye":14,"exa":10,"exd":12,"exe":51,"xe ":46,"xar":38,"Ban":18,"Baa":14,"Bad":22,"xam":54,"xan":16,"Bar":23,"xay":166,"xba":16,"xaa":341,"xad":27,"xag":13,"wux":100,"Aas":11,"Shi":22,"She":12,"Sha":50,"ex ":21,"Af ":19,"ey ":159,"er ":103,"es ":21,"eri":33,"ere":30,"era":49,"Afr":32,"esh":28,"esa":10,"ers":11,"ern":14,"ekh":16,"en ":89,"ela":47,"ele":26,"eli":17,"ell":42,"elo":15,"emb":19,"ena":28,"wla":53,"eny":12,"egm":90,"ego":14,"egt":11,"Som":32,"Soo":136,"woq":10,"el ":65,"wda":13,"Buu":11,"Bur":11,"we ":12,"gir":17,"gii":26,"wey":124,"wee":27,"gey":15,"gee":44,"wi ":14,"wis":10,"wii":22,"Sal":11,"gab":12,"gac":45,"gad":26,"DA ":20,"gaa":436,"gar":35,"gay":21,"gal":70,"gan":69,"ga ":388,"San":27,"wa ":22,"Cab":27,"waq":26,"wan":30,"wal":39,"wax":715,"way":45,"Cal":18,"war":52,"was":18,"Car":40,"waa":581,"wad":168,"Bel":10,"fur":37,"Bis":12,"fri":39,"fii":15,"Boo":10,"fka":13,"da ":918,"de ":22,"dad":131,"daa":159,"dab":19,"dal":113,"WAX":16,"dag":65,"dah":101,"dar":51,"dan":291,"dam":39,"day":61,"dax":79,"daw":32,"Cum":10,"dda":74,"dde":11,"ddi":17,"cun":14,"EEY":13,"EEL":14,"EGM":11,"Deg":30,"cyo":15,"uxu":126,"Daa":22,"Dag":10,"Dal":10,"uxa":15,"uun":88,"uul":63,"uum":13,"uug":15,"uud":50,"uux":10,"ux ":12,"uus":29,"uur":74,"uuq":18,"uut":24,"uwa":28,"co ":26,"cma":23,"ush":13,"usi":11,"use":13,"uu ":316,"usu":26,"uso":11,"uti":16,"uta":19,"cod":10,"com":11,"uqa":33,"uqd":36,"ura":37,"ure":10,"uri":31,"urk":17,"urt":32,"uru":37,"ur ":39,"csi":14,"uma":56,"unt":32,"unk":27,"uni":11,"una":85,"cel":30,"uka":13,"cee":17,"uls":10,"ulo":20,"ull":14,"ulk":27,"uli":14,"ule":16,"ula":26,"un ":29,"che":12,"ul ":36,"ciy":12,"cii":28,"uga":40,"ugu":128,"ugs":11,"ed ":184,"ebi":20,"uf ":13,"uda":33,"udi":12,"eb 
":12,"udu":37,"ug ":18,"ega":53,"ub ":32,"eek":25,"een":99,"eel":138,"eem":18,"eeb":23,"eeg":65,"eed":229,"eey":113,"eh ":42,"ees":56,"eer":157,"edk":18,"edi":12,"ede":22,"eda":72,"uba":39,"ubb":11,"edu":15,"ud ":36,"edo":11,"ecl":12,"ece":25,"ee ":319,"dwe":25,"dwa":11,"duu":57,"tuu":22,"doo":96,"dow":37,"tri":10,"The":10,"dna":12,"to ":75,"Dhe":14,"Dhu":12,"dun":12,"dul":20,"dug":23,"too":69,"du ":45,"tii":59,"tig":10,"tir":66,"dha":335,"tio":16,"tic":26,"dhu":33,"dib":25,"dhi":112,"dhe":122,"dho":21,"der":19,"dex":18,"dey":16,"dee":48,"deg":96,"den":15,"di ":38,"dle":11,"dla":17,"tee":36,"dku":14,"dki":33,"do ":77,"ter":36,"diy":39,"din":26,"ti ":29,"dir":60,"dis":51,"dig":42,"dii":165,"dil":12,"dka":134,"the":16,"rga":14,"ri ":48,"rge":14,"rey":42,"ree":110,"rda":15,"rdh":16,"re ":77,"rco":10,"rax":25,"ray":99,"rar":15,"ras":44,"rat":10,"rba":11,"rah":41,"ran":54,"ram":17,"rak":12,"rab":82,"raa":165,"rad":87,"rs ":11,"roo":48,"rna":16,"rne":11,"rni":10,"ro ":63,"rma":23,"Nab":15,"rla":13,"rku":10,"rko":10,"rki":41,"rke":18,"rka":117,"riy":58,"ris":28,"rig":31,"rii":110,"rik":46,"rin":21,"ric":16,"rya":13,"rur":10,"run":18,"ruu":10,"ry ":11,"rsi":16,"rsa":63,"rsh":15,"rta":110,"rto":18,"rte":11,"rti":11,"rub":12,"saa":120,"sab":11,"sad":52,"sag":23,"sah":11,"sal":49,"sam":47,"sbi":14,"san":191,"sas":14,"sar":33,"say":43,"sa ":99,"sha":242,"sho":46,"she":41,"shi":83,"si ":68,"siy":42,"sid":91,"shu":10,"sil":13,"sim":38,"sii":82,"sig":32,"se ":61,"sh ":17,"see":14,"sow":16,"som":59,"soo":214,"soc":14,"su ":25,"sla":30,"sku":37,"ska":59,"so ":55,"sma":15,"حمد":15,"ste":15,"sta":66,"sto":28,"sti":41,"sub":11,"suf":12,"sug":13,"sul":11,"suu":22,"tal":42,"tag":10,"tah":87,"taa":194,"tad":13,"tay":60,"tar":33,"tan":31,"tam":13,"te ":13,"ta ":272,"bka":23,"biy":71,"bis":28,"bir":12,"bil":48,"bin":31,"big":38,"bii":37,"bo ":47,"bol":129,"bna":15,"boo":24,"bba":12,"be 
":19,"ban":61,"bal":43,"bah":27,"bad":232,"baa":96,"bab":12,"bay":35,"bax":34,"bas":10,"bar":156,"bdi":25,"bdu":11,"bi ":69,"bee":145,"ber":11,"bey":12,"ca ":55,"car":35,"cas":13,"can":24,"cay":13,"cab":20,"cad":53,"caa":145,"cal":33,"cag":16,"bri":13,"bra":15,"bsa":11,"bta":33,"bti":13,"bur":20,"bul":12,"buu":52,"aka":19,"am ":40,"aki":23,"aji":27,"ajo":16,"qa ":12,"al ":136,"ahi":41,"qar":20,"qay":16,"aho":10,"qad":44,"qab":47,"qaa":149,"ahd":20,"qan":14,"qal":17,"ahe":26,"aha":697,"agm":13,"agt":24,"agu":76,"ago":29,"aq ":22,"qdi":38,"qda":17,"any":23,"ano":51,"ann":10,"ant":70,"ans":32,"ane":21,"ang":10," ال":46,"ani":87,"ank":185,"ana":385,"anb":26,"and":92,"amu":23,"amo":10,"amk":32,"amh":19,"ami":82,"ame":93,"amb":16,"ama":257,"aly":20,"qey":14,"alo":160,"alm":17,"all":22,"alk":165,"alg":17,"ali":424,"ald":14,"ale":110,"ala":480,"alb":42,"an ":924,"aba":194,"abd":37,"abe":56,"abi":146,"abk":18,"abo":40,"abt":38,"abu":36,"aca":130,"aab":114,"aac":13,"aaa":15,"aaf":38,"aag":64,"aad":398,"aaj":28,"aak":21,"aah":75,"aan":742,"aal":743,"aam":113,"aas":211,"aar":259,"aaq":41,"aaw":32,"aat":37,"aay":89,"aax":19,"ad ":334,"qiy":15,"ac ":19,"aa ":1110,"qii":10,"ab ":33,"afr":11,"aft":15,"afi":18,"aga":458,"age":12,"ah ":325,"afa":38,"ado":85,"adl":23,"adk":153,"adn":12,"adh":26,"adi":223,"add":96,"ade":66,"ag ":29,"adw":22,"adu":44,"aci":16,"ace":10,"Qar":12,"acd":15,"ada":1138,"af ":19,"acy":15,"acs":19,"qor":48,"qoo":60,"qof":24,"axi":13,"axm":15,"axo":15,"axu":15,"axa":702,"axb":16,"axd":50,"axe":90,"ayi":11,"ayo":52,"ayn":115,"ays":84,"ayu":13,"axy":16,"axw":26,"ayb":10,"aya":151,"ayg":11,"ayd":32,"aye":26,"ba ":84,"qur":24,"at ":11,"arg":25,"are":96,"ard":30,"arb":14,"ara":357,"aro":72,"arn":19,"arm":17,"arl":10,"ark":135,"ari":153,"aru":20,"ars":39,"art":72,"asa":99,"ary":14,"asi":106,"ash":156,"ase":12,"aso":31,"ask":17,"ar ":198,"as ":80,"aqa":111,"aqi":13,"aqo":51,"ax ":98,"awe":20,"ay 
":932,"awa":46,"awl":31,"awi":33,"ata":37,"asu":12,"ast":33,"ato":18,"ate":17,"ra ":58,"ati":34,"ngi":20,"ni ":47,"Isl":11,"neh":11,"ng ":11,"nee":16,"nfu":25,"ney":14,"ne ":43,"ndh":18,"ndi":22,"nan":17,"nac":45,"nad":83,"nah":41,"nab":18,"naa":131,"Ito":28,"nbe":15,"nd ":69,"AXE":10,"AY ":10,"nba":11,"AXA":12,"nay":47,"nax":11,"na ":412,"Jab":13,"Jan":13,"Jam":22,"KA ":11,"KAL":10,"nya":38,"AAL":13,"ADA":25,"nuu":21,"nto":13,"nti":37,"nta":176,"nte":24,"nsi":15,"nsa":22,"AHA":14,"noo":67,"noq":18,"nna":11,"ALA":17,"nle":12,"no ":59,"nki":22,"nka":271,"AN ":16,"nii":13,"nih":11,"nig":39,"niy":10,"nis":15,"nim":17,"nin":39,"ogu":24,"oga":60,"Jub":11,"ol ":60,"oco":11,"odi":15,"of ":38,"oda":43,"ofe":10,"LA ":12,"د ":29,"oba":86,"od ":60,"obo":134,"obi":38,"ة ":21,"oyi":94,"oya":10,"owl":29,"ow ":45,"ost":14,"ota":10,"ose":28,"os ":15,"oon":114,"ool":98,"oom":198,"oof":13,"oog":60,"ood":123,"oob":124,"or ":39,"ooy":111,"oow":16,"oot":14,"oos":65,"oor":31,"Koo":13,"ore":44,"ori":14,"osa":11,"ort":21,"oqo":37,"oqd":11,"ora":61,"ola":52,"on ":52,"olk":99,"ole":20,"olo":14,"oly":10,"ona":28,"onf":25,"oni":16,"onk":11,"ons":12,"ont":14,"oma":298,"oo ":749,"omp":12,"la ":241,"le ":159,"laa":281,"lab":61,"lac":11,"lad":232,"laf":10,"lah":96,"lag":116,"lal":23,"lan":88,"lam":27,"las":21,"lay":70,"lba":15,"lbe":31,"kuw":22,"kuu":18,"kun":22,"kul":14,"kto":17,"MAD":13,"lom":11,"loo":176,"lmo":12,"lmi":13,"lma":10,"lsh":13,"Luu":11,"li ":92,"lga":16,"ley":29,"leh":35,"lee":98,"lo ":165,"lla":49,"lle":32,"lka":311,"lki":14,"lis":19,"lin":48,"lim":15,"liy":204,"lid":28,"lia":24,"lib":24,"lil":40,"lii":17,"lig":30,"ma ":133,"maa":361,"mac":36,"mah":24,"mad":229,"mag":226,"mar":193,"mas":14,"mal":133,"man":32,"may":23,"max":25,"mba":26,"mbe":10,"me ":19,"med":68,"mee":72,"mey":24,"luq":12,"luu":17,"مد ":15,"lya":33,"lyo":10,"Mar":22,"Mas":10,"Mag":51,"Mad":20,"Maa":17,"Max":25,"moo":35,"muq":17,"muu":16,"mul":10,"Mux":13,"mhu":20,"Muq":24,"Mud":14,"mi 
":19,"min":17,"mil":14,"mis":11,"miy":27,"mig":18,"mid":170,"mij":10,"mii":25,"mo ":60,"mka":33},"n_words":[94077,109135,83288],"name":"so"} -------------------------------------------------------------------------------- /libs/langdetect/profiles/sw: -------------------------------------------------------------------------------- 1 | {"freq":{"jer":348,"jen":305,"ji ":6234,"D":1805,"E":874,"F":1081,"G":1202,"A":4461,"B":2717,"C":2251,"L":1530,"M":12761,"N":2782,"O":860,"H":1677,"I":2605,"J":2641,"K":12188,"U":3120,"T":5185,"W":4730,"V":1116,"P":2090,"S":3343,"R":1632,"Y":517,"Z":395,"f":11048,"g":13829,"d":15034,"e":46694,"Feb":214,"b":19688,"c":9784,"a":289584,"n":90468,"o":57043,"l":42025,"m":53651,"j":21456,"k":76835,"h":32492,"i":164978,"w":60984,"v":3863,"u":57506,"t":40551,"s":35298,"r":27443,"p":13501,"z":18893,"y":38832,"x":501,"jar":185,"jan":137,"jaw":201,"é":167,"jim":1500,"jin":4267,"jil":163,"jij":492,"jia":221,"jib":3854,"ito":288,"itu":317,"itw":269,"isp":140,"ist":592,"ita":1061,"ite":213,"iti":334,"ivy":133,"iwa":2430,"ius":183,"ipo":224,"ipi":265,"is ":521,"ion":720,"iop":279,"ipa":165,"ipe":219,"iro":173,"iri":997,"isi":902,"ish":5756,"isa":694,"ire":164,"ira":314,"ja ":1529,"iyo":4644,"iye":227,"izo":242,"izi":413,"iza":568," l":8602,"kif":518," m":27935," n":19872," o":327,"kik":333," h":7652," i":9059,"kij":166,"kim":258," j":5212,"kil":389," k":27977," d":1010," e":802," f":914,"kia":390," g":257," a":6533," b":1252," c":2191,"kiw":279," y":17767," z":2257,"kin":442," u":4361,"kio":148," t":2402,"kip":379," w":34366," v":1482,"kis":520," p":2154,"kit":315," s":6097," r":837,"ki ":2193," J":2627," K":12017," H":1638," I":2128," N":2678," O":803," L":1487," M":12665," B":2646," C":2112," A":4277," F":1046," G":1172," D":1740," E":782," Z":375," Y":513,"и":142," S":3229," R":1588,"а":137," P":2015," W":4707," V":1031," U":3052," T":5117,"kea":156,"kem":150,"ke ":1988,"ku ":187,"kri":520,"kon":141,"koa":3734,"ko 
":1214,"ل":165,"ا":240,"juu":155,"jul":257,"jum":177,"kaz":5045,"kaw":137,"kat":14149,"kar":374,"kas":316,"kan":2795,"kao":197,"kal":354,"kam":1048,"kad":160,"kab":375,"ka ":19783," Ga":196,"Da":365," Ge":229,"Co":364," Fr":177,"Ch":770," Ha":622," He":218," Go":142,"Do":469," Gr":177," Gu":142,"De":497,"Di":169,"Fe":311," Id":148,"Fa":160," Hu":173," Ho":177," II":154,"ha ":2668," Hi":392,"Ge":229," Ji":535,"Ga":198," Je":286,"I ":397," Ja":792,"Fr":177," Ir":284," Is":141," It":181," In":316," Ik":143," Il":224,"ham":522,"han":444,"hap":154," Ka":2225,"hai":238,"haj":163,"hak":611,"hal":314," Ke":708," Ki":3568,"har":1714,"has":255,"hat":148," Jo":255,"II ":207," Ju":691,"hag":267,"hab":181,"had":740," La":231," Le":207," Li":441," Ko":414," Ku":695," Kw":4009,"Au":181," Ma":4258," Mb":461,"Ar":475,"As":222," Mk":3388,"Ba":771," Mi":685," Mj":478," Me":615,"Af":445,"he ":544,"Ag":372," Lo":213,"Am":241,"An":463,"Ap":290," Lu":315,"Al":840," Ne":518,"Bu":429,"Br":278," Na":464,"Ca":592," Ni":435,"Bi":308," Mt":420,"Be":362," Mp":146," Mo":643,"Bo":282," Mu":471," Mw":545,"Ku":695,"Kw":4009,"Ko":415,"hez":299,"Le":210,"Li":441,"hes":336,"her":275,"hen":226,"hem":395,"La":231,"Lu":315,"Lo":213,"Me":621,"hi ":3880,"Mi":690,"Mj":478,"Mk":3388,"Ma":4263,"Mb":461,"Mw":546,"Mu":475,"Mt":420,"Mp":146,"Mo":643,"Ni":437,"Ne":518,"Na":466," Ap":290," Am":240," An":463," Al":833,"Ny":247," Ag":372," Af":443,"No":466," Ba":766,"Ok":277," Au":181," As":222," Ar":474," Be":362," Bi":308,"hio":2603,"Gr":177,"Go":143,"hin":1991,"him":244,"hil":432,"Gu":142," Bo":282,"hii":230," Br":278," Bu":429,"his":266,"hir":394,"Ha":622," Ca":582,"hiy":239,"He":219,"II":286,"Hi":393," Ch":768,"Ho":179,"Hu":173," Co":362,"K ":152,"Id":148," Da":365," Di":167,"In":317," De":495,"Ik":143,"Il":226,"Is":141,"It":181," Do":469,"Ir":284,"Ja":792,"Ji":536,"Je":286,"Jo":255,"Ju":691,"Ka":2234,"Has":225,"ho ":334," Fe":311,"Ki":3577," Fa":159,"Ke":708,"Us":172,"Ut":325,"Ur":181,"go 
":920,"Un":355,"Uk":150,"Ul":189,"Ui":244,"Uj":249,"Uh":170,"Uf":251,"Uc":175,"Tu":237,"To":205,"Th":275,"Te":258," Wi":3377,"Ta":3841," We":188," Wa":1003,"St":260,"Su":178,"Wi":3380,"Wa":1003,"We":189," Zi":141," Za":152,"Vi":670," Yo":250,"Pr":150,"Pe":270,"goz":233,"Pa":858,"Po":195,"Pi":163,"gom":190,"gon":205,"gos":279,"gor":306,"Se":532,"gu ":424,"Si":424,"Sh":518,"So":239,"Ru":370,"Sa":668,"Re":188,"Ri":138,"Ro":385,"Ra":354," Po":195,"guj":253," Pi":163," Pe":270," Pa":857," Ny":247," No":466," Ok":277," Ra":354,"b ":211," Ro":385,"gwe":166," Re":188," Ri":138,"gwa":280,"guz":429," Pr":150,"a ":143240," Su":178," St":248," Ta":3838," Th":274,"Yo":250," Te":257," To":205," Ru":370," Sa":668," Sh":517," Si":421," Se":528," So":239," Vi":666," Tu":231,"Za":152,"Zi":141," Uc":175," Uf":251," Uh":170," Ui":243," Uj":249," Uk":150," Ul":189," Un":355," Ur":181," Us":172," Ut":325," ja":134,"iak":142,"i ":52347,"ian":874," ji":4522,"ias":364,"ge":1928,"iar":235," je":226,"ga":2900," im":145," in":3363," ik":274," il":4878,"fi":1075,"fr":504,"fu":1927,"fo":752,"ibl":142,"ibi":603," ka":16147,"gw":483," ki":3027,"he":2541,"ibu":4111,"ha":8898,"gl":145,"gi":1836,"gh":1233,"gu":1858,"iba":566," ju":300,"go":2336,"du":838,"dw":136,"g ":607," ha":1606,"ea":1091,"eb":539," he":144,"ec":251,"ed":686,"de":1841,"di":4816,"dh":617,"do":1639,"ia ":9119,"dr":203,"ew":912,"ex":163,"eu":261,"ev":332,"ey":739,"ez":1828,"fa":6104,"h ":704," id":219,"fe":174,"eh":737," hi":990,"eg":644,"ef":303,"ee":307,"el":2120,"ek":2577,"ej":155," ho":139,"ei":650,"ep":643,"eo":1165,"en":9965,"em":2423,"et":1296," hu":4749,"es":2258,"er":4147," nj":147,"ca":364," ni":9330,"e ":10467," ng":147," nd":690,"bw":843," nc":2455," na":6269,"br":408,"bu":5373,"bo":2905," mw":6857,"bl":321," mu":4335," mt":648," ms":331,"bi":2134," mp":280," mo":680," mn":1501,"be":1280," mm":157,"ifu":393,"da":3239,"f ":246,"ifo":606," of":164,"co":390," ny":523,"ck":301,"ci":283,"ch":7388,"ce":365,"ifa":585," 
le":184,"c ":192," li":859," la":7153," ku":5668,"ich":830," kw":2736," km":140,"ica":140," ko":150," me":184," mf":368,"az":6015,"ay":5308," mi":1257,"ba":6016," mj":5191," mk":1617,"d ":1205,"at":22079,"as":4908,"ar":9773," ma":3449," mb":469,"aw":1490," mc":155,"av":414,"au":1667," lu":341,"ak":14678,"al":8458,"idi":551,"ai":5267,"aj":1998,"ao":6210,"ap":5739,"ide":157,"am":9111,"an":29556,"ac":1224,"ad":3126,"ida":813,"aa":1773,"ab":2568,"ag":1596,"ah":1414,"ae":682,"af":1092,"nu":591,"nt":1270,"ns":4895,"no":1318,"nn":478," am":1335," an":488,"nz":5093," ai":153,"iin":242,"ny":7307," aj":134," ak":183," al":2589,"of":4380," au":941,"oc":308,"od":678,"oa":4118,"ob":631," at":195," as":220,"om":1846,"on":3853,"ok":2328," ba":679,"ol":1930,"oi":1488,"oj":1425,"og":855,"oh":360,"ija":140,"ot":1280," bi":222,"os":1066,"ov":580,"ou":534,"ije":137,"op":845,"oo":318,"or":2938,"iji":1232,"r ":1622,"ow":244,"oz":397,"oy":154,"pe":836,"pa":6921,"po":1264,"ph":151,"pi":2193,"ika":13864,"lo":1408,"lm":337,"Ida":135,"ll":791,"ls":182,"iga":224,"ii ":525,"lu":868,"lt":178,"igh":170,"igi":384,"ly":147,"o ":24303,"mc":173,"igo":169,"ma":8274,"mb":6660,"mh":261,"me":2630,"mf":564,"mk":1733,"ml":210,"mi":3477,"mj":5199,"mn":1546,"mm":321,"mp":578,"ihe":138,"mo":6079,"mr":140,"mt":753,"ms":447,"mu":6394,"mw":6988,"ihi":187,"p ":352,"na":23279,"nc":2788,"nd":5575,"ne":2353,"ng":6858,"ni":24361,"nj":567,"nk":135,"imo":196," es":141," en":369,"ju":713,"imf":161,"ime":354," el":223,"jo":133,"imi":180,"ki":6922,"kh":154,"ind":834,"ke":2748,"ina":8001," fa":353,"ka":45110,"imu":392,"m ":727," fu":177,"kw":3124,"ino":181,"ks":210,"kt":463,"ku":10532,"ins":133,"ko":5804,"ine":479,"ing":1959,"kr":669," fi":274,"ini":4598,"km":156,"li":17984,"le":2997,"ld":221,"lf":159,"la":14880,"lb":250,"iny":275,"n ":3144,"iko":612,"hw":492,"ht":198,"hu":6825,"iki":2488,"hi":11111," ch":2090,"hn":150,"ho":1180,"ila":4379,"id":1813,"ic":1403,"ib":5595,"ia":11251,"ih":490,"in ":378,"ig":1252," 
da":146,"if":1790,"ie":672,"iku":2496,"k ":628,"ilo":373,"ir":1982,"is":9376,"it":2904,"ill":288,"iu":466,"iv":385,"iw":2556,"ii":989,"ij":1580,"ik":19966," de":224,"ili":8251,"il":13887,"im":4832,"in":17333,"io":4395,"ile":321,"ip":1169,"ima":914,"je":934,"imb":2471,"io ":2960,"ji":17145,"iz":1362,"iy":4997," du":302,"l ":1018,"ja":2368,"z ":191,"wi":1773,"wo":202,"vy":671," za":1702,"y ":1239,"wa":56175," zi":456,"we":2203,"vi":1632,"vu":418,"vo":138,"uz":1451,"uw":2877,"uv":252,"uu":3068," ye":258,"ve":578," ya":17428,"va":328,"x ":213,"ui":563,"uj":4429,"uk":1643,"ul":2575,"ue":357,"uf":741,"ug":901,"uh":626,"ur":1919,"us":3274,"ut":2784,"um":5397,"un":5099,"uo":368,"up":1077,"ty":166,"tu":2287,"tt":391,"tw":473,"ub":1112,"ua":2111,"ud":534,"uc":476,"w ":435,"to":4407,"huk":345,"hul":146,"tl":220,"ts":343,"tr":455,"te":2280,"ti":12092,"th":999,"ta":14867,"su":644,"ss":500,"st":1842,"sw":308,"sl":142,"sk":865,"sm":139,"sp":289,"so":683,"sc":179,"se":5649,"sh":8151,"si":4764,"u ":13704,"sa":7736,"rr":220,"rs":467,"rt":620,"ru":2279,"ry":287,"ro":1786,"rn":619,"rm":257,"rl":223,"rk":320,"ri":8157,"rg":403,"re":3855,"rd":556,"rc":143,"rb":136,"ra":5018,"t ":1231,"s ":3025,"pt":348,"pu":357,"pw":193,"pr":381," sa":589," se":4480," si":369," sh":318," ra":432," ri":188,"hwa":473,"huo":175,"hum":2789,"hun":282,"hus":506,"hur":418,"huu":1333," pe":176," pa":632," pi":931," wa":33135," we":275," vy":396," wi":862," vi":1013," uc":144,"zi":8597,"ze":368,"za":8043," tu":189,"zw":257," us":165," ut":249," up":502," um":247,"zu":272," un":1571," uk":210,"zo":952," ul":573," uh":139," ta":1410,"ye":2395,"ya":24129,"yu":306," to":170," th":289,"yo":5888," te":201,"yi":4283,"Apr":266,"Asi":146,"Aru":195,"far":316,"fam":283,"fan":4203,"fal":292,"fa ":488,"eya":259,"Bah":237,"Bar":140,"eza":1136,"ezo":172,"ezi":237,"eta":229,"ete":154,"eti":253,"est":247,"ett":212,"ew ":355,"evi":165,"ewe":148,"ey ":361,"ewa":358,"er ":615,"epa":149,"es 
":640,"ept":299,"eri":650,"ere":660,"era":456,"Afr":406,"esh":359,"ese":306,"esa":279,"eru":498,"Ago":254,"ert":152,"ers":339,"eku":184,"en ":297,"ela":204,"ele":786,"eli":360,"ell":177,"eo ":852,"emb":1055,"ema":157,"eme":314,"emi":276,"emu":365,"ene":704,"eng":671,"ena":283,"end":498,"eno":221,"eni":486,"ens":4087,"ent":441,"eny":1803,"Ali":478,"ege":351,"Ame":158,"ehe":647,"Ana":176,"el ":260,"eke":267,"eka":1754,"giz":193,"gir":232,"gin":349,"gid":165,"ght":136,"gha":925,"gi ":572,"gen":204,"ger":781,"ge ":611,"gaz":140,"gar":155,"gan":693,"ga ":1334,"Cal":307,"fup":194,"Bib":137,"fua":317,"fum":143,"fun":167,"fri":445,"fu ":810,"for":356,"fo ":342,"fil":269,"fik":168,"fiz":146,"da ":1525,"de ":752,"dad":386,"dae":220,"dar":151,"dan":305,"dam":173,"Des":272,"Dar":167,"Chi":216,"Chu":136,"Cha":300,"ch ":165,"cha":2430,"chu":596,"ck ":143,"che":571,"chi":3152,"cho":370,"ed ":154,"ebr":313,"ea ":663,"ei ":346,"efu":197,"edi":297,"ee ":156,"don":150,"dom":308,"dol":151,"dog":335,"dun":335,"dha":302,"dia":330,"dhi":240,"der":146,"deg":261,"del":152,"di ":2661,"do ":429,"Dod":240,"diy":201,"din":291,"dis":387,"dik":302,"ri ":2373,"rez":420,"rea":148,"ref":154,"reh":266,"ren":163,"rek":1672,"re ":305,"rd ":213,"ras":256,"rat":173,"Ni ":218,"New":381,"rai":160,"ran":867,"ram":226,"rab":297,"rad":150,"ron":135,"rog":253,"rne":169,"rni":283,"ro ":593,"riw":166,"ris":508,"ril":300,"rik":1688,"rin":373,"ria":769,"rib":1011,"ric":160,"rk ":191,"ruf":262,"rum":452,"ruk":315,"rus":423,"ry ":194,"rse":228,"Nya":144,"rua":234,"rt ":160,"ru ":273,"sab":458,"sac":139,"san":482,"sas":180,"sa ":5643,"Nov":242,"sha":1745,"sho":271,"she":240,"shi":5099,"si ":1365,"siw":355,"sia":608,"shw":458,"shu":187,"sis":157,"sin":881,"sil":283,"sim":158,"sik":319,"sey":212,"ser":175,"set":147,"Okt":259,"seh":319,"sen":4083,"sem":335,"spa":151,"son":242,"su ":198,"st ":167,"sko":136,"ska":599,"so 
":134,"ssa":198,"ste":192,"sta":295,"sto":444,"sti":401,"str":197,"swa":181,"tai":280,"taj":233,"tak":462,"tal":339,"taa":220,"tab":242,"taw":344,"tat":292,"tar":668,"tao":3872,"tan":641,"tam":288,"te ":507,"ta ":6480,"pa ":765,"pat":4120,"pak":235,"pap":248,"pam":300,"pan":895,"pi ":233,"ped":156,"Pap":368,"pia":789,"pil":189,"pin":267,"pis":162,"pit":144,"po ":743,"pte":287,"pri":298,"pwa":189,"Rai":176,"ra ":1932,"ngo":958,"ngi":1065,"ngu":1084,"ngw":363,"ni ":18823,"Iri":209,"nge":937,"nga":1742,"Ita":147,"neo":505,"nes":161,"ng ":405,"nch":2504,"ne ":911,"ndu":263,"ndo":574,"ndi":1835,"nde":1085,"nda":1162,"nak":251,"nal":257,"nam":1855,"nan":221,"nao":1457,"nap":185,"nac":183,"nad":288,"naf":402,"nai":158,"naj":196,"nd ":409,"nat":353,"nas":439,"nay":454,"na ":15738,"Jan":271,"Jam":281,"nya":1379,"Jer":215,"nye":1338,"nyi":4239,"nus":133,"nua":282,"Jim":174,"Jin":277,"nti":403,"nta":151,"nte":177,"nsi":211,"nsa":4269,"nt ":232,"ns ":140,"nne":236,"no ":948,"nji":138,"nja":269,"Joh":134,"nia":4199,"nis":530,"ogo":593,"ois":1291,"oji":173,"oja":1149,"Jul":285,"Jun":259,"odo":288,"of ":150,"ofu":134,"ofa":3991,"oa ":3810,"oan":188,"oba":375,"nza":3817,"nzi":1111,"Kai":144,"Kag":175,"Kal":167,"Kan":354,"Kat":474,"Kas":372,"Kar":232,"Ken":632,"ozi":165,"Kis":329,"Kir":165,"Kit":204,"Kin":148,"Kib":138,"Kia":309,"ote":378,"Kik":287,"Kil":453,"Kim":202,"oto":331,"Kig":295,"Kii":249,"ost":309,"ota":195,"ove":320,"opo":325,"os ":178,"or ":161,"Kon":197,"orn":300,"oro":673,"ore":188,"ori":369,"ort":147,"ora":378,"ola":427,"on ":838,"oli":431,"ole":357,"olo":331,"oka":1580,"oke":163,"oko":236,"oku":141,"ona":230,"ond":383,"one":151,"ong":860,"oni":784,"oma":766,"omb":303,"omi":249,"omo":182,"op ":143,"la ":8089,"le 
":1011,"Kwa":3975,"laa":157,"lai":293,"lak":564,"lan":660,"lam":497,"lat":186,"lay":3727,"Kus":393,"lba":165,"kuz":236,"kuw":2713,"kuu":1305,"kut":1795,"kus":492,"kur":190,"kup":186,"kun":409,"kum":210,"kul":297,"kuj":187,"kwe":591,"kwa":2512,"kub":762,"kuf":233,"kuh":134,"kua":620,"kto":308,"lom":136,"loj":136,"lme":241,"Lin":225,"lug":350,"lu ":155,"li ":2787,"lez":192,"lew":193,"lev":140,"les":155,"leo":178,"lem":198,"len":254,"lek":133,"lo ":347,"lla":138,"lle":153,"lli":198,"ll ":147,"lit":241,"lis":337,"lip":257,"lio":738,"lin":627,"lim":922,"liz":411,"liy":4415,"liw":979,"lic":340,"lia":1497,"lik":2742,"lil":529,"lih":179,"lif":397,"ma ":2611,"mb ":139,"maa":449,"maj":397,"mak":522,"mad":206,"mae":140,"mag":342,"mar":439,"mas":613,"mal":159,"mam":161,"man":1055,"mat":406,"mba":3047,"mbi":361,"mbe":389,"mbo":2343,"me ":516,"mbu":267,"mch":170,"met":211,"mer":252,"men":492,"mfa":152,"mez":387,"mfu":373,"Mei":250,"Man":216,"Mar":1940,"Mas":472,"Mag":282,"Mak":206,"Mac":287,"Mbe":273,"mpi":142,"mon":163,"moj":1127,"mpa":160,"Mor":279,"mu ":1602,"mtu":175,"mto":226,"Mic":182,"Mis":147,"msh":144,"mta":228,"mwe":383,"mwi":345,"Mko":3178,"mwa":6205,"Mku":138,"Mji":464,"muj":3839,"muz":374,"mhu":232,"Mtw":147,"mi ":359,"mji":5175,"min":192,"mil":749,"Mwa":460,"mit":295,"mia":630,"mik":321,"mo ":4413,"mku":1038,"mko":539,"mna":1501,"mmo":145,"Wik":149,"Wil":3077,"Wan":148,"zwa":252,"zi ":5785,"zai":249,"zaj":254,"zam":177,"zan":3194,"zal":783,"zar":173,"zo ":612,"zia":533,"zin":815,"zil":197,"zik":548,"zis":240,"一":303,"yof":3874,"yot":286,"za ":2981,"ye ":1320,"yen":237,"ya ":21762,"yar":252,"yan":567,"yao":167,"yam":250,"yak":657,"yo ":973,"yin":213,"yik":3954,"一一":144,"Tan":3407,"Tab":164,"Shi":315,"Sin":201,"Sep":283,"we ":401,"wez":265,"wen":1037,"wim":286,"wil":741,"Sal":197,"vyo":257,"wa ":33121,"wap":4111,"wan":3901,"wal":617,"wam":169,"wak":9923,"way":141,"wat":368,"war":238,"was":172,"wai":2667,"wah":176,"vu 
":165,"vya":351,"vil":200,"vin":183,"vit":187,"vis":284,"Rom":180,"vem":244,"Vij":328,"uzi":743,"uza":470,"Uje":235,"uwa":2760,"uvu":174,"ush":417,"usi":1319,"use":183,"usa":176,"uu ":2892,"usu":216,"ust":207,"uso":141,"uti":211,"ute":137,"uta":560,"Uin":218,"utu":215,"uto":1436,"us ":536,"Ung":252,"ura":183,"ure":140,"uri":491,"uru":630,"unz":137,"Ula":150,"upa":554,"upi":311,"umu":162,"umi":484,"umo":2705,"uma":686,"umb":661,"ume":297,"uo ":238,"uni":940,"und":747,"una":1741,"ung":1193,"uku":302,"uko":457,"uki":429,"uka":247,"ulu":258,"uli":1405,"ule":192,"ula":478,"ukw":139,"uhu":267,"uji":4010,"uja":302,"Utu":261,"ugh":514,"ufu":352,"uhi":136,"ugu":137,"udi":174,"ubw":695,"uch":343,"ufa":176,"ufi":189,"ua ":369,"uat":317,"uar":494,"uan":690,"uba":185,"Uch":175,"ty ":146,"twa":450,"tur":369,"tun":270,"tum":424,"Ufa":219,"ts ":214,"tu ":896,"The":164,"tts":142,"to ":986,"tob":268,"tom":167,"ton":281,"tok":1553,"tol":482,"tor":246,"tik":8147,"tis":158,"tin":351,"tio":199,"thu":171,"tia":156,"tem":384,"ten":273,"tel":171,"th ":160,"ter":432,"ti ":2389,"the":225,"thi":213,"biw":209,"bis":191,"bil":315,"bin":256,"bo ":2326,"bli":173,"bor":262,"be ":229,"bam":230,"ban":516,"bal":619,"bah":147,"baa":227,"bab":179,"bay":333,"bar":432,"bao":277,"bi ":662,"ber":216,"bel":151,"bey":251,"bia":222,"ce ":176,"bu ":4649,"bru":221,"bur":149,"bun":177,"bwa":786,"aka":10583,"am ":337,"ake":1982,"aki":644,"aji":1355,"aju":170,"al ":304,"aja":293,"ain":393,"air":222,"ais":2933,"aif":267,"aid":437,"ahi":308,"aha":751,"agh":475,"agu":395,"aoi":1233,"anu":344,"anz":4756,"any":4453,"ano":638,"ann":141,"ant":323,"ans":490,"ane":261,"ang":1660,"ani":7747,"anj":260,"ana":4702,"anc":133,"and":2300,"amu":1047,"amo":1890,"amp":179,"amh":222,"ami":838,"ame":637,"amb":1658,"ama":1868,"ao ":4649,"alo":269,"alm":262,"all":133,"ali":5324,"ale":476,"ala":1026,"alb":152,"an ":1167,"akr":376,"aku":502,"ako":215,"aba":751,"abe":140,"abi":660,"abo":208,"abu":582,"ae 
":291,"aad":302,"aan":389,"aal":140,"aam":185,"aar":236,"aa ":361,"afi":303,"ai ":477,"aga":223,"age":227,"afu":225,"aen":162,"ael":172,"afa":411,"ado":269,"adh":288,"adi":1538,"ach":840,"ada":637,"azo":205,"azi":5401,"aza":186,"ayo":638,"aya":4140,"aye":284,"ba ":2178,"are":1998,"ard":317,"ara":2057,"aro":249,"ari":3153,"aru":316,"art":243,"au ":993,"asa":1084,"asi":1169,"ash":895,"ask":665,"ar ":568,"apa":4869,"api":162,"apo":406,"as ":271,"aut":148,"awa":1126,"awi":190,"ata":10070,"ast":167,"ass":197,"ato":634,"ate":225,"ati":9962,"ath":135,"atu":749},"n_words":[1316698,1560317,1165243],"name":"sw"} --------------------------------------------------------------------------------