├── capa_explorer_plugin ├── chardet │ ├── cli │ │ ├── __init__.py │ │ └── chardetect.py │ ├── version.py │ ├── compat.py │ ├── __init__.py │ ├── euctwprober.py │ ├── euckrprober.py │ ├── gb2312prober.py │ ├── big5prober.py │ ├── enums.py │ ├── cp949prober.py │ ├── mbcsgroupprober.py │ ├── utf8prober.py │ ├── mbcharsetprober.py │ ├── sbcsgroupprober.py │ ├── codingstatemachine.py │ ├── eucjpprober.py │ ├── sjisprober.py │ ├── charsetgroupprober.py │ ├── escprober.py │ ├── charsetprober.py │ ├── latin1prober.py │ ├── sbcharsetprober.py │ ├── chardistribution.py │ ├── escsm.py │ ├── langturkishmodel.py │ ├── langthaimodel.py │ ├── langhebrewmodel.py │ ├── langhungarianmodel.py │ ├── langgreekmodel.py │ └── universaldetector.py ├── capa_constants.py ├── __init__.py ├── util.py ├── proxy.py ├── item.py └── view.py ├── img └── plugin_interface_example_1.png ├── .gitignore ├── README.md └── LICENSE /capa_explorer_plugin/chardet/cli/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /capa_explorer_plugin/capa_constants.py: -------------------------------------------------------------------------------- 1 | FILE_SCOPE = "file" 2 | FUNCTION_SCOPE = "function" 3 | BASIC_BLOCK_SCOPE = "basic block" -------------------------------------------------------------------------------- /capa_explorer_plugin/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .cutterplugin import create_cutter_plugin 3 | except Exception: 4 | pass 5 | -------------------------------------------------------------------------------- /img/plugin_interface_example_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ninewayhandshake/capa-explorer/HEAD/img/plugin_interface_example_1.png 
-------------------------------------------------------------------------------- /capa_explorer_plugin/chardet/version.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module exists only to simplify retrieving the version number of chardet 3 | from within setup.py and from chardet subpackages. 4 | 5 | :author: Dan Blanchard (dan.blanchard@gmail.com) 6 | """ 7 | 8 | __version__ = "3.0.4" 9 | VERSION = __version__.split('.') 10 | -------------------------------------------------------------------------------- /capa_explorer_plugin/chardet/compat.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # Contributor(s): 3 | # Dan Blanchard 4 | # Ian Cordasco 5 | # 6 | # This library is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU Lesser General Public 8 | # License as published by the Free Software Foundation; either 9 | # version 2.1 of the License, or (at your option) any later version. 10 | # 11 | # This library is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # Lesser General Public License for more details. 
def detect(byte_str):
    """
    Guess the character encoding of a byte sequence.

    The input is normalised to a ``bytearray``, handed in one piece to a
    fresh ``UniversalDetector``, and the detector's closing result is
    returned unchanged.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :returns: Whatever ``UniversalDetector.close()`` returns.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor
                       ``bytearray``.
    """
    if isinstance(byte_str, bytearray):
        payload = byte_str
    elif isinstance(byte_str, bytes):
        payload = bytearray(byte_str)
    else:
        raise TypeError('Expected object of type bytes or bytearray, got: '
                        '{0}'.format(type(byte_str)))

    sniffer = UniversalDetector()
    sniffer.feed(payload)
    return sniffer.close()
class EUCTWProber(MultiByteCharSetProber):
    """Multi-byte charset prober reporting the EUC-TW encoding."""

    def __init__(self):
        super(EUCTWProber, self).__init__()
        # Both collaborators must be in place before reset() is called.
        self.distribution_analyzer = EUCTWDistributionAnalysis()
        self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL)
        self.reset()

    @property
    def language(self):
        """Language label reported alongside the charset."""
        return "Taiwan"

    @property
    def charset_name(self):
        """Name of the charset this prober stands for."""
        return "EUC-TW"
class EUCKRProber(MultiByteCharSetProber):
    """Multi-byte charset prober reporting the EUC-KR encoding."""

    def __init__(self):
        super(EUCKRProber, self).__init__()
        # Both collaborators must be in place before reset() is called.
        self.distribution_analyzer = EUCKRDistributionAnalysis()
        self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL)
        self.reset()

    @property
    def language(self):
        """Language label reported alongside the charset."""
        return "Korean"

    @property
    def charset_name(self):
        """Name of the charset this prober stands for."""
        return "EUC-KR"
class GB2312Prober(MultiByteCharSetProber):
    """Multi-byte charset prober reporting the GB2312 encoding."""

    def __init__(self):
        super(GB2312Prober, self).__init__()
        # Both collaborators must be in place before reset() is called.
        self.distribution_analyzer = GB2312DistributionAnalysis()
        self.coding_sm = CodingStateMachine(GB2312_SM_MODEL)
        self.reset()

    @property
    def language(self):
        """Language label reported alongside the charset."""
        return "Chinese"

    @property
    def charset_name(self):
        """Name of the charset this prober stands for."""
        return "GB2312"
class Big5Prober(MultiByteCharSetProber):
    """Multi-byte charset prober reporting the Big5 encoding."""

    def __init__(self):
        super(Big5Prober, self).__init__()
        # Both collaborators must be in place before reset() is called.
        self.distribution_analyzer = Big5DistributionAnalysis()
        self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
        self.reset()

    @property
    def language(self):
        """Language label reported alongside the charset."""
        return "Chinese"

    @property
    def charset_name(self):
        """Name of the charset this prober stands for."""
        return "Big5"
class InputState(object):
    """
    States a universal detector can be in with respect to its input.
    """
    PURE_ASCII, ESC_ASCII, HIGH_BYTE = range(3)


class LanguageFilter(object):
    """
    Bit flags selecting which language families a ``UniversalDetector``
    should consider. Individual flags can be OR-ed together.
    """
    # One bit per family.
    CHINESE_SIMPLIFIED = 1 << 0
    CHINESE_TRADITIONAL = 1 << 1
    JAPANESE = 1 << 2
    KOREAN = 1 << 3
    NON_CJK = 1 << 4

    # Convenience combinations of the bits above.
    CHINESE = CHINESE_SIMPLIFIED | CHINESE_TRADITIONAL
    CJK = CHINESE | JAPANESE | KOREAN
    ALL = CJK | NON_CJK


class ProbingState(object):
    """
    States a prober can be in.
    """
    DETECTING, FOUND_IT, NOT_ME = range(3)


class MachineState(object):
    """
    States a coding state machine can be in.
    """
    START, ERROR, ITS_ME = range(3)


class SequenceLikelihood(object):
    """
    How likely a character is to follow the previous one.
    """
    NEGATIVE, UNLIKELY, LIKELY, POSITIVE = range(4)

    @classmethod
    def get_num_categories(cls):
        """:returns: The number of likelihood categories in the enum."""
        # NEGATIVE, UNLIKELY, LIKELY and POSITIVE.
        return 4


class CharacterCategory(object):
    """
    Categories that the language models for ``SingleByteCharsetProber``
    sort characters into.

    Anything less than CONTROL is considered a letter.
    """
    CONTROL, DIGIT, SYMBOL, LINE_BREAK, UNDEFINED = range(251, 256)
class CP949Prober(MultiByteCharSetProber):
    """Multi-byte charset prober reporting the CP949 encoding."""

    def __init__(self):
        super(CP949Prober, self).__init__()
        # NOTE: CP949 is a superset of EUC-KR, so the EUC-KR character
        # distribution is reused here rather than a dedicated one.
        self.distribution_analyzer = EUCKRDistributionAnalysis()
        self.coding_sm = CodingStateMachine(CP949_SM_MODEL)
        self.reset()

    @property
    def language(self):
        """Language label reported alongside the charset."""
        return "Korean"

    @property
    def charset_name(self):
        """Name of the charset this prober stands for."""
        return "CP949"
class MBCSGroupProber(CharSetGroupProber):
    """Group prober that bundles one instance of each multi-byte prober."""

    def __init__(self, lang_filter=None):
        super(MBCSGroupProber, self).__init__(lang_filter=lang_filter)
        # Instantiated in this exact order; the list order is kept as-is.
        member_classes = (
            UTF8Prober,
            SJISProber,
            EUCJPProber,
            GB2312Prober,
            EUCKRProber,
            CP949Prober,
            Big5Prober,
            EUCTWProber,
        )
        self.probers = [member_cls() for member_cls in member_classes]
        self.reset()
build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    :returns: ``'<name>: <encoding> with confidence <confidence>'`` when the
              detector reports an encoding, otherwise ``'<name>: no result'``.
    """
    u = UniversalDetector()
    for line in lines:
        u.feed(bytearray(line))
        # shortcut out of the loop to save reading further - particularly
        # useful if we read a BOM.
        if u.done:
            break
    u.close()
    result = u.result
    # Only byte-string names need decoding on Python 2. Calling .decode() on
    # a name that is already text forces an implicit ASCII encode first,
    # which fails for non-ASCII names.
    if PY2 and isinstance(name, bytes):
        name = name.decode(sys.getfilesystemencoding(), 'ignore')
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
                                                     result['confidence'])
    return '{0}: no result'.format(name)


def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Each input is described on stdout via ``description_of``. Files opened
    by argparse on our behalf are closed once they have been processed; the
    process-wide standard input stream is left open.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    # Get command line arguments.
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the source indentation in the
    # help text.
    parser = argparse.ArgumentParser(
        description="Takes one or more file paths and reports their "
                    "detected encodings")
    parser.add_argument('input',
                        help='File whose encoding we would like to determine. '
                             '(default: stdin)',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[sys.stdin if PY2 else sys.stdin.buffer])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # argparse.FileType hands back the standard input stream for "-" and we
    # use it as the default; those must never be closed here.
    std_streams = (sys.stdin, getattr(sys.stdin, 'buffer', None))

    for f in args.input:
        try:
            if f.isatty():
                print("You are running chardetect interactively. Press " +
                      "CTRL-D twice at the start of a blank line to signal the " +
                      "end of your input. If you want help, run chardetect " +
                      "--help\n", file=sys.stderr)
            print(description_of(f, f.name))
        finally:
            if not any(f is stream for stream in std_streams):
                f.close()
class UTF8Prober(CharSetProber):
    """
    Prober that checks input against the UTF-8 coding state machine and
    derives its confidence from the number of multi-byte characters seen.
    """
    ONE_CHAR_PROB = 0.5

    def __init__(self):
        super(UTF8Prober, self).__init__()
        self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
        self._num_mb_chars = None
        self.reset()

    def reset(self):
        """Reset the base prober, the state machine and the character count."""
        super(UTF8Prober, self).reset()
        self.coding_sm.reset()
        self._num_mb_chars = 0

    @property
    def charset_name(self):
        """Name of the charset this prober stands for."""
        return "utf-8"

    @property
    def language(self):
        """Language label; empty for this prober."""
        return ""

    def feed(self, byte_str):
        """
        Push ``byte_str`` through the state machine one byte at a time and
        return the resulting probing state.
        """
        for byte in byte_str:
            verdict = self.coding_sm.next_state(byte)
            if verdict == MachineState.ERROR:
                self._state = ProbingState.NOT_ME
                break
            if verdict == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            # Back at START means a character was completed; only those
            # at least two bytes long are counted.
            if (verdict == MachineState.START and
                    self.coding_sm.get_current_charlen() >= 2):
                self._num_mb_chars += 1

        # Still undecided: settle early once the confidence is high enough.
        if (self.state == ProbingState.DETECTING and
                self.get_confidence() > self.SHORTCUT_THRESHOLD):
            self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """
        Return the confidence that the input is UTF-8.

        With six or more multi-byte characters the confidence is 0.99.
        Below that, the 0.99 doubt is scaled by ``ONE_CHAR_PROB`` once per
        multi-byte character seen and the result is subtracted from 1.0.
        """
        doubt = 0.99
        if self._num_mb_chars >= 6:
            return doubt
        doubt *= self.ONE_CHAR_PROB ** self._num_mb_chars
        return 1.0 - doubt
capa matches based on the detected capabilities 21 | 22 | ![](img/plugin_interface_example_1.png) 23 | 24 | ## Installation 25 | 26 | First locate the directory used by cutter for loading plugins. 27 | 28 | The simplest way to do this is to open cutter and go to Edit menu -> Preferences -> Plugins. 29 | The directory you are looking for should be displayed at the top. 30 | 31 | Inside that directory you should find a directory named python. Download or clone this repository, and move capa_explorer_plugin to the python directory. 32 | 33 | ## Usage 34 | 35 | Use capa standalone which can be downloaded here 36 | 37 | https://github.com/mandiant/capa/releases 38 | 39 | Issue the following command to create a JSON report of the binary. 40 | 41 | `capa.exe -j sample.exe > sample.exe.json` 42 | 43 | Open the binary in Cutter and select "Load JSON file" in the drop down menu in the top right corner of the capa explorer widget to load the report. 44 | 45 | ## Known limitations 46 | - The plugin currently uses r2's ecH command to highlight instructions; while this works, the support seems limited in Cutter and at times it can be slow. Ideally BIHighlighter should be used but due to a bug this is currently not exposed in CutterCore. https://github.com/radareorg/cutter/issues/2395 47 | - The main difference between this plugin and the IDA version is that this plugin does not implement a feature extractor and relies on the JSON exports from the IDA plugin or the standalone tool. I have not looked into the possibility of implementing feature extraction with radare2. This may or may not be something I do in the future. 48 | - The plugin has been developed for Cutter version 1.12 and I've experienced crashes with earlier versions so make sure to check the version you are running if you are experiencing issues. 49 | - This is my first attempt at a Cutter plugin so there are very likely bugs. 50 | - This plugin does not currently handle rebasing; that's on the todo list.
51 | 52 | ## Other issues 53 | If you encounter bugs or have suggestions which are not among the known limitations please create an issue, or even better, a pull request. 54 | 55 | ## Credits 56 | I want to thank Mandiant and the FLARE team for creating capa and making it available to everyone. Most of the code in this repo is taken directly from the official capa IDA plugin and has received slight modifications to make it work in Cutter. The main goal was to make the user experience as close as possible to the original plugin. 57 | 58 | https://github.com/mandiant/capa 59 | -------------------------------------------------------------------------------- /capa_explorer_plugin/chardet/mbcharsetprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # Proofpoint, Inc. 13 | # 14 | # This library is free software; you can redistribute it and/or 15 | # modify it under the terms of the GNU Lesser General Public 16 | # License as published by the Free Software Foundation; either 17 | # version 2.1 of the License, or (at your option) any later version. 18 | # 19 | # This library is distributed in the hope that it will be useful, 20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 | # Lesser General Public License for more details. 
class MultiByteCharSetProber(CharSetProber):
    """
    Common base for probers that pair a coding state machine with a
    character distribution analyzer.

    ``coding_sm`` and ``distribution_analyzer`` start out as ``None`` here;
    ``feed`` and ``get_confidence`` use them, so they must be assigned
    before those methods are called.
    """

    def __init__(self, lang_filter=None):
        super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
        self.distribution_analyzer = None
        self.coding_sm = None
        # [last byte of the previous buffer, first byte of the current one]
        self._last_char = [0, 0]

    def reset(self):
        """Reset the base prober state and whichever helpers are assigned."""
        super(MultiByteCharSetProber, self).reset()
        for helper in (self.coding_sm, self.distribution_analyzer):
            if helper:
                helper.reset()
        self._last_char = [0, 0]

    @property
    def charset_name(self):
        raise NotImplementedError

    @property
    def language(self):
        raise NotImplementedError

    def feed(self, byte_str):
        """
        Run ``byte_str`` through the state machine one byte at a time,
        handing every completed character to the distribution analyzer,
        and return the resulting probing state.
        """
        for pos, byte in enumerate(byte_str):
            sm_state = self.coding_sm.next_state(byte)
            if sm_state == MachineState.ERROR:
                self.logger.debug('%s %s prober hit error at byte %s',
                                  self.charset_name, self.language, pos)
                self._state = ProbingState.NOT_ME
                break
            if sm_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            if sm_state != MachineState.START:
                continue

            # Back in START: a character has just been completed.
            char_len = self.coding_sm.get_current_charlen()
            if pos == 0:
                # The character may have begun in the previous buffer, so
                # combine the remembered byte with the first byte of this one.
                self._last_char[1] = byte_str[0]
                char_bytes = self._last_char
            else:
                char_bytes = byte_str[pos - 1:pos + 1]
            self.distribution_analyzer.feed(char_bytes, char_len)

        # Remember the final byte for the next call.
        self._last_char[0] = byte_str[-1]

        if (self.state == ProbingState.DETECTING
                and self.distribution_analyzer.got_enough_data()
                and self.get_confidence() > self.SHORTCUT_THRESHOLD):
            self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the distribution analyzer's confidence."""
        return self.distribution_analyzer.get_confidence()
class SBCSGroupProber(CharSetGroupProber):
    """
    Group prober holding one ``SingleByteCharSetProber`` per single-byte
    language model, plus the Hebrew prober trio.
    """

    def __init__(self):
        super(SBCSGroupProber, self).__init__()

        # TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
        #       after we retrain model (Latin2HungarianModel and
        #       Win1250HungarianModel would sit after the Bulgarian models).
        plain_models = (
            Win1251CyrillicModel,
            Koi8rModel,
            Latin5CyrillicModel,
            MacCyrillicModel,
            Ibm866Model,
            Ibm855Model,
            Latin7GreekModel,
            Win1253GreekModel,
            Latin5BulgarianModel,
            Win1251BulgarianModel,
            TIS620ThaiModel,
            Latin5TurkishModel,
        )
        self.probers = [SingleByteCharSetProber(model)
                        for model in plain_models]

        # Both Hebrew model probers use the same Win1255 model, one with the
        # second argument False ("logical") and one with True ("visual"),
        # and both are linked to a shared HebrewProber.
        hebrew_prober = HebrewProber()
        logical_hebrew_prober = SingleByteCharSetProber(
            Win1255HebrewModel, False, hebrew_prober)
        visual_hebrew_prober = SingleByteCharSetProber(
            Win1255HebrewModel, True, hebrew_prober)
        hebrew_prober.set_model_probers(logical_hebrew_prober,
                                        visual_hebrew_prober)
        self.probers.extend((hebrew_prober,
                             logical_hebrew_prober,
                             visual_hebrew_prober))

        self.reset()
class CodingStateMachine(object):
    """
    A state machine to verify a byte sequence for a particular encoding.

    The detector feeds each byte it receives to every active state machine,
    one byte at a time. A machine moves to its next state based on its
    current state and the byte it was given. Three states matter to an
    auto-detector:

    START state: The initial state, also re-entered once a legal byte
                 sequence (i.e. a valid code point) for a character has
                 been identified.

    ME state:    The machine saw a byte sequence that is specific to the
                 charset it was designed for and that no other possible
                 encoding can contain. This leads to an immediate positive
                 answer for the detector.

    ERROR state: The machine saw an illegal byte sequence for its encoding.
                 This leads to an immediate negative answer; the detector
                 excludes the encoding from consideration from then on.
    """

    def __init__(self, sm):
        self._model = sm
        self._curr_byte_pos = self._curr_char_len = 0
        self._curr_state = None
        self.logger = logging.getLogger(__name__)
        self.reset()

    def reset(self):
        """Put the machine back into the START state."""
        self._curr_state = MachineState.START

    def next_state(self, c):
        """
        Advance the machine by one byte ``c`` and return the new state.
        """
        model = self._model
        # Every byte is first mapped to its class.
        byte_class = model['class_table'][c]
        if self._curr_state == MachineState.START:
            # First byte of a character: its class also fixes the
            # character's length.
            self._curr_byte_pos = 0
            self._curr_char_len = model['char_len_table'][byte_class]
        # The state table is flattened: row = current state, column = class.
        table_index = self._curr_state * model['class_factor'] + byte_class
        self._curr_state = model['state_table'][table_index]
        self._curr_byte_pos += 1
        return self._curr_state

    def get_current_charlen(self):
        """Return the length recorded for the current character."""
        return self._curr_char_len

    def get_coding_state_machine(self):
        """Return the ``name`` entry of the underlying model."""
        return self._model['name']

    @property
    def language(self):
        return self._model['language']
class EUCJPProber(MultiByteCharSetProber):
    """
    EUC-JP prober: combines the EUC-JP state machine with both a
    distribution analyzer and a context analyzer.
    """

    def __init__(self):
        super(EUCJPProber, self).__init__()
        self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
        self.distribution_analyzer = EUCJPDistributionAnalysis()
        self.context_analyzer = EUCJPContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the inherited state, then the context analyzer."""
        super(EUCJPProber, self).reset()
        self.context_analyzer.reset()

    @property
    def charset_name(self):
        return "EUC-JP"

    @property
    def language(self):
        return "Japanese"

    def feed(self, byte_str):
        """
        Run ``byte_str`` through the state machine, hand every completed
        character to both analyzers, and return the probing state.
        """
        # PY3K: iterating a byte array yields ints, not bytes.
        for pos, byte in enumerate(byte_str):
            sm_state = self.coding_sm.next_state(byte)
            if sm_state == MachineState.ERROR:
                self.logger.debug('%s %s prober hit error at byte %s',
                                  self.charset_name, self.language, pos)
                self._state = ProbingState.NOT_ME
                break
            if sm_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            if sm_state != MachineState.START:
                continue

            # Back in START: a character has just been completed.
            char_len = self.coding_sm.get_current_charlen()
            if pos == 0:
                # Pair the byte remembered from the previous buffer with
                # the first byte of this one.
                self._last_char[1] = byte_str[0]
                char_bytes = self._last_char
            else:
                char_bytes = byte_str[pos - 1:pos + 1]
            self.context_analyzer.feed(char_bytes, char_len)
            self.distribution_analyzer.feed(char_bytes, char_len)

        # Remember the final byte for the next call.
        self._last_char[0] = byte_str[-1]

        if (self.state == ProbingState.DETECTING
                and self.context_analyzer.got_enough_data()
                and self.get_confidence() > self.SHORTCUT_THRESHOLD):
            self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the larger of the context and distribution confidences."""
        context_conf = self.context_analyzer.get_confidence()
        distrib_conf = self.distribution_analyzer.get_confidence()
        return max(context_conf, distrib_conf)
class SJISProber(MultiByteCharSetProber):
    """
    Shift_JIS prober: combines the SJIS state machine with both a
    distribution analyzer and a context analyzer. The reported charset name
    is taken from the context analyzer.
    """

    def __init__(self):
        super(SJISProber, self).__init__()
        self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
        self.distribution_analyzer = SJISDistributionAnalysis()
        self.context_analyzer = SJISContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the inherited state, then the context analyzer."""
        super(SJISProber, self).reset()
        self.context_analyzer.reset()

    @property
    def charset_name(self):
        return self.context_analyzer.charset_name

    @property
    def language(self):
        return "Japanese"

    def feed(self, byte_str):
        """
        Run ``byte_str`` through the state machine, hand every completed
        character to both analyzers, and return the probing state.
        """
        for pos, byte in enumerate(byte_str):
            sm_state = self.coding_sm.next_state(byte)
            if sm_state == MachineState.ERROR:
                self.logger.debug('%s %s prober hit error at byte %s',
                                  self.charset_name, self.language, pos)
                self._state = ProbingState.NOT_ME
                break
            if sm_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            if sm_state != MachineState.START:
                continue

            # Back in START: a character has just been completed.
            char_len = self.coding_sm.get_current_charlen()
            if pos == 0:
                self._last_char[1] = byte_str[0]
                # The context analyzer gets a window whose start depends on
                # the character length; the distribution analyzer always
                # gets the full two-byte pair.
                context_window = self._last_char[2 - char_len:]
                distribution_window = self._last_char
            else:
                context_window = byte_str[pos + 1 - char_len:
                                          pos + 3 - char_len]
                distribution_window = byte_str[pos - 1:pos + 1]
            self.context_analyzer.feed(context_window, char_len)
            self.distribution_analyzer.feed(distribution_window, char_len)

        # Remember the final byte for the next call.
        self._last_char[0] = byte_str[-1]

        if (self.state == ProbingState.DETECTING
                and self.context_analyzer.got_enough_data()
                and self.get_confidence() > self.SHORTCUT_THRESHOLD):
            self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the larger of the context and distribution confidences."""
        context_conf = self.context_analyzer.get_confidence()
        distrib_conf = self.distribution_analyzer.get_confidence()
        return max(context_conf, distrib_conf)
class CharSetGroupProber(CharSetProber):
    """
    Prober that forwards input to a list of child probers and reports the
    charset, language and confidence of the best one.

    Subclasses fill ``self.probers``; ``reset`` marks every child as active
    and ``feed`` deactivates children that report ``NOT_ME``.
    """

    def __init__(self, lang_filter=None):
        super(CharSetGroupProber, self).__init__(lang_filter=lang_filter)
        self._active_num = 0
        self.probers = []
        self._best_guess_prober = None

    def reset(self):
        """Reset this prober and re-activate every child prober."""
        super(CharSetGroupProber, self).reset()
        self._active_num = 0
        for prober in self.probers:
            if prober:
                prober.reset()
                prober.active = True
                self._active_num += 1
        self._best_guess_prober = None

    @property
    def charset_name(self):
        """Charset name of the best child prober, or None if there is none."""
        if not self._best_guess_prober:
            # get_confidence() picks the best guess as a side effect.
            self.get_confidence()
            if not self._best_guess_prober:
                return None
        return self._best_guess_prober.charset_name

    @property
    def language(self):
        """Language of the best child prober, or None if there is none."""
        if not self._best_guess_prober:
            # get_confidence() picks the best guess as a side effect.
            self.get_confidence()
            if not self._best_guess_prober:
                return None
        return self._best_guess_prober.language

    def feed(self, byte_str):
        """
        Feed ``byte_str`` to every active child prober and return this
        group's probing state.

        The group becomes ``FOUND_IT`` as soon as one child reports
        ``FOUND_IT`` and ``NOT_ME`` once every child has been ruled out.
        """
        for prober in self.probers:
            if not prober or not prober.active:
                continue
            state = prober.feed(byte_str)
            if not state:
                # Falsy state: nothing conclusive from this child yet
                # (presumably ProbingState.DETECTING; confirm against enums).
                continue
            if state == ProbingState.FOUND_IT:
                self._best_guess_prober = prober
                # Propagate the verdict to the group itself. Without this
                # the group kept reporting its previous state, so callers
                # never saw FOUND_IT and the FOUND_IT branch of
                # get_confidence() could not be reached through feed().
                self._state = ProbingState.FOUND_IT
                return self.state
            if state == ProbingState.NOT_ME:
                prober.active = False
                self._active_num -= 1
                if self._active_num <= 0:
                    self._state = ProbingState.NOT_ME
                    return self.state
        return self.state

    def get_confidence(self):
        """
        Return 0.99 when the group is ``FOUND_IT``, 0.01 when it is
        ``NOT_ME``, otherwise the highest confidence among the active child
        probers (0.0 if none qualifies). In the last case the winning child
        is stored as the best guess.
        """
        state = self.state
        if state == ProbingState.FOUND_IT:
            return 0.99
        if state == ProbingState.NOT_ME:
            return 0.01

        best_conf = 0.0
        self._best_guess_prober = None
        for prober in self.probers:
            if not prober:
                continue
            if not prober.active:
                self.logger.debug('%s not active', prober.charset_name)
                continue
            conf = prober.get_confidence()
            self.logger.debug('%s %s confidence = %s',
                              prober.charset_name, prober.language, conf)
            if best_conf < conf:
                best_conf = conf
                self._best_guess_prober = prober
        if not self._best_guess_prober:
            return 0.0
        return best_conf
class EscCharSetProber(CharSetProber):
    """
    This CharSetProber uses a "code scheme" approach for detecting encodings,
    whereby easily recognizable escape or shift sequences are relied on to
    identify these encodings.
    """

    def __init__(self, lang_filter=None):
        super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
        # State machine models enabled by each language filter bit, in the
        # order the machines are created.
        models_by_filter = (
            (LanguageFilter.CHINESE_SIMPLIFIED,
             (HZ_SM_MODEL, ISO2022CN_SM_MODEL)),
            (LanguageFilter.JAPANESE, (ISO2022JP_SM_MODEL,)),
            (LanguageFilter.KOREAN, (ISO2022KR_SM_MODEL,)),
        )
        self.coding_sm = []
        for filter_bit, models in models_by_filter:
            if self.lang_filter & filter_bit:
                self.coding_sm.extend(CodingStateMachine(model)
                                      for model in models)
        self.active_sm_count = None
        self._detected_charset = None
        self._detected_language = None
        self._state = None
        self.reset()

    def reset(self):
        """Re-activate and reset every state machine; forget any detection."""
        super(EscCharSetProber, self).reset()
        for machine in self.coding_sm:
            if machine:
                machine.active = True
                machine.reset()
        self.active_sm_count = len(self.coding_sm)
        self._detected_charset = None
        self._detected_language = None

    @property
    def charset_name(self):
        return self._detected_charset

    @property
    def language(self):
        return self._detected_language

    def get_confidence(self):
        """Return 0.99 once a charset has been detected, else 0.00."""
        return 0.99 if self._detected_charset else 0.00

    def feed(self, byte_str):
        """
        Feed every byte to every active state machine and return the
        probing state; stops at the first machine that identifies its
        encoding or when the last active machine errors out.
        """
        for byte in byte_str:
            for machine in self.coding_sm:
                if not machine or not machine.active:
                    continue
                verdict = machine.next_state(byte)
                if verdict == MachineState.ERROR:
                    machine.active = False
                    self.active_sm_count -= 1
                    if self.active_sm_count <= 0:
                        self._state = ProbingState.NOT_ME
                        return self.state
                elif verdict == MachineState.ITS_ME:
                    self._state = ProbingState.FOUND_IT
                    self._detected_charset = machine.get_coding_state_machine()
                    self._detected_language = machine.language
                    return self.state

        return self.state
def log(msg):
    """log to cutter console

    @param msg: message to log
    """
    cutter.message(f"[capa explorer]: {msg}")


def trigger_flags_changed():
    """ask Cutter's core to emit its flags-changed notification"""
    cutter.core().triggerFlagsChanged()


def trigger_function_renamed(rva, new_name):
    """ask Cutter's core to emit its function-renamed notification

    @param rva: address passed through to Cutter
    @param new_name: new function name passed through to Cutter
    """
    cutter.core().triggerFunctionRenamed(rva, new_name)


def trigger_refresh():
    """refresh Cutter"""
    cutter.refresh()


def get_config_color(conf_name):
    """look up a color by name in Cutter's configuration

    @param conf_name: name of the configured color
    """
    return CutterBindings.Configuration.instance().getColor(conf_name)


def define_function(location, name):
    """create a function at location; failures are logged, not raised"""
    try:
        cutter.core().createFunctionAt(location, name)
    except Exception as e:
        # best effort: report to the console and keep the plugin alive
        log(str(e))


def rename_function(location, new_name):
    """rename the function at location; failures are logged, not raised"""
    try:
        cutter.core().renameFunction(location, new_name)
    except Exception as e:
        # best effort: report to the console and keep the plugin alive
        log(str(e))


def smart_rename_function(location, name):
    """rename the function containing location, keeping user-chosen names

    A function whose name starts with 'fcn.' is renamed to name; any other
    function keeps its current name as a prefix ('<old>__<name>'). If no
    function exists at location, analysis is triggered first. When there is
    still no function afterwards, the problem is logged and nothing is
    renamed.

    @param location: address inside (or at the start of) the function
    @param name: new name, or name suffix
    """
    function_info = cutter.cmdj('afij @%s' % location)

    if not function_info:
        # no function at location, trigger analysis
        # and refetch function info
        cutter.cmd('af @ %s' % location)
        function_info = cutter.cmdj('afij @%s' % location)

    if not function_info:
        # analysis did not produce a function; indexing [0] would crash
        log('no function found at %s, skipping rename' % location)
        return

    old_fn_name = function_info[0]['name']

    # Sometimes the vivisect feature extractor identifies
    # addresses in the middle of functions as function starts
    # for some reason. We get the actual addr with r2.
    actual_offset = function_info[0]['offset']

    if not old_fn_name.startswith('fcn.'):
        # Function does not have generic name, keep old name as prefix
        name = f'{old_fn_name}__{name}'
    cutter.cmd('afn {} @ {}'.format(name, actual_offset))


def create_flagspace(flagspace):
    """select (creating it if needed) the flagspace via the 'fs' command"""
    cutter.cmd('fs %s' % flagspace)


def create_flag(name, location):
    """create a flag at location; failures are logged, not raised"""
    try:
        cutter.cmd('f %s @ %s' % (name, location))
    except Exception as e:
        # best effort: report to the console and keep the plugin alive
        log(str(e))


def highlight_locations(locations):
    """highlight every location in red with 'ecHi', then refresh Cutter"""
    cutter.cmd('ecHi red @@=%s' % ' '.join(str(x) for x in locations))
    cutter.refresh()


def unhighlight_locations(locations):
    """remove the highlight from every location with 'ecH-', then refresh"""
    cutter.cmd('ecH- @@=%s' % ' '.join(str(x) for x in locations))
    cutter.refresh()


def seek(location):
    """move Cutter's current offset to location"""
    cutter.core().seek(location)


def load_capa_json(path):
    """read a capa JSON report, detecting its text encoding

    @param path: path of the JSON report
    @return: the parsed JSON document
    """
    with open(path, 'rb') as f:
        rdata = f.read()
    # chardet reports None when it cannot determine the encoding (e.g. for
    # an empty file); fall back to UTF-8 instead of passing None to decode()
    encoding = chardet.detect(rdata)['encoding'] or 'utf-8'
    return json.loads(rdata.decode(encoding))


def get_name(location):
    """return the name of the function at location

    @return: the function name, or 'undefined.<location>' if there is none
    """
    function_info = cutter.cmdj('afij @%s' % location)
    if function_info:
        return f"{function_info[0]['name']}"
    return 'undefined.%s' % location


def get_function_boundries_at_current_location():
    """return (minbound, maxbound) of the current function, or None"""
    function_info = cutter.cmdj('afij')
    if function_info:
        return (function_info[0]['minbound'], function_info[0]['maxbound'])
    return None


def get_disasm(location):
    """return the disassembly text of the instruction at location

    @return: the 'disasm' field of the first 'pdj 1' entry, or 'N/A' when
             there is no entry or the field is missing/empty
    """
    feature_disasm = cutter.cmdj("pdj 1 @%s" % location)

    # guard against an empty result before indexing
    if feature_disasm and feature_disasm[0].get('disasm'):
        return feature_disasm[0].get('disasm')
    return 'N/A'


def r2_rule_name(rule_info):
    """turn a rule title into a flag-friendly name

    Strips a '(<n> matches)' suffix together with the character in front of
    it, then replaces spaces with underscores.
    """
    return re.sub(r'.\(\d+ matches\)', '', rule_info).replace(' ', '_')


def capability_rules(doc):
    """enumerate the 'capability' rules of a result document, ordered by
    rule name (not lib/subscope/disposition/etc).

    Rules whose meta carries a truthy value for any of the excluded keys
    below are skipped.
    """
    excluded_meta_keys = (
        "lib",
        "capa/subscope",
        "maec/analysis-conclusion",
        "maec/analysis-conclusion-ov",
        "maec/malware-category",
        "maec/malware-category-ov",
    )

    by_name = sorted(doc["rules"].values(),
                     key=lambda rule: rule["meta"]["name"])
    for rule in by_name:
        meta = rule["meta"]
        if any(meta.get(key) for key in excluded_meta_keys):
            continue
        yield rule


def escape_string(s: str) -> str:
    """escape special characters"""
    s = repr(s)
    if not s.startswith(('"', "'")):
        # u'hello\r\nworld' -> hello\\r\\nworld
        s = s[2:-1]
    else:
        # 'hello\r\nworld' -> hello\\r\\nworld
        s = s[1:-1]
    s = s.replace("\\'", "'")  # repr() may escape "'" in some edge cases, remove
    s = s.replace('"', '\\"')  # repr() does not escape '"', add
    return s
class CharSetProber(object):
    """Base class for charset probers; also hosts the shared byte filters."""

    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter=None):
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        """Put the prober back into the DETECTING state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        """The base prober names no charset."""
        return None

    def feed(self, buf):
        """Accept a chunk of bytes; the base implementation ignores it."""
        pass

    @property
    def state(self):
        """Current probing state (``None`` until ``reset`` is called)."""
        return self._state

    def get_confidence(self):
        """The base prober never has any confidence."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        """Collapse every run of bytes 0x00-0x7F in ``buf`` to one space."""
        return re.sub(b'([\x00-\x7F])+', b' ', buf)

    @staticmethod
    def filter_international_words(buf):
        """Keep only the words of ``buf`` that contain a high byte.

        Bytes fall into three groups: English letters (a-z, A-Z),
        international bytes (0x80-0xFF) and markers (anything else).  A word
        is a run of letters/international bytes holding at least one
        international byte, optionally followed by a single marker.  Every
        such word is kept; a trailing marker is rewritten to a space because
        markers are used alike across languages and carry no signal.
        """
        kept = bytearray()

        for match in re.finditer(
                b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?', buf):
            word = match.group()
            body, tail = word[:-1], word[-1:]
            kept.extend(body)
            # A low, non-alphabetic final byte is the marker: normalise it.
            if tail < b'\x80' and not tail.isalpha():
                tail = b' '
            kept.extend(tail)

        return kept

    @staticmethod
    def filter_with_english_letters(buf):
        """Return the letter/high-byte runs of ``buf`` found outside ``<...>``.

        Runs are separated by a single space in the result.  A run that sits
        directly in front of a ``>`` is kept as well, since the tag is
        considered closed before the run is examined.
        """
        kept = bytearray()
        inside_tag = False
        run_start = 0

        for pos in range(len(buf)):
            # Slice (not index) so we get bytes rather than an int on Python 3
            ch = buf[pos:pos + 1]
            if ch == b'>':
                inside_tag = False
            elif ch == b'<':
                inside_tag = True

            if ch >= b'\x80' or ch.isalpha():
                # still inside a run of interesting bytes
                continue

            # ``ch`` is a delimiter: flush the pending run unless in a tag
            if pos > run_start and not inside_tag:
                kept.extend(buf[run_start:pos])
                kept.extend(b' ')
            run_start = pos + 1

        if not inside_tag:
            # whatever follows the last delimiter
            kept.extend(buf[run_start:])

        return kept
FREQ_CAT_NUM = 4

UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
CLASS_NUM = 8  # total classes

# Character class of every byte value, indexed by the byte itself.
Latin1_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   # 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   # 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   # 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   # C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   # C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   # D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   # D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   # E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   # E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   # F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   # F8 - FF
)

# Likelihood of class pair (previous row, current column):
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = (
# UDF OTH ASC ASS ACV ACO ASV ASO
    0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  # ASO
)


class Latin1Prober(CharSetProber):
    """Prober that scores input by the likelihood of adjacent byte classes."""

    def __init__(self):
        super(Latin1Prober, self).__init__()
        self._last_char_class = None
        self._freq_counter = None
        self.reset()

    def reset(self):
        """Clear the likelihood counters and restart detection."""
        self._last_char_class = OTH
        self._freq_counter = [0] * FREQ_CAT_NUM
        CharSetProber.reset(self)

    @property
    def charset_name(self):
        return "ISO-8859-1"

    @property
    def language(self):
        return ""

    def feed(self, byte_str):
        """Count class-pair likelihoods for ``byte_str``; return the state.

        The input is first reduced with ``filter_with_english_letters``.  An
        illegal pair (likelihood 0) marks the prober NOT_ME and stops the scan.
        """
        for byte in self.filter_with_english_letters(byte_str):
            current_class = Latin1_CharToClass[byte]
            pair_index = (self._last_char_class * CLASS_NUM) + current_class
            likelihood = Latin1ClassModel[pair_index]
            if likelihood == 0:
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[likelihood] += 1
            self._last_char_class = current_class

        return self.state

    def get_confidence(self):
        """Return the confidence derived from the likelihood counters."""
        if self.state == ProbingState.NOT_ME:
            return 0.01

        counters = self._freq_counter
        total = sum(counters)
        if total < 0.01:
            score = 0.0
        else:
            score = (counters[3] - counters[1] * 20.0) / total
        if score < 0.0:
            score = 0.0
        # lower the confidence of latin1 so that other more accurate
        # detector can take priority.
        return score * 0.73
class SingleByteCharSetProber(CharSetProber):
    """Prober driven by a single-byte language model (order map + matrix)."""

    SAMPLE_SIZE = 64
    SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, reversed=False, name_prober=None):
        super(SingleByteCharSetProber, self).__init__()
        self._model = model
        # TRUE if we need to reverse every pair in the model lookup
        self._reversed = reversed
        # Optional auxiliary prober for name decision
        self._name_prober = name_prober
        self._last_order = None
        self._seq_counters = None
        self._total_seqs = None
        self._total_char = None
        self._freq_char = None
        self.reset()

    def reset(self):
        """Restart detection and zero every counter."""
        super(SingleByteCharSetProber, self).reset()
        # char order of last character
        self._last_order = 255
        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        self._total_seqs = 0
        self._total_char = 0
        # characters that fall in our sampling range
        self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name, delegated to the name prober when one is set."""
        if self._name_prober:
            return self._name_prober.charset_name
        return self._model['charset_name']

    @property
    def language(self):
        """Language, delegated to the name prober when one is set."""
        if self._name_prober:
            return self._name_prober.language
        return self._model.get('language')

    def feed(self, byte_str):
        """Update the sequence statistics with ``byte_str``; return the state."""
        if not self._model['keep_english_letter']:
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state

        order_of = self._model['char_to_order_map']
        sample_size = self.SAMPLE_SIZE
        for byte in byte_str:
            # XXX: Order is in range 1-64, so one would think we want 0-63 here,
            #      but that leads to 27 more test failures than before.
            order = order_of[byte]
            # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
            #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
            #      to make it closer to the original intent. The only difference
            #      is whether or not we count digits and control characters for
            #      _total_char purposes.
            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            if order < sample_size:
                self._freq_char += 1
                if self._last_order < sample_size:
                    self._total_seqs += 1
                    if self._reversed:
                        # reverse the order of the letters in the lookup
                        cell = (order * sample_size) + self._last_order
                    else:
                        cell = (self._last_order * sample_size) + order
                    likelihood = self._model['precedence_matrix'][cell]
                    self._seq_counters[likelihood] += 1
            self._last_order = order

        charset_name = self._model['charset_name']
        enough_data = self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD
        if self.state == ProbingState.DETECTING and enough_data:
            confidence = self.get_confidence()
            if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
                self.logger.debug('%s confidence = %s, we have a winner',
                                  charset_name, confidence)
                self._state = ProbingState.FOUND_IT
            elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
                self.logger.debug('%s confidence = %s, below negative '
                                  'shortcut threshhold %s', charset_name,
                                  confidence,
                                  self.NEGATIVE_SHORTCUT_THRESHOLD)
                self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        """Return the confidence computed from the sequence counters.

        ``0.01`` is returned while no sequence has been counted; a computed
        value of ``1.0`` or more is reported as ``0.99``.
        """
        if not self._total_seqs > 0:
            return 0.01

        positive = self._seq_counters[SequenceLikelihood.POSITIVE]
        r = ((1.0 * positive) /
             self._total_seqs / self._model['typical_positive_ratio'])
        r = r * self._freq_char / self._total_char
        if r >= 1.0:
            r = 0.99
        return r
class CapaExplorerRangeProxyModel(QtCore.QSortFilterProxyModel):
    """Filter results to a virtual address range.

    Implements the "limit results by current function" checkbox of the
    plugin UI.  A minimum and a maximum virtual address bound the accepted
    rows, which lets basic blocks be included when limiting results to a
    specific function.
    """

    def __init__(self, parent=None):
        """initialize proxy filter"""
        super(CapaExplorerRangeProxyModel, self).__init__(parent)
        self.min_ea = None
        self.max_ea = None

    def lessThan(self, left, right):
        """return True if left item is less than right item, else False

        Virtual-address cells are compared numerically; everything else is
        compared as lowercase text.  Empty cells (``None``) sort as ``""``
        and an address that is not valid hex falls back to text comparison,
        so sorting never raises.

        @param left: QModelIndex of left
        @param right: QModelIndex of right
        """
        ldata = left.internalPointer().data(left.column())
        rdata = right.internalPointer().data(right.column())

        if (
            ldata
            and rdata
            and left.column() == CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS
            and left.column() == right.column()
        ):
            try:
                # convert virtual address before compare
                return int(ldata, 16) < int(rdata, 16)
            except ValueError:
                # not hex after all: compare as text below
                pass

        # compare as lowercase; a missing cell counts as the empty string
        return (ldata or "").lower() < (rdata or "").lower()

    def filterAcceptsRow(self, row, parent):
        """return True if the row should be included in the model

        A row is kept when it matches, when any of its ancestors matches,
        or when any of its descendants matches.

        @param row: row number
        @param parent: QModelIndex of parent
        """
        if self.filter_accepts_row_self(row, parent):
            return True

        # any matching ancestor keeps the row visible
        alpha = parent
        while alpha.isValid():
            if self.filter_accepts_row_self(alpha.row(), alpha.parent()):
                return True
            alpha = alpha.parent()

        return self.index_has_accepted_children(row, parent)

    def index_has_accepted_children(self, row, parent):
        """return True if the row has one or more descendants that match filter

        @param row: row number
        @param parent: QModelIndex of parent
        """
        source_model = self.sourceModel()
        model_index = source_model.index(row, 0, parent)

        if model_index.isValid():
            for idx in range(source_model.rowCount(model_index)):
                if self.filter_accepts_row_self(idx, model_index):
                    return True
                if self.index_has_accepted_children(idx, model_index):
                    return True

        return False

    def filter_accepts_row_self(self, row, parent):
        """return True if filter accepts row, else False

        @param row: row number
        @param parent: QModelIndex of parent
        """
        # filter not (fully) set: comparing against a None bound would raise
        if self.min_ea is None or self.max_ea is None:
            return True

        index = self.sourceModel().index(row, 0, parent)
        data = index.internalPointer().data(CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS)

        # virtual address may be empty
        if not data:
            return False

        try:
            # convert virtual address str to int
            ea = int(data, 16)
        except ValueError:
            # a cell that is not a hex address cannot be inside the range
            return False

        return self.min_ea <= ea < self.max_ea

    def add_address_range_filter(self, min_ea, max_ea):
        """add new address range filter

        called when user checks "limit results by current function" in plugin UI

        @param min_ea: minimum virtual address (inclusive)
        @param max_ea: maximum virtual address (exclusive)
        """
        self.min_ea = min_ea
        self.max_ea = max_ea

        self.setFilterKeyColumn(CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS)
        self.invalidateFilter()

    def reset_address_range_filter(self):
        """remove address range filter (accept all results)

        called when user un-checks "limit results by current function" in plugin UI
        """
        self.min_ea = None
        self.max_ea = None
        self.invalidateFilter()


class CapaExplorerSearchProxyModel(QtCore.QSortFilterProxyModel):
    """A SortFilterProxyModel that accepts rows with a substring match for a configurable query.

    Looks for matches in the text of all rows and displays the entire tree
    row if any of its branches matches; that is, you can filter by rule
    name, or by "characteristic(nzxor)" to filter matches with some feature.
    """

    def __init__(self, parent=None):
        """initialize proxy filter with an empty query"""
        super(CapaExplorerSearchProxyModel, self).__init__(parent)
        self.query = ""
        self.setFilterKeyColumn(-1)  # all columns

    def filterAcceptsRow(self, row, parent):
        """true if the item in the row indicated by the given row and parent
        should be included in the model; otherwise returns false

        @param row: int
        @param parent: QModelIndex*
        @retval True/False
        """
        # this row matches, accept it
        if self.filter_accepts_row_self(row, parent):
            return True

        # the parent of this row matches, accept it
        alpha = parent
        while alpha.isValid():
            if self.filter_accepts_row_self(alpha.row(), alpha.parent()):
                return True
            alpha = alpha.parent()

        # this row is a parent, and a child matches, accept it
        return self.index_has_accepted_children(row, parent)

    def index_has_accepted_children(self, row, parent):
        """returns True if any descendant of the given row should be accepted"""
        source_model = self.sourceModel()
        model_index = source_model.index(row, 0, parent)

        if model_index.isValid():
            for idx in range(source_model.rowCount(model_index)):
                if self.filter_accepts_row_self(idx, model_index):
                    return True
                if self.index_has_accepted_children(idx, model_index):
                    return True

        return False

    def filter_accepts_row_self(self, row, parent):
        """returns True if the given row should be accepted"""
        if self.query == "":
            return True

        source_model = self.sourceModel()
        # case in-sensitive matching; lower the query once, not per column
        needle = self.query.lower()

        for column in (
            CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION,
            CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS,
            CapaExplorerDataModel.COLUMN_INDEX_DETAILS,
        ):
            index = source_model.index(row, column, parent)
            data = source_model.data(index, Qt.DisplayRole)

            if not data:
                continue

            if not isinstance(data, six.string_types):
                # sanity check: should already be a string, but double check
                continue

            if needle in data.lower():
                return True

        return False

    def set_query(self, query):
        """set the search text and re-run the filter"""
        self.query = query
        self.invalidateFilter()

    def reset_query(self):
        """clear the search text (accept all results)"""
        self.set_query("")
function(my_function) => my_function 13 | """ 14 | try: 15 | return display.split("(")[1].rstrip(")") 16 | except IndexError: 17 | return "" 18 | 19 | class CapaExplorerDataItem(object): 20 | """store data for CapaExplorerDataModel""" 21 | 22 | def __init__(self, parent, data): 23 | """initialize item""" 24 | self.pred = parent 25 | self._data = data 26 | self.children = [] 27 | self._checked = False 28 | 29 | # default state for item 30 | self.flags = ( 31 | QtCore.Qt.ItemIsEnabled 32 | | QtCore.Qt.ItemIsSelectable 33 | | QtCore.Qt.ItemIsTristate 34 | | QtCore.Qt.ItemIsUserCheckable 35 | ) 36 | 37 | if self.pred: 38 | self.pred.appendChild(self) 39 | 40 | def setIsEditable(self, isEditable=False): 41 | """modify item editable flags 42 | @param isEditable: True, can edit, False cannot edit 43 | """ 44 | if isEditable: 45 | self.flags |= QtCore.Qt.ItemIsEditable 46 | else: 47 | self.flags &= ~QtCore.Qt.ItemIsEditable 48 | 49 | def setChecked(self, checked): 50 | """set item as checked 51 | @param checked: True, item checked, False item not checked 52 | """ 53 | self._checked = checked 54 | 55 | def isChecked(self): 56 | """get item is checked""" 57 | return self._checked 58 | 59 | def appendChild(self, item): 60 | """add a new child to specified item 61 | @param item: CapaExplorerDataItem 62 | """ 63 | self.children.append(item) 64 | 65 | def child(self, row): 66 | """get child row 67 | @param row: row number 68 | """ 69 | return self.children[row] 70 | 71 | def childCount(self): 72 | """get child count""" 73 | return len(self.children) 74 | 75 | def columnCount(self): 76 | """get column count""" 77 | return len(self._data) 78 | 79 | def data(self, column): 80 | """get data at column 81 | @param: column number 82 | """ 83 | try: 84 | return self._data[column] 85 | except IndexError: 86 | return None 87 | 88 | def parent(self): 89 | """get parent""" 90 | return self.pred 91 | 92 | def row(self): 93 | """get row location""" 94 | if self.pred: 95 | return 
self.pred.children.index(self) 96 | return 0 97 | 98 | def setData(self, column, value): 99 | """set data in column 100 | @param column: column number 101 | @value: value to set (assume str) 102 | """ 103 | self._data[column] = value 104 | 105 | def children(self): 106 | """yield children""" 107 | for child in self.children: 108 | yield child 109 | 110 | def removeChildren(self): 111 | """remove children""" 112 | del self.children[:] 113 | 114 | def __str__(self): 115 | """get string representation of columns 116 | used for copy-n-paste operations 117 | """ 118 | return " ".join([data for data in self._data if data]) 119 | 120 | @property 121 | def info(self): 122 | """return data stored in information column""" 123 | return self._data[0] 124 | 125 | @property 126 | def location(self): 127 | """return data stored in location column""" 128 | try: 129 | # address stored as str, convert to int before return 130 | return int(self._data[1], 16) 131 | except ValueError: 132 | return None 133 | 134 | @property 135 | def details(self): 136 | """return data stored in details column""" 137 | return self._data[2] 138 | 139 | class CapaExplorerRuleItem(CapaExplorerDataItem): 140 | """store data for rule result""" 141 | 142 | fmt = "%s (%d matches)" 143 | 144 | def __init__(self, parent, name, namespace, count, source): 145 | """initialize item 146 | @param parent: parent node 147 | @param name: rule name 148 | @param namespace: rule namespace 149 | @param count: number of match for this rule 150 | @param source: rule source (tooltip) 151 | """ 152 | display = self.fmt % (name, count) if count > 1 else name 153 | super(CapaExplorerRuleItem, self).__init__(parent, [display, "", namespace]) 154 | self._source = source 155 | 156 | @property 157 | def source(self): 158 | """return rule source to display (tooltip)""" 159 | return self._source 160 | 161 | class CapaExplorerFunctionItem(CapaExplorerDataItem): 162 | """store data for function match""" 163 | 164 | fmt = "function(%s)" 
165 | 166 | def __init__(self, parent, location): 167 | """initialize item 168 | @param parent: parent node 169 | @param location: virtual address of function as seen by IDA 170 | """ 171 | super(CapaExplorerFunctionItem, self).__init__( 172 | parent, [self.fmt % get_name(location), location_to_hex(location), ""] 173 | ) 174 | 175 | @property 176 | def info(self): 177 | """return function name""" 178 | info = super(CapaExplorerFunctionItem, self).info 179 | display = info_to_name(info) 180 | return display if display else info 181 | 182 | @info.setter 183 | def info(self, display): 184 | """set function name 185 | called when user changes function name in plugin UI 186 | @param display: new function name to display 187 | """ 188 | self._data[0] = self.fmt % display 189 | 190 | class CapaExplorerBlockItem(CapaExplorerDataItem): 191 | """store data for basic block match""" 192 | 193 | fmt = "basic block(loc_%08X)" 194 | 195 | def __init__(self, parent, location): 196 | """initialize item 197 | @param parent: parent node 198 | @param location: virtual address of basic block as seen by IDA 199 | """ 200 | super(CapaExplorerBlockItem, self).__init__(parent, [self.fmt % int(location), location_to_hex(location), ""]) 201 | 202 | class CapaExplorerSubscopeItem(CapaExplorerDataItem): 203 | """store data for subscope match""" 204 | 205 | fmt = "subscope(%s)" 206 | 207 | def __init__(self, parent, scope): 208 | """initialize item 209 | @param parent: parent node 210 | @param scope: subscope name 211 | """ 212 | super(CapaExplorerSubscopeItem, self).__init__(parent, [self.fmt % scope, "", ""]) 213 | 214 | class CapaExplorerDefaultItem(CapaExplorerDataItem): 215 | """store data for default match e.g. 
statement (and, or)""" 216 | 217 | def __init__(self, parent, display, details="", location=None): 218 | """initialize item 219 | @param parent: parent node 220 | @param display: text to display in UI 221 | @param details: text to display in details section of UI 222 | @param location: virtual address as seen by IDA 223 | """ 224 | location = location_to_hex(location) if location else "" 225 | super(CapaExplorerDefaultItem, self).__init__(parent, [display, location, details]) 226 | 227 | class CapaExplorerFeatureItem(CapaExplorerDataItem): 228 | """store data for feature match""" 229 | 230 | def __init__(self, parent, display, location="", details=""): 231 | """initialize item 232 | @param parent: parent node 233 | @param display: text to display in UI 234 | @param details: text to display in details section of UI 235 | @param location: virtual address as seen by IDA 236 | """ 237 | location = location_to_hex(location) if location else "" 238 | super(CapaExplorerFeatureItem, self).__init__(parent, [display, location, details]) 239 | 240 | class CapaExplorerInstructionViewItem(CapaExplorerFeatureItem): 241 | """store data for instruction match""" 242 | 243 | def __init__(self, parent, display, location): 244 | """initialize item 245 | details section shows disassembly view for match 246 | @param parent: parent node 247 | @param display: text to display in UI 248 | @param location: virtual address as seen by IDA 249 | """ 250 | 251 | details = get_disasm(location) 252 | super(CapaExplorerInstructionViewItem, self).__init__(parent, display, location=location, details=details) 253 | 254 | class CapaExplorerByteViewItem(CapaExplorerFeatureItem): 255 | """store data for byte match""" 256 | 257 | def __init__(self, parent, display, location): 258 | """initialize item 259 | details section shows byte preview for match 260 | @param parent: parent node 261 | @param display: text to display in UI 262 | @param location: virtual address as seen by IDA 263 | """ 264 | 265 | 
class CapaExplorerStringViewItem(CapaExplorerFeatureItem):
    """feature item whose details column carries a matched string"""

    def __init__(self, parent, display, location, value):
        """set up the item

        @param parent: parent node
        @param display: text to display in UI
        @param location: virtual address as seen by IDA
        @param value: string passed through as the item's details text
        """
        feature_init = super(CapaExplorerStringViewItem, self).__init__
        feature_init(parent, display, details=value, location=location)


class CapaExplorerRuleMatchItem(CapaExplorerDataItem):
    """data item for a rule match; also keeps the rule source text"""

    def __init__(self, parent, display, source=""):
        """set up the item

        @param parent: parent node
        @param display: text to display in UI
        @param source: rule match source to display (tooltip)
        """
        # only the first column is populated for a rule match row
        columns = [display, "", ""]
        super(CapaExplorerRuleMatchItem, self).__init__(parent, columns)
        self._source = source

    @property
    def source(self):
        """rule contents handed to the constructor, for display"""
        return self._source

# --------------------------------------------------------------------------------
# /capa_explorer_plugin/chardet/chardistribution.py:
# --------------------------------------------------------------------------------
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .euctwfreq import (EUCTW_CHAR_TO_FREQ_ORDER, EUCTW_TABLE_SIZE,
                        EUCTW_TYPICAL_DISTRIBUTION_RATIO)
from .euckrfreq import (EUCKR_CHAR_TO_FREQ_ORDER, EUCKR_TABLE_SIZE,
                        EUCKR_TYPICAL_DISTRIBUTION_RATIO)
from .gb2312freq import (GB2312_CHAR_TO_FREQ_ORDER, GB2312_TABLE_SIZE,
                         GB2312_TYPICAL_DISTRIBUTION_RATIO)
from .big5freq import (BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE,
                       BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
                      JIS_TYPICAL_DISTRIBUTION_RATIO)


class CharDistributionAnalysis(object):
    """Base class for two-byte character frequency analysis.

    Counts how many fed two-byte characters map to a frequency order below
    512 and turns that ratio into a confidence value. Subclasses supply the
    frequency table, its size, the typical distribution ratio and an
    encoding-specific get_order().
    """

    # got_enough_data() reports True once more than this many characters were counted
    ENOUGH_DATA_THRESHOLD = 1024
    # upper and lower bounds returned by get_confidence()
    SURE_YES = 0.99
    SURE_NO = 0.01
    # get_confidence() returns SURE_NO until more than this many frequent characters were seen
    MINIMUM_DATA_THRESHOLD = 3

    def __init__(self):
        # Mapping table to get frequency order from char order (get from
        # GetOrder())
        self._char_to_freq_order = None
        self._table_size = None  # Size of above table
        # This is a constant value which varies from language to language,
        # used in calculating confidence. See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self.typical_distribution_ratio = None
        self._done = None
        self._total_chars = None
        self._freq_chars = None
        self.reset()

    def reset(self):
        """reset analyser, clear any state"""
        # If this flag is set to True, detection is done and conclusion has
        # been made
        self._done = False
        self._total_chars = 0  # Total characters encountered
        # The number of characters whose frequency order is less than 512
        self._freq_chars = 0

    def feed(self, char, char_len):
        """feed a character with known length

        Only characters with char_len == 2 are passed to get_order(); any
        other length is treated as order -1 and ignored.
        """
        if char_len == 2:
            # we only care about 2-bytes character in our distribution analysis
            order = self.get_order(char)
        else:
            order = -1
        if order >= 0:
            self._total_chars += 1
            # order is valid
            if order < self._table_size:
                if 512 > self._char_to_freq_order[order]:
                    self._freq_chars += 1

    def get_confidence(self):
        """return confidence based on existing data

        Returns SURE_NO when too little data was seen, otherwise the ratio of
        frequent to non-frequent characters scaled by the typical distribution
        ratio, capped at SURE_YES.
        """
        # if we didn't receive any character in our consideration range,
        # return negative answer
        if self._total_chars <= 0 or self._freq_chars <= self.MINIMUM_DATA_THRESHOLD:
            return self.SURE_NO

        # the guard avoids a zero divisor when every counted character was frequent
        if self._total_chars != self._freq_chars:
            r = (self._freq_chars / ((self._total_chars - self._freq_chars)
                 * self.typical_distribution_ratio))
            if r < self.SURE_YES:
                return r

        # normalize confidence (we don't want to be 100% sure)
        return self.SURE_YES

    def got_enough_data(self):
        """return True once more than ENOUGH_DATA_THRESHOLD characters were counted"""
        # It is not necessary to receive all data to draw conclusion.
        # For charset detection, certain amount of data is enough
        return self._total_chars > self.ENOUGH_DATA_THRESHOLD

    def get_order(self, byte_str):
        """map a two-byte character to its order; the base class always returns -1"""
        # We do not handle characters based on the original encoding string,
        # but convert this encoding string to a number, here called order.
        # This allows multiple encodings of a language to share one frequency
        # table.
        return -1


class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis wired to the EUC-TW frequency table."""

    def __init__(self):
        super(EUCTWDistributionAnalysis, self).__init__()
        self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
        self._table_size = EUCTW_TABLE_SIZE
        self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """return the order for a lead byte >= 0xC4, else -1"""
        # for euc-TW encoding, we are interested
        # first byte range: 0xc4 -- 0xfe
        # second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        # assumes indexing byte_str yields ints (e.g. bytes/bytearray) - TODO confirm
        first_char = byte_str[0]
        if first_char >= 0xC4:
            return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
        else:
            return -1


class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis wired to the EUC-KR frequency table."""

    def __init__(self):
        super(EUCKRDistributionAnalysis, self).__init__()
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
        self._table_size = EUCKR_TABLE_SIZE
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """return the order for a lead byte >= 0xB0, else -1"""
        # for euc-KR encoding, we are interested
        # first byte range: 0xb0 -- 0xfe
        # second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = byte_str[0]
        if first_char >= 0xB0:
            return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
        else:
            return -1


class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis wired to the GB2312 frequency table."""

    def __init__(self):
        super(GB2312DistributionAnalysis, self).__init__()
        self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
        self._table_size = GB2312_TABLE_SIZE
        self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """return the order when lead byte >= 0xB0 and trail byte >= 0xA1, else -1"""
        # for GB2312 encoding, we are interested
        # first byte range: 0xb0 -- 0xfe
        # second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = byte_str[0], byte_str[1]
        if (first_char >= 0xB0) and (second_char >= 0xA1):
            return 94 * (first_char - 0xB0) + second_char - 0xA1
        else:
            return -1


class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis wired to the Big5 frequency table."""

    def __init__(self):
        super(Big5DistributionAnalysis, self).__init__()
        self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
        self._table_size = BIG5_TABLE_SIZE
        self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """return the order for a lead byte >= 0xA4, else -1"""
        # for big5 encoding, we are interested
        # first byte range: 0xa4 -- 0xfe
        # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = byte_str[0], byte_str[1]
        if first_char >= 0xA4:
            if second_char >= 0xA1:
                # trail bytes from 0xA1 are placed after the 63 slots of the 0x40-0x7e range
                return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
            else:
                return 157 * (first_char - 0xA4) + second_char - 0x40
        else:
            return -1


class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for Shift_JIS, using the shared JIS frequency table."""

    def __init__(self):
        super(SJISDistributionAnalysis, self).__init__()
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
        self._table_size = JIS_TABLE_SIZE
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """return the order for lead bytes 0x81-0x9F or 0xE0-0xEF, else -1

        A trail byte above 0x7F also yields -1.
        """
        # for sjis encoding, we are interested
        # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
        # second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
        # no validation needed here. State machine has done that
        # NOTE(review): the code below only accepts lead bytes up to 0xEF,
        # although the comment above says 0xfe - confirm which is intended.
        first_char, second_char = byte_str[0], byte_str[1]
        if (first_char >= 0x81) and (first_char <= 0x9F):
            order = 188 * (first_char - 0x81)
        elif (first_char >= 0xE0) and (first_char <= 0xEF):
            order = 188 * (first_char - 0xE0 + 31)
        else:
            return -1
        order = order + second_char - 0x40
        if second_char > 0x7F:
            order = -1
        return order


class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for EUC-JP, using the shared JIS frequency table."""

    def __init__(self):
        super(EUCJPDistributionAnalysis, self).__init__()
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
        self._table_size = JIS_TABLE_SIZE
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str):
        """return the order for a lead byte >= 0xA0, else -1"""
        # for euc-JP encoding, we are interested
        # first byte range: 0xa0 -- 0xfe
        # second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        # the offset is 0xA1 while the test is 0xA0, so a 0xA0 lead byte gives
        # a negative order, which feed() ignores (it requires order >= 0)
        char = byte_str[0]
        if char >= 0xA0:
            return 94 * (char - 0xA1) + byte_str[1] - 0xa1
        else:
            return -1

# --------------------------------------------------------------------------------
# /capa_explorer_plugin/view.py:
# --------------------------------------------------------------------------------
from PySide2 import QtWidgets, QtCore
from .item import CapaExplorerFunctionItem
from .model import CapaExplorerDataModel
from .util import log, seek

# widest the first column may become after an automatic resize
# (compared against QHeaderView.sectionSize)
MAX_SECTION_SIZE = 750


class CapaExplorerQtreeView(QtWidgets.QTreeView):
    """tree view used to display hierarchical capa results

    view controls UI action responses and displays data from CapaExplorerDataModel

    view does not modify CapaExplorerDataModel directly - data modifications should be implemented
    in CapaExplorerDataModel
    """

    def __init__(self, model, parent=None):
        """initialize view

        @param model: model to display; setDynamicSortFilter() is called on it
        @param parent: parent widget, also used as parent for context menu actions
        """
        super(CapaExplorerQtreeView, self).__init__(parent)

        self.setModel(model)

        # NOTE(review): these instance attributes shadow the inherited Qt
        # accessors model() and parent() - confirm nothing relies on calling them
        self.model = model
        self.parent = parent

        # control when we resize columns
        self.should_resize_columns = True

        # configure custom UI controls
        self.setContextMenuPolicy(QtCore.Qt.CustomContextMenu)
        self.setExpandsOnDoubleClick(False)
        self.setSortingEnabled(True)
        self.model.setDynamicSortFilter(False)

        # configure view columns to auto-resize
        for idx in range(CapaExplorerDataModel.COLUMN_COUNT):
            self.header().setSectionResizeMode(idx, QtWidgets.QHeaderView.Interactive)

        # disable stretch to enable horizontal scroll for last column, when needed
        self.header().setStretchLastSection(False)

        # connect slots to resize columns when expanded or collapsed
        self.expanded.connect(self.slot_resize_columns_to_content)
        self.collapsed.connect(self.slot_resize_columns_to_content)

        # connect slots
        self.customContextMenuRequested.connect(self.slot_custom_context_menu_requested)
        self.doubleClicked.connect(self.slot_double_click)

        self.setStyleSheet("QTreeView::item {padding-right: 15 px;padding-bottom: 2 px;}")

    def reset_ui(self, should_sort=True):
        """reset user interface changes

        called when view should reset UI display e.g. expand items, resize columns

        @param should_sort: True, sort results after reset, False don't sort results after reset
        """
        if should_sort:
            self.sortByColumn(CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION, QtCore.Qt.AscendingOrder)

        # suppress the per-item resize triggered by the expanded signal; resize once afterwards
        self.should_resize_columns = False
        self.expandToDepth(0)
        self.should_resize_columns = True

        self.slot_resize_columns_to_content()

    def slot_resize_columns_to_content(self):
        """reset view columns to contents

        does nothing while should_resize_columns is False
        """
        if self.should_resize_columns:
            self.header().resizeSections(QtWidgets.QHeaderView.ResizeToContents)

            # limit size of first section
            if self.header().sectionSize(0) > MAX_SECTION_SIZE:
                self.header().resizeSection(0, MAX_SECTION_SIZE)

    def map_index_to_source_item(self, model_index):
        """map proxy model index to source model item

        @param model_index: QModelIndex

        @retval QObject

        raises ValueError if the index is invalid at any level of the proxy chain
        """
        # assume that self.model here is either:
        # - CapaExplorerDataModel, or
        # - QSortFilterProxyModel subclass
        #
        # The ProxyModels may be chained,
        # so keep resolving the index until we reach the CapaExplorerDataModel.

        model = self.model
        while not isinstance(model, CapaExplorerDataModel):
            if not model_index.isValid():
                raise ValueError("invalid index")

            model_index = model.mapToSource(model_index)
            model = model.sourceModel()

        if not model_index.isValid():
            raise ValueError("invalid index")

        return model_index.internalPointer()

    def send_data_to_clipboard(self, data):
        """copy data to the clipboard

        @param data: data to be copied
        """
        clip = QtWidgets.QApplication.clipboard()
        clip.clear(mode=clip.Clipboard)
        clip.setText(data, mode=clip.Clipboard)

    def new_action(self, display, data, slot):
        """create action for context menu

        @param display: text displayed to user in context menu
        @param data: data passed to slot
        @param slot: slot to connect

        @retval QAction
        """
        action = QtWidgets.QAction(display, self.parent)
        action.setData(data)
        # the slot receives the action itself (not the checked flag) so it can read action.data()
        action.triggered.connect(lambda checked: slot(action))

        return action

    def load_default_context_menu_actions(self, data):
        """yield default actions for the custom context menu

        yields "Copy column" and "Copy row", preceded by a "Seek to ..." action
        when the row has an address value

        @param data: tuple (pos, item, model_index)

        @yield QAction
        """
        default_actions = [
            ("Copy column", data, self.slot_copy_column),
            ("Copy row", data, self.slot_copy_row),
        ]

        # If the row has an address value add the seek
        # (column 1 is presumably the virtual address column - verify against the model)
        item = data[1]
        if item.data(1):
            default_actions.insert(0,("Seek to %s" % item.data(1), data, self.slot_seek_to))

        # add default actions
        for action in default_actions:
            yield self.new_action(*action)

    def load_function_context_menu_actions(self, data):
        """yield actions specific to function custom context menu

        yields "Define function" when the item's info starts with 'undefined.',
        otherwise "Rename function", followed by the default actions

        @param data: tuple (pos, item, model_index)

        @yield QAction
        """

        if data[1].info.startswith('undefined.'):
            function_actions = (("Define function", data, self.slot_rename_function),)
        else:
            function_actions = (("Rename function", data, self.slot_rename_function),)

        # add function actions
        for action in function_actions:
            yield self.new_action(*action)

        # add default actions
        for action in self.load_default_context_menu_actions(data):
            yield action

    def load_default_context_menu(self, pos, item, model_index):
        """create default custom context menu

        creates custom context menu containing default actions

        @param pos: cursor position
        @param item: CapaExplorerDataItem
        @param model_index: QModelIndex

        @retval QMenu
        """
        menu = QtWidgets.QMenu()

        for action in self.load_default_context_menu_actions((pos, item, model_index)):
            menu.addAction(action)

        return menu

    def load_function_item_context_menu(self, pos, item, model_index):
        """create function custom context menu

        creates custom context menu with both default actions and function actions

        @param pos: cursor position
        @param item: CapaExplorerDataItem
        @param model_index: QModelIndex

        @retval QMenu
        """
        menu = QtWidgets.QMenu()

        for action in self.load_function_context_menu_actions((pos, item, model_index)):
            menu.addAction(action)

        return menu

    def show_custom_context_menu(self, menu, pos):
        """display custom context menu in view

        @param menu: QMenu to display
        @param pos: cursor position
        """
        if menu:
            menu.exec_(self.viewport().mapToGlobal(pos))

    def slot_copy_column(self, action):
        """slot connected to custom context menu

        allows user to select a column and copy the data to clipboard

        @param action: QAction
        """
        _, item, model_index = action.data()
        self.send_data_to_clipboard(item.data(model_index.column()))

    def slot_copy_row(self, action):
        """slot connected to custom context menu

        allows user to select a row and copy the space-delimited data to clipboard

        @param action: QAction
        """
        _, item, _ = action.data()
        self.send_data_to_clipboard(str(item))

    def slot_seek_to(self, action):
        """slot connected to custom context menu

        seeks to the location of the selected item

        @param action: QAction
        """
        _, item, _ = action.data()
        seek(item.location)

    def slot_rename_function(self, action):
        """slot connected to custom context menu

        allows user to edit a function name and push changes to IDA

        @param action: QAction
        """
        _, item, model_index = action.data()

        # make item temporary edit, reset after user is finished
        item.setIsEditable(True)
        self.edit(model_index)
        item.setIsEditable(False)

    def slot_custom_context_menu_requested(self, pos):
        """slot connected to custom context menu request

        displays custom context menu to user containing action relevant to the item selected

        @param pos: cursor position
        """
        model_index = self.indexAt(pos)

        if not model_index.isValid():
            return

        item = self.map_index_to_source_item(model_index)

        column = model_index.column()
        menu = None

        if CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION == column and isinstance(item, CapaExplorerFunctionItem):
            # user hovered function item
            menu = self.load_function_item_context_menu(pos, item, model_index)
        else:
            # user hovered default item
            menu = self.load_default_context_menu(pos, item, model_index)

        # show custom context menu at view position
        self.show_custom_context_menu(menu, pos)

    def slot_double_click(self, model_index):
        """slot connected to double-click event

        if address column clicked, navigate IDA to address, else un/expand item clicked

        @param model_index: QModelIndex
        """
        if not model_index.isValid():
            return

        item = self.map_index_to_source_item(model_index)
        column = model_index.column()

        if CapaExplorerDataModel.COLUMN_INDEX_VIRTUAL_ADDRESS == column and item.location:
            # user double-clicked virtual address column - navigate IDA to address
            seek(item.location)

        if CapaExplorerDataModel.COLUMN_INDEX_RULE_INFORMATION == column:
            # user double-clicked information column - un/expand
            self.collapse(model_index) if self.isExpanded(model_index) else self.expand(model_index)

# --------------------------------------------------------------------------------
# /capa_explorer_plugin/chardet/escsm.py:
# --------------------------------------------------------------------------------
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from .enums import MachineState 29 | 30 | HZ_CLS = ( 31 | 1,0,0,0,0,0,0,0, # 00 - 07 32 | 0,0,0,0,0,0,0,0, # 08 - 0f 33 | 0,0,0,0,0,0,0,0, # 10 - 17 34 | 0,0,0,1,0,0,0,0, # 18 - 1f 35 | 0,0,0,0,0,0,0,0, # 20 - 27 36 | 0,0,0,0,0,0,0,0, # 28 - 2f 37 | 0,0,0,0,0,0,0,0, # 30 - 37 38 | 0,0,0,0,0,0,0,0, # 38 - 3f 39 | 0,0,0,0,0,0,0,0, # 40 - 47 40 | 0,0,0,0,0,0,0,0, # 48 - 4f 41 | 0,0,0,0,0,0,0,0, # 50 - 57 42 | 0,0,0,0,0,0,0,0, # 58 - 5f 43 | 0,0,0,0,0,0,0,0, # 60 - 67 44 | 0,0,0,0,0,0,0,0, # 68 - 6f 45 | 0,0,0,0,0,0,0,0, # 70 - 77 46 | 0,0,0,4,0,5,2,0, # 78 - 7f 47 | 1,1,1,1,1,1,1,1, # 80 - 87 48 | 1,1,1,1,1,1,1,1, # 88 - 8f 49 | 1,1,1,1,1,1,1,1, # 90 - 97 50 | 1,1,1,1,1,1,1,1, # 98 - 9f 51 | 1,1,1,1,1,1,1,1, # a0 - a7 52 | 1,1,1,1,1,1,1,1, # a8 - af 53 | 1,1,1,1,1,1,1,1, # b0 - b7 54 | 1,1,1,1,1,1,1,1, # b8 - bf 55 | 1,1,1,1,1,1,1,1, # c0 - c7 56 | 1,1,1,1,1,1,1,1, # c8 - cf 57 | 1,1,1,1,1,1,1,1, # d0 - d7 58 | 1,1,1,1,1,1,1,1, # d8 - df 59 | 1,1,1,1,1,1,1,1, # e0 - e7 60 | 1,1,1,1,1,1,1,1, # e8 - ef 61 | 1,1,1,1,1,1,1,1, # f0 - f7 62 | 1,1,1,1,1,1,1,1, # f8 - ff 63 | ) 64 | 65 | HZ_ST = ( 66 | MachineState.START,MachineState.ERROR, 3,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07 67 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f 68 | MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.START,MachineState.START, 4,MachineState.ERROR,# 10-17 69 | 5,MachineState.ERROR, 6,MachineState.ERROR, 5, 5, 4,MachineState.ERROR,# 18-1f 70 | 
4,MachineState.ERROR, 4, 4, 4,MachineState.ERROR, 4,MachineState.ERROR,# 20-27 71 | 4,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 28-2f 72 | ) 73 | 74 | HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) 75 | 76 | HZ_SM_MODEL = {'class_table': HZ_CLS, 77 | 'class_factor': 6, 78 | 'state_table': HZ_ST, 79 | 'char_len_table': HZ_CHAR_LEN_TABLE, 80 | 'name': "HZ-GB-2312", 81 | 'language': 'Chinese'} 82 | 83 | ISO2022CN_CLS = ( 84 | 2,0,0,0,0,0,0,0, # 00 - 07 85 | 0,0,0,0,0,0,0,0, # 08 - 0f 86 | 0,0,0,0,0,0,0,0, # 10 - 17 87 | 0,0,0,1,0,0,0,0, # 18 - 1f 88 | 0,0,0,0,0,0,0,0, # 20 - 27 89 | 0,3,0,0,0,0,0,0, # 28 - 2f 90 | 0,0,0,0,0,0,0,0, # 30 - 37 91 | 0,0,0,0,0,0,0,0, # 38 - 3f 92 | 0,0,0,4,0,0,0,0, # 40 - 47 93 | 0,0,0,0,0,0,0,0, # 48 - 4f 94 | 0,0,0,0,0,0,0,0, # 50 - 57 95 | 0,0,0,0,0,0,0,0, # 58 - 5f 96 | 0,0,0,0,0,0,0,0, # 60 - 67 97 | 0,0,0,0,0,0,0,0, # 68 - 6f 98 | 0,0,0,0,0,0,0,0, # 70 - 77 99 | 0,0,0,0,0,0,0,0, # 78 - 7f 100 | 2,2,2,2,2,2,2,2, # 80 - 87 101 | 2,2,2,2,2,2,2,2, # 88 - 8f 102 | 2,2,2,2,2,2,2,2, # 90 - 97 103 | 2,2,2,2,2,2,2,2, # 98 - 9f 104 | 2,2,2,2,2,2,2,2, # a0 - a7 105 | 2,2,2,2,2,2,2,2, # a8 - af 106 | 2,2,2,2,2,2,2,2, # b0 - b7 107 | 2,2,2,2,2,2,2,2, # b8 - bf 108 | 2,2,2,2,2,2,2,2, # c0 - c7 109 | 2,2,2,2,2,2,2,2, # c8 - cf 110 | 2,2,2,2,2,2,2,2, # d0 - d7 111 | 2,2,2,2,2,2,2,2, # d8 - df 112 | 2,2,2,2,2,2,2,2, # e0 - e7 113 | 2,2,2,2,2,2,2,2, # e8 - ef 114 | 2,2,2,2,2,2,2,2, # f0 - f7 115 | 2,2,2,2,2,2,2,2, # f8 - ff 116 | ) 117 | 118 | ISO2022CN_ST = ( 119 | MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07 120 | MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f 121 | 
MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17 122 | MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,# 18-1f 123 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 20-27 124 | 5, 6,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 28-2f 125 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 30-37 126 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,# 38-3f 127 | ) 128 | 129 | ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0) 130 | 131 | ISO2022CN_SM_MODEL = {'class_table': ISO2022CN_CLS, 132 | 'class_factor': 9, 133 | 'state_table': ISO2022CN_ST, 134 | 'char_len_table': ISO2022CN_CHAR_LEN_TABLE, 135 | 'name': "ISO-2022-CN", 136 | 'language': 'Chinese'} 137 | 138 | ISO2022JP_CLS = ( 139 | 2,0,0,0,0,0,0,0, # 00 - 07 140 | 0,0,0,0,0,0,2,2, # 08 - 0f 141 | 0,0,0,0,0,0,0,0, # 10 - 17 142 | 0,0,0,1,0,0,0,0, # 18 - 1f 143 | 0,0,0,0,7,0,0,0, # 20 - 27 144 | 3,0,0,0,0,0,0,0, # 28 - 2f 145 | 0,0,0,0,0,0,0,0, # 30 - 37 146 | 0,0,0,0,0,0,0,0, # 38 - 3f 147 | 6,0,4,0,8,0,0,0, # 40 - 47 148 | 0,9,5,0,0,0,0,0, # 48 - 4f 149 | 0,0,0,0,0,0,0,0, # 50 - 57 150 | 0,0,0,0,0,0,0,0, # 58 - 5f 151 | 0,0,0,0,0,0,0,0, # 60 - 67 152 | 0,0,0,0,0,0,0,0, # 68 - 6f 153 | 0,0,0,0,0,0,0,0, # 70 - 77 154 | 0,0,0,0,0,0,0,0, # 78 - 7f 155 | 2,2,2,2,2,2,2,2, # 80 - 87 156 | 2,2,2,2,2,2,2,2, # 88 - 8f 157 | 2,2,2,2,2,2,2,2, # 90 - 97 158 | 2,2,2,2,2,2,2,2, # 98 - 9f 159 | 2,2,2,2,2,2,2,2, # a0 - a7 160 | 
2,2,2,2,2,2,2,2, # a8 - af 161 | 2,2,2,2,2,2,2,2, # b0 - b7 162 | 2,2,2,2,2,2,2,2, # b8 - bf 163 | 2,2,2,2,2,2,2,2, # c0 - c7 164 | 2,2,2,2,2,2,2,2, # c8 - cf 165 | 2,2,2,2,2,2,2,2, # d0 - d7 166 | 2,2,2,2,2,2,2,2, # d8 - df 167 | 2,2,2,2,2,2,2,2, # e0 - e7 168 | 2,2,2,2,2,2,2,2, # e8 - ef 169 | 2,2,2,2,2,2,2,2, # f0 - f7 170 | 2,2,2,2,2,2,2,2, # f8 - ff 171 | ) 172 | 173 | ISO2022JP_ST = ( 174 | MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 00-07 175 | MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 08-0f 176 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 10-17 177 | MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,# 18-1f 178 | MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 20-27 179 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 6,MachineState.ITS_ME,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,# 28-2f 180 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,# 30-37 181 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 38-3f 182 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ERROR,MachineState.START,MachineState.START,# 40-47 183 | ) 184 | 185 | ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) 186 | 187 | ISO2022JP_SM_MODEL = {'class_table': ISO2022JP_CLS, 188 | 
'class_factor': 10, 189 | 'state_table': ISO2022JP_ST, 190 | 'char_len_table': ISO2022JP_CHAR_LEN_TABLE, 191 | 'name': "ISO-2022-JP", 192 | 'language': 'Japanese'} 193 | 194 | ISO2022KR_CLS = ( 195 | 2,0,0,0,0,0,0,0, # 00 - 07 196 | 0,0,0,0,0,0,0,0, # 08 - 0f 197 | 0,0,0,0,0,0,0,0, # 10 - 17 198 | 0,0,0,1,0,0,0,0, # 18 - 1f 199 | 0,0,0,0,3,0,0,0, # 20 - 27 200 | 0,4,0,0,0,0,0,0, # 28 - 2f 201 | 0,0,0,0,0,0,0,0, # 30 - 37 202 | 0,0,0,0,0,0,0,0, # 38 - 3f 203 | 0,0,0,5,0,0,0,0, # 40 - 47 204 | 0,0,0,0,0,0,0,0, # 48 - 4f 205 | 0,0,0,0,0,0,0,0, # 50 - 57 206 | 0,0,0,0,0,0,0,0, # 58 - 5f 207 | 0,0,0,0,0,0,0,0, # 60 - 67 208 | 0,0,0,0,0,0,0,0, # 68 - 6f 209 | 0,0,0,0,0,0,0,0, # 70 - 77 210 | 0,0,0,0,0,0,0,0, # 78 - 7f 211 | 2,2,2,2,2,2,2,2, # 80 - 87 212 | 2,2,2,2,2,2,2,2, # 88 - 8f 213 | 2,2,2,2,2,2,2,2, # 90 - 97 214 | 2,2,2,2,2,2,2,2, # 98 - 9f 215 | 2,2,2,2,2,2,2,2, # a0 - a7 216 | 2,2,2,2,2,2,2,2, # a8 - af 217 | 2,2,2,2,2,2,2,2, # b0 - b7 218 | 2,2,2,2,2,2,2,2, # b8 - bf 219 | 2,2,2,2,2,2,2,2, # c0 - c7 220 | 2,2,2,2,2,2,2,2, # c8 - cf 221 | 2,2,2,2,2,2,2,2, # d0 - d7 222 | 2,2,2,2,2,2,2,2, # d8 - df 223 | 2,2,2,2,2,2,2,2, # e0 - e7 224 | 2,2,2,2,2,2,2,2, # e8 - ef 225 | 2,2,2,2,2,2,2,2, # f0 - f7 226 | 2,2,2,2,2,2,2,2, # f8 - ff 227 | ) 228 | 229 | ISO2022KR_ST = ( 230 | MachineState.START, 3,MachineState.ERROR,MachineState.START,MachineState.START,MachineState.START,MachineState.ERROR,MachineState.ERROR,# 00-07 231 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ITS_ME,# 08-0f 232 | MachineState.ITS_ME,MachineState.ITS_ME,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 4,MachineState.ERROR,MachineState.ERROR,# 10-17 233 | MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR, 5,MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,# 18-1f 234 | 
MachineState.ERROR,MachineState.ERROR,MachineState.ERROR,MachineState.ITS_ME,MachineState.START,MachineState.START,MachineState.START,MachineState.START,# 20-27 235 | ) 236 | 237 | ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) 238 | 239 | ISO2022KR_SM_MODEL = {'class_table': ISO2022KR_CLS, 240 | 'class_factor': 6, 241 | 'state_table': ISO2022KR_ST, 242 | 'char_len_table': ISO2022KR_CHAR_LEN_TABLE, 243 | 'name': "ISO-2022-KR", 244 | 'language': 'Korean'} 245 | 246 | 247 | -------------------------------------------------------------------------------- /capa_explorer_plugin/chardet/langturkishmodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ######################## BEGIN LICENSE BLOCK ######################## 3 | # The Original Code is Mozilla Communicator client code. 4 | # 5 | # The Initial Developer of the Original Code is 6 | # Netscape Communications Corporation. 7 | # Portions created by the Initial Developer are Copyright (C) 1998 8 | # the Initial Developer. All Rights Reserved. 9 | # 10 | # Contributor(s): 11 | # Mark Pilgrim - port to Python 12 | # Özgür Baskın - Turkish Language Model 13 | # 14 | # This library is free software; you can redistribute it and/or 15 | # modify it under the terms of the GNU Lesser General Public 16 | # License as published by the Free Software Foundation; either 17 | # version 2.1 of the License, or (at your option) any later version. 18 | # 19 | # This library is distributed in the hope that it will be useful, 20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 | # Lesser General Public License for more details. 
23 | # 24 | # You should have received a copy of the GNU Lesser General Public 25 | # License along with this library; if not, write to the Free Software 26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 27 | # 02110-1301 USA 28 | ######################### END LICENSE BLOCK ######################### 29 | 30 | # 255: Control characters that usually does not exist in any text 31 | # 254: Carriage/Return 32 | # 253: symbol (punctuation) that does not belong to word 33 | # 252: 0 - 9 34 | 35 | # Character Mapping Table: 36 | Latin5_TurkishCharToOrderMap = ( 37 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 38 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 39 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 40 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, 41 | 255, 23, 37, 47, 39, 29, 52, 36, 45, 53, 60, 16, 49, 20, 46, 42, 42 | 48, 69, 44, 35, 31, 51, 38, 62, 65, 43, 56,255,255,255,255,255, 43 | 255, 1, 21, 28, 12, 2, 18, 27, 25, 3, 24, 10, 5, 13, 4, 15, 44 | 26, 64, 7, 8, 9, 14, 32, 57, 58, 11, 22,255,255,255,255,255, 45 | 180,179,178,177,176,175,174,173,172,171,170,169,168,167,166,165, 46 | 164,163,162,161,160,159,101,158,157,156,155,154,153,152,151,106, 47 | 150,149,148,147,146,145,144,100,143,142,141,140,139,138,137,136, 48 | 94, 80, 93,135,105,134,133, 63,132,131,130,129,128,127,126,125, 49 | 124,104, 73, 99, 79, 85,123, 54,122, 98, 92,121,120, 91,103,119, 50 | 68,118,117, 97,116,115, 50, 90,114,113,112,111, 55, 41, 40, 86, 51 | 89, 70, 59, 78, 71, 82, 88, 33, 77, 66, 84, 83,110, 75, 61, 96, 52 | 30, 67,109, 74, 87,102, 34, 95, 81,108, 76, 72, 17, 6, 19,107, 53 | ) 54 | 55 | TurkishLangModel = ( 56 | 3,2,3,3,3,1,3,3,3,3,3,3,3,3,2,1,1,3,3,1,3,3,0,3,3,3,3,3,0,3,1,3, 57 | 3,2,1,0,0,1,1,0,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,2,2,0,0,1,0,0,1, 58 | 3,2,2,3,3,0,3,3,3,3,3,3,3,2,3,1,0,3,3,1,3,3,0,3,3,3,3,3,0,3,0,3, 59 | 3,1,1,0,1,0,1,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,2,2,0,0,0,1,0,1, 
60 | 3,3,2,3,3,0,3,3,3,3,3,3,3,2,3,1,1,3,3,0,3,3,1,2,3,3,3,3,0,3,0,3, 61 | 3,1,1,0,0,0,1,0,0,0,0,1,1,0,1,2,1,0,0,0,1,0,0,0,0,2,0,0,0,0,0,1, 62 | 3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,1,3,3,2,0,3,2,1,2,2,1,3,3,0,0,0,2, 63 | 2,2,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,1, 64 | 3,3,3,2,3,3,1,2,3,3,3,3,3,3,3,1,3,2,1,0,3,2,0,1,2,3,3,2,1,0,0,2, 65 | 2,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,0, 66 | 1,0,1,3,3,1,3,3,3,3,3,3,3,1,2,0,0,2,3,0,2,3,0,0,2,2,2,3,0,3,0,1, 67 | 2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 68 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,0,3,2,0,2,3,2,3,3,1,0,0,2, 69 | 3,2,0,0,1,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,2,0,0,1, 70 | 3,3,3,2,3,3,2,3,3,3,3,2,3,3,3,0,3,3,0,0,2,1,0,0,2,3,2,2,0,0,0,2, 71 | 2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,1,0,2,0,0,1, 72 | 3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,0,3,2,0,1,3,2,1,1,3,2,3,2,1,0,0,2, 73 | 2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0, 74 | 3,3,3,2,3,3,3,3,3,3,3,2,3,3,3,0,3,2,2,0,2,3,0,0,2,2,2,2,0,0,0,2, 75 | 3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,1,0,0,0, 76 | 3,3,3,3,3,3,3,2,2,2,2,3,2,3,3,0,3,3,1,1,2,2,0,0,2,2,3,2,0,0,1,3, 77 | 0,3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1, 78 | 3,3,3,2,3,3,3,2,1,2,2,3,2,3,3,0,3,2,0,0,1,1,0,1,1,2,1,2,0,0,0,1, 79 | 0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0, 80 | 3,3,3,2,3,3,2,3,2,2,2,3,3,3,3,1,3,1,1,0,3,2,1,1,3,3,2,3,1,0,0,1, 81 | 1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,0,0,1, 82 | 3,2,2,3,3,0,3,3,3,3,3,3,3,2,2,1,0,3,3,1,3,3,0,1,3,3,2,3,0,3,0,3, 83 | 2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 84 | 2,2,2,3,3,0,3,3,3,3,3,3,3,3,3,0,0,3,2,0,3,3,0,3,2,3,3,3,0,3,1,3, 85 | 2,0,0,0,0,0,0,0,0,0,0,1,0,1,2,0,1,0,0,0,0,0,0,0,2,2,0,0,1,0,0,1, 86 | 3,3,3,1,2,3,3,1,0,0,1,0,0,3,3,2,3,0,0,2,0,0,2,0,2,0,0,0,2,0,2,0, 87 | 0,3,1,0,1,0,0,0,2,2,1,0,1,1,2,1,2,2,2,0,2,1,1,0,0,0,2,0,0,0,0,0, 88 | 
1,2,1,3,3,0,3,3,3,3,3,2,3,0,0,0,0,2,3,0,2,3,1,0,2,3,1,3,0,3,0,2, 89 | 3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 90 | 3,3,3,1,3,3,2,2,3,2,2,0,1,2,3,0,1,2,1,0,1,0,0,0,1,0,2,2,0,0,0,1, 91 | 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0, 92 | 3,3,3,1,3,3,1,1,3,3,1,1,3,3,1,0,2,1,2,0,2,1,0,0,1,1,2,1,0,0,0,2, 93 | 2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 94 | 3,3,3,1,0,2,1,3,0,0,2,0,0,3,3,0,3,0,0,1,0,1,2,0,0,1,1,2,2,0,1,0, 95 | 0,1,2,1,1,0,1,0,1,1,1,1,1,0,1,1,1,2,2,1,2,0,1,0,0,0,0,0,0,1,0,0, 96 | 3,3,3,2,3,2,3,3,0,2,2,2,3,3,3,0,3,0,0,0,2,2,0,1,2,1,1,1,0,0,0,1, 97 | 0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, 98 | 3,3,3,3,3,3,2,1,2,2,3,3,3,3,2,0,2,0,0,0,2,2,0,0,2,1,3,3,0,0,1,1, 99 | 1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0, 100 | 1,1,2,3,3,0,3,3,3,3,3,3,2,2,0,2,0,2,3,2,3,2,2,2,2,2,2,2,1,3,2,3, 101 | 2,0,2,1,2,2,2,2,1,1,2,2,1,2,2,1,2,0,0,2,1,1,0,2,1,0,0,1,0,0,0,1, 102 | 2,3,3,1,1,1,0,1,1,1,2,3,2,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0, 103 | 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 104 | 3,3,3,2,2,2,3,2,3,2,2,1,3,3,3,0,2,1,2,0,2,1,0,0,1,1,1,1,1,0,0,1, 105 | 2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,2,0,1,0,0,0, 106 | 3,3,3,2,3,3,3,3,3,2,3,1,2,3,3,1,2,0,0,0,0,0,0,0,3,2,1,1,0,0,0,0, 107 | 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 108 | 3,3,3,2,2,3,3,2,1,1,1,1,1,3,3,0,3,1,0,0,1,1,0,0,3,1,2,1,0,0,0,0, 109 | 0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0, 110 | 3,3,3,2,2,3,2,2,2,3,2,1,1,3,3,0,3,0,0,0,0,1,0,0,3,1,1,2,0,0,0,1, 111 | 1,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, 112 | 1,1,1,3,3,0,3,3,3,3,3,2,2,2,1,2,0,2,1,2,2,1,1,0,1,2,2,2,2,2,2,2, 113 | 0,0,2,1,2,1,2,1,0,1,1,3,1,2,1,1,2,0,0,2,0,1,0,1,0,1,0,0,0,1,0,1, 114 | 3,3,3,1,3,3,3,0,1,1,0,2,2,3,1,0,3,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0, 115 | 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 116 | 
3,3,2,0,0,2,2,1,0,0,1,0,0,3,3,1,3,0,0,1,1,0,2,0,3,0,0,0,2,0,1,1, 117 | 0,1,2,0,1,2,2,0,2,2,2,2,1,0,2,1,1,0,2,0,2,1,2,0,0,0,0,0,0,0,0,0, 118 | 3,3,3,1,3,2,3,2,0,2,2,2,1,3,2,0,2,1,2,0,1,2,0,0,1,0,2,2,0,0,0,2, 119 | 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0, 120 | 3,3,3,0,3,3,1,1,2,3,1,0,3,2,3,0,3,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0, 121 | 1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 122 | 0,0,0,3,3,0,3,3,2,3,3,2,2,0,0,0,0,1,2,0,1,3,0,0,0,3,1,1,0,3,0,2, 123 | 2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 124 | 3,3,3,1,2,2,1,0,3,1,1,1,1,3,3,2,3,0,0,1,0,1,2,0,2,2,0,2,2,0,2,1, 125 | 0,2,2,1,1,1,1,0,2,1,1,0,1,1,1,1,2,1,2,1,2,0,1,0,1,0,0,0,0,0,0,0, 126 | 3,3,3,0,1,1,3,0,0,1,1,0,0,2,2,0,3,0,0,1,1,0,1,0,0,0,0,0,2,0,0,0, 127 | 0,3,1,0,1,0,1,0,2,0,0,1,0,1,0,1,1,1,2,1,1,0,2,0,0,0,0,0,0,0,0,0, 128 | 3,3,3,0,2,0,2,0,1,1,1,0,0,3,3,0,2,0,0,1,0,0,2,1,1,0,1,0,1,0,1,0, 129 | 0,2,0,1,2,0,2,0,2,1,1,0,1,0,2,1,1,0,2,1,1,0,1,0,0,0,1,1,0,0,0,0, 130 | 3,2,3,0,1,0,0,0,0,0,0,0,0,1,2,0,1,0,0,1,0,0,1,0,0,0,0,0,2,0,0,0, 131 | 0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,2,1,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0, 132 | 3,3,3,0,0,2,3,0,0,1,0,1,0,2,3,2,3,0,0,1,3,0,2,1,0,0,0,0,2,0,1,0, 133 | 0,2,1,0,0,1,1,0,2,1,0,0,1,0,0,1,1,0,1,1,2,0,1,0,0,0,0,1,0,0,0,0, 134 | 3,2,2,0,0,1,1,0,0,0,0,0,0,3,1,1,1,0,0,0,0,0,1,0,0,0,0,0,2,0,1,0, 135 | 0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0, 136 | 0,0,0,3,3,0,2,3,2,2,1,2,2,1,1,2,0,1,3,2,2,2,0,0,2,2,0,0,0,1,2,1, 137 | 3,0,2,1,1,0,1,1,1,0,1,2,2,2,1,1,2,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0, 138 | 0,1,1,2,3,0,3,3,3,2,2,2,2,1,0,1,0,1,0,1,2,2,0,0,2,2,1,3,1,1,2,1, 139 | 0,0,1,1,2,0,1,1,0,0,1,2,0,2,1,1,2,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0, 140 | 3,3,2,0,0,3,1,0,0,0,0,0,0,3,2,1,2,0,0,1,0,0,2,0,0,0,0,0,2,0,1,0, 141 | 0,2,1,1,0,0,1,0,1,2,0,0,1,1,0,0,2,1,1,1,1,0,2,0,0,0,0,0,0,0,0,0, 142 | 3,3,2,0,0,1,0,0,0,0,1,0,0,3,3,2,2,0,0,1,0,0,2,0,1,0,0,0,2,0,1,0, 143 | 0,0,1,1,0,0,2,0,2,1,0,0,1,1,2,1,2,0,2,1,2,1,1,1,0,0,1,1,0,0,0,0, 144 | 
3,3,2,0,0,2,2,0,0,0,1,1,0,2,2,1,3,1,0,1,0,1,2,0,0,0,0,0,1,0,1,0, 145 | 0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0, 146 | 3,3,3,2,0,0,0,1,0,0,1,0,0,2,3,1,2,0,0,1,0,0,2,0,0,0,1,0,2,0,2,0, 147 | 0,1,1,2,2,1,2,0,2,1,1,0,0,1,1,0,1,1,1,1,2,1,1,0,0,0,0,0,0,0,0,0, 148 | 3,3,3,0,2,1,2,1,0,0,1,1,0,3,3,1,2,0,0,1,0,0,2,0,2,0,1,1,2,0,0,0, 149 | 0,0,1,1,1,1,2,0,1,1,0,1,1,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,0, 150 | 3,3,3,0,2,2,3,2,0,0,1,0,0,2,3,1,0,0,0,0,0,0,2,0,2,0,0,0,2,0,0,0, 151 | 0,1,1,0,0,0,1,0,0,1,0,1,1,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0, 152 | 3,2,3,0,0,0,0,0,0,0,1,0,0,2,2,2,2,0,0,1,0,0,2,0,0,0,0,0,2,0,1,0, 153 | 0,0,2,1,1,0,1,0,2,1,1,0,0,1,1,2,1,0,2,0,2,0,1,0,0,0,2,0,0,0,0,0, 154 | 0,0,0,2,2,0,2,1,1,1,1,2,2,0,0,1,0,1,0,0,1,3,0,0,0,0,1,0,0,2,1,0, 155 | 0,0,1,0,1,0,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, 156 | 2,0,0,2,3,0,2,3,1,2,2,0,2,0,0,2,0,2,1,1,1,2,1,0,0,1,2,1,1,2,1,0, 157 | 1,0,2,0,1,0,1,1,0,0,2,2,1,2,1,1,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, 158 | 3,3,3,0,2,1,2,0,0,0,1,0,0,3,2,0,1,0,0,1,0,0,2,0,0,0,1,2,1,0,1,0, 159 | 0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0, 160 | 0,0,0,2,2,0,2,2,1,1,0,1,1,1,1,1,0,0,1,2,1,1,1,0,1,0,0,0,1,1,1,1, 161 | 0,0,2,1,0,1,1,1,0,1,1,2,1,2,1,1,2,0,1,1,2,1,0,2,0,0,0,0,0,0,0,0, 162 | 3,2,2,0,0,2,0,0,0,0,0,0,0,2,2,0,2,0,0,1,0,0,2,0,0,0,0,0,2,0,0,0, 163 | 0,2,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0, 164 | 0,0,0,3,2,0,2,2,0,1,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0, 165 | 2,0,1,0,1,0,1,1,0,0,1,2,0,1,0,1,1,0,0,1,0,1,0,2,0,0,0,0,0,0,0,0, 166 | 2,2,2,0,1,1,0,0,0,1,0,0,0,1,2,0,1,0,0,1,0,0,1,0,0,0,0,1,2,0,1,0, 167 | 0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0, 168 | 2,2,2,2,1,0,1,1,1,0,0,0,0,1,2,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0, 169 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0, 170 | 1,1,2,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 171 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,1, 172 | 
0,0,1,2,2,0,2,1,2,1,1,2,2,0,0,0,0,1,0,0,1,1,0,0,2,0,0,0,0,1,0,0, 173 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, 174 | 2,2,2,0,0,0,1,0,0,0,0,0,0,2,2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, 175 | 0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 176 | 0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0, 177 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 178 | 2,2,2,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0, 179 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0, 180 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 181 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 182 | 0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 183 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 184 | ) 185 | 186 | Latin5TurkishModel = { 187 | 'char_to_order_map': Latin5_TurkishCharToOrderMap, 188 | 'precedence_matrix': TurkishLangModel, 189 | 'typical_positive_ratio': 0.970290, 190 | 'keep_english_letter': True, 191 | 'charset_name': "ISO-8859-9", 192 | 'language': 'Turkish', 193 | } 194 | -------------------------------------------------------------------------------- /capa_explorer_plugin/chardet/langthaimodel.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Communicator client code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 
8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | # 255: Control characters that usually does not exist in any text 29 | # 254: Carriage/Return 30 | # 253: symbol (punctuation) that does not belong to word 31 | # 252: 0 - 9 32 | 33 | # The following result for thai was collected from a limited sample (1M). 
34 | 35 | # Character Mapping Table: 36 | TIS620CharToOrderMap = ( 37 | 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 38 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 39 | 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 40 | 252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, # 30 41 | 253,182,106,107,100,183,184,185,101, 94,186,187,108,109,110,111, # 40 42 | 188,189,190, 89, 95,112,113,191,192,193,194,253,253,253,253,253, # 50 43 | 253, 64, 72, 73,114, 74,115,116,102, 81,201,117, 90,103, 78, 82, # 60 44 | 96,202, 91, 79, 84,104,105, 97, 98, 92,203,253,253,253,253,253, # 70 45 | 209,210,211,212,213, 88,214,215,216,217,218,219,220,118,221,222, 46 | 223,224, 99, 85, 83,225,226,227,228,229,230,231,232,233,234,235, 47 | 236, 5, 30,237, 24,238, 75, 8, 26, 52, 34, 51,119, 47, 58, 57, 48 | 49, 53, 55, 43, 20, 19, 44, 14, 48, 3, 17, 25, 39, 62, 31, 54, 49 | 45, 9, 16, 2, 61, 15,239, 12, 42, 46, 18, 21, 76, 4, 66, 63, 50 | 22, 10, 1, 36, 23, 13, 40, 27, 32, 35, 86,240,241,242,243,244, 51 | 11, 28, 41, 29, 33,245, 50, 37, 6, 7, 67, 77, 38, 93,246,247, 52 | 68, 56, 59, 65, 69, 60, 70, 80, 71, 87,248,249,250,251,252,253, 53 | ) 54 | 55 | # Model Table: 56 | # total sequences: 100% 57 | # first 512 sequences: 92.6386% 58 | # first 1024 sequences:7.3177% 59 | # rest sequences: 1.0230% 60 | # negative sequences: 0.0436% 61 | ThaiLangModel = ( 62 | 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 63 | 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, 64 | 3,0,3,3,2,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,0,3,2,3,0,2,2,2,3, 65 | 0,2,3,0,0,0,0,1,0,1,2,3,1,1,3,2,2,0,1,1,0,0,1,0,0,0,0,0,0,0,1,1, 66 | 3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,3,3,2,3,2,3,3,2,2,2, 67 | 3,1,2,3,0,3,3,2,2,1,2,3,3,1,2,0,1,3,0,1,0,0,1,0,0,0,0,0,0,0,1,1, 68 | 3,3,2,2,3,3,3,3,1,2,3,3,3,3,3,2,2,2,2,3,3,2,2,3,3,2,2,3,2,3,2,2, 69 | 3,3,1,2,3,1,2,2,3,3,1,0,2,1,0,0,3,1,2,1,0,0,1,0,0,0,0,0,0,1,0,1, 70 | 
3,3,3,3,3,3,2,2,3,3,3,3,2,3,2,2,3,3,2,2,3,2,2,2,2,1,1,3,1,2,1,1, 71 | 3,2,1,0,2,1,0,1,0,1,1,0,1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0, 72 | 3,3,3,2,3,2,3,3,2,2,3,2,3,3,2,3,1,1,2,3,2,2,2,3,2,2,2,2,2,1,2,1, 73 | 2,2,1,1,3,3,2,1,0,1,2,2,0,1,3,0,0,0,1,1,0,0,0,0,0,2,3,0,0,2,1,1, 74 | 3,3,2,3,3,2,0,0,3,3,0,3,3,0,2,2,3,1,2,2,1,1,1,0,2,2,2,0,2,2,1,1, 75 | 0,2,1,0,2,0,0,2,0,1,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0, 76 | 3,3,2,3,3,2,0,0,3,3,0,2,3,0,2,1,2,2,2,2,1,2,0,0,2,2,2,0,2,2,1,1, 77 | 0,2,1,0,2,0,0,2,0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0, 78 | 3,3,2,3,2,3,2,0,2,2,1,3,2,1,3,2,1,2,3,2,2,3,0,2,3,2,2,1,2,2,2,2, 79 | 1,2,2,0,0,0,0,2,0,1,2,0,1,1,1,0,1,0,3,1,1,0,0,0,0,0,0,0,0,0,1,0, 80 | 3,3,2,3,3,2,3,2,2,2,3,2,2,3,2,2,1,2,3,2,2,3,1,3,2,2,2,3,2,2,2,3, 81 | 3,2,1,3,0,1,1,1,0,2,1,1,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,2,0,0, 82 | 1,0,0,3,0,3,3,3,3,3,0,0,3,0,2,2,3,3,3,3,3,0,0,0,1,1,3,0,0,0,0,2, 83 | 0,0,1,0,0,0,0,0,0,0,2,3,0,0,0,3,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0, 84 | 2,0,3,3,3,3,0,0,2,3,0,0,3,0,3,3,2,3,3,3,3,3,0,0,3,3,3,0,0,0,3,3, 85 | 0,0,3,0,0,0,0,2,0,0,2,1,1,3,0,0,1,0,0,2,3,0,1,0,0,0,0,0,0,0,1,0, 86 | 3,3,3,3,2,3,3,3,3,3,3,3,1,2,1,3,3,2,2,1,2,2,2,3,1,1,2,0,2,1,2,1, 87 | 2,2,1,0,0,0,1,1,0,1,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0, 88 | 3,0,2,1,2,3,3,3,0,2,0,2,2,0,2,1,3,2,2,1,2,1,0,0,2,2,1,0,2,1,2,2, 89 | 0,1,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 90 | 3,3,3,3,2,1,3,3,1,1,3,0,2,3,1,1,3,2,1,1,2,0,2,2,3,2,1,1,1,1,1,2, 91 | 3,0,0,1,3,1,2,1,2,0,3,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, 92 | 3,3,1,1,3,2,3,3,3,1,3,2,1,3,2,1,3,2,2,2,2,1,3,3,1,2,1,3,1,2,3,0, 93 | 2,1,1,3,2,2,2,1,2,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, 94 | 3,3,2,3,2,3,3,2,3,2,3,2,3,3,2,1,0,3,2,2,2,1,2,2,2,1,2,2,1,2,1,1, 95 | 2,2,2,3,0,1,3,1,1,1,1,0,1,1,0,2,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, 96 | 3,3,3,3,2,3,2,2,1,1,3,2,3,2,3,2,0,3,2,2,1,2,0,2,2,2,1,2,2,2,2,1, 97 | 3,2,1,2,2,1,0,2,0,1,0,0,1,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1, 98 | 
3,3,3,3,3,2,3,1,2,3,3,2,2,3,0,1,1,2,0,3,3,2,2,3,0,1,1,3,0,0,0,0, 99 | 3,1,0,3,3,0,2,0,2,1,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 100 | 3,3,3,2,3,2,3,3,0,1,3,1,1,2,1,2,1,1,3,1,1,0,2,3,1,1,1,1,1,1,1,1, 101 | 3,1,1,2,2,2,2,1,1,1,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, 102 | 3,2,2,1,1,2,1,3,3,2,3,2,2,3,2,2,3,1,2,2,1,2,0,3,2,1,2,2,2,2,2,1, 103 | 3,2,1,2,2,2,1,1,1,1,0,0,1,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, 104 | 3,3,3,3,3,3,3,3,1,3,3,0,2,1,0,3,2,0,0,3,1,0,1,1,0,1,0,0,0,0,0,1, 105 | 1,0,0,1,0,3,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 106 | 3,0,2,2,2,3,0,0,1,3,0,3,2,0,3,2,2,3,3,3,3,3,1,0,2,2,2,0,2,2,1,2, 107 | 0,2,3,0,0,0,0,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, 108 | 3,0,2,3,1,3,3,2,3,3,0,3,3,0,3,2,2,3,2,3,3,3,0,0,2,2,3,0,1,1,1,3, 109 | 0,0,3,0,0,0,2,2,0,1,3,0,1,2,2,2,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1, 110 | 3,2,3,3,2,0,3,3,2,2,3,1,3,2,1,3,2,0,1,2,2,0,2,3,2,1,0,3,0,0,0,0, 111 | 3,0,0,2,3,1,3,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 112 | 3,1,3,2,2,2,1,2,0,1,3,1,1,3,1,3,0,0,2,1,1,1,1,2,1,1,1,0,2,1,0,1, 113 | 1,2,0,0,0,3,1,1,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,3,1,0,0,0,1,0, 114 | 3,3,3,3,2,2,2,2,2,1,3,1,1,1,2,0,1,1,2,1,2,1,3,2,0,0,3,1,1,1,1,1, 115 | 3,1,0,2,3,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 116 | 0,0,0,2,3,0,3,3,0,2,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0, 117 | 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 118 | 0,0,2,3,1,3,0,0,1,2,0,0,2,0,3,3,2,3,3,3,2,3,0,0,2,2,2,0,0,0,2,2, 119 | 0,0,1,0,0,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, 120 | 0,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,1,2,3,1,3,3,0,0,1,0,3,0,0,0,0,0, 121 | 0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 122 | 3,3,1,2,3,1,2,3,1,0,3,0,2,2,1,0,2,1,1,2,0,1,0,0,1,1,1,1,0,1,0,0, 123 | 1,0,0,0,0,1,1,0,3,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 124 | 3,3,3,3,2,1,0,1,1,1,3,1,2,2,2,2,2,2,1,1,1,1,0,3,1,0,1,3,1,1,1,1, 125 | 1,1,0,2,0,1,3,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,1, 126 | 
3,0,2,2,1,3,3,2,3,3,0,1,1,0,2,2,1,2,1,3,3,1,0,0,3,2,0,0,0,0,2,1, 127 | 0,1,0,0,0,0,1,2,0,1,1,3,1,1,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 128 | 0,0,3,0,0,1,0,0,0,3,0,0,3,0,3,1,0,1,1,1,3,2,0,0,0,3,0,0,0,0,2,0, 129 | 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0, 130 | 3,3,1,3,2,1,3,3,1,2,2,0,1,2,1,0,1,2,0,0,0,0,0,3,0,0,0,3,0,0,0,0, 131 | 3,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 132 | 3,0,1,2,0,3,3,3,2,2,0,1,1,0,1,3,0,0,0,2,2,0,0,0,0,3,1,0,1,0,0,0, 133 | 0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 134 | 3,0,2,3,1,2,0,0,2,1,0,3,1,0,1,2,0,1,1,1,1,3,0,0,3,1,1,0,2,2,1,1, 135 | 0,2,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 136 | 3,0,0,3,1,2,0,0,2,2,0,1,2,0,1,0,1,3,1,2,1,0,0,0,2,0,3,0,0,0,1,0, 137 | 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 138 | 3,0,1,1,2,2,0,0,0,2,0,2,1,0,1,1,0,1,1,1,2,1,0,0,1,1,1,0,2,1,1,1, 139 | 0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1, 140 | 0,0,0,2,0,1,3,1,1,1,1,0,0,0,0,3,2,0,1,0,0,0,1,2,0,0,0,1,0,0,0,0, 141 | 0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 142 | 0,0,0,0,0,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, 143 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 144 | 1,0,2,3,2,2,0,0,0,1,0,0,0,0,2,3,2,1,2,2,3,0,0,0,2,3,1,0,0,0,1,1, 145 | 0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0, 146 | 3,3,2,2,0,1,0,0,0,0,2,0,2,0,1,0,0,0,1,1,0,0,0,2,1,0,1,0,1,1,0,0, 147 | 0,1,0,2,0,0,1,0,3,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 148 | 3,3,1,0,0,1,0,0,0,0,0,1,1,2,0,0,0,0,1,0,0,1,3,1,0,0,0,0,1,1,0,0, 149 | 0,1,0,0,0,0,3,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0, 150 | 3,3,1,1,1,1,2,3,0,0,2,1,1,1,1,1,0,2,1,1,0,0,0,2,1,0,1,2,1,1,0,1, 151 | 2,1,0,3,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 152 | 1,3,1,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1, 153 | 0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 154 | 
3,3,2,0,0,0,0,0,0,1,2,1,0,1,1,0,2,0,0,1,0,0,2,0,0,0,0,0,0,0,0,0, 155 | 0,0,0,0,0,0,2,0,0,0,1,3,0,1,0,0,0,2,0,0,0,0,0,0,0,1,2,0,0,0,0,0, 156 | 3,3,0,0,1,1,2,0,0,1,2,1,0,1,1,1,0,1,1,0,0,2,1,1,0,1,0,0,1,1,1,0, 157 | 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 158 | 2,2,2,1,0,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0, 159 | 2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 160 | 2,3,0,0,1,1,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 161 | 0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 162 | 3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 163 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 164 | 1,1,0,1,2,0,1,2,0,0,1,1,0,2,0,1,0,0,1,0,0,0,0,1,0,0,0,2,0,0,0,0, 165 | 1,0,0,1,0,1,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 166 | 0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,2,1,3,0,0,0,0,1,1,0,0,0,0,0,0,0,3, 167 | 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 168 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, 169 | 0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 170 | 2,0,1,0,1,0,0,2,0,0,2,0,0,1,1,2,0,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0, 171 | 1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, 172 | 1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1, 173 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 174 | 3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 175 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0, 176 | 2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0, 177 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 178 | 2,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0, 179 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 180 | 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 181 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,3,0,0,0, 182 | 
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 183 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0, 184 | 1,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0, 185 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 186 | 0,0,1,1,0,0,2,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 187 | 0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 188 | 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 189 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 190 | ) 191 | 192 | TIS620ThaiModel = { 193 | 'char_to_order_map': TIS620CharToOrderMap, 194 | 'precedence_matrix': ThaiLangModel, 195 | 'typical_positive_ratio': 0.926386, 196 | 'keep_english_letter': False, 197 | 'charset_name': "TIS-620", 198 | 'language': 'Thai', 199 | } 200 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright (C) 2020 @9wayhandshake 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /capa_explorer_plugin/chardet/langhebrewmodel.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Simon Montagu 6 | # Portions created by the Initial Developer are Copyright (C) 2005 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # Shoshannah Forbes - original C code (?) 13 | # 14 | # This library is free software; you can redistribute it and/or 15 | # modify it under the terms of the GNU Lesser General Public 16 | # License as published by the Free Software Foundation; either 17 | # version 2.1 of the License, or (at your option) any later version. 
18 | # 19 | # This library is distributed in the hope that it will be useful, 20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 | # Lesser General Public License for more details. 23 | # 24 | # You should have received a copy of the GNU Lesser General Public 25 | # License along with this library; if not, write to the Free Software 26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 27 | # 02110-1301 USA 28 | ######################### END LICENSE BLOCK ######################### 29 | 30 | # 255: Control characters that usually does not exist in any text 31 | # 254: Carriage/Return 32 | # 253: symbol (punctuation) that does not belong to word 33 | # 252: 0 - 9 34 | 35 | # Windows-1255 language model 36 | # Character Mapping Table: 37 | WIN1255_CHAR_TO_ORDER_MAP = ( 38 | 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 39 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 40 | 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 41 | 252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, # 30 42 | 253, 69, 91, 79, 80, 92, 89, 97, 90, 68,111,112, 82, 73, 95, 85, # 40 43 | 78,121, 86, 71, 67,102,107, 84,114,103,115,253,253,253,253,253, # 50 44 | 253, 50, 74, 60, 61, 42, 76, 70, 64, 53,105, 93, 56, 65, 54, 49, # 60 45 | 66,110, 51, 43, 44, 63, 81, 77, 98, 75,108,253,253,253,253,253, # 70 46 | 124,202,203,204,205, 40, 58,206,207,208,209,210,211,212,213,214, 47 | 215, 83, 52, 47, 46, 72, 32, 94,216,113,217,109,218,219,220,221, 48 | 34,116,222,118,100,223,224,117,119,104,125,225,226, 87, 99,227, 49 | 106,122,123,228, 55,229,230,101,231,232,120,233, 48, 39, 57,234, 50 | 30, 59, 41, 88, 33, 37, 36, 31, 29, 35,235, 62, 28,236,126,237, 51 | 238, 38, 45,239,240,241,242,243,127,244,245,246,247,248,249,250, 52 | 9, 8, 20, 16, 3, 2, 24, 14, 22, 1, 25, 15, 4, 11, 6, 23, 53 | 12, 19, 13, 26, 18, 27, 21, 17, 7, 
10, 5,251,252,128, 96,253, 54 | ) 55 | 56 | # Model Table: 57 | # total sequences: 100% 58 | # first 512 sequences: 98.4004% 59 | # first 1024 sequences: 1.5981% 60 | # rest sequences: 0.087% 61 | # negative sequences: 0.0015% 62 | HEBREW_LANG_MODEL = ( 63 | 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 64 | 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, 65 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2, 66 | 1,2,1,2,1,2,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0, 67 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2, 68 | 1,2,1,3,1,1,0,0,2,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 69 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,1,2,2,1,3, 70 | 1,2,1,1,2,2,0,0,2,2,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0, 71 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,2,2,2,3,2, 72 | 1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0, 73 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,2,3,2,2,3,2,2,2,1,2,2,2,2, 74 | 1,2,1,1,2,2,0,1,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0, 75 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,0,2,2,2,2,2, 76 | 0,2,0,2,2,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0, 77 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,0,2,2,2, 78 | 0,2,1,2,2,2,0,0,2,1,0,0,0,0,1,0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0, 79 | 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,2,1,2,3,2,2,2, 80 | 1,2,1,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0, 81 | 3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,3,1,0,2,0,2, 82 | 0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,2,0,0,1,0, 83 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,2,3,2,2,3,2,1,2,1,1,1, 84 | 0,1,1,1,1,1,3,0,1,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 85 | 3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,0,1,0,0,1,0,0,0,0, 86 | 0,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 87 | 
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2, 88 | 0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 89 | 3,3,3,3,3,3,3,3,3,2,3,3,3,2,1,2,3,3,2,3,3,3,3,2,3,2,1,2,0,2,1,2, 90 | 0,2,0,2,2,2,0,0,1,2,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0, 91 | 3,3,3,3,3,3,3,3,3,2,3,3,3,1,2,2,3,3,2,3,2,3,2,2,3,1,2,2,0,2,2,2, 92 | 0,2,1,2,2,2,0,0,1,2,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0, 93 | 3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,2,2,3,3,3,3,1,3,2,2,2, 94 | 0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 95 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,2,2,2,1,2,2,0,2,2,2,2, 96 | 0,2,0,2,2,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 97 | 3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,1,3,2,3,3,2,3,3,2,2,1,2,2,2,2,2,2, 98 | 0,2,1,2,1,2,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0, 99 | 3,3,3,3,3,3,2,3,2,3,3,2,3,3,3,3,2,3,2,3,3,3,3,3,2,2,2,2,2,2,2,1, 100 | 0,2,0,1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0, 101 | 3,3,3,3,3,3,3,3,3,2,1,2,3,3,3,3,3,3,3,2,3,2,3,2,1,2,3,0,2,1,2,2, 102 | 0,2,1,1,2,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0, 103 | 3,3,3,3,3,3,3,3,3,2,3,3,3,3,2,1,3,1,2,2,2,1,2,3,3,1,2,1,2,2,2,2, 104 | 0,1,1,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,2,0,0,0,0,0,0,0,0, 105 | 3,3,3,3,3,3,3,3,3,3,0,2,3,3,3,1,3,3,3,1,2,2,2,2,1,1,2,2,2,2,2,2, 106 | 0,2,0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0, 107 | 3,3,3,3,3,3,2,3,3,3,2,2,3,3,3,2,1,2,3,2,3,2,2,2,2,1,2,1,1,1,2,2, 108 | 0,2,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 109 | 3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,1,0,0,0,0,0, 110 | 1,0,1,0,0,0,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 111 | 3,3,3,3,3,2,3,3,2,3,1,2,2,2,2,3,2,3,1,1,2,2,1,2,2,1,1,0,2,2,2,2, 112 | 0,1,0,1,2,2,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0, 113 | 3,0,0,1,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,0, 114 | 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 115 | 
3,0,1,0,1,0,1,1,0,1,1,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0, 116 | 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 117 | 3,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, 118 | 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 119 | 3,2,2,1,2,2,2,2,2,2,2,1,2,2,1,2,2,1,1,1,1,1,1,1,1,2,1,1,0,3,3,3, 120 | 0,3,0,2,2,2,2,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 121 | 2,2,2,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,1,2,2,2,1,1,1,2,0,1, 122 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 123 | 2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,0,2,2,0,0,0,0,0,0, 124 | 0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 125 | 2,3,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,1,0,2,1,0, 126 | 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 127 | 3,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0, 128 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, 129 | 0,3,1,1,2,2,2,2,2,1,2,2,2,1,1,2,2,2,2,2,2,2,1,2,2,1,0,1,1,1,1,0, 130 | 0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 131 | 3,2,1,1,1,1,2,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0, 132 | 0,0,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0, 133 | 2,1,1,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,1,2,1,2,1,1,1,1,0,0,0,0, 134 | 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 135 | 1,2,1,2,2,2,2,2,2,2,2,2,2,1,2,1,2,1,1,2,1,1,1,2,1,2,1,2,0,1,0,1, 136 | 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 137 | 0,3,1,2,2,2,1,2,2,2,2,2,2,2,2,1,2,1,1,1,1,1,1,2,1,2,1,1,0,1,0,1, 138 | 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 139 | 2,1,2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2, 140 | 0,2,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 141 | 3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0, 142 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 143 | 
2,1,1,1,1,1,1,1,0,1,1,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0, 144 | 0,0,0,0,0,0,0,0,2,0,1,1,1,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,1,1,0,0, 145 | 0,1,1,1,2,1,2,2,2,0,2,0,2,0,1,1,2,1,1,1,1,2,1,0,1,1,0,0,0,0,0,0, 146 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 147 | 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 148 | 1,0,1,0,0,0,0,0,1,0,1,2,2,0,1,0,0,1,1,2,2,1,2,0,2,0,0,0,1,2,0,1, 149 | 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 150 | 0,0,0,0,0,0,0,0,2,0,2,1,2,0,2,0,0,1,1,1,1,1,1,0,1,0,0,0,1,0,0,1, 151 | 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 152 | 0,0,1,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,1,2,2,0,0,1,0,0,0,1,0,0,1, 153 | 1,1,2,1,0,1,1,1,0,1,0,1,1,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,2,2,1, 154 | 0,2,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 155 | 2,1,0,0,1,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0, 156 | 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 157 | 1,1,1,1,1,1,1,1,1,2,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0, 158 | 0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,1,1,0,1,0,0,0,1,1,0,1, 159 | 2,0,1,0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 160 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 161 | 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 162 | 0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,1,1,2,1,1,2,0,1,0,0,0,1,1,0,1, 163 | 1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 164 | 0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,0,0,2,1,1,2,0,2,0,0,0,1,1,0,1, 165 | 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 166 | 0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,2,2,1,2,1,1,0,1,0,0,0,1,1,0,1, 167 | 2,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 168 | 0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,1,0,1, 169 | 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 170 | 0,0,0,0,0,0,0,0,0,0,1,2,2,0,0,0,0,2,1,1,1,0,2,1,1,0,0,0,2,1,0,1, 171 | 
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 172 | 0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,0,2,1,1,0,1,0,0,0,1,1,0,1, 173 | 2,2,1,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0, 174 | 0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 175 | 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 176 | 0,0,0,0,0,0,0,0,1,0,2,1,1,0,1,0,0,1,1,0,1,2,1,0,2,0,0,0,1,1,0,1, 177 | 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 178 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 179 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 180 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0, 181 | 0,1,0,0,2,0,2,1,1,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0, 182 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 183 | 1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 184 | 0,0,0,0,0,0,0,0,1,0,1,1,2,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,1,0,0,1, 185 | 1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 186 | 1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,2,1,1,1,1,1,0,1,0,0,0,0,1,0,1, 187 | 0,1,1,1,2,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0, 188 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 189 | 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 190 | 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0, 191 | ) 192 | 193 | Win1255HebrewModel = { 194 | 'char_to_order_map': WIN1255_CHAR_TO_ORDER_MAP, 195 | 'precedence_matrix': HEBREW_LANG_MODEL, 196 | 'typical_positive_ratio': 0.984004, 197 | 'keep_english_letter': False, 198 | 'charset_name': "windows-1255", 199 | 'language': 'Hebrew', 200 | } 201 | -------------------------------------------------------------------------------- /capa_explorer_plugin/chardet/langhungarianmodel.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK 
######################## 2 | # The Original Code is Mozilla Communicator client code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | # 255: Control characters that usually does not exist in any text 29 | # 254: Carriage/Return 30 | # 253: symbol (punctuation) that does not belong to word 31 | # 252: 0 - 9 32 | 33 | # Character Mapping Table: 34 | Latin2_HungarianCharToOrderMap = ( 35 | 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 36 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 37 | 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 38 | 252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, # 30 39 | 253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47, 40 | 46, 71, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253, 41 | 253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 
6, 13, 4, 8, 42 | 23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253, 43 | 159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174, 44 | 175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190, 45 | 191,192,193,194,195,196,197, 75,198,199,200,201,202,203,204,205, 46 | 79,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220, 47 | 221, 51, 81,222, 78,223,224,225,226, 44,227,228,229, 61,230,231, 48 | 232,233,234, 58,235, 66, 59,236,237,238, 60, 69, 63,239,240,241, 49 | 82, 14, 74,242, 70, 80,243, 72,244, 15, 83, 77, 84, 30, 76, 85, 50 | 245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253, 51 | ) 52 | 53 | win1250HungarianCharToOrderMap = ( 54 | 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 55 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 56 | 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 57 | 252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, # 30 58 | 253, 28, 40, 54, 45, 32, 50, 49, 38, 39, 53, 36, 41, 34, 35, 47, 59 | 46, 72, 43, 33, 37, 57, 48, 64, 68, 55, 52,253,253,253,253,253, 60 | 253, 2, 18, 26, 17, 1, 27, 12, 20, 9, 22, 7, 6, 13, 4, 8, 61 | 23, 67, 10, 5, 3, 21, 19, 65, 62, 16, 11,253,253,253,253,253, 62 | 161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176, 63 | 177,178,179,180, 78,181, 69,182,183,184,185,186,187,188,189,190, 64 | 191,192,193,194,195,196,197, 76,198,199,200,201,202,203,204,205, 65 | 81,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220, 66 | 221, 51, 83,222, 80,223,224,225,226, 44,227,228,229, 61,230,231, 67 | 232,233,234, 58,235, 66, 59,236,237,238, 60, 70, 63,239,240,241, 68 | 84, 14, 75,242, 71, 82,243, 73,244, 15, 85, 79, 86, 30, 77, 87, 69 | 245,246,247, 25, 74, 42, 24,248,249,250, 31, 56, 29,251,252,253, 70 | ) 71 | 72 | # Model Table: 73 | # total sequences: 100% 74 | # first 512 sequences: 94.7368% 75 | # first 1024 sequences:5.2623% 76 | # rest sequences: 0.8894% 77 | # negative 
sequences: 0.0009% 78 | HungarianLangModel = ( 79 | 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 80 | 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, 81 | 3,2,2,3,3,3,3,3,2,3,3,3,3,3,3,1,2,3,3,3,3,2,3,3,1,1,3,3,0,1,1,1, 82 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0, 83 | 3,2,1,3,3,3,3,3,2,3,3,3,3,3,1,1,2,3,3,3,3,3,3,3,1,1,3,2,0,1,1,1, 84 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 85 | 3,3,3,3,3,3,3,3,3,3,3,1,1,2,3,3,3,1,3,3,3,3,3,1,3,3,2,2,0,3,2,3, 86 | 0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0, 87 | 3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,3,3,2,3,3,2,2,3,2,3,2,0,3,2,2, 88 | 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0, 89 | 3,3,3,3,3,3,2,3,3,3,3,3,2,3,3,3,1,2,3,2,2,3,1,2,3,3,2,2,0,3,3,3, 90 | 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 91 | 3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,3,2,3,3,3,3,2,3,3,3,3,0,2,3,2, 92 | 0,0,0,1,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 93 | 3,3,3,3,3,3,3,3,3,3,3,1,1,1,3,3,2,1,3,2,2,3,2,1,3,2,2,1,0,3,3,1, 94 | 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 95 | 3,2,2,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,3,2,2,3,1,1,3,2,0,1,1,1, 96 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 97 | 3,3,3,3,3,3,3,3,2,2,3,3,3,3,3,2,1,3,3,3,3,3,2,2,1,3,3,3,0,1,1,2, 98 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0, 99 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,2,3,3,2,3,3,3,2,0,3,2,3, 100 | 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0, 101 | 3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,1,3,2,2,2,3,1,1,3,3,1,1,0,3,3,2, 102 | 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 103 | 3,3,3,3,3,3,3,2,3,3,3,2,3,2,3,3,3,2,3,3,3,3,3,1,2,3,2,2,0,2,2,2, 104 | 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 105 | 3,3,3,2,2,2,3,1,3,3,2,2,1,3,3,3,1,1,3,1,2,3,2,3,2,2,2,1,0,2,2,2, 106 | 
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0, 107 | 3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,2,1,3,3,3,2,2,3,2,1,0,3,2,0,1,1,0, 108 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 109 | 3,1,1,3,3,3,3,3,1,2,3,3,3,3,1,1,0,3,3,3,3,0,2,3,0,0,2,1,0,1,0,0, 110 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 111 | 3,3,3,3,3,3,2,2,3,3,2,2,2,2,3,3,0,1,2,3,2,3,2,2,3,2,1,2,0,2,2,2, 112 | 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0, 113 | 3,3,3,3,3,3,1,2,3,3,3,2,1,2,3,3,2,2,2,3,2,3,3,1,3,3,1,1,0,2,3,2, 114 | 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 115 | 3,3,3,1,2,2,2,2,3,3,3,1,1,1,3,3,1,1,3,1,1,3,2,1,2,3,1,1,0,2,2,2, 116 | 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 117 | 3,3,3,2,1,2,1,1,3,3,1,1,1,1,3,3,1,1,2,2,1,2,1,1,2,2,1,1,0,2,2,1, 118 | 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 119 | 3,3,3,1,1,2,1,1,3,3,1,0,1,1,3,3,2,0,1,1,2,3,1,0,2,2,1,0,0,1,3,2, 120 | 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 121 | 3,2,1,3,3,3,3,3,1,2,3,2,3,3,2,1,1,3,2,3,2,1,2,2,0,1,2,1,0,0,1,1, 122 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 123 | 3,3,3,3,2,2,2,2,3,1,2,2,1,1,3,3,0,3,2,1,2,3,2,1,3,3,1,1,0,2,1,3, 124 | 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 125 | 3,3,3,2,2,2,3,2,3,3,3,2,1,1,3,3,1,1,1,2,2,3,2,3,2,2,2,1,0,2,2,1, 126 | 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 127 | 1,0,0,3,3,3,3,3,0,0,3,3,2,3,0,0,0,2,3,3,1,0,1,2,0,0,1,1,0,0,0,0, 128 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 129 | 3,1,2,3,3,3,3,3,1,2,3,3,2,2,1,1,0,3,3,2,2,1,2,2,1,0,2,2,0,1,1,1, 130 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 131 | 3,3,2,2,1,3,1,2,3,3,2,2,1,1,2,2,1,1,1,1,3,2,1,1,1,1,2,1,0,1,2,1, 132 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0, 133 | 2,3,3,1,1,1,1,1,3,3,3,0,1,1,3,3,1,1,1,1,1,2,2,0,3,1,1,2,0,2,1,1, 134 | 
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 135 | 3,1,0,1,2,1,2,2,0,1,2,3,1,2,0,0,0,2,1,1,1,1,1,2,0,0,1,1,0,0,0,0, 136 | 1,2,1,2,2,2,1,2,1,2,0,2,0,2,2,1,1,2,1,1,2,1,1,1,0,1,0,0,0,1,1,0, 137 | 1,1,1,2,3,2,3,3,0,1,2,2,3,1,0,1,0,2,1,2,2,0,1,1,0,0,1,1,0,0,0,0, 138 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 139 | 1,0,0,3,3,2,2,1,0,0,3,2,3,2,0,0,0,1,1,3,0,0,1,1,0,0,2,1,0,0,0,0, 140 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 141 | 3,1,1,2,2,3,3,1,0,1,3,2,3,1,1,1,0,1,1,1,1,1,3,1,0,0,2,2,0,0,0,0, 142 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 143 | 3,1,1,1,2,2,2,1,0,1,2,3,3,2,0,0,0,2,1,1,1,2,1,1,1,0,1,1,1,0,0,0, 144 | 1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,2,1,1,1,1,1,1,0,1,1,1,0,0,1,1, 145 | 3,2,2,1,0,0,1,1,2,2,0,3,0,1,2,1,1,0,0,1,1,1,0,1,1,1,1,0,2,1,1,1, 146 | 2,2,1,1,1,2,1,2,1,1,1,1,1,1,1,2,1,1,1,2,3,1,1,1,1,1,1,1,1,1,0,1, 147 | 2,3,3,0,1,0,0,0,3,3,1,0,0,1,2,2,1,0,0,0,0,2,0,0,1,1,1,0,2,1,1,1, 148 | 2,1,1,1,1,1,1,2,1,1,0,1,1,0,1,1,1,0,1,2,1,1,0,1,1,1,1,1,1,1,0,1, 149 | 2,3,3,0,1,0,0,0,2,2,0,0,0,0,1,2,2,0,0,0,0,1,0,0,1,1,0,0,2,0,1,0, 150 | 2,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1, 151 | 3,2,2,0,1,0,1,0,2,3,2,0,0,1,2,2,1,0,0,1,1,1,0,0,2,1,0,1,2,2,1,1, 152 | 2,1,1,1,1,1,1,2,1,1,1,1,1,1,0,2,1,0,1,1,0,1,1,1,0,1,1,2,1,1,0,1, 153 | 2,2,2,0,0,1,0,0,2,2,1,1,0,0,2,1,1,0,0,0,1,2,0,0,2,1,0,0,2,1,1,1, 154 | 2,1,1,1,1,2,1,2,1,1,1,2,2,1,1,2,1,1,1,2,1,1,1,1,1,1,1,1,1,1,0,1, 155 | 1,2,3,0,0,0,1,0,3,2,1,0,0,1,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,2,1, 156 | 1,1,0,0,0,1,0,1,1,1,1,1,2,0,0,1,0,0,0,2,0,0,1,1,1,1,1,1,1,1,0,1, 157 | 3,0,0,2,1,2,2,1,0,0,2,1,2,2,0,0,0,2,1,1,1,0,1,1,0,0,1,1,2,0,0,0, 158 | 1,2,1,2,2,1,1,2,1,2,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,0,0,1, 159 | 1,3,2,0,0,0,1,0,2,2,2,0,0,0,2,2,1,0,0,0,0,3,1,1,1,1,0,0,2,1,1,1, 160 | 2,1,0,1,1,1,0,1,1,1,1,1,1,1,0,2,1,0,0,1,0,1,1,0,1,1,1,1,1,1,0,1, 161 | 2,3,2,0,0,0,1,0,2,2,0,0,0,0,2,1,1,0,0,0,0,2,1,0,1,1,0,0,2,1,1,0, 162 | 
2,1,1,1,1,2,1,2,1,2,0,1,1,1,0,2,1,1,1,2,1,1,1,1,0,1,1,1,1,1,0,1, 163 | 3,1,1,2,2,2,3,2,1,1,2,2,1,1,0,1,0,2,2,1,1,1,1,1,0,0,1,1,0,1,1,0, 164 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 165 | 2,2,2,0,0,0,0,0,2,2,0,0,0,0,2,2,1,0,0,0,1,1,0,0,1,2,0,0,2,1,1,1, 166 | 2,2,1,1,1,2,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,1,1,0,1,2,1,1,1,0,1, 167 | 1,0,0,1,2,3,2,1,0,0,2,0,1,1,0,0,0,1,1,1,1,0,1,1,0,0,1,0,0,0,0,0, 168 | 1,2,1,2,1,2,1,1,1,2,0,2,1,1,1,0,1,2,0,0,1,1,1,0,0,0,0,0,0,0,0,0, 169 | 2,3,2,0,0,0,0,0,1,1,2,1,0,0,1,1,1,0,0,0,0,2,0,0,1,1,0,0,2,1,1,1, 170 | 2,1,1,1,1,1,1,2,1,0,1,1,1,1,0,2,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1, 171 | 1,2,2,0,1,1,1,0,2,2,2,0,0,0,3,2,1,0,0,0,1,1,0,0,1,1,0,1,1,1,0,0, 172 | 1,1,0,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,2,1,1,1,0,0,1,1,1,0,1,0,1, 173 | 2,1,0,2,1,1,2,2,1,1,2,1,1,1,0,0,0,1,1,0,1,1,1,1,0,0,1,1,1,0,0,0, 174 | 1,2,2,2,2,2,1,1,1,2,0,2,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,0, 175 | 1,2,3,0,0,0,1,0,2,2,0,0,0,0,2,2,0,0,0,0,0,1,0,0,1,0,0,0,2,0,1,0, 176 | 2,1,1,1,1,1,0,2,0,0,0,1,2,1,1,1,1,0,1,2,0,1,0,1,0,1,1,1,0,1,0,1, 177 | 2,2,2,0,0,0,1,0,2,1,2,0,0,0,1,1,2,0,0,0,0,1,0,0,1,1,0,0,2,1,0,1, 178 | 2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,0,1,1,1,1,1,0,1, 179 | 1,2,2,0,0,0,1,0,2,2,2,0,0,0,1,1,0,0,0,0,0,1,1,0,2,0,0,1,1,1,0,1, 180 | 1,0,1,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,1,1,0,1,0,1,1,1,1,1,0,0,0,1, 181 | 1,0,0,1,0,1,2,1,0,0,1,1,1,2,0,0,0,1,1,0,1,0,1,1,0,0,1,0,0,0,0,0, 182 | 0,2,1,2,1,1,1,1,1,2,0,2,0,1,1,0,1,2,1,0,1,1,1,0,0,0,0,0,0,1,0,0, 183 | 2,1,1,0,1,2,0,0,1,1,1,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,2,1,0,1, 184 | 2,2,1,1,1,1,1,2,1,1,0,1,1,1,1,2,1,1,1,2,1,1,0,1,0,1,1,1,1,1,0,1, 185 | 1,2,2,0,0,0,0,0,1,1,0,0,0,0,2,1,0,0,0,0,0,2,0,0,2,2,0,0,2,0,0,1, 186 | 2,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1, 187 | 1,1,2,0,0,3,1,0,2,1,1,1,0,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,1,0,1,0, 188 | 1,2,1,0,1,1,1,2,1,1,0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,0,0,0,1,0,0, 189 | 2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,2,0,0,0, 190 | 
2,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,2,1,1,0,0,1,1,1,1,1,0,1, 191 | 2,1,1,1,2,1,1,1,0,1,1,2,1,0,0,0,0,1,1,1,1,0,1,0,0,0,0,1,0,0,0,0, 192 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 193 | 1,1,0,1,1,1,1,1,0,0,1,1,2,1,0,0,0,1,1,0,0,0,1,1,0,0,1,0,1,0,0,0, 194 | 1,2,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,1,0,0, 195 | 2,0,0,0,1,1,1,1,0,0,1,1,0,0,0,0,0,1,1,1,2,0,0,1,0,0,1,0,1,0,0,0, 196 | 0,1,1,1,1,1,1,1,1,2,0,1,1,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0, 197 | 1,0,0,1,1,1,1,1,0,0,2,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0, 198 | 0,1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0, 199 | 1,0,0,1,1,1,0,0,0,0,1,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0, 200 | 0,1,1,1,1,1,0,0,1,1,0,1,0,1,0,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0, 201 | 0,0,0,1,0,0,0,0,0,0,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 202 | 0,1,1,1,0,1,0,0,1,1,0,1,0,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0, 203 | 2,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,1,0, 204 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 205 | 1,0,0,1,1,1,1,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, 206 | 0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, 207 | ) 208 | 209 | Latin2HungarianModel = { 210 | 'char_to_order_map': Latin2_HungarianCharToOrderMap, 211 | 'precedence_matrix': HungarianLangModel, 212 | 'typical_positive_ratio': 0.947368, 213 | 'keep_english_letter': True, 214 | 'charset_name': "ISO-8859-2", 215 | 'language': 'Hungarian', 216 | } 217 | 218 | Win1250HungarianModel = { 219 | 'char_to_order_map': win1250HungarianCharToOrderMap, 220 | 'precedence_matrix': HungarianLangModel, 221 | 'typical_positive_ratio': 0.947368, 222 | 'keep_english_letter': True, 223 | 'charset_name': "windows-1250", 224 | 'language': 'Hungarian', 225 | } 226 | -------------------------------------------------------------------------------- /capa_explorer_plugin/chardet/langgreekmodel.py: 
-------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Communicator client code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 
21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | # 255: Control characters that usually does not exist in any text 29 | # 254: Carriage/Return 30 | # 253: symbol (punctuation) that does not belong to word 31 | # 252: 0 - 9 32 | 33 | # Character Mapping Table: 34 | Latin7_char_to_order_map = ( 35 | 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 36 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 37 | 253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 38 | 252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, # 30 39 | 253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, # 40 40 | 79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, # 50 41 | 253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, # 60 42 | 78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, # 70 43 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 80 44 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 90 45 | 253,233, 90,253,253,253,253,253,253,253,253,253,253, 74,253,253, # a0 46 | 253,253,253,253,247,248, 61, 36, 46, 71, 73,253, 54,253,108,123, # b0 47 | 110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, # c0 48 | 35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, # d0 49 | 124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, # e0 50 | 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0 51 | ) 52 | 53 | win1253_char_to_order_map = ( 54 | 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, # 00 55 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 10 56 | 
253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253, # 20 57 | 252,252,252,252,252,252,252,252,252,252,253,253,253,253,253,253, # 30 58 | 253, 82,100,104, 94, 98,101,116,102,111,187,117, 92, 88,113, 85, # 40 59 | 79,118,105, 83, 67,114,119, 95, 99,109,188,253,253,253,253,253, # 50 60 | 253, 72, 70, 80, 81, 60, 96, 93, 89, 68,120, 97, 77, 86, 69, 55, # 60 61 | 78,115, 65, 66, 58, 76,106,103, 87,107,112,253,253,253,253,253, # 70 62 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 80 63 | 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, # 90 64 | 253,233, 61,253,253,253,253,253,253,253,253,253,253, 74,253,253, # a0 65 | 253,253,253,253,247,253,253, 36, 46, 71, 73,253, 54,253,108,123, # b0 66 | 110, 31, 51, 43, 41, 34, 91, 40, 52, 47, 44, 53, 38, 49, 59, 39, # c0 67 | 35, 48,250, 37, 33, 45, 56, 50, 84, 57,120,121, 17, 18, 22, 15, # d0 68 | 124, 1, 29, 20, 21, 3, 32, 13, 25, 5, 11, 16, 10, 6, 30, 4, # e0 69 | 9, 8, 14, 7, 2, 12, 28, 23, 42, 24, 64, 75, 19, 26, 27,253, # f0 70 | ) 71 | 72 | # Model Table: 73 | # total sequences: 100% 74 | # first 512 sequences: 98.2851% 75 | # first 1024 sequences:1.7001% 76 | # rest sequences: 0.0359% 77 | # negative sequences: 0.0148% 78 | GreekLangModel = ( 79 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 80 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 81 | 0,0,3,2,2,3,3,3,3,3,3,3,3,1,3,3,3,0,2,2,3,3,0,3,0,3,2,0,3,3,3,0, 82 | 3,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 83 | 0,3,3,3,3,3,0,3,3,0,3,2,3,3,0,3,2,3,3,3,0,0,3,0,3,0,3,3,2,0,0,0, 84 | 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, 85 | 0,2,3,2,2,3,3,3,3,3,3,3,3,0,3,3,3,3,0,2,3,3,0,3,3,3,3,2,3,3,3,0, 86 | 2,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 87 | 0,2,3,3,2,3,3,3,3,3,3,3,3,3,3,3,3,0,2,1,3,3,3,3,2,3,3,2,3,3,2,0, 88 | 0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 89 | 
0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,0, 90 | 2,0,1,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 91 | 0,3,3,3,3,3,2,3,0,0,0,0,3,3,0,3,1,3,3,3,0,3,3,0,3,3,3,3,0,0,0,0, 92 | 2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 93 | 0,3,3,3,3,3,0,3,0,3,3,3,3,3,0,3,2,2,2,3,0,2,3,3,3,3,3,2,3,3,0,0, 94 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 95 | 0,3,3,3,3,3,3,2,2,2,3,3,3,3,0,3,1,3,3,3,3,2,3,3,3,3,3,3,3,2,2,0, 96 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 97 | 0,3,3,3,3,3,2,0,3,0,0,0,3,3,2,3,3,3,3,3,0,0,3,2,3,0,2,3,0,0,0,0, 98 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 99 | 0,3,0,3,3,3,3,0,0,3,3,0,2,3,0,3,0,3,3,3,0,0,3,0,3,0,2,2,3,3,0,0, 100 | 0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 101 | 0,3,3,3,3,3,2,0,3,2,3,3,3,3,0,3,3,3,3,3,0,3,3,2,3,2,3,3,2,0,0,0, 102 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 103 | 0,3,3,2,3,2,3,3,3,3,3,3,0,2,3,2,3,2,2,2,3,2,3,3,2,3,0,2,2,2,3,0, 104 | 2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 105 | 0,0,3,0,0,0,3,3,3,2,3,3,0,0,3,0,3,0,0,0,3,2,0,3,0,3,0,0,2,0,2,0, 106 | 0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 107 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 108 | 0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 109 | 0,3,3,3,3,0,3,3,3,3,3,3,0,3,3,0,3,0,0,0,3,3,0,3,3,3,0,0,1,2,3,0, 110 | 3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 111 | 0,3,3,3,3,3,2,0,0,3,2,2,3,3,0,3,3,3,3,3,2,1,3,0,3,2,3,3,2,1,0,0, 112 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 113 | 0,0,3,3,0,2,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,3,0,3,2,3,0,0,3,3,3,0, 114 | 3,0,0,0,2,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 115 | 0,3,3,3,3,0,3,3,3,3,3,3,0,0,3,0,3,0,0,0,3,2,0,3,2,3,0,0,3,2,3,0, 116 | 2,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 117 | 
0,0,3,1,2,2,3,3,3,3,3,3,0,2,3,0,3,0,0,0,3,3,0,3,0,2,0,0,2,3,1,0, 118 | 2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 119 | 0,3,0,3,3,3,3,0,3,0,3,3,2,3,0,3,3,3,3,3,3,0,3,3,3,0,2,3,0,0,3,0, 120 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 121 | 0,3,0,3,3,3,0,0,3,0,0,0,3,3,0,3,0,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0, 122 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 123 | 0,0,3,0,0,0,3,3,3,3,3,3,0,0,3,0,2,0,0,0,3,3,0,3,0,3,0,0,2,0,2,0, 124 | 0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 125 | 0,3,3,3,3,3,3,0,3,0,2,0,3,2,0,3,2,3,2,3,0,0,3,2,3,2,3,3,0,0,0,0, 126 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 127 | 0,0,3,0,0,2,3,3,3,3,3,0,0,0,3,0,2,1,0,0,3,2,2,2,0,3,0,0,2,2,0,0, 128 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 129 | 0,3,0,3,3,3,2,0,3,0,3,0,3,3,0,2,1,2,3,3,0,0,3,0,3,0,3,3,0,0,0,0, 130 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 131 | 0,2,3,3,3,0,3,3,3,3,3,3,0,2,3,0,3,0,0,0,2,1,0,2,2,3,0,0,2,2,2,0, 132 | 0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 133 | 0,0,3,0,0,2,3,3,3,2,3,0,0,1,3,0,2,0,0,0,0,3,0,1,0,2,0,0,1,1,1,0, 134 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 135 | 0,3,3,3,3,3,1,0,3,0,0,0,3,2,0,3,2,3,3,3,0,0,3,0,3,2,2,2,1,0,0,0, 136 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 137 | 0,3,0,3,3,3,0,0,3,0,0,0,0,2,0,2,3,3,2,2,2,2,3,0,2,0,2,2,0,0,0,0, 138 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 139 | 0,3,3,3,3,2,0,0,0,0,0,0,2,3,0,2,0,2,3,2,0,0,3,0,3,0,3,1,0,0,0,0, 140 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 141 | 0,0,0,0,0,0,3,2,3,3,2,2,3,0,2,0,3,0,0,0,2,0,0,0,0,1,2,0,2,0,2,0, 142 | 0,2,0,2,0,2,2,0,0,1,0,2,2,2,0,2,2,2,0,2,2,2,0,0,2,0,0,1,0,0,0,0, 143 | 0,2,0,3,3,2,0,0,0,0,0,0,1,3,0,2,0,2,2,2,0,0,2,0,3,0,0,2,0,0,0,0, 144 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 145 | 
0,3,0,2,3,2,0,2,2,0,2,0,2,2,0,2,0,2,2,2,0,0,0,0,0,0,2,3,0,0,0,2, 146 | 0,1,2,0,0,0,0,2,2,0,0,0,2,1,0,2,2,0,0,0,0,0,0,1,0,2,0,0,0,0,0,0, 147 | 0,0,2,1,0,2,3,2,2,3,2,3,2,0,0,3,3,3,0,0,3,2,0,0,0,1,1,0,2,0,2,2, 148 | 0,2,0,2,0,2,2,0,0,2,0,2,2,2,0,2,2,2,2,0,0,2,0,0,0,2,0,1,0,0,0,0, 149 | 0,3,0,3,3,2,2,0,3,0,0,0,2,2,0,2,2,2,1,2,0,0,1,2,2,0,0,3,0,0,0,2, 150 | 0,1,2,0,0,0,1,2,0,0,0,0,0,0,0,2,2,0,1,0,0,2,0,0,0,2,0,0,0,0,0,0, 151 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 152 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 153 | 0,2,3,3,2,2,0,0,0,2,0,2,3,3,0,2,0,0,0,0,0,0,2,2,2,0,2,2,0,2,0,2, 154 | 0,2,2,0,0,2,2,2,2,1,0,0,2,2,0,2,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0, 155 | 0,2,0,3,2,3,0,0,0,3,0,0,2,2,0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,0,2, 156 | 0,0,2,2,0,0,2,2,2,0,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0, 157 | 0,0,2,0,0,3,2,0,2,2,2,2,2,0,0,0,2,0,0,0,0,2,0,1,0,0,2,0,1,0,0,0, 158 | 0,2,2,2,0,2,2,0,1,2,0,2,2,2,0,2,2,2,2,1,2,2,0,0,2,0,0,0,0,0,0,0, 159 | 0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 160 | 0,2,0,2,0,2,2,0,0,0,0,1,2,1,0,0,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0, 161 | 0,0,0,3,2,3,0,0,2,0,0,0,2,2,0,2,0,0,0,1,0,0,2,0,2,0,2,2,0,0,0,0, 162 | 0,0,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0, 163 | 0,2,2,3,2,2,0,0,0,0,0,0,1,3,0,2,0,2,2,0,0,0,1,0,2,0,0,0,0,0,0,0, 164 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 165 | 0,2,0,2,0,3,2,0,2,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, 166 | 0,0,2,0,0,0,0,1,1,0,0,2,1,2,0,2,2,0,1,0,0,1,0,0,0,2,0,0,0,0,0,0, 167 | 0,3,0,2,2,2,0,0,2,0,0,0,2,0,0,0,2,3,0,2,0,0,0,0,0,0,2,2,0,0,0,2, 168 | 0,1,2,0,0,0,1,2,2,1,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 169 | 0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 170 | 0,2,1,2,0,2,2,0,2,0,0,2,0,0,0,0,1,2,1,0,2,1,0,0,0,0,0,0,0,0,0,0, 171 | 0,0,2,0,0,0,3,1,2,2,0,2,0,0,0,0,2,0,0,0,2,0,0,3,0,0,0,0,2,2,2,0, 172 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 173 | 
0,2,1,0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,2, 174 | 0,2,2,0,0,2,2,2,2,2,0,1,2,0,0,0,2,2,0,1,0,2,0,0,2,2,0,0,0,0,0,0, 175 | 0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,2,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,2, 176 | 0,1,2,0,0,0,0,2,2,1,0,1,0,1,0,2,2,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0, 177 | 0,2,0,1,2,0,0,0,0,0,0,0,0,0,0,2,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,2, 178 | 0,2,2,0,0,0,0,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0, 179 | 0,2,2,2,2,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,1, 180 | 0,0,2,0,0,0,0,1,2,0,0,0,0,0,0,2,2,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0, 181 | 0,2,0,2,2,2,0,0,2,0,0,0,0,0,0,0,2,2,2,0,0,0,2,0,0,0,0,0,0,0,0,2, 182 | 0,0,1,0,0,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, 183 | 0,3,0,2,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,2, 184 | 0,0,2,0,0,0,0,2,2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 185 | 0,2,0,2,2,1,0,0,0,0,0,0,2,0,0,2,0,2,2,2,0,0,0,0,0,0,2,0,0,0,0,2, 186 | 0,0,2,0,0,2,0,2,2,0,0,0,0,2,0,2,0,0,0,0,0,2,0,0,0,2,0,0,0,0,0,0, 187 | 0,0,3,0,0,0,2,2,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0, 188 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 189 | 0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 190 | 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0, 191 | 0,2,2,2,2,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1, 192 | 0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 193 | 0,0,0,0,0,0,0,2,2,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 194 | 0,2,0,0,0,2,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0, 195 | 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 196 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,2,0,0,0, 197 | 0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0, 198 | 0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 199 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 200 | 0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,2,0,2,0,0,0, 201 | 
0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,2,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0, 202 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 203 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 204 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 205 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 206 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 207 | ) 208 | 209 | Latin7GreekModel = { 210 | 'char_to_order_map': Latin7_char_to_order_map, 211 | 'precedence_matrix': GreekLangModel, 212 | 'typical_positive_ratio': 0.982851, 213 | 'keep_english_letter': False, 214 | 'charset_name': "ISO-8859-7", 215 | 'language': 'Greek', 216 | } 217 | 218 | Win1253GreekModel = { 219 | 'char_to_order_map': win1253_char_to_order_map, 220 | 'precedence_matrix': GreekLangModel, 221 | 'typical_positive_ratio': 0.982851, 222 | 'keep_english_letter': False, 223 | 'charset_name': "windows-1253", 224 | 'language': 'Greek', 225 | } 226 | -------------------------------------------------------------------------------- /capa_explorer_plugin/chardet/universaldetector.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
"""
Module containing the UniversalDetector detector class, which is the primary
class a user of ``chardet`` should use.

:author: Mark Pilgrim (initial port to Python)
:author: Shy Shalom (original C code)
:author: Dan Blanchard (major refactoring for 3.0)
:author: Ian Cordasco
"""


import codecs
import logging
import re

from .charsetgroupprober import CharSetGroupProber
from .enums import InputState, LanguageFilter, ProbingState
from .escprober import EscCharSetProber
from .latin1prober import Latin1Prober
from .mbcsgroupprober import MBCSGroupProber
from .sbcsgroupprober import SBCSGroupProber


class UniversalDetector(object):
    """
    The ``UniversalDetector`` class underlies the ``chardet.detect`` function
    and coordinates all of the different charset probers.

    To get a ``dict`` containing an encoding and its confidence, you can simply
    run:

    .. code::

            u = UniversalDetector()
            u.feed(some_bytes)
            u.close()
            detected = u.result

    """

    # A prober's confidence must exceed this value for ``close`` to report it.
    MINIMUM_THRESHOLD = 0.20
    # Any byte with the high bit set, i.e. the input is not pure ASCII.
    HIGH_BYTE_DETECTOR = re.compile(b'[\x80-\xFF]')
    # ESC or the "~{" pair; either one switches ``feed`` to the escape prober.
    ESC_DETECTOR = re.compile(b'(\033|~{)')
    # Bytes in 0x80-0x9F; seeing one makes ``close`` prefer a Windows name.
    WIN_BYTE_DETECTOR = re.compile(b'[\x80-\x9F]')
    # Lower-cased ISO-8859 charset name -> Windows code page name that
    # ``close`` reports instead when 0x80-0x9F bytes were seen.
    ISO_WIN_MAP = {'iso-8859-1': 'Windows-1252',
                   'iso-8859-2': 'Windows-1250',
                   'iso-8859-5': 'Windows-1251',
                   'iso-8859-6': 'Windows-1256',
                   'iso-8859-7': 'Windows-1253',
                   'iso-8859-8': 'Windows-1255',
                   'iso-8859-9': 'Windows-1254',
                   'iso-8859-13': 'Windows-1257'}

    def __init__(self, lang_filter=LanguageFilter.ALL):
        """
        Create a detector and put it in its initial state.

        :param lang_filter: ``LanguageFilter`` value handed to the escape and
                            multi-byte probers; its ``NON_CJK`` bit decides
                            whether the single-byte group prober is used.
        """
        self._esc_charset_prober = None
        self._charset_probers = []
        self.result = None
        self.done = None
        self._got_data = None
        self._input_state = None
        self._last_char = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)
        self._has_win_bytes = None
        self.reset()

    def reset(self):
        """
        Reset the UniversalDetector and all of its probers back to their
        initial states.  This is called by ``__init__``, so you only need to
        call this directly in between analyses of different documents.
        """
        self.result = {'encoding': None, 'confidence': 0.0, 'language': None}
        self.done = False
        self._got_data = False
        self._has_win_bytes = False
        self._input_state = InputState.PURE_ASCII
        self._last_char = b''
        # Probers are created lazily by ``feed``; reset only those that exist.
        if self._esc_charset_prober:
            self._esc_charset_prober.reset()
        for prober in self._charset_probers:
            prober.reset()

    def feed(self, byte_str):
        """
        Takes a chunk of a document and feeds it through all of the relevant
        charset probers.

        After calling ``feed``, you can check the value of the ``done``
        attribute to see if you need to continue feeding the
        ``UniversalDetector`` more data, or if it has made a prediction
        (in the ``result`` attribute).

        :param byte_str: chunk of the document; anything that is not already
                         a ``bytearray`` is converted with ``bytearray()``.

        .. note::
           You should always call ``close`` when you're done feeding in your
           document if ``done`` is not already ``True``.
        """
        if self.done:
            return

        if not byte_str:
            return

        if not isinstance(byte_str, bytearray):
            byte_str = bytearray(byte_str)

        # First check for known BOMs, since these are guaranteed to be correct
        if not self._got_data:
            # If the data starts with BOM, we know it is UTF.  The 4-byte
            # marks are tested before the 2-byte UTF-16 marks because
            # FF FE 00 00 also starts with the UTF-16 LE mark FF FE.
            if byte_str.startswith(codecs.BOM_UTF8):
                # EF BB BF  UTF-8 with BOM
                self.result = {'encoding': "UTF-8-SIG",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith((codecs.BOM_UTF32_LE,
                                      codecs.BOM_UTF32_BE)):
                # FF FE 00 00  UTF-32, little-endian BOM
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {'encoding': "UTF-32",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith(b'\xFE\xFF\x00\x00'):
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {'encoding': "X-ISO-10646-UCS-4-3412",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith(b'\x00\x00\xFF\xFE'):
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {'encoding': "X-ISO-10646-UCS-4-2143",
                               'confidence': 1.0,
                               'language': ''}
            elif byte_str.startswith((codecs.BOM_LE, codecs.BOM_BE)):
                # FF FE  UTF-16, little endian BOM
                # FE FF  UTF-16, big endian BOM
                self.result = {'encoding': "UTF-16",
                               'confidence': 1.0,
                               'language': ''}

            self._got_data = True
            if self.result['encoding'] is not None:
                self.done = True
                return

        # If none of those matched and we've only seen ASCII so far, check
        # for high bytes and escape sequences
        if self._input_state == InputState.PURE_ASCII:
            if self.HIGH_BYTE_DETECTOR.search(byte_str):
                self._input_state = InputState.HIGH_BYTE
            # The previous chunk's last byte is prepended so that a "~{"
            # pair split across two feed() calls is still found.
            elif self.ESC_DETECTOR.search(self._last_char + byte_str):
                self._input_state = InputState.ESC_ASCII

        self._last_char = byte_str[-1:]

        # If we've seen escape sequences, use the EscCharSetProber, which
        # uses a simple state machine to check for known escape sequences in
        # HZ and ISO-2022 encodings, since those are the only encodings that
        # use such sequences.
        if self._input_state == InputState.ESC_ASCII:
            if not self._esc_charset_prober:
                self._esc_charset_prober = EscCharSetProber(self.lang_filter)
            if self._esc_charset_prober.feed(byte_str) == ProbingState.FOUND_IT:
                self.result = {'encoding':
                               self._esc_charset_prober.charset_name,
                               'confidence':
                               self._esc_charset_prober.get_confidence(),
                               'language':
                               self._esc_charset_prober.language}
                self.done = True
        # If we've seen high bytes (i.e., those with values greater than 127),
        # we need to do more complicated checks using all our multi-byte and
        # single-byte probers that are left.  The single-byte probers
        # use character bigram distributions to determine the encoding, whereas
        # the multi-byte probers use a combination of character unigram and
        # bigram distributions.
        elif self._input_state == InputState.HIGH_BYTE:
            if not self._charset_probers:
                self._charset_probers = [MBCSGroupProber(self.lang_filter)]
                # If we're checking non-CJK encodings, use single-byte prober
                if self.lang_filter & LanguageFilter.NON_CJK:
                    self._charset_probers.append(SBCSGroupProber())
                self._charset_probers.append(Latin1Prober())
            for prober in self._charset_probers:
                if prober.feed(byte_str) == ProbingState.FOUND_IT:
                    self.result = {'encoding': prober.charset_name,
                                   'confidence': prober.get_confidence(),
                                   'language': prober.language}
                    self.done = True
                    break
            if self.WIN_BYTE_DETECTOR.search(byte_str):
                self._has_win_bytes = True

    def close(self):
        """
        Stop analyzing the current document and come up with a final
        prediction.

        :returns:  The ``result`` attribute, a ``dict`` with the keys
                   `encoding`, `confidence`, and `language`.
        """
        # Don't bother with checks if we're already done
        if self.done:
            return self.result
        self.done = True

        if not self._got_data:
            self.logger.debug('no data received!')

        # Default to ASCII if it is all we've seen so far
        elif self._input_state == InputState.PURE_ASCII:
            self.result = {'encoding': 'ascii',
                           'confidence': 1.0,
                           'language': ''}

        # If we have seen non-ASCII, return the best that met MINIMUM_THRESHOLD
        elif self._input_state == InputState.HIGH_BYTE:
            max_prober_confidence = 0.0
            max_prober = None
            for prober in self._charset_probers:
                if not prober:
                    continue
                prober_confidence = prober.get_confidence()
                # Strict comparison: on a tie the earlier prober wins.
                if prober_confidence > max_prober_confidence:
                    max_prober_confidence = prober_confidence
                    max_prober = prober
            if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD):
                charset_name = max_prober.charset_name
                lower_charset_name = charset_name.lower()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith('iso-8859'):
                    if self._has_win_bytes:
                        charset_name = self.ISO_WIN_MAP.get(lower_charset_name,
                                                            charset_name)
                self.result = {'encoding': charset_name,
                               'confidence': max_prober_confidence,
                               'language': max_prober.language}

        # Log all prober confidences if none met MINIMUM_THRESHOLD.
        # isEnabledFor() is also true for levels below DEBUG, which an
        # equality test against logging.DEBUG would miss.
        if self.logger.isEnabledFor(logging.DEBUG):
            if self.result['encoding'] is None:
                self.logger.debug('no probers hit minimum threshold')
                for group_prober in self._charset_probers:
                    if not group_prober:
                        continue
                    if isinstance(group_prober, CharSetGroupProber):
                        for prober in group_prober.probers:
                            self.logger.debug('%s %s confidence = %s',
                                              prober.charset_name,
                                              prober.language,
                                              prober.get_confidence())
                    else:
                        # Not a group: report this prober itself, not the
                        # variable left over from a previous inner loop.
                        self.logger.debug('%s %s confidence = %s',
                                          group_prober.charset_name,
                                          group_prober.language,
                                          group_prober.get_confidence())
        return self.result