├── .gitignore ├── README.md ├── chardet ├── __init__.py ├── big5freq.py ├── big5prober.py ├── chardetect.py ├── chardistribution.py ├── charsetgroupprober.py ├── charsetprober.py ├── codingstatemachine.py ├── compat.py ├── constants.py ├── cp949prober.py ├── escprober.py ├── escsm.py ├── eucjpprober.py ├── euckrfreq.py ├── euckrprober.py ├── euctwfreq.py ├── euctwprober.py ├── gb2312freq.py ├── gb2312prober.py ├── hebrewprober.py ├── jisfreq.py ├── jpcntx.py ├── langbulgarianmodel.py ├── langcyrillicmodel.py ├── langgreekmodel.py ├── langhebrewmodel.py ├── langhungarianmodel.py ├── langthaimodel.py ├── latin1prober.py ├── mbcharsetprober.py ├── mbcsgroupprober.py ├── mbcssm.py ├── sbcharsetprober.py ├── sbcsgroupprober.py ├── sjisprober.py ├── universaldetector.py └── utf8prober.py ├── conf.example.ini ├── css ├── github.css └── github2.css ├── evermark.py ├── evernote ├── __init__.py ├── api │ ├── __init__.py │ └── client.py └── edam │ ├── __init__.py │ ├── error │ ├── __init__.py │ ├── constants.py │ └── ttypes.py │ ├── limits │ ├── __init__.py │ ├── constants.py │ └── ttypes.py │ ├── notestore │ ├── NoteStore-remote │ ├── NoteStore.py │ ├── __init__.py │ ├── constants.py │ └── ttypes.py │ ├── type │ ├── __init__.py │ ├── constants.py │ └── ttypes.py │ └── userstore │ ├── UserStore-remote │ ├── UserStore.py │ ├── __init__.py │ ├── constants.py │ └── ttypes.py ├── img ├── note.jpg ├── notebooks.jpg ├── start.jpg └── workbench.jpg ├── markdown2.py ├── premailer ├── __init__.py ├── __main__.py ├── cache.py ├── merge_style.py └── premailer.py ├── requirements.txt ├── test ├── test1 │ ├── 1.txt │ ├── 2.txt │ └── test.md └── test2 │ ├── 1.txt │ ├── 3.txt │ ├── 5.txt │ └── README.md └── thrift ├── TSCons.py ├── TSerialization.py ├── Thrift.py ├── __init__.py ├── protocol ├── TBase.py ├── TBinaryProtocol.py ├── TCompactProtocol.py ├── TProtocol.py ├── __init__.py └── fastbinary.c ├── server ├── THttpServer.py ├── TNonblockingServer.py ├── TProcessPoolServer.py ├── TServer.py └── __init__.py └── transport ├── THttpClient.py ├── TSSLSocket.py ├── TSocket.py ├── TTransport.py ├── TTwisted.py ├── TZlibTransport.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | .idea 65 | /conf.ini 66 | html/* 67 | /*.json 68 | logs/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EverMark 2 | A tool that can sync local markdown/text notes to **Evernote** . 3 | 4 | ## 1 Ability 5 | 6 | ### 1.1 Sync directories 7 | 8 | Sub directories that in the root directory (can be set by user in `conf.ini`) would be synced automatically. 9 | 10 | This means, if there is no notebook in **Evernote** has the same name as the sub directory, then **EverMark** will create one. 11 | 12 | Users can make **EverMark** not to sync specific sub directories by edit `ignore_dirs` (sub directory names are splitted by `,`) in `conf.ini`. 13 | 14 | ### 1.2 Sync notes 15 | 16 | Only files that have `.txt` or `.md` suffix would be synced to **Evernote** . 17 | 18 | `txt` represents plain text files. 19 | 20 | `md` represents **MarkDown** files. 21 | 22 | 23 | ## 2 MarkDown Support 24 | 25 | ### 2.1 Now 26 | 27 | - [x] paragraph 28 | - [x] header 29 | - [x] block quote 30 | - [x] list 31 | - [x] link 32 | - [x] code 33 | - [ ] image 34 | 35 | ### 2.2 Next Version 36 | 37 | - [x] paragraph 38 | - [x] header 39 | - [x] block quote 40 | - [x] list 41 | - [x] link 42 | - [x] code 43 | - [x] image 44 | 45 | ## 3 Workbench structure 46 | 47 | **EverMark** can sync a whole directory(root directory of your workbench) to your **Evernote** account. 48 | 49 | And the directory contains some sub directories(represents notebooks in your **Evernote** account). 50 | 51 | Each sub directory corresponds to a notebook and contains a list of files represent notes in this notebook. 52 | 53 | For example, a local workbench like 54 | 55 | - root directory 56 | - sub_directory (named *sub1*) 57 | - file (named *f1.txt*) 58 | - file (named *f2.md*) 59 | - sub_directory (named *sub2*) 60 | - file (named *f1.md*) 61 | - file (named *f3.txt*) 62 | 63 | would be converted by **EverMark** to **Evernote** note structure: 64 | 65 | - your **Evernote** account 66 | - notebook (named *sub1*) 67 | - note (named *f1*) 68 | - note (named *f2*) 69 | - notebook (named *sub2*) 70 | - note (named *f1*) 71 | - note (named *f3*) 72 | 73 | ## 4 Usage 74 | 75 | ### 4.1 Dependencies 76 | **EverMark** depends on lxml, cssutils, cssselect, oauth2. You need to install them: 77 | 78 | ```shell 79 | pip install lxml, cssutils, cssselect, oauth 80 | ``` 81 | 82 | ### 4.2 Install **EverMark** 83 | **EverMark** is written by **Python**, so firstly make sure that your PC has **Python** installed. 84 | 85 | 1. Download **EverMark** from ***github***. 86 | 87 | 2. Unpack it, and move it to anywhere you like. 88 | 89 | ### 4.3 Get **Evernote** Developer Token 90 | **EverMark** need a **Evernote** Developer Token to access your account. 91 | 92 | So you need to create a **Evernote** Developer Token. 93 | 94 | You can refer to [**Evernote** Developer Token page](https://dev.evernote.com/doc/articles/dev_tokens.php) to get a full understand of **Evernote** Developer Token. 95 | 96 | Or you can just go to [Create **Evernote** Developer Token](https://www.evernote.com/api/DeveloperToken.action) or [创建印象笔记Developer Token](https://app.yinxiang.com/api/DeveloperToken.action) if you are user of **印象笔记** , then click ***Create a developer token*** to create your developer token. 97 | 98 | Copy the Developer Token after you have created it, and save it carefully(anyone have this token can access your notes in **Evernote** !) . 99 | 100 | ### 4.4 Configure **EverMark** 101 | 102 | 1. Open file `conf.example.ini` in the path that **EverMark** is installed to. 103 | 104 | 2. Set `auth_token` to the **Evernote** Developer Token you have created. 105 | 106 | 3. Set `account_type` to `evernote`, or `yinxiang` if you are a **印象笔记** user. 107 | 108 | 4. Rename `conf.example.ini` to `conf.ini`. 109 | 110 | ### 4.5 Run 111 | You can directly execute `evermark.py` in the installed path. 112 | 113 | For example, if the path you install **EverMark** is *path_to_evermark*, then you can start **EverMark** by `python path_to_evermark/evermark.py`. 114 | 115 | Or you can create a link to `path_to_evermark/evermark.py` and execute it anywhere you like. 116 | 117 | The default workbench is `evermark` directory in you `HOME` path. 118 | 119 | ## 5 Example images 120 | 121 | ### 5.1 Workbench 122 | 123 |  124 | 125 | ### 5.2 Start **EverMark** 126 | 127 |  128 | 129 | ### 5.3 Sync result: notebooks 130 | 131 |  132 | 133 | ### 5.4 Sync result: note 134 | 135 |  136 | -------------------------------------------------------------------------------- /chardet/__init__.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # This library is free software; you can redistribute it and/or 3 | # modify it under the terms of the GNU Lesser General Public 4 | # License as published by the Free Software Foundation; either 5 | # version 2.1 of the License, or (at your option) any later version. 6 | # 7 | # This library is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 | # Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public 13 | # License along with this library; if not, write to the Free Software 14 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 15 | # 02110-1301 USA 16 | ######################### END LICENSE BLOCK ######################### 17 | 18 | __version__ = "2.3.0" 19 | from sys import version_info 20 | 21 | 22 | def detect(aBuf): 23 | if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or 24 | (version_info >= (3, 0) and not isinstance(aBuf, bytes))): 25 | raise ValueError('Expected a bytes object, not a unicode object') 26 | 27 | from . import universaldetector 28 | u = universaldetector.UniversalDetector() 29 | u.reset() 30 | u.feed(aBuf) 31 | u.close() 32 | return u.result 33 | -------------------------------------------------------------------------------- /chardet/big5prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Communicator client code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from .mbcharsetprober import MultiByteCharSetProber 29 | from .codingstatemachine import CodingStateMachine 30 | from .chardistribution import Big5DistributionAnalysis 31 | from .mbcssm import Big5SMModel 32 | 33 | 34 | class Big5Prober(MultiByteCharSetProber): 35 | def __init__(self): 36 | MultiByteCharSetProber.__init__(self) 37 | self._mCodingSM = CodingStateMachine(Big5SMModel) 38 | self._mDistributionAnalyzer = Big5DistributionAnalysis() 39 | self.reset() 40 | 41 | def get_charset_name(self): 42 | return "Big5" 43 | -------------------------------------------------------------------------------- /chardet/chardetect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Script which takes one or more file paths and reports on their detected 4 | encodings 5 | 6 | Example:: 7 | 8 | % chardetect somefile someotherfile 9 | somefile: windows-1252 with confidence 0.5 10 | someotherfile: ascii with confidence 1.0 11 | 12 | If no paths are provided, it takes its input from stdin. 13 | 14 | """ 15 | 16 | from __future__ import absolute_import, print_function, unicode_literals 17 | 18 | import argparse 19 | import sys 20 | from io import open 21 | 22 | from chardet import __version__ 23 | from chardet.universaldetector import UniversalDetector 24 | 25 | 26 | def description_of(lines, name='stdin'): 27 | """ 28 | Return a string describing the probable encoding of a file or 29 | list of strings. 30 | 31 | :param lines: The lines to get the encoding of. 32 | :type lines: Iterable of bytes 33 | :param name: Name of file or collection of lines 34 | :type name: str 35 | """ 36 | u = UniversalDetector() 37 | for line in lines: 38 | u.feed(line) 39 | u.close() 40 | result = u.result 41 | if result['encoding']: 42 | return '{0}: {1} with confidence {2}'.format(name, result['encoding'], 43 | result['confidence']) 44 | else: 45 | return '{0}: no result'.format(name) 46 | 47 | 48 | def main(argv=None): 49 | ''' 50 | Handles command line arguments and gets things started. 51 | 52 | :param argv: List of arguments, as if specified on the command-line. 53 | If None, ``sys.argv[1:]`` is used instead. 54 | :type argv: list of str 55 | ''' 56 | # Get command line arguments 57 | parser = argparse.ArgumentParser( 58 | description="Takes one or more file paths and reports their detected \ 59 | encodings", 60 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 61 | conflict_handler='resolve') 62 | parser.add_argument('input', 63 | help='File whose encoding we would like to determine.', 64 | type=argparse.FileType('rb'), nargs='*', 65 | default=[sys.stdin]) 66 | parser.add_argument('--version', action='version', 67 | version='%(prog)s {0}'.format(__version__)) 68 | args = parser.parse_args(argv) 69 | 70 | for f in args.input: 71 | if f.isatty(): 72 | print("You are running chardetect interactively. Press " + 73 | "CTRL-D twice at the start of a blank line to signal the " + 74 | "end of your input. If you want help, run chardetect " + 75 | "--help\n", file=sys.stderr) 76 | print(description_of(f, f.name)) 77 | 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /chardet/chardistribution.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Communicator client code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from .euctwfreq import (EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, 29 | EUCTW_TYPICAL_DISTRIBUTION_RATIO) 30 | from .euckrfreq import (EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, 31 | EUCKR_TYPICAL_DISTRIBUTION_RATIO) 32 | from .gb2312freq import (GB2312CharToFreqOrder, GB2312_TABLE_SIZE, 33 | GB2312_TYPICAL_DISTRIBUTION_RATIO) 34 | from .big5freq import (Big5CharToFreqOrder, BIG5_TABLE_SIZE, 35 | BIG5_TYPICAL_DISTRIBUTION_RATIO) 36 | from .jisfreq import (JISCharToFreqOrder, JIS_TABLE_SIZE, 37 | JIS_TYPICAL_DISTRIBUTION_RATIO) 38 | from .compat import wrap_ord 39 | 40 | ENOUGH_DATA_THRESHOLD = 1024 41 | SURE_YES = 0.99 42 | SURE_NO = 0.01 43 | MINIMUM_DATA_THRESHOLD = 3 44 | 45 | 46 | class CharDistributionAnalysis: 47 | def __init__(self): 48 | # Mapping table to get frequency order from char order (get from 49 | # GetOrder()) 50 | self._mCharToFreqOrder = None 51 | self._mTableSize = None # Size of above table 52 | # This is a constant value which varies from language to language, 53 | # used in calculating confidence. See 54 | # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html 55 | # for further detail. 56 | self._mTypicalDistributionRatio = None 57 | self.reset() 58 | 59 | def reset(self): 60 | """reset analyser, clear any state""" 61 | # If this flag is set to True, detection is done and conclusion has 62 | # been made 63 | self._mDone = False 64 | self._mTotalChars = 0 # Total characters encountered 65 | # The number of characters whose frequency order is less than 512 66 | self._mFreqChars = 0 67 | 68 | def feed(self, aBuf, aCharLen): 69 | """feed a character with known length""" 70 | if aCharLen == 2: 71 | # we only care about 2-bytes character in our distribution analysis 72 | order = self.get_order(aBuf) 73 | else: 74 | order = -1 75 | if order >= 0: 76 | self._mTotalChars += 1 77 | # order is valid 78 | if order < self._mTableSize: 79 | if 512 > self._mCharToFreqOrder[order]: 80 | self._mFreqChars += 1 81 | 82 | def get_confidence(self): 83 | """return confidence based on existing data""" 84 | # if we didn't receive any character in our consideration range, 85 | # return negative answer 86 | if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD: 87 | return SURE_NO 88 | 89 | if self._mTotalChars != self._mFreqChars: 90 | r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars) 91 | * self._mTypicalDistributionRatio)) 92 | if r < SURE_YES: 93 | return r 94 | 95 | # normalize confidence (we don't want to be 100% sure) 96 | return SURE_YES 97 | 98 | def got_enough_data(self): 99 | # It is not necessary to receive all data to draw conclusion. 100 | # For charset detection, certain amount of data is enough 101 | return self._mTotalChars > ENOUGH_DATA_THRESHOLD 102 | 103 | def get_order(self, aBuf): 104 | # We do not handle characters based on the original encoding string, 105 | # but convert this encoding string to a number, here called order. 106 | # This allows multiple encodings of a language to share one frequency 107 | # table. 108 | return -1 109 | 110 | 111 | class EUCTWDistributionAnalysis(CharDistributionAnalysis): 112 | def __init__(self): 113 | CharDistributionAnalysis.__init__(self) 114 | self._mCharToFreqOrder = EUCTWCharToFreqOrder 115 | self._mTableSize = EUCTW_TABLE_SIZE 116 | self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO 117 | 118 | def get_order(self, aBuf): 119 | # for euc-TW encoding, we are interested 120 | # first byte range: 0xc4 -- 0xfe 121 | # second byte range: 0xa1 -- 0xfe 122 | # no validation needed here. State machine has done that 123 | first_char = wrap_ord(aBuf[0]) 124 | if first_char >= 0xC4: 125 | return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1 126 | else: 127 | return -1 128 | 129 | 130 | class EUCKRDistributionAnalysis(CharDistributionAnalysis): 131 | def __init__(self): 132 | CharDistributionAnalysis.__init__(self) 133 | self._mCharToFreqOrder = EUCKRCharToFreqOrder 134 | self._mTableSize = EUCKR_TABLE_SIZE 135 | self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO 136 | 137 | def get_order(self, aBuf): 138 | # for euc-KR encoding, we are interested 139 | # first byte range: 0xb0 -- 0xfe 140 | # second byte range: 0xa1 -- 0xfe 141 | # no validation needed here. State machine has done that 142 | first_char = wrap_ord(aBuf[0]) 143 | if first_char >= 0xB0: 144 | return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1 145 | else: 146 | return -1 147 | 148 | 149 | class GB2312DistributionAnalysis(CharDistributionAnalysis): 150 | def __init__(self): 151 | CharDistributionAnalysis.__init__(self) 152 | self._mCharToFreqOrder = GB2312CharToFreqOrder 153 | self._mTableSize = GB2312_TABLE_SIZE 154 | self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO 155 | 156 | def get_order(self, aBuf): 157 | # for GB2312 encoding, we are interested 158 | # first byte range: 0xb0 -- 0xfe 159 | # second byte range: 0xa1 -- 0xfe 160 | # no validation needed here. State machine has done that 161 | first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1]) 162 | if (first_char >= 0xB0) and (second_char >= 0xA1): 163 | return 94 * (first_char - 0xB0) + second_char - 0xA1 164 | else: 165 | return -1 166 | 167 | 168 | class Big5DistributionAnalysis(CharDistributionAnalysis): 169 | def __init__(self): 170 | CharDistributionAnalysis.__init__(self) 171 | self._mCharToFreqOrder = Big5CharToFreqOrder 172 | self._mTableSize = BIG5_TABLE_SIZE 173 | self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO 174 | 175 | def get_order(self, aBuf): 176 | # for big5 encoding, we are interested 177 | # first byte range: 0xa4 -- 0xfe 178 | # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 179 | # no validation needed here. State machine has done that 180 | first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1]) 181 | if first_char >= 0xA4: 182 | if second_char >= 0xA1: 183 | return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63 184 | else: 185 | return 157 * (first_char - 0xA4) + second_char - 0x40 186 | else: 187 | return -1 188 | 189 | 190 | class SJISDistributionAnalysis(CharDistributionAnalysis): 191 | def __init__(self): 192 | CharDistributionAnalysis.__init__(self) 193 | self._mCharToFreqOrder = JISCharToFreqOrder 194 | self._mTableSize = JIS_TABLE_SIZE 195 | self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO 196 | 197 | def get_order(self, aBuf): 198 | # for sjis encoding, we are interested 199 | # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe 200 | # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe 201 | # no validation needed here. State machine has done that 202 | first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1]) 203 | if (first_char >= 0x81) and (first_char <= 0x9F): 204 | order = 188 * (first_char - 0x81) 205 | elif (first_char >= 0xE0) and (first_char <= 0xEF): 206 | order = 188 * (first_char - 0xE0 + 31) 207 | else: 208 | return -1 209 | order = order + second_char - 0x40 210 | if second_char > 0x7F: 211 | order = -1 212 | return order 213 | 214 | 215 | class EUCJPDistributionAnalysis(CharDistributionAnalysis): 216 | def __init__(self): 217 | CharDistributionAnalysis.__init__(self) 218 | self._mCharToFreqOrder = JISCharToFreqOrder 219 | self._mTableSize = JIS_TABLE_SIZE 220 | self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO 221 | 222 | def get_order(self, aBuf): 223 | # for euc-JP encoding, we are interested 224 | # first byte range: 0xa0 -- 0xfe 225 | # second byte range: 0xa1 -- 0xfe 226 | # no validation needed here. State machine has done that 227 | char = wrap_ord(aBuf[0]) 228 | if char >= 0xA0: 229 | return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1 230 | else: 231 | return -1 232 | -------------------------------------------------------------------------------- /chardet/charsetgroupprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Communicator client code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from . import constants 29 | import sys 30 | from .charsetprober import CharSetProber 31 | 32 | 33 | class CharSetGroupProber(CharSetProber): 34 | def __init__(self): 35 | CharSetProber.__init__(self) 36 | self._mActiveNum = 0 37 | self._mProbers = [] 38 | self._mBestGuessProber = None 39 | 40 | def reset(self): 41 | CharSetProber.reset(self) 42 | self._mActiveNum = 0 43 | for prober in self._mProbers: 44 | if prober: 45 | prober.reset() 46 | prober.active = True 47 | self._mActiveNum += 1 48 | self._mBestGuessProber = None 49 | 50 | def get_charset_name(self): 51 | if not self._mBestGuessProber: 52 | self.get_confidence() 53 | if not self._mBestGuessProber: 54 | return None 55 | # self._mBestGuessProber = self._mProbers[0] 56 | return self._mBestGuessProber.get_charset_name() 57 | 58 | def feed(self, aBuf): 59 | for prober in self._mProbers: 60 | if not prober: 61 | continue 62 | if not prober.active: 63 | continue 64 | st = prober.feed(aBuf) 65 | if not st: 66 | continue 67 | if st == constants.eFoundIt: 68 | self._mBestGuessProber = prober 69 | return self.get_state() 70 | elif st == constants.eNotMe: 71 | prober.active = False 72 | self._mActiveNum -= 1 73 | if self._mActiveNum <= 0: 74 | self._mState = constants.eNotMe 75 | return self.get_state() 76 | return self.get_state() 77 | 78 | def get_confidence(self): 79 | st = self.get_state() 80 | if st == constants.eFoundIt: 81 | return 0.99 82 | elif st == constants.eNotMe: 83 | return 0.01 84 | bestConf = 0.0 85 | self._mBestGuessProber = None 86 | for prober in self._mProbers: 87 | if not prober: 88 | continue 89 | if not prober.active: 90 | if constants._debug: 91 | sys.stderr.write(prober.get_charset_name() 92 | + ' not active\n') 93 | continue 94 | cf = prober.get_confidence() 95 | if constants._debug: 96 | sys.stderr.write('%s confidence = %s\n' % 97 | (prober.get_charset_name(), cf)) 98 | if bestConf < cf: 99 | bestConf = cf 100 | self._mBestGuessProber = prober 101 | if not self._mBestGuessProber: 102 | return 0.0 103 | return bestConf 104 | # else: 105 | # self._mBestGuessProber = self._mProbers[0] 106 | # return self._mBestGuessProber.get_confidence() 107 | -------------------------------------------------------------------------------- /chardet/charsetprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | from . import constants 30 | import re 31 | 32 | 33 | class CharSetProber: 34 | def __init__(self): 35 | pass 36 | 37 | def reset(self): 38 | self._mState = constants.eDetecting 39 | 40 | def get_charset_name(self): 41 | return None 42 | 43 | def feed(self, aBuf): 44 | pass 45 | 46 | def get_state(self): 47 | return self._mState 48 | 49 | def get_confidence(self): 50 | return 0.0 51 | 52 | def filter_high_bit_only(self, aBuf): 53 | aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf) 54 | return aBuf 55 | 56 | def filter_without_english_letters(self, aBuf): 57 | aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf) 58 | return aBuf 59 | 60 | def filter_with_english_letters(self, aBuf): 61 | # TODO 62 | return aBuf 63 | -------------------------------------------------------------------------------- /chardet/codingstatemachine.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from .constants import eStart 29 | from .compat import wrap_ord 30 | 31 | 32 | class CodingStateMachine: 33 | def __init__(self, sm): 34 | self._mModel = sm 35 | self._mCurrentBytePos = 0 36 | self._mCurrentCharLen = 0 37 | self.reset() 38 | 39 | def reset(self): 40 | self._mCurrentState = eStart 41 | 42 | def next_state(self, c): 43 | # for each byte we get its class 44 | # if it is first byte, we also get byte length 45 | # PY3K: aBuf is a byte stream, so c is an int, not a byte 46 | byteCls = self._mModel['classTable'][wrap_ord(c)] 47 | if self._mCurrentState == eStart: 48 | self._mCurrentBytePos = 0 49 | self._mCurrentCharLen = self._mModel['charLenTable'][byteCls] 50 | # from byte's class and stateTable, we get its next state 51 | curr_state = (self._mCurrentState * self._mModel['classFactor'] 52 | + byteCls) 53 | self._mCurrentState = self._mModel['stateTable'][curr_state] 54 | self._mCurrentBytePos += 1 55 | return self._mCurrentState 56 | 57 | def get_current_charlen(self): 58 | return self._mCurrentCharLen 59 | 60 | def get_coding_state_machine(self): 61 | return self._mModel['name'] 62 | -------------------------------------------------------------------------------- /chardet/compat.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # Contributor(s): 3 | # Ian Cordasco - port to Python 4 | # 5 | # This library is free software; you can redistribute it and/or 6 | # modify it under the terms of the GNU Lesser General Public 7 | # License as published by the Free Software Foundation; either 8 | # version 2.1 of the License, or (at your option) any later version. 9 | # 10 | # This library is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 | # Lesser General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU Lesser General Public 16 | # License along with this library; if not, write to the Free Software 17 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 18 | # 02110-1301 USA 19 | ######################### END LICENSE BLOCK ######################### 20 | 21 | import sys 22 | 23 | 24 | if sys.version_info < (3, 0): 25 | base_str = (str, unicode) 26 | else: 27 | base_str = (bytes, str) 28 | 29 | 30 | def wrap_ord(a): 31 | if sys.version_info < (3, 0) and isinstance(a, base_str): 32 | return ord(a) 33 | else: 34 | return a 35 | -------------------------------------------------------------------------------- /chardet/constants.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | _debug = 0 30 | 31 | eDetecting = 0 32 | eFoundIt = 1 33 | eNotMe = 2 34 | 35 | eStart = 0 36 | eError = 1 37 | eItsMe = 2 38 | 39 | SHORTCUT_THRESHOLD = 0.95 40 | -------------------------------------------------------------------------------- /chardet/cp949prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from .mbcharsetprober import MultiByteCharSetProber 29 | from .codingstatemachine import CodingStateMachine 30 | from .chardistribution import EUCKRDistributionAnalysis 31 | from .mbcssm import CP949SMModel 32 | 33 | 34 | class CP949Prober(MultiByteCharSetProber): 35 | def __init__(self): 36 | MultiByteCharSetProber.__init__(self) 37 | self._mCodingSM = CodingStateMachine(CP949SMModel) 38 | # NOTE: CP949 is a superset of EUC-KR, so the distribution should be 39 | # not different. 40 | self._mDistributionAnalyzer = EUCKRDistributionAnalysis() 41 | self.reset() 42 | 43 | def get_charset_name(self): 44 | return "CP949" 45 | -------------------------------------------------------------------------------- /chardet/escprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from . import constants 29 | from .escsm import (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, 30 | ISO2022KRSMModel) 31 | from .charsetprober import CharSetProber 32 | from .codingstatemachine import CodingStateMachine 33 | from .compat import wrap_ord 34 | 35 | 36 | class EscCharSetProber(CharSetProber): 37 | def __init__(self): 38 | CharSetProber.__init__(self) 39 | self._mCodingSM = [ 40 | CodingStateMachine(HZSMModel), 41 | CodingStateMachine(ISO2022CNSMModel), 42 | CodingStateMachine(ISO2022JPSMModel), 43 | CodingStateMachine(ISO2022KRSMModel) 44 | ] 45 | self.reset() 46 | 47 | def reset(self): 48 | CharSetProber.reset(self) 49 | for codingSM in self._mCodingSM: 50 | if not codingSM: 51 | continue 52 | codingSM.active = True 53 | codingSM.reset() 54 | self._mActiveSM = len(self._mCodingSM) 55 | self._mDetectedCharset = None 56 | 57 | def get_charset_name(self): 58 | return self._mDetectedCharset 59 | 60 | def get_confidence(self): 61 | if self._mDetectedCharset: 62 | return 0.99 63 | else: 64 | return 0.00 65 | 66 | def feed(self, aBuf): 67 | for c in aBuf: 68 | # PY3K: aBuf is a byte array, so c is an int, not a byte 69 | for codingSM in self._mCodingSM: 70 | if not codingSM: 71 | continue 72 | if not codingSM.active: 73 | continue 74 | codingState = codingSM.next_state(wrap_ord(c)) 75 | if codingState == constants.eError: 76 | codingSM.active = False 77 | self._mActiveSM -= 1 78 | if self._mActiveSM <= 0: 79 | self._mState = constants.eNotMe 80 | return self.get_state() 81 | elif codingState == constants.eItsMe: 82 | self._mState = constants.eFoundIt 83 | self._mDetectedCharset = codingSM.get_coding_state_machine() # nopep8 84 | return self.get_state() 85 | 86 | return self.get_state() 87 | -------------------------------------------------------------------------------- /chardet/escsm.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from .constants import eStart, eError, eItsMe 29 | 30 | HZ_cls = ( 31 | 1,0,0,0,0,0,0,0, # 00 - 07 32 | 0,0,0,0,0,0,0,0, # 08 - 0f 33 | 0,0,0,0,0,0,0,0, # 10 - 17 34 | 0,0,0,1,0,0,0,0, # 18 - 1f 35 | 0,0,0,0,0,0,0,0, # 20 - 27 36 | 0,0,0,0,0,0,0,0, # 28 - 2f 37 | 0,0,0,0,0,0,0,0, # 30 - 37 38 | 0,0,0,0,0,0,0,0, # 38 - 3f 39 | 0,0,0,0,0,0,0,0, # 40 - 47 40 | 0,0,0,0,0,0,0,0, # 48 - 4f 41 | 0,0,0,0,0,0,0,0, # 50 - 57 42 | 0,0,0,0,0,0,0,0, # 58 - 5f 43 | 0,0,0,0,0,0,0,0, # 60 - 67 44 | 0,0,0,0,0,0,0,0, # 68 - 6f 45 | 0,0,0,0,0,0,0,0, # 70 - 77 46 | 0,0,0,4,0,5,2,0, # 78 - 7f 47 | 1,1,1,1,1,1,1,1, # 80 - 87 48 | 1,1,1,1,1,1,1,1, # 88 - 8f 49 | 1,1,1,1,1,1,1,1, # 90 - 97 50 | 1,1,1,1,1,1,1,1, # 98 - 9f 51 | 1,1,1,1,1,1,1,1, # a0 - a7 52 | 1,1,1,1,1,1,1,1, # a8 - af 53 | 1,1,1,1,1,1,1,1, # b0 - b7 54 | 1,1,1,1,1,1,1,1, # b8 - bf 55 | 1,1,1,1,1,1,1,1, # c0 - c7 56 | 1,1,1,1,1,1,1,1, # c8 - cf 57 | 1,1,1,1,1,1,1,1, # d0 - d7 58 | 1,1,1,1,1,1,1,1, # d8 - df 59 | 1,1,1,1,1,1,1,1, # e0 - e7 60 | 1,1,1,1,1,1,1,1, # e8 - ef 61 | 1,1,1,1,1,1,1,1, # f0 - f7 62 | 1,1,1,1,1,1,1,1, # f8 - ff 63 | ) 64 | 65 | HZ_st = ( 66 | eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07 67 | eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f 68 | eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17 69 | 5,eError, 6,eError, 5, 5, 4,eError,# 18-1f 70 | 4,eError, 4, 4, 4,eError, 4,eError,# 20-27 71 | 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f 72 | ) 73 | 74 | HZCharLenTable = (0, 0, 0, 0, 0, 0) 75 | 76 | HZSMModel = {'classTable': HZ_cls, 77 | 'classFactor': 6, 78 | 'stateTable': HZ_st, 79 | 'charLenTable': HZCharLenTable, 80 | 'name': "HZ-GB-2312"} 81 | 82 | ISO2022CN_cls = ( 83 | 2,0,0,0,0,0,0,0, # 00 - 07 84 | 0,0,0,0,0,0,0,0, # 08 - 0f 85 | 0,0,0,0,0,0,0,0, # 10 - 17 86 | 0,0,0,1,0,0,0,0, # 18 - 1f 87 | 0,0,0,0,0,0,0,0, # 20 - 27 88 | 0,3,0,0,0,0,0,0, # 28 - 2f 89 | 0,0,0,0,0,0,0,0, # 30 - 37 90 | 0,0,0,0,0,0,0,0, # 38 - 3f 91 | 0,0,0,4,0,0,0,0, # 40 - 47 92 | 0,0,0,0,0,0,0,0, # 48 - 4f 93 | 0,0,0,0,0,0,0,0, # 50 - 57 94 | 0,0,0,0,0,0,0,0, # 58 - 5f 95 | 0,0,0,0,0,0,0,0, # 60 - 67 96 | 0,0,0,0,0,0,0,0, # 68 - 6f 97 | 0,0,0,0,0,0,0,0, # 70 - 77 98 | 0,0,0,0,0,0,0,0, # 78 - 7f 99 | 2,2,2,2,2,2,2,2, # 80 - 87 100 | 2,2,2,2,2,2,2,2, # 88 - 8f 101 | 2,2,2,2,2,2,2,2, # 90 - 97 102 | 2,2,2,2,2,2,2,2, # 98 - 9f 103 | 2,2,2,2,2,2,2,2, # a0 - a7 104 | 2,2,2,2,2,2,2,2, # a8 - af 105 | 2,2,2,2,2,2,2,2, # b0 - b7 106 | 2,2,2,2,2,2,2,2, # b8 - bf 107 | 2,2,2,2,2,2,2,2, # c0 - c7 108 | 2,2,2,2,2,2,2,2, # c8 - cf 109 | 2,2,2,2,2,2,2,2, # d0 - d7 110 | 2,2,2,2,2,2,2,2, # d8 - df 111 | 2,2,2,2,2,2,2,2, # e0 - e7 112 | 2,2,2,2,2,2,2,2, # e8 - ef 113 | 2,2,2,2,2,2,2,2, # f0 - f7 114 | 2,2,2,2,2,2,2,2, # f8 - ff 115 | ) 116 | 117 | ISO2022CN_st = ( 118 | eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 119 | eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f 120 | eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 121 | eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f 122 | eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27 123 | 5, 6,eError,eError,eError,eError,eError,eError,# 28-2f 124 | eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37 125 | eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f 126 | ) 127 | 128 | ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0) 129 | 130 | ISO2022CNSMModel = {'classTable': ISO2022CN_cls, 131 | 'classFactor': 9, 132 | 'stateTable': ISO2022CN_st, 133 | 'charLenTable': ISO2022CNCharLenTable, 134 | 'name': "ISO-2022-CN"} 135 | 136 | ISO2022JP_cls = ( 137 | 2,0,0,0,0,0,0,0, # 00 - 07 138 | 0,0,0,0,0,0,2,2, # 08 - 0f 139 | 0,0,0,0,0,0,0,0, # 10 - 17 140 | 0,0,0,1,0,0,0,0, # 18 - 1f 141 | 0,0,0,0,7,0,0,0, # 20 - 27 142 | 3,0,0,0,0,0,0,0, # 28 - 2f 143 | 0,0,0,0,0,0,0,0, # 30 - 37 144 | 0,0,0,0,0,0,0,0, # 38 - 3f 145 | 6,0,4,0,8,0,0,0, # 40 - 47 146 | 0,9,5,0,0,0,0,0, # 48 - 4f 147 | 0,0,0,0,0,0,0,0, # 50 - 57 148 | 0,0,0,0,0,0,0,0, # 58 - 5f 149 | 0,0,0,0,0,0,0,0, # 60 - 67 150 | 0,0,0,0,0,0,0,0, # 68 - 6f 151 | 0,0,0,0,0,0,0,0, # 70 - 77 152 | 0,0,0,0,0,0,0,0, # 78 - 7f 153 | 2,2,2,2,2,2,2,2, # 80 - 87 154 | 2,2,2,2,2,2,2,2, # 88 - 8f 155 | 2,2,2,2,2,2,2,2, # 90 - 97 156 | 2,2,2,2,2,2,2,2, # 98 - 9f 157 | 2,2,2,2,2,2,2,2, # a0 - a7 158 | 2,2,2,2,2,2,2,2, # a8 - af 159 | 2,2,2,2,2,2,2,2, # b0 - b7 160 | 2,2,2,2,2,2,2,2, # b8 - bf 161 | 2,2,2,2,2,2,2,2, # c0 - c7 162 | 2,2,2,2,2,2,2,2, # c8 - cf 163 | 2,2,2,2,2,2,2,2, # d0 - d7 164 | 2,2,2,2,2,2,2,2, # d8 - df 165 | 2,2,2,2,2,2,2,2, # e0 - e7 166 | 2,2,2,2,2,2,2,2, # e8 - ef 167 | 2,2,2,2,2,2,2,2, # f0 - f7 168 | 2,2,2,2,2,2,2,2, # f8 - ff 169 | ) 170 | 171 | ISO2022JP_st = ( 172 | eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 173 | eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f 174 | eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 175 | eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f 176 | eError, 5,eError,eError,eError, 4,eError,eError,# 20-27 177 | eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f 178 | eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37 179 | eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f 180 | eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 181 | ) 182 | 183 | ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) 184 | 185 | ISO2022JPSMModel = {'classTable': ISO2022JP_cls, 186 | 'classFactor': 10, 187 | 'stateTable': ISO2022JP_st, 188 | 'charLenTable': ISO2022JPCharLenTable, 189 | 'name': "ISO-2022-JP"} 190 | 191 | ISO2022KR_cls = ( 192 | 2,0,0,0,0,0,0,0, # 00 - 07 193 | 0,0,0,0,0,0,0,0, # 08 - 0f 194 | 0,0,0,0,0,0,0,0, # 10 - 17 195 | 0,0,0,1,0,0,0,0, # 18 - 1f 196 | 0,0,0,0,3,0,0,0, # 20 - 27 197 | 0,4,0,0,0,0,0,0, # 28 - 2f 198 | 0,0,0,0,0,0,0,0, # 30 - 37 199 | 0,0,0,0,0,0,0,0, # 38 - 3f 200 | 0,0,0,5,0,0,0,0, # 40 - 47 201 | 0,0,0,0,0,0,0,0, # 48 - 4f 202 | 0,0,0,0,0,0,0,0, # 50 - 57 203 | 0,0,0,0,0,0,0,0, # 58 - 5f 204 | 0,0,0,0,0,0,0,0, # 60 - 67 205 | 0,0,0,0,0,0,0,0, # 68 - 6f 206 | 0,0,0,0,0,0,0,0, # 70 - 77 207 | 0,0,0,0,0,0,0,0, # 78 - 7f 208 | 2,2,2,2,2,2,2,2, # 80 - 87 209 | 2,2,2,2,2,2,2,2, # 88 - 8f 210 | 2,2,2,2,2,2,2,2, # 90 - 97 211 | 2,2,2,2,2,2,2,2, # 98 - 9f 212 | 2,2,2,2,2,2,2,2, # a0 - a7 213 | 2,2,2,2,2,2,2,2, # a8 - af 214 | 2,2,2,2,2,2,2,2, # b0 - b7 215 | 2,2,2,2,2,2,2,2, # b8 - bf 216 | 2,2,2,2,2,2,2,2, # c0 - c7 217 | 2,2,2,2,2,2,2,2, # c8 - cf 218 | 2,2,2,2,2,2,2,2, # d0 - d7 219 | 2,2,2,2,2,2,2,2, # d8 - df 220 | 2,2,2,2,2,2,2,2, # e0 - e7 221 | 2,2,2,2,2,2,2,2, # e8 - ef 222 | 2,2,2,2,2,2,2,2, # f0 - f7 223 | 2,2,2,2,2,2,2,2, # f8 - ff 224 | ) 225 | 226 | ISO2022KR_st = ( 227 | eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07 228 | eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f 229 | eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17 230 | eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f 231 | eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27 232 | ) 233 | 234 | ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0) 235 | 236 | ISO2022KRSMModel = {'classTable': ISO2022KR_cls, 237 | 'classFactor': 6, 238 | 'stateTable': ISO2022KR_st, 239 | 'charLenTable': ISO2022KRCharLenTable, 240 | 'name': "ISO-2022-KR"} 241 | 242 | # flake8: noqa 243 | -------------------------------------------------------------------------------- /chardet/eucjpprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import sys 29 | from . import constants 30 | from .mbcharsetprober import MultiByteCharSetProber 31 | from .codingstatemachine import CodingStateMachine 32 | from .chardistribution import EUCJPDistributionAnalysis 33 | from .jpcntx import EUCJPContextAnalysis 34 | from .mbcssm import EUCJPSMModel 35 | 36 | 37 | class EUCJPProber(MultiByteCharSetProber): 38 | def __init__(self): 39 | MultiByteCharSetProber.__init__(self) 40 | self._mCodingSM = CodingStateMachine(EUCJPSMModel) 41 | self._mDistributionAnalyzer = EUCJPDistributionAnalysis() 42 | self._mContextAnalyzer = EUCJPContextAnalysis() 43 | self.reset() 44 | 45 | def reset(self): 46 | MultiByteCharSetProber.reset(self) 47 | self._mContextAnalyzer.reset() 48 | 49 | def get_charset_name(self): 50 | return "EUC-JP" 51 | 52 | def feed(self, aBuf): 53 | aLen = len(aBuf) 54 | for i in range(0, aLen): 55 | # PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte 56 | codingState = self._mCodingSM.next_state(aBuf[i]) 57 | if codingState == constants.eError: 58 | if constants._debug: 59 | sys.stderr.write(self.get_charset_name() 60 | + ' prober hit error at byte ' + str(i) 61 | + '\n') 62 | self._mState = constants.eNotMe 63 | break 64 | elif codingState == constants.eItsMe: 65 | self._mState = constants.eFoundIt 66 | break 67 | elif codingState == constants.eStart: 68 | charLen = self._mCodingSM.get_current_charlen() 69 | if i == 0: 70 | self._mLastChar[1] = aBuf[0] 71 | self._mContextAnalyzer.feed(self._mLastChar, charLen) 72 | self._mDistributionAnalyzer.feed(self._mLastChar, charLen) 73 | else: 74 | self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen) 75 | self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], 76 | charLen) 77 | 78 | self._mLastChar[0] = aBuf[aLen - 1] 79 | 80 | if self.get_state() == constants.eDetecting: 81 | if (self._mContextAnalyzer.got_enough_data() and 82 | (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): 83 | self._mState = constants.eFoundIt 84 | 85 | return self.get_state() 86 | 87 | def get_confidence(self): 88 | contxtCf = self._mContextAnalyzer.get_confidence() 89 | distribCf = self._mDistributionAnalyzer.get_confidence() 90 | return max(contxtCf, distribCf) 91 | -------------------------------------------------------------------------------- /chardet/euckrprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from .mbcharsetprober import MultiByteCharSetProber 29 | from .codingstatemachine import CodingStateMachine 30 | from .chardistribution import EUCKRDistributionAnalysis 31 | from .mbcssm import EUCKRSMModel 32 | 33 | 34 | class EUCKRProber(MultiByteCharSetProber): 35 | def __init__(self): 36 | MultiByteCharSetProber.__init__(self) 37 | self._mCodingSM = CodingStateMachine(EUCKRSMModel) 38 | self._mDistributionAnalyzer = EUCKRDistributionAnalysis() 39 | self.reset() 40 | 41 | def get_charset_name(self): 42 | return "EUC-KR" 43 | -------------------------------------------------------------------------------- /chardet/euctwprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from .mbcharsetprober import MultiByteCharSetProber 29 | from .codingstatemachine import CodingStateMachine 30 | from .chardistribution import EUCTWDistributionAnalysis 31 | from .mbcssm import EUCTWSMModel 32 | 33 | class EUCTWProber(MultiByteCharSetProber): 34 | def __init__(self): 35 | MultiByteCharSetProber.__init__(self) 36 | self._mCodingSM = CodingStateMachine(EUCTWSMModel) 37 | self._mDistributionAnalyzer = EUCTWDistributionAnalysis() 38 | self.reset() 39 | 40 | def get_charset_name(self): 41 | return "EUC-TW" 42 | -------------------------------------------------------------------------------- /chardet/gb2312prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from .mbcharsetprober import MultiByteCharSetProber 29 | from .codingstatemachine import CodingStateMachine 30 | from .chardistribution import GB2312DistributionAnalysis 31 | from .mbcssm import GB2312SMModel 32 | 33 | class GB2312Prober(MultiByteCharSetProber): 34 | def __init__(self): 35 | MultiByteCharSetProber.__init__(self) 36 | self._mCodingSM = CodingStateMachine(GB2312SMModel) 37 | self._mDistributionAnalyzer = GB2312DistributionAnalysis() 38 | self.reset() 39 | 40 | def get_charset_name(self): 41 | return "GB2312" 42 | -------------------------------------------------------------------------------- /chardet/latin1prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | from .charsetprober import CharSetProber 30 | from .constants import eNotMe 31 | from .compat import wrap_ord 32 | 33 | FREQ_CAT_NUM = 4 34 | 35 | UDF = 0 # undefined 36 | OTH = 1 # other 37 | ASC = 2 # ascii capital letter 38 | ASS = 3 # ascii small letter 39 | ACV = 4 # accent capital vowel 40 | ACO = 5 # accent capital other 41 | ASV = 6 # accent small vowel 42 | ASO = 7 # accent small other 43 | CLASS_NUM = 8 # total classes 44 | 45 | Latin1_CharToClass = ( 46 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 47 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F 48 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 49 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F 50 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 51 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F 52 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 53 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F 54 | OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 55 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F 56 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 57 | ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F 58 | OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 59 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F 60 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 61 | ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F 62 | OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87 63 | OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F 64 | UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97 65 | OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F 66 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7 67 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF 68 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 69 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF 70 | ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7 71 | ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF 72 | ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7 73 | ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF 74 | ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7 75 | ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF 76 | ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7 77 | ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF 78 | ) 79 | 80 | # 0 : illegal 81 | # 1 : very unlikely 82 | # 2 : normal 83 | # 3 : very likely 84 | Latin1ClassModel = ( 85 | # UDF OTH ASC ASS ACV ACO ASV ASO 86 | 0, 0, 0, 0, 0, 0, 0, 0, # UDF 87 | 0, 3, 3, 3, 3, 3, 3, 3, # OTH 88 | 0, 3, 3, 3, 3, 3, 3, 3, # ASC 89 | 0, 3, 3, 3, 1, 1, 3, 3, # ASS 90 | 0, 3, 3, 3, 1, 2, 1, 2, # ACV 91 | 0, 3, 3, 3, 3, 3, 3, 3, # ACO 92 | 0, 3, 1, 3, 1, 1, 1, 3, # ASV 93 | 0, 3, 1, 3, 1, 1, 3, 3, # ASO 94 | ) 95 | 96 | 97 | class Latin1Prober(CharSetProber): 98 | def __init__(self): 99 | CharSetProber.__init__(self) 100 | self.reset() 101 | 102 | def reset(self): 103 | self._mLastCharClass = OTH 104 | self._mFreqCounter = [0] * FREQ_CAT_NUM 105 | CharSetProber.reset(self) 106 | 107 | def get_charset_name(self): 108 | return "windows-1252" 109 | 110 | def feed(self, aBuf): 111 | aBuf = self.filter_with_english_letters(aBuf) 112 | for c in aBuf: 113 | charClass = Latin1_CharToClass[wrap_ord(c)] 114 | freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) 115 | + charClass] 116 | if freq == 0: 117 | self._mState = eNotMe 118 | break 119 | self._mFreqCounter[freq] += 1 120 | self._mLastCharClass = charClass 121 | 122 | return self.get_state() 123 | 124 | def get_confidence(self): 125 | if self.get_state() == eNotMe: 126 | return 0.01 127 | 128 | total = sum(self._mFreqCounter) 129 | if total < 0.01: 130 | confidence = 0.0 131 | else: 132 | confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0) 133 | / total) 134 | if confidence < 0.0: 135 | confidence = 0.0 136 | # lower the confidence of latin1 so that other more accurate 137 | # detector can take priority. 138 | confidence = confidence * 0.73 139 | return confidence 140 | -------------------------------------------------------------------------------- /chardet/mbcharsetprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # Proofpoint, Inc. 13 | # 14 | # This library is free software; you can redistribute it and/or 15 | # modify it under the terms of the GNU Lesser General Public 16 | # License as published by the Free Software Foundation; either 17 | # version 2.1 of the License, or (at your option) any later version. 18 | # 19 | # This library is distributed in the hope that it will be useful, 20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 | # Lesser General Public License for more details. 23 | # 24 | # You should have received a copy of the GNU Lesser General Public 25 | # License along with this library; if not, write to the Free Software 26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 27 | # 02110-1301 USA 28 | ######################### END LICENSE BLOCK ######################### 29 | 30 | import sys 31 | from . import constants 32 | from .charsetprober import CharSetProber 33 | 34 | 35 | class MultiByteCharSetProber(CharSetProber): 36 | def __init__(self): 37 | CharSetProber.__init__(self) 38 | self._mDistributionAnalyzer = None 39 | self._mCodingSM = None 40 | self._mLastChar = [0, 0] 41 | 42 | def reset(self): 43 | CharSetProber.reset(self) 44 | if self._mCodingSM: 45 | self._mCodingSM.reset() 46 | if self._mDistributionAnalyzer: 47 | self._mDistributionAnalyzer.reset() 48 | self._mLastChar = [0, 0] 49 | 50 | def get_charset_name(self): 51 | pass 52 | 53 | def feed(self, aBuf): 54 | aLen = len(aBuf) 55 | for i in range(0, aLen): 56 | codingState = self._mCodingSM.next_state(aBuf[i]) 57 | if codingState == constants.eError: 58 | if constants._debug: 59 | sys.stderr.write(self.get_charset_name() 60 | + ' prober hit error at byte ' + str(i) 61 | + '\n') 62 | self._mState = constants.eNotMe 63 | break 64 | elif codingState == constants.eItsMe: 65 | self._mState = constants.eFoundIt 66 | break 67 | elif codingState == constants.eStart: 68 | charLen = self._mCodingSM.get_current_charlen() 69 | if i == 0: 70 | self._mLastChar[1] = aBuf[0] 71 | self._mDistributionAnalyzer.feed(self._mLastChar, charLen) 72 | else: 73 | self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], 74 | charLen) 75 | 76 | self._mLastChar[0] = aBuf[aLen - 1] 77 | 78 | if self.get_state() == constants.eDetecting: 79 | if (self._mDistributionAnalyzer.got_enough_data() and 80 | (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): 81 | self._mState = constants.eFoundIt 82 | 83 | return self.get_state() 84 | 85 | def get_confidence(self): 86 | return self._mDistributionAnalyzer.get_confidence() 87 | -------------------------------------------------------------------------------- /chardet/mbcsgroupprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # Proofpoint, Inc. 13 | # 14 | # This library is free software; you can redistribute it and/or 15 | # modify it under the terms of the GNU Lesser General Public 16 | # License as published by the Free Software Foundation; either 17 | # version 2.1 of the License, or (at your option) any later version. 18 | # 19 | # This library is distributed in the hope that it will be useful, 20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 | # Lesser General Public License for more details. 23 | # 24 | # You should have received a copy of the GNU Lesser General Public 25 | # License along with this library; if not, write to the Free Software 26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 27 | # 02110-1301 USA 28 | ######################### END LICENSE BLOCK ######################### 29 | 30 | from .charsetgroupprober import CharSetGroupProber 31 | from .utf8prober import UTF8Prober 32 | from .sjisprober import SJISProber 33 | from .eucjpprober import EUCJPProber 34 | from .gb2312prober import GB2312Prober 35 | from .euckrprober import EUCKRProber 36 | from .cp949prober import CP949Prober 37 | from .big5prober import Big5Prober 38 | from .euctwprober import EUCTWProber 39 | 40 | 41 | class MBCSGroupProber(CharSetGroupProber): 42 | def __init__(self): 43 | CharSetGroupProber.__init__(self) 44 | self._mProbers = [ 45 | UTF8Prober(), 46 | SJISProber(), 47 | EUCJPProber(), 48 | GB2312Prober(), 49 | EUCKRProber(), 50 | CP949Prober(), 51 | Big5Prober(), 52 | EUCTWProber() 53 | ] 54 | self.reset() 55 | -------------------------------------------------------------------------------- /chardet/sbcharsetprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | import sys 30 | from . import constants 31 | from .charsetprober import CharSetProber 32 | from .compat import wrap_ord 33 | 34 | SAMPLE_SIZE = 64 35 | SB_ENOUGH_REL_THRESHOLD = 1024 36 | POSITIVE_SHORTCUT_THRESHOLD = 0.95 37 | NEGATIVE_SHORTCUT_THRESHOLD = 0.05 38 | SYMBOL_CAT_ORDER = 250 39 | NUMBER_OF_SEQ_CAT = 4 40 | POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1 41 | #NEGATIVE_CAT = 0 42 | 43 | 44 | class SingleByteCharSetProber(CharSetProber): 45 | def __init__(self, model, reversed=False, nameProber=None): 46 | CharSetProber.__init__(self) 47 | self._mModel = model 48 | # TRUE if we need to reverse every pair in the model lookup 49 | self._mReversed = reversed 50 | # Optional auxiliary prober for name decision 51 | self._mNameProber = nameProber 52 | self.reset() 53 | 54 | def reset(self): 55 | CharSetProber.reset(self) 56 | # char order of last character 57 | self._mLastOrder = 255 58 | self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT 59 | self._mTotalSeqs = 0 60 | self._mTotalChar = 0 61 | # characters that fall in our sampling range 62 | self._mFreqChar = 0 63 | 64 | def get_charset_name(self): 65 | if self._mNameProber: 66 | return self._mNameProber.get_charset_name() 67 | else: 68 | return self._mModel['charsetName'] 69 | 70 | def feed(self, aBuf): 71 | if not self._mModel['keepEnglishLetter']: 72 | aBuf = self.filter_without_english_letters(aBuf) 73 | aLen = len(aBuf) 74 | if not aLen: 75 | return self.get_state() 76 | for c in aBuf: 77 | order = self._mModel['charToOrderMap'][wrap_ord(c)] 78 | if order < SYMBOL_CAT_ORDER: 79 | self._mTotalChar += 1 80 | if order < SAMPLE_SIZE: 81 | self._mFreqChar += 1 82 | if self._mLastOrder < SAMPLE_SIZE: 83 | self._mTotalSeqs += 1 84 | if not self._mReversed: 85 | i = (self._mLastOrder * SAMPLE_SIZE) + order 86 | model = self._mModel['precedenceMatrix'][i] 87 | else: # reverse the order of the letters in the lookup 88 | i = (order * SAMPLE_SIZE) + self._mLastOrder 89 | model = self._mModel['precedenceMatrix'][i] 90 | self._mSeqCounters[model] += 1 91 | self._mLastOrder = order 92 | 93 | if self.get_state() == constants.eDetecting: 94 | if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD: 95 | cf = self.get_confidence() 96 | if cf > POSITIVE_SHORTCUT_THRESHOLD: 97 | if constants._debug: 98 | sys.stderr.write('%s confidence = %s, we have a' 99 | 'winner\n' % 100 | (self._mModel['charsetName'], cf)) 101 | self._mState = constants.eFoundIt 102 | elif cf < NEGATIVE_SHORTCUT_THRESHOLD: 103 | if constants._debug: 104 | sys.stderr.write('%s confidence = %s, below negative' 105 | 'shortcut threshhold %s\n' % 106 | (self._mModel['charsetName'], cf, 107 | NEGATIVE_SHORTCUT_THRESHOLD)) 108 | self._mState = constants.eNotMe 109 | 110 | return self.get_state() 111 | 112 | def get_confidence(self): 113 | r = 0.01 114 | if self._mTotalSeqs > 0: 115 | r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs 116 | / self._mModel['mTypicalPositiveRatio']) 117 | r = r * self._mFreqChar / self._mTotalChar 118 | if r >= 1.0: 119 | r = 0.99 120 | return r 121 | -------------------------------------------------------------------------------- /chardet/sbcsgroupprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | from .charsetgroupprober import CharSetGroupProber 30 | from .sbcharsetprober import SingleByteCharSetProber 31 | from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel, 32 | Latin5CyrillicModel, MacCyrillicModel, 33 | Ibm866Model, Ibm855Model) 34 | from .langgreekmodel import Latin7GreekModel, Win1253GreekModel 35 | from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel 36 | from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel 37 | from .langthaimodel import TIS620ThaiModel 38 | from .langhebrewmodel import Win1255HebrewModel 39 | from .hebrewprober import HebrewProber 40 | 41 | 42 | class SBCSGroupProber(CharSetGroupProber): 43 | def __init__(self): 44 | CharSetGroupProber.__init__(self) 45 | self._mProbers = [ 46 | SingleByteCharSetProber(Win1251CyrillicModel), 47 | SingleByteCharSetProber(Koi8rModel), 48 | SingleByteCharSetProber(Latin5CyrillicModel), 49 | SingleByteCharSetProber(MacCyrillicModel), 50 | SingleByteCharSetProber(Ibm866Model), 51 | SingleByteCharSetProber(Ibm855Model), 52 | SingleByteCharSetProber(Latin7GreekModel), 53 | SingleByteCharSetProber(Win1253GreekModel), 54 | SingleByteCharSetProber(Latin5BulgarianModel), 55 | SingleByteCharSetProber(Win1251BulgarianModel), 56 | SingleByteCharSetProber(Latin2HungarianModel), 57 | SingleByteCharSetProber(Win1250HungarianModel), 58 | SingleByteCharSetProber(TIS620ThaiModel), 59 | ] 60 | hebrewProber = HebrewProber() 61 | logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, 62 | False, hebrewProber) 63 | visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True, 64 | hebrewProber) 65 | hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber) 66 | self._mProbers.extend([hebrewProber, logicalHebrewProber, 67 | visualHebrewProber]) 68 | 69 | self.reset() 70 | -------------------------------------------------------------------------------- /chardet/sjisprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import sys 29 | from .mbcharsetprober import MultiByteCharSetProber 30 | from .codingstatemachine import CodingStateMachine 31 | from .chardistribution import SJISDistributionAnalysis 32 | from .jpcntx import SJISContextAnalysis 33 | from .mbcssm import SJISSMModel 34 | from . import constants 35 | 36 | 37 | class SJISProber(MultiByteCharSetProber): 38 | def __init__(self): 39 | MultiByteCharSetProber.__init__(self) 40 | self._mCodingSM = CodingStateMachine(SJISSMModel) 41 | self._mDistributionAnalyzer = SJISDistributionAnalysis() 42 | self._mContextAnalyzer = SJISContextAnalysis() 43 | self.reset() 44 | 45 | def reset(self): 46 | MultiByteCharSetProber.reset(self) 47 | self._mContextAnalyzer.reset() 48 | 49 | def get_charset_name(self): 50 | return self._mContextAnalyzer.get_charset_name() 51 | 52 | def feed(self, aBuf): 53 | aLen = len(aBuf) 54 | for i in range(0, aLen): 55 | codingState = self._mCodingSM.next_state(aBuf[i]) 56 | if codingState == constants.eError: 57 | if constants._debug: 58 | sys.stderr.write(self.get_charset_name() 59 | + ' prober hit error at byte ' + str(i) 60 | + '\n') 61 | self._mState = constants.eNotMe 62 | break 63 | elif codingState == constants.eItsMe: 64 | self._mState = constants.eFoundIt 65 | break 66 | elif codingState == constants.eStart: 67 | charLen = self._mCodingSM.get_current_charlen() 68 | if i == 0: 69 | self._mLastChar[1] = aBuf[0] 70 | self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:], 71 | charLen) 72 | self._mDistributionAnalyzer.feed(self._mLastChar, charLen) 73 | else: 74 | self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3 75 | - charLen], charLen) 76 | self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], 77 | charLen) 78 | 79 | self._mLastChar[0] = aBuf[aLen - 1] 80 | 81 | if self.get_state() == constants.eDetecting: 82 | if (self._mContextAnalyzer.got_enough_data() and 83 | (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): 84 | self._mState = constants.eFoundIt 85 | 86 | return self.get_state() 87 | 88 | def get_confidence(self): 89 | contxtCf = self._mContextAnalyzer.get_confidence() 90 | distribCf = self._mDistributionAnalyzer.get_confidence() 91 | return max(contxtCf, distribCf) 92 | -------------------------------------------------------------------------------- /chardet/universaldetector.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | from . import constants 30 | import sys 31 | import codecs 32 | from .latin1prober import Latin1Prober # windows-1252 33 | from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets 34 | from .sbcsgroupprober import SBCSGroupProber # single-byte character sets 35 | from .escprober import EscCharSetProber # ISO-2122, etc. 36 | import re 37 | 38 | MINIMUM_THRESHOLD = 0.20 39 | ePureAscii = 0 40 | eEscAscii = 1 41 | eHighbyte = 2 42 | 43 | 44 | class UniversalDetector: 45 | def __init__(self): 46 | self._highBitDetector = re.compile(b'[\x80-\xFF]') 47 | self._escDetector = re.compile(b'(\033|~{)') 48 | self._mEscCharSetProber = None 49 | self._mCharSetProbers = [] 50 | self.reset() 51 | 52 | def reset(self): 53 | self.result = {'encoding': None, 'confidence': 0.0} 54 | self.done = False 55 | self._mStart = True 56 | self._mGotData = False 57 | self._mInputState = ePureAscii 58 | self._mLastChar = b'' 59 | if self._mEscCharSetProber: 60 | self._mEscCharSetProber.reset() 61 | for prober in self._mCharSetProbers: 62 | prober.reset() 63 | 64 | def feed(self, aBuf): 65 | if self.done: 66 | return 67 | 68 | aLen = len(aBuf) 69 | if not aLen: 70 | return 71 | 72 | if not self._mGotData: 73 | # If the data starts with BOM, we know it is UTF 74 | if aBuf[:3] == codecs.BOM_UTF8: 75 | # EF BB BF UTF-8 with BOM 76 | self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0} 77 | elif aBuf[:4] == codecs.BOM_UTF32_LE: 78 | # FF FE 00 00 UTF-32, little-endian BOM 79 | self.result = {'encoding': "UTF-32LE", 'confidence': 1.0} 80 | elif aBuf[:4] == codecs.BOM_UTF32_BE: 81 | # 00 00 FE FF UTF-32, big-endian BOM 82 | self.result = {'encoding': "UTF-32BE", 'confidence': 1.0} 83 | elif aBuf[:4] == b'\xFE\xFF\x00\x00': 84 | # FE FF 00 00 UCS-4, unusual octet order BOM (3412) 85 | self.result = { 86 | 'encoding': "X-ISO-10646-UCS-4-3412", 87 | 'confidence': 1.0 88 | } 89 | elif aBuf[:4] == b'\x00\x00\xFF\xFE': 90 | # 00 00 FF FE UCS-4, unusual octet order BOM (2143) 91 | self.result = { 92 | 'encoding': "X-ISO-10646-UCS-4-2143", 93 | 'confidence': 1.0 94 | } 95 | elif aBuf[:2] == codecs.BOM_LE: 96 | # FF FE UTF-16, little endian BOM 97 | self.result = {'encoding': "UTF-16LE", 'confidence': 1.0} 98 | elif aBuf[:2] == codecs.BOM_BE: 99 | # FE FF UTF-16, big endian BOM 100 | self.result = {'encoding': "UTF-16BE", 'confidence': 1.0} 101 | 102 | self._mGotData = True 103 | if self.result['encoding'] and (self.result['confidence'] > 0.0): 104 | self.done = True 105 | return 106 | 107 | if self._mInputState == ePureAscii: 108 | if self._highBitDetector.search(aBuf): 109 | self._mInputState = eHighbyte 110 | elif ((self._mInputState == ePureAscii) and 111 | self._escDetector.search(self._mLastChar + aBuf)): 112 | self._mInputState = eEscAscii 113 | 114 | self._mLastChar = aBuf[-1:] 115 | 116 | if self._mInputState == eEscAscii: 117 | if not self._mEscCharSetProber: 118 | self._mEscCharSetProber = EscCharSetProber() 119 | if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt: 120 | self.result = {'encoding': self._mEscCharSetProber.get_charset_name(), 121 | 'confidence': self._mEscCharSetProber.get_confidence()} 122 | self.done = True 123 | elif self._mInputState == eHighbyte: 124 | if not self._mCharSetProbers: 125 | self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), 126 | Latin1Prober()] 127 | for prober in self._mCharSetProbers: 128 | if prober.feed(aBuf) == constants.eFoundIt: 129 | self.result = {'encoding': prober.get_charset_name(), 130 | 'confidence': prober.get_confidence()} 131 | self.done = True 132 | break 133 | 134 | def close(self): 135 | if self.done: 136 | return 137 | if not self._mGotData: 138 | if constants._debug: 139 | sys.stderr.write('no data received!\n') 140 | return 141 | self.done = True 142 | 143 | if self._mInputState == ePureAscii: 144 | self.result = {'encoding': 'ascii', 'confidence': 1.0} 145 | return self.result 146 | 147 | if self._mInputState == eHighbyte: 148 | proberConfidence = None 149 | maxProberConfidence = 0.0 150 | maxProber = None 151 | for prober in self._mCharSetProbers: 152 | if not prober: 153 | continue 154 | proberConfidence = prober.get_confidence() 155 | if proberConfidence > maxProberConfidence: 156 | maxProberConfidence = proberConfidence 157 | maxProber = prober 158 | if maxProber and (maxProberConfidence > MINIMUM_THRESHOLD): 159 | self.result = {'encoding': maxProber.get_charset_name(), 160 | 'confidence': maxProber.get_confidence()} 161 | return self.result 162 | 163 | if constants._debug: 164 | sys.stderr.write('no probers hit minimum threshhold\n') 165 | for prober in self._mCharSetProbers[0].mProbers: 166 | if not prober: 167 | continue 168 | sys.stderr.write('%s confidence = %s\n' % 169 | (prober.get_charset_name(), 170 | prober.get_confidence())) 171 | -------------------------------------------------------------------------------- /chardet/utf8prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from . import constants 29 | from .charsetprober import CharSetProber 30 | from .codingstatemachine import CodingStateMachine 31 | from .mbcssm import UTF8SMModel 32 | 33 | ONE_CHAR_PROB = 0.5 34 | 35 | 36 | class UTF8Prober(CharSetProber): 37 | def __init__(self): 38 | CharSetProber.__init__(self) 39 | self._mCodingSM = CodingStateMachine(UTF8SMModel) 40 | self.reset() 41 | 42 | def reset(self): 43 | CharSetProber.reset(self) 44 | self._mCodingSM.reset() 45 | self._mNumOfMBChar = 0 46 | 47 | def get_charset_name(self): 48 | return "utf-8" 49 | 50 | def feed(self, aBuf): 51 | for c in aBuf: 52 | codingState = self._mCodingSM.next_state(c) 53 | if codingState == constants.eError: 54 | self._mState = constants.eNotMe 55 | break 56 | elif codingState == constants.eItsMe: 57 | self._mState = constants.eFoundIt 58 | break 59 | elif codingState == constants.eStart: 60 | if self._mCodingSM.get_current_charlen() >= 2: 61 | self._mNumOfMBChar += 1 62 | 63 | if self.get_state() == constants.eDetecting: 64 | if self.get_confidence() > constants.SHORTCUT_THRESHOLD: 65 | self._mState = constants.eFoundIt 66 | 67 | return self.get_state() 68 | 69 | def get_confidence(self): 70 | unlike = 0.99 71 | if self._mNumOfMBChar < 6: 72 | for i in range(0, self._mNumOfMBChar): 73 | unlike = unlike * ONE_CHAR_PROB 74 | return 1.0 - unlike 75 | else: 76 | return unlike 77 | -------------------------------------------------------------------------------- /conf.example.ini: -------------------------------------------------------------------------------- 1 | [main] 2 | auth_token= 3 | account_type=yinxiang 4 | test=no 5 | input_encoding= 6 | ignore_dirs=img,test 7 | style=github 8 | log_level=info -------------------------------------------------------------------------------- /evernote/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/evernote/__init__.py -------------------------------------------------------------------------------- /evernote/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/evernote/api/__init__.py -------------------------------------------------------------------------------- /evernote/api/client.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import functools 3 | import inspect 4 | import re 5 | import oauth2 as oauth 6 | import urllib 7 | import urlparse 8 | 9 | import evernote.edam.userstore.UserStore as UserStore 10 | import evernote.edam.notestore.NoteStore as NoteStore 11 | import evernote.edam.userstore.constants as UserStoreConstants 12 | 13 | import thrift.protocol.TBinaryProtocol as TBinaryProtocol 14 | import thrift.transport.THttpClient as THttpClient 15 | 16 | 17 | class EvernoteClient(object): 18 | 19 | def __init__(self, **options): 20 | self.consumer_key = options.get('consumer_key') 21 | self.consumer_secret = options.get('consumer_secret') 22 | self.sandbox = options.get('sandbox', True) 23 | if self.sandbox: 24 | default_service_host = 'sandbox.evernote.com' 25 | else: 26 | default_service_host = 'www.evernote.com' 27 | self.yinxiang = options.get('yinxiang', False) 28 | if self.yinxiang: 29 | default_service_host = 'app.yinxiang.com' 30 | print 'host:', default_service_host 31 | self.service_host = options.get('service_host', default_service_host) 32 | self.additional_headers = options.get('additional_headers', {}) 33 | self.token = options.get('token') 34 | self.secret = options.get('secret') 35 | 36 | def get_request_token(self, callback_url): 37 | client = self._get_oauth_client() 38 | request_url = '%s?oauth_callback=%s' % ( 39 | self._get_endpoint('oauth'), urllib.quote(callback_url)) 40 | 41 | resp, content = client.request(request_url, 'GET') 42 | request_token = dict(urlparse.parse_qsl(content)) 43 | return request_token 44 | 45 | def get_authorize_url(self, request_token): 46 | return '%s?oauth_token=%s' % ( 47 | self._get_endpoint('OAuth.action'), 48 | urllib.quote(request_token['oauth_token'])) 49 | 50 | def get_access_token_dict( 51 | self, oauth_token, oauth_token_secret, oauth_verifier 52 | ): 53 | token = oauth.Token(oauth_token, oauth_token_secret) 54 | token.set_verifier(oauth_verifier) 55 | client = self._get_oauth_client(token) 56 | 57 | resp, content = client.request(self._get_endpoint('oauth'), 'POST') 58 | access_token_dict = dict(urlparse.parse_qsl(content)) 59 | self.token = access_token_dict['oauth_token'] 60 | return access_token_dict 61 | 62 | def get_access_token( 63 | self, oauth_token, oauth_token_secret, oauth_verifier 64 | ): 65 | access_token_dict = self.get_access_token_dict( 66 | oauth_token, 67 | oauth_token_secret, 68 | oauth_verifier 69 | ) 70 | return access_token_dict['oauth_token'] 71 | 72 | def get_user_store(self): 73 | user_store_uri = self._get_endpoint("/edam/user") 74 | store = Store(self.token, UserStore.Client, user_store_uri) 75 | if not store: # Trick for PyDev code completion 76 | store = UserStore.Client() 77 | raise Exception('Should never reach here') 78 | return store 79 | 80 | def get_note_store(self): 81 | user_store = self.get_user_store() 82 | note_store_uri = user_store.getNoteStoreUrl() 83 | store = Store(self.token, NoteStore.Client, note_store_uri) 84 | if not store: # Trick for PyDev code completion 85 | store = NoteStore.Client() 86 | raise Exception('Should never reach here') 87 | return store 88 | 89 | def get_shared_note_store(self, linkedNotebook): 90 | note_store_uri = linkedNotebook.noteStoreUrl 91 | note_store = Store(self.token, NoteStore.Client, note_store_uri) 92 | shared_auth = note_store.authenticateToSharedNotebook( 93 | linkedNotebook.shareKey) 94 | shared_token = shared_auth.authenticationToken 95 | store = Store(shared_token, NoteStore.Client, note_store_uri) 96 | if not store: # Trick for PyDev code completion 97 | store = NoteStore.Client() 98 | raise Exception('Should never reach here') 99 | return store 100 | 101 | def get_business_note_store(self): 102 | user_store = self.get_user_store() 103 | biz_auth = user_store.authenticateToBusiness() 104 | biz_token = biz_auth.authenticationToken 105 | note_store_uri = biz_auth.noteStoreUrl 106 | store = Store(biz_token, NoteStore.Client, note_store_uri) 107 | if not store: # Trick for PyDev code completion 108 | store = NoteStore.Client() 109 | raise Exception('Should never reach here') 110 | return store 111 | 112 | def _get_oauth_client(self, token=None): 113 | consumer = oauth.Consumer(self.consumer_key, self.consumer_secret) 114 | if token: 115 | client = oauth.Client(consumer, token) 116 | else: 117 | client = oauth.Client(consumer) 118 | return client 119 | 120 | def _get_endpoint(self, path=None): 121 | url = "https://%s" % (self.service_host) 122 | if path is not None: 123 | url += "/%s" % path 124 | return url 125 | 126 | 127 | class Store(object): 128 | 129 | def __init__(self, token, client_class, store_url): 130 | self.token = token 131 | m = re.search(':A=(.+):', token) 132 | if m: 133 | self._user_agent_id = m.groups()[0] 134 | else: 135 | self._user_agent_id = '' 136 | self._client = self._get_thrift_client(client_class, store_url) 137 | 138 | def __getattr__(self, name): 139 | def delegate_method(*args, **kwargs): 140 | targetMethod = getattr(self._client, name, None) 141 | if targetMethod is None: 142 | return object.__getattribute__(self, name)(*args, **kwargs) 143 | 144 | org_args = inspect.getargspec(targetMethod).args 145 | if len(org_args) == len(args) + 1: 146 | return targetMethod(*args, **kwargs) 147 | elif 'authenticationToken' in org_args: 148 | skip_args = ['self', 'authenticationToken'] 149 | arg_names = [i for i in org_args if i not in skip_args] 150 | return functools.partial( 151 | targetMethod, authenticationToken=self.token 152 | )(**dict(zip(arg_names, args))) 153 | else: 154 | return targetMethod(*args, **kwargs) 155 | 156 | return delegate_method 157 | 158 | def _get_thrift_client(self, client_class, url): 159 | http_client = THttpClient.THttpClient(url) 160 | http_client.addHeaders(**{ 161 | 'User-Agent': "%s / %s; Python / %s;" 162 | % (self._user_agent_id, self._get_sdk_version(), sys.version.replace('\n',"")) 163 | }) 164 | 165 | thrift_protocol = TBinaryProtocol.TBinaryProtocol(http_client) 166 | return client_class(thrift_protocol) 167 | 168 | def _get_sdk_version(self): 169 | return '%s.%s' % ( 170 | UserStoreConstants.EDAM_VERSION_MAJOR, 171 | UserStoreConstants.EDAM_VERSION_MINOR 172 | ) 173 | -------------------------------------------------------------------------------- /evernote/edam/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/evernote/edam/__init__.py -------------------------------------------------------------------------------- /evernote/edam/error/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants'] 2 | -------------------------------------------------------------------------------- /evernote/edam/error/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- /evernote/edam/limits/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants'] 2 | -------------------------------------------------------------------------------- /evernote/edam/limits/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | EDAM_ATTRIBUTE_LEN_MIN = 1 13 | EDAM_ATTRIBUTE_LEN_MAX = 4096 14 | EDAM_ATTRIBUTE_REGEX = "^[^\\p{Cc}\\p{Zl}\\p{Zp}]{1,4096}$" 15 | EDAM_ATTRIBUTE_LIST_MAX = 100 16 | EDAM_ATTRIBUTE_MAP_MAX = 100 17 | EDAM_GUID_LEN_MIN = 36 18 | EDAM_GUID_LEN_MAX = 36 19 | EDAM_GUID_REGEX = "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" 20 | EDAM_EMAIL_LEN_MIN = 6 21 | EDAM_EMAIL_LEN_MAX = 255 22 | EDAM_EMAIL_LOCAL_REGEX = "^[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(\\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*$" 23 | EDAM_EMAIL_DOMAIN_REGEX = "^[A-Za-z0-9-]+(\\.[A-Za-z0-9-]+)*\\.([A-Za-z]{2,})$" 24 | EDAM_EMAIL_REGEX = "^[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(\\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9-]+)*\\.([A-Za-z]{2,})$" 25 | EDAM_VAT_REGEX = "^((AT)?U[0-9]{8}|(BE)?0?[0-9]{9}|(BG)?[0-9]{9,10}|(CY)?[0-9]{8}L|(CZ)?[0-9]{8,10}|(DE)?[0-9]{9}|(DK)?[0-9]{8}|(EE)?[0-9]{9}|(EL|GR)?[0-9]{9}|(ES)?[0-9A-Z][0-9]{7}[0-9A-Z]|(FI)?[0-9]{8}|(FR)?[0-9A-Z]{2}[0-9]{9}|(GB)?([0-9]{9}([0-9]{3})?|[A-Z]{2}[0-9]{3})|(HU)?[0-9]{8}|(IE)?[0-9]S[0-9]{5}L|(IT)?[0-9]{11}|(LT)?([0-9]{9}|[0-9]{12})|(LU)?[0-9]{8}|(LV)?[0-9]{11}|(MT)?[0-9]{8}|(NL)?[0-9]{9}B[0-9]{2}|(PL)?[0-9]{10}|(PT)?[0-9]{9}|(RO)?[0-9]{2,10}|(SE)?[0-9]{12}|(SI)?[0-9]{8}|(SK)?[0-9]{10})|[0-9]{9}MVA|[0-9]{6}|CHE[0-9]{9}(TVA|MWST|IVA)$" 26 | EDAM_TIMEZONE_LEN_MIN = 1 27 | EDAM_TIMEZONE_LEN_MAX = 32 28 | EDAM_TIMEZONE_REGEX = "^([A-Za-z_-]+(/[A-Za-z_-]+)*)|(GMT(-|\\+)[0-9]{1,2}(:[0-9]{2})?)$" 29 | EDAM_MIME_LEN_MIN = 3 30 | EDAM_MIME_LEN_MAX = 255 31 | EDAM_MIME_REGEX = "^[A-Za-z]+/[A-Za-z0-9._+-]+$" 32 | EDAM_MIME_TYPE_GIF = "image/gif" 33 | EDAM_MIME_TYPE_JPEG = "image/jpeg" 34 | EDAM_MIME_TYPE_PNG = "image/png" 35 | EDAM_MIME_TYPE_WAV = "audio/wav" 36 | EDAM_MIME_TYPE_MP3 = "audio/mpeg" 37 | EDAM_MIME_TYPE_AMR = "audio/amr" 38 | EDAM_MIME_TYPE_AAC = "audio/aac" 39 | EDAM_MIME_TYPE_M4A = "audio/mp4" 40 | EDAM_MIME_TYPE_MP4_VIDEO = "video/mp4" 41 | EDAM_MIME_TYPE_INK = "application/vnd.evernote.ink" 42 | EDAM_MIME_TYPE_PDF = "application/pdf" 43 | EDAM_MIME_TYPE_DEFAULT = "application/octet-stream" 44 | EDAM_MIME_TYPES = set([ 45 | "image/gif", 46 | "image/jpeg", 47 | "image/png", 48 | "audio/wav", 49 | "audio/mpeg", 50 | "audio/amr", 51 | "application/vnd.evernote.ink", 52 | "application/pdf", 53 | "video/mp4", 54 | "audio/aac", 55 | "audio/mp4", 56 | ]) 57 | EDAM_INDEXABLE_RESOURCE_MIME_TYPES = set([ 58 | "application/msword", 59 | "application/mspowerpoint", 60 | "application/excel", 61 | "application/vnd.ms-word", 62 | "application/vnd.ms-powerpoint", 63 | "application/vnd.ms-excel", 64 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 65 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 66 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 67 | "application/vnd.apple.pages", 68 | "application/vnd.apple.numbers", 69 | "application/vnd.apple.keynote", 70 | "application/x-iwork-pages-sffpages", 71 | "application/x-iwork-numbers-sffnumbers", 72 | "application/x-iwork-keynote-sffkey", 73 | ]) 74 | EDAM_SEARCH_QUERY_LEN_MIN = 0 75 | EDAM_SEARCH_QUERY_LEN_MAX = 1024 76 | EDAM_SEARCH_QUERY_REGEX = "^[^\\p{Cc}\\p{Zl}\\p{Zp}]{0,1024}$" 77 | EDAM_HASH_LEN = 16 78 | EDAM_USER_USERNAME_LEN_MIN = 1 79 | EDAM_USER_USERNAME_LEN_MAX = 64 80 | EDAM_USER_USERNAME_REGEX = "^[a-z0-9]([a-z0-9_-]{0,62}[a-z0-9])?$" 81 | EDAM_USER_NAME_LEN_MIN = 1 82 | EDAM_USER_NAME_LEN_MAX = 255 83 | EDAM_USER_NAME_REGEX = "^[^\\p{Cc}\\p{Zl}\\p{Zp}]{1,255}$" 84 | EDAM_TAG_NAME_LEN_MIN = 1 85 | EDAM_TAG_NAME_LEN_MAX = 100 86 | EDAM_TAG_NAME_REGEX = "^[^,\\p{Cc}\\p{Z}]([^,\\p{Cc}\\p{Zl}\\p{Zp}]{0,98}[^,\\p{Cc}\\p{Z}])?$" 87 | EDAM_NOTE_TITLE_LEN_MIN = 1 88 | EDAM_NOTE_TITLE_LEN_MAX = 255 89 | EDAM_NOTE_TITLE_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,253}[^\\p{Cc}\\p{Z}])?$" 90 | EDAM_NOTE_CONTENT_LEN_MIN = 0 91 | EDAM_NOTE_CONTENT_LEN_MAX = 5242880 92 | EDAM_APPLICATIONDATA_NAME_LEN_MIN = 3 93 | EDAM_APPLICATIONDATA_NAME_LEN_MAX = 32 94 | EDAM_APPLICATIONDATA_VALUE_LEN_MIN = 0 95 | EDAM_APPLICATIONDATA_VALUE_LEN_MAX = 4092 96 | EDAM_APPLICATIONDATA_ENTRY_LEN_MAX = 4095 97 | EDAM_APPLICATIONDATA_NAME_REGEX = "^[A-Za-z0-9_.-]{3,32}$" 98 | EDAM_APPLICATIONDATA_VALUE_REGEX = "^[^\\p{Cc}]{0,4092}$" 99 | EDAM_NOTEBOOK_NAME_LEN_MIN = 1 100 | EDAM_NOTEBOOK_NAME_LEN_MAX = 100 101 | EDAM_NOTEBOOK_NAME_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,98}[^\\p{Cc}\\p{Z}])?$" 102 | EDAM_NOTEBOOK_STACK_LEN_MIN = 1 103 | EDAM_NOTEBOOK_STACK_LEN_MAX = 100 104 | EDAM_NOTEBOOK_STACK_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,98}[^\\p{Cc}\\p{Z}])?$" 105 | EDAM_PUBLISHING_URI_LEN_MIN = 1 106 | EDAM_PUBLISHING_URI_LEN_MAX = 255 107 | EDAM_PUBLISHING_URI_REGEX = "^[a-zA-Z0-9.~_+-]{1,255}$" 108 | EDAM_PUBLISHING_URI_PROHIBITED = set([ 109 | "..", 110 | ]) 111 | EDAM_PUBLISHING_DESCRIPTION_LEN_MIN = 1 112 | EDAM_PUBLISHING_DESCRIPTION_LEN_MAX = 200 113 | EDAM_PUBLISHING_DESCRIPTION_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,198}[^\\p{Cc}\\p{Z}])?$" 114 | EDAM_SAVED_SEARCH_NAME_LEN_MIN = 1 115 | EDAM_SAVED_SEARCH_NAME_LEN_MAX = 100 116 | EDAM_SAVED_SEARCH_NAME_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,98}[^\\p{Cc}\\p{Z}])?$" 117 | EDAM_USER_PASSWORD_LEN_MIN = 6 118 | EDAM_USER_PASSWORD_LEN_MAX = 64 119 | EDAM_USER_PASSWORD_REGEX = "^[A-Za-z0-9!#$%&'()*+,./:;<=>?@^_`{|}~\\[\\]\\\\-]{6,64}$" 120 | EDAM_BUSINESS_URI_LEN_MAX = 32 121 | EDAM_NOTE_TAGS_MAX = 100 122 | EDAM_NOTE_RESOURCES_MAX = 1000 123 | EDAM_USER_TAGS_MAX = 100000 124 | EDAM_BUSINESS_TAGS_MAX = 100000 125 | EDAM_USER_SAVED_SEARCHES_MAX = 100 126 | EDAM_USER_NOTES_MAX = 100000 127 | EDAM_BUSINESS_NOTES_MAX = 500000 128 | EDAM_USER_NOTEBOOKS_MAX = 250 129 | EDAM_BUSINESS_NOTEBOOKS_MAX = 5000 130 | EDAM_USER_RECENT_MAILED_ADDRESSES_MAX = 10 131 | EDAM_USER_MAIL_LIMIT_DAILY_FREE = 50 132 | EDAM_USER_MAIL_LIMIT_DAILY_PREMIUM = 200 133 | EDAM_USER_UPLOAD_LIMIT_FREE = 62914560 134 | EDAM_USER_UPLOAD_LIMIT_PREMIUM = 1073741824 135 | EDAM_USER_UPLOAD_LIMIT_BUSINESS = 2147483647 136 | EDAM_NOTE_SIZE_MAX_FREE = 26214400 137 | EDAM_NOTE_SIZE_MAX_PREMIUM = 104857600 138 | EDAM_RESOURCE_SIZE_MAX_FREE = 26214400 139 | EDAM_RESOURCE_SIZE_MAX_PREMIUM = 104857600 140 | EDAM_USER_LINKED_NOTEBOOK_MAX = 100 141 | EDAM_USER_LINKED_NOTEBOOK_MAX_PREMIUM = 250 142 | EDAM_NOTEBOOK_SHARED_NOTEBOOK_MAX = 250 143 | EDAM_NOTE_CONTENT_CLASS_LEN_MIN = 3 144 | EDAM_NOTE_CONTENT_CLASS_LEN_MAX = 32 145 | EDAM_NOTE_CONTENT_CLASS_REGEX = "^[A-Za-z0-9_.-]{3,32}$" 146 | EDAM_HELLO_APP_CONTENT_CLASS_PREFIX = "evernote.hello." 147 | EDAM_FOOD_APP_CONTENT_CLASS_PREFIX = "evernote.food." 148 | EDAM_CONTENT_CLASS_HELLO_ENCOUNTER = "evernote.hello.encounter" 149 | EDAM_CONTENT_CLASS_HELLO_PROFILE = "evernote.hello.profile" 150 | EDAM_CONTENT_CLASS_FOOD_MEAL = "evernote.food.meal" 151 | EDAM_CONTENT_CLASS_SKITCH_PREFIX = "evernote.skitch" 152 | EDAM_CONTENT_CLASS_SKITCH = "evernote.skitch" 153 | EDAM_CONTENT_CLASS_SKITCH_PDF = "evernote.skitch.pdf" 154 | EDAM_CONTENT_CLASS_PENULTIMATE_PREFIX = "evernote.penultimate." 155 | EDAM_CONTENT_CLASS_PENULTIMATE_NOTEBOOK = "evernote.penultimate.notebook" 156 | EDAM_RELATED_PLAINTEXT_LEN_MIN = 1 157 | EDAM_RELATED_PLAINTEXT_LEN_MAX = 131072 158 | EDAM_RELATED_MAX_NOTES = 25 159 | EDAM_RELATED_MAX_NOTEBOOKS = 1 160 | EDAM_RELATED_MAX_TAGS = 25 161 | EDAM_BUSINESS_NOTEBOOK_DESCRIPTION_LEN_MIN = 1 162 | EDAM_BUSINESS_NOTEBOOK_DESCRIPTION_LEN_MAX = 200 163 | EDAM_BUSINESS_NOTEBOOK_DESCRIPTION_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,198}[^\\p{Cc}\\p{Z}])?$" 164 | EDAM_BUSINESS_PHONE_NUMBER_LEN_MAX = 20 165 | EDAM_PREFERENCE_NAME_LEN_MIN = 3 166 | EDAM_PREFERENCE_NAME_LEN_MAX = 32 167 | EDAM_PREFERENCE_VALUE_LEN_MIN = 1 168 | EDAM_PREFERENCE_VALUE_LEN_MAX = 1024 169 | EDAM_MAX_PREFERENCES = 100 170 | EDAM_MAX_VALUES_PER_PREFERENCE = 256 171 | EDAM_PREFERENCE_NAME_REGEX = "^[A-Za-z0-9_.-]{3,32}$" 172 | EDAM_PREFERENCE_VALUE_REGEX = "^[^\\p{Cc}]{1,1024}$" 173 | EDAM_PREFERENCE_SHORTCUTS = "evernote.shortcuts" 174 | EDAM_PREFERENCE_SHORTCUTS_MAX_VALUES = 250 175 | EDAM_DEVICE_ID_LEN_MAX = 32 176 | EDAM_DEVICE_ID_REGEX = "^[^\\p{Cc}]{1,32}$" 177 | EDAM_DEVICE_DESCRIPTION_LEN_MAX = 64 178 | EDAM_DEVICE_DESCRIPTION_REGEX = "^[^\\p{Cc}]{1,64}$" 179 | EDAM_SEARCH_SUGGESTIONS_MAX = 10 180 | EDAM_SEARCH_SUGGESTIONS_PREFIX_LEN_MAX = 1024 181 | EDAM_SEARCH_SUGGESTIONS_PREFIX_LEN_MIN = 2 182 | -------------------------------------------------------------------------------- /evernote/edam/limits/ttypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | 11 | from thrift.transport import TTransport 12 | from thrift.protocol import TBinaryProtocol, TProtocol 13 | try: 14 | from thrift.protocol import fastbinary 15 | except: 16 | fastbinary = None 17 | 18 | 19 | -------------------------------------------------------------------------------- /evernote/edam/notestore/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'NoteStore'] 2 | -------------------------------------------------------------------------------- /evernote/edam/notestore/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- /evernote/edam/type/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants'] 2 | -------------------------------------------------------------------------------- /evernote/edam/type/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | CLASSIFICATION_RECIPE_USER_NON_RECIPE = "000" 13 | CLASSIFICATION_RECIPE_USER_RECIPE = "001" 14 | CLASSIFICATION_RECIPE_SERVICE_RECIPE = "002" 15 | EDAM_NOTE_SOURCE_WEB_CLIP = "web.clip" 16 | EDAM_NOTE_SOURCE_MAIL_CLIP = "mail.clip" 17 | EDAM_NOTE_SOURCE_MAIL_SMTP_GATEWAY = "mail.smtp" 18 | -------------------------------------------------------------------------------- /evernote/edam/userstore/UserStore-remote: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Autogenerated by Thrift Compiler 4 | # 5 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 6 | # 7 | # options string: py:new_style 8 | # 9 | 10 | import sys 11 | import pprint 12 | from urlparse import urlparse 13 | from thrift.transport import TTransport 14 | from thrift.transport import TSocket 15 | from thrift.transport import THttpClient 16 | from thrift.protocol import TBinaryProtocol 17 | 18 | import UserStore 19 | from ttypes import * 20 | 21 | if len(sys.argv) <= 1 or sys.argv[1] == '--help': 22 | print '' 23 | print 'Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] function [arg1 [arg2...]]' 24 | print '' 25 | print 'Functions:' 26 | print ' bool checkVersion(string clientName, i16 edamVersionMajor, i16 edamVersionMinor)' 27 | print ' BootstrapInfo getBootstrapInfo(string locale)' 28 | print ' AuthenticationResult authenticate(string username, string password, string consumerKey, string consumerSecret, bool supportsTwoFactor)' 29 | print ' AuthenticationResult authenticateLongSession(string username, string password, string consumerKey, string consumerSecret, string deviceIdentifier, string deviceDescription, bool supportsTwoFactor)' 30 | print ' AuthenticationResult completeTwoFactorAuthentication(string authenticationToken, string oneTimeCode, string deviceIdentifier, string deviceDescription)' 31 | print ' void revokeLongSession(string authenticationToken)' 32 | print ' AuthenticationResult authenticateToBusiness(string authenticationToken)' 33 | print ' AuthenticationResult refreshAuthentication(string authenticationToken)' 34 | print ' User getUser(string authenticationToken)' 35 | print ' PublicUserInfo getPublicUserInfo(string username)' 36 | print ' PremiumInfo getPremiumInfo(string authenticationToken)' 37 | print ' string getNoteStoreUrl(string authenticationToken)' 38 | print '' 39 | sys.exit(0) 40 | 41 | pp = pprint.PrettyPrinter(indent = 2) 42 | host = 'localhost' 43 | port = 9090 44 | uri = '' 45 | framed = False 46 | http = False 47 | argi = 1 48 | 49 | if sys.argv[argi] == '-h': 50 | parts = sys.argv[argi+1].split(':') 51 | host = parts[0] 52 | if len(parts) > 1: 53 | port = int(parts[1]) 54 | argi += 2 55 | 56 | if sys.argv[argi] == '-u': 57 | url = urlparse(sys.argv[argi+1]) 58 | parts = url[1].split(':') 59 | host = parts[0] 60 | if len(parts) > 1: 61 | port = int(parts[1]) 62 | else: 63 | port = 80 64 | uri = url[2] 65 | if url[4]: 66 | uri += '?%s' % url[4] 67 | http = True 68 | argi += 2 69 | 70 | if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed': 71 | framed = True 72 | argi += 1 73 | 74 | cmd = sys.argv[argi] 75 | args = sys.argv[argi+1:] 76 | 77 | if http: 78 | transport = THttpClient.THttpClient(host, port, uri) 79 | else: 80 | socket = TSocket.TSocket(host, port) 81 | if framed: 82 | transport = TTransport.TFramedTransport(socket) 83 | else: 84 | transport = TTransport.TBufferedTransport(socket) 85 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 86 | client = UserStore.Client(protocol) 87 | transport.open() 88 | 89 | if cmd == 'checkVersion': 90 | if len(args) != 3: 91 | print 'checkVersion requires 3 args' 92 | sys.exit(1) 93 | pp.pprint(client.checkVersion(args[0],eval(args[1]),eval(args[2]),)) 94 | 95 | elif cmd == 'getBootstrapInfo': 96 | if len(args) != 1: 97 | print 'getBootstrapInfo requires 1 args' 98 | sys.exit(1) 99 | pp.pprint(client.getBootstrapInfo(args[0],)) 100 | 101 | elif cmd == 'authenticate': 102 | if len(args) != 5: 103 | print 'authenticate requires 5 args' 104 | sys.exit(1) 105 | pp.pprint(client.authenticate(args[0],args[1],args[2],args[3],eval(args[4]),)) 106 | 107 | elif cmd == 'authenticateLongSession': 108 | if len(args) != 7: 109 | print 'authenticateLongSession requires 7 args' 110 | sys.exit(1) 111 | pp.pprint(client.authenticateLongSession(args[0],args[1],args[2],args[3],args[4],args[5],eval(args[6]),)) 112 | 113 | elif cmd == 'completeTwoFactorAuthentication': 114 | if len(args) != 4: 115 | print 'completeTwoFactorAuthentication requires 4 args' 116 | sys.exit(1) 117 | pp.pprint(client.completeTwoFactorAuthentication(args[0],args[1],args[2],args[3],)) 118 | 119 | elif cmd == 'revokeLongSession': 120 | if len(args) != 1: 121 | print 'revokeLongSession requires 1 args' 122 | sys.exit(1) 123 | pp.pprint(client.revokeLongSession(args[0],)) 124 | 125 | elif cmd == 'authenticateToBusiness': 126 | if len(args) != 1: 127 | print 'authenticateToBusiness requires 1 args' 128 | sys.exit(1) 129 | pp.pprint(client.authenticateToBusiness(args[0],)) 130 | 131 | elif cmd == 'refreshAuthentication': 132 | if len(args) != 1: 133 | print 'refreshAuthentication requires 1 args' 134 | sys.exit(1) 135 | pp.pprint(client.refreshAuthentication(args[0],)) 136 | 137 | elif cmd == 'getUser': 138 | if len(args) != 1: 139 | print 'getUser requires 1 args' 140 | sys.exit(1) 141 | pp.pprint(client.getUser(args[0],)) 142 | 143 | elif cmd == 'getPublicUserInfo': 144 | if len(args) != 1: 145 | print 'getPublicUserInfo requires 1 args' 146 | sys.exit(1) 147 | pp.pprint(client.getPublicUserInfo(args[0],)) 148 | 149 | elif cmd == 'getPremiumInfo': 150 | if len(args) != 1: 151 | print 'getPremiumInfo requires 1 args' 152 | sys.exit(1) 153 | pp.pprint(client.getPremiumInfo(args[0],)) 154 | 155 | elif cmd == 'getNoteStoreUrl': 156 | if len(args) != 1: 157 | print 'getNoteStoreUrl requires 1 args' 158 | sys.exit(1) 159 | pp.pprint(client.getNoteStoreUrl(args[0],)) 160 | 161 | else: 162 | print 'Unrecognized method %s' % cmd 163 | sys.exit(1) 164 | 165 | transport.close() 166 | -------------------------------------------------------------------------------- /evernote/edam/userstore/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'UserStore'] 2 | -------------------------------------------------------------------------------- /evernote/edam/userstore/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | EDAM_VERSION_MAJOR = 1 13 | EDAM_VERSION_MINOR = 25 14 | -------------------------------------------------------------------------------- /img/note.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/img/note.jpg -------------------------------------------------------------------------------- /img/notebooks.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/img/notebooks.jpg -------------------------------------------------------------------------------- /img/start.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/img/start.jpg -------------------------------------------------------------------------------- /img/workbench.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/img/workbench.jpg -------------------------------------------------------------------------------- /premailer/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | from .premailer import Premailer, transform 3 | 4 | __version__ = '2.9.7' 5 | -------------------------------------------------------------------------------- /premailer/__main__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | import sys 3 | import argparse 4 | 5 | from .premailer import Premailer 6 | 7 | 8 | def main(args): 9 | """Command-line tool to transform html style to inline css 10 | 11 | Usage:: 12 | 13 | $ echo '