├── .gitignore ├── README.md ├── chardet ├── __init__.py ├── big5freq.py ├── big5prober.py ├── chardetect.py ├── chardistribution.py ├── charsetgroupprober.py ├── charsetprober.py ├── codingstatemachine.py ├── compat.py ├── constants.py ├── cp949prober.py ├── escprober.py ├── escsm.py ├── eucjpprober.py ├── euckrfreq.py ├── euckrprober.py ├── euctwfreq.py ├── euctwprober.py ├── gb2312freq.py ├── gb2312prober.py ├── hebrewprober.py ├── jisfreq.py ├── jpcntx.py ├── langbulgarianmodel.py ├── langcyrillicmodel.py ├── langgreekmodel.py ├── langhebrewmodel.py ├── langhungarianmodel.py ├── langthaimodel.py ├── latin1prober.py ├── mbcharsetprober.py ├── mbcsgroupprober.py ├── mbcssm.py ├── sbcharsetprober.py ├── sbcsgroupprober.py ├── sjisprober.py ├── universaldetector.py └── utf8prober.py ├── conf.example.ini ├── css ├── github.css └── github2.css ├── evermark.py ├── evernote ├── __init__.py ├── api │ ├── __init__.py │ └── client.py └── edam │ ├── __init__.py │ ├── error │ ├── __init__.py │ ├── constants.py │ └── ttypes.py │ ├── limits │ ├── __init__.py │ ├── constants.py │ └── ttypes.py │ ├── notestore │ ├── NoteStore-remote │ ├── NoteStore.py │ ├── __init__.py │ ├── constants.py │ └── ttypes.py │ ├── type │ ├── __init__.py │ ├── constants.py │ └── ttypes.py │ └── userstore │ ├── UserStore-remote │ ├── UserStore.py │ ├── __init__.py │ ├── constants.py │ └── ttypes.py ├── img ├── note.jpg ├── notebooks.jpg ├── start.jpg └── workbench.jpg ├── markdown2.py ├── premailer ├── __init__.py ├── __main__.py ├── cache.py ├── merge_style.py └── premailer.py ├── requirements.txt ├── test ├── test1 │ ├── 1.txt │ ├── 2.txt │ └── test.md └── test2 │ ├── 1.txt │ ├── 3.txt │ ├── 5.txt │ └── README.md └── thrift ├── TSCons.py ├── TSerialization.py ├── Thrift.py ├── __init__.py ├── protocol ├── TBase.py ├── TBinaryProtocol.py ├── TCompactProtocol.py ├── TProtocol.py ├── __init__.py └── fastbinary.c ├── server ├── THttpServer.py ├── TNonblockingServer.py ├── TProcessPoolServer.py ├── 
TServer.py └── __init__.py └── transport ├── THttpClient.py ├── TSSLSocket.py ├── TSocket.py ├── TTransport.py ├── TTwisted.py ├── TZlibTransport.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | 64 | .idea 65 | /conf.ini 66 | html/* 67 | /*.json 68 | logs/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EverMark 2 | A tool that can sync local markdown/text notes to **Evernote** . 3 | 4 | ## 1 Ability 5 | 6 | ### 1.1 Sync directories 7 | 8 | Sub directories that in the root directory (can be set by user in `conf.ini`) would be synced automatically. 9 | 10 | This means, if there is no notebook in **Evernote** has the same name as the sub directory, then **EverMark** will create one. 
11 | 12 | Users can tell **EverMark** not to sync specific sub directories by editing `ignore_dirs` (sub directory names are separated by `,`) in `conf.ini`. 13 | 14 | ### 1.2 Sync notes 15 | 16 | Only files that have a `.txt` or `.md` suffix are synced to **Evernote**. 17 | 18 | `txt` represents plain text files. 19 | 20 | `md` represents **MarkDown** files. 21 | 22 | 23 | ## 2 MarkDown Support 24 | 25 | ### 2.1 Now 26 | 27 | - [x] paragraph 28 | - [x] header 29 | - [x] block quote 30 | - [x] list 31 | - [x] link 32 | - [x] code 33 | - [ ] image 34 | 35 | ### 2.2 Next Version 36 | 37 | - [x] paragraph 38 | - [x] header 39 | - [x] block quote 40 | - [x] list 41 | - [x] link 42 | - [x] code 43 | - [x] image 44 | 45 | ## 3 Workbench structure 46 | 47 | **EverMark** can sync a whole directory (the root directory of your workbench) to your **Evernote** account. 48 | 49 | The root directory contains sub directories (which represent notebooks in your **Evernote** account). 50 | 51 | Each sub directory corresponds to a notebook and contains a list of files that represent the notes in this notebook. 52 | 53 | For example, a local workbench like 54 | 55 | - root directory 56 | - sub_directory (named *sub1*) 57 | - file (named *f1.txt*) 58 | - file (named *f2.md*) 59 | - sub_directory (named *sub2*) 60 | - file (named *f1.md*) 61 | - file (named *f3.txt*) 62 | 63 | would be converted by **EverMark** to the following **Evernote** note structure: 64 | 65 | - your **Evernote** account 66 | - notebook (named *sub1*) 67 | - note (named *f1*) 68 | - note (named *f2*) 69 | - notebook (named *sub2*) 70 | - note (named *f1*) 71 | - note (named *f3*) 72 | 73 | ## 4 Usage 74 | 75 | ### 4.1 Dependencies 76 | **EverMark** depends on lxml, cssutils, cssselect, oauth2. You need to install them: 77 | 78 | ```shell 79 | pip install lxml cssutils cssselect oauth2 80 | ``` 81 | 82 | ### 4.2 Install **EverMark** 83 | **EverMark** is written in **Python**, so first make sure that your PC has **Python** installed. 
84 | 85 | 1. Download **EverMark** from ***github***. 86 | 87 | 2. Unpack it, and move it to anywhere you like. 88 | 89 | ### 4.3 Get **Evernote** Developer Token 90 | **EverMark** needs an **Evernote** Developer Token to access your account. 91 | 92 | So you need to create an **Evernote** Developer Token. 93 | 94 | You can refer to [**Evernote** Developer Token page](https://dev.evernote.com/doc/articles/dev_tokens.php) to get a full understanding of the **Evernote** Developer Token. 95 | 96 | Or you can just go to [Create **Evernote** Developer Token](https://www.evernote.com/api/DeveloperToken.action) or [创建印象笔记Developer Token](https://app.yinxiang.com/api/DeveloperToken.action) if you are a user of **印象笔记**, then click ***Create a developer token*** to create your developer token. 97 | 98 | Copy the Developer Token after you have created it, and save it carefully (anyone who has this token can access your notes in **Evernote**!). 99 | 100 | ### 4.4 Configure **EverMark** 101 | 102 | 1. Open file `conf.example.ini` in the path that **EverMark** is installed to. 103 | 104 | 2. Set `auth_token` to the **Evernote** Developer Token you have created. 105 | 106 | 3. Set `account_type` to `evernote`, or `yinxiang` if you are a **印象笔记** user. 107 | 108 | 4. Rename `conf.example.ini` to `conf.ini`. 109 | 110 | ### 4.5 Run 111 | You can directly execute `evermark.py` in the installed path. 112 | 113 | For example, if the path where you installed **EverMark** is *path_to_evermark*, then you can start **EverMark** by `python path_to_evermark/evermark.py`. 114 | 115 | Or you can create a link to `path_to_evermark/evermark.py` and execute it anywhere you like. 116 | 117 | The default workbench is the `evermark` directory in your `HOME` path. 
def detect(aBuf):
    """Run the universal detector over one byte buffer and return its result.

    :param aBuf: The raw data to analyse. Must be a byte string: ``str`` on
                 Python 2, ``bytes`` on Python 3.
    :returns: The ``result`` attribute of the ``UniversalDetector`` after
              the whole buffer has been fed and the detector closed.
    :raises ValueError: If ``aBuf`` is ``unicode`` (Python 2) or anything
                        other than ``bytes`` (Python 3).
    """
    # The name ``unicode`` only exists on Python 2, so it must stay on the
    # branch that is never evaluated under Python 3.
    if version_info < (3, 0):
        wrong_type = isinstance(aBuf, unicode)
    else:
        wrong_type = not isinstance(aBuf, bytes)
    if wrong_type:
        raise ValueError('Expected a bytes object, not a unicode object')

    # Imported lazily, inside the function, exactly as the package does.
    from . import universaldetector
    detector = universaldetector.UniversalDetector()
    detector.reset()
    detector.feed(aBuf)
    detector.close()
    return detector.result
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    :returns: ``"<name>: <encoding> with confidence <confidence>"`` when the
              detector reports an encoding, ``"<name>: no result"`` otherwise.
    """
    detector = UniversalDetector()
    for line in lines:
        detector.feed(line)
    detector.close()
    result = detector.result
    if not result['encoding']:
        return '{0}: no result'.format(name)
    return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
                                                 result['confidence'])


def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    # Files named on the command line are opened in 'rb' mode, and
    # description_of() expects an iterable of bytes.  On Python 3
    # ``sys.stdin`` is a text stream, so fall back to its underlying binary
    # buffer; Python 2's ``sys.stdin`` has no ``buffer`` attribute and is
    # already byte-oriented, so it is used unchanged.
    stdin_bytes = getattr(sys.stdin, 'buffer', sys.stdin)

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes one or more file paths and reports their detected \
                     encodings",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('input',
                        help='File whose encoding we would like to determine.',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[stdin_bytes])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
        if f.isatty():
            # Interactive use: explain how to finish the input.
            print("You are running chardetect interactively. Press " +
                  "CTRL-D twice at the start of a blank line to signal the " +
                  "end of your input. If you want help, run chardetect " +
                  "--help\n", file=sys.stderr)
        print(description_of(f, f.name))


if __name__ == '__main__':
    main()
# got_enough_data() reports True once more than this many characters were counted.
ENOUGH_DATA_THRESHOLD = 1024
# Upper and lower bounds of the confidence values returned by get_confidence().
SURE_YES = 0.99
SURE_NO = 0.01
# get_confidence() answers SURE_NO until more than this many frequent
# characters have been seen.
MINIMUM_DATA_THRESHOLD = 3


class CharDistributionAnalysis:
    """Base class for two-byte character frequency analysis.

    Subclasses provide the frequency table, its size, the typical
    distribution ratio of their language, and a ``get_order`` that maps a
    two-byte character onto an index ("order") into that table.
    """

    def __init__(self):
        # Mapping table to get frequency order from char order (get from
        # GetOrder())
        self._mCharToFreqOrder = None
        self._mTableSize = None  # Size of above table
        # This is a constant value which varies from language to language,
        # used in calculating confidence.  See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self._mTypicalDistributionRatio = None
        self.reset()

    def reset(self):
        """reset analyser, clear any state"""
        # If this flag is set to True, detection is done and conclusion has
        # been made
        self._mDone = False
        self._mTotalChars = 0  # Total characters encountered
        # The number of characters whose frequency order is less than 512
        self._mFreqChars = 0

    def feed(self, aBuf, aCharLen):
        """feed a character with known length

        Only characters with ``aCharLen == 2`` are analysed; everything else,
        and every character for which ``get_order`` returns a negative value,
        is ignored.
        """
        order = self.get_order(aBuf) if aCharLen == 2 else -1
        if order < 0:
            return
        self._mTotalChars += 1
        # Orders beyond the table still count towards the total, but can
        # never be "frequent".
        if order < self._mTableSize and self._mCharToFreqOrder[order] < 512:
            self._mFreqChars += 1

    def get_confidence(self):
        """return confidence based on existing data

        The value is always between ``SURE_NO`` and ``SURE_YES``.
        """
        # if we didn't receive any character in our consideration range,
        # return negative answer
        if (self._mTotalChars <= 0
                or self._mFreqChars <= MINIMUM_DATA_THRESHOLD):
            return SURE_NO

        if self._mTotalChars != self._mFreqChars:
            rare_chars = self._mTotalChars - self._mFreqChars
            r = (self._mFreqChars
                 / (rare_chars * self._mTypicalDistributionRatio))
            if r < SURE_YES:
                return r

        # normalize confidence (we don't want to be 100% sure)
        return SURE_YES

    def got_enough_data(self):
        """Return True once more than ENOUGH_DATA_THRESHOLD chars were seen."""
        # It is not necessary to receive all data to draw conclusion.
        # For charset detection, certain amount of data is enough
        return self._mTotalChars > ENOUGH_DATA_THRESHOLD

    def get_order(self, aBuf):
        """Map a two-byte character to its order; the base class knows none."""
        # We do not handle characters based on the original encoding string,
        # but convert this encoding string to a number, here called order.
        # This allows multiple encodings of a language to share one frequency
        # table.
        return -1


class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for EUC-TW."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = EUCTWCharToFreqOrder
        self._mTableSize = EUCTW_TABLE_SIZE
        self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        # for euc-TW encoding, we are interested
        #   first  byte range: 0xc4 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = wrap_ord(aBuf[0])
        if first_char < 0xC4:
            return -1
        return 94 * (first_char - 0xC4) + wrap_ord(aBuf[1]) - 0xA1


class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for EUC-KR."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = EUCKRCharToFreqOrder
        self._mTableSize = EUCKR_TABLE_SIZE
        self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        # for euc-KR encoding, we are interested
        #   first  byte range: 0xb0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char = wrap_ord(aBuf[0])
        if first_char < 0xB0:
            return -1
        return 94 * (first_char - 0xB0) + wrap_ord(aBuf[1]) - 0xA1


class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for GB2312."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = GB2312CharToFreqOrder
        self._mTableSize = GB2312_TABLE_SIZE
        self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        # for GB2312 encoding, we are interested
        #  first  byte range: 0xb0 -- 0xfe
        #  second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if first_char < 0xB0 or second_char < 0xA1:
            return -1
        return 94 * (first_char - 0xB0) + second_char - 0xA1


class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for Big5."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = Big5CharToFreqOrder
        self._mTableSize = BIG5_TABLE_SIZE
        self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        # for big5 encoding, we are interested
        #   first  byte range: 0xa4 -- 0xfe
        #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if first_char < 0xA4:
            return -1
        row_start = 157 * (first_char - 0xA4)
        if second_char >= 0xA1:
            # The high trail range follows the 63 low trail bytes (0x40-0x7e).
            return row_start + second_char - 0xA1 + 63
        return row_start + second_char - 0x40


class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for Shift_JIS (shares the JIS frequency table)."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        # for sjis encoding, we are interested
        #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xef
        #   second byte range: 0x40 -- 0x7e,  0x80 -- 0xfc
        # no validation needed here. State machine has done that
        first_char, second_char = wrap_ord(aBuf[0]), wrap_ord(aBuf[1])
        if (first_char >= 0x81) and (first_char <= 0x9F):
            order = 188 * (first_char - 0x81)
        elif (first_char >= 0xE0) and (first_char <= 0xEF):
            order = 188 * (first_char - 0xE0 + 31)
        else:
            return -1
        order = order + second_char - 0x40
        if second_char > 0x7F:
            # 0x7f is never a trail byte, so trail bytes above it sit one
            # slot lower inside the 188-wide row (63 low + 125 high trail
            # bytes).  The previous code assigned -1 here, which discarded
            # every character with a high trail byte instead of closing the
            # gap.
            order -= 1
        return order


class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for EUC-JP (shares the JIS frequency table)."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aBuf):
        # for euc-JP encoding, we are interested
        #   first  byte range: 0xa0 -- 0xfe
        #   second byte range: 0xa1 -- 0xfe
        # no validation needed here. State machine has done that
        char = wrap_ord(aBuf[0])
        if char < 0xA0:
            return -1
        # Rows are numbered from lead byte 0xa1, so a 0xa0 lead byte produces
        # an order below the first row.
        return 94 * (char - 0xA1) + wrap_ord(aBuf[1]) - 0xa1
class CharSetGroupProber(CharSetProber):
    """A prober that feeds the same data to a list of child probers.

    Subclasses fill ``self._mProbers``.  The group tracks how many children
    are still active and which child is currently the best guess.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mActiveNum = 0
        self._mProbers = []
        self._mBestGuessProber = None

    def reset(self):
        """Reset the group and re-activate every child prober."""
        CharSetProber.reset(self)
        self._mActiveNum = 0
        for prober in self._mProbers:
            if prober:
                prober.reset()
                prober.active = True
                self._mActiveNum += 1
        self._mBestGuessProber = None

    def get_charset_name(self):
        """Return the best-guess child's charset name, or None without one."""
        if not self._mBestGuessProber:
            # get_confidence() selects the best guess as a side effect.
            self.get_confidence()
            if not self._mBestGuessProber:
                return None
        return self._mBestGuessProber.get_charset_name()

    def feed(self, aBuf):
        """Feed ``aBuf`` to every active child and return the group state."""
        for prober in self._mProbers:
            if not prober:
                continue
            if not prober.active:
                continue
            st = prober.feed(aBuf)
            if not st:
                continue
            if st == constants.eFoundIt:
                self._mBestGuessProber = prober
                # Record the positive result on the group itself; without
                # this the group kept reporting its previous state, so the
                # caller never learned that a child was certain.
                self._mState = constants.eFoundIt
                return self.get_state()
            elif st == constants.eNotMe:
                prober.active = False
                self._mActiveNum -= 1
                if self._mActiveNum <= 0:
                    # Every child has ruled itself out.
                    self._mState = constants.eNotMe
                    return self.get_state()
        return self.get_state()

    def get_confidence(self):
        """Return the group's confidence and update the best-guess prober.

        Returns 0.99 / 0.01 when the group state is ``eFoundIt`` /
        ``eNotMe``; otherwise the highest confidence among the active
        children (0.0 when no child reports a confidence above zero).
        """
        st = self.get_state()
        if st == constants.eFoundIt:
            return 0.99
        elif st == constants.eNotMe:
            return 0.01
        bestConf = 0.0
        self._mBestGuessProber = None
        for prober in self._mProbers:
            if not prober:
                continue
            if not prober.active:
                if constants._debug:
                    sys.stderr.write(prober.get_charset_name()
                                     + ' not active\n')
                continue
            cf = prober.get_confidence()
            if constants._debug:
                sys.stderr.write('%s confidence = %s\n' %
                                 (prober.get_charset_name(), cf))
            if bestConf < cf:
                bestConf = cf
                self._mBestGuessProber = prober
        if not self._mBestGuessProber:
            return 0.0
        return bestConf
class CharSetProber:
    """Base class for charset probers.

    Provides the state accessor shared by all probers, neutral default
    answers for the detection API, and a few byte-buffer filters.
    """

    def __init__(self):
        # Define the attribute up front so get_state() is safe to call
        # before the first reset(); reset() puts the real state in place.
        self._mState = None

    def reset(self):
        """Put the prober back into the ``eDetecting`` state."""
        self._mState = constants.eDetecting

    def get_charset_name(self):
        """Return the detected charset name; the base class knows none."""
        return None

    def feed(self, aBuf):
        """Consume a chunk of bytes; the base class ignores it."""
        pass

    def get_state(self):
        """Return the current prober state (None before the first reset)."""
        return self._mState

    def get_confidence(self):
        """Return the confidence of the guess; always 0.0 in the base class."""
        return 0.0

    def filter_high_bit_only(self, aBuf):
        """Replace each run of bytes in 0x00-0x7F with a single space."""
        aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
        return aBuf

    def filter_without_english_letters(self, aBuf):
        """Replace each run of ASCII letters (A-Z, a-z) with a single space."""
        aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
        return aBuf

    def filter_with_english_letters(self, aBuf):
        """Return ``aBuf`` unchanged (filtering is not implemented yet)."""
        # TODO
        return aBuf
class CodingStateMachine:
    """Table-driven state machine that validates a byte stream.

    ``sm`` is a model dictionary with the keys ``classTable``,
    ``charLenTable``, ``classFactor``, ``stateTable`` and ``name``.
    """

    def __init__(self, sm):
        self._mCurrentCharLen = 0
        self._mCurrentBytePos = 0
        self._mModel = sm
        self.reset()

    def reset(self):
        """Return the machine to the start state."""
        self._mCurrentState = eStart

    def next_state(self, c):
        """Advance the machine by one byte and return the new state.

        PY3K: the buffer is a byte stream, so ``c`` is an int, not a byte.
        """
        model = self._mModel
        # Every byte is first mapped to its class.
        byte_class = model['classTable'][wrap_ord(c)]
        if self._mCurrentState == eStart:
            # First byte of a character: its class also gives the length.
            self._mCurrentBytePos = 0
            self._mCurrentCharLen = model['charLenTable'][byte_class]
        # The state table is flattened: row = current state, column = class.
        table_index = self._mCurrentState * model['classFactor'] + byte_class
        self._mCurrentState = model['stateTable'][table_index]
        self._mCurrentBytePos += 1
        return self._mCurrentState

    def get_current_charlen(self):
        """Return the length recorded for the character being scanned."""
        return self._mCurrentCharLen

    def get_coding_state_machine(self):
        """Return the ``name`` entry of the underlying model."""
        return self._mModel['name']
# String types of the running interpreter; ``unicode`` is only evaluated
# when the interpreter is older than Python 3.
base_str = (str, unicode) if sys.version_info < (3, 0) else (bytes, str)


def wrap_ord(a):
    """Return ``ord(a)`` for string input on Python 2, else ``a`` unchanged."""
    needs_ord = sys.version_info < (3, 0) and isinstance(a, base_str)
    return ord(a) if needs_ord else a
# Debug switch: when truthy, probers write diagnostic lines to sys.stderr.
_debug = 0

# Prober states, stored in a prober's ``_mState`` and returned by
# ``get_state()`` / ``feed()``.
eDetecting = 0  # still undecided
eFoundIt = 1    # positive match
eNotMe = 2      # the input cannot be this charset

# Coding state machine states.  The state tables may also contain larger
# integers, which are model-specific intermediate states.
eStart = 0
eError = 1
eItsMe = 2

# Confidence threshold used by probers to stop early: e.g. EUCJPProber
# switches to eFoundIt once its confidence exceeds this value and its
# context analyzer has seen enough data.
SHORTCUT_THRESHOLD = 0.95
class CP949Prober(MultiByteCharSetProber):
    """Multi-byte prober that reports the charset name "CP949"."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        # NOTE: CP949 is a superset of EUC-KR, so the character distribution
        # should be the same and the EUC-KR analysis is reused.
        self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
        self._mCodingSM = CodingStateMachine(CP949SMModel)
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "CP949"
class EscCharSetProber(CharSetProber):
    """Prober that runs the HZ and ISO-2022-CN/JP/KR state machines.

    Each state machine is fed every byte; the first one that reaches
    ``eItsMe`` decides the detected charset.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        models = (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel,
                  ISO2022KRSMModel)
        self._mCodingSM = [CodingStateMachine(model) for model in models]
        self.reset()

    def reset(self):
        """Re-activate and reset every state machine; forget any result."""
        CharSetProber.reset(self)
        for machine in self._mCodingSM:
            if machine:
                machine.active = True
                machine.reset()
        self._mActiveSM = len(self._mCodingSM)
        self._mDetectedCharset = None

    def get_charset_name(self):
        """Return the detected charset name, or None if nothing matched yet."""
        return self._mDetectedCharset

    def get_confidence(self):
        """Return 0.99 once a charset has been detected, otherwise 0.00."""
        return 0.99 if self._mDetectedCharset else 0.00

    def feed(self, aBuf):
        """Feed ``aBuf`` byte by byte to all active machines; return the state."""
        # PY3K: aBuf is a byte array, so each element is an int, not a byte
        for byte in aBuf:
            for machine in self._mCodingSM:
                if not machine or not machine.active:
                    continue
                outcome = machine.next_state(wrap_ord(byte))
                if outcome == constants.eItsMe:
                    # A machine recognised its sequence: detection is done.
                    self._mState = constants.eFoundIt
                    self._mDetectedCharset = machine.get_coding_state_machine()  # nopep8
                    return self.get_state()
                if outcome == constants.eError:
                    # This machine is ruled out; give up once none remain.
                    machine.active = False
                    self._mActiveSM -= 1
                    if self._mActiveSM <= 0:
                        self._mState = constants.eNotMe
                        return self.get_state()

        return self.get_state()
# State machine models for the escape-sequence based encodings.
#
# Every model is a dict consumed by CodingStateMachine:
#   'classTable'   - 256 entries mapping a byte value to its byte class
#   'classFactor'  - row width of the flattened state table
#   'stateTable'   - next state, indexed by state * classFactor + byte class
#   'charLenTable' - character length per byte class (all zero here)
#   'name'         - charset name reported when the machine reaches eItsMe

# Byte classes for HZ-GB-2312.  Bytes with a non-zero class below 0x80 are
# 0x00, 0x1b (ESC), 0x7b '{', 0x7d '}' and 0x7e '~'.
HZ_cls = (
1,0,0,0,0,0,0,0,  # 00 - 07
0,0,0,0,0,0,0,0,  # 08 - 0f
0,0,0,0,0,0,0,0,  # 10 - 17
0,0,0,1,0,0,0,0,  # 18 - 1f
0,0,0,0,0,0,0,0,  # 20 - 27
0,0,0,0,0,0,0,0,  # 28 - 2f
0,0,0,0,0,0,0,0,  # 30 - 37
0,0,0,0,0,0,0,0,  # 38 - 3f
0,0,0,0,0,0,0,0,  # 40 - 47
0,0,0,0,0,0,0,0,  # 48 - 4f
0,0,0,0,0,0,0,0,  # 50 - 57
0,0,0,0,0,0,0,0,  # 58 - 5f
0,0,0,0,0,0,0,0,  # 60 - 67
0,0,0,0,0,0,0,0,  # 68 - 6f
0,0,0,0,0,0,0,0,  # 70 - 77
0,0,0,4,0,5,2,0,  # 78 - 7f
1,1,1,1,1,1,1,1,  # 80 - 87
1,1,1,1,1,1,1,1,  # 88 - 8f
1,1,1,1,1,1,1,1,  # 90 - 97
1,1,1,1,1,1,1,1,  # 98 - 9f
1,1,1,1,1,1,1,1,  # a0 - a7
1,1,1,1,1,1,1,1,  # a8 - af
1,1,1,1,1,1,1,1,  # b0 - b7
1,1,1,1,1,1,1,1,  # b8 - bf
1,1,1,1,1,1,1,1,  # c0 - c7
1,1,1,1,1,1,1,1,  # c8 - cf
1,1,1,1,1,1,1,1,  # d0 - d7
1,1,1,1,1,1,1,1,  # d8 - df
1,1,1,1,1,1,1,1,  # e0 - e7
1,1,1,1,1,1,1,1,  # e8 - ef
1,1,1,1,1,1,1,1,  # f0 - f7
1,1,1,1,1,1,1,1,  # f8 - ff
)

# Flattened transition table: 6 byte classes per state (classFactor 6).
HZ_st = (
eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17
 5,eError, 6,eError, 5, 5, 4,eError,# 18-1f
 4,eError, 4, 4, 4,eError, 4,eError,# 20-27
 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f
)

# One entry per byte class; no class carries a character length.
HZCharLenTable = (0, 0, 0, 0, 0, 0)

HZSMModel = {'classTable': HZ_cls,
             'classFactor': 6,
             'stateTable': HZ_st,
             'charLenTable': HZCharLenTable,
             'name': "HZ-GB-2312"}

# Byte classes for ISO-2022-CN.  Bytes with a non-zero class below 0x80 are
# 0x00, 0x1b (ESC), 0x29 ')' and 0x43 'C'.
ISO2022CN_cls = (
2,0,0,0,0,0,0,0,  # 00 - 07
0,0,0,0,0,0,0,0,  # 08 - 0f
0,0,0,0,0,0,0,0,  # 10 - 17
0,0,0,1,0,0,0,0,  # 18 - 1f
0,0,0,0,0,0,0,0,  # 20 - 27
0,3,0,0,0,0,0,0,  # 28 - 2f
0,0,0,0,0,0,0,0,  # 30 - 37
0,0,0,0,0,0,0,0,  # 38 - 3f
0,0,0,4,0,0,0,0,  # 40 - 47
0,0,0,0,0,0,0,0,  # 48 - 4f
0,0,0,0,0,0,0,0,  # 50 - 57
0,0,0,0,0,0,0,0,  # 58 - 5f
0,0,0,0,0,0,0,0,  # 60 - 67
0,0,0,0,0,0,0,0,  # 68 - 6f
0,0,0,0,0,0,0,0,  # 70 - 77
0,0,0,0,0,0,0,0,  # 78 - 7f
2,2,2,2,2,2,2,2,  # 80 - 87
2,2,2,2,2,2,2,2,  # 88 - 8f
2,2,2,2,2,2,2,2,  # 90 - 97
2,2,2,2,2,2,2,2,  # 98 - 9f
2,2,2,2,2,2,2,2,  # a0 - a7
2,2,2,2,2,2,2,2,  # a8 - af
2,2,2,2,2,2,2,2,  # b0 - b7
2,2,2,2,2,2,2,2,  # b8 - bf
2,2,2,2,2,2,2,2,  # c0 - c7
2,2,2,2,2,2,2,2,  # c8 - cf
2,2,2,2,2,2,2,2,  # d0 - d7
2,2,2,2,2,2,2,2,  # d8 - df
2,2,2,2,2,2,2,2,  # e0 - e7
2,2,2,2,2,2,2,2,  # e8 - ef
2,2,2,2,2,2,2,2,  # f0 - f7
2,2,2,2,2,2,2,2,  # f8 - ff
)

# Flattened transition table; the model below indexes it with classFactor 9.
ISO2022CN_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27
 5, 6,eError,eError,eError,eError,eError,eError,# 28-2f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37
eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f
)

ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0)

ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
                    'classFactor': 9,
                    'stateTable': ISO2022CN_st,
                    'charLenTable': ISO2022CNCharLenTable,
                    'name': "ISO-2022-CN"}

# Byte classes for ISO-2022-JP.  Bytes with a non-zero class below 0x80 are
# 0x00, 0x0e, 0x0f, 0x1b (ESC), 0x24 '$', 0x28 '(', 0x40 '@', 0x42 'B',
# 0x44 'D', 0x49 'I' and 0x4a 'J'.
ISO2022JP_cls = (
2,0,0,0,0,0,0,0,  # 00 - 07
0,0,0,0,0,0,2,2,  # 08 - 0f
0,0,0,0,0,0,0,0,  # 10 - 17
0,0,0,1,0,0,0,0,  # 18 - 1f
0,0,0,0,7,0,0,0,  # 20 - 27
3,0,0,0,0,0,0,0,  # 28 - 2f
0,0,0,0,0,0,0,0,  # 30 - 37
0,0,0,0,0,0,0,0,  # 38 - 3f
6,0,4,0,8,0,0,0,  # 40 - 47
0,9,5,0,0,0,0,0,  # 48 - 4f
0,0,0,0,0,0,0,0,  # 50 - 57
0,0,0,0,0,0,0,0,  # 58 - 5f
0,0,0,0,0,0,0,0,  # 60 - 67
0,0,0,0,0,0,0,0,  # 68 - 6f
0,0,0,0,0,0,0,0,  # 70 - 77
0,0,0,0,0,0,0,0,  # 78 - 7f
2,2,2,2,2,2,2,2,  # 80 - 87
2,2,2,2,2,2,2,2,  # 88 - 8f
2,2,2,2,2,2,2,2,  # 90 - 97
2,2,2,2,2,2,2,2,  # 98 - 9f
2,2,2,2,2,2,2,2,  # a0 - a7
2,2,2,2,2,2,2,2,  # a8 - af
2,2,2,2,2,2,2,2,  # b0 - b7
2,2,2,2,2,2,2,2,  # b8 - bf
2,2,2,2,2,2,2,2,  # c0 - c7
2,2,2,2,2,2,2,2,  # c8 - cf
2,2,2,2,2,2,2,2,  # d0 - d7
2,2,2,2,2,2,2,2,  # d8 - df
2,2,2,2,2,2,2,2,  # e0 - e7
2,2,2,2,2,2,2,2,  # e8 - ef
2,2,2,2,2,2,2,2,  # f0 - f7
2,2,2,2,2,2,2,2,  # f8 - ff
)

# Flattened transition table; the model below indexes it with classFactor 10.
ISO2022JP_st = (
eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f
eError, 5,eError,eError,eError, 4,eError,eError,# 20-27
eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
)

ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
                    'classFactor': 10,
                    'stateTable': ISO2022JP_st,
                    'charLenTable': ISO2022JPCharLenTable,
                    'name': "ISO-2022-JP"}

# Byte classes for ISO-2022-KR.  Bytes with a non-zero class below 0x80 are
# 0x00, 0x1b (ESC), 0x24 '$', 0x29 ')' and 0x43 'C'.
ISO2022KR_cls = (
2,0,0,0,0,0,0,0,  # 00 - 07
0,0,0,0,0,0,0,0,  # 08 - 0f
0,0,0,0,0,0,0,0,  # 10 - 17
0,0,0,1,0,0,0,0,  # 18 - 1f
0,0,0,0,3,0,0,0,  # 20 - 27
0,4,0,0,0,0,0,0,  # 28 - 2f
0,0,0,0,0,0,0,0,  # 30 - 37
0,0,0,0,0,0,0,0,  # 38 - 3f
0,0,0,5,0,0,0,0,  # 40 - 47
0,0,0,0,0,0,0,0,  # 48 - 4f
0,0,0,0,0,0,0,0,  # 50 - 57
0,0,0,0,0,0,0,0,  # 58 - 5f
0,0,0,0,0,0,0,0,  # 60 - 67
0,0,0,0,0,0,0,0,  # 68 - 6f
0,0,0,0,0,0,0,0,  # 70 - 77
0,0,0,0,0,0,0,0,  # 78 - 7f
2,2,2,2,2,2,2,2,  # 80 - 87
2,2,2,2,2,2,2,2,  # 88 - 8f
2,2,2,2,2,2,2,2,  # 90 - 97
2,2,2,2,2,2,2,2,  # 98 - 9f
2,2,2,2,2,2,2,2,  # a0 - a7
2,2,2,2,2,2,2,2,  # a8 - af
2,2,2,2,2,2,2,2,  # b0 - b7
2,2,2,2,2,2,2,2,  # b8 - bf
2,2,2,2,2,2,2,2,  # c0 - c7
2,2,2,2,2,2,2,2,  # c8 - cf
2,2,2,2,2,2,2,2,  # d0 - d7
2,2,2,2,2,2,2,2,  # d8 - df
2,2,2,2,2,2,2,2,  # e0 - e7
2,2,2,2,2,2,2,2,  # e8 - ef
2,2,2,2,2,2,2,2,  # f0 - f7
2,2,2,2,2,2,2,2,  # f8 - ff
)

# Flattened transition table: 6 byte classes per state (classFactor 6).
ISO2022KR_st = (
eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17
eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f
eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27
)

ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0)

ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
                    'classFactor': 6,
                    'stateTable': ISO2022KR_st,
                    'charLenTable': ISO2022KRCharLenTable,
                    'name': "ISO-2022-KR"}

# flake8: noqa
3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import sys 29 | from . 
class EUCJPProber(MultiByteCharSetProber):
    """Prober for EUC-JP.

    Combines a coding state machine (byte-sequence validity) with a
    character distribution analysis and a Japanese context analysis.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(EUCJPSMModel)
        self._mDistributionAnalyzer = EUCJPDistributionAnalysis()
        self._mContextAnalyzer = EUCJPContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the inherited prober state and the context analyzer."""
        MultiByteCharSetProber.reset(self)
        self._mContextAnalyzer.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "EUC-JP"

    def feed(self, aBuf):
        """Consume ``aBuf`` and return the prober state afterwards.

        Each byte is run through the coding state machine.  An ``eError``
        transition marks the prober ``eNotMe``, an ``eItsMe`` transition
        marks it ``eFoundIt``; either one stops the scan.  Whenever the
        machine is back in ``eStart`` a character has been completed and
        is handed to both analyzers.  An empty buffer is accepted and
        leaves the remembered last byte untouched.
        """
        aLen = len(aBuf)
        for i in range(0, aLen):
            # PY3K: aBuf is a byte array, so aBuf[i] is an int, not a byte
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name()
                                     + ' prober hit error at byte ' + str(i)
                                     + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The character may have started in the previous
                    # buffer: pair its remembered last byte with this one.
                    self._mLastChar[1] = aBuf[0]
                    self._mContextAnalyzer.feed(self._mLastChar, charLen)
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mContextAnalyzer.feed(aBuf[i - 1:i + 1], charLen)
                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
                                                     charLen)

        # Remember the final byte for the next call.  Guarded because
        # indexing an empty buffer would raise IndexError.
        if aLen:
            self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            # Stop early once the context analyzer has seen enough data
            # and the confidence is above the shortcut threshold.
            if (self._mContextAnalyzer.got_enough_data() and
                    (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the larger of the context and distribution confidences."""
        contxtCf = self._mContextAnalyzer.get_confidence()
        distribCf = self._mDistributionAnalyzer.get_confidence()
        return max(contxtCf, distribCf)
class EUCKRProber(MultiByteCharSetProber):
    """Multi-byte prober that reports the charset name "EUC-KR"."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        # Distribution analysis and byte-sequence state machine for EUC-KR.
        self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
        self._mCodingSM = CodingStateMachine(EUCKRSMModel)
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "EUC-KR"
class EUCTWProber(MultiByteCharSetProber):
    """Multi-byte prober that reports the charset name "EUC-TW"."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        # Distribution analysis and byte-sequence state machine for EUC-TW.
        self._mDistributionAnalyzer = EUCTWDistributionAnalysis()
        self._mCodingSM = CodingStateMachine(EUCTWSMModel)
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "EUC-TW"
class GB2312Prober(MultiByteCharSetProber):
    """Multi-byte prober that reports the charset name "GB2312"."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        # Distribution analysis and byte-sequence state machine for GB2312.
        self._mDistributionAnalyzer = GB2312DistributionAnalysis()
        self._mCodingSM = CodingStateMachine(GB2312SMModel)
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "GB2312"
# Number of likelihood categories used in Latin1ClassModel (values 0-3).
FREQ_CAT_NUM = 4

# Character classes.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
CLASS_NUM = 8  # total classes

# Character class of every byte value.
Latin1_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   # 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   # 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   # 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   # C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   # C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   # D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   # D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   # E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   # E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   # F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   # F8 - FF
)

# Likelihood of a class pair (row: previous class, column: current class):
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = (
    # UDF OTH ASC ASS ACV ACO ASV ASO
    0,  0,  0,  0,  0,  0,  0,  0,  # UDF
    0,  3,  3,  3,  3,  3,  3,  3,  # OTH
    0,  3,  3,  3,  3,  3,  3,  3,  # ASC
    0,  3,  3,  3,  1,  1,  3,  3,  # ASS
    0,  3,  3,  3,  1,  2,  1,  2,  # ACV
    0,  3,  3,  3,  3,  3,  3,  3,  # ACO
    0,  3,  1,  3,  1,  1,  1,  3,  # ASV
    0,  3,  1,  3,  1,  1,  3,  3,  # ASO
)


class Latin1Prober(CharSetProber):
    """Prober for windows-1252 based on character-class pair likelihoods."""

    def __init__(self):
        CharSetProber.__init__(self)
        self.reset()

    def reset(self):
        """Clear the likelihood counters and restart from class OTH."""
        self._mLastCharClass = OTH
        self._mFreqCounter = [0] * FREQ_CAT_NUM
        CharSetProber.reset(self)

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "windows-1252"

    def feed(self, aBuf):
        """Count the likelihood category of every adjacent class pair.

        An illegal pair (category 0) switches the prober to ``eNotMe`` and
        stops the scan.  Returns the prober state.
        """
        aBuf = self.filter_with_english_letters(aBuf)
        for byte in aBuf:
            current_class = Latin1_CharToClass[wrap_ord(byte)]
            row_start = self._mLastCharClass * CLASS_NUM
            likelihood = Latin1ClassModel[row_start + current_class]
            if likelihood == 0:
                self._mState = eNotMe
                break
            self._mFreqCounter[likelihood] += 1
            self._mLastCharClass = current_class

        return self.get_state()

    def get_confidence(self):
        """Return the confidence that the input is windows-1252."""
        if self.get_state() == eNotMe:
            return 0.01

        counts = self._mFreqCounter
        total = sum(counts)
        if total < 0.01:
            raw = 0.0
        else:
            # "Very unlikely" pairs weigh 20 times more than "very likely".
            raw = (counts[3] - counts[1] * 20.0) / total
        if raw < 0.0:
            raw = 0.0
        # lower the confidence of latin1 so that other more accurate
        # detector can take priority.
        return raw * 0.73
class MultiByteCharSetProber(CharSetProber):
    """Common base for the multi-byte charset probers.

    Subclasses are expected to assign ``_mCodingSM`` (a coding state
    machine) and ``_mDistributionAnalyzer`` before ``feed`` is used; this
    base class only initialises both to ``None``.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mDistributionAnalyzer = None
        self._mCodingSM = None
        # [last byte of the previous buffer, first byte of the current one]
        self._mLastChar = [0, 0]

    def reset(self):
        """Reset the base prober, the state machine and the analyzer."""
        CharSetProber.reset(self)
        if self._mCodingSM:
            self._mCodingSM.reset()
        if self._mDistributionAnalyzer:
            self._mDistributionAnalyzer.reset()
        self._mLastChar = [0, 0]

    def get_charset_name(self):
        """Return the charset name; subclasses override this (base: None)."""
        pass

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and return the new state.

        An empty buffer is a no-op: the previous implementation raised
        ``IndexError`` on ``aBuf[aLen - 1]`` in that case.
        """
        aLen = len(aBuf)
        if not aLen:
            return self.get_state()

        for i in range(aLen):
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == constants.eError:
                if constants._debug:
                    # %-formatting instead of "+" so a prober whose
                    # get_charset_name() returns None cannot raise here.
                    sys.stderr.write('%s prober hit error at byte %s\n'
                                     % (self.get_charset_name(), i))
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The character may have started in the previous
                    # buffer: pair its remembered last byte with our first.
                    self._mLastChar[1] = aBuf[0]
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
                                                     charLen)

        # Remember the final byte for the next call.
        self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            if (self._mDistributionAnalyzer.got_enough_data() and
                    (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the distribution analyzer's confidence."""
        return self._mDistributionAnalyzer.get_confidence()
class MBCSGroupProber(CharSetGroupProber):
    """Group prober holding one instance of each multi-byte prober."""

    def __init__(self):
        CharSetGroupProber.__init__(self)
        # Instantiated in this order; the list order is preserved.
        prober_classes = (
            UTF8Prober,
            SJISProber,
            EUCJPProber,
            GB2312Prober,
            EUCKRProber,
            CP949Prober,
            Big5Prober,
            EUCTWProber,
        )
        self._mProbers = [make_prober() for make_prober in prober_classes]
        self.reset()
SAMPLE_SIZE = 64
SB_ENOUGH_REL_THRESHOLD = 1024
POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
SYMBOL_CAT_ORDER = 250
NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
#NEGATIVE_CAT = 0


class SingleByteCharSetProber(CharSetProber):
    """Prober for one single-byte charset described by a language model.

    ``model`` is a mapping read through the keys ``'charToOrderMap'``,
    ``'precedenceMatrix'``, ``'mTypicalPositiveRatio'``,
    ``'keepEnglishLetter'`` and ``'charsetName'``.
    """

    def __init__(self, model, reversed=False, nameProber=None):
        CharSetProber.__init__(self)
        self._mModel = model
        # TRUE if we need to reverse every pair in the model lookup
        self._mReversed = reversed
        # Optional auxiliary prober for name decision
        self._mNameProber = nameProber
        self.reset()

    def reset(self):
        """Reset the base prober and clear all sequence statistics."""
        CharSetProber.reset(self)
        # char order of last character
        self._mLastOrder = 255
        self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
        self._mTotalSeqs = 0
        self._mTotalChar = 0
        # characters that fall in our sampling range
        self._mFreqChar = 0

    def get_charset_name(self):
        """Return the name prober's charset name if set, else the model's."""
        if self._mNameProber:
            return self._mNameProber.get_charset_name()
        else:
            return self._mModel['charsetName']

    def feed(self, aBuf):
        """Update the sequence statistics with ``aBuf``; return the state.

        Once more than ``SB_ENOUGH_REL_THRESHOLD`` sequences were counted
        the prober may short-cut to ``eFoundIt`` or ``eNotMe``.
        """
        if not self._mModel['keepEnglishLetter']:
            aBuf = self.filter_without_english_letters(aBuf)
        aLen = len(aBuf)
        if not aLen:
            return self.get_state()
        for c in aBuf:
            order = self._mModel['charToOrderMap'][wrap_ord(c)]
            if order < SYMBOL_CAT_ORDER:
                self._mTotalChar += 1
            if order < SAMPLE_SIZE:
                self._mFreqChar += 1
                if self._mLastOrder < SAMPLE_SIZE:
                    self._mTotalSeqs += 1
                    if not self._mReversed:
                        i = (self._mLastOrder * SAMPLE_SIZE) + order
                    else:  # reverse the order of the letters in the lookup
                        i = (order * SAMPLE_SIZE) + self._mLastOrder
                    model = self._mModel['precedenceMatrix'][i]
                    self._mSeqCounters[model] += 1
            self._mLastOrder = order

        if self.get_state() == constants.eDetecting:
            if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD:
                cf = self.get_confidence()
                if cf > POSITIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        # The split literals previously lacked a space
                        # ("we have awinner").
                        sys.stderr.write('%s confidence = %s, we have a '
                                         'winner\n' %
                                         (self._mModel['charsetName'], cf))
                    self._mState = constants.eFoundIt
                elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        sys.stderr.write('%s confidence = %s, below negative '
                                         'shortcut threshold %s\n' %
                                         (self._mModel['charsetName'], cf,
                                          NEGATIVE_SHORTCUT_THRESHOLD))
                    self._mState = constants.eNotMe

        return self.get_state()

    def get_confidence(self):
        """Return the confidence, 0.01 when no sequence was seen yet.

        The value is the share of "positive" sequences, normalised by the
        model's typical positive ratio and scaled by the share of frequent
        characters; results >= 1.0 are capped to 0.99.
        """
        r = 0.01
        if self._mTotalSeqs > 0:
            r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
                 / self._mModel['mTypicalPositiveRatio'])
            # _mTotalChar is non-zero here: every counted sequence also
            # incremented it in feed().
            r = r * self._mFreqChar / self._mTotalChar
            if r >= 1.0:
                r = 0.99
        return r
class SBCSGroupProber(CharSetGroupProber):
    """Group prober holding every single-byte charset prober."""

    def __init__(self):
        CharSetGroupProber.__init__(self)

        # One plain prober per language model, in this order.
        plain_models = (
            Win1251CyrillicModel,
            Koi8rModel,
            Latin5CyrillicModel,
            MacCyrillicModel,
            Ibm866Model,
            Ibm855Model,
            Latin7GreekModel,
            Win1253GreekModel,
            Latin5BulgarianModel,
            Win1251BulgarianModel,
            Latin2HungarianModel,
            Win1250HungarianModel,
            TIS620ThaiModel,
        )
        self._mProbers = [SingleByteCharSetProber(language_model)
                          for language_model in plain_models]

        # Hebrew uses the same model twice (normal and reversed lookup);
        # both model probers delegate their name to the shared HebrewProber.
        name_decider = HebrewProber()
        logical = SingleByteCharSetProber(Win1255HebrewModel, False,
                                          name_decider)
        visual = SingleByteCharSetProber(Win1255HebrewModel, True,
                                         name_decider)
        name_decider.set_model_probers(logical, visual)
        self._mProbers += [name_decider, logical, visual]

        self.reset()
class SJISProber(MultiByteCharSetProber):
    """Multi-byte prober built on the SJIS state machine model.

    Combines a character distribution analyzer with a context analyzer;
    the reported confidence is the larger of the two.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(SJISSMModel)
        self._mDistributionAnalyzer = SJISDistributionAnalysis()
        self._mContextAnalyzer = SJISContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the inherited state and the context analyzer."""
        MultiByteCharSetProber.reset(self)
        self._mContextAnalyzer.reset()

    def get_charset_name(self):
        """Return the charset name reported by the context analyzer."""
        return self._mContextAnalyzer.get_charset_name()

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and return the new state.

        An empty buffer is a no-op: the previous implementation raised
        ``IndexError`` on ``aBuf[aLen - 1]`` in that case.
        """
        aLen = len(aBuf)
        if not aLen:
            return self.get_state()

        for i in range(aLen):
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == constants.eError:
                if constants._debug:
                    sys.stderr.write('%s prober hit error at byte %s\n'
                                     % (self.get_charset_name(), i))
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The character may have started in the previous
                    # buffer: pair its remembered last byte with our first.
                    self._mLastChar[1] = aBuf[0]
                    self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:],
                                                charLen)
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mContextAnalyzer.feed(
                        aBuf[i + 1 - charLen:i + 3 - charLen], charLen)
                    self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1],
                                                     charLen)

        # Remember the final byte for the next call.
        self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            if (self._mContextAnalyzer.got_enough_data() and
                    (self.get_confidence() > constants.SHORTCUT_THRESHOLD)):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the larger of the context and distribution confidences."""
        contxtCf = self._mContextAnalyzer.get_confidence()
        distribCf = self._mDistributionAnalyzer.get_confidence()
        return max(contxtCf, distribCf)
MINIMUM_THRESHOLD = 0.20
# Input states tracked by UniversalDetector._mInputState.
ePureAscii = 0
eEscAscii = 1
eHighbyte = 2


class UniversalDetector:
    """Incremental charset detector.

    Call ``feed`` with successive byte chunks, then ``close``; the outcome
    is stored in ``self.result`` as
    ``{'encoding': <name or None>, 'confidence': <float>}``.
    ``self.done`` becomes True as soon as a decision was reached.
    """

    def __init__(self):
        self._highBitDetector = re.compile(b'[\x80-\xFF]')
        self._escDetector = re.compile(b'(\033|~{)')
        self._mEscCharSetProber = None
        self._mCharSetProbers = []
        self.reset()

    def reset(self):
        """Return the detector (and any created probers) to a fresh state."""
        self.result = {'encoding': None, 'confidence': 0.0}
        self.done = False
        self._mStart = True
        self._mGotData = False
        self._mInputState = ePureAscii
        self._mLastChar = b''
        if self._mEscCharSetProber:
            self._mEscCharSetProber.reset()
        for prober in self._mCharSetProbers:
            prober.reset()

    def feed(self, aBuf):
        """Analyse the next chunk of bytes.

        Does nothing once ``self.done`` is set or when ``aBuf`` is empty.
        Only the very first non-empty chunk is checked for a BOM.
        """
        if self.done:
            return

        aLen = len(aBuf)
        if not aLen:
            return

        if not self._mGotData:
            # If the data starts with BOM, we know it is UTF.
            # The 4-byte BOMs are tested before the 2-byte ones because
            # the UTF-32LE BOM starts with the UTF-16LE BOM.
            if aBuf[:3] == codecs.BOM_UTF8:
                # EF BB BF  UTF-8 with BOM
                self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
            elif aBuf[:4] == codecs.BOM_UTF32_LE:
                # FF FE 00 00  UTF-32, little-endian BOM
                self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
            elif aBuf[:4] == codecs.BOM_UTF32_BE:
                # 00 00 FE FF  UTF-32, big-endian BOM
                self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
            elif aBuf[:4] == b'\xFE\xFF\x00\x00':
                # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                self.result = {
                    'encoding': "X-ISO-10646-UCS-4-3412",
                    'confidence': 1.0
                }
            elif aBuf[:4] == b'\x00\x00\xFF\xFE':
                # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                self.result = {
                    'encoding': "X-ISO-10646-UCS-4-2143",
                    'confidence': 1.0
                }
            elif aBuf[:2] == codecs.BOM_LE:
                # FF FE  UTF-16, little endian BOM
                self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
            elif aBuf[:2] == codecs.BOM_BE:
                # FE FF  UTF-16, big endian BOM
                self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}

        self._mGotData = True
        if self.result['encoding'] and (self.result['confidence'] > 0.0):
            self.done = True
            return

        if self._mInputState == ePureAscii:
            if self._highBitDetector.search(aBuf):
                self._mInputState = eHighbyte
            elif self._escDetector.search(self._mLastChar + aBuf):
                # The previous chunk's last byte is prepended so a "~{"
                # split across two chunks is still found.
                self._mInputState = eEscAscii

        self._mLastChar = aBuf[-1:]

        if self._mInputState == eEscAscii:
            if not self._mEscCharSetProber:
                self._mEscCharSetProber = EscCharSetProber()
            if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
                self.result = {
                    'encoding': self._mEscCharSetProber.get_charset_name(),
                    'confidence': self._mEscCharSetProber.get_confidence()
                }
                self.done = True
        elif self._mInputState == eHighbyte:
            if not self._mCharSetProbers:
                # Probers are created lazily, on the first high byte.
                self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(),
                                         Latin1Prober()]
            for prober in self._mCharSetProbers:
                if prober.feed(aBuf) == constants.eFoundIt:
                    self.result = {'encoding': prober.get_charset_name(),
                                   'confidence': prober.get_confidence()}
                    self.done = True
                    break

    def close(self):
        """Finish detection and settle ``self.result``.

        Returns ``self.result`` when the input was pure ASCII or when a
        prober beat ``MINIMUM_THRESHOLD``; returns ``None`` in every other
        case (already done, no data fed, or nothing conclusive), so
        callers should read ``self.result`` rather than rely on the
        return value.
        """
        if self.done:
            return
        if not self._mGotData:
            if constants._debug:
                sys.stderr.write('no data received!\n')
            return
        self.done = True

        if self._mInputState == ePureAscii:
            self.result = {'encoding': 'ascii', 'confidence': 1.0}
            return self.result

        if self._mInputState == eHighbyte:
            proberConfidence = None
            maxProberConfidence = 0.0
            maxProber = None
            for prober in self._mCharSetProbers:
                if not prober:
                    continue
                proberConfidence = prober.get_confidence()
                if proberConfidence > maxProberConfidence:
                    maxProberConfidence = proberConfidence
                    maxProber = prober
            if maxProber and (maxProberConfidence > MINIMUM_THRESHOLD):
                self.result = {'encoding': maxProber.get_charset_name(),
                               'confidence': maxProber.get_confidence()}
                return self.result

        if constants._debug:
            sys.stderr.write('no probers hit minimum threshhold\n')
            # Group probers keep their members in ``_mProbers``; the old
            # code read a non-existent ``mProbers`` attribute and indexed
            # ``[0]`` even when no prober had been created.
            for group in self._mCharSetProbers:
                if not group:
                    continue
                for prober in getattr(group, '_mProbers', [group]):
                    if not prober:
                        continue
                    sys.stderr.write('%s confidence = %s\n' %
                                     (prober.get_charset_name(),
                                      prober.get_confidence()))
ONE_CHAR_PROB = 0.5


class UTF8Prober(CharSetProber):
    """Prober that validates input against the UTF-8 state machine and
    counts how many multi-byte characters were completed."""

    def __init__(self):
        CharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(UTF8SMModel)
        self.reset()

    def reset(self):
        """Reset the base prober, the state machine and the counter."""
        CharSetProber.reset(self)
        self._mCodingSM.reset()
        self._mNumOfMBChar = 0

    def get_charset_name(self):
        """Return the name of the charset this prober reports."""
        return "utf-8"

    def feed(self, aBuf):
        """Feed ``aBuf`` to the state machine and return the new state."""
        machine = self._mCodingSM
        for byte in aBuf:
            outcome = machine.next_state(byte)
            if outcome == constants.eError:
                self._mState = constants.eNotMe
                break
            if outcome == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            # A completed character of two or more bytes counts as
            # evidence for UTF-8.
            if (outcome == constants.eStart
                    and machine.get_current_charlen() >= 2):
                self._mNumOfMBChar += 1

        still_detecting = self.get_state() == constants.eDetecting
        if (still_detecting
                and self.get_confidence() > constants.SHORTCUT_THRESHOLD):
            self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the confidence based on the multi-byte character count.

        With six or more multi-byte characters the result is 0.99;
        otherwise the doubt is halved for every character seen.
        """
        if self._mNumOfMBChar >= 6:
            return 0.99

        doubt = 0.99
        for _ in range(self._mNumOfMBChar):
            doubt = doubt * ONE_CHAR_PROB
        return 1.0 - doubt
self.consumer_key = options.get('consumer_key') 21 | self.consumer_secret = options.get('consumer_secret') 22 | self.sandbox = options.get('sandbox', True) 23 | if self.sandbox: 24 | default_service_host = 'sandbox.evernote.com' 25 | else: 26 | default_service_host = 'www.evernote.com' 27 | self.yinxiang = options.get('yinxiang', False) 28 | if self.yinxiang: 29 | default_service_host = 'app.yinxiang.com' 30 | print 'host:', default_service_host 31 | self.service_host = options.get('service_host', default_service_host) 32 | self.additional_headers = options.get('additional_headers', {}) 33 | self.token = options.get('token') 34 | self.secret = options.get('secret') 35 | 36 | def get_request_token(self, callback_url): 37 | client = self._get_oauth_client() 38 | request_url = '%s?oauth_callback=%s' % ( 39 | self._get_endpoint('oauth'), urllib.quote(callback_url)) 40 | 41 | resp, content = client.request(request_url, 'GET') 42 | request_token = dict(urlparse.parse_qsl(content)) 43 | return request_token 44 | 45 | def get_authorize_url(self, request_token): 46 | return '%s?oauth_token=%s' % ( 47 | self._get_endpoint('OAuth.action'), 48 | urllib.quote(request_token['oauth_token'])) 49 | 50 | def get_access_token_dict( 51 | self, oauth_token, oauth_token_secret, oauth_verifier 52 | ): 53 | token = oauth.Token(oauth_token, oauth_token_secret) 54 | token.set_verifier(oauth_verifier) 55 | client = self._get_oauth_client(token) 56 | 57 | resp, content = client.request(self._get_endpoint('oauth'), 'POST') 58 | access_token_dict = dict(urlparse.parse_qsl(content)) 59 | self.token = access_token_dict['oauth_token'] 60 | return access_token_dict 61 | 62 | def get_access_token( 63 | self, oauth_token, oauth_token_secret, oauth_verifier 64 | ): 65 | access_token_dict = self.get_access_token_dict( 66 | oauth_token, 67 | oauth_token_secret, 68 | oauth_verifier 69 | ) 70 | return access_token_dict['oauth_token'] 71 | 72 | def get_user_store(self): 73 | user_store_uri = 
self._get_endpoint("/edam/user") 74 | store = Store(self.token, UserStore.Client, user_store_uri) 75 | if not store: # Trick for PyDev code completion 76 | store = UserStore.Client() 77 | raise Exception('Should never reach here') 78 | return store 79 | 80 | def get_note_store(self): 81 | user_store = self.get_user_store() 82 | note_store_uri = user_store.getNoteStoreUrl() 83 | store = Store(self.token, NoteStore.Client, note_store_uri) 84 | if not store: # Trick for PyDev code completion 85 | store = NoteStore.Client() 86 | raise Exception('Should never reach here') 87 | return store 88 | 89 | def get_shared_note_store(self, linkedNotebook): 90 | note_store_uri = linkedNotebook.noteStoreUrl 91 | note_store = Store(self.token, NoteStore.Client, note_store_uri) 92 | shared_auth = note_store.authenticateToSharedNotebook( 93 | linkedNotebook.shareKey) 94 | shared_token = shared_auth.authenticationToken 95 | store = Store(shared_token, NoteStore.Client, note_store_uri) 96 | if not store: # Trick for PyDev code completion 97 | store = NoteStore.Client() 98 | raise Exception('Should never reach here') 99 | return store 100 | 101 | def get_business_note_store(self): 102 | user_store = self.get_user_store() 103 | biz_auth = user_store.authenticateToBusiness() 104 | biz_token = biz_auth.authenticationToken 105 | note_store_uri = biz_auth.noteStoreUrl 106 | store = Store(biz_token, NoteStore.Client, note_store_uri) 107 | if not store: # Trick for PyDev code completion 108 | store = NoteStore.Client() 109 | raise Exception('Should never reach here') 110 | return store 111 | 112 | def _get_oauth_client(self, token=None): 113 | consumer = oauth.Consumer(self.consumer_key, self.consumer_secret) 114 | if token: 115 | client = oauth.Client(consumer, token) 116 | else: 117 | client = oauth.Client(consumer) 118 | return client 119 | 120 | def _get_endpoint(self, path=None): 121 | url = "https://%s" % (self.service_host) 122 | if path is not None: 123 | url += "/%s" % path 124 | 
class Store(object):
    """Proxy around a Thrift store client that supplies the auth token.

    Any attribute that is not defined on ``Store`` itself is resolved lazily
    against the wrapped client.  When the target method declares an
    ``authenticationToken`` parameter and the caller did not pass every
    argument positionally, the token held by this store is injected so that
    callers can write ``store.getNote(guid, ...)`` instead of
    ``client.getNote(token, guid, ...)``.
    """

    # Extracts the ``A=`` field of a colon-separated token.  The field value
    # is matched up to the next colon only; a greedy ``.+`` would also swallow
    # every following field of the token.
    _USER_AGENT_ID_RE = re.compile(r':A=([^:]+):')

    def __init__(self, token, client_class, store_url):
        """Create the wrapped client for ``store_url``.

        Args:
            token: authentication token injected into delegated calls; may be
                None, in which case the user agent id is left empty.
            client_class: client class instantiated with the binary protocol.
            store_url: URL handed to the HTTP transport.
        """
        self.token = token
        # ``token or ''`` keeps a missing token from crashing the regex.
        match = self._USER_AGENT_ID_RE.search(token or '')
        self._user_agent_id = match.group(1) if match else ''
        self._client = self._get_thrift_client(client_class, store_url)

    def __getattr__(self, name):
        """Return a callable that forwards ``name`` to the wrapped client.

        The lookup on the client happens when the returned callable is
        invoked, so an unknown name raises ``AttributeError`` at call time.
        """
        def delegate_method(*args, **kwargs):
            target = getattr(self._client, name, None)
            if target is None:
                # Unknown on the client as well: let the normal lookup raise
                # AttributeError for ``name``.
                return object.__getattribute__(self, name)(*args, **kwargs)

            # ``getargspec`` is absent on recent Python versions; prefer
            # ``getfullargspec`` when the interpreter provides it.
            getargspec = (
                getattr(inspect, 'getfullargspec', None) or inspect.getargspec
            )
            org_args = getargspec(target).args

            # Every declared argument (besides ``self``) was passed
            # positionally, or the method takes no token: call it untouched.
            if (
                len(org_args) == len(args) + 1 or
                'authenticationToken' not in org_args
            ):
                return target(*args, **kwargs)

            # Map positionals onto the declared names that follow the token,
            # then apply the caller's keyword arguments on top instead of
            # silently discarding them.
            skip_args = ('self', 'authenticationToken')
            arg_names = [arg for arg in org_args if arg not in skip_args]
            call_kwargs = {'authenticationToken': self.token}
            call_kwargs.update(zip(arg_names, args))
            call_kwargs.update(kwargs)
            return target(**call_kwargs)

        return delegate_method

    def _get_thrift_client(self, client_class, url):
        """Build ``client_class`` over an HTTP transport with a User-Agent."""
        http_client = THttpClient.THttpClient(url)
        http_client.addHeaders(**{
            'User-Agent': "%s / %s; Python / %s;"
            % (
                self._user_agent_id,
                self._get_sdk_version(),
                sys.version.replace('\n', ""),
            )
        })

        thrift_protocol = TBinaryProtocol.TBinaryProtocol(http_client)
        return client_class(thrift_protocol)

    def _get_sdk_version(self):
        """Return the EDAM protocol version as ``'<major>.<minor>'``."""
        return '%s.%s' % (
            UserStoreConstants.EDAM_VERSION_MAJOR,
            UserStoreConstants.EDAM_VERSION_MINOR
        )
-------------------------------------------------------------------------------- /evernote/edam/error/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants'] 2 | -------------------------------------------------------------------------------- /evernote/edam/error/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- /evernote/edam/limits/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants'] 2 | -------------------------------------------------------------------------------- /evernote/edam/limits/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | EDAM_ATTRIBUTE_LEN_MIN = 1 13 | EDAM_ATTRIBUTE_LEN_MAX = 4096 14 | EDAM_ATTRIBUTE_REGEX = "^[^\\p{Cc}\\p{Zl}\\p{Zp}]{1,4096}$" 15 | EDAM_ATTRIBUTE_LIST_MAX = 100 16 | EDAM_ATTRIBUTE_MAP_MAX = 100 17 | EDAM_GUID_LEN_MIN = 36 18 | EDAM_GUID_LEN_MAX = 36 19 | EDAM_GUID_REGEX = "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$" 20 | EDAM_EMAIL_LEN_MIN = 6 21 | EDAM_EMAIL_LEN_MAX = 255 22 | EDAM_EMAIL_LOCAL_REGEX = "^[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(\\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*$" 23 | EDAM_EMAIL_DOMAIN_REGEX = 
"^[A-Za-z0-9-]+(\\.[A-Za-z0-9-]+)*\\.([A-Za-z]{2,})$" 24 | EDAM_EMAIL_REGEX = "^[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(\\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9-]+)*\\.([A-Za-z]{2,})$" 25 | EDAM_VAT_REGEX = "^((AT)?U[0-9]{8}|(BE)?0?[0-9]{9}|(BG)?[0-9]{9,10}|(CY)?[0-9]{8}L|(CZ)?[0-9]{8,10}|(DE)?[0-9]{9}|(DK)?[0-9]{8}|(EE)?[0-9]{9}|(EL|GR)?[0-9]{9}|(ES)?[0-9A-Z][0-9]{7}[0-9A-Z]|(FI)?[0-9]{8}|(FR)?[0-9A-Z]{2}[0-9]{9}|(GB)?([0-9]{9}([0-9]{3})?|[A-Z]{2}[0-9]{3})|(HU)?[0-9]{8}|(IE)?[0-9]S[0-9]{5}L|(IT)?[0-9]{11}|(LT)?([0-9]{9}|[0-9]{12})|(LU)?[0-9]{8}|(LV)?[0-9]{11}|(MT)?[0-9]{8}|(NL)?[0-9]{9}B[0-9]{2}|(PL)?[0-9]{10}|(PT)?[0-9]{9}|(RO)?[0-9]{2,10}|(SE)?[0-9]{12}|(SI)?[0-9]{8}|(SK)?[0-9]{10})|[0-9]{9}MVA|[0-9]{6}|CHE[0-9]{9}(TVA|MWST|IVA)$" 26 | EDAM_TIMEZONE_LEN_MIN = 1 27 | EDAM_TIMEZONE_LEN_MAX = 32 28 | EDAM_TIMEZONE_REGEX = "^([A-Za-z_-]+(/[A-Za-z_-]+)*)|(GMT(-|\\+)[0-9]{1,2}(:[0-9]{2})?)$" 29 | EDAM_MIME_LEN_MIN = 3 30 | EDAM_MIME_LEN_MAX = 255 31 | EDAM_MIME_REGEX = "^[A-Za-z]+/[A-Za-z0-9._+-]+$" 32 | EDAM_MIME_TYPE_GIF = "image/gif" 33 | EDAM_MIME_TYPE_JPEG = "image/jpeg" 34 | EDAM_MIME_TYPE_PNG = "image/png" 35 | EDAM_MIME_TYPE_WAV = "audio/wav" 36 | EDAM_MIME_TYPE_MP3 = "audio/mpeg" 37 | EDAM_MIME_TYPE_AMR = "audio/amr" 38 | EDAM_MIME_TYPE_AAC = "audio/aac" 39 | EDAM_MIME_TYPE_M4A = "audio/mp4" 40 | EDAM_MIME_TYPE_MP4_VIDEO = "video/mp4" 41 | EDAM_MIME_TYPE_INK = "application/vnd.evernote.ink" 42 | EDAM_MIME_TYPE_PDF = "application/pdf" 43 | EDAM_MIME_TYPE_DEFAULT = "application/octet-stream" 44 | EDAM_MIME_TYPES = set([ 45 | "image/gif", 46 | "image/jpeg", 47 | "image/png", 48 | "audio/wav", 49 | "audio/mpeg", 50 | "audio/amr", 51 | "application/vnd.evernote.ink", 52 | "application/pdf", 53 | "video/mp4", 54 | "audio/aac", 55 | "audio/mp4", 56 | ]) 57 | EDAM_INDEXABLE_RESOURCE_MIME_TYPES = set([ 58 | "application/msword", 59 | "application/mspowerpoint", 60 | "application/excel", 61 | "application/vnd.ms-word", 62 | 
"application/vnd.ms-powerpoint", 63 | "application/vnd.ms-excel", 64 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 65 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 66 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 67 | "application/vnd.apple.pages", 68 | "application/vnd.apple.numbers", 69 | "application/vnd.apple.keynote", 70 | "application/x-iwork-pages-sffpages", 71 | "application/x-iwork-numbers-sffnumbers", 72 | "application/x-iwork-keynote-sffkey", 73 | ]) 74 | EDAM_SEARCH_QUERY_LEN_MIN = 0 75 | EDAM_SEARCH_QUERY_LEN_MAX = 1024 76 | EDAM_SEARCH_QUERY_REGEX = "^[^\\p{Cc}\\p{Zl}\\p{Zp}]{0,1024}$" 77 | EDAM_HASH_LEN = 16 78 | EDAM_USER_USERNAME_LEN_MIN = 1 79 | EDAM_USER_USERNAME_LEN_MAX = 64 80 | EDAM_USER_USERNAME_REGEX = "^[a-z0-9]([a-z0-9_-]{0,62}[a-z0-9])?$" 81 | EDAM_USER_NAME_LEN_MIN = 1 82 | EDAM_USER_NAME_LEN_MAX = 255 83 | EDAM_USER_NAME_REGEX = "^[^\\p{Cc}\\p{Zl}\\p{Zp}]{1,255}$" 84 | EDAM_TAG_NAME_LEN_MIN = 1 85 | EDAM_TAG_NAME_LEN_MAX = 100 86 | EDAM_TAG_NAME_REGEX = "^[^,\\p{Cc}\\p{Z}]([^,\\p{Cc}\\p{Zl}\\p{Zp}]{0,98}[^,\\p{Cc}\\p{Z}])?$" 87 | EDAM_NOTE_TITLE_LEN_MIN = 1 88 | EDAM_NOTE_TITLE_LEN_MAX = 255 89 | EDAM_NOTE_TITLE_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,253}[^\\p{Cc}\\p{Z}])?$" 90 | EDAM_NOTE_CONTENT_LEN_MIN = 0 91 | EDAM_NOTE_CONTENT_LEN_MAX = 5242880 92 | EDAM_APPLICATIONDATA_NAME_LEN_MIN = 3 93 | EDAM_APPLICATIONDATA_NAME_LEN_MAX = 32 94 | EDAM_APPLICATIONDATA_VALUE_LEN_MIN = 0 95 | EDAM_APPLICATIONDATA_VALUE_LEN_MAX = 4092 96 | EDAM_APPLICATIONDATA_ENTRY_LEN_MAX = 4095 97 | EDAM_APPLICATIONDATA_NAME_REGEX = "^[A-Za-z0-9_.-]{3,32}$" 98 | EDAM_APPLICATIONDATA_VALUE_REGEX = "^[^\\p{Cc}]{0,4092}$" 99 | EDAM_NOTEBOOK_NAME_LEN_MIN = 1 100 | EDAM_NOTEBOOK_NAME_LEN_MAX = 100 101 | EDAM_NOTEBOOK_NAME_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,98}[^\\p{Cc}\\p{Z}])?$" 102 | EDAM_NOTEBOOK_STACK_LEN_MIN = 1 103 | 
EDAM_NOTEBOOK_STACK_LEN_MAX = 100 104 | EDAM_NOTEBOOK_STACK_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,98}[^\\p{Cc}\\p{Z}])?$" 105 | EDAM_PUBLISHING_URI_LEN_MIN = 1 106 | EDAM_PUBLISHING_URI_LEN_MAX = 255 107 | EDAM_PUBLISHING_URI_REGEX = "^[a-zA-Z0-9.~_+-]{1,255}$" 108 | EDAM_PUBLISHING_URI_PROHIBITED = set([ 109 | "..", 110 | ]) 111 | EDAM_PUBLISHING_DESCRIPTION_LEN_MIN = 1 112 | EDAM_PUBLISHING_DESCRIPTION_LEN_MAX = 200 113 | EDAM_PUBLISHING_DESCRIPTION_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,198}[^\\p{Cc}\\p{Z}])?$" 114 | EDAM_SAVED_SEARCH_NAME_LEN_MIN = 1 115 | EDAM_SAVED_SEARCH_NAME_LEN_MAX = 100 116 | EDAM_SAVED_SEARCH_NAME_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,98}[^\\p{Cc}\\p{Z}])?$" 117 | EDAM_USER_PASSWORD_LEN_MIN = 6 118 | EDAM_USER_PASSWORD_LEN_MAX = 64 119 | EDAM_USER_PASSWORD_REGEX = "^[A-Za-z0-9!#$%&'()*+,./:;<=>?@^_`{|}~\\[\\]\\\\-]{6,64}$" 120 | EDAM_BUSINESS_URI_LEN_MAX = 32 121 | EDAM_NOTE_TAGS_MAX = 100 122 | EDAM_NOTE_RESOURCES_MAX = 1000 123 | EDAM_USER_TAGS_MAX = 100000 124 | EDAM_BUSINESS_TAGS_MAX = 100000 125 | EDAM_USER_SAVED_SEARCHES_MAX = 100 126 | EDAM_USER_NOTES_MAX = 100000 127 | EDAM_BUSINESS_NOTES_MAX = 500000 128 | EDAM_USER_NOTEBOOKS_MAX = 250 129 | EDAM_BUSINESS_NOTEBOOKS_MAX = 5000 130 | EDAM_USER_RECENT_MAILED_ADDRESSES_MAX = 10 131 | EDAM_USER_MAIL_LIMIT_DAILY_FREE = 50 132 | EDAM_USER_MAIL_LIMIT_DAILY_PREMIUM = 200 133 | EDAM_USER_UPLOAD_LIMIT_FREE = 62914560 134 | EDAM_USER_UPLOAD_LIMIT_PREMIUM = 1073741824 135 | EDAM_USER_UPLOAD_LIMIT_BUSINESS = 2147483647 136 | EDAM_NOTE_SIZE_MAX_FREE = 26214400 137 | EDAM_NOTE_SIZE_MAX_PREMIUM = 104857600 138 | EDAM_RESOURCE_SIZE_MAX_FREE = 26214400 139 | EDAM_RESOURCE_SIZE_MAX_PREMIUM = 104857600 140 | EDAM_USER_LINKED_NOTEBOOK_MAX = 100 141 | EDAM_USER_LINKED_NOTEBOOK_MAX_PREMIUM = 250 142 | EDAM_NOTEBOOK_SHARED_NOTEBOOK_MAX = 250 143 | EDAM_NOTE_CONTENT_CLASS_LEN_MIN = 3 144 | EDAM_NOTE_CONTENT_CLASS_LEN_MAX = 32 145 | 
EDAM_NOTE_CONTENT_CLASS_REGEX = "^[A-Za-z0-9_.-]{3,32}$" 146 | EDAM_HELLO_APP_CONTENT_CLASS_PREFIX = "evernote.hello." 147 | EDAM_FOOD_APP_CONTENT_CLASS_PREFIX = "evernote.food." 148 | EDAM_CONTENT_CLASS_HELLO_ENCOUNTER = "evernote.hello.encounter" 149 | EDAM_CONTENT_CLASS_HELLO_PROFILE = "evernote.hello.profile" 150 | EDAM_CONTENT_CLASS_FOOD_MEAL = "evernote.food.meal" 151 | EDAM_CONTENT_CLASS_SKITCH_PREFIX = "evernote.skitch" 152 | EDAM_CONTENT_CLASS_SKITCH = "evernote.skitch" 153 | EDAM_CONTENT_CLASS_SKITCH_PDF = "evernote.skitch.pdf" 154 | EDAM_CONTENT_CLASS_PENULTIMATE_PREFIX = "evernote.penultimate." 155 | EDAM_CONTENT_CLASS_PENULTIMATE_NOTEBOOK = "evernote.penultimate.notebook" 156 | EDAM_RELATED_PLAINTEXT_LEN_MIN = 1 157 | EDAM_RELATED_PLAINTEXT_LEN_MAX = 131072 158 | EDAM_RELATED_MAX_NOTES = 25 159 | EDAM_RELATED_MAX_NOTEBOOKS = 1 160 | EDAM_RELATED_MAX_TAGS = 25 161 | EDAM_BUSINESS_NOTEBOOK_DESCRIPTION_LEN_MIN = 1 162 | EDAM_BUSINESS_NOTEBOOK_DESCRIPTION_LEN_MAX = 200 163 | EDAM_BUSINESS_NOTEBOOK_DESCRIPTION_REGEX = "^[^\\p{Cc}\\p{Z}]([^\\p{Cc}\\p{Zl}\\p{Zp}]{0,198}[^\\p{Cc}\\p{Z}])?$" 164 | EDAM_BUSINESS_PHONE_NUMBER_LEN_MAX = 20 165 | EDAM_PREFERENCE_NAME_LEN_MIN = 3 166 | EDAM_PREFERENCE_NAME_LEN_MAX = 32 167 | EDAM_PREFERENCE_VALUE_LEN_MIN = 1 168 | EDAM_PREFERENCE_VALUE_LEN_MAX = 1024 169 | EDAM_MAX_PREFERENCES = 100 170 | EDAM_MAX_VALUES_PER_PREFERENCE = 256 171 | EDAM_PREFERENCE_NAME_REGEX = "^[A-Za-z0-9_.-]{3,32}$" 172 | EDAM_PREFERENCE_VALUE_REGEX = "^[^\\p{Cc}]{1,1024}$" 173 | EDAM_PREFERENCE_SHORTCUTS = "evernote.shortcuts" 174 | EDAM_PREFERENCE_SHORTCUTS_MAX_VALUES = 250 175 | EDAM_DEVICE_ID_LEN_MAX = 32 176 | EDAM_DEVICE_ID_REGEX = "^[^\\p{Cc}]{1,32}$" 177 | EDAM_DEVICE_DESCRIPTION_LEN_MAX = 64 178 | EDAM_DEVICE_DESCRIPTION_REGEX = "^[^\\p{Cc}]{1,64}$" 179 | EDAM_SEARCH_SUGGESTIONS_MAX = 10 180 | EDAM_SEARCH_SUGGESTIONS_PREFIX_LEN_MAX = 1024 181 | EDAM_SEARCH_SUGGESTIONS_PREFIX_LEN_MIN = 2 182 | 
-------------------------------------------------------------------------------- /evernote/edam/limits/ttypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | 11 | from thrift.transport import TTransport 12 | from thrift.protocol import TBinaryProtocol, TProtocol 13 | try: 14 | from thrift.protocol import fastbinary 15 | except: 16 | fastbinary = None 17 | 18 | 19 | -------------------------------------------------------------------------------- /evernote/edam/notestore/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'NoteStore'] 2 | -------------------------------------------------------------------------------- /evernote/edam/notestore/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- /evernote/edam/type/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants'] 2 | -------------------------------------------------------------------------------- /evernote/edam/type/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options 
string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | CLASSIFICATION_RECIPE_USER_NON_RECIPE = "000" 13 | CLASSIFICATION_RECIPE_USER_RECIPE = "001" 14 | CLASSIFICATION_RECIPE_SERVICE_RECIPE = "002" 15 | EDAM_NOTE_SOURCE_WEB_CLIP = "web.clip" 16 | EDAM_NOTE_SOURCE_MAIL_CLIP = "mail.clip" 17 | EDAM_NOTE_SOURCE_MAIL_SMTP_GATEWAY = "mail.smtp" 18 | -------------------------------------------------------------------------------- /evernote/edam/userstore/UserStore-remote: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Autogenerated by Thrift Compiler 4 | # 5 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 6 | # 7 | # options string: py:new_style 8 | # 9 | 10 | import sys 11 | import pprint 12 | from urlparse import urlparse 13 | from thrift.transport import TTransport 14 | from thrift.transport import TSocket 15 | from thrift.transport import THttpClient 16 | from thrift.protocol import TBinaryProtocol 17 | 18 | import UserStore 19 | from ttypes import * 20 | 21 | if len(sys.argv) <= 1 or sys.argv[1] == '--help': 22 | print '' 23 | print 'Usage: ' + sys.argv[0] + ' [-h host[:port]] [-u url] [-f[ramed]] function [arg1 [arg2...]]' 24 | print '' 25 | print 'Functions:' 26 | print ' bool checkVersion(string clientName, i16 edamVersionMajor, i16 edamVersionMinor)' 27 | print ' BootstrapInfo getBootstrapInfo(string locale)' 28 | print ' AuthenticationResult authenticate(string username, string password, string consumerKey, string consumerSecret, bool supportsTwoFactor)' 29 | print ' AuthenticationResult authenticateLongSession(string username, string password, string consumerKey, string consumerSecret, string deviceIdentifier, string deviceDescription, bool supportsTwoFactor)' 30 | print ' AuthenticationResult completeTwoFactorAuthentication(string authenticationToken, string oneTimeCode, 
string deviceIdentifier, string deviceDescription)' 31 | print ' void revokeLongSession(string authenticationToken)' 32 | print ' AuthenticationResult authenticateToBusiness(string authenticationToken)' 33 | print ' AuthenticationResult refreshAuthentication(string authenticationToken)' 34 | print ' User getUser(string authenticationToken)' 35 | print ' PublicUserInfo getPublicUserInfo(string username)' 36 | print ' PremiumInfo getPremiumInfo(string authenticationToken)' 37 | print ' string getNoteStoreUrl(string authenticationToken)' 38 | print '' 39 | sys.exit(0) 40 | 41 | pp = pprint.PrettyPrinter(indent = 2) 42 | host = 'localhost' 43 | port = 9090 44 | uri = '' 45 | framed = False 46 | http = False 47 | argi = 1 48 | 49 | if sys.argv[argi] == '-h': 50 | parts = sys.argv[argi+1].split(':') 51 | host = parts[0] 52 | if len(parts) > 1: 53 | port = int(parts[1]) 54 | argi += 2 55 | 56 | if sys.argv[argi] == '-u': 57 | url = urlparse(sys.argv[argi+1]) 58 | parts = url[1].split(':') 59 | host = parts[0] 60 | if len(parts) > 1: 61 | port = int(parts[1]) 62 | else: 63 | port = 80 64 | uri = url[2] 65 | if url[4]: 66 | uri += '?%s' % url[4] 67 | http = True 68 | argi += 2 69 | 70 | if sys.argv[argi] == '-f' or sys.argv[argi] == '-framed': 71 | framed = True 72 | argi += 1 73 | 74 | cmd = sys.argv[argi] 75 | args = sys.argv[argi+1:] 76 | 77 | if http: 78 | transport = THttpClient.THttpClient(host, port, uri) 79 | else: 80 | socket = TSocket.TSocket(host, port) 81 | if framed: 82 | transport = TTransport.TFramedTransport(socket) 83 | else: 84 | transport = TTransport.TBufferedTransport(socket) 85 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 86 | client = UserStore.Client(protocol) 87 | transport.open() 88 | 89 | if cmd == 'checkVersion': 90 | if len(args) != 3: 91 | print 'checkVersion requires 3 args' 92 | sys.exit(1) 93 | pp.pprint(client.checkVersion(args[0],eval(args[1]),eval(args[2]),)) 94 | 95 | elif cmd == 'getBootstrapInfo': 96 | if len(args) != 1: 97 
| print 'getBootstrapInfo requires 1 args' 98 | sys.exit(1) 99 | pp.pprint(client.getBootstrapInfo(args[0],)) 100 | 101 | elif cmd == 'authenticate': 102 | if len(args) != 5: 103 | print 'authenticate requires 5 args' 104 | sys.exit(1) 105 | pp.pprint(client.authenticate(args[0],args[1],args[2],args[3],eval(args[4]),)) 106 | 107 | elif cmd == 'authenticateLongSession': 108 | if len(args) != 7: 109 | print 'authenticateLongSession requires 7 args' 110 | sys.exit(1) 111 | pp.pprint(client.authenticateLongSession(args[0],args[1],args[2],args[3],args[4],args[5],eval(args[6]),)) 112 | 113 | elif cmd == 'completeTwoFactorAuthentication': 114 | if len(args) != 4: 115 | print 'completeTwoFactorAuthentication requires 4 args' 116 | sys.exit(1) 117 | pp.pprint(client.completeTwoFactorAuthentication(args[0],args[1],args[2],args[3],)) 118 | 119 | elif cmd == 'revokeLongSession': 120 | if len(args) != 1: 121 | print 'revokeLongSession requires 1 args' 122 | sys.exit(1) 123 | pp.pprint(client.revokeLongSession(args[0],)) 124 | 125 | elif cmd == 'authenticateToBusiness': 126 | if len(args) != 1: 127 | print 'authenticateToBusiness requires 1 args' 128 | sys.exit(1) 129 | pp.pprint(client.authenticateToBusiness(args[0],)) 130 | 131 | elif cmd == 'refreshAuthentication': 132 | if len(args) != 1: 133 | print 'refreshAuthentication requires 1 args' 134 | sys.exit(1) 135 | pp.pprint(client.refreshAuthentication(args[0],)) 136 | 137 | elif cmd == 'getUser': 138 | if len(args) != 1: 139 | print 'getUser requires 1 args' 140 | sys.exit(1) 141 | pp.pprint(client.getUser(args[0],)) 142 | 143 | elif cmd == 'getPublicUserInfo': 144 | if len(args) != 1: 145 | print 'getPublicUserInfo requires 1 args' 146 | sys.exit(1) 147 | pp.pprint(client.getPublicUserInfo(args[0],)) 148 | 149 | elif cmd == 'getPremiumInfo': 150 | if len(args) != 1: 151 | print 'getPremiumInfo requires 1 args' 152 | sys.exit(1) 153 | pp.pprint(client.getPremiumInfo(args[0],)) 154 | 155 | elif cmd == 'getNoteStoreUrl': 156 | 
if len(args) != 1: 157 | print 'getNoteStoreUrl requires 1 args' 158 | sys.exit(1) 159 | pp.pprint(client.getNoteStoreUrl(args[0],)) 160 | 161 | else: 162 | print 'Unrecognized method %s' % cmd 163 | sys.exit(1) 164 | 165 | transport.close() 166 | -------------------------------------------------------------------------------- /evernote/edam/userstore/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'UserStore'] 2 | -------------------------------------------------------------------------------- /evernote/edam/userstore/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py:new_style 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException, TApplicationException 10 | from ttypes import * 11 | 12 | EDAM_VERSION_MAJOR = 1 13 | EDAM_VERSION_MINOR = 25 14 | -------------------------------------------------------------------------------- /img/note.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/img/note.jpg -------------------------------------------------------------------------------- /img/notebooks.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/img/notebooks.jpg -------------------------------------------------------------------------------- /img/start.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/img/start.jpg -------------------------------------------------------------------------------- 
/img/workbench.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/img/workbench.jpg -------------------------------------------------------------------------------- /premailer/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | from .premailer import Premailer, transform 3 | 4 | __version__ = '2.9.7' 5 | -------------------------------------------------------------------------------- /premailer/__main__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, unicode_literals 2 | import sys 3 | import argparse 4 | 5 | from .premailer import Premailer 6 | 7 | 8 | def main(args): 9 | """Command-line tool to transform html style to inline css 10 | 11 | Usage:: 12 | 13 | $ echo '

<style>h1 { border:1px solid black }</style><h1>Title</h1>' | \ 14 | python -m premailer 15 | <h1 style="border:1px solid black"></h1>

16 | $ cat newsletter.html | python -m premailer 17 | """ 18 | 19 | parser = argparse.ArgumentParser(usage='python -m premailer [options]') 20 | 21 | parser.add_argument( 22 | "-f", "--file", nargs='?', type=argparse.FileType('r'), 23 | help="Specifies the input file. The default is stdin.", 24 | default=sys.stdin, dest="infile" 25 | ) 26 | 27 | parser.add_argument( 28 | "-o", "--output", nargs='?', type=argparse.FileType('w'), 29 | help="Specifies the output file. The default is stdout.", 30 | default=sys.stdout, dest="outfile" 31 | ) 32 | 33 | parser.add_argument( 34 | "--base-url", default=None, type=str, dest="base_url" 35 | ) 36 | 37 | parser.add_argument( 38 | "--remove-internal-links", default=True, 39 | help="Remove links that start with a '#' like anchors.", 40 | dest="preserve_internal_links" 41 | ) 42 | 43 | parser.add_argument( 44 | "--exclude-pseudoclasses", default=False, 45 | help="Pseudo classes like p:last-child', p:first-child, etc", 46 | action="store_true", dest="exclude_pseudoclasses" 47 | ) 48 | 49 | parser.add_argument( 50 | "--preserve-style-tags", default=False, 51 | help="Do not delete tags from the html document.", 52 | action="store_true", dest="keep_style_tags" 53 | ) 54 | 55 | parser.add_argument( 56 | "--remove-star-selectors", default=True, 57 | help="All wildcard selectors like '* {color: black}' will be removed.", 58 | action="store_false", dest="include_star_selectors" 59 | ) 60 | 61 | parser.add_argument( 62 | "--remove-classes", default=False, 63 | help="Remove all class attributes from all elements", 64 | action="store_true", dest="remove_classes" 65 | ) 66 | 67 | parser.add_argument( 68 | "--strip-important", default=False, 69 | help="Remove '!important' for all css declarations.", 70 | action="store_true", dest="strip_important" 71 | ) 72 | 73 | parser.add_argument( 74 | "--method", default="html", dest="method", 75 | help="The type of html to output. 'html' for HTML, 'xml' for XHTML." 
76 | ) 77 | 78 | parser.add_argument( 79 | "--base-path", default=None, dest="base_path", 80 | help="The base path for all external stylsheets." 81 | ) 82 | 83 | parser.add_argument( 84 | "--external-style", action="append", dest="external_styles", 85 | help="The path to an external stylesheet to be loaded." 86 | ) 87 | 88 | parser.add_argument( 89 | "--css-text", action="append", dest="css_text", 90 | help="CSS text to be applied to the html." 91 | ) 92 | 93 | parser.add_argument( 94 | "--disable-basic-attributes", dest="disable_basic_attributes", 95 | help="Disable provided basic attributes (comma separated)", default=[] 96 | ) 97 | 98 | parser.add_argument( 99 | "--disable-validation", default=False, 100 | action="store_true", dest="disable_validation", 101 | help="Disable CSSParser validation of attributes and values", 102 | ) 103 | 104 | parser.add_argument( 105 | "--pretty", default=False, 106 | action="store_true", 107 | help="Pretty-print the outputted HTML.", 108 | ) 109 | 110 | options = parser.parse_args(args) 111 | 112 | if options.disable_basic_attributes: 113 | options.disable_basic_attributes = ( 114 | options.disable_basic_attributes.split() 115 | ) 116 | 117 | html = options.infile.read() 118 | if hasattr(html, 'decode'): # Forgive me: Python 2 compatability 119 | html = html.decode('utf-8') 120 | 121 | p = Premailer( 122 | html=html, 123 | base_url=options.base_url, 124 | preserve_internal_links=options.preserve_internal_links, 125 | exclude_pseudoclasses=options.exclude_pseudoclasses, 126 | keep_style_tags=options.keep_style_tags, 127 | include_star_selectors=options.include_star_selectors, 128 | remove_classes=options.remove_classes, 129 | strip_important=options.strip_important, 130 | external_styles=options.external_styles, 131 | css_text=options.css_text, 132 | method=options.method, 133 | base_path=options.base_path, 134 | disable_basic_attributes=options.disable_basic_attributes, 135 | disable_validation=options.disable_validation 136 | ) 
import functools


class _HashedSeq(list):
    """Sequence of call arguments that computes its hash only once.

    Modelled on the helper of the same name in CPython's ``functools``.
    Equality is inherited from ``list``, so two instances whose hashes
    collide are still told apart when used as dictionary keys.
    """
    __slots__ = 'hashvalue'

    def __init__(self, tup, hash=hash):
        self[:] = tup
        self.hashvalue = hash(tup)

    def __hash__(self):
        return self.hashvalue


class _Cache(object):
    """Mutable cache state shared with the wrapper (stands in for nonlocal)."""

    def __init__(self):
        self.off = False   # True once the cache has disabled itself
        self.missed = 0    # number of cache misses so far
        self.cache = {}    # _HashedSeq key -> cached result


def function_cache(expected_max_entries=1000):
    """Return a decorator that caches a function's results by its arguments.

    All arguments passed to the wrapped function must be hashable.

    ``expected_max_entries`` guards against a cache that never hits: once the
    number of misses exceeds it, the cache is emptied and switched off, and
    every later call goes straight to the wrapped function.  Pass ``None``
    only if you are sure an unbounded cache cannot cause memory problems.

    Args:
        expected_max_entries (int or None): miss limit, or None for no limit.

    Returns:
        function: the decorator.

    Raises:
        TypeError: if ``expected_max_entries`` is neither an int nor None.
    """
    if (
        expected_max_entries is not None and
        not isinstance(expected_max_entries, int)
    ):
        raise TypeError(
            'Expected expected_max_entries to be an integer or None'
        )

    # Marks a cache miss; a cached result may legitimately be None.
    sentinel = object()
    # Separates positional from keyword arguments inside a key so that
    # f('a', 1) and f(a=1) do not share an entry.
    kwargs_mark = object()

    def decorator(func):
        cached = _Cache()

        @functools.wraps(func)
        def inner(*args, **kwargs):
            if cached.off:
                return func(*args, **kwargs)

            keys = args
            if kwargs:
                keys += (kwargs_mark,)
                for item in sorted(kwargs.items()):
                    keys += item

            # Key on the argument sequence itself, not on its integer hash:
            # distinct arguments can hash equally (hash(-1) == hash(-2) in
            # CPython) and would otherwise return each other's results.
            key = _HashedSeq(keys)
            result = cached.cache.get(key, sentinel)
            if result is sentinel:
                cached.missed += 1
                result = func(*args, **kwargs)
                cached.cache[key] = result
                # More misses than expected means the cache is not paying
                # off: empty it and turn it off.
                if (
                    expected_max_entries is not None and
                    cached.missed > expected_max_entries
                ):
                    cached.off = True
                    cached.cache.clear()

            return result

        return inner
    return decorator
def csstext_to_pairs(csstext):
    """
    Parse a CSS declaration list into ``(name, value)`` tuples.

    Args:
        csstext(str): declarations such as ``"color:red; margin:0"``

    Returns:
        list: ``(property name, property value)`` tuples, both stripped,
        sorted by property name
    """
    # The lock is required to avoid ``cssutils`` concurrency
    # issues documented in issue #65
    with csstext_to_pairs._lock:
        return sorted(
            [
                (prop.name.strip(), prop.propertyValue.cssText.strip())
                for prop in cssutils.parseStyle(csstext)
            ],
            key=itemgetter(0)
        )


# Re-entrant lock shared by every call of csstext_to_pairs.
csstext_to_pairs._lock = threading.RLock()


def merge_styles(
    inline_style,
    new_styles,
    classes,
    remove_unset_properties=False
):
    """
    Merge new styles and the existing inline style into one style string.

    ``new_styles`` are applied in order, so a later entry overrides an
    earlier one for the same pseudo-class.  The old inline style is applied
    last and therefore always wins over the new styles.  The inline style
    must be valid.

    Args:
        inline_style(str): the old inline style of the element if there
            is one
        new_styles: a list of new styles, each element should be
            a list of ``(property, value)`` tuples
        classes: a list of pseudo-classes, ``classes[i]`` being the one
            ``new_styles[i]`` belongs to (``''`` means no pseudo-class)
        remove_unset_properties(bool): Allow us to remove certain CSS
            properties with rules that set their value to 'unset'

    Returns:
        str: the final style.  Without pseudo-classes this is a plain
        declaration list; otherwise the plain declarations are wrapped in
        ``{...}`` and followed by one ``pseudo{...}`` section per
        pseudo-class, in the order the pseudo-classes first appear in
        ``classes``.

    Raises:
        IndexError: if ``new_styles`` is longer than ``classes``
    """
    # Imported locally so the function does not depend on the module's
    # import block.
    from collections import OrderedDict

    # One bucket per pseudo-class; '' (no pseudo-class) always comes first.
    # Buckets are created in first-seen order rather than by iterating a
    # set, so the output does not depend on hash ordering.
    styles = OrderedDict([('', {})])
    for pseudoclass in classes:
        styles.setdefault(pseudoclass, {})

    for i, style in enumerate(new_styles):
        bucket = styles[classes[i]]
        for k, v in style:
            bucket[k] = v

    # keep always the old inline style
    if inline_style:
        # inline should be a declaration list as I understand
        # ie property-name:property-value;...
        for k, v in csstext_to_pairs(inline_style):
            styles[''][k] = v

    normal_styles = []
    pseudo_styles = []
    for pseudoclass, kv in styles.items():
        if remove_unset_properties:
            # Remove rules that we were going to have value 'unset' because
            # they effectively are the same as not saying anything about the
            # property when inlined
            kv = dict(
                (k, v) for (k, v) in kv.items() if v.lower() != 'unset'
            )
        if not kv:
            continue
        declarations = '; '.join(
            '%s:%s' % (k, v) for k, v in sorted(kv.items())
        )
        if pseudoclass:
            pseudo_styles.append('%s{%s}' % (pseudoclass, declarations))
        else:
            normal_styles.append(declarations)

    if pseudo_styles:
        # Pseudo-class sections need braces, so the plain declarations are
        # wrapped in braces as well to keep the sections apart.
        all_styles = (
            (['{%s}' % ''.join(normal_styles)] + pseudo_styles)
            if normal_styles else pseudo_styles
        )
    else:
        # inline style definition: declarations without braces
        all_styles = normal_styles

    return ' '.join(all_styles).strip()
https://raw.githubusercontent.com/liuwons/EverMark/db943701e092cfb1258e3aeac0742ada722de327/test/test1/2.txt -------------------------------------------------------------------------------- /test/test1/test.md: -------------------------------------------------------------------------------- 1 | # 测试 2 | 3 | ## 第一章 4 | 5 | 只是个测试 6 | 7 | ## 第二章 8 | 9 | 10 | 11 | | 概念 | 意义 | 12 | | ---- | ---- | 13 | | `HEAD` | 当前版本 | 14 | | `HEAD^` | 上一个版本 | 15 | | `HEAD~N` | 往上第 ***N*** 个版本 | 16 | | `origin` | 远程仓库的默认名称 | 17 | | `master` | 默认主干分支名 | 18 | 19 | 20 | 21 | | 指令 | 意义 | 22 | | ----- | ------ | 23 | | `git init` | 创建仓库 | 24 | | `git clone` | 克隆远程仓库 | 25 | | `git status` | 当前状态 | 26 | | `git diff` | 查看更改 | 27 | | `git checkout file ` | 撤销工作区 ***file*** 文件的修改 | 28 | | `git reset HEAD file` | 将暂存区文件 ***file*** 的修改撤销到工作区 | 29 | | `git add` | 将文件修改添加到暂存区 | 30 | | `git commit` | 提交修改(将暂存区提交到当前分支) | 31 | | `git push` | 将本地分支提交到远程分支(设置了upstream) | 32 | | `git push origin master` | 将本地的 ***master*** 分支推送到远程 ***origin*** 分支 | 33 | | `git pull` | 从远程分支获取更改 | 34 | | `git reset --hard commit_id` | 版本回退,将HEAD指向 ***commit_id*** | 35 | | `git log` | 查看提交记录 | 36 | | `git log --graph` | 查看分支合并图 | 37 | | `git reflog` | 查看用户的每一次命令记录 | 38 | | `git merge - -no-ff branch_name` | 合并 ***branch_name*** 分支到当前分支(不使用Fast Forward模式)| 39 | | `git branch` | 查看分支 | 40 | | `git branch branch_name` | 创建分支 ***branch_name*** | 41 | | `git checkout branch_name` | 切换到分支 ***branch_name*** | 42 | | `git branch -d branch_name` | 删除分支 ***branch_name*** | 43 | | `git remote` | 查看远程仓库信息 | 44 | | `git checkout -b local_branch_name origin/remote_branch_name` | 克隆远程分支 ***remote_branch_name*** 到本地分支 ***local_branch_name*** | 45 | | `git branch --set-upstream local_branch_name origin/remote_branch_name` | 设置本地分支 ***local_branch_name*** 的upstream为远程分支 ***remote_branch_name*** | 46 | | `git tag tag_name commit_id` | 在 ***commit_id*** 上设置标签 ***tag_name*** ,如果省略 ***commit_id*** 则在最后的commit上打标签 | 47 | | `git push origin 
--tags` | 将tag更新到远程仓库 | 48 | -------------------------------------------------------------------------------- /test/test2/1.txt: -------------------------------------------------------------------------------- 1 | 12 2 | 2 3 | 4 | 5 | ads 6 | 7 | asd 8 | s 9 | 10 | 11 | test123 12 | 13 | <> 14 | 15 | 16 | 17 | 你好啊 18 | 19 | a87e 23rjfef,;sdl,vdf vmlmasdl[sflr;g'r.g" -------------------------------------------------------------------------------- /test/test2/3.txt: -------------------------------------------------------------------------------- 1 | 你好 2 | 3 | 吗 4 | 5 | 6 | ? 7 | 8 | 阿迪 -------------------------------------------------------------------------------- /test/test2/5.txt: -------------------------------------------------------------------------------- 1 | 你好 2 | 3 | 吗 4 | 5 | 6 | ? 7 | 8 | 阿迪 -------------------------------------------------------------------------------- /test/test2/README.md: -------------------------------------------------------------------------------- 1 | # wxBot 2 | 3 | **wxBot** 是用Python包装Web微信协议实现的微信机器人框架。 4 | 5 | 目前的消息支持情况: 6 | 7 | Web微信协议参考资料: 8 | 9 | [挖掘微信Web版通信的全过程](http://www.tanhao.me/talk/1466.html/) 10 | 11 | [微信协议简单调研笔记](http://www.blogjava.net/yongboy/archive/2015/11/05/410636.html) 12 | 13 | [qwx: WeChat Qt frontend 微信Qt前端](https://github.com/xiangzhai/qwx) 14 | 15 | ## 1 环境与依赖 16 | 17 | 此版本只能运行于Python 2环境 。 18 | 19 | **wxBot** 用到了Python **requests** , **pypng** , 以及 **pyqrcode** 库。 20 | 21 | 使用之前需要所依赖的库: 22 | 23 | ```bash 24 | pip install requests 25 | pip install pyqrcode 26 | pip install pypng 27 | ``` 28 | 29 | ## 2 快速开发 30 | 31 | 利用 **wxBot** 最简单的方法就是继承WXBot类并实现 `handle_msg_all` 或者 `schedule` 函数,然后实例化子类并调用 `run` 方法 。 32 | 33 | ### 2.1 代码 34 | 35 | 以下的代码对所有来自好友的文本消息回复 *hi* , 并不断向好友 *tb* 发送 *schedule* 。 36 | 37 | `handle_msg_all` 函数用于处理收到的每条消息,而 `schedule` 函数可以做一些任务性的工作(例如不断向好友推送信息或者一些定时任务)。 38 | 39 | ```python 40 | #!/usr/bin/env python 41 | # coding: utf-8 42 | 43 | import time 44 | from wxbot import * 45 | 46 | 
class MyWXBot(WXBot): 47 | def handle_msg_all(self, msg): 48 | if msg['msg_type_id'] == 4 and msg['content']['type'] == 0: 49 | self.send_msg_by_uid('hi', msg['user']['id']) 50 | 51 | def schedule(self): 52 | self.send_msg('tb', 'schedule') 53 | time.sleep(1) 54 | 55 | def main(): 56 | bot = MyWXBot() 57 | bot.DEBUG = True 58 | bot.run() 59 | 60 | if __name__ == '__main__': 61 | main() 62 | 63 | ``` 64 | 65 | ### 2.2 运行 66 | 67 | 直接用 `python` 运行代码(如运行测试代码 ***test.py*** ): 68 | 69 | ``` python 70 | python test.py 71 | ``` 72 | 73 | ### 2.3 登录微信 74 | 75 | 程序运行之后,会在当前目录下生成二维码图片文件 ***qr.png*** 并自动打开,用微信扫描此二维码并按操作指示确认登录网页微信。 76 | 77 | 如果运行在Linux下,还可以通过设置 **WXBot** 对象的 `conf['qr']` 为 `tty` 的方式直接在终端打印二维码(此方法只能在Linux终端下使用),效果如下: 78 | 79 | ## 3 效果展示 80 | 81 | 测试代码 ***test.py*** 的运行效果: 82 | 83 | 84 | ## 4 接口 85 | ### 4.1 `handle_msg_all` 86 | 87 | `handle_msg_all` 函数的参数 `msg` 是代表一条消息的字典。字段的内容为: 88 | 89 | | 字段名 | 字段内容 | 90 | | ----- | --- | 91 | | `msg_type_id` | 整数,消息类型,具体解释可以查看 **消息类型表** | 92 | | `msg_id` | 字符串,消息id | 93 | | `content` | 字典,消息内容,具体含有的字段请参考 **消息类型表** ,一般含有 `type`(数据类型)与 `data`(数据内容)字段,`type` 与 `data`的对应关系可以参考 **数据类型表** | 94 | | `user` | 字典,消息来源,字典包含 `name`(发送者名称,如果是群则为群名称,如果为微信号,有备注则为备注名,否则为微信号或者群昵称)字段与 `id`(发送者id)字段,都是字符串 | 95 | 96 | 97 | ### 4.2 消息类型表 98 | 99 | | 类型号 | 消息类型 | `content` | 100 | | ----- | --- | ------ | 101 | | 0 | 初始化消息,内部数据 | 无意义,可以忽略 | 102 | | 1 | 自己发送的消息 | 无意义,可以忽略 | 103 | | 2 | 文件消息 | 字典,包含 `type` 与 `data` 字段 | 104 | | 3 | 群消息 | 字典, 包含 `user` (字典,包含 `id` 与 `name`字段,都是字符串,表示发送此消息的群用户)与 `type` 、 `data` 字段,红包消息只有 `type` 字段, 文本消息还有detail、desc字段, 参考 **群文本消息** | 105 | | 4 | 联系人消息 | 字典,包含 `type` 与 `data` 字段 | 106 | | 5 | 公众号消息 | 字典,包含 `type` 与 `data` 字段 | 107 | | 6 | 特殊账号消息 | 字典,包含 `type` 与 `data` 字段 | 108 | | 99 | 未知账号消息 | 无意义,可以忽略 | 109 | 110 | 111 | ### 4.3 数据类型表 112 | 113 | | `type` | 数据类型 | `data` | 114 | | ---- | ---- | ------ | 115 | | 0 | 文本 | 字符串,表示文本消息的具体内容 | 116 | | 1 | 地理位置 | 字符串,表示地理位置 | 117 | | 3 | 图片 | 字符串,图片数据的url,HTTP 
POST请求此url可以得到jpg文件格式的数据 | 118 | | 4 | 语音 | 字符串,语音数据的url,HTTP POST请求此url可以得到mp3文件格式的数据 | 119 | | 5 | 名片 | 字典,包含 `nickname` (昵称), `alias` (别名),`province` (省份),`city` (城市), `gender` (性别)字段 | 120 | | 6 | 动画 | 字符串, 动画url, HTTP POST请求此url可以得到gif文件格式的数据 | 121 | | 7 | 分享 | 字典,包含 `type` (类型),`title` (标题),`desc` (描述),`url` (链接),`from` (源网站)字段 | 122 | | 8 | 视频 | 不可用 | 123 | | 9 | 视频电话 | 不可用 | 124 | | 10 | 撤回消息 | 不可用 | 125 | | 11 | 空内容 | 空字符串 | 126 | | 12 | 红包 | 不可用 | 127 | | 99 | 未知类型 | 不可用 | 128 | 129 | ### 4.4 群文本消息 130 | 131 | 由于群文本消息中可能含有@信息,因此群文本消息的 `content` 字典除了含有 `type` 与 `data` 字段外,还含有 `detail` 与 `desc` 字段。 132 | 133 | 各字段内容为: 134 | 135 | | 字段 | 内容 | 136 | | --- | ---- | 137 | | `type` | 数据类型, 为0(文本) | 138 | | `data` | 字符串,消息内容,含有@信息 | 139 | | `desc` | 字符串,删除了所有@信息 | 140 | | `detail` | 数组,元素类型为含有 `type` 与 `value` 字段的字典, `type` 为字符串 ***str*** (表示元素为普通字符串,此时value为消息内容) 或 ***at*** (表示元素为@信息, 此时value为所@的用户名) | 141 | 142 | 143 | ### 4.5 WXBot对象属性 144 | 145 | **WXBot** 对象在登录并初始化之后,含有以下的可用数据: 146 | 147 | | 属性 | 描述 | 148 | | ---- | ---- | 149 | | `contact_list` | 当前用户的微信联系人列表 | 150 | | `group_list` | 当前用户的微信群列表 | 151 | | `public_list` | 当前用户关注的公众号列表 | 152 | | `special_list` | 特殊账号列表 | 153 | | `session` | **WXBot** 与WEB微信服务器端交互所用的 **Requests** `Session` 对象 | 154 | 155 | ### 4.6 WXBot对象方法 156 | 157 | **WXBot** 对象还含有一些可以利用的方法 158 | 159 | | 方法 | 描述 | 160 | | ---- | --- | 161 | | `get_icon(id)` | 获取用户icon并保存到本地文件 ***img_[id].jpg*** , `id` 为用户id(Web微信数据) | 162 | | `get_head_img(id)` | 获取用户头像并保存到本地文件 ***img_[id].jpg*** ,`id` 为用户id(Web微信数据) | 163 | | `get_msg_img(msgid)` | 获取图像消息并保存到本地文件 ***img_[msgid].jpg*** , `msgid` 为消息id(Web微信数据) | 164 | | `get_voice(msgid)` | 获取语音消息并保存到本地文件 ***voice_[msgid].mp3*** , `msgid` 为消息id(Web微信数据) | 165 | | `get_contact_name(uid)` | 获取微信id对应的名称,返回一个可能包含 `remark_name` (备注名), `nickname` (昵称), `display_name` (群名称)的字典| 166 | | `send_msg_by_uid(word, dst)` | 向好友发送消息,`word` 为消息字符串,`dst` 为好友用户id(Web微信数据) | 167 | | `send_msg(name, word, isfile)` | 
向好友发送消息,`name` 为好友的备注名或者好友微信号, `isfile`为 `False` 时 `word` 为消息,`isfile` 为 `True` 时 `word` 为文件路径(此时向好友发送文件里的每一行),此方法在有重名好友时会有问题,因此更推荐使用 `send_msg_by_uid(word, dst)` | 168 | | `is_contact(uid)` | 判断id为 `uid` 的账号是否是本帐号的好友,返回 `True` (是)或 `False` (不是) | 169 | | `is_public(uid)` | 判断id为 `uid` 的账号是否是本帐号所关注的公众号,返回 `True` (是)或 `False` (不是) | 170 | 171 | 172 | ## 5 群聊机器人示例 173 | 174 | ***bot.py*** 用 **[图灵机器人](http://www.tuling123.com/)** API 以及 **wxBot** 实现了一个自动回复机器人. 175 | 176 | 此机器人会回复来自联系人的消息,以及群里@此账号的消息。 177 | 178 | 并且本帐号可以通过发送 *退下* 、 *走开* 、 *关闭* 、 *关掉* 、 *休息* 、 *滚开* 来关闭机器人的自动回复。 179 | 180 | 也可以通过发送 *出来* 、 *启动* 、 *工作* 来再次开启机器人的自动回复。 181 | 182 | 群聊时需要将对应的群保存到联系人列表。 183 | 184 | 群聊实现效果: 185 | 186 | 187 | ***bot.py*** 的运行方法: 188 | 189 | - 要接入图灵机器人API时: 190 | 191 | 1. 在[图灵机器人官网](http://www.tuling123.com/)注册账号,申请图灵key: [图灵key申请地址](http://www.tuling123.com/html/doc/apikey.html) 192 | 193 | 2. 在 ***bot.py*** 文件所在目录下新建 ***conf.ini*** 文件,内容为:(key字段内容为申请到的图灵key) 194 | 195 | ```txt 196 | [main] 197 | key=1d2678900f734aa0a23734ace8aec5b1 198 | ``` 199 | 200 | 3. 运行 ***bot.py*** 201 | 202 | ```python 203 | python bot.py 204 | ``` 205 | 206 | - 不接入图灵机器人API时(此时机器人对联系人消息以及群里@自己的消息统一回复 *知道了* ): 207 | 1. 运行 ***bot.py*** 208 | 209 | ```python 210 | python bot.py 211 | ``` 212 | 213 | ## 6 帮助项目 214 | 215 | 欢迎对本项目提意见、贡献代码,参考: [如何帮助项目](https://github.com/liuwons/wxBot/wiki/How-to-contribute) 216 | -------------------------------------------------------------------------------- /thrift/TSCons.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
def scons_env(env, add=''):
  """Register a ``ThriftCpp`` builder on the SCons environment *env*.

  The builder's action is the command line
  ``thrift --gen cpp -o <dir> <add> $SOURCE``.

  Args:
    env: environment object; its ``Append`` method is called with the
      new builder.
    add: extra text inserted into the command line before ``$SOURCE``.
  """
  # NOTE(review): abspath() is applied to the literal string '$TARGET', so
  # <dir> is computed from the current working directory when this function
  # runs -- confirm that this is the intended output directory.
  opath = path.dirname(path.abspath('$TARGET'))
  lstr = 'thrift --gen cpp -o ' + opath + ' ' + add + ' $SOURCE'
  cppbuild = Builder(action = lstr)
  env.Append(BUILDERS = {'ThriftCpp' : cppbuild})

def gen_cpp(env, dir, file):
  """Invoke the ``ThriftCpp`` builder for ``dir + file + '.thrift'``.

  Args:
    env: environment object, passed to ``scons_env`` first.
    dir: prefix of the ``.thrift`` source path; it is concatenated
      directly, without inserting a path separator.
    file: base name of the thrift file, without the ``.thrift`` suffix.

  Returns:
    Whatever ``env.ThriftCpp`` returns for the targets
    ``gen-cpp/<file>_types.h`` and ``gen-cpp/<file>_types.cpp``.
  """
  scons_env(env)
  suffixes = ['_types.h', '_types.cpp']
  # A real list: map() would hand the builder a lazy iterator on Python 3.
  targets = ['gen-cpp/' + file + s for s in suffixes]
  return env.ThriftCpp(targets, dir+file+'.thrift')
def serialize(thrift_object, protocol_factory = TBinaryProtocol.TBinaryProtocolFactory()):
  """Write *thrift_object* into a memory buffer and return the buffer's value.

  Args:
    thrift_object: object whose ``write(protocol)`` method is called.
    protocol_factory: object whose ``getProtocol(transport)`` builds the
      protocol; defaults to a ``TBinaryProtocolFactory`` created once, when
      this function is defined.

  Returns:
    The value of the ``TMemoryBuffer`` after the object has been written.
  """
  memory = TTransport.TMemoryBuffer()
  writer = protocol_factory.getProtocol(memory)
  thrift_object.write(writer)
  return memory.getvalue()

def deserialize(base, buf, protocol_factory = TBinaryProtocol.TBinaryProtocolFactory()):
  """Fill *base* by reading from *buf* and return *base*.

  Args:
    base: object whose ``read(protocol)`` method is called; it is modified
      in place.
    buf: data handed to ``TMemoryBuffer``.
    protocol_factory: object whose ``getProtocol(transport)`` builds the
      protocol; defaults to a ``TBinaryProtocolFactory`` created once, when
      this function is defined.

  Returns:
    The same *base* object that was passed in.
  """
  memory = TTransport.TMemoryBuffer(buf)
  reader = protocol_factory.getProtocol(memory)
  base.read(reader)
  return base
class TType:
  """Numeric type identifiers used by the protocol classes."""
  STOP   = 0
  VOID   = 1
  BOOL   = 2
  BYTE   = 3
  I08    = 3   # same id as BYTE
  DOUBLE = 4
  I16    = 6
  I32    = 8
  I64    = 10
  STRING = 11
  UTF7   = 11  # same id as STRING
  STRUCT = 12
  MAP    = 13
  SET    = 14
  LIST   = 15
  UTF8   = 16
  UTF16  = 17

  # Indexed by type id; None marks ids that have no constant above.
  _VALUES_TO_NAMES = ( 'STOP',
                       'VOID',
                       'BOOL',
                       'BYTE',
                       'DOUBLE',
                       None,
                       'I16',
                       None,
                       'I32',
                       None,
                       'I64',
                       'STRING',
                       'STRUCT',
                       'MAP',
                       'SET',
                       'LIST',
                       'UTF8',
                       'UTF16' )

class TMessageType:
  """Numeric identifiers for the kinds of message."""
  CALL  = 1
  REPLY = 2
  EXCEPTION = 3
  ONEWAY = 4

class TProcessor:

  """Base class for processor, which works on two streams."""

  def process(self, iprot, oprot):
    """Do nothing; subclasses override this.

    Args:
      iprot: input protocol.
      oprot: output protocol.
    """
    # `self` was missing from the signature, so calling process() on an
    # instance raised TypeError.
    pass

class TException(Exception):

  """Base class for all thrift exceptions."""

  # BaseException.message is deprecated in Python v[2.6,3.0)
  # On those versions `message` is stored through a property backed by
  # `_message` instead of the deprecated attribute.
  if (2,6,0) <= sys.version_info < (3,0):
    def _get_message(self):
      return self._message
    def _set_message(self, message):
      self._message = message
    message = property(_get_message, _set_message)

  def __init__(self, message=None):
    """Store *message* on the exception and pass it to ``Exception``."""
    Exception.__init__(self, message)
    self.message = message

class TApplicationException(TException):

  """Application level thrift exceptions."""

  UNKNOWN = 0
  UNKNOWN_METHOD = 1
  INVALID_MESSAGE_TYPE = 2
  WRONG_METHOD_NAME = 3
  BAD_SEQUENCE_ID = 4
  MISSING_RESULT = 5
  INTERNAL_ERROR = 6
  PROTOCOL_ERROR = 7

  def __init__(self, type=UNKNOWN, message=None):
    """Create the exception.

    Args:
      type: one of the integer constants defined on this class.
      message: optional text; when truthy it is what ``str()`` returns.
    """
    TException.__init__(self, message)
    self.type = type

  def __str__(self):
    """Return the message if there is one, else a description of the type."""
    if self.message:
      return self.message
    elif self.type == self.UNKNOWN_METHOD:
      return 'Unknown method'
    elif self.type == self.INVALID_MESSAGE_TYPE:
      return 'Invalid message type'
    elif self.type == self.WRONG_METHOD_NAME:
      return 'Wrong method name'
    elif self.type == self.BAD_SEQUENCE_ID:
      return 'Bad sequence ID'
    elif self.type == self.MISSING_RESULT:
      return 'Missing result'
    elif self.type == self.INTERNAL_ERROR:
      # These two constants previously fell through to the default text.
      return 'Internal error'
    elif self.type == self.PROTOCOL_ERROR:
      return 'Protocol error'
    else:
      return 'Default (unknown) TApplicationException'

  def read(self, iprot):
    """Populate ``message`` and ``type`` from the protocol *iprot*.

    Field 1 is read as the message when it is a STRING and field 2 as the
    type when it is an I32; every other field is skipped.
    """
    iprot.readStructBegin()
    while True:
      (fname, ftype, fid) = iprot.readFieldBegin()
      if ftype == TType.STOP:
        break
      if fid == 1:
        if ftype == TType.STRING:
          self.message = iprot.readString()
        else:
          iprot.skip(ftype)
      elif fid == 2:
        if ftype == TType.I32:
          self.type = iprot.readI32()
        else:
          iprot.skip(ftype)
      else:
        iprot.skip(ftype)
      iprot.readFieldEnd()
    iprot.readStructEnd()

  def write(self, oprot):
    """Write this exception to the protocol *oprot*.

    ``message`` goes out as field 1 and ``type`` as field 2; a field whose
    value is None is left out.
    """
    oprot.writeStructBegin('TApplicationException')
    if self.message is not None:
      oprot.writeFieldBegin('message', TType.STRING, 1)
      oprot.writeString(self.message)
      oprot.writeFieldEnd()
    if self.type is not None:
      oprot.writeFieldBegin('type', TType.I32, 2)
      oprot.writeI32(self.type)
      oprot.writeFieldEnd()
    oprot.writeFieldStop()
    oprot.writeStructEnd()
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | __all__ = ['Thrift', 'TSCons'] 21 | -------------------------------------------------------------------------------- /thrift/protocol/TBase.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
class TBase(object):
  """Base class providing repr, equality and read/write driven by
  ``__slots__`` and ``thrift_spec``.

  Subclasses list their field names in ``__slots__``; ``read`` and
  ``write`` additionally use a ``thrift_spec`` attribute.
  """
  __slots__ = []

  def __repr__(self):
    """Return ``ClassName(field=value, ...)`` over the names in ``__slots__``."""
    L = ['%s=%r' % (key, getattr(self, key))
              for key in self.__slots__ ]
    return '%s(%s)' % (self.__class__.__name__, ', '.join(L))

  def __eq__(self, other):
    """Return True when *other* is an instance of this object's class and
    every attribute named in ``__slots__`` compares equal."""
    if not isinstance(other, self.__class__):
      return False
    for attr in self.__slots__:
      my_val = getattr(self, attr)
      other_val = getattr(other, attr)
      if my_val != other_val:
        return False
    return True

  def __ne__(self, other):
    """Return the negation of ``__eq__``."""
    return not (self == other)

  def read(self, iprot):
    """Populate this object from the protocol *iprot*.

    ``fastbinary.decode_binary`` is used when *iprot* is exactly a
    ``TBinaryProtocolAccelerated``, its transport is a
    ``CReadableTransport``, ``thrift_spec`` is set and the ``fastbinary``
    module is available; otherwise ``iprot.readStruct`` does the work.
    """
    use_fastbinary = (
        iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated
        and isinstance(iprot.trans, TTransport.CReadableTransport)
        and self.thrift_spec is not None
        and fastbinary is not None)
    if use_fastbinary:
      fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
      return
    iprot.readStruct(self, self.thrift_spec)

  def write(self, oprot):
    """Write this object to the protocol *oprot*.

    ``fastbinary.encode_binary`` is used when *oprot* is exactly a
    ``TBinaryProtocolAccelerated``, ``thrift_spec`` is set and the
    ``fastbinary`` module is available; otherwise ``oprot.writeStruct``
    does the work.
    """
    use_fastbinary = (
        oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated
        and self.thrift_spec is not None
        and fastbinary is not None)
    if use_fastbinary:
      oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
      return
    oprot.writeStruct(self, self.thrift_spec)

class TExceptionBase(Exception):
  """Exception counterpart of ``TBase`` that reuses its methods."""
  # old style class so python2.4 can raise exceptions derived from this
  #  This can't inherit from TBase because of that limitation.
  __slots__ = []

  # The plain functions are taken from the class __dict__.  The previous
  # `TBase.<name>.im_func` lookup works only on Python 2, where accessing
  # a method on the class yields an unbound-method wrapper.
  __repr__ = TBase.__dict__['__repr__']
  __eq__ = TBase.__dict__['__eq__']
  __ne__ = TBase.__dict__['__ne__']
  read = TBase.__dict__['read']
  write = TBase.__dict__['write']
class TBinaryProtocol(TProtocolBase):

  """Binary implementation of the Thrift protocol driver."""

  # NastyHaxx. Python 2.4+ on 32-bit machines forces hex constants to be
  # positive, converting this into a long. If we hardcode the int value
  # instead it'll stay in 32 bit-land.

  # VERSION_MASK = 0xffff0000
  VERSION_MASK = -65536

  # VERSION_1 = 0x80010000
  VERSION_1 = -2147418112

  TYPE_MASK = 0x000000ff

  def __init__(self, trans, strictRead=False, strictWrite=True):
    """Wrap the transport *trans*.

    strictRead makes readMessageBegin reject messages without a version
    header; strictWrite makes writeMessageBegin emit that header.
    """
    TProtocolBase.__init__(self, trans)
    self.strictRead = strictRead
    self.strictWrite = strictWrite

  # ---- writing: structure markers ----

  def writeMessageBegin(self, name, type, seqid):
    """Write the message header, versioned when strictWrite is set."""
    if not self.strictWrite:
      self.writeString(name)
      self.writeByte(type)
      self.writeI32(seqid)
      return
    self.writeI32(TBinaryProtocol.VERSION_1 | type)
    self.writeString(name)
    self.writeI32(seqid)

  def writeMessageEnd(self):
    """Nothing is written at the end of a message."""
    pass

  def writeStructBegin(self, name):
    """Nothing is written at the start of a struct."""
    pass

  def writeStructEnd(self):
    """Nothing is written at the end of a struct."""
    pass

  def writeFieldBegin(self, name, type, id):
    """Write the field type as a byte and the field id as an i16."""
    self.writeByte(type)
    self.writeI16(id)

  def writeFieldEnd(self):
    """Nothing is written at the end of a field."""
    pass

  def writeFieldStop(self):
    """Write the STOP type byte."""
    self.writeByte(TType.STOP)

  def writeMapBegin(self, ktype, vtype, size):
    """Write key type, value type and size of a map."""
    self.writeByte(ktype)
    self.writeByte(vtype)
    self.writeI32(size)

  def writeMapEnd(self):
    """Nothing is written at the end of a map."""
    pass

  def writeListBegin(self, etype, size):
    """Write element type and size of a list."""
    self.writeByte(etype)
    self.writeI32(size)

  def writeListEnd(self):
    """Nothing is written at the end of a list."""
    pass

  def writeSetBegin(self, etype, size):
    """Write element type and size of a set."""
    self.writeByte(etype)
    self.writeI32(size)

  def writeSetEnd(self):
    """Nothing is written at the end of a set."""
    pass

  # ---- writing: values ----

  def writeBool(self, bool):
    """Write 1 for a truthy value and 0 otherwise, as a byte."""
    self.writeByte(1 if bool else 0)

  def writeByte(self, byte):
    """Write a value packed with format "!b"."""
    self.trans.write(pack("!b", byte))

  def writeI16(self, i16):
    """Write a value packed with format "!h"."""
    self.trans.write(pack("!h", i16))

  def writeI32(self, i32):
    """Write a value packed with format "!i"."""
    self.trans.write(pack("!i", i32))

  def writeI64(self, i64):
    """Write a value packed with format "!q"."""
    self.trans.write(pack("!q", i64))

  def writeDouble(self, dub):
    """Write a value packed with format "!d"."""
    self.trans.write(pack("!d", dub))

  def writeString(self, str):
    """Write the length as an i32, then the string itself."""
    self.writeI32(len(str))
    self.trans.write(str)

  # ---- reading: structure markers ----

  def readMessageBegin(self):
    """Read a message header and return (name, type, seqid).

    A negative leading i32 is a versioned header; otherwise it is the
    length of the name, which is rejected when strictRead is set.
    """
    sz = self.readI32()
    if sz >= 0:
      if self.strictRead:
        raise TProtocolException(type=TProtocolException.BAD_VERSION, message='No protocol version header')
      name = self.trans.readAll(sz)
      msg_type = self.readByte()
      seqid = self.readI32()
      return (name, msg_type, seqid)

    version = sz & TBinaryProtocol.VERSION_MASK
    if version != TBinaryProtocol.VERSION_1:
      raise TProtocolException(type=TProtocolException.BAD_VERSION, message='Bad version in readMessageBegin: %d' % (sz))
    msg_type = sz & TBinaryProtocol.TYPE_MASK
    name = self.readString()
    seqid = self.readI32()
    return (name, msg_type, seqid)

  def readMessageEnd(self):
    """Nothing is read at the end of a message."""
    pass

  def readStructBegin(self):
    """Nothing is read at the start of a struct."""
    pass

  def readStructEnd(self):
    """Nothing is read at the end of a struct."""
    pass

  def readFieldBegin(self):
    """Return (None, type, id); the id is 0 and not read for a STOP field."""
    field_type = self.readByte()
    field_id = 0 if field_type == TType.STOP else self.readI16()
    return (None, field_type, field_id)

  def readFieldEnd(self):
    """Nothing is read at the end of a field."""
    pass

  def readMapBegin(self):
    """Return (key type, value type, size) of a map."""
    ktype = self.readByte()
    vtype = self.readByte()
    size = self.readI32()
    return (ktype, vtype, size)

  def readMapEnd(self):
    """Nothing is read at the end of a map."""
    pass

  def readListBegin(self):
    """Return (element type, size) of a list."""
    etype = self.readByte()
    size = self.readI32()
    return (etype, size)

  def readListEnd(self):
    """Nothing is read at the end of a list."""
    pass

  def readSetBegin(self):
    """Return (element type, size) of a set."""
    etype = self.readByte()
    size = self.readI32()
    return (etype, size)

  def readSetEnd(self):
    """Nothing is read at the end of a set."""
    pass

  # ---- reading: values ----

  def readBool(self):
    """Read one byte; return False for 0 and True for anything else."""
    return self.readByte() != 0

  def readByte(self):
    """Read 1 byte and unpack it with format '!b'."""
    return unpack('!b', self.trans.readAll(1))[0]

  def readI16(self):
    """Read 2 bytes and unpack them with format '!h'."""
    return unpack('!h', self.trans.readAll(2))[0]

  def readI32(self):
    """Read 4 bytes and unpack them with format '!i'."""
    return unpack('!i', self.trans.readAll(4))[0]

  def readI64(self):
    """Read 8 bytes and unpack them with format '!q'."""
    return unpack('!q', self.trans.readAll(8))[0]

  def readDouble(self):
    """Read 8 bytes and unpack them with format '!d'."""
    return unpack('!d', self.trans.readAll(8))[0]

  def readString(self):
    """Read an i32 length, then that many bytes from the transport."""
    size = self.readI32()
    return self.trans.readAll(size)


class TBinaryProtocolFactory:
  """Creates TBinaryProtocol objects with fixed strictness settings."""

  def __init__(self, strictRead=False, strictWrite=True):
    """Remember the strictRead / strictWrite flags for later protocols."""
    self.strictRead = strictRead
    self.strictWrite = strictWrite

  def getProtocol(self, trans):
    """Return a TBinaryProtocol over *trans* using the stored flags."""
    return TBinaryProtocol(trans, self.strictRead, self.strictWrite)


class TBinaryProtocolAccelerated(TBinaryProtocol):

  """C-Accelerated version of TBinaryProtocol.

  This class does not override any of TBinaryProtocol's methods,
  but the generated code recognizes it directly and will call into
  our C module to do the encoding, bypassing this object entirely.
  We inherit from TBinaryProtocol so that the normal TBinaryProtocol
  encoding can happen if the fastbinary module doesn't work for some
  reason.  (TODO(dreiss): Make this happen sanely in more cases.)

  In order to take advantage of the C module, just use
  TBinaryProtocolAccelerated instead of TBinaryProtocol.

  NOTE:  This code was contributed by an external developer.
         The internal Thrift team has reviewed and tested it,
         but we cannot guarantee that it is production-ready.
         Please feel free to report bugs and/or success stories
         to the public mailing list.
  """

  pass


class TBinaryProtocolAcceleratedFactory:
  """Creates TBinaryProtocolAccelerated objects."""

  def getProtocol(self, trans):
    """Return a TBinaryProtocolAccelerated over *trans*."""
    return TBinaryProtocolAccelerated(trans)
class THttpServer(TServer.TServer):
    """A simple HTTP-based Thrift server.

    This class is not very performant, but it is useful (for example) for
    acting as a mock version of an Apache-based PHP Thrift endpoint.
    """

    def __init__(self, processor, server_address,
                 inputProtocolFactory, outputProtocolFactory=None,
                 server_class=BaseHTTPServer.HTTPServer):
        """Set up protocol factories and HTTP server.

        See BaseHTTPServer for server_address.
        See TServer for protocol factories.

        @param processor: object whose process(iprot, oprot) handles one call.
        @param server_address: passed unchanged to server_class.
        @param inputProtocolFactory: factory used to decode request bodies.
        @param outputProtocolFactory: factory used to encode responses;
            defaults to inputProtocolFactory when None.
        @param server_class: HTTP server class instantiated with
            (server_address, request handler class).
        """
        if outputProtocolFactory is None:
            outputProtocolFactory = inputProtocolFactory

        # No server transport or transport factories: the HTTP server owns
        # the sockets, so only the protocol factories are meaningful here.
        TServer.TServer.__init__(self, processor, None, None, None,
                                 inputProtocolFactory, outputProtocolFactory)

        # The handler class below has its own `self`; keep a reference to
        # this server so the handler can reach the processor and factories.
        thttpserver = self

        class RequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
            def do_POST(self):
                # Don't care about the request path.
                itrans = TTransport.TBufferedTransport(
                    TTransport.TFileObjectTransport(self.rfile),
                    int(self.headers['Content-Length']))
                # The reply is buffered in memory so that nothing is written
                # to the client before we know which status to send.
                otrans = TTransport.TMemoryBuffer()
                iprot = thttpserver.inputProtocolFactory.getProtocol(itrans)
                oprot = thttpserver.outputProtocolFactory.getProtocol(otrans)
                try:
                    thttpserver.processor.process(iprot, oprot)
                except ResponseException as exn:
                    # The handler asked to produce the HTTP response itself.
                    exn.handler(self)
                else:
                    self.send_response(200)
                    self.send_header("content-type", "application/x-thrift")
                    self.end_headers()
                    self.wfile.write(otrans.getvalue())

        self.httpd = server_class(server_address, RequestHandler)

    def serve(self):
        """Serve HTTP requests until the underlying server stops."""
        self.httpd.serve_forever()
class TProcessPoolServer(TServer):
    """
    Server with a fixed size pool of worker subprocesses which service requests.
    Note that if you need shared state between the handlers - it's up to you!
    Written by Dvir Volk, doat.com
    """

    def __init__(self, *args):
        """Accept the same positional arguments as TServer."""
        TServer.__init__(self, *args)
        self.numWorkers = 10
        self.workers = []
        # Shared flag read by every worker process; cleared to make them exit.
        self.isRunning = Value('b', False)
        # serve() parks on this condition until stop() notifies it.
        self.stopCondition = Condition()
        self.postForkCallback = None

    def setPostForkCallback(self, callback):
        """Register a callable that each worker runs once before serving.

        Raises TypeError if *callback* is not callable.
        """
        if not callable(callback):
            raise TypeError("This is not a callback!")
        self.postForkCallback = callback

    def setNumWorkers(self, num):
        """Set the number of worker processes that should be created"""
        self.numWorkers = num

    def workerProcess(self):
        """Accept clients from the server transport and serve them in a loop.

        Runs inside a worker process until the shared isRunning flag is
        cleared. Returns 0 when interrupted by KeyboardInterrupt/SystemExit.
        """
        if self.postForkCallback:
            self.postForkCallback()

        while self.isRunning.value:
            try:
                client = self.serverTransport.accept()
                if not client:
                    # Some server transports return None when a connection
                    # could not be set up (e.g. a failed SSL handshake).
                    continue
                self.serveClient(client)
            except (KeyboardInterrupt, SystemExit):
                return 0
            except Exception as x:
                logging.exception(x)

    def serveClient(self, client):
        """Process input/output from a client for as long as possible"""
        itrans = self.inputTransportFactory.getTransport(client)
        otrans = self.outputTransportFactory.getTransport(client)
        iprot = self.inputProtocolFactory.getProtocol(itrans)
        oprot = self.outputProtocolFactory.getProtocol(otrans)

        try:
            while True:
                self.processor.process(iprot, oprot)
        except TTransportException:
            # Raised when the client goes away; this ends the session.
            pass
        except Exception as x:
            logging.exception(x)
        finally:
            # Close the transports even when the worker is being interrupted.
            itrans.close()
            otrans.close()

    def serve(self):
        """Start a fixed number of worker processes and wait for stop()"""

        # this is a shared state that can tell the workers to exit when set as false
        self.isRunning.value = True

        # first bind and listen to the port
        self.serverTransport.listen()

        # fork the children
        for _ in range(self.numWorkers):
            try:
                w = Process(target=self.workerProcess)
                w.daemon = True
                w.start()
                self.workers.append(w)
            except Exception as x:
                logging.exception(x)

        # wait until the condition is set by stop()
        while True:
            self.stopCondition.acquire()
            try:
                self.stopCondition.wait()
                break
            except (SystemExit, KeyboardInterrupt):
                break
            except Exception as x:
                logging.exception(x)
            finally:
                # Always hand the lock back so that a later stop() or serve()
                # call is not left holding on to a lock nobody releases.
                self.stopCondition.release()

        self.isRunning.value = False

    def stop(self):
        """Clear the running flag and wake up the process blocked in serve()."""
        self.isRunning.value = False
        self.stopCondition.acquire()
        try:
            self.stopCondition.notify()
        finally:
            self.stopCondition.release()
class TServer:
    """Base interface for a server, which must have a serve method.

    Three constructor forms are understood by all servers:
      1) (processor, serverTransport)
      2) (processor, serverTransport, transportFactory, protocolFactory)
      3) (processor, serverTransport,
          inputTransportFactory, outputTransportFactory,
          inputProtocolFactory, outputProtocolFactory)

    Any other number of positional arguments is accepted silently and
    leaves the instance without the attributes listed above.
    """

    def __init__(self, *args):
        count = len(args)
        if count == 2:
            # Form 1: default transport and binary protocol factories.
            processor, serverTransport = args
            transports = (TTransport.TTransportFactoryBase(),
                          TTransport.TTransportFactoryBase())
            protocols = (TBinaryProtocol.TBinaryProtocolFactory(),
                         TBinaryProtocol.TBinaryProtocolFactory())
        elif count == 4:
            # Form 2: one factory of each kind, shared by input and output.
            processor, serverTransport, shared_transport, shared_protocol = args
            transports = (shared_transport, shared_transport)
            protocols = (shared_protocol, shared_protocol)
        elif count == 6:
            # Form 3: everything spelled out by the caller.
            processor, serverTransport = args[:2]
            transports = args[2:4]
            protocols = args[4:6]
        else:
            return

        self.__initArgs__(processor, serverTransport,
                          transports[0], transports[1],
                          protocols[0], protocols[1])

    def __initArgs__(self, processor, serverTransport,
                     inputTransportFactory, outputTransportFactory,
                     inputProtocolFactory, outputProtocolFactory):
        """Store the processor, server transport and the four factories."""
        self.processor = processor
        self.serverTransport = serverTransport
        self.inputTransportFactory = inputTransportFactory
        self.outputTransportFactory = outputTransportFactory
        self.inputProtocolFactory = inputProtocolFactory
        self.outputProtocolFactory = outputProtocolFactory

    def serve(self):
        """Do nothing; concrete servers override this."""
        return None
class TThreadPoolServer(TServer):
    """Server with a fixed size pool of threads which service requests."""

    def __init__(self, *args, **kwargs):
        """Accept TServer's positional arguments plus an optional `daemon` flag.

        @param daemon: keyword-only; when true the worker threads are created
            as daemon threads. Defaults to False.
        """
        TServer.__init__(self, *args)
        # Accepted clients wait here until a worker thread picks them up.
        self.clients = Queue.Queue()
        self.threads = 10
        self.daemon = kwargs.get("daemon", False)

    def setNumThreads(self, num):
        """Set the number of worker threads that should be created"""
        self.threads = num

    def serveThread(self):
        """Loop around getting clients from the shared queue and process them."""
        while True:
            try:
                client = self.clients.get()
                self.serveClient(client)
            except Exception as x:
                logging.exception(x)

    def serveClient(self, client):
        """Process input/output from a client for as long as possible"""
        itrans = self.inputTransportFactory.getTransport(client)
        otrans = self.outputTransportFactory.getTransport(client)
        iprot = self.inputProtocolFactory.getProtocol(itrans)
        oprot = self.outputProtocolFactory.getProtocol(otrans)
        try:
            while True:
                self.processor.process(iprot, oprot)
        except TTransport.TTransportException:
            # Raised when the client goes away; this ends the session.
            pass
        except Exception as x:
            logging.exception(x)
        finally:
            # Close the transports on every exit path, not only the handled ones.
            itrans.close()
            otrans.close()

    def serve(self):
        """Start a fixed number of worker threads and put client into a queue"""
        for _ in range(self.threads):
            try:
                t = threading.Thread(target=self.serveThread)
                t.setDaemon(self.daemon)
                t.start()
            except Exception as x:
                logging.exception(x)

        # Pump the socket for clients
        self.serverTransport.listen()
        while True:
            try:
                client = self.serverTransport.accept()
                if not client:
                    # Some server transports return None when a connection
                    # could not be set up (e.g. a failed SSL handshake);
                    # there is nothing for a worker to serve in that case.
                    continue
                self.clients.put(client)
            except Exception as x:
                logging.exception(x)
def collect_children(self):
    """Reap exited child processes without blocking.

    Repeatedly polls os.waitpid(0, os.WNOHANG) and drops every reaped pid
    from self.children. Stops as soon as no child has exited, when
    waitpid fails, or when self.children is empty.
    """
    while self.children:
        try:
            pid, status = os.waitpid(0, os.WNOHANG)
        except os.error:
            pid = None

        if not pid:
            break

        try:
            self.children.remove(pid)
        except ValueError:
            # waitpid(0, ...) reports any child in our process group, which
            # may be one this server did not fork itself. Ignore it instead
            # of letting the error abort the accept loop that called us.
            pass
class THttpClient(TTransportBase):
    """Http implementation of TTransport base."""

    def __init__(
        self,
        uri_or_host,
        port=None,
        path=None,
        proxy_host=None,
        proxy_port=None
    ):
        """THttpClient supports two different types constructor parameters.

        THttpClient(host, port, path) - deprecated
        THttpClient(uri)

        Only the second supports https.

        THttpClient supports proxy
        THttpClient(host, port, path, proxy_host, proxy_port) - deprecated
        ThttpClient(uri, None, None, proxy_host, proxy_port)

        The proxy is used only when both proxy_host and proxy_port are given.
        """
        if port is not None:
            warnings.warn(
                "Please use the THttpClient('http://host:port/path') syntax",
                DeprecationWarning,
                stacklevel=2)
            self.host = uri_or_host
            self.port = port
            assert path
            self.path = path
            self.scheme = 'http'
        else:
            parsed = urlparse.urlparse(uri_or_host)
            self.scheme = parsed.scheme
            assert self.scheme in ('http', 'https')
            if self.scheme == 'http':
                self.port = parsed.port or httplib.HTTP_PORT
            elif self.scheme == 'https':
                self.port = parsed.port or httplib.HTTPS_PORT
            self.host = parsed.hostname
            self.path = parsed.path
            if parsed.query:
                self.path += '?%s' % parsed.query

        if proxy_host is not None and proxy_port is not None:
            # Connect to the proxy and request the absolute target URL.
            self.endpoint_host = proxy_host
            self.endpoint_port = proxy_port
            self.path = urlparse.urlunparse((
                self.scheme,
                "%s:%i" % (self.host, self.port),
                self.path,
                None,
                None,
                None
            ))
        else:
            self.endpoint_host = self.host
            self.endpoint_port = self.port

        self.__wbuf = StringIO()
        self.__http = None
        self.__timeout = None
        self.__headers = {}

    def open(self):
        """Create the HTTP(S) connection object for the configured endpoint."""
        protocol = httplib.HTTP if self.scheme == 'http' else httplib.HTTPS
        self.__http = protocol(self.endpoint_host, self.endpoint_port)

    def close(self):
        """Close the connection; a no-op when the transport is not open."""
        if self.__http is not None:
            self.__http.close()
            self.__http = None

    def isOpen(self):
        """Return True while a connection object is held."""
        return self.__http is not None

    def setTimeout(self, ms):
        """Set the timeout applied during flush(), in milliseconds.

        Pass None to disable the timeout. Raises NotImplementedError when
        the socket module has no default-timeout support.
        """
        if not hasattr(socket, 'getdefaulttimeout'):
            raise NotImplementedError

        if ms is None:
            self.__timeout = None
        else:
            self.__timeout = ms / 1000.0

    def read(self, sz):
        """Read up to *sz* bytes of the HTTP response body."""
        return self.__http.file.read(sz)

    def write(self, buf):
        """Buffer *buf*; nothing is sent until flush()."""
        self.__wbuf.write(buf)

    def __withTimeout(f):
        """Wrap *f* so it runs with this client's timeout as socket default."""
        def _f(*args, **kwargs):
            orig_timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(args[0].__timeout)
            try:
                return f(*args, **kwargs)
            finally:
                # The default timeout is process-wide: restore it even when
                # the request fails, otherwise unrelated sockets inherit it.
                socket.setdefaulttimeout(orig_timeout)
        return _f

    def addHeaders(self, **kwargs):
        """Add extra HTTP headers that are sent with every request."""
        self.__headers.update(kwargs)

    def flush(self):
        """POST the buffered data as one request and read the reply status."""
        # Each flush uses a fresh connection.
        if self.isOpen():
            self.close()
        self.open()

        # Pull data out of buffer
        data = self.__wbuf.getvalue()
        self.__wbuf = StringIO()

        # HTTP request
        self.__http.putrequest('POST', self.path)

        # Write headers
        self.__http.putheader('Host', self.host)
        self.__http.putheader('Content-Type', 'application/x-thrift')
        self.__http.putheader('Content-Length', str(len(data)))
        for key, value in self.__headers.items():
            self.__http.putheader(key, value)
        self.__http.endheaders()

        # Write payload
        self.__http.send(data)

        # Get reply to flush the request
        self.code, self.message, self.headers = self.__http.getreply()

    # Decorate if we know how to timeout
    if hasattr(socket, 'getdefaulttimeout'):
        flush = __withTimeout(flush)
class TSSLSocket(TSocket.TSocket):
    """
    SSL implementation of client-side TSocket

    This class creates outbound sockets wrapped using the
    python standard ssl module for encrypted connections.

    The protocol used is set using the class variable
    SSL_VERSION, which must be one of ssl.PROTOCOL_* and
    defaults to ssl.PROTOCOL_TLSv1.
    """
    SSL_VERSION = ssl.PROTOCOL_TLSv1

    def __init__(self, host='localhost', port=9090, validate=True, ca_certs=None, unix_socket=None):
        """
        @param validate: Set to False to disable SSL certificate validation entirely.
        @type validate: bool
        @param ca_certs: Filename to the Certificate Authority pem file, possibly a
        file downloaded from: http://curl.haxx.se/ca/cacert.pem  This is passed to
        the ssl_wrap function as the 'ca_certs' parameter.
        @type ca_certs: str

        Raises an IOError exception if validate is True and the ca_certs file is
        None, not present or unreadable.
        """
        self.validate = validate
        self.is_valid = False
        self.peercert = None
        self.ca_certs = ca_certs
        if validate:
            self.cert_reqs = ssl.CERT_REQUIRED
            if ca_certs is None or not os.access(ca_certs, os.R_OK):
                raise IOError('Certificate Authority ca_certs file "%s" is not readable, cannot validate SSL certificates.' % (ca_certs))
        else:
            self.cert_reqs = ssl.CERT_NONE
        TSocket.TSocket.__init__(self, host, port, unix_socket)

    def open(self):
        """Connect to the first reachable resolved address over SSL.

        Raises TTransportException (NOT_OPEN) when no address can be
        connected, and whatever _validate_cert raises when validation is
        enabled and fails. In both cases the transport is left closed.
        """
        try:
            res0 = self._resolveAddr()
            for res in res0:
                sock_family, sock_type = res[0:2]
                ip_port = res[4]
                plain_sock = socket.socket(sock_family, sock_type)
                self.handle = ssl.wrap_socket(plain_sock, ssl_version=self.SSL_VERSION,
                                              do_handshake_on_connect=True, ca_certs=self.ca_certs, cert_reqs=self.cert_reqs)
                self.handle.settimeout(self._timeout)
                try:
                    self.handle.connect(ip_port)
                except socket.error:
                    # Drop the failed socket so it is neither leaked nor
                    # mistaken for an open connection by isOpen().
                    self.close()
                    if res is not res0[-1]:
                        continue
                    raise
                break
        except socket.error:
            if self._unix_socket:
                message = 'Could not connect to secure socket %s' % self._unix_socket
            else:
                message = 'Could not connect to %s:%d' % (self.host, self.port)
            raise TTransportException(type=TTransportException.NOT_OPEN, message=message)
        if self.validate:
            try:
                self._validate_cert()
            except TTransportException:
                # Never leave a connection to an unverified peer usable.
                self.close()
                raise

    def _validate_cert(self):
        """internal method to validate the peer's SSL certificate, and to check the
        commonName of the certificate to ensure it matches the hostname we
        used to make this connection.  Does not support subjectAltName records
        in certificates.

        Only the first commonName entry found in the subject is compared.

        raises TTransportException if the certificate fails validation."""
        cert = self.handle.getpeercert()
        self.peercert = cert
        if 'subject' not in cert:
            raise TTransportException(type=TTransportException.NOT_OPEN,
                                      message='No SSL certificate found from %s:%s' % (self.host, self.port))
        fields = cert['subject']
        for field in fields:
            # ensure structure we get back is what we expect
            if not isinstance(field, tuple):
                continue
            cert_pair = field[0]
            if len(cert_pair) < 2:
                continue
            cert_key, cert_value = cert_pair[0:2]
            if cert_key != 'commonName':
                continue
            certhost = cert_value
            if certhost == self.host:
                # success, cert commonName matches desired hostname
                self.is_valid = True
                return
            raise TTransportException(type=TTransportException.UNKNOWN,
                                      message='Host name we connected to "%s" doesn\'t match certificate provided commonName "%s"' % (self.host, certhost))
        raise TTransportException(type=TTransportException.UNKNOWN,
                                  message='Could not validate SSL certificate from host "%s".  Cert=%s' % (self.host, cert))
def setCertfile(self, certfile):
    """Set or change the server certificate file used to wrap new connections.

    @param certfile: The filename of the server certificate, i.e. '/etc/certs/server.pem'
    @type certfile: str

    Raises an IOError exception if the certfile is not present or unreadable.
    """
    readable = os.access(certfile, os.R_OK)
    if readable:
        self.certfile = certfile
        return
    raise IOError('No such certfile found: %s' % (certfile))
class TSocket(TSocketBase):
    """Socket implementation of TTransport base."""

    def __init__(self, host='localhost', port=9090, unix_socket=None):
        """Initialize a TSocket

        @param host(str)  The host to connect to.
        @param port(int)  The (TCP) port to connect to.
        @param unix_socket(str)  The filename of a unix socket to connect to.
                                 (host and port will be ignored.)
        """
        self.host = host
        self.port = port
        self.handle = None
        self._unix_socket = unix_socket
        self._timeout = None

    def setHandle(self, h):
        """Adopt an already-connected socket object as this transport's handle."""
        self.handle = h

    def isOpen(self):
        """Return True while a socket handle is held."""
        return self.handle is not None

    def setTimeout(self, ms):
        """Set the socket timeout in milliseconds; None means no timeout.

        Applied immediately when a handle exists and to later open() calls.
        """
        if ms is None:
            self._timeout = None
        else:
            self._timeout = ms / 1000.0

        if self.handle is not None:
            self.handle.settimeout(self._timeout)

    def open(self):
        """Connect to the first reachable resolved address.

        Raises TTransportException (NOT_OPEN) when every address fails; the
        transport is left closed in that case.
        """
        try:
            res0 = self._resolveAddr()
            for res in res0:
                self.handle = socket.socket(res[0], res[1])
                self.handle.settimeout(self._timeout)
                try:
                    self.handle.connect(res[4])
                except socket.error:
                    # Drop the failed socket so it is neither leaked nor
                    # mistaken for an open connection by isOpen().
                    self.close()
                    if res is not res0[-1]:
                        continue
                    raise
                break
        except socket.error:
            if self._unix_socket:
                message = 'Could not connect to socket %s' % self._unix_socket
            else:
                message = 'Could not connect to %s:%d' % (self.host, self.port)
            raise TTransportException(type=TTransportException.NOT_OPEN, message=message)

    def read(self, sz):
        """Receive up to *sz* bytes.

        Raises TTransportException (END_OF_FILE) when nothing was received.
        """
        try:
            buff = self.handle.recv(sz)
        except socket.error as e:
            if (e.args[0] == errno.ECONNRESET and
                    (sys.platform == 'darwin' or sys.platform.startswith('freebsd'))):
                # freebsd and Mach don't follow POSIX semantic of recv
                # and fail with ECONNRESET if peer performed shutdown.
                # See corresponding comment and code in TSocket::read()
                # in lib/cpp/src/transport/TSocket.cpp.
                self.close()
                # Trigger the check to raise the END_OF_FILE exception below.
                buff = ''
            else:
                raise
        if len(buff) == 0:
            raise TTransportException(type=TTransportException.END_OF_FILE, message='TSocket read 0 bytes')
        return buff

    def write(self, buff):
        """Send all of *buff*, looping over partial sends.

        Raises TTransportException NOT_OPEN when there is no handle and
        END_OF_FILE when the socket accepts zero bytes.
        """
        if not self.handle:
            raise TTransportException(type=TTransportException.NOT_OPEN, message='Transport not open')
        sent = 0
        have = len(buff)
        while sent < have:
            plus = self.handle.send(buff)
            if plus == 0:
                raise TTransportException(type=TTransportException.END_OF_FILE, message='TSocket sent 0 bytes')
            sent += plus
            buff = buff[plus:]

    def flush(self):
        """Do nothing; write() sends data immediately."""
        pass
143 | if self._unix_socket: 144 | tmp = socket.socket(res[0], res[1]) 145 | try: 146 | tmp.connect(res[4]) 147 | except socket.error, err: 148 | eno, message = err.args 149 | if eno == errno.ECONNREFUSED: 150 | os.unlink(res[4]) 151 | 152 | self.handle = socket.socket(res[0], res[1]) 153 | self.handle.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 154 | if hasattr(self.handle, 'settimeout'): 155 | self.handle.settimeout(None) 156 | self.handle.bind(res[4]) 157 | self.handle.listen(128) 158 | 159 | def accept(self): 160 | client, addr = self.handle.accept() 161 | result = TSocket() 162 | result.setHandle(client) 163 | return result 164 | -------------------------------------------------------------------------------- /thrift/transport/TTransport.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
18 | # 19 | 20 | from cStringIO import StringIO 21 | from struct import pack,unpack 22 | from thrift.Thrift import TException 23 | 24 | class TTransportException(TException): 25 | 26 | """Custom Transport Exception class""" 27 | 28 | UNKNOWN = 0 29 | NOT_OPEN = 1 30 | ALREADY_OPEN = 2 31 | TIMED_OUT = 3 32 | END_OF_FILE = 4 33 | 34 | def __init__(self, type=UNKNOWN, message=None): 35 | TException.__init__(self, message) 36 | self.type = type 37 | 38 | class TTransportBase: 39 | 40 | """Base class for Thrift transport layer.""" 41 | 42 | def isOpen(self): 43 | pass 44 | 45 | def open(self): 46 | pass 47 | 48 | def close(self): 49 | pass 50 | 51 | def read(self, sz): 52 | pass 53 | 54 | def readAll(self, sz): 55 | buff = '' 56 | have = 0 57 | while (have < sz): 58 | chunk = self.read(sz-have) 59 | have += len(chunk) 60 | buff += chunk 61 | 62 | if len(chunk) == 0: 63 | raise EOFError() 64 | 65 | return buff 66 | 67 | def write(self, buf): 68 | pass 69 | 70 | def flush(self): 71 | pass 72 | 73 | # This class should be thought of as an interface. 74 | class CReadableTransport: 75 | """base class for transports that are readable from C""" 76 | 77 | # TODO(dreiss): Think about changing this interface to allow us to use 78 | # a (Python, not c) StringIO instead, because it allows 79 | # you to write after reading. 80 | 81 | # NOTE: This is a classic class, so properties will NOT work 82 | # correctly for setting. 83 | @property 84 | def cstringio_buf(self): 85 | """A cStringIO buffer that contains the current chunk we are reading.""" 86 | pass 87 | 88 | def cstringio_refill(self, partialread, reqlen): 89 | """Refills cstringio_buf. 90 | 91 | Returns the currently used buffer (which can but need not be the same as 92 | the old cstringio_buf). partialread is what the C code has read from the 93 | buffer, and should be inserted into the buffer before any more reads. The 94 | return value must be a new, not borrowed reference. 
Something along the 95 | lines of self._buf should be fine. 96 | 97 | If reqlen bytes can't be read, throw EOFError. 98 | """ 99 | pass 100 | 101 | class TServerTransportBase: 102 | 103 | """Base class for Thrift server transports.""" 104 | 105 | def listen(self): 106 | pass 107 | 108 | def accept(self): 109 | pass 110 | 111 | def close(self): 112 | pass 113 | 114 | class TTransportFactoryBase: 115 | 116 | """Base class for a Transport Factory""" 117 | 118 | def getTransport(self, trans): 119 | return trans 120 | 121 | class TBufferedTransportFactory: 122 | 123 | """Factory transport that builds buffered transports""" 124 | 125 | def getTransport(self, trans): 126 | buffered = TBufferedTransport(trans) 127 | return buffered 128 | 129 | 130 | class TBufferedTransport(TTransportBase,CReadableTransport): 131 | 132 | """Class that wraps another transport and buffers its I/O. 133 | 134 | The implementation uses a (configurable) fixed-size read buffer 135 | but buffers all writes until a flush is performed. 
136 | """ 137 | 138 | DEFAULT_BUFFER = 4096 139 | 140 | def __init__(self, trans, rbuf_size = DEFAULT_BUFFER): 141 | self.__trans = trans 142 | self.__wbuf = StringIO() 143 | self.__rbuf = StringIO("") 144 | self.__rbuf_size = rbuf_size 145 | 146 | def isOpen(self): 147 | return self.__trans.isOpen() 148 | 149 | def open(self): 150 | return self.__trans.open() 151 | 152 | def close(self): 153 | return self.__trans.close() 154 | 155 | def read(self, sz): 156 | ret = self.__rbuf.read(sz) 157 | if len(ret) != 0: 158 | return ret 159 | 160 | self.__rbuf = StringIO(self.__trans.read(max(sz, self.__rbuf_size))) 161 | return self.__rbuf.read(sz) 162 | 163 | def write(self, buf): 164 | self.__wbuf.write(buf) 165 | 166 | def flush(self): 167 | out = self.__wbuf.getvalue() 168 | # reset wbuf before write/flush to preserve state on underlying failure 169 | self.__wbuf = StringIO() 170 | self.__trans.write(out) 171 | self.__trans.flush() 172 | 173 | # Implement the CReadableTransport interface. 174 | @property 175 | def cstringio_buf(self): 176 | return self.__rbuf 177 | 178 | def cstringio_refill(self, partialread, reqlen): 179 | retstring = partialread 180 | if reqlen < self.__rbuf_size: 181 | # try to make a read of as much as we can. 182 | retstring += self.__trans.read(self.__rbuf_size) 183 | 184 | # but make sure we do read reqlen bytes. 185 | if len(retstring) < reqlen: 186 | retstring += self.__trans.readAll(reqlen - len(retstring)) 187 | 188 | self.__rbuf = StringIO(retstring) 189 | return self.__rbuf 190 | 191 | class TMemoryBuffer(TTransportBase, CReadableTransport): 192 | """Wraps a cStringIO object as a TTransport. 193 | 194 | NOTE: Unlike the C++ version of this class, you cannot write to it 195 | then immediately read from it. If you want to read from a 196 | TMemoryBuffer, you must either pass a string to the constructor. 197 | TODO(dreiss): Make this work like the C++ version. 
198 | """ 199 | 200 | def __init__(self, value=None): 201 | """value -- a value to read from for stringio 202 | 203 | If value is set, this will be a transport for reading, 204 | otherwise, it is for writing""" 205 | if value is not None: 206 | self._buffer = StringIO(value) 207 | else: 208 | self._buffer = StringIO() 209 | 210 | def isOpen(self): 211 | return not self._buffer.closed 212 | 213 | def open(self): 214 | pass 215 | 216 | def close(self): 217 | self._buffer.close() 218 | 219 | def read(self, sz): 220 | return self._buffer.read(sz) 221 | 222 | def write(self, buf): 223 | self._buffer.write(buf) 224 | 225 | def flush(self): 226 | pass 227 | 228 | def getvalue(self): 229 | return self._buffer.getvalue() 230 | 231 | # Implement the CReadableTransport interface. 232 | @property 233 | def cstringio_buf(self): 234 | return self._buffer 235 | 236 | def cstringio_refill(self, partialread, reqlen): 237 | # only one shot at reading... 238 | raise EOFError() 239 | 240 | class TFramedTransportFactory: 241 | 242 | """Factory transport that builds framed transports""" 243 | 244 | def getTransport(self, trans): 245 | framed = TFramedTransport(trans) 246 | return framed 247 | 248 | 249 | class TFramedTransport(TTransportBase, CReadableTransport): 250 | 251 | """Class that wraps another transport and frames its I/O when writing.""" 252 | 253 | def __init__(self, trans,): 254 | self.__trans = trans 255 | self.__rbuf = StringIO() 256 | self.__wbuf = StringIO() 257 | 258 | def isOpen(self): 259 | return self.__trans.isOpen() 260 | 261 | def open(self): 262 | return self.__trans.open() 263 | 264 | def close(self): 265 | return self.__trans.close() 266 | 267 | def read(self, sz): 268 | ret = self.__rbuf.read(sz) 269 | if len(ret) != 0: 270 | return ret 271 | 272 | self.readFrame() 273 | return self.__rbuf.read(sz) 274 | 275 | def readFrame(self): 276 | buff = self.__trans.readAll(4) 277 | sz, = unpack('!i', buff) 278 | self.__rbuf = StringIO(self.__trans.readAll(sz)) 279 | 
280 | def write(self, buf): 281 | self.__wbuf.write(buf) 282 | 283 | def flush(self): 284 | wout = self.__wbuf.getvalue() 285 | wsz = len(wout) 286 | # reset wbuf before write/flush to preserve state on underlying failure 287 | self.__wbuf = StringIO() 288 | # N.B.: Doing this string concatenation is WAY cheaper than making 289 | # two separate calls to the underlying socket object. Socket writes in 290 | # Python turn out to be REALLY expensive, but it seems to do a pretty 291 | # good job of managing string buffer operations without excessive copies 292 | buf = pack("!i", wsz) + wout 293 | self.__trans.write(buf) 294 | self.__trans.flush() 295 | 296 | # Implement the CReadableTransport interface. 297 | @property 298 | def cstringio_buf(self): 299 | return self.__rbuf 300 | 301 | def cstringio_refill(self, prefix, reqlen): 302 | # self.__rbuf will already be empty here because fastbinary doesn't 303 | # ask for a refill until the previous buffer is empty. Therefore, 304 | # we can start reading new frames immediately. 
305 | while len(prefix) < reqlen: 306 | self.readFrame() 307 | prefix += self.__rbuf.getvalue() 308 | self.__rbuf = StringIO(prefix) 309 | return self.__rbuf 310 | 311 | 312 | class TFileObjectTransport(TTransportBase): 313 | """Wraps a file-like object to make it work as a Thrift transport.""" 314 | 315 | def __init__(self, fileobj): 316 | self.fileobj = fileobj 317 | 318 | def isOpen(self): 319 | return True 320 | 321 | def close(self): 322 | self.fileobj.close() 323 | 324 | def read(self, sz): 325 | return self.fileobj.read(sz) 326 | 327 | def write(self, buf): 328 | self.fileobj.write(buf) 329 | 330 | def flush(self): 331 | self.fileobj.flush() 332 | -------------------------------------------------------------------------------- /thrift/transport/TTwisted.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 
#
from zope.interface import implements, Interface, Attribute
from twisted.internet.protocol import Protocol, ServerFactory, ClientFactory, \
    connectionDone
from twisted.internet import defer
from twisted.protocols import basic
from twisted.python import log
from twisted.web import server, resource, http

from thrift.transport import TTransport
from cStringIO import StringIO


class TMessageSenderTransport(TTransport.TTransportBase):
    """Write-only transport that buffers writes and hands each flushed
    message to sendMessage(), which subclasses must implement."""

    def __init__(self):
        self.__wbuf = StringIO()

    def write(self, buf):
        self.__wbuf.write(buf)

    def flush(self):
        """Pass everything written since the last flush to sendMessage()."""
        msg = self.__wbuf.getvalue()
        self.__wbuf = StringIO()
        self.sendMessage(msg)

    def sendMessage(self, message):
        raise NotImplementedError


class TCallbackTransport(TMessageSenderTransport):
    """Message-sender transport that delivers each message to a callable."""

    def __init__(self, func):
        TMessageSenderTransport.__init__(self)
        self.func = func

    def sendMessage(self, message):
        self.func(message)


class ThriftClientProtocol(basic.Int32StringReceiver):
    """Twisted protocol that drives a Thrift client over int32-framed strings."""

    MAX_LENGTH = 2 ** 31 - 1

    def __init__(self, client_class, iprot_factory, oprot_factory=None):
        """Remember how to build the client.

        oprot_factory defaults to iprot_factory.  ``started`` is a Deferred
        that connectionMade() fires with the client instance.
        """
        self._client_class = client_class
        self._iprot_factory = iprot_factory
        if oprot_factory is None:
            self._oprot_factory = iprot_factory
        else:
            self._oprot_factory = oprot_factory

        self.recv_map = {}
        self.started = defer.Deferred()

    def dispatch(self, msg):
        self.sendString(msg)

    def connectionMade(self):
        """Create the client on top of a callback transport and announce it."""
        tmo = TCallbackTransport(self.dispatch)
        self.client = self._client_class(tmo, self._oprot_factory)
        self.started.callback(self.client)

    def connectionLost(self, reason=connectionDone):
        """Fail every request recorded in client._reqs with END_OF_FILE."""
        # Iterate over a snapshot: errbacks run synchronously and may add or
        # remove entries in client._reqs, which would break iteration over
        # the live dict.
        for v in list(self.client._reqs.values()):
            tex = TTransport.TTransportException(
                type=TTransport.TTransportException.END_OF_FILE,
                message='Connection closed')
            v.errback(tex)

    def stringReceived(self, frame):
        """Decode one reply frame and route it to the client's recv_<name>."""
        tr = TTransport.TMemoryBuffer(frame)
        iprot = self._iprot_factory.getProtocol(tr)
        (fname, mtype, rseqid) = iprot.readMessageBegin()

        # Cache the bound recv_ method per function name.
        try:
            method = self.recv_map[fname]
        except KeyError:
            method = getattr(self.client, 'recv_' + fname)
            self.recv_map[fname] = method

        method(iprot, mtype, rseqid)


class ThriftServerProtocol(basic.Int32StringReceiver):
    """Twisted protocol that feeds int32-framed requests to a Thrift processor."""

    MAX_LENGTH = 2 ** 31 - 1

    def dispatch(self, msg):
        self.sendString(msg)

    def processError(self, error):
        """Drop the connection when processing a request failed."""
        self.transport.loseConnection()

    def processOk(self, _, tmo):
        """Send the reply collected in tmo, unless it is empty."""
        msg = tmo.getvalue()

        if len(msg) > 0:
            self.dispatch(msg)

    def stringReceived(self, frame):
        """Run the factory's processor on one request frame."""
        tmi = TTransport.TMemoryBuffer(frame)
        tmo = TTransport.TMemoryBuffer()

        iprot = self.factory.iprot_factory.getProtocol(tmi)
        oprot = self.factory.oprot_factory.getProtocol(tmo)

        d = self.factory.processor.process(iprot, oprot)
        d.addCallbacks(self.processOk, self.processError,
                       callbackArgs=(tmo,))


class IThriftServerFactory(Interface):

    processor = Attribute("Thrift processor")

    iprot_factory = Attribute("Input protocol factory")

    oprot_factory = Attribute("Output protocol factory")


class IThriftClientFactory(Interface):

    client_class = Attribute("Thrift client class")

    iprot_factory = Attribute("Input protocol factory")

    oprot_factory = Attribute("Output protocol factory")


class ThriftServerFactory(ServerFactory):
    """Server factory holding the processor and protocol factories."""

    implements(IThriftServerFactory)

    protocol = ThriftServerProtocol

    def __init__(self, processor, iprot_factory, oprot_factory=None):
        """oprot_factory defaults to iprot_factory."""
        self.processor = processor
        self.iprot_factory = iprot_factory
        if oprot_factory is None:
            self.oprot_factory = iprot_factory
        else:
            self.oprot_factory = oprot_factory


class ThriftClientFactory(ClientFactory):
    """Client factory that builds ThriftClientProtocol instances."""

    implements(IThriftClientFactory)

    protocol = ThriftClientProtocol

    def __init__(self, client_class, iprot_factory, oprot_factory=None):
        """oprot_factory defaults to iprot_factory."""
        self.client_class = client_class
        self.iprot_factory = iprot_factory
        if oprot_factory is None:
            self.oprot_factory = iprot_factory
        else:
            self.oprot_factory = oprot_factory

    def buildProtocol(self, addr):
        """Create a protocol configured from this factory's settings."""
        p = self.protocol(self.client_class, self.iprot_factory,
                          self.oprot_factory)
        p.factory = self
        return p


class ThriftResource(resource.Resource):
    """twisted.web resource that serves Thrift calls sent as POST bodies."""

    allowedMethods = ('POST',)

    def __init__(self, processor, inputProtocolFactory,
                 outputProtocolFactory=None):
        """outputProtocolFactory defaults to inputProtocolFactory."""
        resource.Resource.__init__(self)
        self.inputProtocolFactory = inputProtocolFactory
        if outputProtocolFactory is None:
            self.outputProtocolFactory = inputProtocolFactory
        else:
            self.outputProtocolFactory = outputProtocolFactory
        self.processor = processor

    def getChild(self, path, request):
        # Every sub-path is answered by this same resource.
        return self

    def _cbProcess(self, _, request, tmo):
        """Write the reply collected in tmo as the HTTP response and finish."""
        msg = tmo.getvalue()
        request.setResponseCode(http.OK)
        request.setHeader("content-type", "application/x-thrift")
        request.write(msg)
        request.finish()

    def render_POST(self, request):
        """Process the request body; the response is written asynchronously
        by _cbProcess once the processor's Deferred fires."""
        request.content.seek(0, 0)
        data = request.content.read()
        tmi = TTransport.TMemoryBuffer(data)
        tmo = TTransport.TMemoryBuffer()

        iprot = self.inputProtocolFactory.getProtocol(tmi)
        oprot = self.outputProtocolFactory.getProtocol(tmo)

        d = self.processor.process(iprot, oprot)
        d.addCallback(self._cbProcess, request, tmo)
        return server.NOT_DONE_YET

-------------------------------------------------------------------------------- /thrift/transport/TZlibTransport.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | ''' 20 | TZlibTransport provides a compressed transport and transport factory 21 | class, using the python standard library zlib module to implement 22 | data compression. 23 | ''' 24 | 25 | from __future__ import division 26 | import zlib 27 | from cStringIO import StringIO 28 | from TTransport import TTransportBase, CReadableTransport 29 | 30 | class TZlibTransportFactory(object): 31 | ''' 32 | Factory transport that builds zlib compressed transports. 33 | 34 | This factory caches the last single client/transport that it was passed 35 | and returns the same TZlibTransport object that was created. 36 | 37 | This caching means the TServer class will get the _same_ transport 38 | object for both input and output transports from this factory. 
39 | (For non-threaded scenarios only, since the cache only holds one object) 40 | 41 | The purpose of this caching is to allocate only one TZlibTransport where 42 | only one is really needed (since it must have separate read/write buffers), 43 | and makes the statistics from getCompSavings() and getCompRatio() 44 | easier to understand. 45 | ''' 46 | 47 | # class scoped cache of last transport given and zlibtransport returned 48 | _last_trans = None 49 | _last_z = None 50 | 51 | def getTransport(self, trans, compresslevel=9): 52 | '''Wrap a transport , trans, with the TZlibTransport 53 | compressed transport class, returning a new 54 | transport to the caller. 55 | 56 | @param compresslevel: The zlib compression level, ranging 57 | from 0 (no compression) to 9 (best compression). Defaults to 9. 58 | @type compresslevel: int 59 | 60 | This method returns a TZlibTransport which wraps the 61 | passed C{trans} TTransport derived instance. 62 | ''' 63 | if trans == self._last_trans: 64 | return self._last_z 65 | ztrans = TZlibTransport(trans, compresslevel) 66 | self._last_trans = trans 67 | self._last_z = ztrans 68 | return ztrans 69 | 70 | 71 | class TZlibTransport(TTransportBase, CReadableTransport): 72 | ''' 73 | Class that wraps a transport with zlib, compressing writes 74 | and decompresses reads, using the python standard 75 | library zlib module. 76 | ''' 77 | 78 | # Read buffer size for the python fastbinary C extension, 79 | # the TBinaryProtocolAccelerated class. 80 | DEFAULT_BUFFSIZE = 4096 81 | 82 | def __init__(self, trans, compresslevel=9): 83 | ''' 84 | Create a new TZlibTransport, wrapping C{trans}, another 85 | TTransport derived object. 86 | 87 | @param trans: A thrift transport object, i.e. a TSocket() object. 88 | @type trans: TTransport 89 | @param compresslevel: The zlib compression level, ranging 90 | from 0 (no compression) to 9 (best compression). Default is 9. 
91 | @type compresslevel: int 92 | ''' 93 | self.__trans = trans 94 | self.compresslevel = compresslevel 95 | self.__rbuf = StringIO() 96 | self.__wbuf = StringIO() 97 | self._init_zlib() 98 | self._init_stats() 99 | 100 | def _reinit_buffers(self): 101 | ''' 102 | Internal method to initialize/reset the internal StringIO objects 103 | for read and write buffers. 104 | ''' 105 | self.__rbuf = StringIO() 106 | self.__wbuf = StringIO() 107 | 108 | def _init_stats(self): 109 | ''' 110 | Internal method to reset the internal statistics counters 111 | for compression ratios and bandwidth savings. 112 | ''' 113 | self.bytes_in = 0 114 | self.bytes_out = 0 115 | self.bytes_in_comp = 0 116 | self.bytes_out_comp = 0 117 | 118 | def _init_zlib(self): 119 | ''' 120 | Internal method for setting up the zlib compression and 121 | decompression objects. 122 | ''' 123 | self._zcomp_read = zlib.decompressobj() 124 | self._zcomp_write = zlib.compressobj(self.compresslevel) 125 | 126 | def getCompRatio(self): 127 | ''' 128 | Get the current measured compression ratios (in,out) from 129 | this transport. 130 | 131 | Returns a tuple of: 132 | (inbound_compression_ratio, outbound_compression_ratio) 133 | 134 | The compression ratios are computed as: 135 | compressed / uncompressed 136 | 137 | E.g., data that compresses by 10x will have a ratio of: 0.10 138 | and data that compresses to half of ts original size will 139 | have a ratio of 0.5 140 | 141 | None is returned if no bytes have yet been processed in 142 | a particular direction. 143 | ''' 144 | r_percent, w_percent = (None, None) 145 | if self.bytes_in > 0: 146 | r_percent = self.bytes_in_comp / self.bytes_in 147 | if self.bytes_out > 0: 148 | w_percent = self.bytes_out_comp / self.bytes_out 149 | return (r_percent, w_percent) 150 | 151 | def getCompSavings(self): 152 | ''' 153 | Get the current count of saved bytes due to data 154 | compression. 
155 | 156 | Returns a tuple of: 157 | (inbound_saved_bytes, outbound_saved_bytes) 158 | 159 | Note: if compression is actually expanding your 160 | data (only likely with very tiny thrift objects), then 161 | the values returned will be negative. 162 | ''' 163 | r_saved = self.bytes_in - self.bytes_in_comp 164 | w_saved = self.bytes_out - self.bytes_out_comp 165 | return (r_saved, w_saved) 166 | 167 | def isOpen(self): 168 | '''Return the underlying transport's open status''' 169 | return self.__trans.isOpen() 170 | 171 | def open(self): 172 | """Open the underlying transport""" 173 | self._init_stats() 174 | return self.__trans.open() 175 | 176 | def listen(self): 177 | '''Invoke the underlying transport's listen() method''' 178 | self.__trans.listen() 179 | 180 | def accept(self): 181 | '''Accept connections on the underlying transport''' 182 | return self.__trans.accept() 183 | 184 | def close(self): 185 | '''Close the underlying transport,''' 186 | self._reinit_buffers() 187 | self._init_zlib() 188 | return self.__trans.close() 189 | 190 | def read(self, sz): 191 | ''' 192 | Read up to sz bytes from the decompressed bytes buffer, and 193 | read from the underlying transport if the decompression 194 | buffer is empty. 
195 | ''' 196 | ret = self.__rbuf.read(sz) 197 | if len(ret) > 0: 198 | return ret 199 | # keep reading from transport until something comes back 200 | while True: 201 | if self.readComp(sz): 202 | break 203 | ret = self.__rbuf.read(sz) 204 | return ret 205 | 206 | def readComp(self, sz): 207 | ''' 208 | Read compressed data from the underlying transport, then 209 | decompress it and append it to the internal StringIO read buffer 210 | ''' 211 | zbuf = self.__trans.read(sz) 212 | zbuf = self._zcomp_read.unconsumed_tail + zbuf 213 | buf = self._zcomp_read.decompress(zbuf) 214 | self.bytes_in += len(zbuf) 215 | self.bytes_in_comp += len(buf) 216 | old = self.__rbuf.read() 217 | self.__rbuf = StringIO(old + buf) 218 | if len(old) + len(buf) == 0: 219 | return False 220 | return True 221 | 222 | def write(self, buf): 223 | ''' 224 | Write some bytes, putting them into the internal write 225 | buffer for eventual compression. 226 | ''' 227 | self.__wbuf.write(buf) 228 | 229 | def flush(self): 230 | ''' 231 | Flush any queued up data in the write buffer and ensure the 232 | compression buffer is flushed out to the underlying transport 233 | ''' 234 | wout = self.__wbuf.getvalue() 235 | if len(wout) > 0: 236 | zbuf = self._zcomp_write.compress(wout) 237 | self.bytes_out += len(wout) 238 | self.bytes_out_comp += len(zbuf) 239 | else: 240 | zbuf = '' 241 | ztail = self._zcomp_write.flush(zlib.Z_SYNC_FLUSH) 242 | self.bytes_out_comp += len(ztail) 243 | if (len(zbuf) + len(ztail)) > 0: 244 | self.__wbuf = StringIO() 245 | self.__trans.write(zbuf + ztail) 246 | self.__trans.flush() 247 | 248 | @property 249 | def cstringio_buf(self): 250 | '''Implement the CReadableTransport interface''' 251 | return self.__rbuf 252 | 253 | def cstringio_refill(self, partialread, reqlen): 254 | '''Implement the CReadableTransport interface for refill''' 255 | retstring = partialread 256 | if reqlen < self.DEFAULT_BUFFSIZE: 257 | retstring += self.read(self.DEFAULT_BUFFSIZE) 258 | while 
len(retstring) < reqlen: 259 | retstring += self.read(reqlen - len(retstring)) 260 | self.__rbuf = StringIO(retstring) 261 | return self.__rbuf 262 | -------------------------------------------------------------------------------- /thrift/transport/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | # 19 | 20 | __all__ = ['TTransport', 'TSocket', 'THttpClient','TZlibTransport'] 21 | --------------------------------------------------------------------------------