├── README.md ├── app.yaml ├── cron.py ├── cron.yaml ├── css ├── images │ ├── css-full-text-button.png │ ├── css-full-textindex.png │ ├── index-button.png │ ├── index.png │ ├── ui-bg_diagonals-thick_18_b81900_40x40.png │ ├── ui-bg_diagonals-thick_20_666666_40x40.png │ ├── ui-bg_flat_10_000000_40x100.png │ ├── ui-bg_glass_100_f6f6f6_1x400.png │ ├── ui-bg_glass_100_fdf5ce_1x400.png │ ├── ui-bg_glass_65_ffffff_1x400.png │ ├── ui-bg_gloss-wave_35_f6a828_500x100.png │ ├── ui-bg_highlight-soft_100_eeeeee_1x100.png │ ├── ui-bg_highlight-soft_75_ffe45c_1x100.png │ ├── ui-icons_222222_256x240.png │ ├── ui-icons_228ef1_256x240.png │ ├── ui-icons_ef8c08_256x240.png │ ├── ui-icons_ffd27a_256x240.png │ └── ui-icons_ffffff_256x240.png ├── jquery-ui-1.8.9.custom.css └── style.css ├── css_selector.py ├── doc ├── RSS 2.0 Specification (version 2.0.11).html ├── RSS 2.0 Specification (version 2.0.11)_files │ ├── abg-en-100c-000000.png │ ├── ads.html │ ├── expansion_embed.js │ ├── flowers.gif │ ├── left-arrow.gif │ ├── rss-advisory-board.gif │ ├── rss-icon.png │ ├── show_ads.js │ ├── show_ads_impl.js │ ├── test_domain.js │ └── urchin.js ├── cnbeta.xml ├── feed ├── sample-rss-2.xml └── test.xml ├── favicon.ico ├── feedformatter.py ├── feedparser.py ├── fetcher.py ├── fix_path.py ├── front.py ├── html5lib ├── __init__.py ├── chardet │ ├── __init__.py │ ├── big5freq.py │ ├── big5prober.py │ ├── chardistribution.py │ ├── charsetgroupprober.py │ ├── charsetprober.py │ ├── codingstatemachine.py │ ├── constants.py │ ├── escprober.py │ ├── escsm.py │ ├── eucjpprober.py │ ├── euckrfreq.py │ ├── euckrprober.py │ ├── euctwfreq.py │ ├── euctwprober.py │ ├── gb2312freq.py │ ├── gb2312prober.py │ ├── hebrewprober.py │ ├── jisfreq.py │ ├── jpcntx.py │ ├── langbulgarianmodel.py │ ├── langcyrillicmodel.py │ ├── langgreekmodel.py │ ├── langhebrewmodel.py │ ├── langhungarianmodel.py │ ├── langthaimodel.py │ ├── latin1prober.py │ ├── mbcharsetprober.py │ ├── mbcsgroupprober.py │ ├── mbcssm.py │ ├── sbcharsetprober.py │ ├── sbcsgroupprober.py │ ├── sjisprober.py │ ├── test.py │ ├── universaldetector.py │ └── utf8prober.py ├── constants.py ├── filters │ ├── __init__.py │ ├── _base.py │ ├── formfiller.py │ ├── fullurl.py │ ├── inject_meta_charset.py │ ├── lint.py │ ├── optionaltags.py │ ├── sanitizer.py │ └── whitespace.py ├── html5parser.py ├── ihatexml.py ├── inputstream.py ├── sanitizer.py ├── serializer │ ├── __init__.py │ ├── htmlserializer.py │ └── xhtmlserializer.py ├── tokenizer.py ├── tokenizer_old.py ├── treebuilders │ ├── __init__.py │ ├── _base.py │ ├── dom.py │ ├── etree.py │ ├── etree_lxml.py │ ├── simpletree.py │ └── soup.py ├── treewalkers │ ├── __init__.py │ ├── _base.py │ ├── dom.py │ ├── etree.py │ ├── genshistream.py │ ├── lxmletree.py │ ├── pulldom.py │ ├── simpletree.py │ └── soup.py └── utils.py ├── images ├── index.png ├── ui-bg_diagonals-thick_18_b81900_40x40.png ├── ui-bg_diagonals-thick_20_666666_40x40.png ├── ui-bg_flat_10_000000_40x100.png ├── ui-bg_glass_100_f6f6f6_1x400.png ├── ui-bg_glass_100_fdf5ce_1x400.png ├── ui-bg_glass_65_ffffff_1x400.png ├── ui-bg_gloss-wave_35_f6a828_500x100.png ├── ui-bg_highlight-soft_100_eeeeee_1x100.png ├── ui-bg_highlight-soft_75_ffe45c_1x100.png ├── ui-icons_222222_256x240.png ├── ui-icons_228ef1_256x240.png ├── ui-icons_ef8c08_256x240.png ├── ui-icons_ffd27a_256x240.png └── ui-icons_ffffff_256x240.png ├── index.yaml ├── main.py ├── project.py ├── queue.yaml ├── template ├── base.html ├── editor.html ├── index.html ├── search.html ├── style.css └── test.html ├── test.py └── tmp /README.md: -------------------------------------------------------------------------------- 1 | # cssfulltext 2 | source code of http://css-fulltext.appspot.com/ 3 | 4 | - 实时的 RSS 全文转换器 5 | - 基于 CSS选择器 选择正文区域,去除广告 6 | - 2011年旧代码 7 | - 运行于 GAE 8 | - 可能 Python 2.5 9 | - 使用 Google 账户登录 (OpenID 2.0 将于 2015年 4月关闭) 10 | -------------------------------------------------------------------------------- /app.yaml: -------------------------------------------------------------------------------- 1 | application: css-fulltext 2 | version: 1 3 | runtime: python 4 | api_version: 1 5 | 6 | handlers: 7 | - url: /cron/.* 8 | script: cron.py 9 | login: admin 10 | 11 | - url: /worker/.* 12 | script: cron.py 13 | login: admin 14 | 15 | - url: /css 16 | static_dir: css 17 | 18 | - url: /images 19 | static_dir: images 20 | 21 | - url: /.* 22 | script: front.py 23 | -------------------------------------------------------------------------------- /cron.py: -------------------------------------------------------------------------------- 1 | #!/url/bin/python 2 | #-- coding: utf-8 -- 3 | ''' 4 | Create on 2011.2.3 5 | 6 | @author: binux 7 | ''' 8 | 9 | import datetime 10 | 11 | from google.appengine.api import taskqueue 12 | from google.appengine.ext import db 13 | from google.appengine.ext import webapp 14 | from google.appengine.ext.webapp.util import run_wsgi_app 15 | 16 | from project import Project, updateProject 17 | from fetcher import DescriptionCache 18 | 19 | class updateProjects(webapp.RequestHandler): 20 | def get(self): 21 | projects = db.GqlQuery("SELECT __key__ FROM Project WHERE nextUpdateDate < :1", datetime.datetime.now()) 22 | for key in projects: 23 | task = taskqueue.Task(url='/worker/update_project', params={'key': key.id()}) 24 | task.add('project') 25 | 26 | class removeCache(webapp.RequestHandler): 27 | def get(self): 28 | task = taskqueue.add(url='/worker/remove_cache') 29 | 30 | def post(self): 31 | q = db.GqlQuery("SELECT __key__ FROM DescriptionCache WHERE lastVisitedDate < :1", 32 | datetime.datetime.now() - datetime.timedelta(days=7)) 33 | r = q.fetch(q.count()) 34 | db.delete(r) 35 | 36 | class updateProjectWorker(webapp.RequestHandler): 37 | def post(self): 38 | key = self.request.get('key') 39 | project = Project.get_by_id(key) 40 | if project: 41 | updateProject(project) 42 | else: 43 | logging.warning("Unknow project key: %s" % key) 44 | 45 | def main(): 46 | run_wsgi_app(webapp.WSGIApplication([ 47 | ('/cron/remove_cache', removeCache), 48 | ('/worker/remove_cache', removeCache), 49 | ('/cron/update_project', updateProjects), 50 | ('/worker/update_project', updateProjectWorker), 51 | ], debug=True)) 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /cron.yaml: -------------------------------------------------------------------------------- 1 | cron: 2 | - description: daily summary job ( remove cache ) 3 | url: /cron/remove_cache 4 | schedule: every 24 hours 5 | -------------------------------------------------------------------------------- /css/images/css-full-text-button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/css-full-text-button.png -------------------------------------------------------------------------------- /css/images/css-full-textindex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/css-full-textindex.png -------------------------------------------------------------------------------- /css/images/index-button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/index-button.png -------------------------------------------------------------------------------- /css/images/index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/index.png -------------------------------------------------------------------------------- /css/images/ui-bg_diagonals-thick_18_b81900_40x40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_diagonals-thick_18_b81900_40x40.png -------------------------------------------------------------------------------- /css/images/ui-bg_diagonals-thick_20_666666_40x40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_diagonals-thick_20_666666_40x40.png -------------------------------------------------------------------------------- /css/images/ui-bg_flat_10_000000_40x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_flat_10_000000_40x100.png -------------------------------------------------------------------------------- /css/images/ui-bg_glass_100_f6f6f6_1x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_glass_100_f6f6f6_1x400.png -------------------------------------------------------------------------------- /css/images/ui-bg_glass_100_fdf5ce_1x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_glass_100_fdf5ce_1x400.png -------------------------------------------------------------------------------- /css/images/ui-bg_glass_65_ffffff_1x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_glass_65_ffffff_1x400.png -------------------------------------------------------------------------------- /css/images/ui-bg_gloss-wave_35_f6a828_500x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_gloss-wave_35_f6a828_500x100.png -------------------------------------------------------------------------------- /css/images/ui-bg_highlight-soft_100_eeeeee_1x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_highlight-soft_100_eeeeee_1x100.png -------------------------------------------------------------------------------- /css/images/ui-bg_highlight-soft_75_ffe45c_1x100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_highlight-soft_75_ffe45c_1x100.png -------------------------------------------------------------------------------- /css/images/ui-icons_222222_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_222222_256x240.png -------------------------------------------------------------------------------- /css/images/ui-icons_228ef1_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_228ef1_256x240.png -------------------------------------------------------------------------------- /css/images/ui-icons_ef8c08_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_ef8c08_256x240.png -------------------------------------------------------------------------------- /css/images/ui-icons_ffd27a_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_ffd27a_256x240.png -------------------------------------------------------------------------------- /css/images/ui-icons_ffffff_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_ffffff_256x240.png -------------------------------------------------------------------------------- /css/style.css: -------------------------------------------------------------------------------- 1 | /* 2 | Site Name: full-text 3 | Developed By: binux - 足兆叉虫 4 | Date Created: 2011-2-5 5 | Last Updated: 2011-2-5 6 | Copyright: GPLv3 7 | */ 8 | 9 | #mainContainer { 10 | margin: auto; /* center in viewport */ 11 | width: 974px; 12 | font-size: 14px; 13 | height: 100%; /* for stickyfooter */ 14 | } 15 | 16 | /* ....... header ........*/ 17 | #headerContent { 18 | width: 100%; 19 | } 20 | #headerContent span { 21 | margin: 0px 2px; 22 | } 23 | #headerRight { 24 | float: right; 25 | } 26 | #headerLeft { 27 | float: left; 28 | } 29 | 30 | /* ...... middle ...... */ 31 | /* ...... feedForm ...... */ 32 | #feedForm { 33 | width: 630px; 34 | height: 217px; 35 | margin: 10% auto; 36 | background: url("images/index.png") no-repeat top left; 37 | } 38 | #feedInputBox { 39 | position: relative; 40 | top: 90px; 41 | left: 96px; 42 | } 43 | #feedInput { 44 | float: left; 45 | display: block; 46 | width: 435px; 47 | height: 29px; 48 | background: #CFE2F3; 49 | border: 2px solid #073763; 50 | font-size: 25px; 51 | } 52 | #feedButton { 53 | float: left; 54 | margin-left: 20px; 55 | width: 77px; 56 | height: 35px; 57 | background: url("images/index-button.png") no-repeat top left; 58 | border: none; 59 | display: block; 60 | font-width: bold; 61 | } 62 | #feedButton:hover { 63 | background-position: 0px -34px; 64 | } 65 | #feedButton:active { 66 | background-position: 0px -68px; 67 | } 68 | /* ...... editorForm ...... */ 69 | #editorContent { 70 | border: 1px solid #36C; 71 | width: 80%; 72 | margin: 1em auto; 73 | } 74 | #editorTitle { 75 | padding: 0.3em 0 0.2em 0.5em; 76 | background: #E5ECF9; 77 | } 78 | #editorTitle > span { 79 | font-size: 120%; 80 | font-width: bold; 81 | line-height: 1.6em; 82 | } 83 | #editorTitle > a { 84 | margin-right: 5px; 85 | } 86 | #editorLeft { 87 | width: 39%; 88 | float: left; 89 | } 90 | #editorLeft > div { 91 | padding: 0px 5px; 92 | } 93 | #editorRight { 94 | width: 59%; 95 | float: right; 96 | } 97 | #editorRight > div { 98 | padding: 0px 5px; 99 | } 100 | #editorButtonBox { 101 | text-align: right; 102 | margin: 0.7em 0px; 103 | } 104 | #editorButtonBox > div.float-right > * { 105 | margin-left: 1em; 106 | } 107 | .inputArea { 108 | margin: 0.7em 0px; 109 | padding: 0.5em 2em 1em; 110 | border: 1px solid #CCC; 111 | background: #F6F6F6; 112 | -moz-border-radius: 4px; 113 | -webkit-border-radius: 4px; 114 | border-radius: 4px; 115 | } 116 | .noticeArea { 117 | border: 1px solid #CD0A0A; 118 | background: #FEF1EC url(../images/?new=fef1ec&w=1&h=400&f=png&q=100&fltr[]=over|textures/02_glass.png|0|0|95) 50% 50% repeat-x; 119 | } 120 | /* ...... preview ...... */ 121 | #previewContent { 122 | border: 1px solid #36C; 123 | margin: 1em auto; 124 | } 125 | #previewTitleBox { 126 | padding: 0.3em 0 0.2em 0.5em; 127 | background: #E5ECF9; 128 | } 129 | #previewTitle { 130 | font-size: 120%; 131 | font-width: bold; 132 | line-height: 1.6em; 133 | } 134 | #previewControls > a { 135 | margin-right: 1em; 136 | } 137 | #previewFullText { 138 | padding: 1em; 139 | } 140 | #previewCode { 141 | width: 100%; 142 | height: 10em; 143 | margin: 1em auto; 144 | } 145 | /* ...... search ...... */ 146 | .searchItem { 147 | border: 1px solid #CCC; 148 | border-top-width: 0px; 149 | padding: 1em 2em; 150 | display: block; 151 | } 152 | .searchItemFirst { 153 | border-top-width: 1px; 154 | } 155 | .itemControls { 156 | float: right; 157 | } 158 | .itemControls > a { 159 | margin-left: 0.5em; 160 | } 161 | .starBox { 162 | margin: 0.4em; 163 | float: left; 164 | } 165 | .titleRow { 166 | font-size: 1.5em; 167 | font-width: bold; 168 | float: left; 169 | } 170 | .itemLink { 171 | font-size: 0.7em; 172 | font-width: normal; 173 | color: #CCC; 174 | } 175 | .discriptionRow { 176 | margin-top: 0.5em; 177 | padding-right: 6em; 178 | padding-top: 1px; 179 | clear: both; 180 | } 181 | .authorRow { 182 | margin-top: 0.5em; 183 | clear: both; 184 | text-align: right; 185 | } 186 | .newProject { 187 | border: 1px solid #CCC; 188 | padding: 1em; 189 | margin: 2em; 190 | display: block; 191 | } 192 | 193 | /* ...... footer ...... */ 194 | #applogo { 195 | float: right; 196 | } 197 | /* ...... stickyfooter ...... */ 198 | html, body {height: 100%;} 199 | #wrap { 200 | min-height: 100%; 201 | } 202 | #mainContent { 203 | overflow:auto; 204 | padding-bottom: 35px; 205 | } /* must be same height as the footer */ 206 | #footer { 207 | position: relative; 208 | margin-top: -35px; /* negative value of footer height */ 209 | height: 35px; 210 | clear:both; 211 | } 212 | #footer-margin { 213 | height: 55px; 214 | } 215 | /*Opera Fix*/ 216 | body:before { 217 | content:""; 218 | height:100%; 219 | float:left; 220 | width:0; 221 | margin-top:-32767px;/ 222 | } 223 | 224 | /* ....... elements ....... */ 225 | .line { 226 | border-top: 1px solid #C9D7F1; 227 | font-size: 1px; 228 | height: 0; 229 | width: 100%; 230 | clear: both; 231 | } 232 | .button { 233 | padding: 0.3em 0.5em; 234 | float: left; 235 | border: 1px solid #CCC; 236 | background: #F6F6F6; 237 | font-weight: bold; 238 | color: #1C94C4; 239 | } 240 | a.button { 241 | text-decoration: none; 242 | } 243 | a.button:hover, input.button:hover { 244 | background: #DADADA; 245 | border-color: #999; 246 | } 247 | a.button:active, input.button:active { 248 | background: white; 249 | border-color: #AAA; 250 | } 251 | .clear-both { 252 | clear: both; 253 | } 254 | .float-right { 255 | float: right; 256 | } 257 | .float-left { 258 | float: left; 259 | } 260 | [readonly] { 261 | background: lightgray; 262 | } 263 | -------------------------------------------------------------------------------- /css_selector.py: -------------------------------------------------------------------------------- 1 | #!/url/bin/python 2 | #-- coding: utf-8 -- 3 | ''' 4 | make minidom selected by css selector 5 | 6 | Create on 2011.2.6 7 | 8 | @author: binux 9 | ''' 10 | 11 | import re 12 | import string 13 | 14 | from xml.dom import Node 15 | def fixMiniDom(): 16 | def getElementById(self, id): 17 | if id in self._id_cache: 18 | return self._id_cache[id] 19 | # cache id 20 | if not self._id_cache: 21 | for element in self.getElementsByTagName('*'): 22 | if element.getAttribute("id"): 23 | self._id_cache[element.getAttribute("id")] = element 24 | if id in self._id_cache: 25 | return self._id_cache[id] 26 | 27 | def getElementsBySelector(self, all_selectors): 28 | selected = [] 29 | 30 | # remove blanks in the right of > 31 | all_selectors = re.sub('>\s+', '>', all_selectors) 32 | 33 | # Grab all of the tagName elements within current context 34 | def getElements(context,tag): 35 | if not tag: tag = '*' 36 | 37 | # Get elements matching tag, filter them for class selector 38 | found = [] 39 | for con in context: 40 | eles = con.getElementsByTagName(tag) 41 | found.extend(eles) 42 | 43 | return found 44 | 45 | context = [self, ] 46 | inheriters = string.split(all_selectors, " ") 47 | 48 | for element in inheriters: 49 | # take all 50 | m = re.match(r'^(>)?(\w+)?(#[a-zA-z0-9\-_]+)?(\.[a-zA-z0-9\-_]+)?(#[a-zA-z0-9\-_]+)?(\[(\w+)([=~!\|\^\$\*]?)=?[\'"]?([^\]\'"]*)[\'"]?\])?$', element) 51 | if (m): 52 | _sub = m.group(1) 53 | _tag = m.group(2) 54 | _id = m.group(3) or m.group(5) 55 | _class = m.group(4) 56 | _css3 = m.group(6) 57 | _attr = m.group(7) 58 | _operator = m.group(8) 59 | _value = m.group(9) 60 | else: 61 | continue 62 | 63 | # fix id and class 64 | if _id: _id = _id[1:] 65 | if _class: _class = _class[1:] 66 | 67 | found = [] 68 | if _sub: 69 | for con in context: 70 | for each in con.childNodes: 71 | if each.nodeType == Node.ELEMENT_NODE: 72 | found.append(each) 73 | elif _id: 74 | ele = self.getElementById(_id) 75 | if ele: 76 | found = [ele, ] 77 | else: 78 | found = getElements(context,_tag) 79 | 80 | # tag 81 | if _tag and _id: # as _id is not exist we get element by tag, so isn't neccessary to test this 82 | tmp = [] 83 | for fnd in found: 84 | if(fnd.tagName == _tag): 85 | tmp.append(fnd) 86 | found = tmp 87 | 88 | # id 89 | if _id: 90 | tmp = [] 91 | for fnd in found: 92 | if(fnd.getAttribute("id") and (_id == fnd.getAttribute("id"))): 93 | tmp.append(fnd) 94 | found = tmp 95 | 96 | # class 97 | if _class: 98 | tmp = [] 99 | for fnd in found: 100 | if(fnd.getAttribute("class") and (_class in fnd.getAttribute("class").split())): 101 | tmp.append(fnd) 102 | found = tmp 103 | 104 | # css3 105 | if _css3: 106 | tmp = [] 107 | for fnd in found: 108 | if(_operator=='=' and fnd.getAttribute(_attr) != _value): continue 109 | if(_operator=='~' and not(re.search(r'(^|\\s)'+_value+'(\\s|$)', fnd.getAttribute(_attr)))): continue 110 | if(_operator=='!' and re.search(r'(^|\\s)'+_value+'(\\s|$)', fnd.getAttribute(_attr))): continue 111 | if(_operator=='|' and not(re.search(r'^'+_value+'-?', fnd.getAttribute(_attr)))): continue 112 | if(_operator=='^' and string.find(fnd.getAttribute(_attr), _value)!=0): continue 113 | if(_operator=='$' and string.rfind(fnd.getAttribute(_attr), _value) != (fnd.getAttribute(_attr).length-_value.length)): continue 114 | if(_operator=='*' and not(string.find(fnd.getAttribute(_attr), _value)+1)): continue 115 | 116 | elif(not fnd.getAttribute(_attr)): continue 117 | tmp.append(fnd) 118 | found = tmp 119 | 120 | context = found 121 | 122 | selected.extend(context) 123 | return selected 124 | 125 | from xml.dom import minidom 126 | setattr(minidom.Element, '_id_cache', {}) 127 | setattr(minidom.Element, 'getElementById', getElementById) 128 | setattr(minidom.Element, 'getElementsBySelector', getElementsBySelector) 129 | setattr(minidom.Document, '_id_cache', {}) 130 | setattr(minidom.Document, 'getElementById', getElementById) 131 | setattr(minidom.Document, 'getElementsBySelector', getElementsBySelector) 132 | 133 | fixMiniDom() 134 | -------------------------------------------------------------------------------- /doc/RSS 2.0 Specification (version 2.0.11)_files/abg-en-100c-000000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/abg-en-100c-000000.png -------------------------------------------------------------------------------- /doc/RSS 2.0 Specification (version 2.0.11)_files/flowers.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/flowers.gif -------------------------------------------------------------------------------- /doc/RSS 2.0 Specification (version 2.0.11)_files/left-arrow.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/left-arrow.gif -------------------------------------------------------------------------------- /doc/RSS 2.0 Specification (version 2.0.11)_files/rss-advisory-board.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/rss-advisory-board.gif -------------------------------------------------------------------------------- /doc/RSS 2.0 Specification (version 2.0.11)_files/rss-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/rss-icon.png -------------------------------------------------------------------------------- /doc/RSS 2.0 Specification (version 2.0.11)_files/test_domain.js: -------------------------------------------------------------------------------- 1 | (function(){window.google_new_domain_enabled=1;})() 2 | -------------------------------------------------------------------------------- /doc/sample-rss-2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Liftoff News 5 | http://liftoff.msfc.nasa.gov/ 6 | Liftoff to Space Exploration. 7 | en-us 8 | Tue, 10 Jun 2003 04:00:00 GMT 9 | Tue, 10 Jun 2003 09:41:01 GMT 10 | http://blogs.law.harvard.edu/tech/rss 11 | Weblog Editor 2.0 12 | editor@example.com 13 | webmaster@example.com 14 | 15 | Star City 16 | http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp 17 | How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>. 18 | Tue, 03 Jun 2003 09:39:21 GMT 19 | http://liftoff.msfc.nasa.gov/2003/06/03.html#item573 20 | 21 | 22 | Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st. 23 | Fri, 30 May 2003 11:06:42 GMT 24 | http://liftoff.msfc.nasa.gov/2003/05/30.html#item572 25 | 26 | 27 | The Engine That Does More 28 | http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp 29 | Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that. 30 | Tue, 27 May 2003 08:37:32 GMT 31 | http://liftoff.msfc.nasa.gov/2003/05/27.html#item571 32 | 33 | 34 | Astronauts' Dirty Laundry 35 | http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp 36 | Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options. 37 | Tue, 20 May 2003 08:56:02 GMT 38 | http://liftoff.msfc.nasa.gov/2003/05/20.html#item570 39 | 40 | 41 | -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/favicon.ico -------------------------------------------------------------------------------- /fetcher.py: -------------------------------------------------------------------------------- 1 | #!/url/bin/python 2 | #-- coding: utf-8 -- 3 | ''' 4 | Create on 2011.2.1 5 | 6 | @author: binux 7 | ''' 8 | 9 | import logging 10 | import hashlib 11 | import html5lib 12 | import css_selector 13 | 14 | from html5lib.filters import fullurl 15 | 16 | from google.appengine.ext import db 17 | from google.appengine.api import urlfetch 18 | 19 | _parse = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom")).parse 20 | _walker = html5lib.treewalkers.getTreeWalker("dom") 21 | _serialize = html5lib.serializer.htmlserializer.HTMLSerializer( 22 | omit_optional_tags=False, 23 | quote_attr_values=True).serialize 24 | 25 | class DescriptionCache(db.Model): 26 | link = db.StringProperty(required=True) 27 | project_id = db.IntegerProperty(required=True) 28 | description = db.TextProperty() 29 | 30 | retryCount = db.IntegerProperty(required=True, default=3) 31 | createdDate = db.DateTimeProperty(required=True, auto_now_add=True) 32 | lastVisitedDate = db.DateTimeProperty(required=True, auto_now=True) 33 | 34 | def fetch_description(url, project): 35 | des_cache = DescriptionCache.get_by_key_name(hashlib.md5(str(project.key().id())+url).hexdigest()) 36 | if des_cache is None: 37 | description = real_fetch_description(url, project.contentSelector, project.filterSelector, project.encoding) 38 | if description: 39 | des_cache = DescriptionCache.get_or_insert(hashlib.md5(str(project.key().id())+url).hexdigest(), 40 | link=url, project_id = project.key().id(), description=db.Text(description)) 41 | des_cache.put() 42 | else: 43 | description = des_cache.description 44 | des_cache.put() 45 | 46 | return description 47 | 48 | def real_fetch_description(url, content_selector, filter_selector, encoding=None): 49 | try: 50 | response = urlfetch.fetch(url) 51 | except urlfetch.InvalidURLError, e: 52 | return u'' 53 | if response.status_code != 200: 54 | raise Exception, "status code: response.status_code" 55 | 56 | doc_dom = _parse(response.content, encoding=encoding) 57 | content_dom = [] 58 | for each in [x for x in content_selector.split('\n') if x]: 59 | dom = doc_dom.getElementsBySelector(each) 60 | content_dom.extend(dom) 61 | content_dom = set(content_dom) 62 | 63 | filter_dom = [] 64 | for each_content in content_dom: 65 | for each_selector in [x for x in filter_selector.split('\n') if x]: 66 | dom = each_content.getElementsBySelector(each_selector) 67 | filter_dom.extend(dom) 68 | filter_dom = set(filter_dom) 69 | for each_dom in filter_dom: 70 | if each_dom.parentNode: 71 | each_dom.parentNode.removeChild(each_dom) 72 | 73 | contents = [] 74 | for dom in content_dom: 75 | w = _walker(dom) 76 | w = fullurl.Filter(w, url) 77 | for item in _serialize(w): 78 | contents.append(item) 79 | return u''.join(contents) 80 | -------------------------------------------------------------------------------- /fix_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.path.join(os.path.dirname(__file__), 'lib')) 5 | -------------------------------------------------------------------------------- /html5lib/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | HTML parsing library based on the WHATWG "HTML5" 3 | specification. The parser is designed to be compatible with existing 4 | HTML found in the wild and implements well-defined error recovery that 5 | is largely compatible with modern desktop web browsers. 6 | 7 | Example usage: 8 | 9 | import html5lib 10 | f = open("my_document.html") 11 | tree = html5lib.parse(f) 12 | """ 13 | __version__ = "0.90" 14 | from html5parser import HTMLParser, parse, parseFragment 15 | from treebuilders import getTreeBuilder 16 | from treewalkers import getTreeWalker 17 | from serializer import serialize 18 | -------------------------------------------------------------------------------- /html5lib/chardet/__init__.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # This library is free software; you can redistribute it and/or 3 | # modify it under the terms of the GNU Lesser General Public 4 | # License as published by the Free Software Foundation; either 5 | # version 2.1 of the License, or (at your option) any later version. 6 | # 7 | # This library is distributed in the hope that it will be useful, 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 | # Lesser General Public License for more details. 11 | # 12 | # You should have received a copy of the GNU Lesser General Public 13 | # License along with this library; if not, write to the Free Software 14 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 15 | # 02110-1301 USA 16 | ######################### END LICENSE BLOCK ######################### 17 | 18 | __version__ = "2.0.1" 19 | 20 | def detect(aBuf): 21 | import universaldetector 22 | u = universaldetector.UniversalDetector() 23 | u.reset() 24 | u.feed(aBuf) 25 | u.close() 26 | return u.result 27 | -------------------------------------------------------------------------------- /html5lib/chardet/big5prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Communicator client code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from mbcharsetprober import MultiByteCharSetProber 29 | from codingstatemachine import CodingStateMachine 30 | from chardistribution import Big5DistributionAnalysis 31 | from mbcssm import Big5SMModel 32 | 33 | class Big5Prober(MultiByteCharSetProber): 34 | def __init__(self): 35 | MultiByteCharSetProber.__init__(self) 36 | self._mCodingSM = CodingStateMachine(Big5SMModel) 37 | self._mDistributionAnalyzer = Big5DistributionAnalysis() 38 | self.reset() 39 | 40 | def get_charset_name(self): 41 | return "Big5" 42 | -------------------------------------------------------------------------------- /html5lib/chardet/chardistribution.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Communicator client code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import constants 29 | from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO 30 | from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO 31 | from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO 32 | from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO 33 | from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO 34 | 35 | ENOUGH_DATA_THRESHOLD = 1024 36 | SURE_YES = 0.99 37 | SURE_NO = 0.01 38 | 39 | class CharDistributionAnalysis: 40 | def __init__(self): 41 | self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder()) 42 | self._mTableSize = None # Size of above table 43 | self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail. 44 | self.reset() 45 | 46 | def reset(self): 47 | """reset analyser, clear any state""" 48 | self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made 49 | self._mTotalChars = 0 # Total characters encountered 50 | self._mFreqChars = 0 # The number of characters whose frequency order is less than 512 51 | 52 | def feed(self, aStr, aCharLen): 53 | """feed a character with known length""" 54 | if aCharLen == 2: 55 | # we only care about 2-bytes character in our distribution analysis 56 | order = self.get_order(aStr) 57 | else: 58 | order = -1 59 | if order >= 0: 60 | self._mTotalChars += 1 61 | # order is valid 62 | if order < self._mTableSize: 63 | if 512 > self._mCharToFreqOrder[order]: 64 | self._mFreqChars += 1 65 | 66 | def get_confidence(self): 67 | """return confidence based on existing data""" 68 | # if we didn't receive any character in our consideration range, return negative answer 69 | if self._mTotalChars <= 0: 70 | return SURE_NO 71 | 72 | if self._mTotalChars != self._mFreqChars: 73 | r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio) 74 | if r < SURE_YES: 75 | return r 76 | 77 | # normalize confidence (we don't want to be 100% sure) 78 | return SURE_YES 79 | 80 | def got_enough_data(self): 81 | # It is not necessary to receive all data to draw conclusion. For charset detection, 82 | # certain amount of data is enough 83 | return self._mTotalChars > ENOUGH_DATA_THRESHOLD 84 | 85 | def get_order(self, aStr): 86 | # We do not handle characters based on the original encoding string, but 87 | # convert this encoding string to a number, here called order. 88 | # This allows multiple encodings of a language to share one frequency table. 89 | return -1 90 | 91 | class EUCTWDistributionAnalysis(CharDistributionAnalysis): 92 | def __init__(self): 93 | CharDistributionAnalysis.__init__(self) 94 | self._mCharToFreqOrder = EUCTWCharToFreqOrder 95 | self._mTableSize = EUCTW_TABLE_SIZE 96 | self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO 97 | 98 | def get_order(self, aStr): 99 | # for euc-TW encoding, we are interested 100 | # first byte range: 0xc4 -- 0xfe 101 | # second byte range: 0xa1 -- 0xfe 102 | # no validation needed here. State machine has done that 103 | if aStr[0] >= '\xC4': 104 | return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1 105 | else: 106 | return -1 107 | 108 | class EUCKRDistributionAnalysis(CharDistributionAnalysis): 109 | def __init__(self): 110 | CharDistributionAnalysis.__init__(self) 111 | self._mCharToFreqOrder = EUCKRCharToFreqOrder 112 | self._mTableSize = EUCKR_TABLE_SIZE 113 | self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO 114 | 115 | def get_order(self, aStr): 116 | # for euc-KR encoding, we are interested 117 | # first byte range: 0xb0 -- 0xfe 118 | # second byte range: 0xa1 -- 0xfe 119 | # no validation needed here. State machine has done that 120 | if aStr[0] >= '\xB0': 121 | return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 122 | else: 123 | return -1; 124 | 125 | class GB2312DistributionAnalysis(CharDistributionAnalysis): 126 | def __init__(self): 127 | CharDistributionAnalysis.__init__(self) 128 | self._mCharToFreqOrder = GB2312CharToFreqOrder 129 | self._mTableSize = GB2312_TABLE_SIZE 130 | self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO 131 | 132 | def get_order(self, aStr): 133 | # for GB2312 encoding, we are interested 134 | # first byte range: 0xb0 -- 0xfe 135 | # second byte range: 0xa1 -- 0xfe 136 | # no validation needed here. State machine has done that 137 | if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'): 138 | return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 139 | else: 140 | return -1; 141 | 142 | class Big5DistributionAnalysis(CharDistributionAnalysis): 143 | def __init__(self): 144 | CharDistributionAnalysis.__init__(self) 145 | self._mCharToFreqOrder = Big5CharToFreqOrder 146 | self._mTableSize = BIG5_TABLE_SIZE 147 | self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO 148 | 149 | def get_order(self, aStr): 150 | # for big5 encoding, we are interested 151 | # first byte range: 0xa4 -- 0xfe 152 | # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 153 | # no validation needed here. State machine has done that 154 | if aStr[0] >= '\xA4': 155 | if aStr[1] >= '\xA1': 156 | return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63 157 | else: 158 | return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40 159 | else: 160 | return -1 161 | 162 | class SJISDistributionAnalysis(CharDistributionAnalysis): 163 | def __init__(self): 164 | CharDistributionAnalysis.__init__(self) 165 | self._mCharToFreqOrder = JISCharToFreqOrder 166 | self._mTableSize = JIS_TABLE_SIZE 167 | self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO 168 | 169 | def get_order(self, aStr): 170 | # for sjis encoding, we are interested 171 | # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe 172 | # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe 173 | # no validation needed here. State machine has done that 174 | if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'): 175 | order = 188 * (ord(aStr[0]) - 0x81) 176 | elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'): 177 | order = 188 * (ord(aStr[0]) - 0xE0 + 31) 178 | else: 179 | return -1; 180 | order = order + ord(aStr[1]) - 0x40 181 | if aStr[1] > '\x7F': 182 | order =- 1 183 | return order 184 | 185 | class EUCJPDistributionAnalysis(CharDistributionAnalysis): 186 | def __init__(self): 187 | CharDistributionAnalysis.__init__(self) 188 | self._mCharToFreqOrder = JISCharToFreqOrder 189 | self._mTableSize = JIS_TABLE_SIZE 190 | self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO 191 | 192 | def get_order(self, aStr): 193 | # for euc-JP encoding, we are interested 194 | # first byte range: 0xa0 -- 0xfe 195 | # second byte range: 0xa1 -- 0xfe 196 | # no validation needed here. State machine has done that 197 | if aStr[0] >= '\xA0': 198 | return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1 199 | else: 200 | return -1 201 | -------------------------------------------------------------------------------- /html5lib/chardet/charsetgroupprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Communicator client code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import constants, sys 29 | from charsetprober import CharSetProber 30 | 31 | class CharSetGroupProber(CharSetProber): 32 | def __init__(self): 33 | CharSetProber.__init__(self) 34 | self._mActiveNum = 0 35 | self._mProbers = [] 36 | self._mBestGuessProber = None 37 | 38 | def reset(self): 39 | CharSetProber.reset(self) 40 | self._mActiveNum = 0 41 | for prober in self._mProbers: 42 | if prober: 43 | prober.reset() 44 | prober.active = constants.True 45 | self._mActiveNum += 1 46 | self._mBestGuessProber = None 47 | 48 | def get_charset_name(self): 49 | if not self._mBestGuessProber: 50 | self.get_confidence() 51 | if not self._mBestGuessProber: return None 52 | # self._mBestGuessProber = self._mProbers[0] 53 | return self._mBestGuessProber.get_charset_name() 54 | 55 | def feed(self, aBuf): 56 | for prober in self._mProbers: 57 | if not prober: continue 58 | if not prober.active: continue 59 | st = prober.feed(aBuf) 60 | if not st: continue 61 | if st == constants.eFoundIt: 62 | self._mBestGuessProber = prober 63 | return self.get_state() 64 | elif st == constants.eNotMe: 65 | prober.active = constants.False 66 | self._mActiveNum -= 1 67 | if self._mActiveNum <= 0: 68 | self._mState = constants.eNotMe 69 | return self.get_state() 70 | return self.get_state() 71 | 72 | def get_confidence(self): 73 | st = self.get_state() 74 | if st == constants.eFoundIt: 75 | return 0.99 76 | elif st == constants.eNotMe: 77 | return 0.01 78 | bestConf = 0.0 79 | self._mBestGuessProber = None 80 | for prober in self._mProbers: 81 | if not prober: continue 82 | if not prober.active: 83 | if constants._debug: 84 | sys.stderr.write(prober.get_charset_name() + ' not active\n') 85 | continue 86 | cf = prober.get_confidence() 87 | if constants._debug: 88 | sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf)) 89 | if bestConf < cf: 90 | bestConf = cf 91 | self._mBestGuessProber = prober 92 | if not self._mBestGuessProber: return 0.0 93 | return bestConf 94 | # else: 95 | # self._mBestGuessProber = self._mProbers[0] 96 | # return self._mBestGuessProber.get_confidence() 97 | -------------------------------------------------------------------------------- /html5lib/chardet/charsetprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | import constants, re 30 | 31 | class CharSetProber: 32 | def __init__(self): 33 | pass 34 | 35 | def reset(self): 36 | self._mState = constants.eDetecting 37 | 38 | def get_charset_name(self): 39 | return None 40 | 41 | def feed(self, aBuf): 42 | pass 43 | 44 | def get_state(self): 45 | return self._mState 46 | 47 | def get_confidence(self): 48 | return 0.0 49 | 50 | def filter_high_bit_only(self, aBuf): 51 | aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf) 52 | return aBuf 53 | 54 | def filter_without_english_letters(self, aBuf): 55 | aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf) 56 | return aBuf 57 | 58 | def filter_with_english_letters(self, aBuf): 59 | # TODO 60 | return aBuf 61 | -------------------------------------------------------------------------------- /html5lib/chardet/codingstatemachine.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from constants import eStart, eError, eItsMe 29 | 30 | class CodingStateMachine: 31 | def __init__(self, sm): 32 | self._mModel = sm 33 | self._mCurrentBytePos = 0 34 | self._mCurrentCharLen = 0 35 | self.reset() 36 | 37 | def reset(self): 38 | self._mCurrentState = eStart 39 | 40 | def next_state(self, c): 41 | # for each byte we get its class 42 | # if it is first byte, we also get byte length 43 | byteCls = self._mModel['classTable'][ord(c)] 44 | if self._mCurrentState == eStart: 45 | self._mCurrentBytePos = 0 46 | self._mCurrentCharLen = self._mModel['charLenTable'][byteCls] 47 | # from byte's class and stateTable, we get its next state 48 | self._mCurrentState = self._mModel['stateTable'][self._mCurrentState * self._mModel['classFactor'] + byteCls] 49 | self._mCurrentBytePos += 1 50 | return self._mCurrentState 51 | 52 | def get_current_charlen(self): 53 | return self._mCurrentCharLen 54 | 55 | def get_coding_state_machine(self): 56 | return self._mModel['name'] 57 | -------------------------------------------------------------------------------- /html5lib/chardet/constants.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | _debug = 0 30 | 31 | eDetecting = 0 32 | eFoundIt = 1 33 | eNotMe = 2 34 | 35 | eStart = 0 36 | eError = 1 37 | eItsMe = 2 38 | 39 | SHORTCUT_THRESHOLD = 0.95 40 | 41 | import __builtin__ 42 | if not hasattr(__builtin__, 'False'): 43 | False = 0 44 | True = 1 45 | else: 46 | False = __builtin__.False 47 | True = __builtin__.True 48 | -------------------------------------------------------------------------------- /html5lib/chardet/escprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import constants, sys 29 | from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel 30 | from charsetprober import CharSetProber 31 | from codingstatemachine import CodingStateMachine 32 | 33 | class EscCharSetProber(CharSetProber): 34 | def __init__(self): 35 | CharSetProber.__init__(self) 36 | self._mCodingSM = [ \ 37 | CodingStateMachine(HZSMModel), 38 | CodingStateMachine(ISO2022CNSMModel), 39 | CodingStateMachine(ISO2022JPSMModel), 40 | CodingStateMachine(ISO2022KRSMModel) 41 | ] 42 | self.reset() 43 | 44 | def reset(self): 45 | CharSetProber.reset(self) 46 | for codingSM in self._mCodingSM: 47 | if not codingSM: continue 48 | codingSM.active = constants.True 49 | codingSM.reset() 50 | self._mActiveSM = len(self._mCodingSM) 51 | self._mDetectedCharset = None 52 | 53 | def get_charset_name(self): 54 | return self._mDetectedCharset 55 | 56 | def get_confidence(self): 57 | if self._mDetectedCharset: 58 | return 0.99 59 | else: 60 | return 0.00 61 | 62 | def feed(self, aBuf): 63 | for c in aBuf: 64 | for codingSM in self._mCodingSM: 65 | if not codingSM: continue 66 | if not codingSM.active: continue 67 | codingState = codingSM.next_state(c) 68 | if codingState == constants.eError: 69 | codingSM.active = constants.False 70 | self._mActiveSM -= 1 71 | if self._mActiveSM <= 0: 72 | self._mState = constants.eNotMe 73 | return self.get_state() 74 | elif codingState == constants.eItsMe: 75 | self._mState = constants.eFoundIt 76 | self._mDetectedCharset = codingSM.get_coding_state_machine() 77 | return self.get_state() 78 | 79 | return self.get_state() 80 | -------------------------------------------------------------------------------- /html5lib/chardet/escsm.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from constants import eStart, eError, eItsMe 29 | 30 | HZ_cls = ( \ 31 | 1,0,0,0,0,0,0,0, # 00 - 07 32 | 0,0,0,0,0,0,0,0, # 08 - 0f 33 | 0,0,0,0,0,0,0,0, # 10 - 17 34 | 0,0,0,1,0,0,0,0, # 18 - 1f 35 | 0,0,0,0,0,0,0,0, # 20 - 27 36 | 0,0,0,0,0,0,0,0, # 28 - 2f 37 | 0,0,0,0,0,0,0,0, # 30 - 37 38 | 0,0,0,0,0,0,0,0, # 38 - 3f 39 | 0,0,0,0,0,0,0,0, # 40 - 47 40 | 0,0,0,0,0,0,0,0, # 48 - 4f 41 | 0,0,0,0,0,0,0,0, # 50 - 57 42 | 0,0,0,0,0,0,0,0, # 58 - 5f 43 | 0,0,0,0,0,0,0,0, # 60 - 67 44 | 0,0,0,0,0,0,0,0, # 68 - 6f 45 | 0,0,0,0,0,0,0,0, # 70 - 77 46 | 0,0,0,4,0,5,2,0, # 78 - 7f 47 | 1,1,1,1,1,1,1,1, # 80 - 87 48 | 1,1,1,1,1,1,1,1, # 88 - 8f 49 | 1,1,1,1,1,1,1,1, # 90 - 97 50 | 1,1,1,1,1,1,1,1, # 98 - 9f 51 | 1,1,1,1,1,1,1,1, # a0 - a7 52 | 1,1,1,1,1,1,1,1, # a8 - af 53 | 1,1,1,1,1,1,1,1, # b0 - b7 54 | 1,1,1,1,1,1,1,1, # b8 - bf 55 | 1,1,1,1,1,1,1,1, # c0 - c7 56 | 1,1,1,1,1,1,1,1, # c8 - cf 57 | 1,1,1,1,1,1,1,1, # d0 - d7 58 | 1,1,1,1,1,1,1,1, # d8 - df 59 | 1,1,1,1,1,1,1,1, # e0 - e7 60 | 1,1,1,1,1,1,1,1, # e8 - ef 61 | 1,1,1,1,1,1,1,1, # f0 - f7 62 | 1,1,1,1,1,1,1,1, # f8 - ff 63 | ) 64 | 65 | HZ_st = ( \ 66 | eStart,eError, 3,eStart,eStart,eStart,eError,eError,# 00-07 67 | eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f 68 | eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError,# 10-17 69 | 5,eError, 6,eError, 5, 5, 4,eError,# 18-1f 70 | 4,eError, 4, 4, 4,eError, 4,eError,# 20-27 71 | 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f 72 | ) 73 | 74 | HZCharLenTable = (0, 0, 0, 0, 0, 0) 75 | 76 | HZSMModel = {'classTable': HZ_cls, 77 | 'classFactor': 6, 78 | 'stateTable': HZ_st, 79 | 'charLenTable': HZCharLenTable, 80 | 'name': "HZ-GB-2312"} 81 | 82 | ISO2022CN_cls = ( \ 83 | 2,0,0,0,0,0,0,0, # 00 - 07 84 | 0,0,0,0,0,0,0,0, # 08 - 0f 85 | 0,0,0,0,0,0,0,0, # 10 - 17 86 | 0,0,0,1,0,0,0,0, # 18 - 1f 87 | 0,0,0,0,0,0,0,0, # 20 - 27 88 | 0,3,0,0,0,0,0,0, # 28 - 2f 89 | 0,0,0,0,0,0,0,0, # 30 - 37 90 | 0,0,0,0,0,0,0,0, # 38 - 3f 91 | 0,0,0,4,0,0,0,0, # 40 - 47 92 | 0,0,0,0,0,0,0,0, # 48 - 4f 93 | 0,0,0,0,0,0,0,0, # 50 - 57 94 | 0,0,0,0,0,0,0,0, # 58 - 5f 95 | 0,0,0,0,0,0,0,0, # 60 - 67 96 | 0,0,0,0,0,0,0,0, # 68 - 6f 97 | 0,0,0,0,0,0,0,0, # 70 - 77 98 | 0,0,0,0,0,0,0,0, # 78 - 7f 99 | 2,2,2,2,2,2,2,2, # 80 - 87 100 | 2,2,2,2,2,2,2,2, # 88 - 8f 101 | 2,2,2,2,2,2,2,2, # 90 - 97 102 | 2,2,2,2,2,2,2,2, # 98 - 9f 103 | 2,2,2,2,2,2,2,2, # a0 - a7 104 | 2,2,2,2,2,2,2,2, # a8 - af 105 | 2,2,2,2,2,2,2,2, # b0 - b7 106 | 2,2,2,2,2,2,2,2, # b8 - bf 107 | 2,2,2,2,2,2,2,2, # c0 - c7 108 | 2,2,2,2,2,2,2,2, # c8 - cf 109 | 2,2,2,2,2,2,2,2, # d0 - d7 110 | 2,2,2,2,2,2,2,2, # d8 - df 111 | 2,2,2,2,2,2,2,2, # e0 - e7 112 | 2,2,2,2,2,2,2,2, # e8 - ef 113 | 2,2,2,2,2,2,2,2, # f0 - f7 114 | 2,2,2,2,2,2,2,2, # f8 - ff 115 | ) 116 | 117 | ISO2022CN_st = ( \ 118 | eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 119 | eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f 120 | eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 121 | eItsMe,eItsMe,eItsMe,eError,eError,eError, 4,eError,# 18-1f 122 | eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27 123 | 5, 6,eError,eError,eError,eError,eError,eError,# 28-2f 124 | eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37 125 | eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f 126 | ) 127 | 128 | ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0) 129 | 130 | ISO2022CNSMModel = {'classTable': ISO2022CN_cls, 131 | 'classFactor': 9, 132 | 'stateTable': ISO2022CN_st, 133 | 'charLenTable': ISO2022CNCharLenTable, 134 | 'name': "ISO-2022-CN"} 135 | 136 | ISO2022JP_cls = ( \ 137 | 2,0,0,0,0,0,0,0, # 00 - 07 138 | 0,0,0,0,0,0,2,2, # 08 - 0f 139 | 0,0,0,0,0,0,0,0, # 10 - 17 140 | 0,0,0,1,0,0,0,0, # 18 - 1f 141 | 0,0,0,0,7,0,0,0, # 20 - 27 142 | 3,0,0,0,0,0,0,0, # 28 - 2f 143 | 0,0,0,0,0,0,0,0, # 30 - 37 144 | 0,0,0,0,0,0,0,0, # 38 - 3f 145 | 6,0,4,0,8,0,0,0, # 40 - 47 146 | 0,9,5,0,0,0,0,0, # 48 - 4f 147 | 0,0,0,0,0,0,0,0, # 50 - 57 148 | 0,0,0,0,0,0,0,0, # 58 - 5f 149 | 0,0,0,0,0,0,0,0, # 60 - 67 150 | 0,0,0,0,0,0,0,0, # 68 - 6f 151 | 0,0,0,0,0,0,0,0, # 70 - 77 152 | 0,0,0,0,0,0,0,0, # 78 - 7f 153 | 2,2,2,2,2,2,2,2, # 80 - 87 154 | 2,2,2,2,2,2,2,2, # 88 - 8f 155 | 2,2,2,2,2,2,2,2, # 90 - 97 156 | 2,2,2,2,2,2,2,2, # 98 - 9f 157 | 2,2,2,2,2,2,2,2, # a0 - a7 158 | 2,2,2,2,2,2,2,2, # a8 - af 159 | 2,2,2,2,2,2,2,2, # b0 - b7 160 | 2,2,2,2,2,2,2,2, # b8 - bf 161 | 2,2,2,2,2,2,2,2, # c0 - c7 162 | 2,2,2,2,2,2,2,2, # c8 - cf 163 | 2,2,2,2,2,2,2,2, # d0 - d7 164 | 2,2,2,2,2,2,2,2, # d8 - df 165 | 2,2,2,2,2,2,2,2, # e0 - e7 166 | 2,2,2,2,2,2,2,2, # e8 - ef 167 | 2,2,2,2,2,2,2,2, # f0 - f7 168 | 2,2,2,2,2,2,2,2, # f8 - ff 169 | ) 170 | 171 | ISO2022JP_st = ( \ 172 | eStart, 3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07 173 | eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f 174 | eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17 175 | eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f 176 | eError, 5,eError,eError,eError, 4,eError,eError,# 20-27 177 | eError,eError,eError, 6,eItsMe,eError,eItsMe,eError,# 28-2f 178 | eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37 179 | eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f 180 | eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47 181 | ) 182 | 183 | ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) 184 | 185 | ISO2022JPSMModel = {'classTable': ISO2022JP_cls, 186 | 'classFactor': 10, 187 | 'stateTable': ISO2022JP_st, 188 | 'charLenTable': ISO2022JPCharLenTable, 189 | 'name': "ISO-2022-JP"} 190 | 191 | ISO2022KR_cls = ( \ 192 | 2,0,0,0,0,0,0,0, # 00 - 07 193 | 0,0,0,0,0,0,0,0, # 08 - 0f 194 | 0,0,0,0,0,0,0,0, # 10 - 17 195 | 0,0,0,1,0,0,0,0, # 18 - 1f 196 | 0,0,0,0,3,0,0,0, # 20 - 27 197 | 0,4,0,0,0,0,0,0, # 28 - 2f 198 | 0,0,0,0,0,0,0,0, # 30 - 37 199 | 0,0,0,0,0,0,0,0, # 38 - 3f 200 | 0,0,0,5,0,0,0,0, # 40 - 47 201 | 0,0,0,0,0,0,0,0, # 48 - 4f 202 | 0,0,0,0,0,0,0,0, # 50 - 57 203 | 0,0,0,0,0,0,0,0, # 58 - 5f 204 | 0,0,0,0,0,0,0,0, # 60 - 67 205 | 0,0,0,0,0,0,0,0, # 68 - 6f 206 | 0,0,0,0,0,0,0,0, # 70 - 77 207 | 0,0,0,0,0,0,0,0, # 78 - 7f 208 | 2,2,2,2,2,2,2,2, # 80 - 87 209 | 2,2,2,2,2,2,2,2, # 88 - 8f 210 | 2,2,2,2,2,2,2,2, # 90 - 97 211 | 2,2,2,2,2,2,2,2, # 98 - 9f 212 | 2,2,2,2,2,2,2,2, # a0 - a7 213 | 2,2,2,2,2,2,2,2, # a8 - af 214 | 2,2,2,2,2,2,2,2, # b0 - b7 215 | 2,2,2,2,2,2,2,2, # b8 - bf 216 | 2,2,2,2,2,2,2,2, # c0 - c7 217 | 2,2,2,2,2,2,2,2, # c8 - cf 218 | 2,2,2,2,2,2,2,2, # d0 - d7 219 | 2,2,2,2,2,2,2,2, # d8 - df 220 | 2,2,2,2,2,2,2,2, # e0 - e7 221 | 2,2,2,2,2,2,2,2, # e8 - ef 222 | 2,2,2,2,2,2,2,2, # f0 - f7 223 | 2,2,2,2,2,2,2,2, # f8 - ff 224 | ) 225 | 226 | ISO2022KR_st = ( \ 227 | eStart, 3,eError,eStart,eStart,eStart,eError,eError,# 00-07 228 | eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f 229 | eItsMe,eItsMe,eError,eError,eError, 4,eError,eError,# 10-17 230 | eError,eError,eError,eError, 5,eError,eError,eError,# 18-1f 231 | eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27 232 | ) 233 | 234 | ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0) 235 | 236 | ISO2022KRSMModel = {'classTable': ISO2022KR_cls, 237 | 'classFactor': 6, 238 | 'stateTable': ISO2022KR_st, 239 | 'charLenTable': ISO2022KRCharLenTable, 240 | 'name': "ISO-2022-KR"} 241 | -------------------------------------------------------------------------------- /html5lib/chardet/eucjpprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import constants, sys 29 | from constants import eStart, eError, eItsMe 30 | from mbcharsetprober import MultiByteCharSetProber 31 | from codingstatemachine import CodingStateMachine 32 | from chardistribution import EUCJPDistributionAnalysis 33 | from jpcntx import EUCJPContextAnalysis 34 | from mbcssm import EUCJPSMModel 35 | 36 | class EUCJPProber(MultiByteCharSetProber): 37 | def __init__(self): 38 | MultiByteCharSetProber.__init__(self) 39 | self._mCodingSM = CodingStateMachine(EUCJPSMModel) 40 | self._mDistributionAnalyzer = EUCJPDistributionAnalysis() 41 | self._mContextAnalyzer = EUCJPContextAnalysis() 42 | self.reset() 43 | 44 | def reset(self): 45 | MultiByteCharSetProber.reset(self) 46 | self._mContextAnalyzer.reset() 47 | 48 | def get_charset_name(self): 49 | return "EUC-JP" 50 | 51 | def feed(self, aBuf): 52 | aLen = len(aBuf) 53 | for i in range(0, aLen): 54 | codingState = self._mCodingSM.next_state(aBuf[i]) 55 | if codingState == eError: 56 | if constants._debug: 57 | sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') 58 | self._mState = constants.eNotMe 59 | break 60 | elif codingState == eItsMe: 61 | self._mState = constants.eFoundIt 62 | break 63 | elif codingState == eStart: 64 | charLen = self._mCodingSM.get_current_charlen() 65 | if i == 0: 66 | self._mLastChar[1] = aBuf[0] 67 | self._mContextAnalyzer.feed(self._mLastChar, charLen) 68 | self._mDistributionAnalyzer.feed(self._mLastChar, charLen) 69 | else: 70 | self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen) 71 | self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) 72 | 73 | self._mLastChar[0] = aBuf[aLen - 1] 74 | 75 | if self.get_state() == constants.eDetecting: 76 | if self._mContextAnalyzer.got_enough_data() and \ 77 | (self.get_confidence() > constants.SHORTCUT_THRESHOLD): 78 | self._mState = constants.eFoundIt 79 | 80 | return self.get_state() 81 | 82 | def get_confidence(self): 83 | contxtCf = self._mContextAnalyzer.get_confidence() 84 | distribCf = self._mDistributionAnalyzer.get_confidence() 85 | return max(contxtCf, distribCf) 86 | -------------------------------------------------------------------------------- /html5lib/chardet/euckrprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from mbcharsetprober import MultiByteCharSetProber 29 | from codingstatemachine import CodingStateMachine 30 | from chardistribution import EUCKRDistributionAnalysis 31 | from mbcssm import EUCKRSMModel 32 | 33 | class EUCKRProber(MultiByteCharSetProber): 34 | def __init__(self): 35 | MultiByteCharSetProber.__init__(self) 36 | self._mCodingSM = CodingStateMachine(EUCKRSMModel) 37 | self._mDistributionAnalyzer = EUCKRDistributionAnalysis() 38 | self.reset() 39 | 40 | def get_charset_name(self): 41 | return "EUC-KR" 42 | -------------------------------------------------------------------------------- /html5lib/chardet/euctwprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from mbcharsetprober import MultiByteCharSetProber 29 | from codingstatemachine import CodingStateMachine 30 | from chardistribution import EUCTWDistributionAnalysis 31 | from mbcssm import EUCTWSMModel 32 | 33 | class EUCTWProber(MultiByteCharSetProber): 34 | def __init__(self): 35 | MultiByteCharSetProber.__init__(self) 36 | self._mCodingSM = CodingStateMachine(EUCTWSMModel) 37 | self._mDistributionAnalyzer = EUCTWDistributionAnalysis() 38 | self.reset() 39 | 40 | def get_charset_name(self): 41 | return "EUC-TW" 42 | -------------------------------------------------------------------------------- /html5lib/chardet/gb2312prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from mbcharsetprober import MultiByteCharSetProber 29 | from codingstatemachine import CodingStateMachine 30 | from chardistribution import GB2312DistributionAnalysis 31 | from mbcssm import GB2312SMModel 32 | 33 | class GB2312Prober(MultiByteCharSetProber): 34 | def __init__(self): 35 | MultiByteCharSetProber.__init__(self) 36 | self._mCodingSM = CodingStateMachine(GB2312SMModel) 37 | self._mDistributionAnalyzer = GB2312DistributionAnalysis() 38 | self.reset() 39 | 40 | def get_charset_name(self): 41 | return "GB2312" 42 | -------------------------------------------------------------------------------- /html5lib/chardet/latin1prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | from charsetprober import CharSetProber 30 | import constants 31 | import operator 32 | 33 | FREQ_CAT_NUM = 4 34 | 35 | UDF = 0 # undefined 36 | OTH = 1 # other 37 | ASC = 2 # ascii capital letter 38 | ASS = 3 # ascii small letter 39 | ACV = 4 # accent capital vowel 40 | ACO = 5 # accent capital other 41 | ASV = 6 # accent small vowel 42 | ASO = 7 # accent small other 43 | CLASS_NUM = 8 # total classes 44 | 45 | Latin1_CharToClass = ( \ 46 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 47 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F 48 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 49 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F 50 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 51 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F 52 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 53 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F 54 | OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 55 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F 56 | ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 57 | ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F 58 | OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 59 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F 60 | ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 61 | ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F 62 | OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH, # 80 - 87 63 | OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF, # 88 - 8F 64 | UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 90 - 97 65 | OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO, # 98 - 9F 66 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A0 - A7 67 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # A8 - AF 68 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 69 | OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B8 - BF 70 | ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO, # C0 - C7 71 | ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # C8 - CF 72 | ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH, # D0 - D7 73 | ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO, # D8 - DF 74 | ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO, # E0 - E7 75 | ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # E8 - EF 76 | ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH, # F0 - F7 77 | ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO, # F8 - FF 78 | ) 79 | 80 | # 0 : illegal 81 | # 1 : very unlikely 82 | # 2 : normal 83 | # 3 : very likely 84 | Latin1ClassModel = ( \ 85 | # UDF OTH ASC ASS ACV ACO ASV ASO 86 | 0, 0, 0, 0, 0, 0, 0, 0, # UDF 87 | 0, 3, 3, 3, 3, 3, 3, 3, # OTH 88 | 0, 3, 3, 3, 3, 3, 3, 3, # ASC 89 | 0, 3, 3, 3, 1, 1, 3, 3, # ASS 90 | 0, 3, 3, 3, 1, 2, 1, 2, # ACV 91 | 0, 3, 3, 3, 3, 3, 3, 3, # ACO 92 | 0, 3, 1, 3, 1, 1, 1, 3, # ASV 93 | 0, 3, 1, 3, 1, 1, 3, 3, # ASO 94 | ) 95 | 96 | class Latin1Prober(CharSetProber): 97 | def __init__(self): 98 | CharSetProber.__init__(self) 99 | self.reset() 100 | 101 | def reset(self): 102 | self._mLastCharClass = OTH 103 | self._mFreqCounter = [0] * FREQ_CAT_NUM 104 | CharSetProber.reset(self) 105 | 106 | def get_charset_name(self): 107 | return "windows-1252" 108 | 109 | def feed(self, aBuf): 110 | aBuf = self.filter_with_english_letters(aBuf) 111 | for c in aBuf: 112 | charClass = Latin1_CharToClass[ord(c)] 113 | freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass] 114 | if freq == 0: 115 | self._mState = constants.eNotMe 116 | break 117 | self._mFreqCounter[freq] += 1 118 | self._mLastCharClass = charClass 119 | 120 | return self.get_state() 121 | 122 | def get_confidence(self): 123 | if self.get_state() == constants.eNotMe: 124 | return 0.01 125 | 126 | total = reduce(operator.add, self._mFreqCounter) 127 | if total < 0.01: 128 | confidence = 0.0 129 | else: 130 | confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total) 131 | if confidence < 0.0: 132 | confidence = 0.0 133 | # lower the confidence of latin1 so that other more accurate detector 134 | # can take priority. 135 | confidence = confidence * 0.5 136 | return confidence 137 | -------------------------------------------------------------------------------- /html5lib/chardet/mbcharsetprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # Proofpoint, Inc. 13 | # 14 | # This library is free software; you can redistribute it and/or 15 | # modify it under the terms of the GNU Lesser General Public 16 | # License as published by the Free Software Foundation; either 17 | # version 2.1 of the License, or (at your option) any later version. 18 | # 19 | # This library is distributed in the hope that it will be useful, 20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 | # Lesser General Public License for more details. 23 | # 24 | # You should have received a copy of the GNU Lesser General Public 25 | # License along with this library; if not, write to the Free Software 26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 27 | # 02110-1301 USA 28 | ######################### END LICENSE BLOCK ######################### 29 | 30 | import constants, sys 31 | from constants import eStart, eError, eItsMe 32 | from charsetprober import CharSetProber 33 | 34 | class MultiByteCharSetProber(CharSetProber): 35 | def __init__(self): 36 | CharSetProber.__init__(self) 37 | self._mDistributionAnalyzer = None 38 | self._mCodingSM = None 39 | self._mLastChar = ['\x00', '\x00'] 40 | 41 | def reset(self): 42 | CharSetProber.reset(self) 43 | if self._mCodingSM: 44 | self._mCodingSM.reset() 45 | if self._mDistributionAnalyzer: 46 | self._mDistributionAnalyzer.reset() 47 | self._mLastChar = ['\x00', '\x00'] 48 | 49 | def get_charset_name(self): 50 | pass 51 | 52 | def feed(self, aBuf): 53 | aLen = len(aBuf) 54 | for i in range(0, aLen): 55 | codingState = self._mCodingSM.next_state(aBuf[i]) 56 | if codingState == eError: 57 | if constants._debug: 58 | sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') 59 | self._mState = constants.eNotMe 60 | break 61 | elif codingState == eItsMe: 62 | self._mState = constants.eFoundIt 63 | break 64 | elif codingState == eStart: 65 | charLen = self._mCodingSM.get_current_charlen() 66 | if i == 0: 67 | self._mLastChar[1] = aBuf[0] 68 | self._mDistributionAnalyzer.feed(self._mLastChar, charLen) 69 | else: 70 | self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen) 71 | 72 | self._mLastChar[0] = aBuf[aLen - 1] 73 | 74 | if self.get_state() == constants.eDetecting: 75 | if self._mDistributionAnalyzer.got_enough_data() and \ 76 | (self.get_confidence() > constants.SHORTCUT_THRESHOLD): 77 | self._mState = constants.eFoundIt 78 | 79 | return self.get_state() 80 | 81 | def get_confidence(self): 82 | return self._mDistributionAnalyzer.get_confidence() 83 | -------------------------------------------------------------------------------- /html5lib/chardet/mbcsgroupprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # Proofpoint, Inc. 13 | # 14 | # This library is free software; you can redistribute it and/or 15 | # modify it under the terms of the GNU Lesser General Public 16 | # License as published by the Free Software Foundation; either 17 | # version 2.1 of the License, or (at your option) any later version. 18 | # 19 | # This library is distributed in the hope that it will be useful, 20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 | # Lesser General Public License for more details. 23 | # 24 | # You should have received a copy of the GNU Lesser General Public 25 | # License along with this library; if not, write to the Free Software 26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 27 | # 02110-1301 USA 28 | ######################### END LICENSE BLOCK ######################### 29 | 30 | from charsetgroupprober import CharSetGroupProber 31 | from utf8prober import UTF8Prober 32 | from sjisprober import SJISProber 33 | from eucjpprober import EUCJPProber 34 | from gb2312prober import GB2312Prober 35 | from euckrprober import EUCKRProber 36 | from big5prober import Big5Prober 37 | from euctwprober import EUCTWProber 38 | 39 | class MBCSGroupProber(CharSetGroupProber): 40 | def __init__(self): 41 | CharSetGroupProber.__init__(self) 42 | self._mProbers = [ \ 43 | UTF8Prober(), 44 | SJISProber(), 45 | EUCJPProber(), 46 | GB2312Prober(), 47 | EUCKRProber(), 48 | Big5Prober(), 49 | EUCTWProber()] 50 | self.reset() 51 | -------------------------------------------------------------------------------- /html5lib/chardet/sbcharsetprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | import constants, sys 30 | from charsetprober import CharSetProber 31 | 32 | SAMPLE_SIZE = 64 33 | SB_ENOUGH_REL_THRESHOLD = 1024 34 | POSITIVE_SHORTCUT_THRESHOLD = 0.95 35 | NEGATIVE_SHORTCUT_THRESHOLD = 0.05 36 | SYMBOL_CAT_ORDER = 250 37 | NUMBER_OF_SEQ_CAT = 4 38 | POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1 39 | #NEGATIVE_CAT = 0 40 | 41 | class SingleByteCharSetProber(CharSetProber): 42 | def __init__(self, model, reversed=constants.False, nameProber=None): 43 | CharSetProber.__init__(self) 44 | self._mModel = model 45 | self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup 46 | self._mNameProber = nameProber # Optional auxiliary prober for name decision 47 | self.reset() 48 | 49 | def reset(self): 50 | CharSetProber.reset(self) 51 | self._mLastOrder = 255 # char order of last character 52 | self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT 53 | self._mTotalSeqs = 0 54 | self._mTotalChar = 0 55 | self._mFreqChar = 0 # characters that fall in our sampling range 56 | 57 | def get_charset_name(self): 58 | if self._mNameProber: 59 | return self._mNameProber.get_charset_name() 60 | else: 61 | return self._mModel['charsetName'] 62 | 63 | def feed(self, aBuf): 64 | if not self._mModel['keepEnglishLetter']: 65 | aBuf = self.filter_without_english_letters(aBuf) 66 | aLen = len(aBuf) 67 | if not aLen: 68 | return self.get_state() 69 | for c in aBuf: 70 | order = self._mModel['charToOrderMap'][ord(c)] 71 | if order < SYMBOL_CAT_ORDER: 72 | self._mTotalChar += 1 73 | if order < SAMPLE_SIZE: 74 | self._mFreqChar += 1 75 | if self._mLastOrder < SAMPLE_SIZE: 76 | self._mTotalSeqs += 1 77 | if not self._mReversed: 78 | self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1 79 | else: # reverse the order of the letters in the lookup 80 | self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1 81 | self._mLastOrder = order 82 | 83 | if self.get_state() == constants.eDetecting: 84 | if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD: 85 | cf = self.get_confidence() 86 | if cf > POSITIVE_SHORTCUT_THRESHOLD: 87 | if constants._debug: 88 | sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf)) 89 | self._mState = constants.eFoundIt 90 | elif cf < NEGATIVE_SHORTCUT_THRESHOLD: 91 | if constants._debug: 92 | sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD)) 93 | self._mState = constants.eNotMe 94 | 95 | return self.get_state() 96 | 97 | def get_confidence(self): 98 | r = 0.01 99 | if self._mTotalSeqs > 0: 100 | # print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio'] 101 | r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio'] 102 | # print r, self._mFreqChar, self._mTotalChar 103 | r = r * self._mFreqChar / self._mTotalChar 104 | if r >= 1.0: 105 | r = 0.99 106 | return r 107 | -------------------------------------------------------------------------------- /html5lib/chardet/sbcsgroupprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | import constants, sys 30 | from charsetgroupprober import CharSetGroupProber 31 | from sbcharsetprober import SingleByteCharSetProber 32 | from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model 33 | from langgreekmodel import Latin7GreekModel, Win1253GreekModel 34 | from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel 35 | from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel 36 | from langthaimodel import TIS620ThaiModel 37 | from langhebrewmodel import Win1255HebrewModel 38 | from hebrewprober import HebrewProber 39 | 40 | class SBCSGroupProber(CharSetGroupProber): 41 | def __init__(self): 42 | CharSetGroupProber.__init__(self) 43 | self._mProbers = [ \ 44 | SingleByteCharSetProber(Win1251CyrillicModel), 45 | SingleByteCharSetProber(Koi8rModel), 46 | SingleByteCharSetProber(Latin5CyrillicModel), 47 | SingleByteCharSetProber(MacCyrillicModel), 48 | SingleByteCharSetProber(Ibm866Model), 49 | SingleByteCharSetProber(Ibm855Model), 50 | SingleByteCharSetProber(Latin7GreekModel), 51 | SingleByteCharSetProber(Win1253GreekModel), 52 | SingleByteCharSetProber(Latin5BulgarianModel), 53 | SingleByteCharSetProber(Win1251BulgarianModel), 54 | SingleByteCharSetProber(Latin2HungarianModel), 55 | SingleByteCharSetProber(Win1250HungarianModel), 56 | SingleByteCharSetProber(TIS620ThaiModel), 57 | ] 58 | hebrewProber = HebrewProber() 59 | logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber) 60 | visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber) 61 | hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber) 62 | self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber]) 63 | 64 | self.reset() 65 | -------------------------------------------------------------------------------- /html5lib/chardet/sjisprober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | from mbcharsetprober import MultiByteCharSetProber 29 | from codingstatemachine import CodingStateMachine 30 | from chardistribution import SJISDistributionAnalysis 31 | from jpcntx import SJISContextAnalysis 32 | from mbcssm import SJISSMModel 33 | import constants, sys 34 | from constants import eStart, eError, eItsMe 35 | 36 | class SJISProber(MultiByteCharSetProber): 37 | def __init__(self): 38 | MultiByteCharSetProber.__init__(self) 39 | self._mCodingSM = CodingStateMachine(SJISSMModel) 40 | self._mDistributionAnalyzer = SJISDistributionAnalysis() 41 | self._mContextAnalyzer = SJISContextAnalysis() 42 | self.reset() 43 | 44 | def reset(self): 45 | MultiByteCharSetProber.reset(self) 46 | self._mContextAnalyzer.reset() 47 | 48 | def get_charset_name(self): 49 | return "SHIFT_JIS" 50 | 51 | def feed(self, aBuf): 52 | aLen = len(aBuf) 53 | for i in range(0, aLen): 54 | codingState = self._mCodingSM.next_state(aBuf[i]) 55 | if codingState == eError: 56 | if constants._debug: 57 | sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n') 58 | self._mState = constants.eNotMe 59 | break 60 | elif codingState == eItsMe: 61 | self._mState = constants.eFoundIt 62 | break 63 | elif codingState == eStart: 64 | charLen = self._mCodingSM.get_current_charlen() 65 | if i == 0: 66 | self._mLastChar[1] = aBuf[0] 67 | self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen) 68 | self._mDistributionAnalyzer.feed(self._mLastChar, charLen) 69 | else: 70 | self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen) 71 | self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen) 72 | 73 | self._mLastChar[0] = aBuf[aLen - 1] 74 | 75 | if self.get_state() == constants.eDetecting: 76 | if self._mContextAnalyzer.got_enough_data() and \ 77 | (self.get_confidence() > constants.SHORTCUT_THRESHOLD): 78 | self._mState = constants.eFoundIt 79 | 80 | return self.get_state() 81 | 82 | def get_confidence(self): 83 | contxtCf = self._mContextAnalyzer.get_confidence() 84 | distribCf = self._mDistributionAnalyzer.get_confidence() 85 | return max(contxtCf, distribCf) 86 | -------------------------------------------------------------------------------- /html5lib/chardet/test.py: -------------------------------------------------------------------------------- 1 | import sys, glob 2 | sys.path.insert(0, '..') 3 | from chardet.universaldetector import UniversalDetector 4 | 5 | count = 0 6 | u = UniversalDetector() 7 | for f in glob.glob(sys.argv[1]): 8 | print f.ljust(60), 9 | u.reset() 10 | for line in file(f, 'rb'): 11 | u.feed(line) 12 | if u.done: break 13 | u.close() 14 | result = u.result 15 | if result['encoding']: 16 | print result['encoding'], 'with confidence', result['confidence'] 17 | else: 18 | print '******** no result' 19 | count += 1 20 | print count, 'tests' 21 | -------------------------------------------------------------------------------- /html5lib/chardet/universaldetector.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is Mozilla Universal charset detector code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 2001 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # Shy Shalom - original C code 12 | # 13 | # This library is free software; you can redistribute it and/or 14 | # modify it under the terms of the GNU Lesser General Public 15 | # License as published by the Free Software Foundation; either 16 | # version 2.1 of the License, or (at your option) any later version. 17 | # 18 | # This library is distributed in the hope that it will be useful, 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 | # Lesser General Public License for more details. 22 | # 23 | # You should have received a copy of the GNU Lesser General Public 24 | # License along with this library; if not, write to the Free Software 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 26 | # 02110-1301 USA 27 | ######################### END LICENSE BLOCK ######################### 28 | 29 | import constants, sys 30 | from latin1prober import Latin1Prober # windows-1252 31 | from mbcsgroupprober import MBCSGroupProber # multi-byte character sets 32 | from sbcsgroupprober import SBCSGroupProber # single-byte character sets 33 | from escprober import EscCharSetProber # ISO-2122, etc. 34 | import re 35 | 36 | MINIMUM_THRESHOLD = 0.20 37 | ePureAscii = 0 38 | eEscAscii = 1 39 | eHighbyte = 2 40 | 41 | class UniversalDetector: 42 | def __init__(self): 43 | self._highBitDetector = re.compile(r'[\x80-\xFF]') 44 | self._escDetector = re.compile(r'(\033|~{)') 45 | self._mEscCharSetProber = None 46 | self._mCharSetProbers = [] 47 | self.reset() 48 | 49 | def reset(self): 50 | self.result = {'encoding': None, 'confidence': 0.0} 51 | self.done = constants.False 52 | self._mStart = constants.True 53 | self._mGotData = constants.False 54 | self._mInputState = ePureAscii 55 | self._mLastChar = '' 56 | if self._mEscCharSetProber: 57 | self._mEscCharSetProber.reset() 58 | for prober in self._mCharSetProbers: 59 | prober.reset() 60 | 61 | def feed(self, aBuf): 62 | if self.done: return 63 | 64 | aLen = len(aBuf) 65 | if not aLen: return 66 | 67 | if not self._mGotData: 68 | # If the data starts with BOM, we know it is UTF 69 | if aBuf[:3] == '\xEF\xBB\xBF': 70 | # EF BB BF UTF-8 with BOM 71 | self.result = {'encoding': "UTF-8", 'confidence': 1.0} 72 | elif aBuf[:4] == '\xFF\xFE\x00\x00': 73 | # FF FE 00 00 UTF-32, little-endian BOM 74 | self.result = {'encoding': "UTF-32LE", 'confidence': 1.0} 75 | elif aBuf[:4] == '\x00\x00\xFE\xFF': 76 | # 00 00 FE FF UTF-32, big-endian BOM 77 | self.result = {'encoding': "UTF-32BE", 'confidence': 1.0} 78 | elif aBuf[:4] == '\xFE\xFF\x00\x00': 79 | # FE FF 00 00 UCS-4, unusual octet order BOM (3412) 80 | self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0} 81 | elif aBuf[:4] == '\x00\x00\xFF\xFE': 82 | # 00 00 FF FE UCS-4, unusual octet order BOM (2143) 83 | self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0} 84 | elif aBuf[:2] == '\xFF\xFE': 85 | # FF FE UTF-16, little endian BOM 86 | self.result = {'encoding': "UTF-16LE", 'confidence': 1.0} 87 | elif aBuf[:2] == '\xFE\xFF': 88 | # FE FF UTF-16, big endian BOM 89 | self.result = {'encoding': "UTF-16BE", 'confidence': 1.0} 90 | 91 | self._mGotData = constants.True 92 | if self.result['encoding'] and (self.result['confidence'] > 0.0): 93 | self.done = constants.True 94 | return 95 | 96 | if self._mInputState == ePureAscii: 97 | if self._highBitDetector.search(aBuf): 98 | self._mInputState = eHighbyte 99 | elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf): 100 | self._mInputState = eEscAscii 101 | 102 | self._mLastChar = aBuf[-1] 103 | 104 | if self._mInputState == eEscAscii: 105 | if not self._mEscCharSetProber: 106 | self._mEscCharSetProber = EscCharSetProber() 107 | if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt: 108 | self.result = {'encoding': self._mEscCharSetProber.get_charset_name(), 109 | 'confidence': self._mEscCharSetProber.get_confidence()} 110 | self.done = constants.True 111 | elif self._mInputState == eHighbyte: 112 | if not self._mCharSetProbers: 113 | self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()] 114 | for prober in self._mCharSetProbers: 115 | if prober.feed(aBuf) == constants.eFoundIt: 116 | self.result = {'encoding': prober.get_charset_name(), 117 | 'confidence': prober.get_confidence()} 118 | self.done = constants.True 119 | break 120 | 121 | def close(self): 122 | if self.done: return 123 | if not self._mGotData: 124 | if constants._debug: 125 | sys.stderr.write('no data received!\n') 126 | return 127 | self.done = constants.True 128 | 129 | if self._mInputState == ePureAscii: 130 | self.result = {'encoding': 'ascii', 'confidence': 1.0} 131 | return self.result 132 | 133 | if self._mInputState == eHighbyte: 134 | proberConfidence = None 135 | maxProberConfidence = 0.0 136 | maxProber = None 137 | for prober in self._mCharSetProbers: 138 | if not prober: continue 139 | proberConfidence = prober.get_confidence() 140 | if proberConfidence > maxProberConfidence: 141 | maxProberConfidence = proberConfidence 142 | maxProber = prober 143 | if maxProber and (maxProberConfidence > MINIMUM_THRESHOLD): 144 | self.result = {'encoding': maxProber.get_charset_name(), 145 | 'confidence': maxProber.get_confidence()} 146 | return self.result 147 | 148 | if constants._debug: 149 | sys.stderr.write('no probers hit minimum threshhold\n') 150 | for prober in self._mCharSetProbers[0].mProbers: 151 | if not prober: continue 152 | sys.stderr.write('%s confidence = %s\n' % \ 153 | (prober.get_charset_name(), \ 154 | prober.get_confidence())) 155 | -------------------------------------------------------------------------------- /html5lib/chardet/utf8prober.py: -------------------------------------------------------------------------------- 1 | ######################## BEGIN LICENSE BLOCK ######################## 2 | # The Original Code is mozilla.org code. 3 | # 4 | # The Initial Developer of the Original Code is 5 | # Netscape Communications Corporation. 6 | # Portions created by the Initial Developer are Copyright (C) 1998 7 | # the Initial Developer. All Rights Reserved. 8 | # 9 | # Contributor(s): 10 | # Mark Pilgrim - port to Python 11 | # 12 | # This library is free software; you can redistribute it and/or 13 | # modify it under the terms of the GNU Lesser General Public 14 | # License as published by the Free Software Foundation; either 15 | # version 2.1 of the License, or (at your option) any later version. 16 | # 17 | # This library is distributed in the hope that it will be useful, 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 | # Lesser General Public License for more details. 21 | # 22 | # You should have received a copy of the GNU Lesser General Public 23 | # License along with this library; if not, write to the Free Software 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 | # 02110-1301 USA 26 | ######################### END LICENSE BLOCK ######################### 27 | 28 | import constants, sys 29 | from constants import eStart, eError, eItsMe 30 | from charsetprober import CharSetProber 31 | from codingstatemachine import CodingStateMachine 32 | from mbcssm import UTF8SMModel 33 | 34 | ONE_CHAR_PROB = 0.5 35 | 36 | class UTF8Prober(CharSetProber): 37 | def __init__(self): 38 | CharSetProber.__init__(self) 39 | self._mCodingSM = CodingStateMachine(UTF8SMModel) 40 | self.reset() 41 | 42 | def reset(self): 43 | CharSetProber.reset(self) 44 | self._mCodingSM.reset() 45 | self._mNumOfMBChar = 0 46 | 47 | def get_charset_name(self): 48 | return "utf-8" 49 | 50 | def feed(self, aBuf): 51 | for c in aBuf: 52 | codingState = self._mCodingSM.next_state(c) 53 | if codingState == eError: 54 | self._mState = constants.eNotMe 55 | break 56 | elif codingState == eItsMe: 57 | self._mState = constants.eFoundIt 58 | break 59 | elif codingState == eStart: 60 | if self._mCodingSM.get_current_charlen() >= 2: 61 | self._mNumOfMBChar += 1 62 | 63 | if self.get_state() == constants.eDetecting: 64 | if self.get_confidence() > constants.SHORTCUT_THRESHOLD: 65 | self._mState = constants.eFoundIt 66 | 67 | return self.get_state() 68 | 69 | def get_confidence(self): 70 | unlike = 0.99 71 | if self._mNumOfMBChar < 6: 72 | for i in range(0, self._mNumOfMBChar): 73 | unlike = unlike * ONE_CHAR_PROB 74 | return 1.0 - unlike 75 | else: 76 | return unlike 77 | -------------------------------------------------------------------------------- /html5lib/filters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/html5lib/filters/__init__.py -------------------------------------------------------------------------------- /html5lib/filters/_base.py: -------------------------------------------------------------------------------- 1 | 2 | class Filter(object): 3 | def __init__(self, source): 4 | self.source = source 5 | 6 | def __iter__(self): 7 | return iter(self.source) 8 | 9 | def __getattr__(self, name): 10 | return getattr(self.source, name) 11 | -------------------------------------------------------------------------------- /html5lib/filters/formfiller.py: -------------------------------------------------------------------------------- 1 | # 2 | # The goal is to finally have a form filler where you pass data for 3 | # each form, using the algorithm for "Seeding a form with initial values" 4 | # See http://www.whatwg.org/specs/web-forms/current-work/#seeding 5 | # 6 | 7 | import _base 8 | 9 | from html5lib.constants import spaceCharacters 10 | spaceCharacters = u"".join(spaceCharacters) 11 | 12 | class SimpleFilter(_base.Filter): 13 | def __init__(self, source, fieldStorage): 14 | _base.Filter.__init__(self, source) 15 | self.fieldStorage = fieldStorage 16 | 17 | def __iter__(self): 18 | field_indices = {} 19 | state = None 20 | field_name = None 21 | for token in _base.Filter.__iter__(self): 22 | type = token["type"] 23 | if type in ("StartTag", "EmptyTag"): 24 | name = token["name"].lower() 25 | if name == "input": 26 | field_name = None 27 | field_type = None 28 | input_value_index = -1 29 | input_checked_index = -1 30 | for i,(n,v) in enumerate(token["data"]): 31 | n = n.lower() 32 | if n == u"name": 33 | field_name = v.strip(spaceCharacters) 34 | elif n == u"type": 35 | field_type = v.strip(spaceCharacters) 36 | elif n == u"checked": 37 | input_checked_index = i 38 | elif n == u"value": 39 | input_value_index = i 40 | 41 | value_list = self.fieldStorage.getlist(field_name) 42 | field_index = field_indices.setdefault(field_name, 0) 43 | if field_index < len(value_list): 44 | value = value_list[field_index] 45 | else: 46 | value = "" 47 | 48 | if field_type in (u"checkbox", u"radio"): 49 | if value_list: 50 | if token["data"][input_value_index][1] == value: 51 | if input_checked_index < 0: 52 | token["data"].append((u"checked", u"")) 53 | field_indices[field_name] = field_index + 1 54 | elif input_checked_index >= 0: 55 | del token["data"][input_checked_index] 56 | 57 | elif field_type not in (u"button", u"submit", u"reset"): 58 | if input_value_index >= 0: 59 | token["data"][input_value_index] = (u"value", value) 60 | else: 61 | token["data"].append((u"value", value)) 62 | field_indices[field_name] = field_index + 1 63 | 64 | field_type = None 65 | field_name = None 66 | 67 | elif name == "textarea": 68 | field_type = "textarea" 69 | field_name = dict((token["data"])[::-1])["name"] 70 | 71 | elif name == "select": 72 | field_type = "select" 73 | attributes = dict(token["data"][::-1]) 74 | field_name = attributes.get("name") 75 | is_select_multiple = "multiple" in attributes 76 | is_selected_option_found = False 77 | 78 | elif field_type == "select" and field_name and name == "option": 79 | option_selected_index = -1 80 | option_value = None 81 | for i,(n,v) in enumerate(token["data"]): 82 | n = n.lower() 83 | if n == "selected": 84 | option_selected_index = i 85 | elif n == "value": 86 | option_value = v.strip(spaceCharacters) 87 | if option_value is None: 88 | raise NotImplementedError("