├── README.md
├── app.yaml
├── cron.py
├── cron.yaml
├── css
    ├── images
    │   ├── css-full-text-button.png
    │   ├── css-full-textindex.png
    │   ├── index-button.png
    │   ├── index.png
    │   ├── ui-bg_diagonals-thick_18_b81900_40x40.png
    │   ├── ui-bg_diagonals-thick_20_666666_40x40.png
    │   ├── ui-bg_flat_10_000000_40x100.png
    │   ├── ui-bg_glass_100_f6f6f6_1x400.png
    │   ├── ui-bg_glass_100_fdf5ce_1x400.png
    │   ├── ui-bg_glass_65_ffffff_1x400.png
    │   ├── ui-bg_gloss-wave_35_f6a828_500x100.png
    │   ├── ui-bg_highlight-soft_100_eeeeee_1x100.png
    │   ├── ui-bg_highlight-soft_75_ffe45c_1x100.png
    │   ├── ui-icons_222222_256x240.png
    │   ├── ui-icons_228ef1_256x240.png
    │   ├── ui-icons_ef8c08_256x240.png
    │   ├── ui-icons_ffd27a_256x240.png
    │   └── ui-icons_ffffff_256x240.png
    ├── jquery-ui-1.8.9.custom.css
    └── style.css
├── css_selector.py
├── doc
    ├── RSS 2.0 Specification (version 2.0.11).html
    ├── RSS 2.0 Specification (version 2.0.11)_files
    │   ├── abg-en-100c-000000.png
    │   ├── ads.html
    │   ├── expansion_embed.js
    │   ├── flowers.gif
    │   ├── left-arrow.gif
    │   ├── rss-advisory-board.gif
    │   ├── rss-icon.png
    │   ├── show_ads.js
    │   ├── show_ads_impl.js
    │   ├── test_domain.js
    │   └── urchin.js
    ├── cnbeta.xml
    ├── feed
    ├── sample-rss-2.xml
    └── test.xml
├── favicon.ico
├── feedformatter.py
├── feedparser.py
├── fetcher.py
├── fix_path.py
├── front.py
├── html5lib
    ├── __init__.py
    ├── chardet
    │   ├── __init__.py
    │   ├── big5freq.py
    │   ├── big5prober.py
    │   ├── chardistribution.py
    │   ├── charsetgroupprober.py
    │   ├── charsetprober.py
    │   ├── codingstatemachine.py
    │   ├── constants.py
    │   ├── escprober.py
    │   ├── escsm.py
    │   ├── eucjpprober.py
    │   ├── euckrfreq.py
    │   ├── euckrprober.py
    │   ├── euctwfreq.py
    │   ├── euctwprober.py
    │   ├── gb2312freq.py
    │   ├── gb2312prober.py
    │   ├── hebrewprober.py
    │   ├── jisfreq.py
    │   ├── jpcntx.py
    │   ├── langbulgarianmodel.py
    │   ├── langcyrillicmodel.py
    │   ├── langgreekmodel.py
    │   ├── langhebrewmodel.py
    │   ├── langhungarianmodel.py
    │   ├── langthaimodel.py
    │   ├── latin1prober.py
    │   ├── mbcharsetprober.py
    │   ├── mbcsgroupprober.py
    │   ├── mbcssm.py
    │   ├── sbcharsetprober.py
    │   ├── sbcsgroupprober.py
    │   ├── sjisprober.py
    │   ├── test.py
    │   ├── universaldetector.py
    │   └── utf8prober.py
    ├── constants.py
    ├── filters
    │   ├── __init__.py
    │   ├── _base.py
    │   ├── formfiller.py
    │   ├── fullurl.py
    │   ├── inject_meta_charset.py
    │   ├── lint.py
    │   ├── optionaltags.py
    │   ├── sanitizer.py
    │   └── whitespace.py
    ├── html5parser.py
    ├── ihatexml.py
    ├── inputstream.py
    ├── sanitizer.py
    ├── serializer
    │   ├── __init__.py
    │   ├── htmlserializer.py
    │   └── xhtmlserializer.py
    ├── tokenizer.py
    ├── tokenizer_old.py
    ├── treebuilders
    │   ├── __init__.py
    │   ├── _base.py
    │   ├── dom.py
    │   ├── etree.py
    │   ├── etree_lxml.py
    │   ├── simpletree.py
    │   └── soup.py
    ├── treewalkers
    │   ├── __init__.py
    │   ├── _base.py
    │   ├── dom.py
    │   ├── etree.py
    │   ├── genshistream.py
    │   ├── lxmletree.py
    │   ├── pulldom.py
    │   ├── simpletree.py
    │   └── soup.py
    └── utils.py
├── images
    ├── index.png
    ├── ui-bg_diagonals-thick_18_b81900_40x40.png
    ├── ui-bg_diagonals-thick_20_666666_40x40.png
    ├── ui-bg_flat_10_000000_40x100.png
    ├── ui-bg_glass_100_f6f6f6_1x400.png
    ├── ui-bg_glass_100_fdf5ce_1x400.png
    ├── ui-bg_glass_65_ffffff_1x400.png
    ├── ui-bg_gloss-wave_35_f6a828_500x100.png
    ├── ui-bg_highlight-soft_100_eeeeee_1x100.png
    ├── ui-bg_highlight-soft_75_ffe45c_1x100.png
    ├── ui-icons_222222_256x240.png
    ├── ui-icons_228ef1_256x240.png
    ├── ui-icons_ef8c08_256x240.png
    ├── ui-icons_ffd27a_256x240.png
    └── ui-icons_ffffff_256x240.png
├── index.yaml
├── main.py
├── project.py
├── queue.yaml
├── template
    ├── base.html
    ├── editor.html
    ├── index.html
    ├── search.html
    ├── style.css
    └── test.html
├── test.py
└── tmp


/README.md:
--------------------------------------------------------------------------------
 1 | # cssfulltext
 2 | source code of http://css-fulltext.appspot.com/
 3 | 
 4 | - 实时的 RSS 全文转换器
 5 | - 基于 CSS选择器 选择正文区域，去除广告
 6 | - 2011年旧代码
 7 | - 运行于 GAE
 8 | - 可能 Python 2.5
 9 | - 使用 Google 账户登录 （OpenID 2.0 将于 2015年 4月关闭）
10 | 


--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
 1 | application: css-fulltext
 2 | version: 1
 3 | runtime: python
 4 | api_version: 1
 5 | 
 6 | handlers:
 7 | - url: /cron/.*
 8 |   script: cron.py
 9 |   login: admin
10 | 
11 | - url: /worker/.*
12 |   script: cron.py
13 |   login: admin
14 | 
15 | - url: /css
16 |   static_dir: css
17 | 
18 | - url: /images
19 |   static_dir: images
20 | 
21 | - url: /.*
22 |   script: front.py
23 | 


--------------------------------------------------------------------------------
/cron.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #-- coding: utf-8 --
 3 | '''
 4 | Create on 2011.2.3
 5 | 
 6 | @author: binux
 7 | '''
 8 | 
 9 | import datetime
10 | 
11 | from google.appengine.api import taskqueue
12 | from google.appengine.ext import db
13 | from google.appengine.ext import webapp
14 | from google.appengine.ext.webapp.util import run_wsgi_app
15 | 
16 | from project import Project, updateProject
17 | from fetcher import DescriptionCache
18 | 
class updateProjects(webapp.RequestHandler):
    '''Cron handler: queue one update task for every project that is due.'''

    def get(self):
        # Keys-only query: projects whose nextUpdateDate is already in the past.
        now = datetime.datetime.now()
        due_keys = db.GqlQuery(
                "SELECT __key__ FROM Project WHERE nextUpdateDate < :1", now)
        for project_key in due_keys:
            # One task per project on the 'project' queue; the task POSTs the
            # numeric id to /worker/update_project.
            taskqueue.Task(
                    url='/worker/update_project',
                    params={'key': project_key.id()}).add('project')
25 | 
class removeCache(webapp.RequestHandler):
    '''
    Purge DescriptionCache entries that were not visited for 7 days.

    GET  (cron entry point) only enqueues a task for the worker URL.
    POST (task queue worker) deletes one bounded batch of stale entries and
    re-enqueues itself while full batches keep coming back.
    '''

    # Upper bound of keys fetched and deleted per task run.
    BATCH_SIZE = 500

    def get(self):
        taskqueue.add(url='/worker/remove_cache')

    def post(self):
        cutoff = datetime.datetime.now() - datetime.timedelta(days=7)
        query = db.GqlQuery(
                "SELECT __key__ FROM DescriptionCache WHERE lastVisitedDate < :1",
                cutoff)
        # A single bounded fetch replaces the former count() + fetch(count)
        # pair, which cost two datastore queries and relied on count()'s cap.
        stale_keys = query.fetch(self.BATCH_SIZE)
        if not stale_keys:
            return
        db.delete(stale_keys)
        if len(stale_keys) == self.BATCH_SIZE:
            # A full batch suggests more stale entries remain: continue in a
            # fresh task instead of stretching this request.
            taskqueue.add(url='/worker/remove_cache')
35 | 
class updateProjectWorker(webapp.RequestHandler):
    '''Task queue worker: refresh the project whose numeric id is in 'key'.'''

    def post(self):
        # Imported here because this module has no top-level logging import.
        import logging

        raw_key = self.request.get('key')
        # Request parameters arrive as strings, while get_by_id() expects the
        # numeric id that updateProjects put into the task.
        try:
            project_id = int(raw_key)
        except (TypeError, ValueError):
            logging.warning("Unknow project key: %s" % raw_key)
            return

        project = Project.get_by_id(project_id)
        if project:
            updateProject(project)
        else:
            logging.warning("Unknow project key: %s" % raw_key)
44 | 
def main():
    '''Serve the cron and worker URLs of this module as one WSGI application.'''
    # removeCache answers both the cron URL (GET) and the worker URL (POST).
    routes = [
            ('/cron/remove_cache', removeCache),
            ('/worker/remove_cache', removeCache),
            ('/cron/update_project', updateProjects),
            ('/worker/update_project', updateProjectWorker),
    ]
    application = webapp.WSGIApplication(routes, debug=True)
    run_wsgi_app(application)
52 | 
53 | if __name__ == '__main__':
54 |     main()
55 | 


--------------------------------------------------------------------------------
/cron.yaml:
--------------------------------------------------------------------------------
1 | cron:
2 | - description: daily summary job ( remove cache )
3 |   url: /cron/remove_cache
4 |   schedule: every 24 hours
5 | 


--------------------------------------------------------------------------------
/css/images/css-full-text-button.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/css-full-text-button.png


--------------------------------------------------------------------------------
/css/images/css-full-textindex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/css-full-textindex.png


--------------------------------------------------------------------------------
/css/images/index-button.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/index-button.png


--------------------------------------------------------------------------------
/css/images/index.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/index.png


--------------------------------------------------------------------------------
/css/images/ui-bg_diagonals-thick_18_b81900_40x40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_diagonals-thick_18_b81900_40x40.png


--------------------------------------------------------------------------------
/css/images/ui-bg_diagonals-thick_20_666666_40x40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_diagonals-thick_20_666666_40x40.png


--------------------------------------------------------------------------------
/css/images/ui-bg_flat_10_000000_40x100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_flat_10_000000_40x100.png


--------------------------------------------------------------------------------
/css/images/ui-bg_glass_100_f6f6f6_1x400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_glass_100_f6f6f6_1x400.png


--------------------------------------------------------------------------------
/css/images/ui-bg_glass_100_fdf5ce_1x400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_glass_100_fdf5ce_1x400.png


--------------------------------------------------------------------------------
/css/images/ui-bg_glass_65_ffffff_1x400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_glass_65_ffffff_1x400.png


--------------------------------------------------------------------------------
/css/images/ui-bg_gloss-wave_35_f6a828_500x100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_gloss-wave_35_f6a828_500x100.png


--------------------------------------------------------------------------------
/css/images/ui-bg_highlight-soft_100_eeeeee_1x100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_highlight-soft_100_eeeeee_1x100.png


--------------------------------------------------------------------------------
/css/images/ui-bg_highlight-soft_75_ffe45c_1x100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-bg_highlight-soft_75_ffe45c_1x100.png


--------------------------------------------------------------------------------
/css/images/ui-icons_222222_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_222222_256x240.png


--------------------------------------------------------------------------------
/css/images/ui-icons_228ef1_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_228ef1_256x240.png


--------------------------------------------------------------------------------
/css/images/ui-icons_ef8c08_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_ef8c08_256x240.png


--------------------------------------------------------------------------------
/css/images/ui-icons_ffd27a_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_ffd27a_256x240.png


--------------------------------------------------------------------------------
/css/images/ui-icons_ffffff_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/css/images/ui-icons_ffffff_256x240.png


--------------------------------------------------------------------------------
/css/style.css:
--------------------------------------------------------------------------------
  1 | /*
  2 | Site Name:      full-text
  3 | Developed By:   binux - 足兆叉虫
  4 | Date Created:   2011-2-5
  5 | Last Updated:   2011-2-5
  6 | Copyright:      GPLv3   <http://www.gnu.org/licenses/>
  7 | */
  8 | 
  9 | #mainContainer {
 10 |   margin: auto; /* center in viewport */
 11 |   width: 974px;
 12 |   font-size: 14px;
 13 |   height: 100%; /* for stickyfooter */
 14 | }
 15 | 
 16 | /* ....... header ........*/
 17 | #headerContent {
 18 |   width: 100%;
 19 | }
 20 | #headerContent span {
 21 |   margin: 0px 2px;
 22 | }
 23 | #headerRight {
 24 |   float: right;
 25 | }
 26 | #headerLeft {
 27 |   float: left;
 28 | }
 29 | 
 30 | /* ...... middle ...... */
 31 | /* ...... feedForm ...... */
 32 | #feedForm {
 33 |   width: 630px;
 34 |   height: 217px;
 35 |   margin: 10% auto;
 36 |   background: url("images/index.png") no-repeat top left;
 37 | }
 38 | #feedInputBox {
 39 |   position: relative;
 40 |   top: 90px;
 41 |   left: 96px;
 42 | }
 43 | #feedInput {
 44 |   float: left;
 45 |   display: block;
 46 |   width: 435px;
 47 |   height: 29px;
 48 |   background: #CFE2F3;
 49 |   border: 2px solid #073763;
 50 |   font-size: 25px;
 51 | }
 52 | #feedButton {
 53 |   float: left;
 54 |   margin-left: 20px;
 55 |   width: 77px;
 56 |   height: 35px;
 57 |   background: url("images/index-button.png") no-repeat top left;
 58 |   border: none;
 59 |   display: block;
 60 |   font-width: bold;
 61 | }
 62 | #feedButton:hover {
 63 |   background-position: 0px -34px;
 64 | }
 65 | #feedButton:active {
 66 |   background-position: 0px -68px;
 67 | }
 68 | /* ...... editorForm ...... */
 69 | #editorContent {
 70 |   border: 1px solid #36C;
 71 |   width: 80%;
 72 |   margin: 1em auto;
 73 | }
 74 | #editorTitle {
 75 |   padding: 0.3em 0 0.2em 0.5em;
 76 |   background: #E5ECF9;
 77 | }
 78 | #editorTitle > span {
 79 |   font-size: 120%;
 80 |   font-width: bold;
 81 |   line-height: 1.6em;
 82 | }
 83 | #editorTitle > a {
 84 |   margin-right: 5px;
 85 | }
 86 | #editorLeft {
 87 |   width: 39%;
 88 |   float: left;
 89 | }
 90 | #editorLeft > div {
 91 |   padding: 0px 5px;
 92 | }
 93 | #editorRight {
 94 |   width: 59%;
 95 |   float: right;
 96 | }
 97 | #editorRight > div {
 98 |   padding: 0px 5px;
 99 | }
100 | #editorButtonBox {
101 |   text-align: right;
102 |   margin: 0.7em 0px;
103 | }
104 | #editorButtonBox > div.float-right > * {
105 |   margin-left: 1em;
106 | }
107 | .inputArea {
108 |   margin: 0.7em 0px;
109 |   padding: 0.5em 2em 1em;
110 |   border: 1px solid #CCC;
111 |   background: #F6F6F6;
112 |   -moz-border-radius: 4px;
113 |   -webkit-border-radius: 4px;
114 |   border-radius: 4px;
115 | }
116 | .noticeArea {
117 |   border: 1px solid #CD0A0A;
118 |   background: #FEF1EC url(../images/?new=fef1ec&w=1&h=400&f=png&q=100&fltr[]=over|textures/02_glass.png|0|0|95) 50% 50% repeat-x;
119 | }
120 | /* ...... preview ...... */
121 | #previewContent {
122 |   border: 1px solid #36C;
123 |   margin: 1em auto;
124 | }
125 | #previewTitleBox {
126 |   padding: 0.3em 0 0.2em 0.5em;
127 |   background: #E5ECF9;
128 | }
#previewTitle {
  font-size: 120%;
  font-weight: bold; /* was "font-width", which is not a valid way to request bold text */
  line-height: 1.6em;
}
134 | #previewControls > a {
135 |   margin-right: 1em;
136 | }
137 | #previewFullText {
138 |   padding: 1em;
139 | }
140 | #previewCode {
141 |   width: 100%;
142 |   height: 10em;
143 |   margin: 1em auto;
144 | }
145 | /* ...... search ...... */
146 | .searchItem {
147 |   border: 1px solid #CCC;
148 |   border-top-width: 0px;
149 |   padding: 1em 2em;
150 |   display: block;
151 | }
152 | .searchItemFirst {
153 |   border-top-width: 1px;
154 | }
155 | .itemControls {
156 |   float: right;
157 | }
158 | .itemControls > a {
159 |   margin-left: 0.5em;
160 | }
161 | .starBox {
162 |   margin: 0.4em;
163 |   float: left;
164 | }
.titleRow {
  font-size: 1.5em;
  font-weight: bold; /* was "font-width", which is not a valid way to request bold text */
  float: left;
}
.itemLink {
  font-size: 0.7em;
  font-weight: normal; /* was "font-width"; resets the bold set on .titleRow */
  color: #CCC;
}
175 | .discriptionRow {
176 |   margin-top: 0.5em;
177 |   padding-right: 6em;
178 |   padding-top: 1px;
179 |   clear: both;
180 | }
181 | .authorRow {
182 |   margin-top: 0.5em;
183 |   clear: both;
184 |   text-align: right;
185 | }
186 | .newProject {
187 |   border: 1px solid #CCC;
188 |   padding: 1em;
189 |   margin: 2em;
190 |   display: block;
191 | }
192 | 
193 | /* ...... footer ...... */
194 | #applogo {
195 |   float: right;
196 | }
197 | /* ...... stickyfooter ...... */
198 | html, body {height: 100%;}
199 | #wrap {
200 |   min-height: 100%;
201 | }
202 | #mainContent {
203 |   overflow:auto;
204 | 	padding-bottom: 35px;
205 | }  /* must be same height as the footer */
206 | #footer {
207 |   position: relative;
208 | 	margin-top: -35px; /* negative value of footer height */
209 | 	height: 35px;
210 | 	clear:both;
211 | } 
212 | #footer-margin {
213 |   height: 55px;
214 | }
215 | /*Opera Fix*/
body:before {
	content:"";
	height:100%;
	float:left;
	width:0;
	margin-top:-32767px; /* stray "/" after this declaration removed */
}
223 | 
224 | /* ....... elements ....... */
225 | .line {
226 |   border-top: 1px solid #C9D7F1;
227 |   font-size: 1px;
228 |   height: 0;
229 |   width: 100%;
230 |   clear: both;
231 | }
232 | .button {
233 |   padding: 0.3em 0.5em;
234 |   float: left;
235 |   border: 1px solid #CCC;
236 |   background: #F6F6F6;
237 |   font-weight: bold;
238 |   color: #1C94C4;
239 | }
240 | a.button {
241 |   text-decoration: none;
242 | }
243 | a.button:hover, input.button:hover {
244 |   background: #DADADA;
245 |   border-color: #999;
246 | }
247 | a.button:active, input.button:active {
248 |   background: white;
249 |   border-color: #AAA;
250 | }
251 | .clear-both {
252 |   clear: both;
253 | }
254 | .float-right {
255 |   float: right;
256 | }
257 | .float-left {
258 |   float: left;
259 | }
260 | [readonly] {
261 |   background: lightgray;
262 | }
263 | 


--------------------------------------------------------------------------------
/css_selector.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | #-- coding: utf-8 --
  3 | '''
  4 | make minidom selected by css selector
  5 | 
  6 | Create on 2011.2.6
  7 | 
  8 | @author: binux
  9 | '''
 10 | 
 11 | import re
 12 | import string
 13 | 
 14 | from xml.dom import Node
 15 | def fixMiniDom():
 16 |     def getElementById(self, id):
 17 |         if id in self._id_cache:
 18 |              return self._id_cache[id]
 19 |         # cache id
 20 |         if not self._id_cache:
 21 |             for element in self.getElementsByTagName('*'):
 22 |                 if element.getAttribute("id"):
 23 |                     self._id_cache[element.getAttribute("id")] = element
 24 |             if id in self._id_cache:
 25 |                 return self._id_cache[id]
 26 | 
 27 |     def getElementsBySelector(self, all_selectors):
 28 |         selected = []
 29 | 
 30 |         # remove blanks in the right of >
 31 |         all_selectors = re.sub('>\s+', '>', all_selectors)
 32 | 
 33 |         # Grab all of the tagName elements within current context
 34 |         def getElements(context,tag):
 35 |             if not tag: tag = '*'
 36 | 
 37 |             # Get elements matching tag, filter them for class selector
 38 |             found = []
 39 |             for con in context:
 40 |                 eles = con.getElementsByTagName(tag)
 41 |                 found.extend(eles)
 42 | 
 43 |             return found
 44 | 
 45 |         context = [self, ]
 46 |         inheriters = string.split(all_selectors, " ")
 47 | 
 48 |         for element in inheriters:
 49 |             # take all
 50 |             m = re.match(r'^(>)?(\w+)?(#[a-zA-z0-9\-_]+)?(\.[a-zA-z0-9\-_]+)?(#[a-zA-z0-9\-_]+)?(\[(\w+)([=~!\|\^\$\*]?)=?[\'"]?([^\]\'"]*)[\'"]?\])?$', element)
 51 |             if (m):
 52 |                 _sub = m.group(1)
 53 |                 _tag = m.group(2)
 54 |                 _id = m.group(3) or m.group(5)
 55 |                 _class = m.group(4)
 56 |                 _css3 = m.group(6)
 57 |                 _attr = m.group(7)
 58 |                 _operator = m.group(8)
 59 |                 _value = m.group(9)
 60 |             else:
 61 |                 continue
 62 | 
 63 |             # fix id and class
 64 |             if _id: _id = _id[1:]
 65 |             if _class: _class = _class[1:]
 66 | 
 67 |             found = []
 68 |             if _sub:
 69 |                 for con in context:
 70 |                     for each in con.childNodes:
 71 |                         if each.nodeType == Node.ELEMENT_NODE:
 72 |                             found.append(each)
 73 |             elif _id:
 74 |                 ele = self.getElementById(_id)
 75 |                 if ele:
 76 |                     found = [ele, ]
 77 |             else:
 78 |                 found = getElements(context,_tag)
 79 |             
 80 |             # tag
 81 |             if _tag and _id: # as _id is not exist we get element by tag, so isn't neccessary to test this
 82 |                 tmp = []
 83 |                 for fnd in found:
 84 |                     if(fnd.tagName == _tag):
 85 |                         tmp.append(fnd)
 86 |                 found = tmp
 87 | 
 88 |             # id
 89 |             if _id: 
 90 |                 tmp = []
 91 |                 for fnd in found:
 92 |                     if(fnd.getAttribute("id") and (_id == fnd.getAttribute("id"))): 
 93 |                         tmp.append(fnd)
 94 |                 found = tmp
 95 | 
 96 |             # class
 97 |             if _class: 
 98 |                 tmp = []
 99 |                 for fnd in found:
100 |                     if(fnd.getAttribute("class") and (_class in fnd.getAttribute("class").split())): 
101 |                         tmp.append(fnd)
102 |                 found = tmp
103 | 
104 |             # css3
105 |             if _css3:
106 |                 tmp = []
107 |                 for fnd in found:
108 |                     if(_operator=='=' and fnd.getAttribute(_attr) != _value): continue
109 |                     if(_operator=='~' and not(re.search(r'(^|\\s)'+_value+'(\\s|$)',  fnd.getAttribute(_attr)))): continue
110 |                     if(_operator=='!' and re.search(r'(^|\\s)'+_value+'(\\s|$)',  fnd.getAttribute(_attr))): continue
111 |                     if(_operator=='|' and not(re.search(r'^'+_value+'-?', fnd.getAttribute(_attr)))): continue
112 |                     if(_operator=='^' and string.find(fnd.getAttribute(_attr), _value)!=0): continue
113 |                     if(_operator=='$' and string.rfind(fnd.getAttribute(_attr), _value) != (fnd.getAttribute(_attr).length-_value.length)): continue
114 |                     if(_operator=='*' and not(string.find(fnd.getAttribute(_attr), _value)+1)): continue
115 | 
116 |                     elif(not fnd.getAttribute(_attr)): continue
117 |                     tmp.append(fnd)
118 |                 found = tmp
119 | 
120 |             context = found
121 | 
122 |         selected.extend(context)
123 |         return selected
124 | 
125 |     from xml.dom import minidom
126 |     setattr(minidom.Element, '_id_cache', {})
127 |     setattr(minidom.Element, 'getElementById', getElementById)
128 |     setattr(minidom.Element, 'getElementsBySelector', getElementsBySelector)
129 |     setattr(minidom.Document, '_id_cache', {})
130 |     setattr(minidom.Document, 'getElementById', getElementById)
131 |     setattr(minidom.Document, 'getElementsBySelector', getElementsBySelector)
132 | 
133 | fixMiniDom()
134 | 


--------------------------------------------------------------------------------
/doc/RSS 2.0 Specification (version 2.0.11)_files/abg-en-100c-000000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/abg-en-100c-000000.png


--------------------------------------------------------------------------------
/doc/RSS 2.0 Specification (version 2.0.11)_files/flowers.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/flowers.gif


--------------------------------------------------------------------------------
/doc/RSS 2.0 Specification (version 2.0.11)_files/left-arrow.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/left-arrow.gif


--------------------------------------------------------------------------------
/doc/RSS 2.0 Specification (version 2.0.11)_files/rss-advisory-board.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/rss-advisory-board.gif


--------------------------------------------------------------------------------
/doc/RSS 2.0 Specification (version 2.0.11)_files/rss-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/doc/RSS 2.0 Specification (version 2.0.11)_files/rss-icon.png


--------------------------------------------------------------------------------
/doc/RSS 2.0 Specification (version 2.0.11)_files/test_domain.js:
--------------------------------------------------------------------------------
1 | (function(){window.google_new_domain_enabled=1;})()
2 | 


--------------------------------------------------------------------------------
/doc/sample-rss-2.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <rss version="2.0">
 3 |    <channel>
 4 |       <title>Liftoff News</title>
 5 |       <link>http://liftoff.msfc.nasa.gov/</link>
 6 |       <description>Liftoff to Space Exploration.</description>
 7 |       <language>en-us</language>
 8 |       <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
 9 |       <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>
10 |       <docs>http://blogs.law.harvard.edu/tech/rss</docs>
11 |       <generator>Weblog Editor 2.0</generator>
12 |       <managingEditor>editor@example.com</managingEditor>
13 |       <webMaster>webmaster@example.com</webMaster>
14 |       <item>
15 |          <title>Star City</title>
16 |          <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
17 |          <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>
18 |          <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
19 |          <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
20 |       </item>
21 |       <item>
22 |          <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.</description>
23 |          <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
24 |          <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
25 |       </item>
26 |       <item>
27 |          <title>The Engine That Does More</title>
28 |          <link>http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp</link>
29 |          <description>Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly.  The proposed VASIMR engine would do that.</description>
30 |          <pubDate>Tue, 27 May 2003 08:37:32 GMT</pubDate>
31 |          <guid>http://liftoff.msfc.nasa.gov/2003/05/27.html#item571</guid>
32 |       </item>
33 |       <item>
34 |          <title>Astronauts' Dirty Laundry</title>
35 |          <link>http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp</link>
36 |          <description>Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them.  Instead, astronauts have other options.</description>
37 |          <pubDate>Tue, 20 May 2003 08:56:02 GMT</pubDate>
38 |          <guid>http://liftoff.msfc.nasa.gov/2003/05/20.html#item570</guid>
39 |       </item>
40 |    </channel>
41 | </rss>


--------------------------------------------------------------------------------
/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/favicon.ico


--------------------------------------------------------------------------------
/fetcher.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #-- coding: utf-8 --
 3 | '''
 4 | Create on 2011.2.1
 5 | 
 6 | @author: binux
 7 | '''
 8 | 
 9 | import logging
10 | import hashlib
11 | import html5lib
12 | import css_selector
13 | 
14 | from html5lib.filters import fullurl
15 | 
16 | from google.appengine.ext import db
17 | from google.appengine.api import urlfetch
18 | 
# html5lib helpers shared by this module, built once at import time:
#   _parse     -- bound ``parse`` of an HTMLParser that uses the "dom" builder
#   _walker    -- tree walker class for the same "dom" tree type
#   _serialize -- bound ``serialize`` of an HTMLSerializer that keeps optional
#                 tags and quotes attribute values
_parse = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("dom")).parse
_walker = html5lib.treewalkers.getTreeWalker("dom")
_serialize = html5lib.serializer.htmlserializer.HTMLSerializer(
        quote_attr_values=True,
        omit_optional_tags=False).serialize
24 | 
class DescriptionCache(db.Model):
    """Datastore entity caching the description extracted for one link.

    ``fetch_description`` stores instances under a key name that is the MD5
    hex digest of the project id followed by the URL.
    """

    # URL the description was extracted from.
    link = db.StringProperty(required=True)
    # Numeric id of the project the link belongs to.
    project_id = db.IntegerProperty(required=True)
    # Extracted HTML fragment.
    description = db.TextProperty()

    # Defaults to 3; nothing in this module reads or updates it.
    retryCount = db.IntegerProperty(required=True, default=3)
    # Set once, when the entity is first stored.
    createdDate = db.DateTimeProperty(required=True, auto_now_add=True)
    # Refreshed on every put().
    lastVisitedDate = db.DateTimeProperty(required=True, auto_now=True)
33 | 
def _description_cache_key(project, url):
    """Return the DescriptionCache key name for *url* within *project*."""
    if isinstance(url, unicode):
        # md5() would implicitly ASCII-encode a unicode string and raise
        # UnicodeEncodeError on non-ASCII URLs.  Pure-ASCII URLs still hash
        # to exactly the same key as before.
        url = url.encode('utf-8')
    return hashlib.md5(str(project.key().id()) + url).hexdigest()


def fetch_description(url, project):
    """Return the description for *url*, using the datastore cache.

    On a cache hit the stored description is returned and the entity is
    re-saved so that its ``lastVisitedDate`` (``auto_now``) is refreshed.
    On a miss the page is fetched through ``real_fetch_description`` with the
    project's selectors and encoding; a non-empty result is stored in the
    cache, an empty one is returned without being cached.

    Exceptions raised by ``real_fetch_description`` propagate to the caller.
    """
    key_name = _description_cache_key(project, url)
    des_cache = DescriptionCache.get_by_key_name(key_name)
    if des_cache is not None:
        # Touch the entity: put() bumps lastVisitedDate.
        des_cache.put()
        return des_cache.description

    description = real_fetch_description(url, project.contentSelector,
                                         project.filterSelector,
                                         project.encoding)
    if description:
        # get_or_insert() already stores the entity it creates, so no
        # additional put() is needed afterwards.
        DescriptionCache.get_or_insert(key_name,
                                       link=url,
                                       project_id=project.key().id(),
                                       description=db.Text(description))
    return description
47 | 
def _split_selectors(selectors):
    """Return the non-blank, whitespace-stripped lines of *selectors*."""
    # ``or ''`` tolerates None; splitlines() also copes with "\r\n".
    lines = [line.strip() for line in (selectors or '').splitlines()]
    return [line for line in lines if line]


def _unique_in_order(nodes):
    """Return *nodes* without repeats, keeping first-seen order."""
    seen = set()
    unique = []
    for node in nodes:
        if node not in seen:
            seen.add(node)
            unique.append(node)
    return unique


def real_fetch_description(url, content_selector, filter_selector, encoding=None):
    """Fetch *url* and return the selected part of the page as HTML.

    *content_selector* and *filter_selector* each hold one CSS selector per
    line.  Every node matched by a content selector is kept; inside those
    nodes, everything matched by a filter selector is removed.  The remaining
    nodes are passed through the ``fullurl`` filter (given *url*) and
    serialised, in the order they were matched.

    Returns ``u''`` when the URL is invalid or nothing matches.
    Raises ``Exception`` when the response status is not 200.
    """
    try:
        response = urlfetch.fetch(url)
    except urlfetch.InvalidURLError:
        return u''
    if response.status_code != 200:
        raise Exception("status code: %s" % response.status_code)

    doc_dom = _parse(response.content, encoding=encoding)

    matched = []
    for selector in _split_selectors(content_selector):
        matched.extend(doc_dom.getElementsBySelector(selector))
    # De-duplicate without a plain set() so the output order is stable.
    content_dom = _unique_in_order(matched)

    filter_selectors = _split_selectors(filter_selector)
    unwanted = []
    for node in content_dom:
        for selector in filter_selectors:
            unwanted.extend(node.getElementsBySelector(selector))
    for node in _unique_in_order(unwanted):
        if node.parentNode:
            node.parentNode.removeChild(node)

    contents = []
    for node in content_dom:
        contents.extend(_serialize(fullurl.Filter(_walker(node), url)))
    return u''.join(contents)
80 | 


--------------------------------------------------------------------------------
/fix_path.py:
--------------------------------------------------------------------------------
import os
import sys

# Append the ``lib`` directory that sits next to this file to the import path.
_LIB_DIR = os.path.join(os.path.dirname(__file__), 'lib')
sys.path.append(_LIB_DIR)
5 | 


--------------------------------------------------------------------------------
/html5lib/__init__.py:
--------------------------------------------------------------------------------
 1 | """ 
 2 | HTML parsing library based on the WHATWG "HTML5"
 3 | specification. The parser is designed to be compatible with existing
 4 | HTML found in the wild and implements well-defined error recovery that
 5 | is largely compatible with modern desktop web browsers.
 6 | 
 7 | Example usage:
 8 | 
 9 | import html5lib
10 | f = open("my_document.html")
11 | tree = html5lib.parse(f) 
12 | """
13 | __version__ = "0.90"
14 | from html5parser import HTMLParser, parse, parseFragment
15 | from treebuilders import getTreeBuilder
16 | from treewalkers import getTreeWalker
17 | from serializer import serialize
18 | 


--------------------------------------------------------------------------------
/html5lib/chardet/__init__.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # This library is free software; you can redistribute it and/or
 3 | # modify it under the terms of the GNU Lesser General Public
 4 | # License as published by the Free Software Foundation; either
 5 | # version 2.1 of the License, or (at your option) any later version.
 6 | # 
 7 | # This library is distributed in the hope that it will be useful,
 8 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 9 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
10 | # Lesser General Public License for more details.
11 | # 
12 | # You should have received a copy of the GNU Lesser General Public
13 | # License along with this library; if not, write to the Free Software
14 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
15 | # 02110-1301  USA
16 | ######################### END LICENSE BLOCK #########################
17 | 
18 | __version__ = "2.0.1"
19 | 
def detect(aBuf):
    """Run a fresh ``UniversalDetector`` over *aBuf* and return its result.

    The detector is reset, fed the whole buffer in one call and closed; the
    value returned is whatever it left in its ``result`` attribute.
    """
    # Imported inside the function, as in the original, so importing this
    # package does not load the detector module.
    from universaldetector import UniversalDetector

    detector = UniversalDetector()
    detector.reset()
    detector.feed(aBuf)
    detector.close()
    return detector.result
27 | 


--------------------------------------------------------------------------------
/html5lib/chardet/big5prober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Communicator client code.
 3 | # 
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | # 
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from mbcharsetprober import MultiByteCharSetProber
29 | from codingstatemachine import CodingStateMachine
30 | from chardistribution import Big5DistributionAnalysis
31 | from mbcssm import Big5SMModel
32 | 
class Big5Prober(MultiByteCharSetProber):
    """Multi-byte prober configured with the Big5 state machine model and
    the Big5 character-distribution analyser."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mDistributionAnalyzer = Big5DistributionAnalysis()
        self._mCodingSM = CodingStateMachine(Big5SMModel)
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober looks for."""
        return "Big5"
42 | 


--------------------------------------------------------------------------------
/html5lib/chardet/chardistribution.py:
--------------------------------------------------------------------------------
  1 | ######################## BEGIN LICENSE BLOCK ########################
  2 | # The Original Code is Mozilla Communicator client code.
  3 | # 
  4 | # The Initial Developer of the Original Code is
  5 | # Netscape Communications Corporation.
  6 | # Portions created by the Initial Developer are Copyright (C) 1998
  7 | # the Initial Developer. All Rights Reserved.
  8 | # 
  9 | # Contributor(s):
 10 | #   Mark Pilgrim - port to Python
 11 | #
 12 | # This library is free software; you can redistribute it and/or
 13 | # modify it under the terms of the GNU Lesser General Public
 14 | # License as published by the Free Software Foundation; either
 15 | # version 2.1 of the License, or (at your option) any later version.
 16 | # 
 17 | # This library is distributed in the hope that it will be useful,
 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 20 | # Lesser General Public License for more details.
 21 | # 
 22 | # You should have received a copy of the GNU Lesser General Public
 23 | # License along with this library; if not, write to the Free Software
 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 25 | # 02110-1301  USA
 26 | ######################### END LICENSE BLOCK #########################
 27 | 
 28 | import constants
 29 | from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
 30 | from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
 31 | from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
 32 | from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
 33 | from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
 34 | 
 35 | ENOUGH_DATA_THRESHOLD = 1024
 36 | SURE_YES = 0.99
 37 | SURE_NO = 0.01
 38 | 
 39 | class CharDistributionAnalysis:
 40 |     def __init__(self):
 41 |         self._mCharToFreqOrder = None # Mapping table to get frequency order from char order (get from GetOrder())
 42 |         self._mTableSize = None # Size of above table
 43 |         self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence.  See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
 44 |         self.reset()
 45 |         
 46 |     def reset(self):
 47 |         """reset analyser, clear any state"""
 48 |         self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
 49 |         self._mTotalChars = 0 # Total characters encountered
 50 |         self._mFreqChars = 0 # The number of characters whose frequency order is less than 512
 51 | 
 52 |     def feed(self, aStr, aCharLen):
 53 |         """feed a character with known length"""
 54 |         if aCharLen == 2:
 55 |             # we only care about 2-bytes character in our distribution analysis
 56 |             order = self.get_order(aStr)
 57 |         else:
 58 |             order = -1
 59 |         if order >= 0:
 60 |             self._mTotalChars += 1
 61 |             # order is valid
 62 |             if order < self._mTableSize:
 63 |                 if 512 > self._mCharToFreqOrder[order]:
 64 |                     self._mFreqChars += 1
 65 | 
 66 |     def get_confidence(self):
 67 |         """return confidence based on existing data"""
 68 |         # if we didn't receive any character in our consideration range, return negative answer
 69 |         if self._mTotalChars <= 0:
 70 |             return SURE_NO
 71 | 
 72 |         if self._mTotalChars != self._mFreqChars:
 73 |             r = self._mFreqChars / ((self._mTotalChars - self._mFreqChars) * self._mTypicalDistributionRatio)
 74 |             if r < SURE_YES:
 75 |                 return r
 76 | 
 77 |         # normalize confidence (we don't want to be 100% sure)
 78 |         return SURE_YES
 79 | 
 80 |     def got_enough_data(self):
 81 |         # It is not necessary to receive all data to draw conclusion. For charset detection,
 82 |         # certain amount of data is enough
 83 |         return self._mTotalChars > ENOUGH_DATA_THRESHOLD
 84 | 
 85 |     def get_order(self, aStr):
 86 |         # We do not handle characters based on the original encoding string, but 
 87 |         # convert this encoding string to a number, here called order.
 88 |         # This allows multiple encodings of a language to share one frequency table.
 89 |         return -1
 90 |     
 91 | class EUCTWDistributionAnalysis(CharDistributionAnalysis):
 92 |     def __init__(self):
 93 |         CharDistributionAnalysis.__init__(self)
 94 |         self._mCharToFreqOrder = EUCTWCharToFreqOrder
 95 |         self._mTableSize = EUCTW_TABLE_SIZE
 96 |         self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
 97 | 
 98 |     def get_order(self, aStr):
 99 |         # for euc-TW encoding, we are interested 
100 |         #   first  byte range: 0xc4 -- 0xfe
101 |         #   second byte range: 0xa1 -- 0xfe
102 |         # no validation needed here. State machine has done that
103 |         if aStr[0] >= '\xC4':
104 |             return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1
105 |         else:
106 |             return -1
107 | 
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis backed by the EUC-KR frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTableSize = EUCKR_TABLE_SIZE
        self._mCharToFreqOrder = EUCKRCharToFreqOrder
        self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte EUC-KR character.

        Range of interest: first byte 0xb0 -- 0xfe, second byte 0xa1 -- 0xfe.
        A first byte below 0xb0 yields -1.  No further validation is done
        here; the state machine has already done it.
        """
        lead = ord(aStr[0])
        if lead < 0xB0:
            return -1
        return 94 * (lead - 0xB0) + ord(aStr[1]) - 0xA1
124 | 
class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis backed by the GB2312 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTableSize = GB2312_TABLE_SIZE
        self._mCharToFreqOrder = GB2312CharToFreqOrder
        self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte GB2312 character.

        Range of interest: first byte 0xb0 -- 0xfe, second byte 0xa1 -- 0xfe.
        A byte below its lower bound yields -1.  No further validation is
        done here; the state machine has already done it.
        """
        lead = ord(aStr[0])
        if lead < 0xB0:
            return -1
        # The second byte is only looked at once the first one qualified.
        trail = ord(aStr[1])
        if trail < 0xA1:
            return -1
        return 94 * (lead - 0xB0) + trail - 0xA1
141 | 
class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis backed by the Big5 frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTableSize = BIG5_TABLE_SIZE
        self._mCharToFreqOrder = Big5CharToFreqOrder
        self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte Big5 character.

        Range of interest: first byte 0xa4 -- 0xfe, second byte
        0x40 -- 0x7e or 0xa1 -- 0xfe.  A first byte below 0xa4 yields -1.
        No further validation is done here; the state machine has already
        done it.
        """
        lead = ord(aStr[0])
        if lead < 0xA4:
            return -1
        row_start = 157 * (lead - 0xA4)
        trail = ord(aStr[1])
        if trail >= 0xA1:
            # Upper trail-byte range is placed 63 cells into the row.
            return row_start + trail - 0xA1 + 63
        return row_start + trail - 0x40
161 | 
class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for Shift-JIS, backed by the JIS frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTableSize = JIS_TABLE_SIZE
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte Shift-JIS character.

        First bytes 0x81 -- 0x9f and 0xe0 -- 0xef are mapped; anything else
        yields -1.  Each first byte owns a row of 188 cells indexed by the
        second byte.  No validation is done here; the state machine has
        already done it.
        """
        lead = ord(aStr[0])
        if 0x81 <= lead <= 0x9F:
            order = 188 * (lead - 0x81)
        elif 0xE0 <= lead <= 0xEF:
            order = 188 * (lead - 0xE0 + 31)
        else:
            return -1
        trail = ord(aStr[1])
        order += trail - 0x40
        if trail > 0x7F:
            # Close the one-cell gap at 0x7F so the upper second-byte range
            # follows directly after 0x7E.  The previous ``order =- 1``
            # assigned -1 instead of decrementing, which threw away every
            # character with a second byte above 0x7F; Mozilla's original
            # C++ code uses ``order--`` at this point.
            order -= 1
        return order
184 | 
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analysis for EUC-JP, backed by the JIS frequency table."""

    def __init__(self):
        CharDistributionAnalysis.__init__(self)
        self._mTableSize = JIS_TABLE_SIZE
        self._mCharToFreqOrder = JISCharToFreqOrder
        self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, aStr):
        """Return the frequency-table index of a two-byte EUC-JP character.

        Range of interest: first byte 0xa0 -- 0xfe, second byte 0xa1 -- 0xfe.
        A first byte below 0xa0 yields -1.  No further validation is done
        here; the state machine has already done it.
        """
        lead = ord(aStr[0])
        if lead < 0xA0:
            return -1
        # Rows are counted from 0xA1 although 0xA0 passes the check above.
        return 94 * (lead - 0xA1) + ord(aStr[1]) - 0xa1
201 | 


--------------------------------------------------------------------------------
/html5lib/chardet/charsetgroupprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Communicator client code.
 3 | # 
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | # 
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | import constants, sys
29 | from charsetprober import CharSetProber
30 | 
class CharSetGroupProber(CharSetProber):
    """Prober that delegates to a list of child probers.

    Subclasses fill ``self._mProbers``.  Each buffer is fed to every child
    that is still active; the group reports the charset of the child that
    found a match or, failing that, of the most confident active child.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mActiveNum = 0
        self._mProbers = []
        self._mBestGuessProber = None

    def reset(self):
        """Reset the group and re-activate every child prober."""
        CharSetProber.reset(self)
        self._mActiveNum = 0
        for prober in self._mProbers:
            if prober:
                prober.reset()
                prober.active = True
                self._mActiveNum += 1
        self._mBestGuessProber = None

    def get_charset_name(self):
        """Return the best-guess child's charset name, or None if there is none."""
        if not self._mBestGuessProber:
            # get_confidence() picks the best guess as a side effect.
            self.get_confidence()
            if not self._mBestGuessProber:
                return None
        return self._mBestGuessProber.get_charset_name()

    def feed(self, aBuf):
        """Feed *aBuf* to every active child and return the group's state."""
        for prober in self._mProbers:
            if not prober or not prober.active:
                continue
            st = prober.feed(aBuf)
            if not st:
                continue
            if st == constants.eFoundIt:
                self._mBestGuessProber = prober
                # Record the hit on the group itself.  Without this the
                # returned state stayed eDetecting and the eFoundIt branch
                # in get_confidence() could never be reached.
                self._mState = constants.eFoundIt
                return self.get_state()
            elif st == constants.eNotMe:
                prober.active = False
                self._mActiveNum -= 1
                if self._mActiveNum <= 0:
                    # Every child ruled itself out.
                    self._mState = constants.eNotMe
                    return self.get_state()
        return self.get_state()

    def get_confidence(self):
        """Return the group's confidence and update the best-guess prober.

        0.99 once a child has found a match, 0.01 once all children gave up,
        otherwise the highest confidence among the active children (0.0 when
        none of them reports a confidence above zero).
        """
        st = self.get_state()
        if st == constants.eFoundIt:
            return 0.99
        elif st == constants.eNotMe:
            return 0.01
        bestConf = 0.0
        self._mBestGuessProber = None
        for prober in self._mProbers:
            if not prober:
                continue
            if not prober.active:
                if constants._debug:
                    sys.stderr.write(prober.get_charset_name() + ' not active\n')
                continue
            cf = prober.get_confidence()
            if constants._debug:
                sys.stderr.write('%s confidence = %s\n' % (prober.get_charset_name(), cf))
            if bestConf < cf:
                bestConf = cf
                self._mBestGuessProber = prober
        if not self._mBestGuessProber:
            return 0.0
        return bestConf
97 | 


--------------------------------------------------------------------------------
/html5lib/chardet/charsetprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | # 
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | # 
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #
13 | # This library is free software; you can redistribute it and/or
14 | # modify it under the terms of the GNU Lesser General Public
15 | # License as published by the Free Software Foundation; either
16 | # version 2.1 of the License, or (at your option) any later version.
17 | # 
18 | # This library is distributed in the hope that it will be useful,
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21 | # Lesser General Public License for more details.
22 | # 
23 | # You should have received a copy of the GNU Lesser General Public
24 | # License along with this library; if not, write to the Free Software
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 | # 02110-1301  USA
27 | ######################### END LICENSE BLOCK #########################
28 | 
29 | import constants, re
30 | 
class CharSetProber:
    """Base class for charset probers.

    Holds the detection state and supplies do-nothing defaults for the
    prober interface, plus helpers that blank out parts of a buffer.
    """

    def __init__(self):
        pass

    def reset(self):
        """Put the prober back into the "detecting" state."""
        self._mState = constants.eDetecting

    def get_charset_name(self):
        """Return the detected charset name; the base class has none."""
        return None

    def feed(self, aBuf):
        """Consume *aBuf*; the base class ignores it and returns None."""
        pass

    def get_state(self):
        """Return the current detection state (set by ``reset``)."""
        return self._mState

    def get_confidence(self):
        """Return the confidence; always 0.0 in the base class."""
        return 0.0

    def filter_high_bit_only(self, aBuf):
        """Return *aBuf* with each run of 0x00-0x7F characters replaced by one space."""
        return re.sub(r'([\x00-\x7F])+', ' ', aBuf)

    def filter_without_english_letters(self, aBuf):
        """Return *aBuf* with each run of ASCII letters replaced by one space."""
        return re.sub(r'([A-Za-z])+', ' ', aBuf)

    def filter_with_english_letters(self, aBuf):
        """Return *aBuf* unchanged (filtering is not implemented)."""
        # TODO
        return aBuf
61 | 


--------------------------------------------------------------------------------
/html5lib/chardet/codingstatemachine.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from constants import eStart, eError, eItsMe
29 | 
class CodingStateMachine:
    """Table-driven byte-sequence state machine.

    *sm* is a model mapping that provides 'classTable', 'charLenTable',
    'stateTable', 'classFactor' and 'name'.
    """

    def __init__(self, sm):
        self._mModel = sm
        self._mCurrentBytePos = 0
        self._mCurrentCharLen = 0
        self.reset()

    def reset(self):
        """Return to the start state."""
        self._mCurrentState = eStart

    def next_state(self, c):
        """Advance the machine by the single byte *c* and return the new state."""
        model = self._mModel
        # Every byte is first mapped to its class.
        byte_class = model['classTable'][ord(c)]
        if self._mCurrentState == eStart:
            # First byte of a character: its class also gives the length.
            self._mCurrentBytePos = 0
            self._mCurrentCharLen = model['charLenTable'][byte_class]
        # The state table is flat: one row of 'classFactor' cells per state.
        cell = self._mCurrentState * model['classFactor'] + byte_class
        self._mCurrentState = model['stateTable'][cell]
        self._mCurrentBytePos += 1
        return self._mCurrentState

    def get_current_charlen(self):
        """Return the byte length recorded for the current character."""
        return self._mCurrentCharLen

    def get_coding_state_machine(self):
        """Return the model's 'name' entry."""
        return self._mModel['name']
57 | 


--------------------------------------------------------------------------------
/html5lib/chardet/constants.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #
13 | # This library is free software; you can redistribute it and/or
14 | # modify it under the terms of the GNU Lesser General Public
15 | # License as published by the Free Software Foundation; either
16 | # version 2.1 of the License, or (at your option) any later version.
17 | # 
18 | # This library is distributed in the hope that it will be useful,
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21 | # Lesser General Public License for more details.
22 | # 
23 | # You should have received a copy of the GNU Lesser General Public
24 | # License along with this library; if not, write to the Free Software
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 | # 02110-1301  USA
27 | ######################### END LICENSE BLOCK #########################
28 | 
29 | _debug = 0
30 | 
31 | eDetecting = 0
32 | eFoundIt = 1
33 | eNotMe = 2
34 | 
35 | eStart = 0
36 | eError = 1
37 | eItsMe = 2
38 | 
39 | SHORTCUT_THRESHOLD = 0.95
40 | 
41 | import __builtin__
42 | if not hasattr(__builtin__, 'False'):
43 |     False = 0
44 |     True = 1
45 | else:
46 |     False = __builtin__.False
47 |     True = __builtin__.True
48 | 


--------------------------------------------------------------------------------
/html5lib/chardet/escprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | import constants, sys
29 | from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
30 | from charsetprober import CharSetProber
31 | from codingstatemachine import CodingStateMachine
32 | 
class EscCharSetProber(CharSetProber):
    """Prober that runs the HZ and ISO-2022-CN/JP/KR state machines side by
    side and reports the first one that recognises the input."""

    def __init__(self):
        CharSetProber.__init__(self)
        models = (HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel)
        self._mCodingSM = [CodingStateMachine(model) for model in models]
        self.reset()

    def reset(self):
        """Re-activate and reset every state machine and forget any result."""
        CharSetProber.reset(self)
        for codingSM in self._mCodingSM:
            if codingSM:
                codingSM.active = True
                codingSM.reset()
        self._mActiveSM = len(self._mCodingSM)
        self._mDetectedCharset = None

    def get_charset_name(self):
        """Return the detected charset name, or None if nothing matched yet."""
        return self._mDetectedCharset

    def get_confidence(self):
        """Return 0.99 once a charset has been detected, else 0.00."""
        if self._mDetectedCharset:
            return 0.99
        return 0.00

    def feed(self, aBuf):
        """Push *aBuf* through the active state machines, one byte at a time.

        A machine that reaches the error state is deactivated; when none is
        left the prober state becomes eNotMe.  The first machine that
        reaches eItsMe sets the state to eFoundIt and supplies the charset
        name.  Returns the prober state.
        """
        for c in aBuf:
            for codingSM in self._mCodingSM:
                if not codingSM or not codingSM.active:
                    continue
                codingState = codingSM.next_state(c)
                if codingState == constants.eItsMe:
                    self._mState = constants.eFoundIt
                    self._mDetectedCharset = codingSM.get_coding_state_machine()
                    return self.get_state()
                if codingState == constants.eError:
                    codingSM.active = False
                    self._mActiveSM -= 1
                    if self._mActiveSM <= 0:
                        self._mState = constants.eNotMe
                        return self.get_state()

        return self.get_state()
80 | 


--------------------------------------------------------------------------------
/html5lib/chardet/escsm.py:
--------------------------------------------------------------------------------
  1 | ######################## BEGIN LICENSE BLOCK ########################
  2 | # The Original Code is mozilla.org code.
  3 | #
  4 | # The Initial Developer of the Original Code is
  5 | # Netscape Communications Corporation.
  6 | # Portions created by the Initial Developer are Copyright (C) 1998
  7 | # the Initial Developer. All Rights Reserved.
  8 | #
  9 | # Contributor(s):
 10 | #   Mark Pilgrim - port to Python
 11 | #
 12 | # This library is free software; you can redistribute it and/or
 13 | # modify it under the terms of the GNU Lesser General Public
 14 | # License as published by the Free Software Foundation; either
 15 | # version 2.1 of the License, or (at your option) any later version.
 16 | # 
 17 | # This library is distributed in the hope that it will be useful,
 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 20 | # Lesser General Public License for more details.
 21 | # 
 22 | # You should have received a copy of the GNU Lesser General Public
 23 | # License along with this library; if not, write to the Free Software
 24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 25 | # 02110-1301  USA
 26 | ######################### END LICENSE BLOCK #########################
 27 | 
 28 | from constants import eStart, eError, eItsMe
 29 | 
# Byte-class table for HZ: one class number per byte value 0x00-0xff
# (32 rows of 8 entries; the trailing comment on each row is the byte range).
# ESC (0x1b), NUL and all bytes >= 0x80 are class 1; '{' (0x7b) is class 4,
# '}' (0x7d) is class 5 and '~' (0x7e) is class 2; everything else is class 0.
HZ_cls = ( \
1,0,0,0,0,0,0,0,  # 00 - 07
0,0,0,0,0,0,0,0,  # 08 - 0f
0,0,0,0,0,0,0,0,  # 10 - 17
0,0,0,1,0,0,0,0,  # 18 - 1f
0,0,0,0,0,0,0,0,  # 20 - 27
0,0,0,0,0,0,0,0,  # 28 - 2f
0,0,0,0,0,0,0,0,  # 30 - 37
0,0,0,0,0,0,0,0,  # 38 - 3f
0,0,0,0,0,0,0,0,  # 40 - 47
0,0,0,0,0,0,0,0,  # 48 - 4f
0,0,0,0,0,0,0,0,  # 50 - 57
0,0,0,0,0,0,0,0,  # 58 - 5f
0,0,0,0,0,0,0,0,  # 60 - 67
0,0,0,0,0,0,0,0,  # 68 - 6f
0,0,0,0,0,0,0,0,  # 70 - 77
0,0,0,4,0,5,2,0,  # 78 - 7f
1,1,1,1,1,1,1,1,  # 80 - 87
1,1,1,1,1,1,1,1,  # 88 - 8f
1,1,1,1,1,1,1,1,  # 90 - 97
1,1,1,1,1,1,1,1,  # 98 - 9f
1,1,1,1,1,1,1,1,  # a0 - a7
1,1,1,1,1,1,1,1,  # a8 - af
1,1,1,1,1,1,1,1,  # b0 - b7
1,1,1,1,1,1,1,1,  # b8 - bf
1,1,1,1,1,1,1,1,  # c0 - c7
1,1,1,1,1,1,1,1,  # c8 - cf
1,1,1,1,1,1,1,1,  # d0 - d7
1,1,1,1,1,1,1,1,  # d8 - df
1,1,1,1,1,1,1,1,  # e0 - e7
1,1,1,1,1,1,1,1,  # e8 - ef
1,1,1,1,1,1,1,1,  # f0 - f7
1,1,1,1,1,1,1,1,  # f8 - ff
)

# Flat state-transition table for HZ (48 entries; the row comments give the
# flat index range in hex).  Entries are either one of the eStart / eError /
# eItsMe constants or the number of an intermediate state.
# NOTE(review): presumably indexed as state * classFactor + byte class by
# CodingStateMachine -- confirm in codingstatemachine.py.
HZ_st = ( \
eStart,eError,     3,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eStart,eStart,     4,eError,# 10-17
     5,eError,     6,eError,     5,     5,     4,eError,# 18-1f
     4,eError,     4,     4,     4,eError,     4,eError,# 20-27
     4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart,# 28-2f
)

# Six zero entries, matching the classFactor of 6 below.
# NOTE(review): presumably a per-class character length -- confirm.
HZCharLenTable = (0, 0, 0, 0, 0, 0)

# Model dictionary bundling the tables above under the name "HZ-GB-2312";
# it is passed to CodingStateMachine by EscCharSetProber.
HZSMModel = {'classTable': HZ_cls,
             'classFactor': 6,
             'stateTable': HZ_st,
             'charLenTable': HZCharLenTable,
             'name': "HZ-GB-2312"}
 81 | 
# Byte-class table for ISO-2022-CN: one class number per byte value 0x00-0xff
# (32 rows of 8 entries; the trailing comment on each row is the byte range).
# ESC (0x1b) is class 1; NUL and all bytes >= 0x80 are class 2; ')' (0x29) is
# class 3 and 'C' (0x43) is class 4; everything else is class 0.
ISO2022CN_cls = ( \
2,0,0,0,0,0,0,0,  # 00 - 07
0,0,0,0,0,0,0,0,  # 08 - 0f
0,0,0,0,0,0,0,0,  # 10 - 17
0,0,0,1,0,0,0,0,  # 18 - 1f
0,0,0,0,0,0,0,0,  # 20 - 27
0,3,0,0,0,0,0,0,  # 28 - 2f
0,0,0,0,0,0,0,0,  # 30 - 37
0,0,0,0,0,0,0,0,  # 38 - 3f
0,0,0,4,0,0,0,0,  # 40 - 47
0,0,0,0,0,0,0,0,  # 48 - 4f
0,0,0,0,0,0,0,0,  # 50 - 57
0,0,0,0,0,0,0,0,  # 58 - 5f
0,0,0,0,0,0,0,0,  # 60 - 67
0,0,0,0,0,0,0,0,  # 68 - 6f
0,0,0,0,0,0,0,0,  # 70 - 77
0,0,0,0,0,0,0,0,  # 78 - 7f
2,2,2,2,2,2,2,2,  # 80 - 87
2,2,2,2,2,2,2,2,  # 88 - 8f
2,2,2,2,2,2,2,2,  # 90 - 97
2,2,2,2,2,2,2,2,  # 98 - 9f
2,2,2,2,2,2,2,2,  # a0 - a7
2,2,2,2,2,2,2,2,  # a8 - af
2,2,2,2,2,2,2,2,  # b0 - b7
2,2,2,2,2,2,2,2,  # b8 - bf
2,2,2,2,2,2,2,2,  # c0 - c7
2,2,2,2,2,2,2,2,  # c8 - cf
2,2,2,2,2,2,2,2,  # d0 - d7
2,2,2,2,2,2,2,2,  # d8 - df
2,2,2,2,2,2,2,2,  # e0 - e7
2,2,2,2,2,2,2,2,  # e8 - ef
2,2,2,2,2,2,2,2,  # f0 - f7
2,2,2,2,2,2,2,2,  # f8 - ff
)

# Flat state-transition table for ISO-2022-CN (64 entries; the row comments
# give the flat index range in hex).  Entries are either one of the eStart /
# eError / eItsMe constants or the number of an intermediate state.
# NOTE(review): presumably indexed as state * classFactor + byte class by
# CodingStateMachine -- confirm in codingstatemachine.py.
ISO2022CN_st = ( \
eStart,     3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eError,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eError,eError,eError,     4,eError,# 18-1f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 20-27
     5,     6,eError,eError,eError,eError,eError,eError,# 28-2f
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 30-37
eError,eError,eError,eError,eError,eItsMe,eError,eStart,# 38-3f
)

# Nine zero entries, matching the classFactor of 9 below.
# NOTE(review): presumably a per-class character length -- confirm.
ISO2022CNCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0)

# Model dictionary bundling the tables above under the name "ISO-2022-CN";
# it is passed to CodingStateMachine by EscCharSetProber.
ISO2022CNSMModel = {'classTable': ISO2022CN_cls,
                    'classFactor': 9,
                    'stateTable': ISO2022CN_st,
                    'charLenTable': ISO2022CNCharLenTable,
                    'name': "ISO-2022-CN"}
135 | 
# Byte-class table for ISO-2022-JP: one class number per byte value 0x00-0xff
# (32 rows of 8 entries; the trailing comment on each row is the byte range).
# ESC (0x1b) is class 1; NUL, 0x0e, 0x0f and all bytes >= 0x80 are class 2;
# '(' (0x28) is 3, 'B' (0x42) is 4, 'J' (0x4a) is 5, '@' (0x40) is 6,
# '$' (0x24) is 7, 'D' (0x44) is 8 and 'I' (0x49) is 9; the rest are class 0.
ISO2022JP_cls = ( \
2,0,0,0,0,0,0,0,  # 00 - 07
0,0,0,0,0,0,2,2,  # 08 - 0f
0,0,0,0,0,0,0,0,  # 10 - 17
0,0,0,1,0,0,0,0,  # 18 - 1f
0,0,0,0,7,0,0,0,  # 20 - 27
3,0,0,0,0,0,0,0,  # 28 - 2f
0,0,0,0,0,0,0,0,  # 30 - 37
0,0,0,0,0,0,0,0,  # 38 - 3f
6,0,4,0,8,0,0,0,  # 40 - 47
0,9,5,0,0,0,0,0,  # 48 - 4f
0,0,0,0,0,0,0,0,  # 50 - 57
0,0,0,0,0,0,0,0,  # 58 - 5f
0,0,0,0,0,0,0,0,  # 60 - 67
0,0,0,0,0,0,0,0,  # 68 - 6f
0,0,0,0,0,0,0,0,  # 70 - 77
0,0,0,0,0,0,0,0,  # 78 - 7f
2,2,2,2,2,2,2,2,  # 80 - 87
2,2,2,2,2,2,2,2,  # 88 - 8f
2,2,2,2,2,2,2,2,  # 90 - 97
2,2,2,2,2,2,2,2,  # 98 - 9f
2,2,2,2,2,2,2,2,  # a0 - a7
2,2,2,2,2,2,2,2,  # a8 - af
2,2,2,2,2,2,2,2,  # b0 - b7
2,2,2,2,2,2,2,2,  # b8 - bf
2,2,2,2,2,2,2,2,  # c0 - c7
2,2,2,2,2,2,2,2,  # c8 - cf
2,2,2,2,2,2,2,2,  # d0 - d7
2,2,2,2,2,2,2,2,  # d8 - df
2,2,2,2,2,2,2,2,  # e0 - e7
2,2,2,2,2,2,2,2,  # e8 - ef
2,2,2,2,2,2,2,2,  # f0 - f7
2,2,2,2,2,2,2,2,  # f8 - ff
)

# Flat state-transition table for ISO-2022-JP (72 entries; the row comments
# give the flat index range in hex).  Entries are either one of the eStart /
# eError / eItsMe constants or the number of an intermediate state.
# NOTE(review): presumably indexed as state * classFactor + byte class by
# CodingStateMachine -- confirm in codingstatemachine.py.
ISO2022JP_st = ( \
eStart,     3,eError,eStart,eStart,eStart,eStart,eStart,# 00-07
eStart,eStart,eError,eError,eError,eError,eError,eError,# 08-0f
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 10-17
eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,# 18-1f
eError,     5,eError,eError,eError,     4,eError,eError,# 20-27
eError,eError,eError,     6,eItsMe,eError,eItsMe,eError,# 28-2f
eError,eError,eError,eError,eError,eError,eItsMe,eItsMe,# 30-37
eError,eError,eError,eItsMe,eError,eError,eError,eError,# 38-3f
eError,eError,eError,eError,eItsMe,eError,eStart,eStart,# 40-47
)

# Ten zero entries, matching the classFactor of 10 below.
# NOTE(review): presumably a per-class character length -- confirm.
ISO2022JPCharLenTable = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

# Model dictionary bundling the tables above under the name "ISO-2022-JP";
# it is passed to CodingStateMachine by EscCharSetProber.
ISO2022JPSMModel = {'classTable': ISO2022JP_cls,
                    'classFactor': 10,
                    'stateTable': ISO2022JP_st,
                    'charLenTable': ISO2022JPCharLenTable,
                    'name': "ISO-2022-JP"}
190 | 
# Byte-class table for ISO-2022-KR: one class number per byte value 0x00-0xff
# (32 rows of 8 entries; the trailing comment on each row is the byte range).
# ESC (0x1b) is class 1; NUL and all bytes >= 0x80 are class 2; '$' (0x24) is
# class 3, ')' (0x29) is class 4 and 'C' (0x43) is class 5; the rest are 0.
ISO2022KR_cls = ( \
2,0,0,0,0,0,0,0,  # 00 - 07
0,0,0,0,0,0,0,0,  # 08 - 0f
0,0,0,0,0,0,0,0,  # 10 - 17
0,0,0,1,0,0,0,0,  # 18 - 1f
0,0,0,0,3,0,0,0,  # 20 - 27
0,4,0,0,0,0,0,0,  # 28 - 2f
0,0,0,0,0,0,0,0,  # 30 - 37
0,0,0,0,0,0,0,0,  # 38 - 3f
0,0,0,5,0,0,0,0,  # 40 - 47
0,0,0,0,0,0,0,0,  # 48 - 4f
0,0,0,0,0,0,0,0,  # 50 - 57
0,0,0,0,0,0,0,0,  # 58 - 5f
0,0,0,0,0,0,0,0,  # 60 - 67
0,0,0,0,0,0,0,0,  # 68 - 6f
0,0,0,0,0,0,0,0,  # 70 - 77
0,0,0,0,0,0,0,0,  # 78 - 7f
2,2,2,2,2,2,2,2,  # 80 - 87
2,2,2,2,2,2,2,2,  # 88 - 8f
2,2,2,2,2,2,2,2,  # 90 - 97
2,2,2,2,2,2,2,2,  # 98 - 9f
2,2,2,2,2,2,2,2,  # a0 - a7
2,2,2,2,2,2,2,2,  # a8 - af
2,2,2,2,2,2,2,2,  # b0 - b7
2,2,2,2,2,2,2,2,  # b8 - bf
2,2,2,2,2,2,2,2,  # c0 - c7
2,2,2,2,2,2,2,2,  # c8 - cf
2,2,2,2,2,2,2,2,  # d0 - d7
2,2,2,2,2,2,2,2,  # d8 - df
2,2,2,2,2,2,2,2,  # e0 - e7
2,2,2,2,2,2,2,2,  # e8 - ef
2,2,2,2,2,2,2,2,  # f0 - f7
2,2,2,2,2,2,2,2,  # f8 - ff
)

# Flat state-transition table for ISO-2022-KR (40 entries; the row comments
# give the flat index range in hex).  Entries are either one of the eStart /
# eError / eItsMe constants or the number of an intermediate state.
# NOTE(review): presumably indexed as state * classFactor + byte class by
# CodingStateMachine -- confirm in codingstatemachine.py.
ISO2022KR_st = ( \
eStart,     3,eError,eStart,eStart,eStart,eError,eError,# 00-07
eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,# 08-0f
eItsMe,eItsMe,eError,eError,eError,     4,eError,eError,# 10-17
eError,eError,eError,eError,     5,eError,eError,eError,# 18-1f
eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart,# 20-27
)

# Six zero entries, matching the classFactor of 6 below.
# NOTE(review): presumably a per-class character length -- confirm.
ISO2022KRCharLenTable = (0, 0, 0, 0, 0, 0)

# Model dictionary bundling the tables above under the name "ISO-2022-KR";
# it is passed to CodingStateMachine by EscCharSetProber.
ISO2022KRSMModel = {'classTable': ISO2022KR_cls,
                    'classFactor': 6,
                    'stateTable': ISO2022KR_st,
                    'charLenTable': ISO2022KRCharLenTable,
                    'name': "ISO-2022-KR"}
241 | 


--------------------------------------------------------------------------------
/html5lib/chardet/eucjpprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | import constants, sys
29 | from constants import eStart, eError, eItsMe
30 | from mbcharsetprober import MultiByteCharSetProber
31 | from codingstatemachine import CodingStateMachine
32 | from chardistribution import EUCJPDistributionAnalysis
33 | from jpcntx import EUCJPContextAnalysis
34 | from mbcssm import EUCJPSMModel
35 | 
class EUCJPProber(MultiByteCharSetProber):
    """Prober for EUC-JP.

    Combines a CodingStateMachine built from EUCJPSMModel with two
    analysers: a character-distribution analyser and a context analyser.
    The reported confidence is the larger of the two analysers' confidences.
    """

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(EUCJPSMModel)
        self._mDistributionAnalyzer = EUCJPDistributionAnalysis()
        self._mContextAnalyzer = EUCJPContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the inherited prober state and the context analyser."""
        MultiByteCharSetProber.reset(self)
        self._mContextAnalyzer.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober recognises."""
        return "EUC-JP"

    def feed(self, aBuf):
        """Feed a buffer to the state machine and both analysers.

        Returns the prober state (self.get_state()).  An empty buffer is
        accepted and leaves the remembered last byte untouched.
        """
        aLen = len(aBuf)
        for i in range(0, aLen):
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == eStart:
                # Back in the start state: a complete character just ended.
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The character may have begun in the previous buffer;
                    # pair the remembered byte with the first byte of this one.
                    self._mLastChar[1] = aBuf[0]
                    self._mContextAnalyzer.feed(self._mLastChar, charLen)
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mContextAnalyzer.feed(aBuf[i-1:i+1], charLen)
                    self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)

        # Remember the final byte for the next call.  Guarded so that an
        # empty buffer no longer raises IndexError on aBuf[-1].
        if aLen:
            self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            # Shortcut: stop early once the evidence is strong enough.
            if self._mContextAnalyzer.got_enough_data() and \
                   (self.get_confidence() > constants.SHORTCUT_THRESHOLD):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the larger of the context and distribution confidences."""
        contxtCf = self._mContextAnalyzer.get_confidence()
        distribCf = self._mDistributionAnalyzer.get_confidence()
        return max(contxtCf, distribCf)
86 | 


--------------------------------------------------------------------------------
/html5lib/chardet/euckrprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from mbcharsetprober import MultiByteCharSetProber
29 | from codingstatemachine import CodingStateMachine
30 | from chardistribution import EUCKRDistributionAnalysis
31 | from mbcssm import EUCKRSMModel
32 | 
class EUCKRProber(MultiByteCharSetProber):
    """Multi-byte prober configured for the EUC-KR encoding."""

    def __init__(self):
        """Install the EUC-KR state machine and distribution analyser, then reset."""
        MultiByteCharSetProber.__init__(self)
        state_machine = CodingStateMachine(EUCKRSMModel)
        analyzer = EUCKRDistributionAnalysis()
        self._mCodingSM = state_machine
        self._mDistributionAnalyzer = analyzer
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober recognises."""
        return "EUC-KR"
42 | 


--------------------------------------------------------------------------------
/html5lib/chardet/euctwprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from mbcharsetprober import MultiByteCharSetProber
29 | from codingstatemachine import CodingStateMachine
30 | from chardistribution import EUCTWDistributionAnalysis
31 | from mbcssm import EUCTWSMModel
32 | 
class EUCTWProber(MultiByteCharSetProber):
    """Multi-byte prober configured for the EUC-TW encoding."""

    def __init__(self):
        """Install the EUC-TW state machine and distribution analyser, then reset."""
        MultiByteCharSetProber.__init__(self)
        state_machine = CodingStateMachine(EUCTWSMModel)
        analyzer = EUCTWDistributionAnalysis()
        self._mCodingSM = state_machine
        self._mDistributionAnalyzer = analyzer
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober recognises."""
        return "EUC-TW"
42 | 


--------------------------------------------------------------------------------
/html5lib/chardet/gb2312prober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from mbcharsetprober import MultiByteCharSetProber
29 | from codingstatemachine import CodingStateMachine
30 | from chardistribution import GB2312DistributionAnalysis
31 | from mbcssm import GB2312SMModel
32 | 
class GB2312Prober(MultiByteCharSetProber):
    """Multi-byte prober configured for the GB2312 encoding."""

    def __init__(self):
        """Install the GB2312 state machine and distribution analyser, then reset."""
        MultiByteCharSetProber.__init__(self)
        state_machine = CodingStateMachine(GB2312SMModel)
        analyzer = GB2312DistributionAnalysis()
        self._mCodingSM = state_machine
        self._mDistributionAnalyzer = analyzer
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober recognises."""
        return "GB2312"
42 | 


--------------------------------------------------------------------------------
/html5lib/chardet/latin1prober.py:
--------------------------------------------------------------------------------
  1 | ######################## BEGIN LICENSE BLOCK ########################
  2 | # The Original Code is Mozilla Universal charset detector code.
  3 | #
  4 | # The Initial Developer of the Original Code is
  5 | # Netscape Communications Corporation.
  6 | # Portions created by the Initial Developer are Copyright (C) 2001
  7 | # the Initial Developer. All Rights Reserved.
  8 | #
  9 | # Contributor(s):
 10 | #   Mark Pilgrim - port to Python
 11 | #   Shy Shalom - original C code
 12 | #
 13 | # This library is free software; you can redistribute it and/or
 14 | # modify it under the terms of the GNU Lesser General Public
 15 | # License as published by the Free Software Foundation; either
 16 | # version 2.1 of the License, or (at your option) any later version.
 17 | # 
 18 | # This library is distributed in the hope that it will be useful,
 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 21 | # Lesser General Public License for more details.
 22 | # 
 23 | # You should have received a copy of the GNU Lesser General Public
 24 | # License along with this library; if not, write to the Free Software
 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 26 | # 02110-1301  USA
 27 | ######################### END LICENSE BLOCK #########################
 28 | 
 29 | from charsetprober import CharSetProber
 30 | import constants
 31 | import operator
 32 | 
 33 | FREQ_CAT_NUM = 4
 34 | 
 35 | UDF = 0 # undefined
 36 | OTH = 1 # other
 37 | ASC = 2 # ascii capital letter
 38 | ASS = 3 # ascii small letter
 39 | ACV = 4 # accent capital vowel
 40 | ACO = 5 # accent capital other
 41 | ASV = 6 # accent small vowel
 42 | ASO = 7 # accent small other
 43 | CLASS_NUM = 8 # total classes
 44 | 
# Maps each byte value 0x00-0xFF to one of the character classes defined
# above (32 rows of 8 entries; the trailing comment on each row is the byte
# range).  Latin1Prober.feed() looks a byte up here via ord().
Latin1_CharToClass = ( \
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 08 - 0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 10 - 17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 18 - 1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 20 - 27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 28 - 2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 30 - 37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 38 - 3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 40 - 47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 48 - 4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   # 50 - 57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   # 58 - 5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 60 - 67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 68 - 6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   # 70 - 77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   # 78 - 7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   # 80 - 87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   # 88 - 8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # 90 - 97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   # 98 - 9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A0 - A7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # A8 - AF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B0 - B7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   # B8 - BF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   # C0 - C7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   # C8 - CF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   # D0 - D7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   # D8 - DF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   # E0 - E7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   # E8 - EF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   # F0 - F7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   # F8 - FF
)
 79 | 
# Likelihood of one character class following another, stored as a flat
# CLASS_NUM x CLASS_NUM table.  Latin1Prober.feed() indexes it with
# (previous class * CLASS_NUM) + current class, so each row below is the
# previous class (named in the trailing comment) and each column is the
# current class (named in the header comment).
#
# Cell values:
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = ( \
# UDF OTH ASC ASS ACV ACO ASV ASO
   0,  0,  0,  0,  0,  0,  0,  0,  # UDF
   0,  3,  3,  3,  3,  3,  3,  3,  # OTH
   0,  3,  3,  3,  3,  3,  3,  3,  # ASC
   0,  3,  3,  3,  1,  1,  3,  3,  # ASS
   0,  3,  3,  3,  1,  2,  1,  2,  # ACV
   0,  3,  3,  3,  3,  3,  3,  3,  # ACO
   0,  3,  1,  3,  1,  1,  1,  3,  # ASV
   0,  3,  1,  3,  1,  1,  3,  3,  # ASO
)
 95 | 
 96 | class Latin1Prober(CharSetProber):
 97 |     def __init__(self):
 98 |         CharSetProber.__init__(self)
 99 |         self.reset()
100 | 
101 |     def reset(self):
102 |         self._mLastCharClass = OTH
103 |         self._mFreqCounter = [0] * FREQ_CAT_NUM
104 |         CharSetProber.reset(self)
105 | 
106 |     def get_charset_name(self):
107 |         return "windows-1252"
108 | 
109 |     def feed(self, aBuf):
110 |         aBuf = self.filter_with_english_letters(aBuf)
111 |         for c in aBuf:
112 |             charClass = Latin1_CharToClass[ord(c)]
113 |             freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass]
114 |             if freq == 0:
115 |                 self._mState = constants.eNotMe
116 |                 break
117 |             self._mFreqCounter[freq] += 1
118 |             self._mLastCharClass = charClass
119 | 
120 |         return self.get_state()
121 | 
122 |     def get_confidence(self):
123 |         if self.get_state() == constants.eNotMe:
124 |             return 0.01
125 |   
126 |         total = reduce(operator.add, self._mFreqCounter)
127 |         if total < 0.01:
128 |             confidence = 0.0
129 |         else:
130 |             confidence = (self._mFreqCounter[3] / total) - (self._mFreqCounter[1] * 20.0 / total)
131 |         if confidence < 0.0:
132 |             confidence = 0.0
133 |         # lower the confidence of latin1 so that other more accurate detector 
134 |         # can take priority.
135 |         confidence = confidence * 0.5
136 |         return confidence
137 | 


--------------------------------------------------------------------------------
/html5lib/chardet/mbcharsetprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #   Proofpoint, Inc.
13 | #
14 | # This library is free software; you can redistribute it and/or
15 | # modify it under the terms of the GNU Lesser General Public
16 | # License as published by the Free Software Foundation; either
17 | # version 2.1 of the License, or (at your option) any later version.
18 | # 
19 | # This library is distributed in the hope that it will be useful,
20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22 | # Lesser General Public License for more details.
23 | # 
24 | # You should have received a copy of the GNU Lesser General Public
25 | # License along with this library; if not, write to the Free Software
26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 | # 02110-1301  USA
28 | ######################### END LICENSE BLOCK #########################
29 | 
30 | import constants, sys
31 | from constants import eStart, eError, eItsMe
32 | from charsetprober import CharSetProber
33 | 
class MultiByteCharSetProber(CharSetProber):
    """Base class for probers of multi-byte encodings.

    Subclasses are expected to set self._mCodingSM (a coding state machine)
    and self._mDistributionAnalyzer before calling reset(), and to override
    get_charset_name().
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mDistributionAnalyzer = None
        self._mCodingSM = None
        # [last byte of the previous buffer, first byte of the current one]
        self._mLastChar = ['\x00', '\x00']

    def reset(self):
        """Reset the prober, its state machine and its analyser if present."""
        CharSetProber.reset(self)
        if self._mCodingSM:
            self._mCodingSM.reset()
        if self._mDistributionAnalyzer:
            self._mDistributionAnalyzer.reset()
        self._mLastChar = ['\x00', '\x00']

    def get_charset_name(self):
        """Placeholder returning None; subclasses return the charset name."""
        pass

    def feed(self, aBuf):
        """Feed a buffer to the state machine and the distribution analyser.

        Returns the prober state (self.get_state()).  An empty buffer is
        accepted and leaves the remembered last byte untouched.
        """
        aLen = len(aBuf)
        for i in range(0, aLen):
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == eStart:
                # Back in the start state: a complete character just ended.
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The character may have begun in the previous buffer;
                    # pair the remembered byte with the first byte of this one.
                    self._mLastChar[1] = aBuf[0]
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)

        # Remember the final byte for the next call.  Guarded so that an
        # empty buffer no longer raises IndexError on aBuf[-1].
        if aLen:
            self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            # Shortcut: stop early once the evidence is strong enough.
            if self._mDistributionAnalyzer.got_enough_data() and \
               (self.get_confidence() > constants.SHORTCUT_THRESHOLD):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the distribution analyser's confidence."""
        return self._mDistributionAnalyzer.get_confidence()
83 | 


--------------------------------------------------------------------------------
/html5lib/chardet/mbcsgroupprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #   Proofpoint, Inc.
13 | #
14 | # This library is free software; you can redistribute it and/or
15 | # modify it under the terms of the GNU Lesser General Public
16 | # License as published by the Free Software Foundation; either
17 | # version 2.1 of the License, or (at your option) any later version.
18 | # 
19 | # This library is distributed in the hope that it will be useful,
20 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22 | # Lesser General Public License for more details.
23 | # 
24 | # You should have received a copy of the GNU Lesser General Public
25 | # License along with this library; if not, write to the Free Software
26 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27 | # 02110-1301  USA
28 | ######################### END LICENSE BLOCK #########################
29 | 
30 | from charsetgroupprober import CharSetGroupProber
31 | from utf8prober import UTF8Prober
32 | from sjisprober import SJISProber
33 | from eucjpprober import EUCJPProber
34 | from gb2312prober import GB2312Prober
35 | from euckrprober import EUCKRProber
36 | from big5prober import Big5Prober
37 | from euctwprober import EUCTWProber
38 | 
class MBCSGroupProber(CharSetGroupProber):
    """Group prober holding one prober per supported multi-byte encoding."""

    def __init__(self):
        """Instantiate the member probers in a fixed order, then reset."""
        CharSetGroupProber.__init__(self)
        prober_types = (
            UTF8Prober,
            SJISProber,
            EUCJPProber,
            GB2312Prober,
            EUCKRProber,
            Big5Prober,
            EUCTWProber,
        )
        self._mProbers = [prober_type() for prober_type in prober_types]
        self.reset()
51 | 


--------------------------------------------------------------------------------
/html5lib/chardet/sbcharsetprober.py:
--------------------------------------------------------------------------------
  1 | ######################## BEGIN LICENSE BLOCK ########################
  2 | # The Original Code is Mozilla Universal charset detector code.
  3 | #
  4 | # The Initial Developer of the Original Code is
  5 | # Netscape Communications Corporation.
  6 | # Portions created by the Initial Developer are Copyright (C) 2001
  7 | # the Initial Developer. All Rights Reserved.
  8 | #
  9 | # Contributor(s):
 10 | #   Mark Pilgrim - port to Python
 11 | #   Shy Shalom - original C code
 12 | #
 13 | # This library is free software; you can redistribute it and/or
 14 | # modify it under the terms of the GNU Lesser General Public
 15 | # License as published by the Free Software Foundation; either
 16 | # version 2.1 of the License, or (at your option) any later version.
 17 | # 
 18 | # This library is distributed in the hope that it will be useful,
 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 21 | # Lesser General Public License for more details.
 22 | # 
 23 | # You should have received a copy of the GNU Lesser General Public
 24 | # License along with this library; if not, write to the Free Software
 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 26 | # 02110-1301  USA
 27 | ######################### END LICENSE BLOCK #########################
 28 | 
 29 | import constants, sys
 30 | from charsetprober import CharSetProber
 31 | 
# Character orders below SAMPLE_SIZE are counted as frequent characters and
# are used to index the precedence matrix (row stride SAMPLE_SIZE).
SAMPLE_SIZE = 64
# feed() applies its confidence shortcuts only once more sequences than this
# have been counted.
SB_ENOUGH_REL_THRESHOLD = 1024
# Confidence above which feed() switches the prober to constants.eFoundIt.
POSITIVE_SHORTCUT_THRESHOLD = 0.95
# Confidence below which feed() switches the prober to constants.eNotMe.
NEGATIVE_SHORTCUT_THRESHOLD = 0.05
# Character orders below this value are included in the total character count.
SYMBOL_CAT_ORDER = 250
# Number of sequence categories, i.e. the length of the per-prober counter list.
NUMBER_OF_SEQ_CAT = 4
# Index of the counter that get_confidence() reads as the "positive" category.
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
#NEGATIVE_CAT = 0
 40 |  
 41 | class SingleByteCharSetProber(CharSetProber):
 42 |     def __init__(self, model, reversed=constants.False, nameProber=None):
 43 |         CharSetProber.__init__(self)
 44 |         self._mModel = model
 45 |         self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup
 46 |         self._mNameProber = nameProber # Optional auxiliary prober for name decision
 47 |         self.reset()
 48 | 
 49 |     def reset(self):
 50 |         CharSetProber.reset(self)
 51 |         self._mLastOrder = 255 # char order of last character
 52 |         self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
 53 |         self._mTotalSeqs = 0
 54 |         self._mTotalChar = 0
 55 |         self._mFreqChar = 0 # characters that fall in our sampling range
 56 | 
 57 |     def get_charset_name(self):
 58 |         if self._mNameProber:
 59 |             return self._mNameProber.get_charset_name()
 60 |         else:
 61 |             return self._mModel['charsetName']
 62 | 
 63 |     def feed(self, aBuf):
 64 |         if not self._mModel['keepEnglishLetter']:
 65 |             aBuf = self.filter_without_english_letters(aBuf)
 66 |         aLen = len(aBuf)
 67 |         if not aLen:
 68 |             return self.get_state()
 69 |         for c in aBuf:
 70 |             order = self._mModel['charToOrderMap'][ord(c)]
 71 |             if order < SYMBOL_CAT_ORDER:
 72 |                 self._mTotalChar += 1
 73 |             if order < SAMPLE_SIZE:
 74 |                 self._mFreqChar += 1
 75 |                 if self._mLastOrder < SAMPLE_SIZE:
 76 |                     self._mTotalSeqs += 1
 77 |                     if not self._mReversed:
 78 |                         self._mSeqCounters[self._mModel['precedenceMatrix'][(self._mLastOrder * SAMPLE_SIZE) + order]] += 1
 79 |                     else: # reverse the order of the letters in the lookup
 80 |                         self._mSeqCounters[self._mModel['precedenceMatrix'][(order * SAMPLE_SIZE) + self._mLastOrder]] += 1
 81 |             self._mLastOrder = order
 82 | 
 83 |         if self.get_state() == constants.eDetecting:
 84 |             if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD:
 85 |                 cf = self.get_confidence()
 86 |                 if cf > POSITIVE_SHORTCUT_THRESHOLD:
 87 |                     if constants._debug:
 88 |                         sys.stderr.write('%s confidence = %s, we have a winner\n' % (self._mModel['charsetName'], cf))
 89 |                     self._mState = constants.eFoundIt
 90 |                 elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
 91 |                     if constants._debug:
 92 |                         sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (self._mModel['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
 93 |                     self._mState = constants.eNotMe
 94 | 
 95 |         return self.get_state()
 96 | 
 97 |     def get_confidence(self):
 98 |         r = 0.01
 99 |         if self._mTotalSeqs > 0:
100 | #            print self._mSeqCounters[POSITIVE_CAT], self._mTotalSeqs, self._mModel['mTypicalPositiveRatio']
101 |             r = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs / self._mModel['mTypicalPositiveRatio']
102 | #            print r, self._mFreqChar, self._mTotalChar
103 |             r = r * self._mFreqChar / self._mTotalChar
104 |             if r >= 1.0:
105 |                 r = 0.99
106 |         return r
107 | 


--------------------------------------------------------------------------------
/html5lib/chardet/sbcsgroupprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is Mozilla Universal charset detector code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 2001
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #   Shy Shalom - original C code
12 | #
13 | # This library is free software; you can redistribute it and/or
14 | # modify it under the terms of the GNU Lesser General Public
15 | # License as published by the Free Software Foundation; either
16 | # version 2.1 of the License, or (at your option) any later version.
17 | # 
18 | # This library is distributed in the hope that it will be useful,
19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21 | # Lesser General Public License for more details.
22 | # 
23 | # You should have received a copy of the GNU Lesser General Public
24 | # License along with this library; if not, write to the Free Software
25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 | # 02110-1301  USA
27 | ######################### END LICENSE BLOCK #########################
28 | 
29 | import constants, sys
30 | from charsetgroupprober import CharSetGroupProber
31 | from sbcharsetprober import SingleByteCharSetProber
32 | from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model
33 | from langgreekmodel import Latin7GreekModel, Win1253GreekModel
34 | from langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
35 | from langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
36 | from langthaimodel import TIS620ThaiModel
37 | from langhebrewmodel import Win1255HebrewModel
38 | from hebrewprober import HebrewProber
39 | 
40 | class SBCSGroupProber(CharSetGroupProber):
41 |     def __init__(self):
42 |         CharSetGroupProber.__init__(self)
43 |         self._mProbers = [ \
44 |             SingleByteCharSetProber(Win1251CyrillicModel),
45 |             SingleByteCharSetProber(Koi8rModel),
46 |             SingleByteCharSetProber(Latin5CyrillicModel),
47 |             SingleByteCharSetProber(MacCyrillicModel),
48 |             SingleByteCharSetProber(Ibm866Model),
49 |             SingleByteCharSetProber(Ibm855Model),
50 |             SingleByteCharSetProber(Latin7GreekModel),
51 |             SingleByteCharSetProber(Win1253GreekModel),
52 |             SingleByteCharSetProber(Latin5BulgarianModel),
53 |             SingleByteCharSetProber(Win1251BulgarianModel),
54 |             SingleByteCharSetProber(Latin2HungarianModel),
55 |             SingleByteCharSetProber(Win1250HungarianModel),
56 |             SingleByteCharSetProber(TIS620ThaiModel),
57 |             ]
58 |         hebrewProber = HebrewProber()
59 |         logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber)
60 |         visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber)
61 |         hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber)
62 |         self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber])
63 | 
64 |         self.reset()
65 | 


--------------------------------------------------------------------------------
/html5lib/chardet/sjisprober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | from mbcharsetprober import MultiByteCharSetProber
29 | from codingstatemachine import CodingStateMachine
30 | from chardistribution import SJISDistributionAnalysis
31 | from jpcntx import SJISContextAnalysis
32 | from mbcssm import SJISSMModel
33 | import constants, sys
34 | from constants import eStart, eError, eItsMe
35 | 
class SJISProber(MultiByteCharSetProber):
    """Prober for SHIFT_JIS using a state machine plus two analyzers."""

    def __init__(self):
        MultiByteCharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(SJISSMModel)
        self._mDistributionAnalyzer = SJISDistributionAnalysis()
        self._mContextAnalyzer = SJISContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the base prober state and the context analyzer."""
        MultiByteCharSetProber.reset(self)
        self._mContextAnalyzer.reset()

    def get_charset_name(self):
        return "SHIFT_JIS"

    def feed(self, aBuf):
        """Run aBuf through the state machine and return the prober state."""
        aLen = len(aBuf)
        for i, byte in enumerate(aBuf):
            codingState = self._mCodingSM.next_state(byte)
            if codingState == eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
                self._mState = constants.eNotMe
                break
            if codingState == eItsMe:
                self._mState = constants.eFoundIt
                break
            if codingState != eStart:
                continue

            # A complete character ended at position i; feed the analyzers.
            charLen = self._mCodingSM.get_current_charlen()
            if i == 0:
                # The character may have started in the previous buffer, so
                # combine the remembered byte with the first byte of this one.
                self._mLastChar[1] = aBuf[0]
                self._mContextAnalyzer.feed(self._mLastChar[2 - charLen :], charLen)
                self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
            else:
                self._mContextAnalyzer.feed(aBuf[i + 1 - charLen : i + 3 - charLen], charLen)
                self._mDistributionAnalyzer.feed(aBuf[i - 1 : i + 1], charLen)

        # Remember the final byte for the next call.
        self._mLastChar[0] = aBuf[aLen - 1]

        if (self.get_state() == constants.eDetecting
                and self._mContextAnalyzer.got_enough_data()
                and self.get_confidence() > constants.SHORTCUT_THRESHOLD):
            self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the larger of the context and distribution confidences."""
        return max(self._mContextAnalyzer.get_confidence(),
                   self._mDistributionAnalyzer.get_confidence())
86 | 


--------------------------------------------------------------------------------
/html5lib/chardet/test.py:
--------------------------------------------------------------------------------
 1 | import sys, glob
 2 | sys.path.insert(0, '..')
 3 | from chardet.universaldetector import UniversalDetector
 4 | 
 5 | count = 0
 6 | u = UniversalDetector()
 7 | for f in glob.glob(sys.argv[1]):
 8 |     print f.ljust(60),
 9 |     u.reset()
10 |     for line in file(f, 'rb'):
11 |         u.feed(line)
12 |         if u.done: break
13 |     u.close()
14 |     result = u.result
15 |     if result['encoding']:
16 |         print result['encoding'], 'with confidence', result['confidence']
17 |     else:
18 |         print '******** no result'
19 |     count += 1
20 | print count, 'tests'
21 | 


--------------------------------------------------------------------------------
/html5lib/chardet/universaldetector.py:
--------------------------------------------------------------------------------
  1 | ######################## BEGIN LICENSE BLOCK ########################
  2 | # The Original Code is Mozilla Universal charset detector code.
  3 | #
  4 | # The Initial Developer of the Original Code is
  5 | # Netscape Communications Corporation.
  6 | # Portions created by the Initial Developer are Copyright (C) 2001
  7 | # the Initial Developer. All Rights Reserved.
  8 | #
  9 | # Contributor(s):
 10 | #   Mark Pilgrim - port to Python
 11 | #   Shy Shalom - original C code
 12 | #
 13 | # This library is free software; you can redistribute it and/or
 14 | # modify it under the terms of the GNU Lesser General Public
 15 | # License as published by the Free Software Foundation; either
 16 | # version 2.1 of the License, or (at your option) any later version.
 17 | # 
 18 | # This library is distributed in the hope that it will be useful,
 19 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 20 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 21 | # Lesser General Public License for more details.
 22 | # 
 23 | # You should have received a copy of the GNU Lesser General Public
 24 | # License along with this library; if not, write to the Free Software
 25 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 26 | # 02110-1301  USA
 27 | ######################### END LICENSE BLOCK #########################
 28 | 
 29 | import constants, sys
 30 | from latin1prober import Latin1Prober # windows-1252
 31 | from mbcsgroupprober import MBCSGroupProber # multi-byte character sets
 32 | from sbcsgroupprober import SBCSGroupProber # single-byte character sets
 33 | from escprober import EscCharSetProber # ISO-2122, etc.
 34 | import re
 35 | 
# Confidence that the best prober must exceed for close() to report its guess.
MINIMUM_THRESHOLD = 0.20
# Values stored in UniversalDetector._mInputState:
ePureAscii = 0  # neither a high-bit byte nor an escape pattern matched so far
eEscAscii = 1  # the escape pattern (ESC or "~{") matched
eHighbyte = 2  # a byte in the range 0x80-0xFF matched
 40 | 
 41 | class UniversalDetector:
 42 |     def __init__(self):
 43 |         self._highBitDetector = re.compile(r'[\x80-\xFF]')
 44 |         self._escDetector = re.compile(r'(\033|~{)')
 45 |         self._mEscCharSetProber = None
 46 |         self._mCharSetProbers = []
 47 |         self.reset()
 48 | 
 49 |     def reset(self):
 50 |         self.result = {'encoding': None, 'confidence': 0.0}
 51 |         self.done = constants.False
 52 |         self._mStart = constants.True
 53 |         self._mGotData = constants.False
 54 |         self._mInputState = ePureAscii
 55 |         self._mLastChar = ''
 56 |         if self._mEscCharSetProber:
 57 |             self._mEscCharSetProber.reset()
 58 |         for prober in self._mCharSetProbers:
 59 |             prober.reset()
 60 | 
 61 |     def feed(self, aBuf):
 62 |         if self.done: return
 63 | 
 64 |         aLen = len(aBuf)
 65 |         if not aLen: return
 66 |         
 67 |         if not self._mGotData:
 68 |             # If the data starts with BOM, we know it is UTF
 69 |             if aBuf[:3] == '\xEF\xBB\xBF':
 70 |                 # EF BB BF  UTF-8 with BOM
 71 |                 self.result = {'encoding': "UTF-8", 'confidence': 1.0}
 72 |             elif aBuf[:4] == '\xFF\xFE\x00\x00':
 73 |                 # FF FE 00 00  UTF-32, little-endian BOM
 74 |                 self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
 75 |             elif aBuf[:4] == '\x00\x00\xFE\xFF': 
 76 |                 # 00 00 FE FF  UTF-32, big-endian BOM
 77 |                 self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
 78 |             elif aBuf[:4] == '\xFE\xFF\x00\x00':
 79 |                 # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
 80 |                 self.result = {'encoding': "X-ISO-10646-UCS-4-3412", 'confidence': 1.0}
 81 |             elif aBuf[:4] == '\x00\x00\xFF\xFE':
 82 |                 # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
 83 |                 self.result = {'encoding': "X-ISO-10646-UCS-4-2143", 'confidence': 1.0}
 84 |             elif aBuf[:2] == '\xFF\xFE':
 85 |                 # FF FE  UTF-16, little endian BOM
 86 |                 self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
 87 |             elif aBuf[:2] == '\xFE\xFF':
 88 |                 # FE FF  UTF-16, big endian BOM
 89 |                 self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
 90 | 
 91 |         self._mGotData = constants.True
 92 |         if self.result['encoding'] and (self.result['confidence'] > 0.0):
 93 |             self.done = constants.True
 94 |             return
 95 | 
 96 |         if self._mInputState == ePureAscii:
 97 |             if self._highBitDetector.search(aBuf):
 98 |                 self._mInputState = eHighbyte
 99 |             elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf):
100 |                 self._mInputState = eEscAscii
101 | 
102 |         self._mLastChar = aBuf[-1]
103 | 
104 |         if self._mInputState == eEscAscii:
105 |             if not self._mEscCharSetProber:
106 |                 self._mEscCharSetProber = EscCharSetProber()
107 |             if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt:
108 |                 self.result = {'encoding': self._mEscCharSetProber.get_charset_name(),
109 |                                'confidence': self._mEscCharSetProber.get_confidence()}
110 |                 self.done = constants.True
111 |         elif self._mInputState == eHighbyte:
112 |             if not self._mCharSetProbers:
113 |                 self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()]
114 |             for prober in self._mCharSetProbers:
115 |                 if prober.feed(aBuf) == constants.eFoundIt:
116 |                     self.result = {'encoding': prober.get_charset_name(),
117 |                                    'confidence': prober.get_confidence()}
118 |                     self.done = constants.True
119 |                     break
120 | 
121 |     def close(self):
122 |         if self.done: return
123 |         if not self._mGotData:
124 |             if constants._debug:
125 |                 sys.stderr.write('no data received!\n')
126 |             return
127 |         self.done = constants.True
128 |         
129 |         if self._mInputState == ePureAscii:
130 |             self.result = {'encoding': 'ascii', 'confidence': 1.0}
131 |             return self.result
132 | 
133 |         if self._mInputState == eHighbyte:
134 |             proberConfidence = None
135 |             maxProberConfidence = 0.0
136 |             maxProber = None
137 |             for prober in self._mCharSetProbers:
138 |                 if not prober: continue
139 |                 proberConfidence = prober.get_confidence()
140 |                 if proberConfidence > maxProberConfidence:
141 |                     maxProberConfidence = proberConfidence
142 |                     maxProber = prober
143 |             if maxProber and (maxProberConfidence > MINIMUM_THRESHOLD):
144 |                 self.result = {'encoding': maxProber.get_charset_name(),
145 |                                'confidence': maxProber.get_confidence()}
146 |                 return self.result
147 | 
148 |         if constants._debug:
149 |             sys.stderr.write('no probers hit minimum threshhold\n')
150 |             for prober in self._mCharSetProbers[0].mProbers:
151 |                 if not prober: continue
152 |                 sys.stderr.write('%s confidence = %s\n' % \
153 |                                  (prober.get_charset_name(), \
154 |                                   prober.get_confidence()))
155 | 


--------------------------------------------------------------------------------
/html5lib/chardet/utf8prober.py:
--------------------------------------------------------------------------------
 1 | ######################## BEGIN LICENSE BLOCK ########################
 2 | # The Original Code is mozilla.org code.
 3 | #
 4 | # The Initial Developer of the Original Code is
 5 | # Netscape Communications Corporation.
 6 | # Portions created by the Initial Developer are Copyright (C) 1998
 7 | # the Initial Developer. All Rights Reserved.
 8 | #
 9 | # Contributor(s):
10 | #   Mark Pilgrim - port to Python
11 | #
12 | # This library is free software; you can redistribute it and/or
13 | # modify it under the terms of the GNU Lesser General Public
14 | # License as published by the Free Software Foundation; either
15 | # version 2.1 of the License, or (at your option) any later version.
16 | # 
17 | # This library is distributed in the hope that it will be useful,
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 | # Lesser General Public License for more details.
21 | # 
22 | # You should have received a copy of the GNU Lesser General Public
23 | # License along with this library; if not, write to the Free Software
24 | # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 | # 02110-1301  USA
26 | ######################### END LICENSE BLOCK #########################
27 | 
28 | import constants, sys
29 | from constants import eStart, eError, eItsMe
30 | from charsetprober import CharSetProber
31 | from codingstatemachine import CodingStateMachine
32 | from mbcssm import UTF8SMModel
33 | 
# Factor applied once per multi-byte character in UTF8Prober.get_confidence().
ONE_CHAR_PROB = 0.5
35 | 
class UTF8Prober(CharSetProber):
    """Prober that checks input with the UTF-8 coding state machine."""

    def __init__(self):
        CharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(UTF8SMModel)
        self.reset()

    def reset(self):
        """Reset the base prober, the state machine and the character count."""
        CharSetProber.reset(self)
        self._mCodingSM.reset()
        # Number of completed characters that were at least two bytes long.
        self._mNumOfMBChar = 0

    def get_charset_name(self):
        return "utf-8"

    def feed(self, aBuf):
        """Run aBuf through the state machine and return the prober state."""
        for c in aBuf:
            codingState = self._mCodingSM.next_state(c)
            if codingState == eError:
                self._mState = constants.eNotMe
                break
            if codingState == eItsMe:
                self._mState = constants.eFoundIt
                break
            if codingState == eStart and self._mCodingSM.get_current_charlen() >= 2:
                self._mNumOfMBChar += 1

        # Stop early once the confidence passes the shortcut threshold.
        if (self.get_state() == constants.eDetecting
                and self.get_confidence() > constants.SHORTCUT_THRESHOLD):
            self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return a confidence that grows with the multi-byte character count."""
        unlike = 0.99
        if self._mNumOfMBChar >= 6:
            return unlike
        for _ in range(self._mNumOfMBChar):
            unlike *= ONE_CHAR_PROB
        return 1.0 - unlike
77 | 


--------------------------------------------------------------------------------
/html5lib/filters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/html5lib/filters/__init__.py


--------------------------------------------------------------------------------
/html5lib/filters/_base.py:
--------------------------------------------------------------------------------
 1 | 
 2 | class Filter(object):
 3 |     def __init__(self, source):
 4 |         self.source = source
 5 | 
 6 |     def __iter__(self):
 7 |         return iter(self.source)
 8 | 
 9 |     def __getattr__(self, name):
10 |         return getattr(self.source, name)
11 | 


--------------------------------------------------------------------------------
/html5lib/filters/formfiller.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # The goal is to finally have a form filler where you pass data for
  3 | # each form, using the algorithm for "Seeding a form with initial values"
  4 | # See http://www.whatwg.org/specs/web-forms/current-work/#seeding
  5 | #
  6 | 
  7 | import _base
  8 | 
  9 | from html5lib.constants import spaceCharacters
 10 | spaceCharacters = u"".join(spaceCharacters)
 11 | 
 12 | class SimpleFilter(_base.Filter):
 13 |     def __init__(self, source, fieldStorage):
 14 |         _base.Filter.__init__(self, source)
 15 |         self.fieldStorage = fieldStorage
 16 | 
 17 |     def __iter__(self):
 18 |         field_indices = {}
 19 |         state = None
 20 |         field_name = None
 21 |         for token in _base.Filter.__iter__(self):
 22 |             type = token["type"]
 23 |             if type in ("StartTag", "EmptyTag"):
 24 |                 name = token["name"].lower()
 25 |                 if name == "input":
 26 |                     field_name = None
 27 |                     field_type = None
 28 |                     input_value_index = -1
 29 |                     input_checked_index = -1
 30 |                     for i,(n,v) in enumerate(token["data"]):
 31 |                         n = n.lower()
 32 |                         if n == u"name":
 33 |                             field_name = v.strip(spaceCharacters)
 34 |                         elif n == u"type":
 35 |                             field_type = v.strip(spaceCharacters)
 36 |                         elif n == u"checked":
 37 |                             input_checked_index = i
 38 |                         elif n == u"value":
 39 |                             input_value_index = i
 40 | 
 41 |                     value_list = self.fieldStorage.getlist(field_name)
 42 |                     field_index = field_indices.setdefault(field_name, 0)
 43 |                     if field_index < len(value_list):
 44 |                         value = value_list[field_index]
 45 |                     else:
 46 |                         value = ""
 47 | 
 48 |                     if field_type in (u"checkbox", u"radio"):
 49 |                         if value_list:
 50 |                             if token["data"][input_value_index][1] == value:
 51 |                                 if input_checked_index < 0:
 52 |                                     token["data"].append((u"checked", u""))
 53 |                                 field_indices[field_name] = field_index + 1
 54 |                             elif input_checked_index >= 0:
 55 |                                 del token["data"][input_checked_index]
 56 | 
 57 |                     elif field_type not in (u"button", u"submit", u"reset"):
 58 |                         if input_value_index >= 0:
 59 |                             token["data"][input_value_index] = (u"value", value)
 60 |                         else:
 61 |                             token["data"].append((u"value", value))
 62 |                         field_indices[field_name] = field_index + 1
 63 | 
 64 |                     field_type = None
 65 |                     field_name = None
 66 | 
 67 |                 elif name == "textarea":
 68 |                     field_type = "textarea"
 69 |                     field_name = dict((token["data"])[::-1])["name"]
 70 | 
 71 |                 elif name == "select":
 72 |                     field_type = "select"
 73 |                     attributes = dict(token["data"][::-1])
 74 |                     field_name = attributes.get("name")
 75 |                     is_select_multiple = "multiple" in attributes
 76 |                     is_selected_option_found = False
 77 | 
 78 |                 elif field_type == "select" and field_name and name == "option":
 79 |                     option_selected_index = -1
 80 |                     option_value = None
 81 |                     for i,(n,v) in enumerate(token["data"]):
 82 |                         n = n.lower()
 83 |                         if n == "selected":
 84 |                             option_selected_index = i
 85 |                         elif n == "value":
 86 |                             option_value = v.strip(spaceCharacters)
 87 |                     if option_value is None:
 88 |                         raise NotImplementedError("<option>s without a value= attribute")
 89 |                     else:
 90 |                         value_list = self.fieldStorage.getlist(field_name)
 91 |                         if value_list:
 92 |                             field_index = field_indices.setdefault(field_name, 0)
 93 |                             if field_index < len(value_list):
 94 |                                 value = value_list[field_index]
 95 |                             else:
 96 |                                 value = ""
 97 |                             if (is_select_multiple or not is_selected_option_found) and option_value == value:
 98 |                                 if option_selected_index < 0:
 99 |                                     token["data"].append((u"selected", u""))
100 |                                 field_indices[field_name] = field_index + 1
101 |                                 is_selected_option_found = True
102 |                             elif option_selected_index >= 0:
103 |                                 del token["data"][option_selected_index]
104 | 
105 |             elif field_type is not None and field_name and type == "EndTag":
106 |                 name = token["name"].lower()
107 |                 if name == field_type:
108 |                     if name == "textarea":
109 |                         value_list = self.fieldStorage.getlist(field_name)
110 |                         if value_list:
111 |                             field_index = field_indices.setdefault(field_name, 0)
112 |                             if field_index < len(value_list):
113 |                                 value = value_list[field_index]
114 |                             else:
115 |                                 value = ""
116 |                             yield {"type": "Characters", "data": value}
117 |                             field_indices[field_name] = field_index + 1
118 | 
119 |                     field_name = None
120 | 
121 |                 elif name == "option" and field_type == "select":
122 |                     pass # TODO: part of "option without value= attribute" processing
123 | 
124 |             elif field_type == "textarea":
125 |                 continue # ignore token
126 | 
127 |             yield token
128 | 


--------------------------------------------------------------------------------
/html5lib/filters/fullurl.py:
--------------------------------------------------------------------------------
 1 | import _base
 2 | from urlparse import urljoin
 3 | 
# Attribute names whose values are treated as URIs; Filter.__iter__ passes the
# value of any attribute named here through urljoin() with the base URI.
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
     'xlink:href', 'xml:base']
 6 | 
 7 | class Filter(_base.Filter):
 8 |     def __init__(self, source, baseURI):
 9 |         _base.Filter.__init__(self, source)
10 |         self.baseURI = baseURI
11 | 
12 |     def __iter__(self):
13 |         for token in _base.Filter.__iter__(self):
14 |             if token.has_key("data"):
15 |                 for i, pair in enumerate(token["data"]):
16 |                     if pair[0] in attr_val_is_uri:
17 |                         token["data"][i] = (pair[0], urljoin(self.baseURI, pair[1]))
18 |             yield token
19 | 


--------------------------------------------------------------------------------
/html5lib/filters/inject_meta_charset.py:
--------------------------------------------------------------------------------
 1 | import _base
 2 | 
class Filter(_base.Filter):
    """Token filter that makes the document's <head> declare ``encoding``.

    An existing <meta charset> or <meta http-equiv=content-type> is rewritten
    to name the encoding; if none is seen before the head closes, a new
    <meta charset> token is emitted inside the head.
    """
    def __init__(self, source, encoding):
        # source: token iterable handed on to _base.Filter.
        # encoding: charset name written into the meta element. When it is
        # None, __iter__ starts with meta_found already true, so no new meta
        # element is inserted.
        _base.Filter.__init__(self, source)
        self.encoding = encoding

    def __iter__(self):
        # state is one of "pre_head", "in_head", "post_head"; while it is
        # "in_head" tokens are buffered in `pending` instead of being yielded.
        state = "pre_head"
        meta_found = (self.encoding is None)
        pending = []

        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag":
                if token["name"].lower() == "head":
                    # The <head> start tag itself becomes the first pending
                    # token (see the buffering step at the end of the loop).
                    state = "in_head"

            elif type == "EmptyTag":
                if token["name"].lower() == "meta":
                   # replace charset with actual encoding
                   has_http_equiv_content_type = False
                   content_index = -1
                   for i,(name,value) in enumerate(token["data"]):
                       if name.lower() == 'charset':
                          token["data"][i] = (u'charset', self.encoding)
                          meta_found = True
                          break
                       elif name == 'http-equiv' and value.lower() == 'content-type':
                           has_http_equiv_content_type = True
                       elif name == 'content':
                           content_index = i
                   else:
                       # for/else: reached only when no charset attribute
                       # triggered the break above. Rewrite the content
                       # attribute of an http-equiv=content-type meta instead.
                       if has_http_equiv_content_type and content_index >= 0:
                           token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
                           meta_found = True

                elif token["name"].lower() == "head" and not meta_found:
                    # insert meta into empty head
                    yield {"type": "StartTag", "name": "head",
                           "data": token["data"]}
                    yield {"type": "EmptyTag", "name": "meta",
                           "data": [["charset", self.encoding]]}
                    yield {"type": "EndTag", "name": "head"}
                    meta_found = True
                    # The EmptyTag head token has been replaced by the three
                    # tokens above, so skip the normal yield for it.
                    continue

            elif type == "EndTag":
                if token["name"].lower() == "head" and pending:
                    # insert meta into head (if necessary) and flush pending queue
                    # pending[0] is the buffered <head> start tag, so the new
                    # meta lands directly after it.
                    yield pending.pop(0)
                    if not meta_found:
                        yield {"type": "EmptyTag", "name": "meta",
                               "data": [["charset", self.encoding]]}
                    while pending:
                        yield pending.pop(0)
                    meta_found = True
                    # Leaving "in_head" lets the </head> token itself fall
                    # through to the plain yield below.
                    state = "post_head"

            if state == "in_head":
                pending.append(token)
            else:
                yield token
64 | 


--------------------------------------------------------------------------------
/html5lib/filters/lint.py:
--------------------------------------------------------------------------------
 1 | from gettext import gettext
 2 | _ = gettext
 3 | 
 4 | import _base
 5 | from html5lib.constants import cdataElements, rcdataElements, voidElements
 6 | 
 7 | from html5lib.constants import spaceCharacters
 8 | spaceCharacters = u"".join(spaceCharacters)
 9 | 
class LintError(Exception):
    """Exception signalling that a token failed a lint check."""
11 | 
class Filter(_base.Filter):
    """Token filter that validates the stream and raises LintError on the
    first malformed token; well-formed tokens are passed through unchanged.

    Checks performed: tag/attribute names and values are unicode and
    non-empty where required, void elements are reported as EmptyTag (and
    only those), end tags match the most recently opened start tag, and
    start tags, comments and doctypes only appear while the content model
    flag is PCDATA.
    """

    def __iter__(self):
        """Yield each source token after checking it.

        Raises LintError describing the first violation found.
        """
        open_elements = []
        contentModelFlag = "PCDATA"
        for token in _base.Filter.__iter__(self):
            token_type = token["type"]
            if token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if contentModelFlag != "PCDATA":
                    raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
                if not isinstance(name, unicode):
                    raise LintError(_(u"Tag name is not a string: %r") % name)
                if not name:
                    raise LintError(_(u"Empty tag name"))
                if token_type == "StartTag" and name in voidElements:
                    raise LintError(_(u"Void element reported as StartTag token: %s") % name)
                elif token_type == "EmptyTag" and name not in voidElements:
                    raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
                if token_type == "StartTag":
                    open_elements.append(name)
                # Distinct loop variables: reusing `name` here would make the
                # content-model checks below test the last attribute name
                # instead of the tag name.
                for attr_name, attr_value in token["data"]:
                    if not isinstance(attr_name, unicode):
                        raise LintError(_("Attribute name is not a string: %r") % attr_name)
                    if not attr_name:
                        raise LintError(_(u"Empty attribute name"))
                    if not isinstance(attr_value, unicode):
                        raise LintError(_("Attribute value is not a string: %r") % attr_value)
                if name in cdataElements:
                    contentModelFlag = "CDATA"
                elif name in rcdataElements:
                    contentModelFlag = "RCDATA"
                elif name == "plaintext":
                    contentModelFlag = "PLAINTEXT"

            elif token_type == "EndTag":
                name = token["name"]
                if not isinstance(name, unicode):
                    raise LintError(_(u"Tag name is not a string: %r") % name)
                if not name:
                    raise LintError(_(u"Empty tag name"))
                if name in voidElements:
                    raise LintError(_(u"Void element reported as EndTag token: %s") % name)
                if not open_elements:
                    # Report a stray end tag as a lint failure rather than
                    # letting list.pop() raise IndexError.
                    raise LintError(_(u"EndTag (%s) without a matching StartTag") % name)
                start_name = open_elements.pop()
                if start_name != name:
                    raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
                contentModelFlag = "PCDATA"

            elif token_type == "Comment":
                if contentModelFlag != "PCDATA":
                    raise LintError(_("Comment not in PCDATA content model flag"))

            elif token_type in ("Characters", "SpaceCharacters"):
                data = token["data"]
                if not isinstance(data, unicode):
                    raise LintError(_(u"%s token data is not a string: %r") % (token_type, data))
                if not data:
                    raise LintError(_(u"%s token with empty data") % token_type)
                if token_type == "SpaceCharacters":
                    data = data.strip(spaceCharacters)
                    if data:
                        # The message needs a placeholder; without one the
                        # % operator raises TypeError instead of LintError.
                        raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: %r") % data)

            elif token_type == "Doctype":
                name = token["name"]
                if contentModelFlag != "PCDATA":
                    raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
                if not isinstance(name, unicode):
                    raise LintError(_(u"Tag name is not a string: %r") % name)
                # XXX: what to do with token["data"] ?

            elif token_type in ("ParseError", "SerializeError"):
                pass

            else:
                raise LintError(_(u"Unknown token type: %s") % token_type)

            yield token
89 | 


--------------------------------------------------------------------------------
/html5lib/filters/sanitizer.py:
--------------------------------------------------------------------------------
1 | import _base
2 | from html5lib.sanitizer import HTMLSanitizerMixin
3 | 
class Filter(_base.Filter, HTMLSanitizerMixin):
    """Token filter that runs every token through ``sanitize_token``."""

    def __iter__(self):
        """Yield the sanitized form of each source token, dropping any token
        for which ``sanitize_token`` returns a false value."""
        for raw_token in _base.Filter.__iter__(self):
            cleaned = self.sanitize_token(raw_token)
            if not cleaned:
                continue
            yield cleaned
9 | 


--------------------------------------------------------------------------------
/html5lib/filters/whitespace.py:
--------------------------------------------------------------------------------
 1 | try:
 2 |     frozenset
 3 | except NameError:
 4 |     # Import from the sets module for python 2.3
 5 |     from sets import ImmutableSet as frozenset
 6 | 
 7 | import re
 8 | 
 9 | import _base
10 | from html5lib.constants import rcdataElements, spaceCharacters
11 | spaceCharacters = u"".join(spaceCharacters)
12 | 
13 | SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
14 | 
class Filter(_base.Filter):
    """Token filter that collapses runs of whitespace outside elements whose
    whitespace must be preserved."""

    # Elements inside which text tokens are left untouched.
    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        """Yield the source tokens, rewriting text tokens in place."""
        # Nesting depth counted from the outermost space-preserving element;
        # zero means whitespace may be collapsed.
        depth = 0
        for token in _base.Filter.__iter__(self):
            kind = token["type"]
            if kind == "StartTag" and (
                    depth or token["name"] in self.spacePreserveElements):
                depth += 1
            elif kind == "EndTag" and depth:
                depth -= 1
            elif not depth:
                if kind == "SpaceCharacters" and token["data"]:
                    # Only non-empty tokens are replaced, so no space is
                    # introduced where there was none.
                    token["data"] = u" "
                elif kind == "Characters":
                    token["data"] = collapse_spaces(token["data"])

            yield token
38 | 
def collapse_spaces(text):
    """Return *text* with each match of SPACES_REGEX replaced by one space."""
    single_space = ' '
    return SPACES_REGEX.sub(single_space, text)
41 | 
42 | 


--------------------------------------------------------------------------------
/html5lib/serializer/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from html5lib import treewalkers
 3 | 
 4 | from htmlserializer import HTMLSerializer
 5 | from xhtmlserializer import XHTMLSerializer
 6 | 
 7 | def serialize(input, tree="simpletree", format="html", encoding=None,
 8 |               **serializer_opts):
 9 |     # XXX: Should we cache this?
10 |     walker = treewalkers.getTreeWalker(tree) 
11 |     if format == "html":
12 |         s = HTMLSerializer(**serializer_opts)
13 |     elif format == "xhtml":
14 |         s = XHTMLSerializer(**serializer_opts)
15 |     else:
16 |         raise ValueError, "type must be either html or xhtml"
17 |     return s.render(walker(input), encoding)
18 | 


--------------------------------------------------------------------------------
/html5lib/serializer/htmlserializer.py:
--------------------------------------------------------------------------------
  1 | try:
  2 |     frozenset
  3 | except NameError:
  4 |     # Import from the sets module for python 2.3
  5 |     from sets import ImmutableSet as frozenset
  6 | 
  7 | import gettext
  8 | _ = gettext.gettext
  9 | 
 10 | from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
 11 | from html5lib.constants import rcdataElements
 12 | 
 13 | from xml.sax.saxutils import escape
 14 | 
 15 | spaceCharacters = u"".join(spaceCharacters)
 16 | 
# Choose the error handler name used by encode(): fall back to "strict" when
# the codecs module cannot supply register_error / xmlcharrefreplace_errors,
# otherwise register a custom "htmlentityreplace" handler.
try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    from html5lib.constants import entities

    # Reverse lookup: entity value -> entity name, built from `entities`.
    encode_entity_map = {}
    for k, v in entities.items():
        if v != "&" and encode_entity_map.get(v) != k.lower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        """Codec error handler: replace each unencodable character with a
        named entity from encode_entity_map when one exists, otherwise with
        its numeric character reference.

        Returns the (replacement, resume position) pair for encode/translate
        errors; any other exception is delegated to xmlcharrefreplace_errors.
        """
        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            res = []
            for c in exc.object[exc.start:exc.end]:
                e = encode_entity_map.get(c)
                if e:
                    res.append("&")
                    res.append(e)
                    # Append the terminating semicolon only when the stored
                    # entity name does not already end with one.
                    if not e.endswith(";"):
                        res.append(";")
                else:
                    res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
            return (u"".join(res), exc.end)
        else:
            return xmlcharrefreplace_errors(exc)

    register_error(unicode_encode_errors, htmlentityreplace_errors)

    # register_error is not needed after this point.
    del register_error
 51 | 
 52 | def encode(text, encoding):
 53 |     return text.encode(encoding, unicode_encode_errors)
 54 | 
 55 | class HTMLSerializer(object):
 56 | 
 57 |     quote_attr_values = False
 58 |     quote_char = '"'
 59 |     use_best_quote_char = True
 60 |     minimize_boolean_attributes = True
 61 | 
 62 |     use_trailing_solidus = False
 63 |     space_before_trailing_solidus = True
 64 |     escape_lt_in_attrs = False
 65 |     escape_rcdata = False
 66 | 
 67 |     inject_meta_charset = True
 68 |     strip_whitespace = False
 69 |     sanitize = False
 70 |     omit_optional_tags = True
 71 | 
 72 |     options = ("quote_attr_values", "quote_char", "use_best_quote_char",
 73 |           "minimize_boolean_attributes", "use_trailing_solidus",
 74 |           "space_before_trailing_solidus", "omit_optional_tags",
 75 |           "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
 76 |           "escape_rcdata", 'use_trailing_solidus', "sanitize")
 77 | 
 78 |     def __init__(self, **kwargs):
 79 |         if kwargs.has_key('quote_char'):
 80 |             self.use_best_quote_char = False
 81 |         for attr in self.options:
 82 |             setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
 83 |         self.errors = []
 84 |         self.strict = False
 85 | 
 86 |     def serialize(self, treewalker, encoding=None):
 87 |         in_cdata = False
 88 |         self.errors = []
 89 |         if encoding and self.inject_meta_charset:
 90 |             from html5lib.filters.inject_meta_charset import Filter
 91 |             treewalker = Filter(treewalker, encoding)
 92 |         # XXX: WhitespaceFilter should be used before OptionalTagFilter
 93 |         # for maximum efficiently of this latter filter
 94 |         if self.strip_whitespace:
 95 |             from html5lib.filters.whitespace import Filter
 96 |             treewalker = Filter(treewalker)
 97 |         if self.sanitize:
 98 |             from html5lib.filters.sanitizer import Filter
 99 |             treewalker = Filter(treewalker)
100 |         if self.omit_optional_tags:
101 |             from html5lib.filters.optionaltags import Filter
102 |             treewalker = Filter(treewalker)
103 |         for token in treewalker:
104 |             type = token["type"]
105 |             if type == "Doctype":
106 |                 doctype = u"<!DOCTYPE %s" % token["name"]
107 |                 
108 |                 if token["publicId"]:
109 |                     doctype += u' PUBLIC "%s"' % token["publicId"]
110 |                 elif token["systemId"]:
111 |                     doctype += u" SYSTEM"
112 |                 if token["systemId"]:                
113 |                     if token["systemId"].find(u'"') >= 0:
114 |                         if token["systemId"].find(u"'") >= 0:
115 |                             self.serializeError(_("System identifer contains both single and double quote characters"))
116 |                         quote_char = u"'"
117 |                     else:
118 |                         quote_char = u'"'
119 |                     doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
120 |                 
121 |                 doctype += u">"
122 |                 
123 |                 if encoding:
124 |                     yield doctype.encode(encoding)
125 |                 else:
126 |                     yield doctype
127 | 
128 |             elif type in ("Characters", "SpaceCharacters"):
129 |                 if type == "SpaceCharacters" or in_cdata:
130 |                     if in_cdata and token["data"].find("</") >= 0:
131 |                         self.serializeError(_("Unexpected </ in CDATA"))
132 |                     if encoding:
133 |                         yield token["data"].encode(encoding, "strict")
134 |                     else:
135 |                         yield token["data"]
136 |                 elif encoding:
137 |                     yield encode(escape(token["data"]), encoding)
138 |                 else:
139 |                     yield escape(token["data"])
140 | 
141 |             elif type in ("StartTag", "EmptyTag"):
142 |                 name = token["name"]
143 |                 if name in rcdataElements and not self.escape_rcdata:
144 |                     in_cdata = True
145 |                 elif in_cdata:
146 |                     self.serializeError(_("Unexpected child element of a CDATA element"))
147 |                 attrs = token["data"]
148 |                 if hasattr(attrs, "items"):
149 |                     attrs = attrs.items()
150 |                 attrs.sort()
151 |                 attributes = []
152 |                 for k,v in attrs:
153 |                     if encoding:
154 |                         k = k.encode(encoding, "strict")
155 |                     attributes.append(' ')
156 | 
157 |                     attributes.append(k)
158 |                     if not self.minimize_boolean_attributes or \
159 |                       (k not in booleanAttributes.get(name, tuple()) \
160 |                       and k not in booleanAttributes.get("", tuple())):
161 |                         attributes.append("=")
162 |                         if self.quote_attr_values or not v:
163 |                             quote_attr = True
164 |                         else:
165 |                             quote_attr = reduce(lambda x,y: x or (y in v),
166 |                                 spaceCharacters + ">\"'=", False)
167 |                         v = v.replace("&", "&amp;")
168 |                         if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
169 |                         if encoding:
170 |                             v = encode(v, encoding)
171 |                         if quote_attr:
172 |                             quote_char = self.quote_char
173 |                             if self.use_best_quote_char:
174 |                                 if "'" in v and '"' not in v:
175 |                                     quote_char = '"'
176 |                                 elif '"' in v and "'" not in v:
177 |                                     quote_char = "'"
178 |                             if quote_char == "'":
179 |                                 v = v.replace("'", "&#39;")
180 |                             else:
181 |                                 v = v.replace('"', "&quot;")
182 |                             attributes.append(quote_char)
183 |                             attributes.append(v)
184 |                             attributes.append(quote_char)
185 |                         else:
186 |                             attributes.append(v)
187 |                 if name in voidElements and self.use_trailing_solidus:
188 |                     if self.space_before_trailing_solidus:
189 |                         attributes.append(" /")
190 |                     else:
191 |                         attributes.append("/")
192 |                 if encoding:
193 |                     yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
194 |                 else:
195 |                     yield u"<%s%s>" % (name, u"".join(attributes))
196 | 
197 |             elif type == "EndTag":
198 |                 name = token["name"]
199 |                 if name in rcdataElements:
200 |                     in_cdata = False
201 |                 elif in_cdata:
202 |                     self.serializeError(_("Unexpected child element of a CDATA element"))
203 |                 end_tag = u"</%s>" % name
204 |                 if encoding:
205 |                     end_tag = end_tag.encode(encoding, "strict")
206 |                 yield end_tag
207 | 
208 |             elif type == "Comment":
209 |                 data = token["data"]
210 |                 if data.find("--") >= 0:
211 |                     self.serializeError(_("Comment contains --"))
212 |                 comment = u"<!--%s-->" % token["data"]
213 |                 if encoding:
214 |                     comment = comment.encode(encoding, unicode_encode_errors)
215 |                 yield comment
216 | 
217 |             else:
218 |                 self.serializeError(token["data"])
219 | 
220 |     def render(self, treewalker, encoding=None):
221 |         if encoding:
222 |             return "".join(list(self.serialize(treewalker, encoding)))
223 |         else:
224 |             return u"".join(list(self.serialize(treewalker)))
225 | 
226 |     def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
227 |         # XXX The idea is to make data mandatory.
228 |         self.errors.append(data)
229 |         if self.strict:
230 |             raise SerializeError
231 | 
class SerializeError(Exception):
    """Error in serialized tree"""
    pass
235 | 


--------------------------------------------------------------------------------
/html5lib/serializer/xhtmlserializer.py:
--------------------------------------------------------------------------------
 1 | from htmlserializer import HTMLSerializer
 2 | 
 3 | class XHTMLSerializer(HTMLSerializer):
 4 |     quote_attr_values = True
 5 |     minimize_boolean_attributes = False
 6 |     use_trailing_solidus = True
 7 |     escape_lt_in_attrs = True
 8 |     omit_optional_tags = False
 9 |     escape_rcdata = True
10 | 


--------------------------------------------------------------------------------
/html5lib/treebuilders/__init__.py:
--------------------------------------------------------------------------------
 1 | """A collection of modules for building different kinds of tree from
 2 | HTML documents.
 3 | 
 4 | To create a treebuilder for a new type of tree, you need to
 5 | implement several things:
 6 | 
 7 | 1) A set of classes for various types of elements: Document, Doctype,
 8 | Comment, Element. These must implement the interface of
 9 | treebuilders._base.Node (although comment nodes have a different
10 | signature for their constructor, see treebuilders.simpletree.Comment)
11 | Textual content may also be implemented as another node type, or not, as
12 | your tree implementation requires.
13 | 
14 | 2) A treebuilder object (called TreeBuilder by convention) that
15 | inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
16 | documentClass - the class to use for the bottommost node of a document
17 | elementClass - the class to use for HTML Elements
18 | commentClass - the class to use for comments
19 | doctypeClass - the class to use for doctypes
20 | It also has one required method:
21 | getDocument - Returns the root node of the complete document tree
22 | 
23 | 3) If you wish to run the unit tests, you must also create a
24 | testSerializer method on your treebuilder which accepts a node and
25 | returns a string containing Node and its children serialized according
26 | to the format used in the unittests
27 | 
28 | The supplied simpletree module provides a python-only implementation
29 | of a full treebuilder and is a useful reference for the semantics of
30 | the various methods.
31 | """
32 | 
# Maps a lower-cased tree type name to its TreeBuilder class. Only the
# "simpletree", "beautifulsoup" and "lxml" types are stored here.
treeBuilderCache = {}

def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are "simpletree", "dom", "etree", "lxml" and
               "beautifulsoup"

               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
                "dom" - A generic builder for DOM implementations, defaulting to
                        a xml.dom.minidom based implementation for the sake of
                        backwards compatibility (as releases up until 0.10 had a
                        builder called "dom" that was a minidom implementation).
                "etree" - A generic builder for tree implementations exposing an
                          elementtree-like interface (known to work with
                          ElementTree, cElementTree and lxml.etree).
                "lxml" - The builder from the etree_lxml module
                "beautifulsoup" - Beautiful soup (if installed)

    implementation - (Currently applies to the "etree" and "dom" tree types). A
                      module implementing the tree type e.g.
                      xml.etree.ElementTree or lxml.etree.

    Returns the TreeBuilder class, or None when treeType is not one of the
    supported names."""

    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
            import dom
            # XXX: Keep backwards compatibility by using minidom if no implementation is given
            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
            # XXX: NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "simpletree":
            import simpletree
            treeBuilderCache[treeType] = simpletree.TreeBuilder
        elif treeType == "beautifulsoup":
            import soup
            treeBuilderCache[treeType] = soup.TreeBuilder
        elif treeType == "lxml":
            import etree_lxml
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
            # Come up with a sane default
            if implementation is None:
                try:
                    import xml.etree.cElementTree as ET
                except ImportError:
                    try:
                        import xml.etree.ElementTree as ET
                    except ImportError:
                        try:
                            import cElementTree as ET
                        except ImportError:
                            import elementtree.ElementTree as ET
                implementation = ET
            import etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
    # Unknown tree types were never cached, so .get() yields None for them.
    return treeBuilderCache.get(treeType)
93 | 


--------------------------------------------------------------------------------
/html5lib/treebuilders/simpletree.py:
--------------------------------------------------------------------------------
  1 | import _base
  2 | from html5lib.constants import voidElements, namespaces, prefixes
  3 | from xml.sax.saxutils import escape
  4 | 
  5 | # Really crappy basic implementation of a DOM-core like thing
  6 | class Node(_base.Node):
  7 |     type = -1
  8 |     def __init__(self, name):
  9 |         self.name = name
 10 |         self.parent = None
 11 |         self.value = None
 12 |         self.childNodes = []
 13 |         self._flags = []
 14 | 
 15 |     def __iter__(self):
 16 |         for node in self.childNodes:
 17 |             yield node
 18 |             for item in node:
 19 |                 yield item
 20 | 
 21 |     def __unicode__(self):
 22 |         return self.name
 23 | 
 24 |     def toxml(self):
 25 |         raise NotImplementedError
 26 | 
 27 |     def printTree(self, indent=0):
 28 |         tree = '\n|%s%s' % (' '* indent, unicode(self))
 29 |         for child in self.childNodes:
 30 |             tree += child.printTree(indent + 2)
 31 |         return tree
 32 | 
 33 |     def appendChild(self, node):
 34 |         if (isinstance(node, TextNode) and self.childNodes and
 35 |           isinstance(self.childNodes[-1], TextNode)):
 36 |             self.childNodes[-1].value += node.value
 37 |         else:
 38 |             self.childNodes.append(node)
 39 |         node.parent = self
 40 | 
 41 |     def insertText(self, data, insertBefore=None):
 42 |         if insertBefore is None:
 43 |             self.appendChild(TextNode(data))
 44 |         else:
 45 |             self.insertBefore(TextNode(data), insertBefore)
 46 | 
 47 |     def insertBefore(self, node, refNode):
 48 |         index = self.childNodes.index(refNode)
 49 |         if (isinstance(node, TextNode) and index > 0 and
 50 |           isinstance(self.childNodes[index - 1], TextNode)):
 51 |             self.childNodes[index - 1].value += node.value
 52 |         else:
 53 |             self.childNodes.insert(index, node)
 54 |         node.parent = self
 55 | 
 56 |     def removeChild(self, node):
 57 |         try:
 58 |             self.childNodes.remove(node)
 59 |         except:
 60 |             # XXX
 61 |             raise
 62 |         node.parent = None
 63 | 
 64 |     def cloneNode(self):
 65 |         raise NotImplementedError
 66 | 
 67 |     def hasContent(self):
 68 |         """Return true if the node has children or text"""
 69 |         return bool(self.childNodes)
 70 | 
 71 |     def getNameTuple(self):
 72 |         if self.namespace == None:
 73 |             return namespaces["html"], self.name
 74 |         else:
 75 |             return self.namespace, self.name
 76 | 
 77 |     nameTuple = property(getNameTuple)
 78 | 
 79 | class Document(Node):
 80 |     type = 1
 81 |     def __init__(self):
 82 |         Node.__init__(self, None)
 83 | 
 84 |     def __unicode__(self):
 85 |         return "#document"
 86 | 
 87 |     def appendChild(self, child):
 88 |         Node.appendChild(self, child)
 89 | 
 90 |     def toxml(self, encoding="utf=8"):
 91 |         result = ""
 92 |         for child in self.childNodes:
 93 |             result += child.toxml()
 94 |         return result.encode(encoding)
 95 | 
 96 |     def hilite(self, encoding="utf-8"):
 97 |         result = "<pre>"
 98 |         for child in self.childNodes:
 99 |             result += child.hilite()
100 |         return result.encode(encoding) + "</pre>"
101 |     
102 |     def printTree(self):
103 |         tree = unicode(self)
104 |         for child in self.childNodes:
105 |             tree += child.printTree(2)
106 |         return tree
107 | 
108 |     def cloneNode(self):
109 |         return Document()
110 | 
class DocumentFragment(Document):
    """Document variant labelled ``#document-fragment``.

    Everything except the label and ``cloneNode`` is inherited from
    ``Document``.
    """
    type = 2
    def __unicode__(self):
        return "#document-fragment"

    def cloneNode(self):
        # Returns a new, empty fragment; children are not copied.
        return DocumentFragment()
118 | 
class DocumentType(Node):
    """Doctype node: a name plus optional public and system identifiers."""
    type = 3

    def __init__(self, name, publicId, systemId):
        Node.__init__(self, name)
        self.publicId, self.systemId = publicId, systemId

    def __unicode__(self):
        # Short form when neither identifier is present.
        if not (self.publicId or self.systemId):
            return u"<!DOCTYPE %s>" % self.name
        # Long form: a missing identifier is rendered as an empty string.
        parts = (self.name, self.publicId or "", self.systemId or "")
        return """<!DOCTYPE %s "%s" "%s">""" % parts

    # XML serialisation is the same text as the unicode representation.
    toxml = __unicode__

    def hilite(self):
        """Return highlighted markup showing only the doctype name."""
        template = '<code class="markup doctype">&lt;!DOCTYPE %s></code>'
        return template % self.name

    def cloneNode(self):
        """Return a new ``DocumentType`` with the same name and identifiers."""
        return DocumentType(self.name, self.publicId, self.systemId)
144 | 
class TextNode(Node):
    """Node holding a run of character data in ``value``."""
    type = 4
    def __init__(self, value):
        # Text nodes have no name, so None is passed to the base initialiser.
        Node.__init__(self, None)
        self.value = value

    def __unicode__(self):
        # Debug representation: the text wrapped in double quotes.
        return u"\"%s\"" % self.value

    def toxml(self):
        # Serialised form is the text passed through escape().
        return escape(self.value)
    
    # Highlighted output for text is identical to the XML output.
    hilite = toxml

    def cloneNode(self):
        # Returns a new node with the same text value.
        return TextNode(self.value)
161 | 
class Element(Node):
    """Element node with a tag name, optional namespace and attribute dict."""
    type = 5
    def __init__(self, name, namespace=None):
        Node.__init__(self, name)
        self.namespace = namespace
        self.attributes = {}

    def __unicode__(self):
        # Namespaced elements are shown with their prefix from `prefixes`.
        if self.namespace is None:
            return u"<%s>" % self.name
        else:
            return u"<%s %s>"%(prefixes[self.namespace], self.name)

    def toxml(self):
        """Serialise the element, its attributes and its children.

        Attribute values are escaped (including double quotes). An element
        without children is written in self-closing form.
        """
        result = '<' + self.name
        if self.attributes:
            for name,value in self.attributes.iteritems():
                result += u' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
        if self.childNodes:
            result += '>'
            for child in self.childNodes:
                result += child.toxml()
            result += u'</%s>' % self.name
        else:
            result += u'/>'
        return result
    
    def hilite(self):
        """Return syntax-highlighting markup for the element and its children.

        A childless element whose name is in ``voidElements`` gets no end
        tag; every other element does.
        """
        result = '&lt;<code class="markup element-name">%s</code>' % self.name
        if self.attributes:
            for name, value in self.attributes.iteritems():
                result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'&quot;'}))
        if self.childNodes:
            result += ">"
            for child in self.childNodes:
                result += child.hilite()
        elif self.name in voidElements:
            return result + ">"
        return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name

    def printTree(self, indent=0):
        """Render the element, its attributes and its children as a tree.

        ``indent`` defaults to 0, like ``Node.printTree``, so the method can
        be called without arguments (as ``TreeBuilder.testSerializer`` does).
        Attributes are listed one per line, two columns deeper than the
        element; tuple attribute names are joined with a space.
        """
        tree = '\n|%s%s' % (' '*indent, unicode(self))
        indent += 2
        if self.attributes:
            for name, value in self.attributes.iteritems():
                if isinstance(name, tuple):
                    name = "%s %s"%(name[0], name[1])
                tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
        for child in self.childNodes:
            tree += child.printTree(indent)
        return tree

    def cloneNode(self):
        """Return a childless copy with the same name, namespace and
        attributes."""
        newNode = Element(self.name)
        if hasattr(self, 'namespace'):
            newNode.namespace = self.namespace
        for attr, value in self.attributes.iteritems():
            newNode.attributes[attr] = value
        return newNode
221 | 
class CommentNode(Node):
    """Node holding the text of a comment in ``data``."""
    type = 6
    def __init__(self, data):
        # Comments have no name, so None is passed to the base initialiser.
        Node.__init__(self, None)
        self.data = data

    def __unicode__(self):
        # Debug representation pads the comment text with spaces.
        return "<!-- %s -->" % self.data
    
    def toxml(self):
        # Serialised form writes the data verbatim, without padding or escaping.
        return "<!--%s-->" % self.data

    def hilite(self):
        # Highlighted form escapes the comment text.
        return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)

    def cloneNode(self):
        # Returns a new comment node with the same data.
        return CommentNode(self.data)
239 | 
class TreeBuilder(_base.TreeBuilder):
    """Tree builder wired to the node classes defined in this module."""
    # Node classes the base tree builder is configured with.
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = CommentNode
    fragmentClass = DocumentFragment
    
    def testSerializer(self, node):
        # Delegates to the node's own printTree(), called without arguments.
        return node.printTree()
249 | 


--------------------------------------------------------------------------------
/html5lib/treebuilders/soup.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | 
  3 | warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
  4 | 
  5 | from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
  6 | 
  7 | import _base
  8 | from html5lib.constants import namespaces, DataLossWarning
  9 | 
 10 | class AttrList(object):
 11 |     def __init__(self, element):
 12 |         self.element = element
 13 |         self.attrs = dict(self.element.attrs)
 14 |     def __iter__(self):
 15 |         return self.attrs.items().__iter__()
 16 |     def __setitem__(self, name, value):
 17 |         "set attr", name, value
 18 |         self.element[name] = value
 19 |     def items(self):
 20 |         return self.attrs.items()
 21 |     def keys(self):
 22 |         return self.attrs.keys()
 23 |     def __getitem__(self, name):
 24 |         return self.attrs[name]
 25 |     def __contains__(self, name):
 26 |         return name in self.attrs.keys()
 27 | 
 28 | 
 29 | class Element(_base.Node):
 30 |     def __init__(self, element, soup, namespace):
 31 |         _base.Node.__init__(self, element.name)
 32 |         self.element = element
 33 |         self.soup = soup
 34 |         self.namespace = namespace
 35 | 
 36 |     def _nodeIndex(self, node, refNode):
 37 |         # Finds a node by identity rather than equality
 38 |         for index in range(len(self.element.contents)):
 39 |             if id(self.element.contents[index]) == id(refNode.element):
 40 |                 return index
 41 |         return None
 42 | 
 43 |     def appendChild(self, node):
 44 |         if (node.element.__class__ == NavigableString and self.element.contents
 45 |             and self.element.contents[-1].__class__ == NavigableString):
 46 |             # Concatenate new text onto old text node
 47 |             # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
 48 |             newStr = NavigableString(self.element.contents[-1]+node.element)
 49 | 
 50 |             # Remove the old text node
 51 |             # (Can't simply use .extract() by itself, because it fails if
 52 |             # an equal text node exists within the parent node)
 53 |             oldElement = self.element.contents[-1]
 54 |             del self.element.contents[-1]
 55 |             oldElement.parent = None
 56 |             oldElement.extract()
 57 | 
 58 |             self.element.insert(len(self.element.contents), newStr)
 59 |         else:
 60 |             self.element.insert(len(self.element.contents), node.element)
 61 |             node.parent = self
 62 | 
 63 |     def getAttributes(self):
 64 |         return AttrList(self.element)
 65 | 
 66 |     def setAttributes(self, attributes):
 67 |         if attributes:
 68 |             for name, value in attributes.items():
 69 |                 self.element[name] =  value
 70 | 
 71 |     attributes = property(getAttributes, setAttributes)
 72 |     
 73 |     def insertText(self, data, insertBefore=None):
 74 |         text = TextNode(NavigableString(data), self.soup)
 75 |         if insertBefore:
 76 |             self.insertBefore(text, insertBefore)
 77 |         else:
 78 |             self.appendChild(text)
 79 | 
 80 |     def insertBefore(self, node, refNode):
 81 |         index = self._nodeIndex(node, refNode)
 82 |         if (node.element.__class__ == NavigableString and self.element.contents
 83 |             and self.element.contents[index-1].__class__ == NavigableString):
 84 |             # (See comments in appendChild)
 85 |             newStr = NavigableString(self.element.contents[index-1]+node.element)
 86 |             oldNode = self.element.contents[index-1]
 87 |             del self.element.contents[index-1]
 88 |             oldNode.parent = None
 89 |             oldNode.extract()
 90 | 
 91 |             self.element.insert(index-1, newStr)
 92 |         else:
 93 |             self.element.insert(index, node.element)
 94 |             node.parent = self
 95 | 
 96 |     def removeChild(self, node):
 97 |         index = self._nodeIndex(node.parent, node)
 98 |         del node.parent.element.contents[index]
 99 |         node.element.parent = None
100 |         node.element.extract()
101 |         node.parent = None
102 | 
103 |     def reparentChildren(self, newParent):
104 |         while self.element.contents:
105 |             child = self.element.contents[0]
106 |             child.extract()
107 |             if isinstance(child, Tag):
108 |                 newParent.appendChild(Element(child, self.soup, namespaces["html"]))
109 |             else:
110 |                 newParent.appendChild(TextNode(child, self.soup))
111 | 
112 |     def cloneNode(self):
113 |         node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
114 |         for key,value in self.attributes:
115 |             node.attributes[key] = value
116 |         return node
117 | 
118 |     def hasContent(self):
119 |         return self.element.contents
120 | 
121 |     def getNameTuple(self):
122 |         if self.namespace == None:
123 |             return namespaces["html"], self.name
124 |         else:
125 |             return self.namespace, self.name
126 | 
127 |     nameTuple = property(getNameTuple)
128 | 
class TextNode(Element):
    """Wrapper for a non-tag soup object such as a string or comment."""
    def __init__(self, element, soup):
        # Bypasses Element.__init__: the node is given no name and no
        # namespace attribute is set here.
        _base.Node.__init__(self, None)
        self.element = element
        self.soup = soup
    
    def cloneNode(self):
        # Cloning is not supported for these nodes.
        raise NotImplementedError
137 | 
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that produces a BeautifulSoup tree."""

    def __init__(self, namespaceHTMLElements):
        # Namespaces cannot be stored in the soup, so warn up front.
        if namespaceHTMLElements:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def documentClass(self):
        """Create a fresh soup and return it wrapped as the document node."""
        soup = BeautifulSoup("")
        self.soup = soup
        return Element(soup, soup, None)

    def insertDoctype(self, token):
        """Insert a ``Declaration`` for the doctype token at the start of
        the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if publicId:
            text = "%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")
        elif systemId:
            text = "%s SYSTEM \"%s\""%(name, systemId)
        else:
            text = name
        self.soup.insert(0, Declaration(text))

    def elementClass(self, name, namespace):
        """Return a wrapped ``Tag`` named ``name``; warn if a namespace was
        requested."""
        if namespace is not None:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        tag = Tag(self.soup, name)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Create a fresh soup marked as a fragment and return it wrapped."""
        soup = BeautifulSoup("")
        self.soup = soup
        soup.name = "[document_fragment]"
        return Element(soup, soup, None)

    def appendChild(self, node):
        # Adds the node's soup object at the end of the top-level contents.
        position = len(self.soup.contents)
        self.soup.insert(position, node.element)

    def testSerializer(self, element):
        # Delegates to the module-level testSerializer function.
        return testSerializer(element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        # The base class returns the wrapper; hand back the raw soup object.
        fragment = _base.TreeBuilder.getFragment(self)
        return fragment.element
185 |     
def testSerializer(element):
    """Serialise a soup tree into an indented, line-per-node text format.

    Doctypes, the document or fragment root, comments, text and tags each
    get their own line shape; attributes are listed under their tag and
    children are indented two columns deeper than their parent.
    """
    import re
    doctypePattern = re.compile(r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?')
    lines = []

    def serializeElement(element, indent=0):
        margin = ' '*indent
        if isinstance(element, Declaration):
            m = doctypePattern.match(element.string)
            assert m is not None, "DOCTYPE did not match expected format"
            name = m.group('name')
            publicId = m.group('publicId')
            # The system id is in a different group for PUBLIC and SYSTEM.
            if publicId is None:
                systemId = m.group('systemId2')
            else:
                systemId = m.group('systemId1') or ""

            if publicId is None and systemId is None:
                lines.append("|%s<!DOCTYPE %s>"%(margin, name))
            else:
                lines.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
                             (margin, name, publicId or "", systemId or ""))

        elif isinstance(element, BeautifulSoup):
            if element.name == "[document_fragment]":
                lines.append("#document-fragment")
            else:
                lines.append("#document")

        elif isinstance(element, Comment):
            lines.append("|%s<!-- %s -->"%(margin, element.string))
        elif isinstance(element, unicode):
            lines.append("|%s\"%s\"" %(margin, element))
        else:
            lines.append("|%s<%s>"%(margin, element.name))
            if element.attrs:
                attrMargin = ' '*(indent+2)
                for name, value in element.attrs:
                    lines.append('|%s%s="%s"' % (attrMargin, name, value))

        # Recurse into anything that exposes child contents.
        if hasattr(element, "contents"):
            deeper = indent + 2
            for child in element.contents:
                serializeElement(child, deeper)

    serializeElement(element, 0)

    return "\n".join(lines)
229 | 


--------------------------------------------------------------------------------
/html5lib/treewalkers/__init__.py:
--------------------------------------------------------------------------------
 1 | """A collection of modules for iterating through different kinds of
 2 | tree, generating tokens identical to those produced by the tokenizer
 3 | module.
 4 | 
 5 | To create a tree walker for a new type of tree, you need to do
 6 | implement a tree walker object (called TreeWalker by convention) that
 7 | implements a 'serialize' method taking a tree as sole argument and
 8 | returning an iterator generating tokens.
 9 | """
10 | 
11 | treeWalkerCache = {}
12 | 
def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive).
               Supported values are:

               "simpletree" - a built-in DOM-ish tree type with support for
                              some more pythonic idioms.
               "dom" - The xml.dom.minidom DOM implementation
               "pulldom" - The xml.dom.pulldom event stream
               "etree" - A generic walker for tree implementations exposing
                         an elementtree-like interface (known to work with
                         ElementTree, cElementTree and lxml.etree).
               "lxml" - Optimized walker for lxml.etree
               "beautifulsoup" - Beautiful soup (if installed)
               "genshi" - a Genshi stream

    implementation - (Currently applies to the "etree" tree type only). A
                     module implementing the tree type e.g.
                     xml.etree.ElementTree or cElementTree.

    Returns None when treeType is not one of the names above."""

    treeType = treeType.lower()
    if treeType in treeWalkerCache:
        return treeWalkerCache.get(treeType)

    if treeType == "etree":
        import etree
        # XXX: NEVER cache here, caching is done in the etree submodule
        return etree.getETreeModule(implementation, **kwargs).TreeWalker

    if treeType in ("dom", "pulldom", "simpletree"):
        walkerModule = __import__(treeType, globals())
    elif treeType == "genshi":
        import genshistream as walkerModule
    elif treeType == "beautifulsoup":
        import soup as walkerModule
    elif treeType == "lxml":
        import lxmletree as walkerModule
    else:
        return None

    treeWalkerCache[treeType] = walkerModule.TreeWalker
    return treeWalkerCache.get(treeType)
53 | 


--------------------------------------------------------------------------------
/html5lib/treewalkers/_base.py:
--------------------------------------------------------------------------------
  1 | import gettext
  2 | _ = gettext.gettext
  3 | 
  4 | from html5lib.constants import voidElements, spaceCharacters
  5 | spaceCharacters = u"".join(spaceCharacters)
  6 | 
  7 | class TreeWalker(object):
  8 |     def __init__(self, tree):
  9 |         self.tree = tree
 10 | 
 11 |     def __iter__(self):
 12 |         raise NotImplementedError
 13 | 
 14 |     def error(self, msg):
 15 |         return {"type": "SerializeError", "data": msg}
 16 | 
 17 |     def normalizeAttrs(self, attrs):
 18 |         if not attrs:
 19 |             attrs = []
 20 |         elif hasattr(attrs, 'items'):
 21 |             attrs = attrs.items()
 22 |         return [(unicode(name),unicode(value)) for name,value in attrs]
 23 | 
 24 |     def emptyTag(self, namespace, name, attrs, hasChildren=False):
 25 |         yield {"type": "EmptyTag", "name": unicode(name), 
 26 |                "namespace":unicode(namespace),
 27 |                "data": self.normalizeAttrs(attrs)}
 28 |         if hasChildren:
 29 |             yield self.error(_("Void element has children"))
 30 | 
 31 |     def startTag(self, namespace, name, attrs):
 32 |         return {"type": "StartTag", 
 33 |                 "name": unicode(name),
 34 |                 "namespace":unicode(namespace),
 35 |                 "data": self.normalizeAttrs(attrs)}
 36 | 
 37 |     def endTag(self, namespace, name):
 38 |         return {"type": "EndTag", 
 39 |                 "name": unicode(name),
 40 |                 "namespace":unicode(namespace),
 41 |                 "data": []}
 42 | 
 43 |     def text(self, data):
 44 |         data = unicode(data)
 45 |         middle = data.lstrip(spaceCharacters)
 46 |         left = data[:len(data)-len(middle)]
 47 |         if left:
 48 |             yield {"type": "SpaceCharacters", "data": left}
 49 |         data = middle
 50 |         middle = data.rstrip(spaceCharacters)
 51 |         right = data[len(middle):]
 52 |         if middle:
 53 |             yield {"type": "Characters", "data": middle}
 54 |         if right:
 55 |             yield {"type": "SpaceCharacters", "data": right}
 56 | 
 57 |     def comment(self, data):
 58 |         return {"type": "Comment", "data": unicode(data)}
 59 | 
 60 |     def doctype(self, name, publicId=None, systemId=None, correct=True):
 61 |         return {"type": "Doctype",
 62 |                 "name": name is not None and unicode(name) or u"",
 63 |                 "publicId": publicId,
 64 |                 "systemId": systemId,
 65 |                 "correct": correct}
 66 | 
 67 |     def unknown(self, nodeType):
 68 |         return self.error(_("Unknown node type: ") + nodeType)
 69 | 
 70 | class RecursiveTreeWalker(TreeWalker):
 71 |     def walkChildren(self, node):
 72 |         raise NodeImplementedError
 73 | 
 74 |     def element(self, node, namespace, name, attrs, hasChildren):
 75 |         if name in voidElements:
 76 |             for token in self.emptyTag(namespace, name, attrs, hasChildren):
 77 |                 yield token
 78 |         else:
 79 |             yield self.startTag(name, attrs)
 80 |             if hasChildren:
 81 |                 for token in self.walkChildren(node):
 82 |                     yield token
 83 |             yield self.endTag(name)
 84 | 
 85 | from xml.dom import Node
 86 | 
# Node-kind tags returned as the first item of getNodeDetails() and
# dispatched on in NonRecursiveTreeWalker.__iter__. The values are taken
# from xml.dom.Node's node type constants.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
# Marker for a node whose kind matches none of the above.
UNKNOWN = "<#UNKNOWN#>"
 93 | 
class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses the tree iteratively.

    Subclasses supply four navigation hooks; ``__iter__`` uses them to visit
    every node in document order and yield the corresponding tokens.
    """
    def getNodeDetails(self, node):
        """Return a tuple whose first item is a node-kind constant and whose
        remaining items are the details for that kind (subclass hook)."""
        raise NotImplementedError
    
    def getFirstChild(self, node):
        """Return the first child of ``node``, or None (subclass hook)."""
        raise NotImplementedError
    
    def getNextSibling(self, node):
        """Return the next sibling of ``node``, or None (subclass hook)."""
        raise NotImplementedError
    
    def getParentNode(self, node):
        """Return the parent of ``node``, or None (subclass hook)."""
        raise NotImplementedError

    def __iter__(self):
        currentNode = self.tree
        while currentNode is not None:
            # Split the details into the node kind and its payload.
            details = self.getNodeDetails(currentNode)
            type, details = details[0], details[1:]
            hasChildren = False
            endTag = None

            if type == DOCTYPE:
                yield self.doctype(*details)

            elif type == TEXT:
                for token in self.text(*details):
                    yield token

            elif type == ELEMENT:
                namespace, name, attributes, hasChildren = details
                if name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes, 
                                               hasChildren):
                        yield token
                    # Never descend into a void element.
                    hasChildren = False
                else:
                    endTag = name
                    yield self.startTag(namespace, name, attributes)

            elif type == COMMENT:
                yield self.comment(details[0])

            elif type == DOCUMENT:
                hasChildren = True

            else:
                yield self.unknown(details[0])
            
            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None
            
            if firstChild is not None:
                # Descend.
                currentNode = firstChild
            else:
                # No children to visit: close the current element, then move
                # to the next sibling, climbing (and closing ancestors) until
                # one is found or the root of the walk is reached.
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    type, details = details[0], details[1:]
                    if type == ELEMENT:
                        namespace, name, attributes, hasChildren = details
                        if name not in voidElements:
                            yield self.endTag(namespace, name)
                    if self.tree is currentNode:
                        # Back at the starting node: the walk is finished.
                        currentNode = None
                        break
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    else:
                        currentNode = self.getParentNode(currentNode)
166 | 


--------------------------------------------------------------------------------
/html5lib/treewalkers/dom.py:
--------------------------------------------------------------------------------
 1 | from xml.dom import Node
 2 | 
 3 | import gettext
 4 | _ = gettext.gettext
 5 | 
 6 | import _base
 7 | from html5lib.constants import voidElements
 8 | 
 9 | class TreeWalker(_base.NonRecursiveTreeWalker):
10 |     def getNodeDetails(self, node):
11 |         if node.nodeType == Node.DOCUMENT_TYPE_NODE:
12 |             return _base.DOCTYPE, node.name, node.publicId, node.systemId
13 | 
14 |         elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
15 |             return _base.TEXT, node.nodeValue
16 | 
17 |         elif node.nodeType == Node.ELEMENT_NODE:
18 |             return (_base.ELEMENT, node.namespaceURI, node.nodeName, 
19 |                     node.attributes.items(), node.hasChildNodes)
20 | 
21 |         elif node.nodeType == Node.COMMENT_NODE:
22 |             return _base.COMMENT, node.nodeValue
23 | 
24 |         elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
25 |             return (_base.DOCUMENT,)
26 | 
27 |         else:
28 |             return _base.UNKNOWN, node.nodeType
29 | 
30 |     def getFirstChild(self, node):
31 |         return node.firstChild
32 | 
33 |     def getNextSibling(self, node):
34 |         return node.nextSibling
35 | 
36 |     def getParentNode(self, node):
37 |         return node.parentNode
38 | 


--------------------------------------------------------------------------------
/html5lib/treewalkers/etree.py:
--------------------------------------------------------------------------------
  1 | import gettext
  2 | _ = gettext.gettext
  3 | 
  4 | import new
  5 | import copy
  6 | import re
  7 | 
  8 | import _base
  9 | from html5lib.constants import voidElements
 10 | 
 11 | tag_regexp = re.compile("{([^}]*)}(.*)")
 12 | 
 13 | moduleCache = {}
 14 | 
 15 | def getETreeModule(ElementTreeImplementation):
 16 |     name = "_" + ElementTreeImplementation.__name__+"builder"
 17 |     if name in moduleCache:
 18 |         return moduleCache[name]
 19 |     else:
 20 |         mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
 21 |         objs = getETreeBuilder(ElementTreeImplementation)
 22 |         mod.__dict__.update(objs)
 23 |         moduleCache[name] = mod
 24 |         return mod
 25 | 
def getETreeBuilder(ElementTreeImplementation):
    """Build the TreeWalker class for one ElementTree implementation.

    Returns ``locals()``, i.e. a dict containing the ``TreeWalker`` class
    along with the ``ElementTree`` and ``ElementTreeImplementation`` names.
    """
    ElementTree = ElementTreeImplementation

    class TreeWalker(_base.NonRecursiveTreeWalker):
        """Given the particular ElementTree representation, this implementation,
        to avoid using recursion, returns "nodes" as tuples with the following
        content:

        1. The current element
        
        2. The index of the element relative to its parent
        
        3. A stack of ancestor elements
        
        4. A flag "text", "tail" or None to indicate if the current node is a
           text node; either the text or tail of the current element (1)
        """
        def getNodeDetails(self, node):
            if isinstance(node, tuple): # It might be the root Element
                elt, key, parents, flag = node
                if flag in ("text", "tail"):
                    # Text nodes are the element's .text or .tail attribute.
                    return _base.TEXT, getattr(elt, flag)
                else:
                    node = elt

            # An object without a tag is treated as a tree wrapper; use its root.
            if not(hasattr(node, "tag")):
                node = node.getroot()

            if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
                return (_base.DOCUMENT,)

            elif node.tag == "<!DOCTYPE>":
                return (_base.DOCTYPE, node.text, 
                        node.get("publicId"), node.get("systemId"))

            # A tag of the same type as the Comment factory marks a comment.
            elif type(node.tag) == type(ElementTree.Comment):
                return _base.COMMENT, node.text

            else:
                #This is assumed to be an ordinary element
                # Split a "{namespace}tag" name into its two parts.
                match = tag_regexp.match(node.tag)
                if match:
                    namespace, tag = match.groups()
                else:
                    namespace = None
                    tag = node.tag
                # The last item is used only for its truth value: child
                # elements or leading text both count as children.
                return (_base.ELEMENT, namespace, tag, 
                        node.attrib.items(), len(node) or node.text)
    
        def getFirstChild(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                # The bare root: no index, fresh ancestor stack.
                element, key, parents, flag = node, None, [], None
                
            if flag in ("text", "tail"):
                # Text nodes have no children.
                return None
            else:
                if element.text:
                    # Leading text is visited before any child element.
                    return element, key, parents, "text"
                elif len(element):
                    # Note: the ancestor stack is mutated in place and shared
                    # between the node tuples.
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None
        
        def getNextSibling(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None
                
            if flag == "text":
                # After an element's leading text comes its first child element.
                if len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None
            else:
                if element.tail and flag != "tail":
                    # An element is followed by its own tail text, if any.
                    return element, key, parents, "tail"
                elif key < len(parents[-1]) - 1:
                    return parents[-1][key+1], key+1, parents, None
                else:
                    return None
        
        def getParentNode(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None
            
            if flag == "text":
                # The parent of leading text is the element that owns it.
                if not parents:
                    return element
                else:
                    return element, key, parents, None
            else:
                parent = parents.pop()
                if not parents:
                    # Popped the last ancestor: hand back the bare root.
                    return parent
                else:
                    # Recompute the parent's index within its own parent.
                    return parent, list(parents[-1]).index(parent), parents, None

    return locals()
131 | 


--------------------------------------------------------------------------------
/html5lib/treewalkers/genshistream.py:
--------------------------------------------------------------------------------
 1 | from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
 2 | from genshi.core  import  START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
 3 | from genshi.output import NamespaceFlattener
 4 | 
 5 | import _base
 6 | 
 7 | from html5lib.constants import voidElements
 8 | 
 9 | class TreeWalker(_base.TreeWalker):
10 |     def __iter__(self):
11 |         depth = 0
12 |         ignore_until = None
13 |         previous = None
14 |         for event in self.tree:
15 |             if previous is not None:
16 |                 if previous[0] == START:
17 |                     depth += 1
18 |                 if ignore_until <= depth:
19 |                     ignore_until = None
20 |                 if ignore_until is None:
21 |                     for token in self.tokens(previous, event):
22 |                         yield token
23 |                         if token["type"] == "EmptyTag":
24 |                             ignore_until = depth
25 |                 if previous[0] == END:
26 |                     depth -= 1
27 |             previous = event
28 |         if previous is not None:
29 |             if ignore_until is None or ignore_until <= depth:
30 |                 for token in self.tokens(previous, None):
31 |                     yield token
32 |             elif ignore_until is not None:
33 |                 raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
34 | 
35 |     def tokens(self, event, next):
36 |         kind, data, pos = event
37 |         if kind == START:
38 |             tag, attrib = data
39 |             name = tag.localname
40 |             namespace = tag.namespace
41 |             if tag in voidElements:
42 |                 for token in self.emptyTag(namespace, name, list(attrib),
43 |                                            not next or next[0] != END 
44 |                                            or next[1] != tag):
45 |                     yield token
46 |             else:
47 |                 yield self.startTag(namespace, name, list(attrib))
48 | 
49 |         elif kind == END:
50 |             name = data.localname
51 |             namespace = data.namespace
52 |             if name not in voidElements:
53 |                 yield self.endTag(namespace, name)
54 | 
55 |         elif kind == COMMENT:
56 |             yield self.comment(data)
57 | 
58 |         elif kind == TEXT:
59 |             for token in self.text(data):
60 |                 yield token
61 | 
62 |         elif kind == DOCTYPE:
63 |             yield self.doctype(*data)
64 | 
65 |         elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
66 |           START_CDATA, END_CDATA, PI):
67 |             pass
68 | 
69 |         else:
70 |             yield self.unknown(kind)
71 | 


--------------------------------------------------------------------------------
/html5lib/treewalkers/lxmletree.py:
--------------------------------------------------------------------------------
  1 | from lxml import etree
  2 | from html5lib.treebuilders.etree import tag_regexp
  3 | 
  4 | from gettext import gettext
  5 | _ = gettext
  6 | 
  7 | import _base
  8 | 
  9 | from html5lib.constants import voidElements
 10 | from html5lib import ihatexml
 11 | 
 12 | class Root(object):
 13 |     def __init__(self, et):
 14 |         self.elementtree = et
 15 |         self.children = []
 16 |         if et.docinfo.internalDTD:
 17 |             self.children.append(Doctype(self, et.docinfo.root_name, 
 18 |                                          et.docinfo.public_id, 
 19 |                                          et.docinfo.system_url))
 20 |         root = et.getroot()
 21 |         node = root
 22 | 
 23 |         while node.getprevious() is not None:
 24 |             node = node.getprevious()
 25 |         while node is not None:
 26 |             self.children.append(node)
 27 |             node = node.getnext()
 28 | 
 29 |         self.text = None
 30 |         self.tail = None
 31 |     
 32 |     def __getitem__(self, key):
 33 |         return self.children[key]
 34 | 
 35 |     def getnext(self):
 36 |         return None
 37 | 
 38 |     def __len__(self):
 39 |         return 1
 40 | 
 41 | class Doctype(object):
 42 |     def __init__(self, root_node, name, public_id, system_id):
 43 |         self.root_node = root_node
 44 |         self.name = name
 45 |         self.public_id = public_id
 46 |         self.system_id = system_id
 47 |         
 48 |         self.text = None
 49 |         self.tail = None
 50 | 
 51 |     def getnext(self):
 52 |         return self.root_node.children[1]
 53 | 
 54 | class FragmentRoot(Root):
 55 |     def __init__(self, children):
 56 |         self.children = [FragmentWrapper(self, child) for child in children]
 57 |         self.text = self.tail = None
 58 | 
 59 |     def getnext(self):
 60 |         return None
 61 | 
 62 | class FragmentWrapper(object):
 63 |     def __init__(self, fragment_root, obj):
 64 |         self.root_node = fragment_root
 65 |         self.obj = obj
 66 |         if hasattr(self.obj, 'text'):
 67 |             self.text = self.obj.text
 68 |         else:
 69 |             self.text = None
 70 |         if hasattr(self.obj, 'tail'):
 71 |             self.tail = self.obj.tail
 72 |         else:
 73 |             self.tail = None
 74 |         self.isstring = isinstance(obj, basestring)
 75 |         
 76 |     def __getattr__(self, name):
 77 |         return getattr(self.obj, name)
 78 |     
 79 |     def getnext(self):
 80 |         siblings = self.root_node.children
 81 |         idx = siblings.index(self)
 82 |         if idx < len(siblings) - 1:
 83 |             return siblings[idx + 1]
 84 |         else:
 85 |             return None
 86 | 
 87 |     def __getitem__(self, key):
 88 |         return self.obj[key]
 89 | 
 90 |     def __nonzero__(self):
 91 |         return bool(self.obj)
 92 | 
 93 |     def getparent(self):
 94 |         return None
 95 | 
 96 |     def __str__(self):
 97 |         return str(self.obj)
 98 | 
 99 |     def __len__(self):
100 |         return len(self.obj)
101 | 
102 |         
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Non-recursive tree walker over lxml trees and fragments.

    Text is represented by ``(node, "text")`` / ``(node, "tail")`` tuples;
    every other position is the node object itself.
    """

    def __init__(self, tree):
        # Wrap whole trees and fragment lists in a synthetic root node.
        if hasattr(tree, "getroot"):
            tree = Root(tree)
        elif isinstance(tree, list):
            tree = FragmentRoot(tree)
        _base.NonRecursiveTreeWalker.__init__(self, tree)
        self.filter = ihatexml.InfosetFilter()

    def getNodeDetails(self, node):
        """Return the details tuple describing ``node``."""
        if isinstance(node, tuple):  # Text node
            owner, which = node
            assert which in ("text", "tail"), _("Text nodes are text or tail, found %s") % which
            return _base.TEXT, getattr(owner, which)

        if isinstance(node, Root):
            return (_base.DOCUMENT,)

        if isinstance(node, Doctype):
            return _base.DOCTYPE, node.name, node.public_id, node.system_id

        if isinstance(node, FragmentWrapper) and node.isstring:
            return _base.TEXT, node

        if node.tag == etree.Comment:
            return _base.COMMENT, node.text

        # Anything else is assumed to be an ordinary element.
        match = tag_regexp.match(node.tag)
        if match:
            namespace, tag = match.groups()
        else:
            namespace, tag = None, node.tag
        element_name = self.filter.fromXmlName(tag)
        attributes = [(self.filter.fromXmlName(name), value)
                      for name, value in node.attrib.iteritems()]
        has_children = len(node) > 0 or node.text
        return (_base.ELEMENT, namespace, element_name, attributes,
                has_children)

    def getFirstChild(self, node):
        """Return the leading text position, else the first child element."""
        assert not isinstance(node, tuple), _("Text nodes have no children")

        assert len(node) or node.text, "Node has no children"
        if node.text:
            return (node, "text")
        return node[0]

    def getNextSibling(self, node):
        """Return the position that follows ``node``, or None."""
        if isinstance(node, tuple):  # Text node
            owner, which = node
            assert which in ("text", "tail"), _("Text nodes are text or tail, found %s") % which
            if which == "tail":
                return owner.getnext()
            # len() is used rather than truth-testing the node, which may
            # evaluate to False when it has no child element.
            if len(owner):
                return owner[0]
            return None

        # An element is followed by its tail text, if any.
        if node.tail:
            return (node, "tail")
        return node.getnext()

    def getParentNode(self, node):
        """Return the parent of ``node``."""
        if isinstance(node, tuple):  # Text node
            node, which = node
            assert which in ("text", "tail"), _("Text nodes are text or tail, found %s") % which
            if which == "text":
                return node
            # A tail falls through to the owning element's parent.

        return node.getparent()
176 | 


--------------------------------------------------------------------------------
/html5lib/treewalkers/pulldom.py:
--------------------------------------------------------------------------------
 1 | from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
 2 |     COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
 3 | 
 4 | import _base
 5 | 
 6 | from html5lib.constants import voidElements
 7 | 
 8 | class TreeWalker(_base.TreeWalker):
 9 |     def __iter__(self):
10 |         ignore_until = None
11 |         previous = None
12 |         for event in self.tree:
13 |             if previous is not None and \
14 |               (ignore_until is None or previous[1] is ignore_until):
15 |                 if previous[1] is ignore_until:
16 |                     ignore_until = None
17 |                 for token in self.tokens(previous, event):
18 |                     yield token
19 |                     if token["type"] == "EmptyTag":
20 |                         ignore_until = previous[1]
21 |             previous = event
22 |         if ignore_until is None or previous[1] is ignore_until:
23 |             for token in self.tokens(previous, None):
24 |                 yield token
25 |         elif ignore_until is not None:
26 |             raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
27 | 
28 |     def tokens(self, event, next):
29 |         type, node = event
30 |         if type == START_ELEMENT:
31 |             name = node.nodeName
32 |             namespace = node.namespaceURI
33 |             if name in voidElements:
34 |                 for token in self.emptyTag(namespace,
35 |                                            name,
36 |                                            node.attributes.items(), 
37 |                                            not next or next[1] is not node):
38 |                     yield token
39 |             else:
40 |                 yield self.startTag(namespace, name, node.attributes.items())
41 | 
42 |         elif type == END_ELEMENT:
43 |             name = node.nodeName
44 |             namespace = node.namespaceURI
45 |             if name not in voidElements:
46 |                 yield self.endTag(namespace, name)
47 | 
48 |         elif type == COMMENT:
49 |             yield self.comment(node.nodeValue)
50 | 
51 |         elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
52 |             for token in self.text(node.nodeValue):
53 |                 yield token
54 | 
55 |         else:
56 |             yield self.unknown(type)
57 | 


--------------------------------------------------------------------------------
/html5lib/treewalkers/simpletree.py:
--------------------------------------------------------------------------------
 1 | import gettext
 2 | _ = gettext.gettext
 3 | 
 4 | import _base
 5 | 
 6 | class TreeWalker(_base.NonRecursiveTreeWalker):
 7 |     """Given that simpletree has no performant way of getting a node's
 8 |     next sibling, this implementation returns "nodes" as tuples with the
 9 |     following content:
10 | 
11 |     1. The parent Node (Element, Document or DocumentFragment)
12 | 
13 |     2. The child index of the current node in its parent's children list
14 | 
15 |     3. A list used as a stack of all ancestors. It is a pair tuple whose
16 |        first item is a parent Node and second item is a child index.
17 |     """
18 | 
19 |     def getNodeDetails(self, node):
20 |         if isinstance(node, tuple): # It might be the root Node
21 |             parent, idx, parents = node
22 |             node = parent.childNodes[idx]
23 | 
24 |         # testing node.type allows us not to import treebuilders.simpletree
25 |         if node.type in (1, 2): # Document or DocumentFragment
26 |             return (_base.DOCUMENT,)
27 | 
28 |         elif node.type == 3: # DocumentType
29 |             return _base.DOCTYPE, node.name, node.publicId, node.systemId
30 | 
31 |         elif node.type == 4: # TextNode
32 |             return _base.TEXT, node.value
33 | 
34 |         elif node.type == 5: # Element
35 |             return (_base.ELEMENT, node.namespace, node.name, 
36 |                     node.attributes.items(), node.hasContent())
37 | 
38 |         elif node.type == 6: # CommentNode
39 |             return _base.COMMENT, node.data
40 | 
41 |         else:
42 |             return _node.UNKNOWN, node.type
43 | 
44 |     def getFirstChild(self, node):
45 |         if isinstance(node, tuple): # It might be the root Node
46 |             parent, idx, parents = node
47 |             parents.append((parent, idx))
48 |             node = parent.childNodes[idx]
49 |         else:
50 |             parents = []
51 | 
52 |         assert node.hasContent(), "Node has no children"
53 |         return (node, 0, parents)
54 | 
55 |     def getNextSibling(self, node):
56 |         assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
57 |         parent, idx, parents = node
58 |         idx += 1
59 |         if len(parent.childNodes) > idx:
60 |             return (parent, idx, parents)
61 |         else:
62 |             return None
63 | 
64 |     def getParentNode(self, node):
65 |         assert isinstance(node, tuple)
66 |         parent, idx, parents = node
67 |         if parents:
68 |             parent, idx = parents.pop()
69 |             return parent, idx, parents
70 |         else:
71 |             # HACK: We could return ``parent`` but None will stop the algorithm the same way
72 |             return None
73 | 


--------------------------------------------------------------------------------
/html5lib/treewalkers/soup.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import gettext
 3 | _ = gettext.gettext
 4 | 
 5 | from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
 6 | from html5lib.constants import namespaces
 7 | import _base
 8 | 
 9 | class TreeWalker(_base.NonRecursiveTreeWalker):
10 |     doctype_regexp = re.compile(
11 |         r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
12 |     def getNodeDetails(self, node):
13 |         if isinstance(node, BeautifulSoup): # Document or DocumentFragment
14 |             return (_base.DOCUMENT,)
15 | 
16 |         elif isinstance(node, Declaration): # DocumentType
17 |             string = unicode(node.string)
18 |             #Slice needed to remove markup added during unicode conversion,
19 |             #but only in some versions of BeautifulSoup/Python
20 |             if string.startswith('<!') and string.endswith('>'):
21 |                 string = string[2:-1]
22 |             m = self.doctype_regexp.match(string)
23 |             #This regexp approach seems wrong and fragile
24 |             #but beautiful soup stores the doctype as a single thing and we want the seperate bits
25 |             #It should work as long as the tree is created by html5lib itself but may be wrong if it's
26 |             #been modified at all
27 |             #We could just feed to it a html5lib tokenizer, I guess...
28 |             assert m is not None, "DOCTYPE did not match expected format"
29 |             name = m.group('name')
30 |             publicId = m.group('publicId')
31 |             if publicId is not None:
32 |                 systemId = m.group('systemId1')
33 |             else:
34 |                 systemId = m.group('systemId2')
35 |             return _base.DOCTYPE, name, publicId or "", systemId or ""
36 | 
37 |         elif isinstance(node, Comment):
38 |             string = unicode(node.string)
39 |             if string.startswith('<!--') and string.endswith('-->'):
40 |                 string = string[4:-3]
41 |             return _base.COMMENT, string
42 | 
43 |         elif isinstance(node, unicode): # TextNode
44 |             return _base.TEXT, node
45 | 
46 |         elif isinstance(node, Tag): # Element
47 |             return (_base.ELEMENT, namespaces["html"], node.name,
48 |                     dict(node.attrs).items(), node.contents)
49 |         else:
50 |             return _base.UNKNOWN, node.__class__.__name__
51 | 
52 |     def getFirstChild(self, node):
53 |         return node.contents[0]
54 | 
55 |     def getNextSibling(self, node):
56 |         return node.nextSibling
57 | 
58 |     def getParentNode(self, node):
59 |         return node.parent
60 | 


--------------------------------------------------------------------------------
/html5lib/utils.py:
--------------------------------------------------------------------------------
  1 | try:
  2 |     frozenset
  3 | except NameError:
  4 |     #Import from the sets module for python 2.3
  5 |     from sets import Set as set
  6 |     from sets import ImmutableSet as frozenset
  7 | 
  8 | class MethodDispatcher(dict):
  9 |     """Dict with 2 special properties:
 10 | 
 11 |     On initiation, keys that are lists, sets or tuples are converted to
 12 |     multiple keys so accessing any one of the items in the original
 13 |     list-like object returns the matching value
 14 | 
 15 |     md = MethodDispatcher({("foo", "bar"):"baz"})
 16 |     md["foo"] == "baz"
 17 | 
 18 |     A default value which can be set through the default attribute.
 19 |     """
 20 | 
 21 |     def __init__(self, items=()):
 22 |         # Using _dictEntries instead of directly assigning to self is about
 23 |         # twice as fast. Please do careful performance testing before changing
 24 |         # anything here.
 25 |         _dictEntries = []
 26 |         for name,value in items:
 27 |             if type(name) in (list, tuple, frozenset, set):
 28 |                 for item in name:
 29 |                     _dictEntries.append((item, value))
 30 |             else:
 31 |                 _dictEntries.append((name, value))
 32 |         dict.__init__(self, _dictEntries)
 33 |         self.default = None
 34 | 
 35 |     def __getitem__(self, key):
 36 |         return dict.get(self, key, self.default)
 37 | 
 38 | #Pure python implementation of deque taken from the ASPN Python Cookbook
 39 | #Original code by Raymond Hettinger
 40 | 
 41 | class deque(object):
 42 | 
 43 |     def __init__(self, iterable=(), maxsize=-1):
 44 |         if not hasattr(self, 'data'):
 45 |             self.left = self.right = 0
 46 |             self.data = {}
 47 |         self.maxsize = maxsize
 48 |         self.extend(iterable)
 49 | 
 50 |     def append(self, x):
 51 |         self.data[self.right] = x
 52 |         self.right += 1
 53 |         if self.maxsize != -1 and len(self) > self.maxsize:
 54 |             self.popleft()
 55 |         
 56 |     def appendleft(self, x):
 57 |         self.left -= 1        
 58 |         self.data[self.left] = x
 59 |         if self.maxsize != -1 and len(self) > self.maxsize:
 60 |             self.pop()      
 61 |         
 62 |     def pop(self):
 63 |         if self.left == self.right:
 64 |             raise IndexError('cannot pop from empty deque')
 65 |         self.right -= 1
 66 |         elem = self.data[self.right]
 67 |         del self.data[self.right]         
 68 |         return elem
 69 |     
 70 |     def popleft(self):
 71 |         if self.left == self.right:
 72 |             raise IndexError('cannot pop from empty deque')
 73 |         elem = self.data[self.left]
 74 |         del self.data[self.left]
 75 |         self.left += 1
 76 |         return elem
 77 | 
 78 |     def clear(self):
 79 |         self.data.clear()
 80 |         self.left = self.right = 0
 81 | 
 82 |     def extend(self, iterable):
 83 |         for elem in iterable:
 84 |             self.append(elem)
 85 | 
 86 |     def extendleft(self, iterable):
 87 |         for elem in iterable:
 88 |             self.appendleft(elem)
 89 | 
 90 |     def rotate(self, n=1):
 91 |         if self:
 92 |             n %= len(self)
 93 |             for i in xrange(n):
 94 |                 self.appendleft(self.pop())
 95 | 
 96 |     def __getitem__(self, i):
 97 |         if i < 0:
 98 |             i += len(self)
 99 |         try:
100 |             return self.data[i + self.left]
101 |         except KeyError:
102 |             raise IndexError
103 | 
104 |     def __setitem__(self, i, value):
105 |         if i < 0:
106 |             i += len(self)        
107 |         try:
108 |             self.data[i + self.left] = value
109 |         except KeyError:
110 |             raise IndexError
111 | 
112 |     def __delitem__(self, i):
113 |         size = len(self)
114 |         if not (-size <= i < size):
115 |             raise IndexError
116 |         data = self.data
117 |         if i < 0:
118 |             i += size
119 |         for j in xrange(self.left+i, self.right-1):
120 |             data[j] = data[j+1]
121 |         self.pop()
122 |     
123 |     def __len__(self):
124 |         return self.right - self.left
125 | 
126 |     def __cmp__(self, other):
127 |         if type(self) != type(other):
128 |             return cmp(type(self), type(other))
129 |         return cmp(list(self), list(other))
130 |             
131 |     def __repr__(self, _track=[]):
132 |         if id(self) in _track:
133 |             return '...'
134 |         _track.append(id(self))
135 |         r = 'deque(%r)' % (list(self),)
136 |         _track.remove(id(self))
137 |         return r
138 |     
139 |     def __getstate__(self):
140 |         return (tuple(self),)
141 |     
142 |     def __setstate__(self, s):
143 |         self.__init__(s[0])
144 |         
145 |     def __hash__(self):
146 |         raise TypeError
147 |     
148 |     def __copy__(self):
149 |         return self.__class__(self)
150 |     
151 |     def __deepcopy__(self, memo={}):
152 |         from copy import deepcopy
153 |         result = self.__class__()
154 |         memo[id(self)] = result
155 |         result.__init__(deepcopy(tuple(self), memo))
156 |         return result


--------------------------------------------------------------------------------
/images/index.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/index.png


--------------------------------------------------------------------------------
/images/ui-bg_diagonals-thick_18_b81900_40x40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-bg_diagonals-thick_18_b81900_40x40.png


--------------------------------------------------------------------------------
/images/ui-bg_diagonals-thick_20_666666_40x40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-bg_diagonals-thick_20_666666_40x40.png


--------------------------------------------------------------------------------
/images/ui-bg_flat_10_000000_40x100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-bg_flat_10_000000_40x100.png


--------------------------------------------------------------------------------
/images/ui-bg_glass_100_f6f6f6_1x400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-bg_glass_100_f6f6f6_1x400.png


--------------------------------------------------------------------------------
/images/ui-bg_glass_100_fdf5ce_1x400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-bg_glass_100_fdf5ce_1x400.png


--------------------------------------------------------------------------------
/images/ui-bg_glass_65_ffffff_1x400.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-bg_glass_65_ffffff_1x400.png


--------------------------------------------------------------------------------
/images/ui-bg_gloss-wave_35_f6a828_500x100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-bg_gloss-wave_35_f6a828_500x100.png


--------------------------------------------------------------------------------
/images/ui-bg_highlight-soft_100_eeeeee_1x100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-bg_highlight-soft_100_eeeeee_1x100.png


--------------------------------------------------------------------------------
/images/ui-bg_highlight-soft_75_ffe45c_1x100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-bg_highlight-soft_75_ffe45c_1x100.png


--------------------------------------------------------------------------------
/images/ui-icons_222222_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-icons_222222_256x240.png


--------------------------------------------------------------------------------
/images/ui-icons_228ef1_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-icons_228ef1_256x240.png


--------------------------------------------------------------------------------
/images/ui-icons_ef8c08_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-icons_ef8c08_256x240.png


--------------------------------------------------------------------------------
/images/ui-icons_ffd27a_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-icons_ffd27a_256x240.png


--------------------------------------------------------------------------------
/images/ui-icons_ffffff_256x240.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binux/cssfulltext/bc0091ddb688d0114a9575b77b6a8f92e08f9262/images/ui-icons_ffffff_256x240.png


--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
1 | indexes:
2 | 
3 | - kind: Project
4 |   properties:
5 |     - name: __searchable_text_index_name_link_description
6 |     - name: lastUpdateDate
7 |       direction: desc
8 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # Copyright 2007 Google Inc.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | import webapp2
18 | 
class MainHandler(webapp2.RequestHandler):
    """Request handler that answers with a fixed greeting."""

    def get(self):
        """Write the greeting text to the response for a GET request."""
        out = self.response
        out.write('Hello world!')
22 | 
# WSGI application object: a single route sends "/" to MainHandler.
app = webapp2.WSGIApplication(
    [('/', MainHandler)],
    debug=True)
26 | 


--------------------------------------------------------------------------------
/project.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | #-- coding: utf-8 --
  3 | '''
  4 | Created on 2011.2.3
  5 | 
  6 | @author: binux
  7 | '''
  8 | 
  9 | import hashlib
 10 | import datetime
 11 | import feedparser
 12 | import feedformatter
 13 | import fetcher
 14 | 
 15 | from google.appengine.ext import db
 16 | from google.appengine.api import urlfetch
 17 | from google.appengine.api import users
 18 | from google.appengine.ext import search
 19 | 
 20 | RETRY_TIMES_60 = 3
 21 | RETRY_TIMES_PROJECT = 20
 22 | RETRY_TIMES_1DAYS = 120
 23 | RETRY_INTERVAL  = datetime.timedelta(0, 60)
 24 | 
class Project(search.SearchableModel):
    """Datastore model for one full-text feed project.

    Stores the source feed link, the CSS selectors edited through the editor
    form, the cached generated feed, and the scheduling / retry state that
    updateProject and forceUpdateProject read and write.
    """
    # Descriptive fields; name, link and description are the searchable ones
    # (see SearchableProperties below).
    name = db.StringProperty(required=True)
    link = db.LinkProperty(required=True)  # URL fetched by forceUpdateProject
    description = db.StringProperty(multiline=True)
    user = db.UserProperty(required=True)
    # Multi-line selector lists; the editor template labels them as CSS
    # selectors, one per line (content selectors and ad-filter selectors).
    contentSelector = db.StringProperty(multiline=True)
    filterSelector = db.StringProperty(multiline=True)
    # NOTE(review): presumably the character encoding used when fetching
    # article pages; not referenced in this module -- confirm in fetcher.
    encoding = db.StringProperty(default=None)

    # MD5 hex digest of the last fetched feed body; used to detect an unchanged feed.
    contentHash = db.StringProperty()
    # Generated RSS 2.0 document (XML declaration + formatted feed).
    resultCache = db.TextProperty()

    createdDate = db.DateTimeProperty(required=True, auto_now_add=True)
    lastModifiedDate = db.DateTimeProperty(required=True, auto_now_add=True)
    # auto_now=True: declared to be refreshed automatically by the datastore
    # layer; updateProject compares it against the current time.
    lastUpdateDate = db.DateTimeProperty(required=True, auto_now=True)
    # Advanced by forceUpdateProject; set to datetime.max once the project is invalid.
    nextUpdateDate = db.DateTimeProperty(required=True, auto_now_add=True)

    subscriptCount = db.IntegerProperty()
    # Used as a number of seconds when scheduling the next update.
    updateFrequent = db.IntegerProperty()
    # Incremented on each temporary failure and reset to 0 after a successful fetch.
    retryCount = db.IntegerProperty(default=0)
    comments = db.StringProperty()
    # True once the link was rejected as invalid or the retry budget was exhausted.
    inValid = db.BooleanProperty(required=True,default=False)

    @classmethod
    def SearchableProperties(cls):
        """Return the property-name groups to index: one group of name, link, description."""
        return [['name','link', 'description']]
 51 | 
 52 | def insertErrorItem(rss_string):
 53 |     from os import environ
 54 |     rss_dict = feedparser.parse(rss_string)
 55 |     ErrorMessageItem= {}
 56 |     ErrorMessageItem["title"] = "your feed is invalid"
 57 |     ErrorMessageItem["link"] = "http://%s.appspot.com" % environ['APPLICATION_ID']
 58 |     ErrorMessageItem["description"] = "Your feed made by is no more valid."
 59 |     ErrorMessageItem["guid"] = "Feed Invalid at "+datetime.datetime.now()
 60 |     rss_dict.entries.insert(0, ErrorMessageItem)
 61 |     feed = feedformatter.Feed(rss_dict.feed, rss_dict.entries)
 62 |     return feed.format_rss2_string(validate=False, pretty=True)
 63 | 
 64 | def updateProject(project):
 65 |     if datetime.datetime.now() - project.lastUpdateDate < datetime.timedelta(minutes=1): 
 66 |         return 
 67 |     elif project.inValid: 
 68 |         project.nextUpdateDate = datetime.datetime.max
 69 |         return
 70 |     else:
 71 |         forceUpdateProject(project)
 72 | 
 73 | def forceUpdateProject(project):
 74 |     # step 1: fetch rss
 75 |     def tempFail():
 76 |         project.retryCount += 1
 77 |         if project.retryCount < RETRY_TIMES_60:
 78 |             project.nextUpdateDate += RETRY_INTERVAL
 79 |         elif project.retryCount < RETRY_TIMES_PROJECT:
 80 |             project.nextUpdateDate += datetime.timedelta(seconds=project.updateFrequent)
 81 |         elif project.retryCount < RETRY_TIMES_1DAYS:
 82 |             project.nextUpdateDate += datetime.timedelta(days=1)
 83 |         else:
 84 |             project.nextUpdateDate = datetime.datetime.max
 85 |             project.resultCache = insertErrorItem(project.resultCache)
 86 |             project.inValid = True
 87 | 
 88 |     try:
 89 |         response = urlfetch.fetch(project.link)
 90 |     except urlfetch.InvalidURLError, e:
 91 |         project.inValid = True
 92 |         project.put()
 93 |         return
 94 |     except (urlfetch.DownloadError, urlfetch.ResponseTooLargeError), e:
 95 |         tempFail()
 96 |         project.put()
 97 |         return
 98 | 
 99 |     if response.status_code != 200:
100 |         tempFail()
101 |         project.put()
102 |         return
103 |     elif hashlib.md5(response.content).hexdigest() == project.contentHash:
104 |         # step 1.5(1): nothing change
105 |         project.retryCount = 0
106 |         project.nextUpdateDate += datetime.timedelta(seconds=project.updateFrequent)
107 |         project.put()
108 |         return
109 |     else:
110 |         # step 1.5(2): set project status
111 |         project.contentHash = hashlib.md5(response.content).hexdigest()
112 |         project.retryCount = 0
113 |         project.nextUpdateDate += datetime.timedelta(seconds=project.updateFrequent)
114 | 
115 |     # step 2: parse rss
116 |     rss_dict = feedparser.parse(response.content, response_headers=response.headers)
117 |     
118 |     # step 3: get each descriptions
119 |     for entry in rss_dict.entries:
120 |         if not entry.has_key('link'):
121 |             continue
122 | 
123 |         new_description = fetcher.fetch_description(entry.link, project)
124 |         if new_description:
125 |             entry['summary'] = new_description
126 | 
127 |     # step 4: save result to database
128 |     feed = feedformatter.Feed(rss_dict.feed, rss_dict.entries)
129 |     feed_content = feed.format_rss2_string(validate=False, pretty=True)
130 |     # add project dtd
131 |     project.resultCache = '<?xml version="1.0" encoding="utf-8"?>' + feed_content
132 | 
133 |     project.put()
134 | 


--------------------------------------------------------------------------------
/queue.yaml:
--------------------------------------------------------------------------------
# Task queue definitions.
queue:
# Queue named "default", rate 5 per second.
- name: default
  rate: 5/s
# Queue named "project", rate 50 per second.
- name: project
  rate: 50/s
6 | 


--------------------------------------------------------------------------------
/template/base.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML>
 2 | 
 3 | <!--
 4 | Site Name:      full-text
 5 | Developed By:   binux - 足兆叉虫
 6 | Date Created:   2011-2-5
 7 | Last Updated:   2011-2-5
 8 | Copyright:      GPLv3   <http://www.gnu.org/licenses/>
 9 | -->
10 | 
11 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
12 | <head>
13 |   <title>{% block title %}Css Full-Text{% endblock %}</title>
14 |   
15 |   <!--CSS-->
16 |   <link rel="stylesheet" type="text/css" href="http://yui.yahooapis.com/3.2.0/build/cssreset/reset-min.css">
17 |   <link rel="stylesheet" type="text/css" href="http://yui.yahooapis.com/3.2.0/build/cssbase/base-min.css">
18 |   <link rel="stylesheet" type="text/css" href="http://yui.yahooapis.com/3.2.0/build/cssfonts/fonts-min.css">
19 |   <link rel="stylesheet" type="text/css" href="/css/style.css">
20 |   <link rel="stylesheet" type="text/css" href="/css/jquery-ui-1.8.9.custom.css">
21 |   <!--Character Encoding-->
22 |   <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
23 |   
24 |   <!--javascript-->
25 |   <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js"></script>
26 |   <script src="https://ajax.googleapis.com/ajax/libs/jqueryui/1.8.9/jquery-ui.min.js"></script>
27 |   
28 | </head>
29 | 
30 | <body>
31 |   <!--Main Container - Centers Everything-->
32 |   <div id="mainContainer" class="yui3-g">
33 |     
34 |     <div id="wrap">
35 |       <!--Header-->
36 |       <div id="header">
37 |         <div id="headerContent">
38 |           <div id="headerRight">
39 |             {% if user %}
40 |             <span> {{ user.email }} </span>
41 |             <span>|</span>
42 |             <span> <a href="/s/user/?q={{user.email}}">我的Feed</a> </span>
43 |             <span>|</span>
44 |             <span> <a href="{{ logout }}">登出</a> </span>
45 |             {% else %}
46 |             <span> <a href="{{ login }}">登入</a> </span>
47 |             {% endif %}
48 |           </div>
49 |           <div id="headerLeft">
50 |             <span> <a href="/">首页</a> </span>
51 |             <span> <a href="http://binux.me/" target="_blank">Binuxの杂货铺</a> </span>
52 |             <!--<span> <a href="http://douban2bangumi.appspot.com/" target="_blank">Douban2Bangumi</a> </span>-->
53 |             <!--<span> <a href="http://war3.byr.edu.cn/" target="_blank">BYR Dota 战网</a> </span>-->
54 |           </div>
55 |         </div>
56 |         <div class="line"></div>
57 |       </div>
58 |       
59 |       <!--Main Content-->
60 |       <div id="mainContent">
61 |         {% block content %}{% endblock %}
62 |         <div id="footer-margin" class="clear-both"></div>
63 |       </div>
64 |     </div>
65 |     
66 |     <!--Footer-->
67 |     <div id="footer">
68 |       <div class="line"></div>
69 |       <div id="applogo"><a href="#"><img src="http://code.google.com/appengine/images/appengine-silver-120x30.gif" alt="由 Google App Engine 提供支持" /></a></div>
70 |     </div>
71 |   </div>
72 |   {% block script %}{% endblock %}
73 | </body>
74 | </html>
75 | 
76 | 
77 |  
78 | 


--------------------------------------------------------------------------------
/template/editor.html:
--------------------------------------------------------------------------------
  1 | {% extends "base.html" %}
  2 | 
  3 | {% block content %}
  4 |       {% ifequal saved "1" %}
  5 |         <div id="notice" class="ui-corner-all ui-state-highlight" style="width: 80%; margin: 20px auto 0;">
  6 |           <p style="margin: 0.7em;"><span class="ui-icon ui-icon-info" style="float: left; margin-right: .3em;"></span>保存成功！</p>
  7 |         </div>
  8 |       {% endifequal %}
  9 |       <div id="editorContent" class="ui-corner-all">
 10 |         <div id="editorTitle">
 11 |           <span>编辑全文Feed</span>
 12 |           {% ifnotequal project.key.id 0 %}
 13 |           <a href="/a/{{ project.key.id }}" class="button ui-corner-all float-right">立即订阅</a>
 14 |           <div class="clear-both"></div>
 15 |           {% endifnotequal %}
 16 |         </div>
 17 |         <form id="editorForm" action="/e/" method="post" onsubmit="return checkForm();">
 18 |           <div id="editorLeft">
 19 |             <div>
 20 |               <input id="projectId" type="hidden" name="id" value="{{ project.key.id }}" />
 21 |               <div class="inputArea">
 22 |                 <label for="editorLinkInput">原始Feed地址</label>
 23 |                 <input id="editorLinkInput" type="text" name="link" value="{{ project.link|escape }}" {{ is_valid|yesno:",readonly" }} {% ifnotequal project.key.id 0 %}readonly{% endifnotequal %} style="width: 100%" />
 24 |               </div>
 25 |               <div class="inputArea">
 26 |                 <label for="editorNameInput">名称</label>
 27 |                 <input id="editorNameInput" type="text" name="name" value="{{ project.name|escape }}" {{ is_valid|yesno:",readonly" }} style="width: 100%" />
 28 |               </div>
 29 |               <div class="inputArea">
 30 |                 <label for="editorDescriptionTextarea">简介</label>
 31 |                 <textarea id="editorDescriptionTextarea" name="description" {{ is_valid|yesno:",readonly" }} style="width: 100%; height: 3em;">{{ project.description|escape }}</textarea>
 32 |               </div>
 33 |               <div class="inputArea">
 34 |                 <label>Created By:</label>
 35 |                 <div style="text-align: right;">{{ project.user.email|escape }}</div>
 36 |               </div>
 37 |             </div>
 38 |           </div>
 39 |           <div id="editorRight">
 40 |             <div>
 41 |               <div class="inputArea">
 42 |                 <label for="editorContentSelector">
 43 |                   <span class="float-left">全文内容CSS选择器（每行一个）</span>
 44 |                   <a href="http://blog.binux.me/2011/02/css_selector_for_fulltext/" class="ui-icon ui-icon-help float-right" target="_blank"></a>
 45 |                 </label>
 46 |                 <textarea id="editorContentSelector" name="content" {{ is_valid|yesno:",readonly" }} style="width: 100%; height: 4em;">{{ project.contentSelector|escape }}</textarea>
 47 |               </div>
 48 |               <div class="inputArea">
 49 |                 <label for="editorFilterSelector">
 50 |                   <span class="float-left">广告过滤CSS选择器（每行一个）</span>
 51 |                   <a href="http://blog.binux.me/2011/02/css_selector_for_fulltext/" class="ui-icon ui-icon-help float-right" target="_blank"></a>
 52 |                 </label>
 53 |                 <textarea id="editorFilterSelector" name="filter" {{ is_valid|yesno:",readonly" }} style="width: 100%; height: 7em;">{{ project.filterSelector|escape }}</textarea>
 54 |               </div>
 55 |               <div id="editorButtonBox">
 56 |                 <div class="float-left">
 57 |                   {% if is_valid %}
 58 |                     {% ifnotequal project.key.id 0 %}
 59 |                       <input class="button ui-corner-all" id="editorDeleteButton" name="delete" value="删除" type="submit" />
 60 |                     {% endifnotequal %}
 61 |                   {% endif %}
 62 |                 </div>
 63 |                 <div class="float-right">
 64 |                   {% if is_valid %}
 65 |                     <a href="#" onclick="preview(); return false;" class="button ui-corner-all" id="editorPreviewButton">预览</a>
 66 |                     <input class="button ui-corner-all" id="editorSubmitButton" value="保存" type="submit" />
 67 |                   {% else %}
 68 |                     <a href="{{ login }}" class="button ui-corner-all" id="editorUnlockButton">解锁</a>
 69 |                   {% endif %}
 70 |                 </div>
 71 |                 <div class="clear-both"></div>
 72 |               </div>
 73 |             </div>
 74 |           </div>
 75 |         </form>
 76 |         <div class="clear-both"></div>
 77 |       </div>
 78 | 
 79 |       <div id="preview">
 80 |       </div>
 81 | {% endblock %}
 82 | 
 83 | {% block script %}
 84 |   <script>
 85 |     function checkForm() {
 86 |       var valid = true;
 87 |       if($('#editorNameInput').val() == "") {
 88 |         $('#editorNameInput').parent('.inputArea').addClass('noticeArea');
 89 |         valid = false;
 90 |       } else {
 91 |         $('#editorNameInput').parent('.inputArea').removeClass('noticeArea');
 92 |       }
 93 |       if($('#editorLinkInput').val() == "") {
 94 |         $('#editorLinkInput').parent('.inputArea').addClass('noticeArea');
 95 |         valid = false;
 96 |       } else {
 97 |         $('#editorLinkInput').parent('.inputArea').removeClass('noticeArea');
 98 |       }
 99 |       if($('#editorContentSelector').val() == "") {
100 |         $('#editorContentSelector').parent('.inputArea').addClass('noticeArea');
101 |         valid = false;
102 |       } else {
103 |         $('#editorContentSelector').parent('.inputArea').removeClass('noticeArea');
104 |       }
105 |       return valid;
106 |     }
107 | 
108 |     function preview() {
109 |       if (!checkForm()) {
110 |         return;
111 |       }
112 |       var data = {
113 |         'link'    : $('#editorLinkInput').val(),
114 |         'description' : $('#editorDescriptionTextarea').val(),
115 |         'content' : $('#editorContentSelector').val(),
116 |         'filter'  : $('#editorFilterSelector').val(),
117 |       };
118 |       $('#preview').load('/p/', data);
119 |     }
120 |   </script>
121 | {% endblock %}
122 | 


--------------------------------------------------------------------------------
/template/index.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | 
 3 | {% block content %}
 4 |       <div id="feedForm">
 5 |         <form action="s/all/" method="get">
 6 |           <div id="feedInputBox">
 7 |             <input id="feedInput" name="q" type="text" />
 8 |             <input id="feedButton" title="MakeFeed" value="Search" type="submit" />
 9 |           </div>
10 |         </form>
11 |       </div>
12 | {% endblock %}
13 | 


--------------------------------------------------------------------------------
/template/search.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | 
 3 | {% block content %}
 4 |   <div id="searchListBox">
 5 |     <ul id="searchList">
 6 |     {% if projects %}
 7 |     {% for project in projects %}
 8 |       {% if forloop.first %}
 9 |       <li class="searchItem searchItemFirst ui-corner-top">
10 |       {% else %}
11 |       <li class="searchItem">
12 |       {% endif %}
13 |         <div class="itemControls">
14 |           {% ifequal user project.user %}
15 |             <a href="/e/?id={{ project.key.id }}" class="button ui-corner-all">
16 |             <span>编辑</span>
17 |           </a>
18 |           {% endifequal %}
19 |           <a href="/a/{{ project.key.id }}" class="button ui-corner-all">
20 |             <span>立即订阅</span>
21 |           </a>
22 |         </div>
23 |         <div class="starBox">
24 |           <span class="ui-icon ui-icon-circle-triangle-e"></span>
25 |         </div>
26 |         <div class="titleRow">
27 |           <span>{{ project.name|escape }}</span>
28 |           <a href="{{ project.link|escape }}" class="itemLink">{{ project.link|escape }}</a>
29 |         </div>
30 |         <div class="discriptionRow">
31 |           {{ project.description|escape|linebreaks }}
32 |         </div>
33 |         <div class="authorRow">
34 |           <span>{{ project.user.email|escape }}</span>
35 |         </div>
36 |       </li>
37 |     {% endfor %}
38 |     {% else %}
39 |       <li class="searchItem searchItemFirst ui-corner-top">
40 |         <div class="starBox">
41 |           <span class="ui-icon ui-icon-circle-close"></span>
42 |         </div>
43 |         <div class="titleRow">
44 |           <span>没有找到匹配的全文Feed</span>
45 |         </div>
46 |         <div class="discriptionRow">
47 |           Why not create it?
48 |         </div>
49 |         <div class="clear-both"></div>
50 |       </li>
51 |     {% endif %}
52 |       <li class="searchItem ui-corner-bottom float-right">
53 |         <a href="/e/?link={{ link }}" class="button ui-corner-all">
54 |           <span>建立新的全文Feed</span>
55 |         </a>
56 |         <div class="clear-both"></div>
57 |       </li>
58 |     </ul>
59 |   </div>
60 | {% endblock %}
61 | 


--------------------------------------------------------------------------------
/template/style.css:
--------------------------------------------------------------------------------
  1 | /*
  2 | Site Name:      full-text
  3 | Developed By:   binux - 足兆叉虫
  4 | Date Created:   2011-2-5
  5 | Last Updated:   2011-2-5
  6 | Copyright:      GPLv3   <http://www.gnu.org/licenses/>
  7 | */
  8 | 
  9 | #mainContainer {
 10 |   margin: auto; /* center in viewport */
 11 |   width: 974px;
 12 |   font-size: 14px;
 13 |   height: 100%; /* for stickyfooter */
 14 | }
 15 | 
 16 | /* ....... header ........*/
 17 | #headerContent {
 18 |   width: 100%;
 19 | }
 20 | #headerContent span {
 21 |   margin: 0px 2px;
 22 | }
 23 | #headerRight {
 24 |   float: right;
 25 | }
 26 | #headerLeft {
 27 |   float: left;
 28 | }
 29 | 
 30 | /* ...... middle ...... */
 31 | /* ...... feedForm ...... */
 32 | #feedForm {
 33 |   margin-top: 20%;
 34 | }
 35 | #feedInputBox {
 36 |   text-align: center;
 37 | }
 38 | #feedInput {
 39 |   width: 360px;
 40 | }
 41 | /* ...... editorForm ...... */
 42 | #editorContent {
 43 |   width: 80%;
 44 |   margin: auto;
 45 | }
 46 | #editorLeft {
 47 |   width: 39%;
 48 |   float: left;
 49 | }
 50 | #editorLeft > div {
 51 |   padding: 0px 5px;
 52 | }
 53 | #editorRight {
 54 |   width: 59%;
 55 |   float: right;
 56 | }
 57 | #editorRight > div {
 58 |   padding: 0px 5px;
 59 | }
 60 | #editorButtonBox {
 61 |   text-align: right;
 62 | }
 63 | 
 64 | /* ...... footer ...... */
 65 | #applogo {
 66 |   float: right;
 67 | }
 68 | /* ...... stickyfooter ...... */
 69 | html, body {height: 100%;}
 70 | #wrap {
 71 |   min-height: 100%;
 72 | }
 73 | #mainContent {
 74 |   overflow:auto;
 75 | 	padding-bottom: 35px;
 76 | }  /* must be same height as the footer */
 77 | #footer {
 78 |   position: relative;
 79 | 	margin-top: -35px; /* negative value of footer height */
 80 | 	height: 35px;
 81 | 	clear:both;
 82 | } 
 83 | /*Opera Fix*/
 84 | body:before {
 85 | 	content:"";
 86 | 	height:100%;
 87 | 	float:left;
 88 | 	width:0;
 89 | 	margin-top:-32767px;
 90 | }
 91 | 
 92 | /* ....... elements ....... */
 93 | .line {
 94 |   border-top: 1px solid #C9D7F1;
 95 |   font-size: 1px;
 96 |   height: 0;
 97 |   width: 100%;
 98 |   clear: both;
 99 | }
100 | 


--------------------------------------------------------------------------------
/template/test.html:
--------------------------------------------------------------------------------
 1 |       <div id="previewContent" class="ui-corner-all">
 2 |        <div id="previewTitleBox">
 3 |           <div id="previewTitle" class="float-left">
 4 |             <span class="ui-icon ui-icon-circle-triangle-e float-left"></span>
 5 |             <span class="float-left">{{ sample.title }}</span>
 6 |           </div>
 7 |           <div id="previewControls" class="float-right" style="margin-left: 0.5em">
 8 |             <a href="{{ sample.link }}" class="button ui-corner-all" target="_blank">浏览原文</a>
 9 |             <a href="#" onclick="preview(); return false;" class="button ui-corner-all" id="editorPreviewButton">下一篇</a>
10 |           </div>
11 |           <div class="clear-both"></div>
12 |         </div>
13 |         <div id="previewFullText">
14 |           {{ fulltext }}
15 |         </div>
16 |         <div class="clear-both"></div>
17 |       </div>
18 |       <div>
19 |         <textarea id="previewCode">{{ fulltext|escape }}</textarea>
20 |       </div>
21 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #-- coding: utf-8 --
 3 | '''
 4 | Created on 2011.2.4
 5 | 
 6 | @author: binux
 7 | '''
 8 | 
 9 | def test_parser(filename):
10 |     import feedparser
11 |     import feedformatter
12 | 
13 |     fp = open(filename, 'r')
14 |     content = fp.read()
15 |     rss = feedparser.parse(content)
16 |     feedparser.PprintSerializer(rss).write()
17 | 
def test_parser_serilaz(filename):
    """Parse the feed file at filename and print it re-serialized as RSS 2.0.

    filename -- path of a feed document readable from the local file system.
    """
    import feedparser
    import feedformatter

    # Use a context manager so the file handle is closed even if read() fails.
    with open(filename, 'r') as fp:
        content = fp.read()
    rss = feedparser.parse(content)
    #feedparser.PprintSerializer(rss).write()

    feed = feedformatter.Feed(rss.feed, rss.entries)
    # Single-argument print(...) behaves the same as the print statement.
    print(feed.format_rss2_string(pretty=True))
29 | 
def test_insertErrorItem(filename):
    """Round-trip the feed at filename through insertErrorItem and print it.

    The feed is parsed, re-serialized as RSS 2.0, passed to insertErrorItem,
    and the returned string is printed.

    filename -- path of a feed document readable from the local file system.
    """
    import feedparser
    import feedformatter
    # NOTE(review): imported from cron here; presumably cron exposes the
    # function under this name -- verify against cron.py.
    from cron import insertErrorItem

    # Use a context manager so the file handle is closed even if read() fails.
    with open(filename, 'r') as fp:
        content = fp.read()
    rss = feedparser.parse(content)
    #feedparser.PprintSerializer(rss).write()

    feed = feedformatter.Feed(rss.feed, rss.entries)
    feed_string = feed.format_rss2_string(pretty=True)

    # Single-argument print(...) behaves the same as the print statement.
    print(insertErrorItem(feed_string))
44 | 
def test_css_selector(filename, selector):
    """Parse the HTML file at filename and print the nodes matching selector.

    filename -- path of an HTML document readable from the local file system.
    selector -- selector string passed to getElementsBySelector.
    """
    import html5lib
    # NOTE(review): css_selector is not referenced by name below; presumably
    # importing it is what provides getElementsBySelector on the parsed
    # document -- confirm before removing.
    import css_selector

    # Use a context manager so the file handle is closed even if read() fails.
    with open(filename, 'r') as fp:
        content = fp.read()
    doc_dom = html5lib.parse(content, 'dom')
    # Single-argument print(...) behaves the same as the print statement.
    print(doc_dom.getElementsBySelector(selector))
53 | 
def main():
    """Run the currently enabled manual check.

    Only test_parser is active; the commented-out calls below can be enabled
    by hand to exercise the other helpers.
    """
    test_parser('./doc/feed')
    #test_parser_serilaz('./doc/feed')
    #test_insertErrorItem('cnbate.rss')
    #test_css_selector('/home/binux/downloads/google-appengine-docs-20110105/appengine/docs/python/taskqueue/tasks.html', 'div.g-unit#gc-toc')


# Run the manual check only when executed as a script.
if __name__ == '__main__':
    main()
64 | 


--------------------------------------------------------------------------------
/tmp:
--------------------------------------------------------------------------------
 1 | 53077
 2 | 54746
 3 | 55696
 4 | 39444
 5 | 54277
 6 | 53616
 7 | 53420
 8 | -55853
 9 | 
10 | 55319
11 | 54839
12 | 53933
13 | 
14 | 


--------------------------------------------------------------------------------