├── __init__.py ├── setup.cfg ├── MANIFEST.in ├── sample ├── amp_sym.py ├── remove_items.py ├── fst_elem.py ├── insert_after.py ├── print_dom.py ├── insert_data.py └── delete_attributes.py ├── MANIFEST ├── setup.py ├── .gitignore ├── LICENSE ├── README.md ├── escs.sh └── ehp.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | 2 | [metadata] 3 | description-file = README.md 4 | 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include COPYING 2 | recursive-include sample * 3 | recursive-include doc * 4 | 5 | -------------------------------------------------------------------------------- /sample/amp_sym.py: -------------------------------------------------------------------------------- 1 | from ehp import * 2 | 3 | html = Html() 4 | data = ''' The & is a good & symbol. ''' 5 | dom = html.feed(data) 6 | 7 | for root, ind in dom.find_with_root(AMP): 8 | print(ind) 9 | 10 | -------------------------------------------------------------------------------- /sample/remove_items.py: -------------------------------------------------------------------------------- 1 | from ehp import * 2 | 3 | html = Html() 4 | 5 | data = ''' 6 | foo 7 | ''' 8 | 9 | dom = html.feed(data) 10 | 11 | for root, item in dom.find_with_root('em'): 12 | root.remove(item) 13 | 14 | print(dom) 15 | -------------------------------------------------------------------------------- /sample/fst_elem.py: -------------------------------------------------------------------------------- 1 | # Name: ex17.py 2 | 3 | from ehp import * 4 | 5 | html = Html() 6 | data = ' beta. 
' 7 | dom = html.feed(data) 8 | 9 | root, item = dom.fst_with_root('em') 10 | root.insert_after(item, Tag('p')) 11 | print(root) 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | ehp.py 3 | setup.cfg 4 | setup.py 5 | sample/amp_sym.py 6 | sample/delete_attributes.py 7 | sample/fst_elem.py 8 | sample/insert_after.py 9 | sample/insert_data.py 10 | sample/print_dom.py 11 | sample/remove_items.py 12 | sample/tag_attr.html 13 | -------------------------------------------------------------------------------- /sample/insert_after.py: -------------------------------------------------------------------------------- 1 | from ehp import * 2 | data = ''' alpha ''' 3 | dom = Html().feed(data) 4 | x = Tag('em') 5 | x.append(Data('beta')) 6 | 7 | for root, ind in dom.find_with_root('em'): 8 | root.insert_after(ind, x) 9 | 10 | print(dom) 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /sample/print_dom.py: -------------------------------------------------------------------------------- 1 | from ehp import * 2 | 3 | html = Html() 4 | 5 | data = ''' 6 |

7 | This is a paragraph. 8 |

9 | ''' 10 | 11 | dom = html.feed(data) 12 | 13 | print("The entire dom:") 14 | print(dom) 15 | print("The text in the dom:") 16 | print(dom.text()) 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /sample/insert_data.py: -------------------------------------------------------------------------------- 1 | from ehp import * 2 | 3 | data = ''' ''' 4 | html = Html() 5 | dom = html.feed(data) 6 | 7 | font = Tag('font', {'color':'red'}) 8 | font.append(Data('Data inserted!')) 9 | 10 | for ind in dom.find('em'): 11 | ind.append(font) 12 | 13 | print(dom) 14 | 15 | 16 | -------------------------------------------------------------------------------- /sample/delete_attributes.py: -------------------------------------------------------------------------------- 1 | from ehp import * 2 | 3 | html = Html() 4 | dom = html.feed('

xxx

\ 5 |

mmm

') 6 | 7 | for root, ind in dom.match_with_root(('style', 'color:black')): 8 | del ind.attr['style'] 9 | 10 | 11 | print(dom) 12 | 13 | 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name="ehp", 4 | version="2.0.1", 5 | py_modules=["ehp"], 6 | author="Iury O. G. Figueiredo", 7 | author_email="ioliveira@id.uff.br", 8 | url="", 9 | description="Parsing html/xml documents in a faster and easier way") 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ehp 2 | 3 | Easy Html Parser is an AST generator for html/xml documents. EHP is a nice tool to parse html content. 4 | It has a short learning curve compared to other parsers. You don't need to lose time going through massive 5 | documentation to do simple stuff. EHP handles broken html nicely. 6 | 7 | EHP has a short learning curve, you can go through some examples, in a few minutes 8 | you can implement cool stuff. 9 | 10 | ### Create/Delete elements 11 | 12 | ~~~python 13 | from ehp import * 14 | 15 | html = Html() 16 | 17 | data = ''' 18 | foo 19 | ''' 20 | 21 | dom = html.feed(data) 22 | 23 | for root, item in dom.find_with_root('em'): 24 | root.remove(item) 25 | 26 | print(dom) 27 | ~~~ 28 | 29 | ~~~ 30 | 31 | 32 | ~~~ 33 | 34 | ### Manipulate attributes 35 | 36 | ~~~python 37 | from ehp import * 38 | 39 | data = '''

It is simple.

''' 40 | 41 | dom = Html().feed(data) 42 | 43 | for ind, name, attr in dom.walk(): 44 | attr['size'] = '+2' 45 | 46 | print(dom) 47 | ~~~ 48 | 49 | ~~~ 50 |

It is simple.

 51 | ~~~ 52 | 53 | Install 54 | ======= 55 | 56 | **Note:** Ehp works on python3 only, python2 support is no longer available. 57 | 58 | ~~~ 59 | pip install ehp 60 | ~~~ 61 | 62 | **Note:** The module is well documented: the docstrings in `ehp.py` describe every class and method, for example `help(Root.find)`. 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /escs.sh: -------------------------------------------------------------------------------- 1 | # clone, easyhtmlparser, ehp. 2 | cd ~/projects 3 | git clone git@github.com:iogf/ehp.git ehp-code 4 | ############################################################################## 5 | # push, easyhtmlparser, ehp. 6 | cd ~/projects/ehp-code 7 | git status 8 | git add * 9 | git commit 10 | git push 11 | ############################################################################## 12 | # create, development, branch, ehp. 13 | cd /home/tau/projects/ehp-code 14 | git branch -a 15 | git checkout -b development 16 | git push --set-upstream origin development 17 | ############################################################################## 18 | # checkout, undo, easyhtmlparser, ehp. 19 | cd ~/projects/ehp-code 20 | git checkout * 21 | ############################################################################## 22 | # install, easyhtmlparser, ehp. 23 | cd ~/projects/ehp-code 24 | sudo bash -i 25 | python2 setup.py install 26 | rm -fr build 27 | exit 28 | ############################################################################## 29 | # ehp, pip. 30 | cd ~/projects/ehp-code 31 | 32 | echo ' 33 | [metadata] 34 | description-file = README.md 35 | ' > setup.cfg 36 | 37 | vy setup.cfg 38 | python setup.py sdist register upload 39 | rm -fr sdist 40 | ############################################################################## 41 | # merge development into master. 
"""
Easy Html Parser (ehp): parse html/xml text into a mutable tree of nodes.

Html is the tokenizer (an html.parser.HTMLParser subclass), Tree assembles
the nodes that the tokenizer reports, and every node is an instance of Root
or of one of its subclasses: a list of child nodes that also carries a
``name`` and an ``attr`` mapping.
"""

from html.parser import HTMLParser
from collections import deque

# NOTE(review): setup.py declares version 2.0.1 -- confirm which one is current.
version = '1.3'

# Values stored in ``name`` by the nodes that are not tags.
DATA = 1
META = 2
COMMENT = 3
PI = 4
CODE = 5
AMP = 6


def _attrs_match(node, pairs):
    """Tell whether node.attr[key] == value for every (key, value) in pairs."""
    return all(node.attr[key] == value for key, value in pairs)


class Attribute(dict):
    """
    Mapping that holds the attributes of a node.

    Reading a missing key gives '' instead of raising KeyError, which lets
    the attribute filters of Root compare against any node. Deleting a
    missing key still raises KeyError, as in a plain dict.

    Example:

        dom = Html().feed('<p style="color:green">foo</p>')

        for ind in dom.sail():
            if ind.name == 'p':
                ind.attr['style'] = 'color:blue'

    It would change the paragraph colour to blue.
    """

    def __getitem__(self, key):
        """
        Return the value stored under key, or '' when key is absent.
        """

        return self.get(key, '')

    def __str__(self):
        """
        Return the attributes as markup: key="value" followed by one
        space, once per attribute. An empty mapping gives ''.
        """

        return ''.join('%s="%s" ' % (key, value)
                       for key, value in self.items())


class Root(list):
    """
    A Root instance is the outmost node of a parsed document, and the base
    class of every other node type.

    A node is the list of its children plus two attributes:

    name -- the tag name for tags, one of the module constants
            (DATA, META, COMMENT, PI, CODE, AMP) for the other nodes,
            '' for the outmost node built by the parser.
    attr -- an Attribute mapping.

    Example:

        dom = Html().feed('<body> ... </body>')

        dom.name == ''
        True
        type(dom) == Root
        True
    """

    def __init__(self, name=None, attr=None):
        """
        name is stored as is; attr may be a mapping or a sequence of
        (key, value) pairs and is copied into a fresh Attribute.
        """

        list.__init__(self)

        self.name = name
        # None stands for "no attributes"; nodes never share one mapping.
        self.attr = Attribute({} if attr is None else attr)

    # The list repr would print the children; use the plain object one.
    __repr__ = object.__repr__

    def __str__(self):
        """
        Return the concatenation of the markup of every child.
        """

        return ''.join(str(ind) for ind in self)

    def sail(self):
        """
        Iterate over every node below self, children before their parent
        (self itself is not yielded).

        Each level is walked over a copy of its child list, so nodes may
        be inserted or removed while iterating.

        Example:

            dom = Html().feed('<a><b></b></a>')

            for ind in dom.sail():
                print(ind.name)

        It would output.

            b
            a
        """

        for child in self[:]:
            yield from child.sail()
            yield child

    def sail_with_root(self):
        """
        Like sail() but yields (parent, node) pairs.

        For an example, see help(Root.index).
        """

        for child in self[:]:
            yield from child.sail_with_root()
            yield (self, child)

    def index(self, item):
        """
        Return the position of item among the children of self.

        Children are compared by identity: nodes are lists, so two
        distinct nodes with equal children would compare equal under
        list.index. Raises ValueError when item is not a child of self.

        Example:

            dom = Html().feed('<a><b></b><b></b></a>')

            for root, ind in dom.sail_with_root():
                print(root.name, ind.name, root.index(ind))

        It would print.

            a b 0
            a b 1
             a 0

        The last line corresponds to the outmost node, whose name is ''.
        """

        for position, child in enumerate(self):
            if child is item:
                return position

        raise ValueError('item is not a child of this node')

    def remove(self, item):
        """
        Remove item from the children of self, looking it up by identity.
        Raises ValueError when item is not a child of self.

        Example:

            dom = Html().feed('<a><b></b><b></b></a>')

            for root, ind in dom.sail_with_root():
                if ind.name == 'b':
                    root.remove(ind)

            print(dom)

        It should print.

            <a ></a>
        """

        del self[self.index(item)]

    def insert_after(self, y, k):
        """
        Insert the node k right after the child y.
        Raises ValueError when y is not a child of self.

        For an example, see help(Root.fst_with_root).
        """

        self.insert(self.index(y) + 1, k)

    def insert_before(self, y, k):
        """
        Insert the node k right before the child y.
        Raises ValueError when y is not a child of self.
        """

        self.insert(self.index(y), k)

    def find(self, name, *args):
        """
        Iterate over the nodes below self whose name equals name and
        whose attributes match every (key, value) pair given in args.

        Example:

            data = '<body><p>alpha.</p><p style="color:green">beta.</p></body>'
            dom = Html().feed(data)

            for ind in dom.find('p', ('style', 'color:green')):
                print(ind)

        Output.

            <p style="color:green" >beta.</p>
        """

        for ind in self.sail():
            if ind.name == name and _attrs_match(ind, args):
                yield ind

    def find_with_root(self, name, *args):
        """
        Like Root.find but yields (parent, node) pairs.

        Example:

            dom = Html().feed('<body><p>alpha</p><p>beta</p></body>')

            for root, ind in dom.find_with_root('p'):
                root.remove(ind)

            print(dom)

        It would output.

            <body ></body>
        """

        for root, ind in self.sail_with_root():
            if ind.name == name and _attrs_match(ind, args):
                yield (root, ind)

    def fst(self, name, *args):
        """
        Return the first node that Root.find(name, *args) would yield,
        or None when there is none.

        Example:

            dom = Html().feed('<body><em>Cool.</em></body>')
            print(dom.fst('em'))

        It outputs.

            <em >Cool.</em>
        """

        return next(self.find(name, *args), None)

    def fst_with_root(self, name, *args):
        """
        Return the first (parent, node) pair that
        Root.find_with_root(name, *args) would yield, or None when there
        is none (so unpack the result only after checking it).

        Example:

            dom = Html().feed('<body><em>Cool.</em></body>')

            root, item = dom.fst_with_root('em')
            root.insert_after(item, Tag('p'))
            print(root)

        It outputs.

            <body ><em >Cool.</em><p ></p></body>
        """

        return next(self.find_with_root(name, *args), None)

    def match(self, *args):
        """
        Iterate over the nodes below self, whatever their name, whose
        attributes match every (key, value) pair given in args.

        A missing attribute reads as '', so a pair such as ('id', '')
        matches every node that has no id.

        Example:

            dom = Html().feed('<a size="1"><b size="1"></b></a>')

            for ind in dom.match(('size', '1')):
                print(ind)

        It would print.

            <b size="1" ></b>
            <a size="1" ><b size="1" ></b></a>
        """

        for ind in self.sail():
            if _attrs_match(ind, args):
                yield ind

    def match_with_root(self, *args):
        """
        Like Root.match but yields (parent, node) pairs.

        Example:

            dom = Html().feed('<p style="color:black">xxx</p>')

            for root, ind in dom.match_with_root(('style', 'color:black')):
                del ind.attr['style']

            print(dom)

        Output.

            <p >xxx</p>
        """

        for root, ind in self.sail_with_root():
            if _attrs_match(ind, args):
                yield (root, ind)

    def take(self, *args):
        """
        Return the first node that Root.match(*args) would yield, or
        None when there is none.

        Example:

            dom = Html().feed('<a id="foo" size="2"></a>')

            print(dom.take(('id', 'foo')))
            print(dom.take(('id', 'foo'), ('size', '2')))
        """

        return next(self.match(*args), None)

    def take_with_root(self, *args):
        """
        Return the first (parent, node) pair that
        Root.match_with_root(*args) would yield, or None.
        """

        return next(self.match_with_root(*args), None)

    def byid(self, id):
        """
        Shortcut for self.take(('id', id)): the first node whose 'id'
        attribute equals id, or None.

        Example:

            dom = Html().feed('<a id="foo"></a>')

            print(dom.byid('foo'))
            print(dom.byid('bar'))

        It should print.

            <a id="foo" ></a>
            None
        """

        return self.take(('id', id))

    def join(self, delim, *args):
        """
        Concatenate the markup of every node below self whose name
        appears in args. delim is written BEFORE each piece, so the
        result starts with delim whenever something matched.

        Example:

            dom = Html().feed('<a><b>x</b><b>y</b></a>')
            print(dom.join('-', 'b'))

        It would print.

            -<b >x</b>-<b >y</b>
        """

        return ''.join('%s%s' % (delim, str(ind))
                       for ind in self.sail() if ind.name in args)

    def text(self):
        """
        Return the markup of every DATA, AMP and CODE node below self,
        concatenated in document order; for DATA nodes that is the text
        itself.

        Example:

            dom = Html().feed('<body><em>This is all the text.</em></body>')
            print(dom.fst('em').text())

        It outputs.

            This is all the text.
        """

        return self.join('', DATA, AMP, CODE)

    def write(self, filename):
        """
        Save the markup of the structure, str(self), to filename.
        The file is opened in text mode with the default encoding.
        """

        # The handle is closed even when writing fails.
        with open(filename, 'w') as fd:
            fd.write(str(self))

    def walk(self):
        """
        Like sail but yields (node, node.name, node.attr) triples.

        Example:

            dom = Html().feed('<font>It is simple.</font>')

            for ind, name, attr in dom.walk():
                attr['size'] = '+2'
        """

        for ind in self.sail():
            yield (ind, ind.name, ind.attr)

    def walk_with_root(self):
        """
        Like sail_with_root but each element of the pair is expanded to
        a (node, name, attr) triple:

            ((root, root.name, root.attr), (ind, ind.name, ind.attr))
        """

        for root, ind in self.sail_with_root():
            yield ((root, root.name, root.attr),
                   (ind, ind.name, ind.attr))


class Tag(Root):
    """
    A tag that may hold children, written as <name attrs>...</name>.
    """

    def __init__(self, name, attr=None):
        """
        The parameter name is the tag's name.

        Example:

            d = {'style': 'background:blue;'}
            x = Tag('p', d)
        """

        Root.__init__(self, name, attr)

    def __str__(self):
        """
        Return the opening tag, the markup of every child and the
        closing tag. A space always follows the name, e.g. <p ></p>.
        """

        return '<%s %s>%s</%s>' % (self.name, self.attr,
                                   Root.__str__(self), self.name)


class Data(Root):
    """
    The text found between the tokens of a document.

    Example:

        dom = Html().feed('<body><em>alpha</em></body>')
        x = dom.fst('em')

        type(x[0]) == Data
        True
        print(x[0])
        alpha
    """

    def __init__(self, data):
        """
        data holds the characters; the node name is DATA.

        Example:

            x = Tag('em')
            x.append(Data('beta'))
        """

        Root.__init__(self, DATA)
        self.data = data

    def __str__(self):
        """
        Return the characters held by the node.
        """

        return self.data

    def text(self):
        """
        Return the characters held by the node. Root.text only looks at
        the nodes below self, hence the override.
        """

        return self.data


class XTag(Root):
    """
    A self-closing tag in XHTML style, written as <name attrs/>.
    Its children, if any are added, are not part of its markup.
    """

    def __init__(self, name, attr=None):
        """
        See help(Tag).
        """

        Root.__init__(self, name, attr)

    def __str__(self):
        return '<%s %s/>' % (self.name, self.attr)


class Meta(Root):
    """
    A declaration, written as <!data>; the node name is META.
    """

    def __init__(self, data):
        Root.__init__(self, META)
        self.data = data

    def __str__(self):
        return '<!%s>' % self.data


class Code(Root):
    """
    A numeric character reference, written as &#data (no trailing ';'
    is added); the node name is CODE.
    """

    def __init__(self, data):
        Root.__init__(self, CODE)
        self.data = data

    def __str__(self):
        return '&#%s' % self.data


class Amp(Root):
    """
    A named entity reference, written as &data (no trailing ';' is
    added); the node name is AMP.
    """

    def __init__(self, data):
        Root.__init__(self, AMP)
        self.data = data

    def __str__(self):
        return '&%s' % self.data


class Pi(Root):
    """
    A processing instruction, written as <?data>; the node name is PI.
    """

    def __init__(self, data):
        Root.__init__(self, PI)
        self.data = data

    def __str__(self):
        return '<?%s>' % self.data


class Comment(Root):
    """
    A comment, written as <!--data-->; the node name is COMMENT.
    """

    def __init__(self, data):
        Root.__init__(self, COMMENT)
        self.data = data

    def __str__(self):
        return '<!--%s-->' % self.data


class Tree(object):
    """
    The engine class: it builds the structure out of the tokens.

    outmost -- the Root('') node that holds the whole document.
    stack   -- the chain of currently open nodes; stack[0] is outmost
               and stack[-1] is the node that receives new children.
    """

    def __init__(self):
        """
        Start with an empty outmost node as the only open scope.
        """

        self.outmost = Root('')
        self.stack = deque([self.outmost])

    def clear(self):
        """
        Drop the current structure and start a new one. The previous
        outmost node is left untouched for whoever still holds it.
        """

        self.outmost = Root('')
        self.stack.clear()
        self.stack.append(self.outmost)

    def last(self):
        """
        Return the innermost open node.
        """

        return self.stack[-1]

    def _attach(self, item):
        """
        Append item to the innermost open node.
        """

        self.last().append(item)

    def nest(self, name, attr):
        """
        Add a Tag to the innermost open node and make that tag the new
        innermost scope.
        """

        item = Tag(name, attr)
        self._attach(item)
        self.stack.append(item)

    def dnest(self, data):
        """
        Add a Data node to the innermost open node.
        """

        self._attach(Data(data))

    def xnest(self, name, attr):
        """
        Add a XTag to the innermost open node.
        """

        self._attach(XTag(name, attr))

    def ynest(self, data):
        """
        Add a Meta node to the innermost open node.
        """

        self._attach(Meta(data))

    def mnest(self, data):
        """
        Add a Comment node to the innermost open node.
        """

        self._attach(Comment(data))

    def cnest(self, data):
        """
        Add a Code node to the innermost open node.
        """

        self._attach(Code(data))

    def rnest(self, data):
        """
        Add an Amp node to the innermost open node.
        """

        self._attach(Amp(data))

    def inest(self, data):
        """
        Add a Pi node to the innermost open node.
        """

        self._attach(Pi(data))

    def enclose(self, name):
        """
        Handle a closing tag: close the innermost open node called name
        together with every node opened after it. When no open node has
        that name the closing tag is ignored.
        """

        for depth, ind in enumerate(reversed(self.stack), 1):
            if ind.name == name:
                break
        else:
            # Stray closing tag: keep the current scope.
            return

        # Pop the matching node and the unclosed ones nested in it.
        for _ in range(depth):
            self.stack.pop()


class Html(HTMLParser):
    """
    The tokenizer class.

    HTMLParser is initialised with its default arguments. With the
    default convert_charrefs=True the standard library decodes character
    and entity references found outside script/style elements and hands
    them to handle_data, so handle_charref and handle_entityref are not
    called for them and no CODE or AMP nodes are built.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.struct = Tree()

    def fromfile(self, filename):
        """
        Build a structure from the content of filename, which is read in
        text mode with the default encoding. Returns the outmost node.
        """

        with open(filename, 'r') as fd:
            data = fd.read()

        return self.feed(data)

    def feed(self, data):
        """
        Parse data as one complete document and return its outmost node.

        Every call builds a new structure. The parser is reset first and
        closed afterwards so that text it was still buffering (for
        instance a trailing 'AT&T') ends up in this structure instead of
        being lost or leaking into the next call.
        """

        self.struct.clear()
        self.reset()
        HTMLParser.feed(self, data)
        self.close()

        return self.struct.outmost

    def handle_starttag(self, name, attr):
        """
        When found an opening tag then nest it onto the tree.
        """

        self.struct.nest(name, attr)

    def handle_startendtag(self, name, attr):
        """
        When found a tag in XHTML style then add it to the tree.
        """

        self.struct.xnest(name, attr)

    def handle_endtag(self, name):
        """
        When found a closing tag then go back to the enclosing scope.
        """

        self.struct.enclose(name)

    def handle_data(self, data):
        """
        Nest data onto the tree.
        """

        self.struct.dnest(data)

    def handle_decl(self, decl):
        """
        Nest a declaration onto the tree as a Meta node.
        """

        self.struct.ynest(decl)

    def unknown_decl(self, decl):
        """
        Nest an unrecognized declaration onto the tree as a Meta node.
        """

        self.struct.ynest(decl)

    def handle_charref(self, data):
        """
        Nest a numeric character reference onto the tree as a Code node.
        """

        self.struct.cnest(data)

    def handle_entityref(self, data):
        """
        Nest a named entity reference onto the tree as an Amp node.
        """

        self.struct.rnest(data)

    def handle_pi(self, data):
        """
        Nest a processing instruction onto the tree as a Pi node.
        """

        self.struct.inest(data)

    def handle_comment(self, data):
        """
        Nest a comment onto the tree as a Comment node.
        """

        self.struct.mnest(data)