├── __init__.py
├── setup.cfg
├── MANIFEST.in
├── sample
├── amp_sym.py
├── remove_items.py
├── fst_elem.py
├── insert_after.py
├── print_dom.py
├── insert_data.py
└── delete_attributes.py
├── MANIFEST
├── setup.py
├── .gitignore
├── LICENSE
├── README.md
├── escs.sh
└── ehp.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 |
2 | [metadata]
3 | description-file = README.md
4 |
5 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include COPYING
2 | recursive-include sample *
3 | recursive-include doc *
4 |
5 |
--------------------------------------------------------------------------------
/sample/amp_sym.py:
--------------------------------------------------------------------------------
1 | from ehp import *
2 |
3 | html = Html()
4 | data = '''
7 | This is a paragraph. 8 |
9 | ''' 10 | 11 | dom = html.feed(data) 12 | 13 | print("The entire dom:") 14 | print(dom) 15 | print("The text in the dom:") 16 | print(dom.text()) 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /sample/insert_data.py: -------------------------------------------------------------------------------- 1 | from ehp import * 2 | 3 | data = ''' ''' 4 | html = Html() 5 | dom = html.feed(data) 6 | 7 | font = Tag('font', {'color':'red'}) 8 | font.append(Data('Data inserted!')) 9 | 10 | for ind in dom.find('em'): 11 | ind.append(font) 12 | 13 | print(dom) 14 | 15 | 16 | -------------------------------------------------------------------------------- /sample/delete_attributes.py: -------------------------------------------------------------------------------- 1 | from ehp import * 2 | 3 | html = Html() 4 | dom = html.feed('xxx
\ 5 |mmm
') 6 | 7 | for root, ind in dom.match_with_root(('style', 'color:black')): 8 | del ind.attr['style'] 9 | 10 | 11 | print(dom) 12 | 13 | 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name="ehp", 4 | version="2.0.1", 5 | py_modules=["ehp"], 6 | author="Iury O. G. Figueiredo", 7 | author_email="ioliveira@id.uff.br", 8 | url="", 9 | description="Parsing html/xml documents in a faster and easier way") 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ehp 2 | 3 | Easy Html Parser is an AST generator for html/xml documents. EHP is a nice tool to parse html content. 4 | It has a short learning curve compared to other parsers. You don't need to lose time going through massive 5 | documentation to do simple stuff. EHP handles broken html nicely. 6 | 7 | EHP has a short learning curve, you can go through some examples, in a few minutes 8 | you can implement cool stuff. 9 | 10 | ### Create/Delete elements 11 | 12 | ~~~python 13 | from ehp import * 14 | 15 | html = Html() 16 | 17 | data = ''' 18 | foo 19 | ''' 20 | 21 | dom = html.feed(data) 22 | 23 | for root, item in dom.find_with_root('em'): 24 | root.remove(item) 25 | 26 | print(dom) 27 | ~~~ 28 | 29 | ~~~ 30 | 31 | 32 | ~~~ 33 | 34 | ### Manipulate attributes 35 | 36 | ~~~python 37 | from ehp import * 38 | 39 | data = '''It is simple.
''' 40 | 41 | dom = Html().feed(data) 42 | 43 | for ind, name, attr in dom.walk(): 44 | attr['size'] = '+2' 45 | 46 | print(dom) 47 | ~~~ 48 | 49 | ~~~ 50 |It is simple.
51 | ~~~ 52 | 53 | Install 54 | ======= 55 | 56 | **Note:** Ehp works on python3 only, python2 support is no longer available. 57 | 58 | ~~~ 59 | pip install ehp 60 | ~~~ 61 | 62 | **Note:** The module is well documented; see the docstrings in `ehp.py`. 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /escs.sh: -------------------------------------------------------------------------------- 1 | # clone, easyhtmlparser, ehp. 2 | cd ~/projects 3 | git clone git@github.com:iogf/ehp.git ehp-code 4 | ############################################################################## 5 | # push, easyhtmlparser, ehp. 6 | cd ~/projects/ehp-code 7 | git status 8 | git add * 9 | git commit 10 | git push 11 | ############################################################################## 12 | # create, development, branch, ehp. 13 | cd /home/tau/projects/ehp-code 14 | git branch -a 15 | git checkout -b development 16 | git push --set-upstream origin development 17 | ############################################################################## 18 | # checkout, undo, easyhtmlparser, ehp. 19 | cd ~/projects/ehp-code 20 | git checkout * 21 | ############################################################################## 22 | # install, easyhtmlparser, ehp. 23 | cd ~/projects/ehp-code 24 | sudo bash -i 25 | python2 setup.py install 26 | rm -fr build 27 | exit 28 | ############################################################################## 29 | # ehp, pip. 30 | cd ~/projects/ehp-code 31 | 32 | echo ' 33 | [metadata] 34 | description-file = README.md 35 | ' > setup.cfg 36 | 37 | vy setup.cfg 38 | python setup.py sdist register upload 39 | rm -fr sdist 40 | ############################################################################## 41 | # merge development into master. 
42 | cd /home/tau/projects/ehp-code/ 43 | git checkout master 44 | git merge development 45 | git push 46 | git checkout development 47 | 48 | -------------------------------------------------------------------------------- /ehp.py: -------------------------------------------------------------------------------- 1 | from html.parser import HTMLParser 2 | from collections import deque 3 | 4 | version = '1.3' 5 | DATA = 1 6 | META = 2 7 | COMMENT = 3 8 | PI = 4 9 | CODE = 5 10 | AMP = 6 11 | 12 | class Attribute(dict): 13 | """ 14 | This class holds the tags's attributes. 15 | The idea consists in providing an efficient and flexible way of manipulating 16 | tags attributes inside the dom. 17 | 18 | Example: 19 | dom = Html().feed('foo
') 20 | 21 | for ind in dom.sail(): 22 | if ind.name == 'p': 23 | ind.attr['style'] = "color:blue" 24 | 25 | It would change to color blue. 26 | """ 27 | 28 | def __getitem__(self, key): 29 | """ 30 | If self doesn't have the key it returns '' 31 | """ 32 | 33 | return self.get(key, '') 34 | 35 | def __str__(self): 36 | """ 37 | It returns a htmlized representation for attributes 38 | which are inside self. 39 | """ 40 | 41 | data = '' 42 | for key, value in list(self.items()): 43 | pair = '%s="%s" ' % (key, value) 44 | data = data + pair 45 | 46 | return data 47 | 48 | 49 | class Root(list): 50 | """ 51 | A Root instance is the outmost node for a xml/html document. 52 | All xml/html entities inherit from this class. 53 | 54 | html = Html() 55 | dom = html.feed(' ... ') 56 | 57 | dom.name == '' 58 | True 59 | type(dom) == Root 60 | True 61 | 62 | """ 63 | 64 | 65 | def __init__(self, name=None, attr={}): 66 | """ 67 | """ 68 | 69 | self.name = name 70 | self.attr = Attribute(attr) 71 | 72 | list.__init__(self) 73 | 74 | __repr__ = object.__repr__ 75 | 76 | def __str__(self): 77 | """ 78 | This str function returns a string representation of the structure. 79 | """ 80 | 81 | html = '' 82 | 83 | for ind in self: 84 | html = '%s%s' % (html, ind) 85 | 86 | return html 87 | 88 | def sail(self): 89 | """ 90 | This is used to navigate through the xml/html document. 91 | Every xml/html object is represented by a python class 92 | instance that inherits from Root. 93 | 94 | The method sail is used to return an iterator 95 | for these objects. 96 | 97 | Example: 98 | data = ' ' 99 | 100 | html = Html() 101 | dom = html.feed(data) 102 | 103 | for ind in dom.sail(): 104 | print type(ind),',', ind.name 105 | 106 | It would output. 107 | 108 |alpha.
beta.
' 194 | html = Html() 195 | dom = html.feed(data) 196 | 197 | for ind in dom.find('p', ('style', 'color:green')): 198 | print ind 199 | 200 | Output. 201 | 202 | 203 |beta.
204 | """ 205 | 206 | for ind in self.sail(): 207 | if ind.name == name: 208 | for key, value in args: 209 | if ind.attr[key] != value: 210 | break 211 | else: 212 | yield(ind) 213 | 214 | def find_with_root(self, name, *args): 215 | """ 216 | Like Root.find but returns its parent tag. 217 | 218 | from ehp import * 219 | 220 | html = Html() 221 | dom = html.feed('''alpha
beta
''') 222 | 223 | for root, ind in dom.find_with_root('p'): 224 | root.remove(ind) 225 | 226 | print dom 227 | 228 | It would output. 229 | 230 | 231 | """ 232 | 233 | for root, ind in self.sail_with_root(): 234 | if ind.name == name: 235 | for key, value in args: 236 | if ind.attr[key] != value: 237 | break 238 | else: 239 | yield(root, ind) 240 | 241 | 242 | def byid(self, id): 243 | """ 244 | It is a shortcut for finding an object 245 | whose attribute 'id' matches id. 246 | 247 | Example: 248 | 249 | data = '' 250 | html = Html() 251 | dom = html.feed(data) 252 | 253 | print dom.byid('foo') 254 | print dom.byid('bar') 255 | 256 | It should print. 257 | 258 | 259 | None 260 | """ 261 | 262 | return self.take(('id', id)) 263 | 264 | def take(self, *args): 265 | """ 266 | It returns the first object whose one of its 267 | attributes matches (key0, value0), (key1, value1), ... . 268 | 269 | Example: 270 | 271 | data = '' 272 | html = Html() 273 | dom = html.feed(data) 274 | 275 | print dom.take(('id', 'foo')) 276 | print dom.take(('id', 'foo'), ('size', '2')) 277 | """ 278 | 279 | seq = self.match(*args) 280 | 281 | try: 282 | item = next(seq) 283 | except StopIteration: 284 | return None 285 | else: 286 | return item 287 | 288 | def take_with_root(self, *args): 289 | """ 290 | Like Root.take but returns the tag parent. 291 | """ 292 | 293 | seq = self.match_with_root(*args) 294 | 295 | try: 296 | item = next(seq) 297 | except StopIteration: 298 | return None 299 | else: 300 | return item 301 | 302 | 303 | pass 304 | 305 | def match(self, *args): 306 | """ 307 | It returns a sequence of objects whose attributes match. 308 | (key0, value0), (key1, value1), ... . 309 | 310 | Example: 311 | 312 | data = '' 313 | html = Html() 314 | dom = html.feed(data) 315 | 316 | for ind in dom.match(('size', '1')): 317 | print ind 318 | 319 | It would print. 
320 | 321 | 322 | 323 | """ 324 | 325 | for ind in self.sail(): 326 | for key, value in args: 327 | if ind.attr[key] != value: 328 | break 329 | else: 330 | yield(ind) 331 | 332 | def match_with_root(self, *args): 333 | """ 334 | Like Root.match but with its parent tag. 335 | 336 | Example: 337 | 338 | from ehp import * 339 | 340 | html = Html() 341 | dom = html.feed('''xxx
342 |mmm
''') 343 | 344 | for root, ind in dom.match_with_root(('style', 'color:black')): 345 | del ind.attr['style'] 346 | 347 | item = dom.fst('body') 348 | item.attr['style'] = 'color:black' 349 | 350 | print dom 351 | 352 | Output. 353 | 354 |xxx
355 |mmm
356 | """ 357 | 358 | for root, ind in self.sail_with_root(): 359 | for key, value in args: 360 | if ind.attr[key] != value: 361 | break 362 | else: 363 | yield(root, ind) 364 | 365 | 366 | def join(self, delim, *args): 367 | """ 368 | It joins all the objects whose name appears in args. 369 | 370 | Example 1: 371 | 372 | html = Html() 373 | data = ' This is cool. That is. ' 374 | dom = html.feed(data) 375 | 376 | print dom.join('', 'b') 377 | print type(dom.join('b')) 378 | 379 | It would print. 380 | 381 | This is cool. That is. 382 |alpha.
beta.
' 440 | html = Html() 441 | dom = html.feed(data) 442 | 443 | for ind in dom.find('p', ('style', 'color:green')): 444 | print ind 445 | 446 | print dom.fst('p', ('style', 'color:green')) 447 | print dom.fst_with_root('p', ('style', 'color:green')) 448 | 449 | Output: 450 | 451 |beta.
452 |beta.
453 | (
745 | It is tags which do not have children.
746 |
747 | """
748 |
749 | def __init__(self, name, attr={}):
750 | """
751 | See help(Tag).
752 | """
753 | Root.__init__(self, name, attr)
754 |
755 |
756 | def __str__(self):
757 | html = '<%s %s/>' % (self.name, self.attr)
758 |
759 | return html
760 |
761 |
class Meta(Root):
    """
    A markup declaration node, such as a doctype.

    The payload is the declaration text as reported by the parser's
    declaration handlers, i.e. without the surrounding '<!' and '>'
    (for example 'DOCTYPE html').
    """

    def __init__(self, data):
        """
        Store the declaration text and mark the node with the META kind.
        """
        Root.__init__(self, META)
        self.data = data

    def __str__(self):
        """
        Serialize the declaration back into markup.
        """
        # Only the inner text is stored, so the delimiters have to be put
        # back here. Formatting against an empty template raised TypeError.
        return '<!%s>' % (self.data,)
775 |
class Code(Root):
    """
    A numeric character reference node.

    The payload is the body of the reference as reported by the parser's
    charref handler, i.e. without the leading '&#' and the trailing ';'
    (for example '62' or 'x3E').
    """

    def __init__(self, data):
        """
        Store the reference body and mark the node with the CODE kind.
        """
        Root.__init__(self, CODE)
        self.data = data

    def __str__(self):
        """
        Serialize the node as a complete numeric character reference.
        """
        # Without '&#' and ';' the bare digits would be emitted as plain
        # text and the referenced character would be lost on output.
        return '&#%s;' % (self.data,)
788 |
class Amp(Root):
    """
    A named character reference node.

    The payload is the entity name as reported by the parser's entityref
    handler, i.e. without the leading '&' and the trailing ';'
    (for example 'gt').
    """

    def __init__(self, data):
        """
        Store the entity name and mark the node with the AMP kind.
        """
        Root.__init__(self, AMP)
        self.data = data

    def __str__(self):
        """
        Serialize the node as a terminated entity reference.
        """
        # The terminating ';' keeps the entity name from fusing with any
        # text that follows it ('&copy;right' must not become '&copyright').
        return '&%s;' % (self.data,)
802 |
803 |
class Pi(Root):
    """
    A processing instruction node.

    The payload is the instruction text as reported by the parser's
    processing-instruction handler, i.e. without the leading '<?' and the
    closing '>' (for example "proc color='red'").
    """

    def __init__(self, data):
        """
        Store the instruction text and mark the node with the PI kind.
        """
        Root.__init__(self, PI)
        self.data = data

    def __str__(self):
        """
        Serialize the processing instruction back into markup.
        """
        # The opening '<?' was missing, which left a dangling '>' in the
        # output instead of a processing instruction.
        return '<?%s>' % (self.data,)
817 |
818 |
class Comment(Root):
    """
    A comment node.

    The payload is the comment text as reported by the parser's comment
    handler, i.e. without the surrounding '<!--' and '-->'.
    """

    def __init__(self, data):
        """
        Store the comment text and mark the node with the COMMENT kind.
        """
        Root.__init__(self, COMMENT)
        self.data = data

    def __str__(self):
        """
        Serialize the comment back into markup.
        """
        # Only the inner text is stored, so the delimiters have to be put
        # back here. Formatting against an empty template raised TypeError.
        return '<!--%s-->' % (self.data,)
832 |
833 |
class Tree(object):
    """
    Assembles the node hierarchy while the tokenizer walks a document.

    A stack of the currently open nodes is kept; the entry on top is the
    scope that receives whatever is nested next.
    """

    def __init__(self):
        """
        Create an empty document root and make it the only open scope.
        """

        self.outmost = Root('')
        self.stack = deque([self.outmost])

    def clear(self):
        """
        Start over with a brand new root and a stack holding just that root.
        """

        fresh = Root('')
        self.outmost = fresh
        self.stack.clear()
        self.stack.append(fresh)

    def last(self):
        """
        Return the innermost open scope, i.e. the top of the stack.
        """

        return self.stack[-1]

    def nest(self, name, attr):
        """
        Add a Tag to the current scope and make it the new current scope.
        """

        node = Tag(name, attr)

        # Attach to the current parent first, then open the new scope.
        self.last().append(node)
        self.stack.append(node)

    def dnest(self, data):
        """
        Add a Data node holding data to the current scope.
        """

        self.last().append(Data(data))

    def xnest(self, name, attr):
        """
        Add an XTag to the current scope; no new scope is opened.
        """

        self.last().append(XTag(name, attr))

    def ynest(self, data):
        """
        Add a Meta node holding data to the current scope.
        """

        self.last().append(Meta(data))

    def mnest(self, data):
        """
        Add a Comment node holding data to the current scope.
        """

        self.last().append(Comment(data))

    def cnest(self, data):
        """
        Add a Code node holding data to the current scope.
        """

        self.last().append(Code(data))

    def rnest(self, data):
        """
        Add an Amp node holding data to the current scope.
        """

        self.last().append(Amp(data))

    def inest(self, data):
        """
        Add a Pi node holding data to the current scope.
        """

        self.last().append(Pi(data))

    def enclose(self, name):
        """
        Close the innermost open scope called name, together with every
        scope opened after it. When no open scope carries that name the
        stack is left untouched.
        """

        # Walk from the innermost scope outwards; depth ends up as the
        # number of entries to drop, or 0 when nothing matched.
        for depth, node in enumerate(reversed(self.stack), start=1):
            if node.name == name:
                break
        else:
            depth = 0

        for _ in range(depth):
            self.stack.pop()
982 |
983 |
class Html(HTMLParser):
    """
    The tokenizer class.

    It runs HTMLParser over the input and forwards every token to a Tree
    instance (self.struct), which assembles the node hierarchy.

    NOTE(review): HTMLParser is initialised with its default arguments.
    Where convert_charrefs defaults to True the parser decodes character
    references itself and does not call handle_charref/handle_entityref,
    so no Code/Amp nodes would be produced - confirm whether
    convert_charrefs=False is what is wanted here.
    """

    def __init__(self):
        """
        Set up the underlying parser and the tree builder.
        """

        HTMLParser.__init__(self)
        self.struct = Tree()

    def fromfile(self, filename, encoding=None):
        """
        It builds a structure from a file.

        filename is the path of the document to read. encoding is handed
        to open(); the default None keeps the platform default encoding.
        Returns the same value as feed().
        """

        # The context manager closes the file even when read() fails.
        with open(filename, 'r', encoding=encoding) as fd:
            data = fd.read()

        return self.feed(data)

    def feed(self, data):
        """
        Parse data as one complete document and return the root of the
        resulting structure (self.struct.outmost).

        Every call starts from scratch: the previous tree is discarded.
        """

        # Drop whatever the parser still holds from an earlier document
        # (an unfinished tag, an open script/style section); otherwise it
        # would be parsed as a prefix of this one.
        self.reset()
        self.struct.clear()
        HTMLParser.feed(self, data)

        return self.struct.outmost

    def handle_starttag(self, name, attr):
        """
        When found an opening tag then nest it onto the tree
        """

        self.struct.nest(name, attr)

    def handle_startendtag(self, name, attr):
        """
        When found a XHTML tag style then nest it up to the tree
        """

        self.struct.xnest(name, attr)

    def handle_endtag(self, name):
        """
        When found a closing tag then makes it point to the right scope
        """

        self.struct.enclose(name)

    def handle_data(self, data):
        """
        Nest data onto the tree.
        """

        self.struct.dnest(data)

    def handle_decl(self, decl):
        """
        Nest a declaration onto the tree as a Meta node.
        """

        self.struct.ynest(decl)

    def unknown_decl(self, decl):
        """
        Nest an unrecognized declaration onto the tree as a Meta node.
        """

        self.struct.ynest(decl)

    def handle_charref(self, data):
        """
        Nest a numeric character reference onto the tree as a Code node.
        """

        self.struct.cnest(data)

    def handle_entityref(self, data):
        """
        Nest a named character reference onto the tree as an Amp node.
        """

        self.struct.rnest(data)

    def handle_pi(self, data):
        """
        Nest a processing instruction onto the tree as a Pi node.
        """

        self.struct.inest(data)

    def handle_comment(self, data):
        """
        Nest a comment onto the tree as a Comment node.
        """

        self.struct.mnest(data)
1081 |
1082 |
1083 |
1084 |
1085 |
1086 |
1087 |
1088 |
1089 |
1090 |
1091 |
1092 |
1093 |
1094 |
1095 |
1096 |
1097 |
1098 |
1099 |
1100 |
1101 |
1102 |
1103 |
1104 |
1105 |
1106 |
1107 |
1108 |
--------------------------------------------------------------------------------