├── .gitignore
├── LICENSE
├── README.md
├── inparse
│   ├── __init__.py
│   └── inparse.py
├── requirements.txt
├── setup.py
└── tests
    ├── test_inparse.py
    └── test_table.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# pickle file
.pk

.idea/*

ex_output/*
!ex_output/README.rst
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015-2016 Ask Solem & contributors. All rights reserved.
Copyright (c) 2012-2014 GoPivotal, Inc. All rights reserved.
Copyright (c) 2009, 2010, 2011, 2012 Ask Solem, and individual contributors. All rights reserved.

Celery is licensed under The BSD License (3 Clause, also known as
the new BSD license). The license is an OSI approved Open Source
license and is GPL-compatible(1).

The license text can also be found here:
http://www.opensource.org/licenses/BSD-3-Clause

License
=======

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Ask Solem, nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Ask Solem OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

Documentation License
=====================

The documentation portion of Celery (the rendered contents of the
"docs" directory of a software distribution or checkout) is supplied
under the "Creative Commons Attribution-ShareAlike 4.0
International" (CC BY-SA 4.0) License as described by
https://creativecommons.org/licenses/by-sa/4.0/

Footnotes
=========
(1) A GPL-compatible license makes it possible to
    combine Celery with other software that is released
    under the GPL; it does not mean that we're distributing
    Celery under the GPL license. The BSD license, unlike the GPL,
    lets you distribute a modified version without making your
    changes open source.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

In.parse
=========

An open, collaborative, AI-driven parser builder for web scraping, data extraction, crawling, and knowledge graphs.

# Try the new AI-powered version:
# www.coparser.com
# https://github.com/CoParser/CoParser
-----
--------------------------------------------------------------------------------
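For orientation before the source files: a minimal usage sketch distilled from the test suite below. The parser number, access token, target URL, and `parser_json_string` are placeholders, and a parser definition matching the target page is assumed.

```python
import requests

from inparse import Inparse

# Option 1: fetch a parser definition hosted on inparse.com
# (placeholder credentials) and let the library download the page.
p = Inparse('b45beddc', 'd50cb533f69b6a78892afbd093f95fc1')
result = p.parse_url('https://example.com/some-article')

# Option 2: supply the parser definition as a JSON string and parse
# HTML you have fetched yourself (parser_json_string is a placeholder).
p = Inparse(None, None, parser_json=parser_json_string)
html = requests.get('https://example.com/some-article').text
result = p.parse(html)

Inparse.pretty_print(result)
```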
/inparse/__init__.py:
--------------------------------------------------------------------------------
from .inparse import Inparse


__all__ = ['Inparse']
--------------------------------------------------------------------------------
/inparse/inparse.py:
--------------------------------------------------------------------------------
import json
import pprint

import requests
import lxml.html
import lxml.html.clean
from lxml import etree
from lxml.cssselect import CSSSelector
from lxml.etree import tostring
from dateutil.parser import parse as date_parser


class InparseException(Exception):
    pass


class Inparse(object):

    TEST_MODE = False
    REPO_URI = 'http://inparse.com/api/parser?parser_no={}&access_token={}'
    TEST_REPO_URI = 'http://0.0.0.0:8080/api/parser?parser_no={}&access_token={}'

    def __init__(self, parser_no, access_token, parser_json=None):
        if parser_json:
            self.parser_json = json.loads(parser_json)
        else:
            self.parser_no = parser_no
            self.access_token = access_token
            if self.TEST_MODE:
                url = self.TEST_REPO_URI.format(parser_no, access_token)
            else:
                url = self.REPO_URI.format(parser_no, access_token)
            res = requests.get(url)
            self.parser_json = res.json()

        self.downloader = requests
        self.rules = self.build_rule_dict(self.parser_json['selectors'])
        self.rule_list = self.parser_json['selectors']

    def build_rule_dict(self, rule_list):
        # Index rules by name and normalise a missing parent_name to None.
        rule_dict = {}
        for rule in rule_list:
            rule_dict[rule['name']] = rule
            if 'parent_name' not in rule:
                rule['parent_name'] = None
        return rule_dict

    def parse_url(self, url):
        if url is None:
            raise InparseException('empty input')
        html = self.downloader.get(url).text
        return self.parse(html)

    def parse_item(self, doc, rules):
        result = {}
        for rule in rules:
            result[rule['name']] = self.get_val_by_rule(doc, rule['selector'])
        return result

    def parse(self, html):
        if html is None:
            raise InparseException('empty input')

        htmlparser = etree.HTMLParser()
        doc = etree.HTML(html, htmlparser)

        # Names that act as containers for nested (child) rules.
        parent_names = set()
        for name, rule in self.rules.items():
            if rule['parent_name']:
                parent_names.add(rule['parent_name'])

        # Top-level items: rules that are neither containers nor children.
        result_nodes = self.parse_item(
            doc,
            [r for r in self.rule_list
             if r['name'] not in parent_names and r['parent_name'] is None])

        # Nested items: apply each child rule within its parent's nodes.
        for name in parent_names:
            result_nodes[name] = []
            rule = self.rules[name]
            sub_rules = [r for r in self.rule_list if r['parent_name'] == name]
            parent_nodes = self.get_val_by_rule(doc, rule['selector'])
            for node in parent_nodes:
                item = self.parse_item(node, sub_rules)
                result_nodes[name].append(item)

        return self.post_clean(result_nodes)

    def get_val_by_rule(self, dom, rule):
        # Selectors of the form "inparse.<name>" dispatch to a built-in
        # extractor method; everything else is treated as a CSS selector.
        if rule.startswith('inparse.'):
            func_name = rule.split('.')[1]
            return getattr(self, func_name)(dom)
        else:
            return CSSSelector(rule)(dom)

    def table(self, doc):
        return doc.xpath('.//table')

    def table_header(self, table):
        rows = []
        for thead in table.xpath('thead'):
            rows.extend(thead.xpath('./tr'))
        return rows

    def table_body(self, table):
        from_tbody = table.xpath('.//tbody//tr')
        from_root = table.xpath('./tr')
        return from_tbody + from_root

    @classmethod
    def article_clean(cls, nodes):
        # Serialise the nodes and keep only a whitelist of tags/attributes.
        content = ''
        for n in nodes:
            content += tostring(n, encoding='unicode')

        article_cleaner = lxml.html.clean.Cleaner()
        article_cleaner.javascript = True
        article_cleaner.style = True
        article_cleaner.safe_attrs_only = True
        article_cleaner.safe_attrs = ['href', 'src', 'alt', 'height', 'width']
        article_cleaner.inline_style = True
        article_cleaner.allow_tags = [
            'a', 'span', 'p', 'br', 'strong', 'b',
            'em', 'i', 'tt', 'code', 'pre', 'blockquote', 'img', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6',
            'ul', 'ol', 'li', 'dl', 'dt', 'dd']
        article_cleaner.remove_unknown_tags = False
        return article_cleaner.clean_html(content)

    @classmethod
    def image_clean(cls, nodes):
        src = set()
        for n in nodes:
            if n.tag == 'img' and 'src' in n.attrib:
                src.add(n.attrib['src'])
            else:
                # Collect images nested anywhere under this node.
                for cn in n.xpath('.//img'):
                    if 'src' in cn.attrib:
                        src.add(cn.attrib['src'])
        return list(src)

    @classmethod
    def date_clean(cls, nodes):
        for node in nodes:
            # Prefer an explicit <time> element when one is present.
            if node.xpath('.//time'):
                try:
                    text = node.xpath('.//time')[0].text
                    return date_parser(text)
                except (ValueError, OverflowError, TypeError):
                    pass

            # Otherwise try each text fragment, then each long-enough token.
            for t in node.itertext():
                try:
                    return date_parser(t)
                except (ValueError, OverflowError, TypeError):
                    for tt in t.split(' '):
                        if len(tt) < 6:
                            continue
                        try:
                            return date_parser(tt)
                        except (ValueError, OverflowError, TypeError):
                            pass

    @classmethod
    def text_clean(cls, nodes):
        text = ''
        for node in nodes:
            text += ' '.join(node.itertext()).strip() + ' '
        return text.strip(' ')

    @classmethod
    def table_cell_clean(cls, node):
        return ' '.join(node.itertext()).strip()

    @classmethod
    def table_clean(cls, nodes):
        result = []
        for table in nodes:
            header = table['table_header']
            header_row = []
            body_rows = []
            body = table['table_body']

            # Tables without a <thead> often keep header cells (<th>)
            # in the first body row; promote that row to the header.
            if not header and body and len(body[0]) and body[0][0].tag == 'th':
                header.append(body.pop(0))

            if header:
                for t in header[0].xpath('./td|./th'):
                    header_row.append(cls.table_cell_clean(t))

            for row in body:
                r = []
                for t in row.xpath('./td|./th'):
                    r.append(cls.table_cell_clean(t))
                body_rows.append(r)

            result.append({
                'table_header': header_row,
                'table_body': body_rows,
            })

        return result

    @classmethod
    def post_clean(cls, result_nodes, level=0):
        # Route each extracted result to a cleaner based on its rule name.
        result = {}
        for name, nodes in result_nodes.items():
            if 'article' in name:
                result[name] = cls.article_clean(nodes)
            elif 'image' in name:
                result[name] = cls.image_clean(nodes)
            elif 'date' in name:
                result[name] = cls.date_clean(nodes)
            elif 'table' == name:
                result[name] = cls.table_clean(nodes)
            else:
                result[name] = cls.text_clean(nodes)
        return result

    @classmethod
    def pretty_print(cls, d):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(d)
--------------------------------------------------------------------------------
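A note on the `inparse.` dispatch in `get_val_by_rule()`: selector strings such as `"inparse.table"` are resolved with `getattr()` on the instance, so new built-in extractors can be added by subclassing. A minimal sketch, assuming a hypothetical `MyParser` subclass and `price()` extractor that are not part of the library:

```python
from inparse import Inparse


class MyParser(Inparse):
    """Hypothetical subclass adding a custom built-in extractor."""

    def price(self, doc):
        # A rule whose selector is "inparse.price" dispatches here via
        # getattr() in get_val_by_rule(). Return a list of nodes so the
        # text_clean() fallback in post_clean() can handle them.
        return doc.xpath('.//span[@class="price"]')
```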
/requirements.txt:
--------------------------------------------------------------------------------
requests
lxml
beautifulsoup4
python-dateutil
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import codecs
import os

from setuptools import setup


def read(fname):
    return codecs.open(os.path.join(os.path.dirname(__file__), fname),
                       encoding="utf-8").read()


setup(name='inparse',
      description='Collaborative AI for web scraping, data extraction, '
                  'crawling, and knowledge graphs',
      long_description=read("README.md"),
      long_description_content_type="text/markdown",
      version='0.1.1',
      url='https://github.com/inparse/inparse',
      author='Guojian Li',
      author_email='guojianlee@gmail.com',
      license='BSD',
      python_requires=">=3.6.5",
      classifiers=[
          'Development Status :: 3 - Alpha',
          'Intended Audience :: Developers',
          'License :: OSI Approved :: BSD License',
          'Programming Language :: Python :: 3'
      ],
      packages=['inparse'],
      install_requires=[
          'requests',
          'beautifulsoup4',
          'lxml',
          'python-dateutil'
      ],
      )
--------------------------------------------------------------------------------
/tests/test_inparse.py:
--------------------------------------------------------------------------------
import requests

from inparse import Inparse

parser_json = '''
{
  "cr_by": "kkyon",
  "cr_dt": "Thu, 11 Oct 2018 22:05:08 GMT",
  "no": "8eed5dc5",
  "selectors": [
    {
      "name": "article_body",
      "parent_sid": null,
      "parent_name": null,
      "selector": "div#endText",
      "sid": 11,
      "type": "article_body"
    },
    {
      "name": "publish_date",
      "parent_sid": null,
      "parent_name": null,
      "selector": "div.post_time_source",
      "sid": 12,
      "type": "publish_date"
    },
    {
      "name": "title",
      "parent_sid": null,
      "parent_name": null,
      "selector": "h1",
      "sid": 13,
      "type": "title"
    },
    {
      "name": "author",
      "parent_sid": null,
      "parent_name": null,
      "selector": "li[data-module-name='n_topnavapplist_t_0']",
      "sid": 14,
      "type": "author"
    },
    {
      "name": "top_image",
      "parent_sid": null,
      "parent_name": null,
      "selector": "div.post_next_post.clearfix",
      "sid": 15,
      "type": "top_image"
    }
  ],
  "status": "ok",
  "type": "Article",
  "website": "news.163.com"
}
'''


def test_parser():
    p = Inparse(None, None, parser_json=parser_json)
    res = requests.get('https://news.163.com/18/1002/16/DT4HPVNL000187VE.html')
    d = p.parse(res.text)
    Inparse.pretty_print(d)


def test_parser2():
    Inparse.TEST_MODE = True
    p = Inparse('b45beddc', 'd50cb533f69b6a78892afbd093f95fc1')
    d = p.parse_url('https://qz.com/india/1413291/trulymadly-ceo-on-how-dating-apps-like-bumble-india-must-localise/')
    Inparse.pretty_print(d)


if __name__ == '__main__':
    test_parser()
    test_parser2()
--------------------------------------------------------------------------------
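The two tests above are smoke tests: they print whatever was scraped but assert nothing. A sketch of an asserting variant, reusing the `parser_json` fixture defined above; it still depends on the live page matching the selectors, so treat it as illustrative rather than CI-ready:

```python
def test_parser_result_shape():
    # Hypothetical asserting variant of test_parser(): parse() returns
    # one entry per top-level selector name, so the result keys are
    # stable even when the scraped values change.
    p = Inparse(None, None, parser_json=parser_json)
    res = requests.get('https://news.163.com/18/1002/16/DT4HPVNL000187VE.html')
    d = p.parse(res.text)
    assert set(d) == {'article_body', 'publish_date', 'title',
                      'author', 'top_image'}
    assert isinstance(d['article_body'], str)  # cleaned HTML fragment
    assert isinstance(d['top_image'], list)    # list of image src URLs
```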
"Thu, 11 Oct 2018 22:05:08 GMT", 9 | "no": "8eed5dc5", 10 | "selectors": [ 11 | { 12 | "name": "article_body", 13 | "parent_sid": null, 14 | "parent_name": null, 15 | "selector": "div#endText", 16 | "sid": 11, 17 | "type": "article_body" 18 | }, 19 | { 20 | "name": "publish_date", 21 | "parent_sid": null, 22 | "parent_name": null, 23 | "selector": "div.post_time_source", 24 | "sid": 12, 25 | "type": "publish_date" 26 | }, 27 | { 28 | "name": "title", 29 | "parent_sid": null, 30 | "parent_name": null, 31 | "selector": "h1", 32 | "sid": 13, 33 | "type": "title" 34 | }, 35 | { 36 | "name": "author", 37 | "parent_sid": null, 38 | "parent_name": null, 39 | "selector": "li[data-module-name='n_topnavapplist_t_0']", 40 | "sid": 14, 41 | "type": "author" 42 | }, 43 | { 44 | "name": "top_image", 45 | "parent_sid": null, 46 | "parent_name": null, 47 | "selector": "div.post_next_post.clearfix", 48 | "sid": 15, 49 | "type": "top_image" 50 | } 51 | ], 52 | "status": "ok", 53 | "type": "Ariticle", 54 | "website": "news.163.com" 55 | } 56 | 57 | 58 | ''' 59 | 60 | 61 | def test_parser(): 62 | 63 | 64 | from inparse import Inparse 65 | import requests 66 | p=Inparse(None,None,parser_json=parser_json) 67 | res=requests.get('https://news.163.com/18/1002/16/DT4HPVNL000187VE.html') 68 | d=p.parse(res.text) 69 | Inparse.pretty_print(d) 70 | 71 | 72 | 73 | def test_parser2(): 74 | Inparse.TEST_MODE=True 75 | p=Inparse('b45beddc','d50cb533f69b6a78892afbd093f95fc1') 76 | d=p.parse_url('https://qz.com/india/1413291/trulymadly-ceo-on-how-dating-apps-like-bumble-india-must-localise/') 77 | Inparse.pretty_print(d) 78 | 79 | if __name__ == '__main__': 80 | test_parser() 81 | test_parser2() 82 | -------------------------------------------------------------------------------- /tests/test_table.py: -------------------------------------------------------------------------------- 1 | 2 | from inparse import Inparse 3 | 4 | import requests 5 | parser_json=''' 6 | { 7 | "cr_by": "kkyon", 8 | "cr_dt": "Thu, 11 Oct 2018 22:05:08 GMT", 9 | "no": "11111", 10 | "selectors": [ 11 | 12 | { 13 | "name": "table", 14 | "parent_sid": null, 15 | "parent_name": null, 16 | "selector": "inparse.table", 17 | "sid": 1, 18 | "type": "table" 19 | }, 20 | 21 | { 22 | "name": "table_body", 23 | "parent_sid": 1, 24 | "parent_name": "table", 25 | "selector": "inparse.table_body", 26 | "sid": 2, 27 | "type": "table_body" 28 | }, 29 | 30 | { 31 | "name": "table_header", 32 | "parent_sid": 1, 33 | "parent_name": "table", 34 | "selector": "inparse.table_header", 35 | "sid": 3, 36 | "type": "table_header" 37 | } 38 | 39 | ], 40 | "status": "ok", 41 | "type": "Table", 42 | "website": "https://www.yeastar.com/webinars/" 43 | } 44 | 45 | 46 | ''' 47 | 48 | 49 | def test_table(): 50 | 51 | 52 | from inparse import Inparse 53 | import requests 54 | p=Inparse(None,None,parser_json=parser_json) 55 | for url in ['https://www.yeastar.com/academy/onsite-training-schedule/','http://cs.sports.163.com/tables/','https://www.imdb.com/chart/top']: 56 | #,'https://www.yeastar.com/academy/onsite-training-schedule/', 'https://www.imdb.com/chart/top' 57 | 58 | res=requests.get(url) 59 | d=p.parse(res.text) 60 | Inparse.pretty_print(d) 61 | 62 | 63 | 64 | 65 | 66 | --------------------------------------------------------------------------------