├── .gitignore
├── LICENSE
├── README.md
├── inparse
│   ├── __init__.py
│   └── inparse.py
├── requirements.txt
├── setup.py
└── tests
    ├── test_inparse.py
    └── test_table.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# pickle file
.pk

.idea/*

ex_output/*
!ex_output/README.rst
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2015-2016 Ask Solem & contributors. All rights reserved.
Copyright (c) 2012-2014 GoPivotal, Inc. All rights reserved.
Copyright (c) 2009, 2010, 2011, 2012 Ask Solem, and individual contributors. All rights reserved.

Celery is licensed under The BSD License (3 Clause, also known as
the new BSD license). The license is an OSI approved Open Source
license and is GPL-compatible(1).

The license text can also be found here:
http://www.opensource.org/licenses/BSD-3-Clause

License
=======

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Ask Solem, nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Ask Solem OR CONTRIBUTORS
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

Documentation License
=====================

The documentation portion of Celery (the rendered contents of the
"docs" directory of a software distribution or checkout) is supplied
under the "Creative Commons Attribution-ShareAlike 4.0
International" (CC BY-SA 4.0) License as described by
https://creativecommons.org/licenses/by-sa/4.0/

Footnotes
=========
(1) A GPL-compatible license makes it possible to
    combine Celery with other software that is released
    under the GPL; it does not mean that we're distributing
    Celery under the GPL license. The BSD license, unlike the GPL,
    lets you distribute a modified version without making your
    changes open source.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

In.parse
=========

An open, collaborative, AI-driven parser builder for web scraping, data extraction, crawling, and knowledge graphs.

# Try the new AI-powered version:
# www.coparser.com
# https://github.com/CoParser/CoParser
-----
--------------------------------------------------------------------------------
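For orientation before the source files: a minimal usage sketch distilled from the test suite below. The parser number, access token, target URL, and `parser_json_string` are placeholders, and a parser definition matching the target page is assumed.

```python
import requests

from inparse import Inparse

# Option 1: fetch a parser definition hosted on inparse.com
# (placeholder credentials) and let the library download the page.
p = Inparse('b45beddc', 'd50cb533f69b6a78892afbd093f95fc1')
result = p.parse_url('https://example.com/some-article')

# Option 2: supply the parser definition as a JSON string and parse
# HTML you have fetched yourself (parser_json_string is a placeholder).
p = Inparse(None, None, parser_json=parser_json_string)
html = requests.get('https://example.com/some-article').text
result = p.parse(html)

Inparse.pretty_print(result)
```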
/inparse/__init__.py:
--------------------------------------------------------------------------------
from .inparse import Inparse


__all__ = ['Inparse']
--------------------------------------------------------------------------------
/inparse/inparse.py:
--------------------------------------------------------------------------------
import json
import pprint

import requests
import lxml.html
import lxml.html.clean
from lxml import etree
from lxml.cssselect import CSSSelector
from lxml.etree import tostring
from dateutil.parser import parse as date_parser


class InparseException(Exception):
    pass


class Inparse(object):

    TEST_MODE = False
    REPO_URI = 'http://inparse.com/api/parser?parser_no={}&access_token={}'
    TEST_REPO_URI = 'http://0.0.0.0:8080/api/parser?parser_no={}&access_token={}'

    def __init__(self, parser_no, access_token, parser_json=None):
        if parser_json:
            self.parser_json = json.loads(parser_json)
        else:
            self.parser_no = parser_no
            self.access_token = access_token
            if self.TEST_MODE:
                url = self.TEST_REPO_URI.format(parser_no, access_token)
            else:
                url = self.REPO_URI.format(parser_no, access_token)
            res = requests.get(url)
            self.parser_json = res.json()

        self.downloader = requests
        self.rules = self.build_rule_dict(self.parser_json['selectors'])
        self.rule_list = self.parser_json['selectors']

    def build_rule_dict(self, rule_list):
        # Index rules by name and normalise a missing parent_name to None.
        rule_dict = {}
        for rule in rule_list:
            rule_dict[rule['name']] = rule
            if 'parent_name' not in rule:
                rule['parent_name'] = None
        return rule_dict

    def parse_url(self, url):
        if url is None:
            raise InparseException('empty input')
        html = self.downloader.get(url).text
        return self.parse(html)

    def parse_item(self, doc, rules):
        result = {}
        for rule in rules:
            result[rule['name']] = self.get_val_by_rule(doc, rule['selector'])
        return result

    def parse(self, html):
        if html is None:
            raise InparseException('empty input')

        htmlparser = etree.HTMLParser()
        doc = etree.HTML(html, htmlparser)

        # Names that act as containers for nested (child) rules.
        parent_names = set()
        for name, rule in self.rules.items():
            if rule['parent_name']:
                parent_names.add(rule['parent_name'])

        # Top-level items: rules that are neither containers nor children.
        result_nodes = self.parse_item(
            doc,
            [r for r in self.rule_list
             if r['name'] not in parent_names and r['parent_name'] is None])

        # Nested items: apply each child rule within its parent's nodes.
        for name in parent_names:
            result_nodes[name] = []
            rule = self.rules[name]
            sub_rules = [r for r in self.rule_list if r['parent_name'] == name]
            parent_nodes = self.get_val_by_rule(doc, rule['selector'])
            for node in parent_nodes:
                item = self.parse_item(node, sub_rules)
                result_nodes[name].append(item)

        return self.post_clean(result_nodes)

    def get_val_by_rule(self, dom, rule):
        # Selectors of the form "inparse.<name>" dispatch to a built-in
        # extractor method; everything else is treated as a CSS selector.
        if rule.startswith('inparse.'):
            func_name = rule.split('.')[1]
            return getattr(self, func_name)(dom)
        else:
            return CSSSelector(rule)(dom)

    def table(self, doc):
        return doc.xpath('.//table')

    def table_header(self, table):
        rows = []
        for thead in table.xpath('thead'):
            rows.extend(thead.xpath('./tr'))
        return rows

    def table_body(self, table):
        from_tbody = table.xpath('.//tbody//tr')
        from_root = table.xpath('./tr')
        return from_tbody + from_root

    @classmethod
    def article_clean(cls, nodes):
        # Serialise the nodes and keep only a whitelist of tags/attributes.
        content = ''
        for n in nodes:
            content += tostring(n, encoding='unicode')

        article_cleaner = lxml.html.clean.Cleaner()
        article_cleaner.javascript = True
        article_cleaner.style = True
        article_cleaner.safe_attrs_only = True
        article_cleaner.safe_attrs = ['href', 'src', 'alt', 'height', 'width']
        article_cleaner.inline_style = True
        article_cleaner.allow_tags = [
            'a', 'span', 'p', 'br', 'strong', 'b',
            'em', 'i', 'tt', 'code', 'pre', 'blockquote', 'img', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6',
            'ul', 'ol', 'li', 'dl', 'dt', 'dd']
        article_cleaner.remove_unknown_tags = False
        return article_cleaner.clean_html(content)

    @classmethod
    def image_clean(cls, nodes):
        src = set()
        for n in nodes:
            if n.tag == 'img' and 'src' in n.attrib:
                src.add(n.attrib['src'])
            else:
                # Collect images nested anywhere under this node.
                for cn in n.xpath('.//img'):
                    if 'src' in cn.attrib:
                        src.add(cn.attrib['src'])
        return list(src)

    @classmethod
    def date_clean(cls, nodes):
        for node in nodes:
            # Prefer an explicit <time> element when one is present.
            if node.xpath('.//time'):
                try:
                    text = node.xpath('.//time')[0].text
                    return date_parser(text)
                except (ValueError, OverflowError, TypeError):
                    pass

            # Otherwise try each text fragment, then each long-enough token.
            for t in node.itertext():
                try:
                    return date_parser(t)
                except (ValueError, OverflowError, TypeError):
                    for tt in t.split(' '):
                        if len(tt) < 6:
                            continue
                        try:
                            return date_parser(tt)
                        except (ValueError, OverflowError, TypeError):
                            pass

    @classmethod
    def text_clean(cls, nodes):
        text = ''
        for node in nodes:
            text += ' '.join(node.itertext()).strip() + ' '
        return text.strip(' ')

    @classmethod
    def table_cell_clean(cls, node):
        return ' '.join(node.itertext()).strip()

    @classmethod
    def table_clean(cls, nodes):
        result = []
        for table in nodes:
            header = table['table_header']
            header_row = []
            body_rows = []
            body = table['table_body']

            # Tables without a <thead> often keep header cells (<th>)
            # in the first body row; promote that row to the header.
            if not header and body and len(body[0]) and body[0][0].tag == 'th':
                header.append(body.pop(0))

            if header:
                for t in header[0].xpath('./td|./th'):
                    header_row.append(cls.table_cell_clean(t))

            for row in body:
                r = []
                for t in row.xpath('./td|./th'):
                    r.append(cls.table_cell_clean(t))
                body_rows.append(r)

            result.append({
                'table_header': header_row,
                'table_body': body_rows,
            })

        return result

    @classmethod
    def post_clean(cls, result_nodes, level=0):
        # Route each extracted result to a cleaner based on its rule name.
        result = {}
        for name, nodes in result_nodes.items():
            if 'article' in name:
                result[name] = cls.article_clean(nodes)
            elif 'image' in name:
                result[name] = cls.image_clean(nodes)
            elif 'date' in name:
                result[name] = cls.date_clean(nodes)
            elif 'table' == name:
                result[name] = cls.table_clean(nodes)
            else:
                result[name] = cls.text_clean(nodes)
        return result

    @classmethod
    def pretty_print(cls, d):
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(d)
--------------------------------------------------------------------------------
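A note on the `inparse.` dispatch in `get_val_by_rule()`: selector strings such as `"inparse.table"` are resolved with `getattr()` on the instance, so new built-in extractors can be added by subclassing. A minimal sketch, assuming a hypothetical `MyParser` subclass and `price()` extractor that are not part of the library:

```python
from inparse import Inparse


class MyParser(Inparse):
    """Hypothetical subclass adding a custom built-in extractor."""

    def price(self, doc):
        # A rule whose selector is "inparse.price" dispatches here via
        # getattr() in get_val_by_rule(). Return a list of nodes so the
        # text_clean() fallback in post_clean() can handle them.
        return doc.xpath('.//span[@class="price"]')
```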
/requirements.txt:
--------------------------------------------------------------------------------
requests
lxml
beautifulsoup4
python-dateutil
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import codecs
import os

from setuptools import setup


def read(fname):
    return codecs.open(os.path.join(os.path.dirname(__file__), fname),
                       encoding="utf-8").read()


setup(name='inparse',
      description='Collaborative AI for web scraping, data extraction, '
                  'crawling, and knowledge graphs',
      long_description=read("README.md"),
      long_description_content_type="text/markdown",
      version='0.1.1',
      url='https://github.com/inparse/inparse',
      author='Guojian Li',
      author_email='guojianlee@gmail.com',
      license='BSD',
      python_requires=">=3.6.5",
      classifiers=[
          'Development Status :: 3 - Alpha',
          'Intended Audience :: Developers',
          'License :: OSI Approved :: BSD License',
          'Programming Language :: Python :: 3'
      ],
      packages=['inparse'],
      install_requires=[
          'requests',
          'beautifulsoup4',
          'lxml',
          'python-dateutil'
      ],
      )
--------------------------------------------------------------------------------
/tests/test_inparse.py:
--------------------------------------------------------------------------------
import requests

from inparse import Inparse

parser_json = '''
{
  "cr_by": "kkyon",
  "cr_dt": "Thu, 11 Oct 2018 22:05:08 GMT",
  "no": "8eed5dc5",
  "selectors": [
    {
      "name": "article_body",
      "parent_sid": null,
      "parent_name": null,
      "selector": "div#endText",
      "sid": 11,
      "type": "article_body"
    },
    {
      "name": "publish_date",
      "parent_sid": null,
      "parent_name": null,
      "selector": "div.post_time_source",
      "sid": 12,
      "type": "publish_date"
    },
    {
      "name": "title",
      "parent_sid": null,
      "parent_name": null,
      "selector": "h1",
      "sid": 13,
      "type": "title"
    },
    {
      "name": "author",
      "parent_sid": null,
      "parent_name": null,
      "selector": "li[data-module-name='n_topnavapplist_t_0']",
      "sid": 14,
      "type": "author"
    },
    {
      "name": "top_image",
      "parent_sid": null,
      "parent_name": null,
      "selector": "div.post_next_post.clearfix",
      "sid": 15,
      "type": "top_image"
    }
  ],
  "status": "ok",
  "type": "Article",
  "website": "news.163.com"
}
'''


def test_parser():
    p = Inparse(None, None, parser_json=parser_json)
    res = requests.get('https://news.163.com/18/1002/16/DT4HPVNL000187VE.html')
    d = p.parse(res.text)
    Inparse.pretty_print(d)


def test_parser2():
    Inparse.TEST_MODE = True
    p = Inparse('b45beddc', 'd50cb533f69b6a78892afbd093f95fc1')
    d = p.parse_url('https://qz.com/india/1413291/trulymadly-ceo-on-how-dating-apps-like-bumble-india-must-localise/')
    Inparse.pretty_print(d)


if __name__ == '__main__':
    test_parser()
    test_parser2()
--------------------------------------------------------------------------------
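The two tests above are smoke tests: they print whatever was scraped but assert nothing. A sketch of an asserting variant, reusing the `parser_json` fixture defined above; it still depends on the live page matching the selectors, so treat it as illustrative rather than CI-ready:

```python
def test_parser_result_shape():
    # Hypothetical asserting variant of test_parser(): parse() returns
    # one entry per top-level selector name, so the result keys are
    # stable even when the scraped values change.
    p = Inparse(None, None, parser_json=parser_json)
    res = requests.get('https://news.163.com/18/1002/16/DT4HPVNL000187VE.html')
    d = p.parse(res.text)
    assert set(d) == {'article_body', 'publish_date', 'title',
                      'author', 'top_image'}
    assert isinstance(d['article_body'], str)  # cleaned HTML fragment
    assert isinstance(d['top_image'], list)    # list of image src URLs
```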
"Thu, 11 Oct 2018 22:05:08 GMT", 9 | "no": "8eed5dc5", 10 | "selectors": [ 11 | { 12 | "name": "article_body", 13 | "parent_sid": null, 14 | "parent_name": null, 15 | "selector": "div#endText", 16 | "sid": 11, 17 | "type": "article_body" 18 | }, 19 | { 20 | "name": "publish_date", 21 | "parent_sid": null, 22 | "parent_name": null, 23 | "selector": "div.post_time_source", 24 | "sid": 12, 25 | "type": "publish_date" 26 | }, 27 | { 28 | "name": "title", 29 | "parent_sid": null, 30 | "parent_name": null, 31 | "selector": "h1", 32 | "sid": 13, 33 | "type": "title" 34 | }, 35 | { 36 | "name": "author", 37 | "parent_sid": null, 38 | "parent_name": null, 39 | "selector": "li[data-module-name='n_topnavapplist_t_0']", 40 | "sid": 14, 41 | "type": "author" 42 | }, 43 | { 44 | "name": "top_image", 45 | "parent_sid": null, 46 | "parent_name": null, 47 | "selector": "div.post_next_post.clearfix", 48 | "sid": 15, 49 | "type": "top_image" 50 | } 51 | ], 52 | "status": "ok", 53 | "type": "Ariticle", 54 | "website": "news.163.com" 55 | } 56 | 57 | 58 | ''' 59 | 60 | 61 | def test_parser(): 62 | 63 | 64 | from inparse import Inparse 65 | import requests 66 | p=Inparse(None,None,parser_json=parser_json) 67 | res=requests.get('https://news.163.com/18/1002/16/DT4HPVNL000187VE.html') 68 | d=p.parse(res.text) 69 | Inparse.pretty_print(d) 70 | 71 | 72 | 73 | def test_parser2(): 74 | Inparse.TEST_MODE=True 75 | p=Inparse('b45beddc','d50cb533f69b6a78892afbd093f95fc1') 76 | d=p.parse_url('https://qz.com/india/1413291/trulymadly-ceo-on-how-dating-apps-like-bumble-india-must-localise/') 77 | Inparse.pretty_print(d) 78 | 79 | if __name__ == '__main__': 80 | test_parser() 81 | test_parser2() 82 | -------------------------------------------------------------------------------- /tests/test_table.py: -------------------------------------------------------------------------------- 1 | 2 | from inparse import Inparse 3 | 4 | import requests 5 | parser_json=''' 6 | { 7 | "cr_by": "kkyon", 8 | "cr_dt": "Thu, 11 Oct 2018 22:05:08 GMT", 9 | "no": "11111", 10 | "selectors": [ 11 | 12 | { 13 | "name": "table", 14 | "parent_sid": null, 15 | "parent_name": null, 16 | "selector": "inparse.table", 17 | "sid": 1, 18 | "type": "table" 19 | }, 20 | 21 | { 22 | "name": "table_body", 23 | "parent_sid": 1, 24 | "parent_name": "table", 25 | "selector": "inparse.table_body", 26 | "sid": 2, 27 | "type": "table_body" 28 | }, 29 | 30 | { 31 | "name": "table_header", 32 | "parent_sid": 1, 33 | "parent_name": "table", 34 | "selector": "inparse.table_header", 35 | "sid": 3, 36 | "type": "table_header" 37 | } 38 | 39 | ], 40 | "status": "ok", 41 | "type": "Table", 42 | "website": "https://www.yeastar.com/webinars/" 43 | } 44 | 45 | 46 | ''' 47 | 48 | 49 | def test_table(): 50 | 51 | 52 | from inparse import Inparse 53 | import requests 54 | p=Inparse(None,None,parser_json=parser_json) 55 | for url in ['https://www.yeastar.com/academy/onsite-training-schedule/','http://cs.sports.163.com/tables/','https://www.imdb.com/chart/top']: 56 | #,'https://www.yeastar.com/academy/onsite-training-schedule/', 'https://www.imdb.com/chart/top' 57 | 58 | res=requests.get(url) 59 | d=p.parse(res.text) 60 | Inparse.pretty_print(d) 61 | 62 | 63 | 64 | 65 | 66 | --------------------------------------------------------------------------------