├── .gitignore ├── LICENSE ├── Procfile ├── README.md ├── README.rst ├── examples ├── hackernews.graphql ├── linkedin.graphql ├── yelp.graphql └── yelp_dynamic.graphql ├── gdom ├── __init__.py ├── cmd.py └── schema.py ├── requirements.txt ├── sample_app.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask instance folder 57 | instance/ 58 | 59 | # Sphinx documentation 60 | docs/_build/ 61 | 62 | # PyBuilder 63 | target/ 64 | 65 | # IPython Notebook 66 | .ipynb_checkpoints 67 | 68 | # pyenv 69 | .python-version 70 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Syrus Akbary 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. 
Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of GDOM nor the names of its contributors may be used 15 | to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: gunicorn sample_app:app --log-file=- 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GDOM 2 | 3 | GDOM is the next generation of web-parsing, powered by `GraphQL` 4 | syntax and the [Graphene framework](http://graphene-python.org). 
5 | 6 | Install it by typing in your console: 7 | 8 | ```bash 9 | pip install gdom 10 | ``` 11 | 12 | **DEMO**: [Try GDOM online](http://gdom.graphene-python.org/) 13 | 14 | 15 | ## Usage 16 | 17 | You can either do `gdom --test` to start a test server for testing 18 | queries or 19 | 20 | ```bash 21 | gdom QUERY_FILE 22 | ``` 23 | 24 | This command will write in the standard output (or other output if specified 25 | via `--output`) the resulting JSON. 26 | 27 | Your `QUERY_FILE` could look similar to this: 28 | 29 | ```graphql 30 | { 31 | page(url:"http://news.ycombinator.com") { 32 | items: query(selector:"tr.athing") { 33 | rank: text(selector:"td span.rank") 34 | title: text(selector:"td.title a") 35 | sitebit: text(selector:"span.comhead a") 36 | url: attr(selector:"td.title a", name:"href") 37 | attrs: next { 38 | score: text(selector:"span.score") 39 | user: text(selector:"a:eq(0)") 40 | comments: text(selector:"a:eq(2)") 41 | } 42 | } 43 | } 44 | } 45 | ``` 46 | 47 | 48 | ## Advanced usage 49 | 50 | If you want to generalize your gdom query to any page, just rewrite your 51 | query file adding the `$page` var. It should look something like 52 | this: 53 | 54 | ```graphql 55 | query ($page: String) { 56 | page(url:$page) { 57 | # ... 58 | } 59 | } 60 | ``` 61 | 62 | And then, query it like: 63 | 64 | ```bash 65 | gdom QUERY_FILE http://news.ycombinator.com 66 | ``` 67 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | GDOM 2 | ==== 3 | 4 | GDOM is the next generation of web-parsing, powered by ``GraphQL`` 5 | syntax and the `Graphene framework <http://graphene-python.org>`__. 6 | 7 | Install it by typing in your console: 8 | 9 | .. code:: bash 10 | 11 | pip install gdom 12 | 13 | **DEMO**: `Try GDOM online <http://gdom.graphene-python.org/>`__ 14 | 15 | Usage 16 | ----- 17 | 18 | You can either do ``gdom --test`` to start a test server for testing 19 | queries or 20 | 21 | .. 
code:: bash 22 | 23 | gdom QUERY_FILE 24 | 25 | This command will write in the standard output (or other output if 26 | specified via ``--output``) the resulting JSON. 27 | 28 | Your ``QUERY_FILE`` could look similar to this: 29 | 30 | .. code:: 31 | 32 | { 33 | page(url:"http://news.ycombinator.com") { 34 | items: query(selector:"tr.athing") { 35 | rank: text(selector:"td span.rank") 36 | title: text(selector:"td.title a") 37 | sitebit: text(selector:"span.comhead a") 38 | url: attr(selector:"td.title a", name:"href") 39 | attrs: next { 40 | score: text(selector:"span.score") 41 | user: text(selector:"a:eq(0)") 42 | comments: text(selector:"a:eq(2)") 43 | } 44 | } 45 | } 46 | } 47 | 48 | Advanced usage 49 | -------------- 50 | 51 | If you want to generalize your gdom query to any page, just rewrite your 52 | query file adding the ``$page`` var. It should look something like 53 | this: 54 | 55 | .. code:: 56 | 57 | query ($page: String) { 58 | page(url:$page) { 59 | # ... 60 | } 61 | } 62 | 63 | And then, query it like: 64 | 65 | .. 
code:: bash 66 | 67 | gdom QUERY_FILE http://news.ycombinator.com 68 | -------------------------------------------------------------------------------- /examples/hackernews.graphql: -------------------------------------------------------------------------------- 1 | { 2 | page(url: "http://news.ycombinator.com") { 3 | items: query(selector: "tr.athing") { 4 | rank: text(selector: "td span.rank") 5 | title: text(selector: "td.title a") 6 | sitebit: text(selector: "span.comhead a") 7 | url: attr(selector: "td.title a", name: "href") 8 | attrs: next { 9 | score: text(selector: "span.score") 10 | user: text(selector: "a:eq(0)") 11 | comments: text(selector: "a:eq(2)") 12 | } 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /examples/linkedin.graphql: -------------------------------------------------------------------------------- 1 | { 2 | page(url: "https://www.linkedin.com/in/syrusakbary") { 3 | name: text(selector: "h1#name") 4 | title: text(selector: ".headline.title") 5 | demographics: query(selector: "#demographics") { 6 | location: text(selector: ".locality") 7 | industry: text(selector: ".descriptor:not(.adr)") 8 | } 9 | experience: query(selector: "#experience .position") { 10 | title: text(selector: ".item-title") 11 | company: text(selector: ".item-subtitle") 12 | description: text(selector: ".description") 13 | range: text(selector: ".date-range") 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /examples/yelp.graphql: -------------------------------------------------------------------------------- 1 | { 2 | page(url: "http://www.yelp.com/biz/amnesia-san-francisco") { 3 | title: text(selector: "h1") 4 | phone: text(selector: ".biz-phone") 5 | address: text(selector: ".address") 6 | sections: query(selector: ".breadcrumbs--hierarchy a") { 7 | text 8 | url: attr(name: "href") 9 | } 10 | reviews: query(selector: "[itemprop=review]") { 11 | date: 
text(selector: ".rating-qualifier") 12 | rating: attr(selector: "[itemprop=ratingValue]", name: "content") 13 | username: text(selector: ".user-name a") 14 | comment: text(selector: "p") 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /examples/yelp_dynamic.graphql: -------------------------------------------------------------------------------- 1 | query($page: String) { 2 | page(url: $page) { 3 | title: text(selector: "h1") 4 | phone: text(selector: ".biz-phone") 5 | address: text(selector: ".address") 6 | sections: query(selector: ".breadcrumbs--hierarchy a") { 7 | text 8 | url: attr(name: "href") 9 | } 10 | reviews: query(selector: "[itemprop=review]") { 11 | date: text(selector: ".rating-qualifier") 12 | rating: attr(selector: "[itemprop=ratingValue]", name: "content") 13 | username: text(selector: ".user-name a") 14 | comment: text(selector: "p") 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /gdom/__init__.py: -------------------------------------------------------------------------------- 1 | from .schema import schema, Node, Element, Document, Query 2 | 3 | __all__ = ['schema', 'Node', 'Element', 'Document', 'Query'] 4 | -------------------------------------------------------------------------------- /gdom/cmd.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import sys 4 | 5 | import flask_graphql 6 | from flask import Flask, Blueprint, url_for 7 | from flask_graphql import GraphQLView 8 | 9 | from schema import schema 10 | 11 | SAMPLE_QUERY = ''' 12 | { 13 | page(url:"http://news.ycombinator.com") { 14 | items: query(selector:"tr.athing") { 15 | rank: text(selector:"td span.rank") 16 | title: text(selector:"td.title a") 17 | sitebit: text(selector:"span.comhead a") 18 | url: attr(selector:"td.title a", name:"href") 19 | attrs: next { 20 | score: 
def index_view():
    """Render the landing page of the test server.

    Returns:
        A small HTML snippet: a link to the ``graphql`` endpoint carrying
        ``SAMPLE_QUERY`` as its ``query`` URL parameter.
    """
    url = url_for('graphql', query=SAMPLE_QUERY)
    # The literal previously had no ``{}`` placeholder, so ``format`` discarded
    # the URL and the page showed plain text instead of a link.
    # ``url_for`` percent-encodes the query string, so the value is safe to
    # place inside the double-quoted attribute.
    return '<a href="{}">Hacker News Parser example</a>'.format(url)


def get_test_app():
    """Build the Flask application that serves the GraphQL test UI.

    Two routes are registered: ``/graphql`` (a ``GraphQLView`` over the gdom
    schema with GraphiQL enabled) and ``/`` (``index_view``).

    Returns:
        The configured ``Flask`` instance, with ``debug`` switched on.
    """
    app = Flask(__name__)
    app.debug = True

    graphql_view = GraphQLView.as_view('graphql', schema=schema, graphiql=True)
    app.add_url_rule('/graphql', 'graphql', view_func=graphql_view)
    app.add_url_rule('/', 'index', view_func=index_view)
    return app


def parse(query, source, page):
    """Execute a GraphQL *query* against the gdom schema.

    Args:
        query: The GraphQL query text.
        source: Value bound to the ``$source`` query variable (may be None).
        page: Value bound to the ``$page`` query variable (may be None).

    Returns:
        The ``data`` attribute of the execution result.

    Raises:
        Exception: Wrapping the first error reported by the execution.
    """
    variables = {'page': page, 'source': source}
    # ``args=`` is not a keyword graphene 2.x understands for query variables;
    # they have to be passed as ``variable_values`` or ``$page`` / ``$source``
    # are never bound.
    execution = schema.execute(query, variable_values=variables)
    if execution.errors:
        raise Exception(execution.errors[0])
    return execution.data
import graphene
from pyquery import PyQuery as pq


def _query_selector(pq, selector):
    """Return *pq* narrowed to the descendants matching *selector*.

    A falsy selector (``None`` or an empty string) means "the current
    selection", so *pq* itself is returned.  The first parameter shadows the
    module-level ``pq`` alias inside this function only.
    """
    if not selector:
        return pq
    return pq.find(selector)


# NOTE: graphene uses the class docstrings below as the GraphQL type
# descriptions, so they are part of the published schema.
#
# The resolvers use ``self`` as a pyquery selection (``find``, ``is_``,
# ``children`` ...): it is the object returned by the parent resolver
# (ultimately ``get_page``), not an instance of these classes.
class Node(graphene.Interface):
    '''A Node represents a DOM Node'''
    content = graphene.String(description='The html representation of the subnodes for the selected DOM',
                              selector=graphene.String())
    html = graphene.String(description='The html representation of the selected DOM',
                           selector=graphene.String())
    text = graphene.String(description='The text for the selected DOM',
                           selector=graphene.String())
    tag = graphene.String(description='The tag for the selected DOM',
                          selector=graphene.String())
    attr = graphene.String(description='The DOM attr of the Node',
                           selector=graphene.String(),
                           name=graphene.String(required=True))
    _is = graphene.Boolean(description='Returns True if the DOM matches the selector',
                           name='is', selector=graphene.String(required=True))
    query = graphene.List(lambda: Element,
                          description='Find elements using selector traversing down from self',
                          selector=graphene.String(required=True))
    children = graphene.List(lambda: Element,
                             description='The list of children elements from self',
                             selector=graphene.String())
    parents = graphene.List(lambda: Element,
                            description='The list of parent elements from self',
                            selector=graphene.String())
    parent = graphene.Field(lambda: Element,
                            description='The parent element from self')
    siblings = graphene.List(lambda: Element,
                             description='The siblings elements from self',
                             selector=graphene.String())
    next = graphene.Field(lambda: Element,
                          description='The immediately following sibling from self',
                          selector=graphene.String())
    next_all = graphene.List(lambda: Element,
                             description='The list of following siblings from self',
                             selector=graphene.String())
    prev = graphene.Field(lambda: Element,
                          description='The immediately preceding sibling from self',
                          selector=graphene.String())
    prev_all = graphene.List(lambda: Element,
                             description='The list of preceding siblings from self',
                             selector=graphene.String())

    # ``selector`` is an optional GraphQL argument on the four fields below.
    # It must default to None, otherwise a bare field such as ``text`` (used in
    # examples/yelp.graphql) fails with a missing-argument TypeError.

    def resolve_content(self, info, selector=None):
        """Inner HTML of the first element of the (optionally narrowed) selection."""
        return _query_selector(self, selector).eq(0).html()

    def resolve_html(self, info, selector=None):
        """Outer HTML of the (optionally narrowed) selection."""
        return _query_selector(self, selector).outerHtml()

    def resolve_text(self, info, selector=None):
        """Text of the first selected element, with ``script`` nodes removed."""
        # NOTE(review): ``remove`` looks like it mutates the underlying tree,
        # which would affect later fields on the same document - confirm.
        return _query_selector(self, selector).eq(0).remove('script').text()

    def resolve_tag(self, info, selector=None):
        """Tag name of the first selected element, or None when nothing matches."""
        el = _query_selector(self, selector).eq(0)
        if el:
            return el[0].tag

    def resolve__is(self, info, selector=None):
        """Whether the current selection matches *selector*."""
        return self.is_(selector)

    def resolve_attr(self, info, name, selector=None):
        """Value of attribute *name* on the (optionally narrowed) selection."""
        return _query_selector(self, selector).attr(name)

    def resolve_query(self, info, selector=None):
        """Iterate over the descendants matching *selector*."""
        return _query_selector(self, selector).items()

    def resolve_children(self, info, selector=None):
        """Iterate over the children, optionally filtered by *selector*."""
        return self.children(selector).items()

    def resolve_parents(self, info, selector=None):
        """Iterate over the ancestors, optionally filtered by *selector*."""
        return self.parents(selector).items()

    def resolve_parent(self, info):
        """Last entry of ``parents()``, or None when there are no ancestors."""
        parent = self.parents().eq(-1)
        if parent:
            return parent

    def resolve_siblings(self, info, selector=None):
        """Iterate over the siblings, optionally filtered by *selector*."""
        return self.siblings(selector).items()

    def resolve_next(self, info, selector=None):
        """First following sibling (matching *selector* if given), or None."""
        _next = self.nextAll(selector)
        if _next:
            return _next.eq(0)

    def resolve_next_all(self, info, selector=None):
        """Iterate over the following siblings, optionally filtered by *selector*."""
        return self.nextAll(selector).items()

    def resolve_prev(self, info, selector=None):
        """First entry of ``prevAll`` (matching *selector* if given), or None."""
        prev = self.prevAll(selector)
        if prev:
            return prev.eq(0)

    def resolve_prev_all(self, info, selector=None):
        """Iterate over the preceding siblings, optionally filtered by *selector*."""
        return self.prevAll(selector).items()


def get_page(page):
    """Build a ``PyQuery`` object for *page*, sending a ``gdom`` user-agent header.

    *page* is whatever ``Query.resolve_page`` received as ``url`` or ``source``.
    """
    return pq(page, headers={'user-agent': 'gdom'})


class Document(graphene.ObjectType):
    '''
    The Document Type represent any web page loaded and
    serves as an entry point into the page content
    '''
    class Meta:
        interfaces = (Node, )

    title = graphene.String(description='The title of the document')

    @classmethod
    def is_type_of(cls, root, info):
        # Resolvers hand back raw PyQuery objects, so accept those as well as
        # whatever the default check accepts.
        return isinstance(root, pq) or super(Document, cls).is_type_of(root, info)

    def resolve_title(self, info):
        """Text of the first ``title`` element found in the document."""
        return self.find('title').eq(0).text()


class Element(graphene.ObjectType):
    '''
    A Element Type represents an object in a Document
    '''
    class Meta:
        interfaces = (Node, )

    visit = graphene.Field(Document,
                           description='Visit will visit the href of the link and return the corresponding document')

    @classmethod
    def is_type_of(cls, root, info):
        # Same rule as Document: any PyQuery object qualifies.
        return isinstance(root, pq) or super(Element, cls).is_type_of(root, info)

    def resolve_visit(self, info):
        """Load the document behind this element's ``href``.

        Only ``a`` elements are followed; for anything else the field
        resolves to None.
        """
        if self.is_('a'):
            href = self.attr('href')
            return get_page(href)


# No docstring on purpose: graphene would publish it as the type description.
class Query(graphene.ObjectType):
    page = graphene.Field(Document,
                          description='Visit the specified page',
                          url=graphene.String(
                              description='The url of the page'),
                          _source=graphene.String(
                              name='source', description='The source of the page')
                          )

    def resolve_page(self, info, url=None, source=None):
        """Load the page given by *url*, or else *source*.

        Raises:
            ValueError: When neither *url* nor *source* is provided.
        """
        # An explicit raise instead of ``assert``: assertions are stripped
        # when Python runs with -O, which would let an empty request through.
        if not (url or source):
            raise ValueError('At least you have to provide url or source of the page')
        return get_page(url or source)
164 | 165 | schema = graphene.Schema(query=Query, types=[Element]) 166 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | graphene==2.0.1 2 | flask-graphql==1.4.1 3 | pyquery==1.3.0 4 | requests==2.9.1 5 | # For the sample_app server 6 | gunicorn==19.4.5 7 | -------------------------------------------------------------------------------- /sample_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | from gdom.cmd import get_test_app 3 | 4 | app = get_test_app() 5 | 6 | if __name__ == '__main__': 7 | port = int(os.environ.get("PORT", 5000)) 8 | app.debug = False 9 | app.run(host='0.0.0.0', port=port) 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='gdom', 5 | version='1.0.0', 6 | download_url='https://github.com/syrusakbary/gdom/archive/master.zip', 7 | packages=find_packages(), 8 | author='Syrus Akbary', 9 | author_email='me@syrusakbary.com', 10 | description='DOM Traversing and Scraping using GraphQL', 11 | long_description=open('README.rst').read(), 12 | keywords='scraping html graphql json', 13 | url='http://github.com/syrusakbary/gdom', 14 | license='MIT', 15 | entry_points={ 16 | 'console_scripts': ['gdom = gdom.cmd:main'] 17 | }, 18 | install_requires=[ 19 | 'graphene==2.0.1', 20 | 'flask-graphql==1.4.1', 21 | 'pyquery==1.3.0', 22 | 'requests==2.9.1' 23 | ], 24 | tests_require=[ 25 | ] 26 | ) 27 | --------------------------------------------------------------------------------