├── .gitignore
├── LICENSE.md
├── MANIFEST.in
├── README.md
├── hodor
│   └── __init__.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── upload.sh

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
#### joe made this: http://goel.io/joe

#####=== Python ===#####

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# pip
README.txt

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Compile Inc

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include requirements.txt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Hodor [![PyPI](https://img.shields.io/pypi/v/hodorlive.svg?maxAge=2592000?style=plastic)](https://pypi.python.org/pypi/hodorlive/)

A simple HTML scraper driven by XPath or CSS selectors.

## Install

```pip install hodorlive```

## Usage

### As python package

***WARNING: This package does not verify SSL connections by default.
Please check the [arguments](#arguments) to enable verification.***
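
For example, a minimal sketch of turning verification back on (the URL and ```CONFIG``` here are placeholders for your own values, as in the sample below):

```python
from hodor import Hodor

# ssl_verify=True re-enables certificate verification for every request
h = Hodor(url='https://example.com', config=CONFIG, ssl_verify=True)
```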
#### Sample code
```python
from hodor import Hodor
from dateutil.parser import parse


def date_convert(data):
    return parse(data)

url = 'http://www.nasdaq.com/markets/stocks/symbol-change-history.aspx'

CONFIG = {
    'old_symbol': {
        'css': '#SymbolChangeList_table tr td:nth-child(1)',
        'many': True
    },
    'new_symbol': {
        'css': '#SymbolChangeList_table tr td:nth-child(2)',
        'many': True
    },
    'effective_date': {
        'css': '#SymbolChangeList_table tr td:nth-child(3)',
        'many': True,
        'transform': date_convert
    },
    '_groups': {
        'data': '__all__',
        'ticker_changes': ['old_symbol', 'new_symbol']
    },
    '_paginate_by': {
        'xpath': '//*[@id="two_column_main_content_lb_NextPage"]/@href',
        'many': False
    }
}

h = Hodor(url=url, config=CONFIG, pagination_max_limit=5)

h.data
```

#### Sample output
```python
{'data': [{'effective_date': datetime.datetime(2016, 11, 1, 0, 0),
           'new_symbol': 'ARNC',
           'old_symbol': 'AA'},
          {'effective_date': datetime.datetime(2016, 11, 1, 0, 0),
           'new_symbol': 'ARNC$',
           'old_symbol': 'AA$'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALN8',
           'old_symbol': 'AHUSDN2018'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALN9',
           'old_symbol': 'AHUSDN2019'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALQ6',
           'old_symbol': 'AHUSDQ2016'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALQ7',
           'old_symbol': 'AHUSDQ2017'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALQ8',
           'old_symbol': 'AHUSDQ2018'}]}
```

#### Arguments

- ```ua``` (User-Agent string - default: ```Hodor```)
- ```proxies``` (a ```requests```-style proxies dict; see requesocks for SOCKS proxies)
- ```auth``` (passed through to ```requests``` authentication)
- ```crawl_delay``` (delay in seconds between paginated requests - default: 3 seconds)
- ```pagination_max_limit``` (max number of pages to crawl - default: 100)
- ```ssl_verify``` (verify SSL certificates - default: False)
- ```robots``` (if set, respects robots.txt - default: True)
- ```reppy_capacity``` (robots cache LRU capacity - default: 100)
- ```trim_values``` (if set, strips leading and trailing whitespace from extracted values - default: True)

#### Config parameters:
- Any key in the config that does not start with an underscore is a rule to parse.
- Each rule is either an ```xpath``` or a ```css``` selector (see the sketch after this list).
- Each rule extracts ```many``` values by default unless ```many``` is explicitly set to ```False```.
- Each rule can ```transform``` every extracted value with a function, if provided.
- Extra parameters include grouping (```_groups```) and pagination (```_paginate_by```); ```_paginate_by``` is itself a rule of the same format.
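
A minimal sketch of a config that mixes the two selector types (the keys and selectors below are hypothetical, not part of the package):

```python
CONFIG = {
    # xpath rule: 'many' defaults to True, so this yields a list of strings
    'headlines': {
        'xpath': '//h2[@class="headline"]/text()',
    },
    # css rule: many=False yields only the first match (or None if nothing matches)
    'page_title': {
        'css': 'title',
        'many': False,
        'transform': lambda text: text.upper(),
    },
}
```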

--------------------------------------------------------------------------------
/hodor/__init__.py:
--------------------------------------------------------------------------------
import time
from functools import partial
from operator import is_not

import requests
from lxml import html
from lxml.cssselect import CSSSelector
from reppy.cache import RobotsCache
from reppy.exceptions import ConnectionException
try:
    from urlparse import urlparse, urljoin
except ImportError:
    from urllib.parse import urlparse, urljoin


DEFAULT_HODOR_UA = 'Hodor'
DEFAULT_HODOR_MAX_PAGES = 100
DEFAULT_CRAWL_DELAY = 3
EMPTY_VALUES = (None, '', [], (), {})


class Hodor(object):
    def __init__(self, url, config={}, proxies={},
                 auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY,
                 ssl_verify=False,
                 trim_values=True,
                 robots=True,
                 reppy_capacity=100):

        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}

        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None

        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)

        # Keys starting with "_" (e.g. _groups, _paginate_by) are meta-rules;
        # everything else is a parse rule.
        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v

    def _get_domain(self):
        parsed_uri = urlparse(self.url)
        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    def _crawl_delay(self, crawl_delay):
        '''Uses the robots.txt crawl-delay if it is larger than the configured one.'''
        if self.robots not in EMPTY_VALUES:
            try:
                expiry, robots = self.robots.fetch('{}robots.txt'.format(self.domain))
                delay = robots.agent(self.ua).delay
                crawl_delay = max(filter(partial(is_not, None),
                                         [delay, crawl_delay]))
            except ConnectionException:
                pass
        return crawl_delay

    def _fetch(self, url):
        '''Does the requests fetching and stores result in self.content'''

        if self.robots in EMPTY_VALUES or self.robots.allowed(url, self.ua):
            session = requests.session()
            headers = {'User-Agent': self.ua}
            if len(self.proxies) > 0:
                session.proxies = self.proxies
            if self.auth:
                r = session.get(url, headers=headers, auth=self.auth, verify=self.ssl_verify)
            else:
                r = session.get(url, headers=headers, verify=self.ssl_verify)
            self.content = r.content

        return self.content

    @staticmethod
    def _get_value(content, rule):
        '''Returns the result for a single xpath/css rule.'''
        try:
            tree = html.fromstring(content)
        except TypeError:
            tree = None

        post_processing = rule.get('transform', lambda data: data)

        data = ""
        if tree not in EMPTY_VALUES:
            if 'xpath' in rule:
                data = tree.xpath(rule['xpath'])
            elif 'css' in rule:
                data = [node.text_content() for node in tree.cssselect(rule['css'])]

            many = rule.get('many', True)
            if not many:
                if len(data) == 0:
                    data = None
                else:
                    data = post_processing(data[0])
            else:
                data = [post_processing(d) for d in data]

        return data

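    # The _group_data helper below zips the listed fields row-wise into dicts:
    # e.g. data = {'a': [1, 2], 'b': [3, 4]} with groups = {'pairs': ['a', 'b']}
    # becomes data = {'pairs': [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]}. The grouped
    # source fields are then removed from data, and the special value '__all__'
    # expands to every rule key in the config.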
    @staticmethod
    def _group_data(data, groups, config):
        del_fields = []
        for dest, group_fields in groups.items():
            if '__all__' in group_fields or group_fields == '__all__':
                group_fields = [rule for rule in config.keys() if not rule.startswith('_')]
                del_fields.extend(group_fields)

            gdata = []
            for field in group_fields:
                gdata.append(data[field])

            data[dest] = []
            for gd in zip(*gdata):
                d = {}
                for i, field in enumerate(group_fields):
                    d[field] = gd[i]
                data[dest].append(d)

        if len(del_fields) == 0:
            del_fields = [field for field_set in groups.values() for field in field_set]

        for field in del_fields:
            if field in data:
                del data[field]

    def _package_pages(self):
        '''Merges the parsed pages into a single data dict.'''
        self._data = {}
        if len(self._pages) == 1:
            self._data = self._pages[0]
        else:
            self._data = {key: [] for key in self._pages[0].keys()}
            for page in self._pages:
                for k, v in page.items():
                    # Strings are single values, not sequences to be merged.
                    if hasattr(v, '__iter__') and not isinstance(v, str):
                        self._data[k].extend(v)
                    else:
                        self._data[k].append(v)
        return self._data

    @classmethod
    def _parse(cls, content, config={}, extra_config={}, trim_values=True):
        '''Parses the content based on the config set'''
        if len(config) == 0:
            _data = {'content': content}
        else:
            _data = {}

        try:
            str_class = basestring
        except NameError:
            str_class = str

        for key, rule in config.items():
            value = cls._get_value(content, rule)
            if trim_values and value not in EMPTY_VALUES:
                if rule.get('many', True):
                    value = [v.strip() if isinstance(v, str_class) else v for v in value]
                else:
                    value = value.strip() if isinstance(value, str_class) else value
            _data[key] = value

        paginate_by = extra_config.get('paginate_by')
        if paginate_by:
            paginate_by = cls._get_value(content, paginate_by)

        groups = extra_config.get('groups', {})
        if groups:
            cls._group_data(_data, groups, config)
        return _data, paginate_by

    def _get(self, url):
        self._fetch(url)
        data, paginate_by = self._parse(self.content, self.config, self.extra_config, self.trim_values)

        if paginate_by not in EMPTY_VALUES:
            paginate_by = urljoin(self.domain, paginate_by)

        return data, paginate_by

    def get(self, url=None):
        url = url if url else self.url
        self._data, paginate_by = self._get(url)

        self._pages.append(self._data)
        self._page_count += 1

        if paginate_by and self._page_count < self._pagination_max_limit:
            time.sleep(self.crawl_delay)
            self.get(paginate_by)

        self._package_pages()
        return self._data

    @property
    def data(self):
        if not hasattr(self, '_data'):
            self.get()
        return self._data

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
cssselect==1.2.0
lxml>=4.9.1
reppy2==0.3.6
requests>=2.32.3,<2.40

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description_file = README.md

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os

from setuptools import setup, find_packages


def parse_requirements(filename):
    """Load requirements from a pip requirements file."""
    with open(filename) as reqs:
        return [line.strip() for line in reqs
                if line.strip() and not line.strip().startswith("#")]


install_reqs = parse_requirements("requirements.txt")
version = "1.2.13"


description = "xpath/css based scraper with pagination"
long_description = description

# Run from the directory that contains setup.py, regardless of the caller's cwd.
os.chdir(os.path.normpath(os.path.join(os.path.abspath(__file__), os.pardir)))
setup(
    name="hodorlive",
    version=version,
    packages=find_packages(),
    install_requires=install_reqs,
    include_package_data=True,
    license="MIT",
    description=description,
    long_description=long_description,
    keywords=["hodor", "cssselect", "lxml", "scraping"],
    url="https://github.com/CompileInc/hodor",
    download_url="https://github.com/CompileInc/hodor/archive/v{version}.tar.gz".format(
        version=version
    ),
    author="Compile Inc",
    author_email="dev@compile.com",
    classifiers=[
        "Intended Audience :: Developers",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3.11",
    ],
)

--------------------------------------------------------------------------------
/upload.sh:
--------------------------------------------------------------------------------
set -eu # fail on first error or on an unset variable

pandoc --from=markdown --to=rst --output=README.txt README.md
rm -rf dist
# requires pypa/build
python -m build
twine upload -r testpypi dist/*
twine upload -r pypi dist/*
--------------------------------------------------------------------------------