├── .gitignore
├── LICENSE.md
├── MANIFEST.in
├── README.md
├── hodor
│   └── __init__.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── upload.sh

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
#### joe made this: http://goel.io/joe

#####=== Python ===#####

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# pip
README.txt

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Compile Inc

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include requirements.txt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Hodor [![PyPI](https://img.shields.io/pypi/v/hodorlive.svg?maxAge=2592000?style=plastic)](https://pypi.python.org/pypi/hodorlive/)

A simple HTML scraper driven by XPath or CSS selectors.

## Install

```pip install hodorlive```

## Usage

### As python package

***WARNING: This package does not verify SSL connections by default.
Please check the [arguments](#arguments) to enable verification.***
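
For example, a minimal sketch of turning verification back on (the URL and ```CONFIG``` here are placeholders for your own values, as in the sample below):

```python
from hodor import Hodor

# ssl_verify=True re-enables certificate verification for every request
h = Hodor(url='https://example.com', config=CONFIG, ssl_verify=True)
```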
#### Sample code
```python
from hodor import Hodor
from dateutil.parser import parse


def date_convert(data):
    return parse(data)

url = 'http://www.nasdaq.com/markets/stocks/symbol-change-history.aspx'

CONFIG = {
    'old_symbol': {
        'css': '#SymbolChangeList_table tr td:nth-child(1)',
        'many': True
    },
    'new_symbol': {
        'css': '#SymbolChangeList_table tr td:nth-child(2)',
        'many': True
    },
    'effective_date': {
        'css': '#SymbolChangeList_table tr td:nth-child(3)',
        'many': True,
        'transform': date_convert
    },
    '_groups': {
        'data': '__all__',
        'ticker_changes': ['old_symbol', 'new_symbol']
    },
    '_paginate_by': {
        'xpath': '//*[@id="two_column_main_content_lb_NextPage"]/@href',
        'many': False
    }
}

h = Hodor(url=url, config=CONFIG, pagination_max_limit=5)

h.data
```

#### Sample output
```python
{'data': [{'effective_date': datetime.datetime(2016, 11, 1, 0, 0),
           'new_symbol': 'ARNC',
           'old_symbol': 'AA'},
          {'effective_date': datetime.datetime(2016, 11, 1, 0, 0),
           'new_symbol': 'ARNC$',
           'old_symbol': 'AA$'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALN8',
           'old_symbol': 'AHUSDN2018'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALN9',
           'old_symbol': 'AHUSDN2019'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALQ6',
           'old_symbol': 'AHUSDQ2016'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALQ7',
           'old_symbol': 'AHUSDQ2017'},
          {'effective_date': datetime.datetime(2016, 8, 16, 0, 0),
           'new_symbol': 'MALQ8',
           'old_symbol': 'AHUSDQ2018'}]}
```

#### Arguments

- ```ua``` (User-Agent string - default: ```Hodor```)
- ```proxies``` (a ```requests```-style proxies dict; see requesocks for SOCKS proxies)
- ```auth``` (passed through to ```requests``` authentication)
- ```crawl_delay``` (delay in seconds between paginated requests - default: 3 seconds)
- ```pagination_max_limit``` (max number of pages to crawl - default: 100)
- ```ssl_verify``` (verify SSL certificates - default: False)
- ```robots``` (if set, respects robots.txt - default: True)
- ```reppy_capacity``` (robots cache LRU capacity - default: 100)
- ```trim_values``` (if set, strips leading and trailing whitespace from extracted values - default: True)

#### Config parameters:
- Any key in the config that does not start with an underscore is a rule to parse.
- Each rule is either an ```xpath``` or a ```css``` selector (see the sketch after this list).
- Each rule extracts ```many``` values by default unless ```many``` is explicitly set to ```False```.
- Each rule can ```transform``` every extracted value with a function, if provided.
- Extra parameters include grouping (```_groups```) and pagination (```_paginate_by```); ```_paginate_by``` is itself a rule of the same format.
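
A minimal sketch of a config that mixes the two selector types (the keys and selectors below are hypothetical, not part of the package):

```python
CONFIG = {
    # xpath rule: 'many' defaults to True, so this yields a list of strings
    'headlines': {
        'xpath': '//h2[@class="headline"]/text()',
    },
    # css rule: many=False yields only the first match (or None if nothing matches)
    'page_title': {
        'css': 'title',
        'many': False,
        'transform': lambda text: text.upper(),
    },
}
```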

--------------------------------------------------------------------------------
/hodor/__init__.py:
--------------------------------------------------------------------------------
import time
from functools import partial
from operator import is_not

import requests
from lxml import html
from lxml.cssselect import CSSSelector
from reppy.cache import RobotsCache
from reppy.exceptions import ConnectionException
try:
    from urlparse import urlparse, urljoin
except ImportError:
    from urllib.parse import urlparse, urljoin


DEFAULT_HODOR_UA = 'Hodor'
DEFAULT_HODOR_MAX_PAGES = 100
DEFAULT_CRAWL_DELAY = 3
EMPTY_VALUES = (None, '', [], (), {})


class Hodor(object):
    def __init__(self, url, config={}, proxies={},
                 auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY,
                 ssl_verify=False,
                 trim_values=True,
                 robots=True,
                 reppy_capacity=100):

        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}

        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None

        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)

        # Keys starting with "_" (e.g. _groups, _paginate_by) are meta-rules;
        # everything else is a parse rule.
        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v

    def _get_domain(self):
        parsed_uri = urlparse(self.url)
        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    def _crawl_delay(self, crawl_delay):
        '''Uses the robots.txt crawl-delay if it is larger than the configured one.'''
        if self.robots not in EMPTY_VALUES:
            try:
                expiry, robots = self.robots.fetch('{}robots.txt'.format(self.domain))
                delay = robots.agent(self.ua).delay
                crawl_delay = max(filter(partial(is_not, None),
                                         [delay, crawl_delay]))
            except ConnectionException:
                pass
        return crawl_delay

    def _fetch(self, url):
        '''Does the requests fetching and stores result in self.content'''

        if self.robots in EMPTY_VALUES or self.robots.allowed(url, self.ua):
            session = requests.session()
            headers = {'User-Agent': self.ua}
            if len(self.proxies) > 0:
                session.proxies = self.proxies
            if self.auth:
                r = session.get(url, headers=headers, auth=self.auth, verify=self.ssl_verify)
            else:
                r = session.get(url, headers=headers, verify=self.ssl_verify)
            self.content = r.content

        return self.content

    @staticmethod
    def _get_value(content, rule):
        '''Returns the result for a single xpath/css rule.'''
        try:
            tree = html.fromstring(content)
        except TypeError:
            tree = None

        post_processing = rule.get('transform', lambda data: data)

        data = ""
        if tree not in EMPTY_VALUES:
            if 'xpath' in rule:
                data = tree.xpath(rule['xpath'])
            elif 'css' in rule:
                data = [node.text_content() for node in tree.cssselect(rule['css'])]

            many = rule.get('many', True)
            if not many:
                if len(data) == 0:
                    data = None
                else:
                    data = post_processing(data[0])
            else:
                data = [post_processing(d) for d in data]

        return data

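    # The _group_data helper below zips the listed fields row-wise into dicts:
    # e.g. data = {'a': [1, 2], 'b': [3, 4]} with groups = {'pairs': ['a', 'b']}
    # becomes data = {'pairs': [{'a': 1, 'b': 3}, {'a': 2, 'b': 4}]}. The grouped
    # source fields are then removed from data, and the special value '__all__'
    # expands to every rule key in the config.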
    @staticmethod
    def _group_data(data, groups, config):
        del_fields = []
        for dest, group_fields in groups.items():
            if '__all__' in group_fields or group_fields == '__all__':
                group_fields = [rule for rule in config.keys() if not rule.startswith('_')]
                del_fields.extend(group_fields)

            gdata = []
            for field in group_fields:
                gdata.append(data[field])

            data[dest] = []
            for gd in zip(*gdata):
                d = {}
                for i, field in enumerate(group_fields):
                    d[field] = gd[i]
                data[dest].append(d)

        if len(del_fields) == 0:
            del_fields = [field for field_set in groups.values() for field in field_set]

        for field in del_fields:
            if field in data:
                del data[field]

    def _package_pages(self):
        '''Merges the parsed pages into a single data dict.'''
        self._data = {}
        if len(self._pages) == 1:
            self._data = self._pages[0]
        else:
            self._data = {key: [] for key in self._pages[0].keys()}
            for page in self._pages:
                for k, v in page.items():
                    # Strings are single values, not sequences to be merged.
                    if hasattr(v, '__iter__') and not isinstance(v, str):
                        self._data[k].extend(v)
                    else:
                        self._data[k].append(v)
        return self._data

    @classmethod
    def _parse(cls, content, config={}, extra_config={}, trim_values=True):
        '''Parses the content based on the config set'''
        if len(config) == 0:
            _data = {'content': content}
        else:
            _data = {}

        try:
            str_class = basestring
        except NameError:
            str_class = str

        for key, rule in config.items():
            value = cls._get_value(content, rule)
            if trim_values and value not in EMPTY_VALUES:
                if rule.get('many', True):
                    value = [v.strip() if isinstance(v, str_class) else v for v in value]
                else:
                    value = value.strip() if isinstance(value, str_class) else value
            _data[key] = value

        paginate_by = extra_config.get('paginate_by')
        if paginate_by:
            paginate_by = cls._get_value(content, paginate_by)

        groups = extra_config.get('groups', {})
        if groups:
            cls._group_data(_data, groups, config)
        return _data, paginate_by

    def _get(self, url):
        self._fetch(url)
        data, paginate_by = self._parse(self.content, self.config, self.extra_config, self.trim_values)

        if paginate_by not in EMPTY_VALUES:
            paginate_by = urljoin(self.domain, paginate_by)

        return data, paginate_by

    def get(self, url=None):
        url = url if url else self.url
        self._data, paginate_by = self._get(url)

        self._pages.append(self._data)
        self._page_count += 1

        if paginate_by and self._page_count < self._pagination_max_limit:
            time.sleep(self.crawl_delay)
            self.get(paginate_by)

        self._package_pages()
        return self._data

    @property
    def data(self):
        if not hasattr(self, '_data'):
            self.get()
        return self._data

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
cssselect==1.2.0
lxml>=4.9.1
reppy2==0.3.6
requests>=2.32.3,<2.40

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description_file = README.md

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os

from setuptools import setup, find_packages


def parse_requirements(filename):
    """Load requirements from a pip requirements file."""
    with open(filename) as reqs:
        return [line.strip() for line in reqs
                if line.strip() and not line.strip().startswith("#")]


install_reqs = parse_requirements("requirements.txt")
version = "1.2.13"


description = "xpath/css based scraper with pagination"
long_description = description

# Run from the directory that contains setup.py, regardless of the caller's cwd.
os.chdir(os.path.normpath(os.path.join(os.path.abspath(__file__), os.pardir)))
setup(
    name="hodorlive",
    version=version,
    packages=find_packages(),
    install_requires=install_reqs,
    include_package_data=True,
    license="MIT",
    description=description,
    long_description=long_description,
    keywords=["hodor", "cssselect", "lxml", "scraping"],
    url="https://github.com/CompileInc/hodor",
    download_url="https://github.com/CompileInc/hodor/archive/v{version}.tar.gz".format(
        version=version
    ),
    author="Compile Inc",
    author_email="dev@compile.com",
    classifiers=[
        "Intended Audience :: Developers",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3.11",
    ],
)

--------------------------------------------------------------------------------
/upload.sh:
--------------------------------------------------------------------------------
set -eu # fail on first error or on an unset variable

pandoc --from=markdown --to=rst --output=README.txt README.md
rm -rf dist
# requires pypa/build
python -m build
twine upload -r testpypi dist/*
twine upload -r pypi dist/*
--------------------------------------------------------------------------------