├── .gitignore
├── LICENSE
├── README.md
├── examples
│   ├── guido.py
│   └── wikipedia.py
├── pholcidae2
│   └── __init__.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
test.py

# Created by .ignore support plugin (hsz.mobi)
### OSX template
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 bbrodriges

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
PHOLCIDAE - Tiny python web crawler
=========

Pholcidae
------------
Pholcidae, commonly known as cellar spiders, are a spider family in the suborder Araneomorphae.

About
------------
Pholcidae is a tiny Python module that allows you to write your own crawling spiders quickly and easily.

_See the end of this README for the changes introduced in v2._

Dependencies
------------
* python 2.7 or higher

Install
------------
```
pip install git+https://github.com/bbrodriges/pholcidae.git
```

Basic example
-------------

``` python
from pholcidae2 import Pholcidae

class MySpider(Pholcidae):

    def crawl(self, data):
        print(data['url'])

settings = {'domain': 'www.test.com', 'start_page': '/sitemap/'}

spider = MySpider()
spider.extend(settings)
spider.start()
```

Allowed settings
------------
Settings must be passed as a dictionary to the ```extend``` method of the crawler.

Params you can use:

**Required**

* **domain** _string_ - defines the domain whose pages will be parsed. Specify it without a trailing slash.

**Additional**

* **start_page** _string_ - the URL used as the entry point to the site. Default: `/`
* **protocol** _string_ - the protocol to be used by the crawler. Default: `http://`
* **valid_links** _list_ - list of regular expression strings (or full URLs) used to filter the site URLs that will be passed to the `crawl()` method. Default: `['(.*)']`
* **append_to_links** _string_ - text to be appended to each link before fetching it. Default: `''`
* **exclude_links** _list_ - list of regular expression strings (or full URLs) describing site URLs which must not be fetched at all. Default: `[]`
* **cookies** _dict_ - a dictionary of string key-values representing cookie names and values to be sent with each request. Default: `{}`
* **headers** _dict_ - a dictionary of string key-values representing header names and header values to be sent with each request. Default: `{}`
* **follow_redirects** _bool_ - whether the crawler follows 30x redirects; set to `False` to stop at the redirect response itself. Default: `True`
* **precrawl** _string_ - name of a method which will be called before crawling starts. Default: `None`
* **postcrawl** _string_ - name of a method which will be called after crawling ends. Default: `None`
* **callbacks** _dict_ - a dictionary mapping URL patterns from the `valid_links` list to names of self-defined methods which receive the parsed data (see the combined example below). Default: `{}`
* **proxy** _dict_ - a dictionary mapping protocol names to URLs of proxies, e.g. `{'http': 'http://user:passwd@host:port'}`. Default: `{}`

New in v2:

* **silent_links** _list_ - list of regular expression strings (or full URLs) describing site URLs whose page data must not be passed to a callback function, while URLs found on those pages are still collected. Default: `[]`
* **valid_mimes** _list_ - list of strings representing valid MIME types. Only URLs identified as one of these MIME types will be parsed. Default: `[]`
* **threads** _int_ - number of concurrent page-fetching threads. Default: `1`
* **with_lock** _bool_ - whether to use a lock while syncing URLs. It slightly decreases crawling speed but eliminates race conditions. Default: `True`
* **hashed** _bool_ - whether to store parsed URLs as shortened SHA1 hashes. The crawler may run a little slower but consumes a lot less memory. Default: `False`
* **respect_robots_txt** _bool_ - whether to read the `robots.txt` file before starting and add its `Disallow` directives to the **exclude_links** list. Default: `True`
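
Putting several of these options together, a spider might be configured like the sketch below. The domain, URL patterns and callback name here are placeholders for illustration, not part of the library:

``` python
from pholcidae2 import Pholcidae

class NewsSpider(Pholcidae):

    def parse_article(self, data):
        # called only for URLs matching the '(/news/.*)' pattern
        print('article:', data['url'], data['status'])

    def crawl(self, data):
        # default callback for every other valid URL
        print('page:', data['url'])

settings = {
    'protocol': 'https://',
    'domain': 'www.example.com',
    'start_page': '/sitemap/',
    'valid_links': ['(/news/.*)', '(.*)'],
    'exclude_links': ['(.*)/private/(.*)'],
    'headers': {'User-Agent': 'my-crawler/0.1'},
    'callbacks': {'(/news/.*)': 'parse_article'},
    'threads': 4,
}

spider = NewsSpider()
spider.extend(settings)
spider.start()
```

Any method named in `callbacks` receives the same data dictionary that `crawl()` gets.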

Response attributes
------------

When inheriting from the Pholcidae class you can override the built-in `crawl()` method to receive the data gathered from each page, as sketched in the example below. The response contains the following attributes, depending on whether the page was parsed successfully.

**Successful parsing**

* **body** _string_ - raw HTML/XML/XHTML etc. representation of the page.
* **url** _string_ - URL of the parsed page.
* **headers** _dict_ - dictionary of response headers.
* **cookies** _dict_ - dictionary of response cookies.
* **status** _int_ - HTTP status of the response (e.g. 200).
* **matches** _list_ - parts of the URL matched by the `valid_links` regexes.

**Unsuccessful parsing**

* **body** _string_ - raw representation of the error.
* **status** _int_ - HTTP status of the response (e.g. 400). Default: 500
* **url** _string_ - URL of the parsed page.
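
For instance, an overridden `crawl()` might branch on these attributes (a minimal sketch; the printed labels are illustrative):

``` python
from pholcidae2 import Pholcidae

class StatusSpider(Pholcidae):

    def crawl(self, data):
        if data['status'] >= 400:
            # unsuccessful parsing: body holds the raw error representation
            print('failed:', data['url'], data['status'])
            return

        content_type = data['headers'].get('Content-Type', '')
        print('ok:', data['url'], data['status'], content_type, len(data['body']))
```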

Example
------------
See ```test.py```

Note
------------
Pholcidae does not contain any built-in XML, XHTML, HTML or other parser. You can add any response body parsing you want using any available Python libraries.

v2 vs v1
------------
Major changes have been made in version 2.0:
* All code has been completely rewritten from scratch
* Less abstractions = more speed
* Threads support
* Matches in page data are now a list and are not optional
* Option ```stay_in_domain``` has been removed. The crawler cannot break out of the initial domain anymore.

There are some minor code changes which break backward compatibility between versions 1.x and 2.0:
* You need to explicitly pass settings to the ```extend``` method of your crawler
* Option ```autostart``` has been removed. You must call ```spider.start()``` explicitly
* Module is now called ```pholcidae2```

--------------------------------------------------------------------------------
/examples/guido.py:
--------------------------------------------------------------------------------
from pholcidae2 import Pholcidae


class MyGuidoSpider(Pholcidae):
    def before(self):
        print('-------- PRECRAWL ----------')

    def after(self):
        print('-------- POSTCRAWL ----------')

    def my_callback(self, data):
        print('-------- MY CALLBACK ----------')
        print(data['url'], data['status'], data['matches'])

    def crawl(self, data):
        print(data['url'], data['status'], data['matches'])


settings = {
    'domain': 'www.python.org/~guido',
    'start_page': '/',
    'valid_links': ['(.*)'],
    'exclude_links': ['ClaymontJPEGS'],
    'silent_links': ['Publications.html'],
    'append_to_links': '?a=b',
    'precrawl': 'before',
    'postcrawl': 'after',
    'callbacks': {'(images.*)': 'my_callback'},
    'threads': 3,
}

spider = MyGuidoSpider()
spider.extend(settings)
spider.start()

--------------------------------------------------------------------------------
/examples/wikipedia.py:
--------------------------------------------------------------------------------
from io import StringIO

from lxml import etree

from pholcidae2 import Pholcidae


class MyWikiSpider(Pholcidae):
    def crawl(self, data):
        tree = etree.parse(StringIO(data['body']), self.html_parser)
        langs = tree.xpath(".//div[@class='central-featured']//strong/text()")

        print('Top Wikipedia languages:')
        for lang in langs:
            print(lang)


settings = {
    'protocol': 'https://',
    'domain': 'www.wikipedia.org',
    'start_page': '/',
    'exclude_links': ['(.*)'],
    'threads': 1,
}

spider = MyWikiSpider()
spider.extend(settings)
spider.html_parser = etree.HTMLParser()
spider.start()

--------------------------------------------------------------------------------
/pholcidae2/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-

import mimetypes
import re
import sys
from hashlib import sha1
from threading import Thread, Lock

if sys.version_info < (3, 0, 0):
    import urlparse as parse
    import urllib2 as request
else:
    from urllib import parse
    from urllib import request

version_info = (2, 1, 0)
__version__ = '.'.join(map(str, version_info))

__author__ = 'bbrodriges'


class Pholcidae(object):
    """ Pholcidae is a small and fast web crawler. """

    DEFAULT_CALLBACK = 'crawl'

    _settings = {
        'follow_redirects': True,
        'append_to_links': '',
        'valid_links': ['(.*)'],
        'exclude_links': [],
        'silent_links': [],
        'start_page': '/',
        'domain': '',
        'protocol': 'http://',
        'cookies': {},
        'headers': {},
        'precrawl': None,
        'postcrawl': None,
        'callbacks': {},
        'proxy': {},
        'valid_mimes': [],
        'threads': 1,
        'with_lock': True,
        'hashed': False,
        'respect_robots_txt': True,
    }

    def extend(self, settings):

        """
        Extends default settings using given settings.
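
        Example (values here are illustrative):
            spider.extend({'domain': 'www.example.com', 'threads': 4})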
        """

        self._settings.update(settings)

    def start(self):

        """
        Prepares everything and starts crawling.
        """

        self.__prepare()

        # trying to call precrawl function
        precrawl = self._settings['precrawl']
        getattr(self, precrawl)() if precrawl else None

        self.__fetch_pages()

        # trying to call postcrawl function
        postcrawl = self._settings['postcrawl']
        getattr(self, postcrawl)() if postcrawl else None

    def crawl(self, response):

        """
        You may override this method in a subclass.
        Use it to get page content and parse it as you want to.
        """

        pass

    def __prepare(self):

        """
        Prepares everything before the start.
        """

        # creating new SyncStorage instance
        self._storage = SyncStorage()

        # adding start point into storage (the URL serves as its own hash here)
        start_url = '%(protocol)s%(domain)s%(start_page)s' % self._settings
        self._storage.add(start_url.strip(), start_url.strip(), SyncStorage.PRIORITY_LOW)

        # creating HTTP opener instance
        handlers = []
        if self._settings['proxy']:
            proxy_handler = request.ProxyHandler(self._settings['proxy'])
            handlers.append(proxy_handler)

        if not self._settings['follow_redirects']:
            handlers.extend([RedirectHandler, request.HTTPCookieProcessor()])

        self._opener = request.build_opener(*handlers)

        # adding headers to opener as (name, value) tuples
        self._opener.addheaders.extend(self._settings['headers'].items())

        # adding cookies to opener
        if self._settings['cookies']:
            compiled_cookies = []
            for name, value in self._settings['cookies'].items():
                compiled_cookies.append('%s=%s' % (name, value))
            cookies_string = '; '.join(compiled_cookies)
            self._opener.addheaders.append(('Cookie', cookies_string))

        # compiling regexes
        self._regexes = {
            'valid_links': [],
            'exclude_links': [],
            'silent_links': [],
        }

        flags = re.I | re.S
        for regex_type in self._regexes.keys():
            for regex in self._settings[regex_type]:
                self._regexes[regex_type].append(re.compile(regex, flags=flags))
        self._regexes['href_links'] = re.compile(r'<a\s(?:[^>]*?\s+)?href="([^"]*)"', flags=flags)

        # compiling callbacks
        self._callbacks = {}
        for regex, callback_name in self._settings['callbacks'].items():
            compiled_regex = re.compile(regex, flags=flags)
            self._callbacks[compiled_regex] = callback_name

        # getting robots.txt
        self.__parse_robots_txt()

    def __parse_robots_txt(self):
        """
        Parses robots.txt
        """

        if not self._settings['respect_robots_txt']:
            return

        start_url = '%(protocol)s%(domain)s%(start_page)s' % self._settings
        root_uri = '{uri.scheme}://{uri.netloc}'.format(uri=parse.urlparse(start_url))
        robots_txt_uri = '%s/robots.txt' % root_uri

        try:
            response = self._opener.open(robots_txt_uri)
        except:
            response = None

        if not response:
            return

        headers = {k.lower(): v for k, v in self._settings['headers'].items()}
        user_agent = headers['user-agent'] if 'user-agent' in headers else None

        skip_directives = False
        flags = re.I | re.S

        robots_txt = response.read().decode('utf-8').lower()
        for line in robots_txt.splitlines():
            if line.startswith('#') or not line:
                continue

            key, value = [v.strip() for v in line.split(':', 1)]
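            # key/value now hold e.g. "disallow" and "/cgi-bin/"
            # for a (lowercased) line "disallow: /cgi-bin/"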

            if key not in ['user-agent', 'disallow']:
                continue

            if key == 'user-agent':
                skip_directives = False
                if value != '*' and value != user_agent:
                    skip_directives = True
                continue

            if skip_directives:
                continue

            regex = '^%s' % value.replace('?', '\\?').replace('/', '\\/').replace('*', '(.*?)')
            self._regexes['exclude_links'].append(re.compile(regex, flags=flags))

    def __fetch_pages(self):

        """
        Main fetching loop
        """

        # getting initial page
        urls = self._storage.pop(self._settings['threads'])
        # creating lock
        lock = Lock() if self._settings['with_lock'] else DummyLock()

        while urls:

            active_threads = []

            for url in urls:
                fetcher = Fetcher()
                fetcher.setup({
                    'url': url,
                    'lock': lock,
                    'parent': self
                })
                fetcher.start()
                active_threads.append(fetcher)

            for fetcher in active_threads:
                fetcher.join()

            # getting next portion of urls
            urls = self._storage.pop(self._settings['threads'])


class Fetcher(Thread):
    """ Fetches given URL. """

    DEFAULT_HTTP_CODE = 500

    def __init__(self):
        Thread.__init__(self)

    def setup(self, settings):

        """
        Sets up the thread
        """

        self._url = settings['url']
        self._lock = settings['lock']
        self._parent = settings['parent']

        self._opener = self._parent._opener
        self._callbacks = self._parent._callbacks
        self._regexes = self._parent._regexes
        self._settings = self._parent._settings

        self._storage = self._parent._storage

    def run(self):

        """
        Runs URL fetch and parse
        """

        page = {
            'body': '',
            'url': self._url,
            'headers': {},
            'cookies': {},
            'status': self.DEFAULT_HTTP_CODE,
            'matches': [],
        }

        response = None

        try:
            # append user defined string to link before crawl
            prepared_url = self._url + self._settings['append_to_links']

            response = self._opener.open(prepared_url)
        except request.HTTPError as resp:
            response = resp
        except:
            pass

        if response:
            page['body'] = str(response.read())

            # gather page data and call callback function if not silent
            if not self._is_silent(self._url):
                headers = {h[0]: h[1] for h in response.headers.items()}

                page.update({
                    'headers': headers,
                    'cookies': Cookies.parse(headers),
                    'status': response.getcode(),
                    'matches': self._get_matches(self._url),
                })

                getattr(self._parent, self.__get_callback())(page)

            self.__extract_urls(page['body'])

    def __get_callback(self):

        """
        Returns the callback method name for the current URL
        """

        # default callback
        callback = self._parent.DEFAULT_CALLBACK

        for regex, callback_name in self._callbacks.items():
            if regex.search(self._url):
                callback = callback_name
                break

        return callback

    def __extract_urls(self, body):

        """
        Extracts valid URLs from the page body
        """

        links = self._regexes['href_links'].findall(body)

        for link in links:
            # default priority
            priority = SyncStorage.PRIORITY_LOW

            # removing anchor part
            link = link.split('#', 1)[0]
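            # link now has its fragment stripped, e.g. "/docs/page.html#intro"
            # becomes "/docs/page.html"; a bare "#top" link becomes ""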

            # pass if link contains only anchor
            if not link:
                continue

            # combining base url and link part
            link = parse.urljoin(self._url, link)
            link = link.strip()

            # pass if already parsed
            if self._storage.is_parsed(link):
                continue

            # trying to extract links only from valid set of pages MIME types
            url_type = mimetypes.guess_type(link, True)[0]
            allowed_mimes = self._settings['valid_mimes']
            if allowed_mimes and url_type not in allowed_mimes:
                continue

            # pass excluded link
            if self._is_excluded(link):
                continue

            # check "out of domain"
            link_info = parse.urlparse(link)

            # this is not a link
            if not link_info.netloc:
                continue

            if self._settings['domain'] not in link:
                continue

            # set highest priority if link matches any regex from "valid_links" list
            if self._is_valid(link):
                priority = SyncStorage.PRIORITY_HIGH

            link_hash = link if not self._settings['hashed'] else sha1(link.encode('utf-8')).hexdigest()[:6]

            # locking
            with self._lock:
                self._storage.add(link, link_hash, priority)

    def _is_excluded(self, link):

        """
        Checks if link matches any excluded regex.
        """

        link_path = ''
        if self._settings['respect_robots_txt']:
            link_path = '{uri.path}?{uri.query}#{uri.fragment}'.format(uri=parse.urlparse(link))

        for regex in self._regexes['exclude_links']:
            if regex.match(link) or regex.match(link_path):
                return True

        return False

    def _get_matches(self, link):

        """
        Returns matches if the link is valid
        """

        for regex in self._regexes['valid_links']:
            matches = regex.findall(link)
            if matches:
                return matches
        return []

    def _is_valid(self, link):

        """
        Checks if link matches any regex from the "valid_links" list
        """

        return self._get_matches(link)

    def _is_silent(self, link):

        """
        Checks if the link is silent
        """

        for regex in self._regexes['silent_links']:
            is_silent = regex.search(link)
            if is_silent:
                return True
        return False


class Cookies(object):
    """ Handles HTTP cookies parsing """

    # unnecessary cookie fields
    __meta_fields = [
        'expires',
        'path',
        'domain',
        'secure',
        'HttpOnly'
    ]

    @staticmethod
    def parse(headers):

        """
        Parses cookies from response headers.
        """

        cookies = {}
        if 'Set-Cookie' in headers:
            # splitting raw cookies
            raw_cookies = headers['Set-Cookie'].split(';')
            for cookie in raw_cookies:
                cookie = cookie.split('=')
                if cookie[0].strip() not in Cookies.__meta_fields and len(cookie) > 1:
                    cookies.update({cookie[0]: cookie[1]})
        return cookies


class SyncStorage(object):
    """ Stores queued and already-seen URLs in memory. """

    PRIORITY_LOW = 0
    PRIORITY_HIGH = 1

    def __init__(self):
        self._set = set()
        self._list = list()

    def add(self, value, value_hash, priority=PRIORITY_LOW):

        """
        Adds value to storage
        """

        if value_hash in self._set:
            return
        self._list.insert(0, value) if priority == self.PRIORITY_HIGH else self._list.append(value)
        self._set.add(value_hash)

    def pop(self, num=1):

        """
        Pops values from storage
        """

        values = []
        for i in range(0, num):
            try:
                values.append(self._list.pop(0))
            except IndexError:
                break

        return values

    def is_parsed(self, value):

        """
        Checks if value has been already parsed
        """

        return value in self._set


class RedirectHandler(request.HTTPRedirectHandler):
    """ Custom URL redirects handler. """

    def http_error_302(self, req, fp, code, msg, headers):
        return fp


class DummyLock(object):
    """ Dummy lock object """

    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os

from setuptools import setup, find_packages

import pholcidae2

setup(
    name='pholcidae2',
    version=pholcidae2.__version__,
    packages=find_packages(os.path.dirname(os.path.abspath(__file__)))
)

--------------------------------------------------------------------------------