├── .gitignore
├── LICENSE
├── README.md
├── examples
│   ├── guido.py
│   └── wikipedia.py
├── pholcidae2
│   └── __init__.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
test.py

# Created by .ignore support plugin (hsz.mobi)
### OSX template
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 bbrodriges

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
PHOLCIDAE - Tiny python web crawler
=========

Pholcidae
------------
Pholcidae, commonly known as cellar spiders, are a spider family in the suborder Araneomorphae.

About
------------
Pholcidae is a tiny Python module that allows you to write your own crawling spiders quickly and easily.

_See the end of this README for the changes introduced in v2._

Dependencies
------------
* python 2.7 or higher

Install
------------
```
pip install git+https://github.com/bbrodriges/pholcidae.git
```

Basic example
-------------

``` python
from pholcidae2 import Pholcidae

class MySpider(Pholcidae):

    def crawl(self, data):
        print(data['url'])

settings = {'domain': 'www.test.com', 'start_page': '/sitemap/'}

spider = MySpider()
spider.extend(settings)
spider.start()
```

Allowed settings
------------
Settings must be passed as a dictionary to the ```extend``` method of the crawler.

Params you can use:

**Required**

* **domain** _string_ - defines the domain whose pages will be parsed. Specify it without a trailing slash.

**Additional**

* **start_page** _string_ - the URL used as the entry point to the site. Default: `/`
* **protocol** _string_ - the protocol to be used by the crawler. Default: `http://`
* **valid_links** _list_ - list of regular expression strings (or full URLs) used to filter the site URLs that will be passed to the `crawl()` method. Default: `['(.*)']`
* **append_to_links** _string_ - text to be appended to each link before fetching it. Default: `''`
* **exclude_links** _list_ - list of regular expression strings (or full URLs) describing site URLs which must not be fetched at all. Default: `[]`
* **cookies** _dict_ - a dictionary of string key-values representing cookie names and values to be sent with each request. Default: `{}`
* **headers** _dict_ - a dictionary of string key-values representing header names and header values to be sent with each request. Default: `{}`
* **follow_redirects** _bool_ - whether the crawler follows 30x redirects; set to `False` to stop at the redirect response itself. Default: `True`
* **precrawl** _string_ - name of a method which will be called before crawling starts. Default: `None`
* **postcrawl** _string_ - name of a method which will be called after crawling ends. Default: `None`
* **callbacks** _dict_ - a dictionary mapping URL patterns from the `valid_links` list to names of self-defined methods which receive the parsed data (see the combined example below). Default: `{}`
* **proxy** _dict_ - a dictionary mapping protocol names to URLs of proxies, e.g. `{'http': 'http://user:passwd@host:port'}`. Default: `{}`

New in v2:

* **silent_links** _list_ - list of regular expression strings (or full URLs) describing site URLs whose page data must not be passed to a callback function, while URLs found on those pages are still collected. Default: `[]`
* **valid_mimes** _list_ - list of strings representing valid MIME types. Only URLs identified as one of these MIME types will be parsed. Default: `[]`
* **threads** _int_ - number of concurrent page-fetching threads. Default: `1`
* **with_lock** _bool_ - whether to use a lock while syncing URLs. It slightly decreases crawling speed but eliminates race conditions. Default: `True`
* **hashed** _bool_ - whether to store parsed URLs as shortened SHA1 hashes. The crawler may run a little slower but consumes a lot less memory. Default: `False`
* **respect_robots_txt** _bool_ - whether to read the `robots.txt` file before starting and add its `Disallow` directives to the **exclude_links** list. Default: `True`
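
Putting several of these options together, a spider might be configured like the sketch below. The domain, URL patterns and callback name here are placeholders for illustration, not part of the library:

``` python
from pholcidae2 import Pholcidae

class NewsSpider(Pholcidae):

    def parse_article(self, data):
        # called only for URLs matching the '(/news/.*)' pattern
        print('article:', data['url'], data['status'])

    def crawl(self, data):
        # default callback for every other valid URL
        print('page:', data['url'])

settings = {
    'protocol': 'https://',
    'domain': 'www.example.com',
    'start_page': '/sitemap/',
    'valid_links': ['(/news/.*)', '(.*)'],
    'exclude_links': ['(.*)/private/(.*)'],
    'headers': {'User-Agent': 'my-crawler/0.1'},
    'callbacks': {'(/news/.*)': 'parse_article'},
    'threads': 4,
}

spider = NewsSpider()
spider.extend(settings)
spider.start()
```

Any method named in `callbacks` receives the same data dictionary that `crawl()` gets.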

Response attributes
------------

When inheriting from the Pholcidae class you can override the built-in `crawl()` method to receive the data gathered from each page, as sketched in the example below. The response contains the following attributes, depending on whether the page was parsed successfully.

**Successful parsing**

* **body** _string_ - raw HTML/XML/XHTML etc. representation of the page.
* **url** _string_ - URL of the parsed page.
* **headers** _dict_ - dictionary of response headers.
* **cookies** _dict_ - dictionary of response cookies.
* **status** _int_ - HTTP status of the response (e.g. 200).
* **matches** _list_ - parts of the URL matched by the `valid_links` regexes.

**Unsuccessful parsing**

* **body** _string_ - raw representation of the error.
* **status** _int_ - HTTP status of the response (e.g. 400). Default: 500
* **url** _string_ - URL of the parsed page.
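
For instance, an overridden `crawl()` might branch on these attributes (a minimal sketch; the printed labels are illustrative):

``` python
from pholcidae2 import Pholcidae

class StatusSpider(Pholcidae):

    def crawl(self, data):
        if data['status'] >= 400:
            # unsuccessful parsing: body holds the raw error representation
            print('failed:', data['url'], data['status'])
            return

        content_type = data['headers'].get('Content-Type', '')
        print('ok:', data['url'], data['status'], content_type, len(data['body']))
```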

Example
------------
See ```test.py```

Note
------------
Pholcidae does not contain any built-in XML, XHTML, HTML or other parser. You can add any response body parsing you want using any available Python libraries.

v2 vs v1
------------
Major changes have been made in version 2.0:
* All code has been completely rewritten from scratch
* Less abstractions = more speed
* Threads support
* Matches in page data are now a list and are not optional
* Option ```stay_in_domain``` has been removed. The crawler cannot break out of the initial domain anymore.

There are some minor code changes which break backward compatibility between versions 1.x and 2.0:
* You need to explicitly pass settings to the ```extend``` method of your crawler
* Option ```autostart``` has been removed. You must call ```spider.start()``` explicitly
* Module is now called ```pholcidae2```

--------------------------------------------------------------------------------
/examples/guido.py:
--------------------------------------------------------------------------------
from pholcidae2 import Pholcidae


class MyGuidoSpider(Pholcidae):
    def before(self):
        print('-------- PRECRAWL ----------')

    def after(self):
        print('-------- POSTCRAWL ----------')

    def my_callback(self, data):
        print('-------- MY CALLBACK ----------')
        print(data['url'], data['status'], data['matches'])

    def crawl(self, data):
        print(data['url'], data['status'], data['matches'])


settings = {
    'domain': 'www.python.org/~guido',
    'start_page': '/',
    'valid_links': ['(.*)'],
    'exclude_links': ['ClaymontJPEGS'],
    'silent_links': ['Publications.html'],
    'append_to_links': '?a=b',
    'precrawl': 'before',
    'postcrawl': 'after',
    'callbacks': {'(images.*)': 'my_callback'},
    'threads': 3,
}

spider = MyGuidoSpider()
spider.extend(settings)
spider.start()

--------------------------------------------------------------------------------
/examples/wikipedia.py:
--------------------------------------------------------------------------------
from io import StringIO

from lxml import etree

from pholcidae2 import Pholcidae


class MyWikiSpider(Pholcidae):
    def crawl(self, data):
        tree = etree.parse(StringIO(data['body']), self.html_parser)
        langs = tree.xpath(".//div[@class='central-featured']//strong/text()")

        print('Top Wikipedia languages:')
        for lang in langs:
            print(lang)


settings = {
    'protocol': 'https://',
    'domain': 'www.wikipedia.org',
    'start_page': '/',
    'exclude_links': ['(.*)'],
    'threads': 1,
}

spider = MyWikiSpider()
spider.extend(settings)
spider.html_parser = etree.HTMLParser()
spider.start()

--------------------------------------------------------------------------------
/pholcidae2/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-

import mimetypes
import re
import sys
from hashlib import sha1
from threading import Thread, Lock

if sys.version_info < (3, 0, 0):
    import urlparse as parse
    import urllib2 as request
else:
    from urllib import parse
    from urllib import request

version_info = (2, 1, 0)
__version__ = '.'.join(map(str, version_info))

__author__ = 'bbrodriges'


class Pholcidae(object):
    """ Pholcidae is a small and fast web crawler. """

    DEFAULT_CALLBACK = 'crawl'

    _settings = {
        'follow_redirects': True,
        'append_to_links': '',
        'valid_links': ['(.*)'],
        'exclude_links': [],
        'silent_links': [],
        'start_page': '/',
        'domain': '',
        'protocol': 'http://',
        'cookies': {},
        'headers': {},
        'precrawl': None,
        'postcrawl': None,
        'callbacks': {},
        'proxy': {},
        'valid_mimes': [],
        'threads': 1,
        'with_lock': True,
        'hashed': False,
        'respect_robots_txt': True,
    }

    def extend(self, settings):

        """
        Extends default settings using given settings.
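
        Example (values here are illustrative):
            spider.extend({'domain': 'www.example.com', 'threads': 4})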
        """

        self._settings.update(settings)

    def start(self):

        """
        Prepares everything and starts crawling.
        """

        self.__prepare()

        # trying to call precrawl function
        precrawl = self._settings['precrawl']
        getattr(self, precrawl)() if precrawl else None

        self.__fetch_pages()

        # trying to call postcrawl function
        postcrawl = self._settings['postcrawl']
        getattr(self, postcrawl)() if postcrawl else None

    def crawl(self, response):

        """
        You may override this method in a subclass.
        Use it to get page content and parse it as you want to.
        """

        pass

    def __prepare(self):

        """
        Prepares everything before the start.
        """

        # creating new SyncStorage instance
        self._storage = SyncStorage()

        # adding start point into storage (the URL serves as its own hash here)
        start_url = '%(protocol)s%(domain)s%(start_page)s' % self._settings
        self._storage.add(start_url.strip(), start_url.strip(), SyncStorage.PRIORITY_LOW)

        # creating HTTP opener instance
        handlers = []
        if self._settings['proxy']:
            proxy_handler = request.ProxyHandler(self._settings['proxy'])
            handlers.append(proxy_handler)

        if not self._settings['follow_redirects']:
            handlers.extend([RedirectHandler, request.HTTPCookieProcessor()])

        self._opener = request.build_opener(*handlers)

        # adding headers to opener as (name, value) tuples
        self._opener.addheaders.extend(self._settings['headers'].items())

        # adding cookies to opener
        if self._settings['cookies']:
            compiled_cookies = []
            for name, value in self._settings['cookies'].items():
                compiled_cookies.append('%s=%s' % (name, value))
            cookies_string = '; '.join(compiled_cookies)
            self._opener.addheaders.append(('Cookie', cookies_string))

        # compiling regexes
        self._regexes = {
            'valid_links': [],
            'exclude_links': [],
            'silent_links': [],
        }

        flags = re.I | re.S
        for regex_type in self._regexes.keys():
            for regex in self._settings[regex_type]:
                self._regexes[regex_type].append(re.compile(regex, flags=flags))
        self._regexes['href_links'] = re.compile(r'<a\s(?:[^>]*?\s+)?href="([^"]*)"', flags=flags)

        # compiling callbacks
        self._callbacks = {}
        for regex, callback_name in self._settings['callbacks'].items():
            compiled_regex = re.compile(regex, flags=flags)
            self._callbacks[compiled_regex] = callback_name

        # getting robots.txt
        self.__parse_robots_txt()

    def __parse_robots_txt(self):
        """
        Parses robots.txt
        """

        if not self._settings['respect_robots_txt']:
            return

        start_url = '%(protocol)s%(domain)s%(start_page)s' % self._settings
        root_uri = '{uri.scheme}://{uri.netloc}'.format(uri=parse.urlparse(start_url))
        robots_txt_uri = '%s/robots.txt' % root_uri

        try:
            response = self._opener.open(robots_txt_uri)
        except:
            response = None

        if not response:
            return

        headers = {k.lower(): v for k, v in self._settings['headers'].items()}
        user_agent = headers['user-agent'] if 'user-agent' in headers else None

        skip_directives = False
        flags = re.I | re.S

        robots_txt = response.read().decode('utf-8').lower()
        for line in robots_txt.splitlines():
            if line.startswith('#') or not line:
                continue

            key, value = [v.strip() for v in line.split(':', 1)]
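            # key/value now hold e.g. "disallow" and "/cgi-bin/"
            # for a (lowercased) line "disallow: /cgi-bin/"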

            if key not in ['user-agent', 'disallow']:
                continue

            if key == 'user-agent':
                skip_directives = False
                if value != '*' and value != user_agent:
                    skip_directives = True
                continue

            if skip_directives:
                continue

            regex = '^%s' % value.replace('?', '\\?').replace('/', '\\/').replace('*', '(.*?)')
            self._regexes['exclude_links'].append(re.compile(regex, flags=flags))

    def __fetch_pages(self):

        """
        Main fetching loop
        """

        # getting initial page
        urls = self._storage.pop(self._settings['threads'])
        # creating lock
        lock = Lock() if self._settings['with_lock'] else DummyLock()

        while urls:

            active_threads = []

            for url in urls:
                fetcher = Fetcher()
                fetcher.setup({
                    'url': url,
                    'lock': lock,
                    'parent': self
                })
                fetcher.start()
                active_threads.append(fetcher)

            for fetcher in active_threads:
                fetcher.join()

            # getting next portion of urls
            urls = self._storage.pop(self._settings['threads'])


class Fetcher(Thread):
    """ Fetches given URL. """

    DEFAULT_HTTP_CODE = 500

    def __init__(self):
        Thread.__init__(self)

    def setup(self, settings):

        """
        Sets up the thread
        """

        self._url = settings['url']
        self._lock = settings['lock']
        self._parent = settings['parent']

        self._opener = self._parent._opener
        self._callbacks = self._parent._callbacks
        self._regexes = self._parent._regexes
        self._settings = self._parent._settings

        self._storage = self._parent._storage

    def run(self):

        """
        Runs URL fetch and parse
        """

        page = {
            'body': '',
            'url': self._url,
            'headers': {},
            'cookies': {},
            'status': self.DEFAULT_HTTP_CODE,
            'matches': [],
        }

        response = None

        try:
            # append user defined string to link before crawl
            prepared_url = self._url + self._settings['append_to_links']

            response = self._opener.open(prepared_url)
        except request.HTTPError as resp:
            response = resp
        except:
            pass

        if response:
            page['body'] = str(response.read())

            # gather page data and call callback function if not silent
            if not self._is_silent(self._url):
                headers = {h[0]: h[1] for h in response.headers.items()}

                page.update({
                    'headers': headers,
                    'cookies': Cookies.parse(headers),
                    'status': response.getcode(),
                    'matches': self._get_matches(self._url),
                })

                getattr(self._parent, self.__get_callback())(page)

            self.__extract_urls(page['body'])

    def __get_callback(self):

        """
        Returns the callback method name for the current URL
        """

        # default callback
        callback = self._parent.DEFAULT_CALLBACK

        for regex, callback_name in self._callbacks.items():
            if regex.search(self._url):
                callback = callback_name
                break

        return callback

    def __extract_urls(self, body):

        """
        Extracts valid URLs from the page body
        """

        links = self._regexes['href_links'].findall(body)

        for link in links:
            # default priority
            priority = SyncStorage.PRIORITY_LOW

            # removing anchor part
            link = link.split('#', 1)[0]
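            # link now has its fragment stripped, e.g. "/docs/page.html#intro"
            # becomes "/docs/page.html"; a bare "#top" link becomes ""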

            # pass if link contains only anchor
            if not link:
                continue

            # combining base url and link part
            link = parse.urljoin(self._url, link)
            link = link.strip()

            # pass if already parsed
            if self._storage.is_parsed(link):
                continue

            # trying to extract links only from valid set of pages MIME types
            url_type = mimetypes.guess_type(link, True)[0]
            allowed_mimes = self._settings['valid_mimes']
            if allowed_mimes and url_type not in allowed_mimes:
                continue

            # pass excluded link
            if self._is_excluded(link):
                continue

            # check "out of domain"
            link_info = parse.urlparse(link)

            # this is not a link
            if not link_info.netloc:
                continue

            if self._settings['domain'] not in link:
                continue

            # set highest priority if link matches any regex from "valid_links" list
            if self._is_valid(link):
                priority = SyncStorage.PRIORITY_HIGH

            link_hash = link if not self._settings['hashed'] else sha1(link.encode('utf-8')).hexdigest()[:6]

            # locking
            with self._lock:
                self._storage.add(link, link_hash, priority)

    def _is_excluded(self, link):

        """
        Checks if link matches any excluded regex.
        """

        link_path = ''
        if self._settings['respect_robots_txt']:
            link_path = '{uri.path}?{uri.query}#{uri.fragment}'.format(uri=parse.urlparse(link))

        for regex in self._regexes['exclude_links']:
            if regex.match(link) or regex.match(link_path):
                return True

        return False

    def _get_matches(self, link):

        """
        Returns matches if the link is valid
        """

        for regex in self._regexes['valid_links']:
            matches = regex.findall(link)
            if matches:
                return matches
        return []

    def _is_valid(self, link):

        """
        Checks if link matches any regex from the "valid_links" list
        """

        return self._get_matches(link)

    def _is_silent(self, link):

        """
        Checks if the link is silent
        """

        for regex in self._regexes['silent_links']:
            is_silent = regex.search(link)
            if is_silent:
                return True
        return False


class Cookies(object):
    """ Handles HTTP cookies parsing """

    # unnecessary cookie fields
    __meta_fields = [
        'expires',
        'path',
        'domain',
        'secure',
        'HttpOnly'
    ]

    @staticmethod
    def parse(headers):

        """
        Parses cookies from response headers.
        """

        cookies = {}
        if 'Set-Cookie' in headers:
            # splitting raw cookies
            raw_cookies = headers['Set-Cookie'].split(';')
            for cookie in raw_cookies:
                cookie = cookie.split('=')
                if cookie[0].strip() not in Cookies.__meta_fields and len(cookie) > 1:
                    cookies.update({cookie[0]: cookie[1]})
        return cookies


class SyncStorage(object):
    """ Stores queued and already-seen URLs in memory. """

    PRIORITY_LOW = 0
    PRIORITY_HIGH = 1

    def __init__(self):
        self._set = set()
        self._list = list()

    def add(self, value, value_hash, priority=PRIORITY_LOW):

        """
        Adds value to storage
        """

        if value_hash in self._set:
            return
        self._list.insert(0, value) if priority == self.PRIORITY_HIGH else self._list.append(value)
        self._set.add(value_hash)

    def pop(self, num=1):

        """
        Pops values from storage
        """

        values = []
        for i in range(0, num):
            try:
                values.append(self._list.pop(0))
            except IndexError:
                break

        return values

    def is_parsed(self, value):

        """
        Checks if value has been already parsed
        """

        return value in self._set


class RedirectHandler(request.HTTPRedirectHandler):
    """ Custom URL redirects handler. """

    def http_error_302(self, req, fp, code, msg, headers):
        return fp


class DummyLock(object):
    """ Dummy lock object """

    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os

from setuptools import setup, find_packages

import pholcidae2

setup(
    name='pholcidae2',
    version=pholcidae2.__version__,
    packages=find_packages(os.path.dirname(os.path.abspath(__file__)))
)

--------------------------------------------------------------------------------