├── doc ├── _build │ ├── html │ │ ├── .nojekyll │ │ ├── _static │ │ │ ├── custom.css │ │ │ ├── down.png │ │ │ ├── file.png │ │ │ ├── plus.png │ │ │ ├── up.png │ │ │ ├── minus.png │ │ │ ├── comment.png │ │ │ ├── structure.jpg │ │ │ ├── ajax-loader.gif │ │ │ ├── structure.vsdx │ │ │ ├── up-pressed.png │ │ │ ├── comment-bright.png │ │ │ ├── comment-close.png │ │ │ ├── down-pressed.png │ │ │ ├── ~$$structure.~vsdx │ │ │ ├── pygments.css │ │ │ └── nature.css │ │ ├── _sources │ │ │ ├── last.rst.txt │ │ │ ├── spider.rst.txt │ │ │ ├── downloader.rst.txt │ │ │ ├── downloader_middlewares.rst.txt │ │ │ ├── schedular.rst.txt │ │ │ ├── index.rst.txt │ │ │ ├── structure.rst.txt │ │ │ ├── http.rst.txt │ │ │ ├── prepare.rst.txt │ │ │ ├── settings.rst.txt │ │ │ ├── ssettings.rst.txt │ │ │ ├── intro.rst.txt │ │ │ └── tutorial.rst.txt │ │ ├── debug.log │ │ ├── objects.inv │ │ ├── _images │ │ │ └── structure.jpg │ │ ├── .buildinfo │ │ ├── genindex.html │ │ ├── search.html │ │ ├── last.html │ │ ├── spider.html │ │ ├── downloader_middlewares.html │ │ ├── downloader.html │ │ ├── schedular.html │ │ ├── structure.html │ │ ├── prepare.html │ │ ├── http.html │ │ └── index.html │ └── doctrees │ │ ├── http.doctree │ │ ├── last.doctree │ │ ├── index.doctree │ │ ├── intro.doctree │ │ ├── spider.doctree │ │ ├── prepare.doctree │ │ ├── schedular.doctree │ │ ├── settings.doctree │ │ ├── ssettings.doctree │ │ ├── structure.doctree │ │ ├── tutorial.doctree │ │ ├── downloader.doctree │ │ ├── environment.pickle │ │ └── downloader_middlewares.doctree ├── spider.rst ├── downloader.rst ├── _static │ ├── structure.jpg │ └── structure.vsdx ├── downloader_middlewares.rst ├── last.rst ├── Makefile ├── schedular.rst ├── index.rst ├── make.bat ├── structure.rst ├── http.rst ├── prepare.rst ├── settings.rst ├── conf.py └── tutorial.rst ├── tests ├── __init__.py ├── http │ ├── __init__.py │ ├── test_http_request.py │ └── test_http_response.py ├── utils │ ├── __init__.py │ ├── test_utils_log.py │ ├── test_utils_hash.py │ └── test_utils_datatypes.py ├── test_data │ ├── __init__.py │ └── test_settings_data.py ├── test_conf_settings.py ├── test_scheduler.py ├── test_downloader_middlewares.py └── test_downloader.py ├── pycreeper ├── conf │ ├── __init__.py │ ├── settings.py │ └── default_settings.py ├── http │ ├── __init__.py │ ├── response.py │ └── request.py ├── __init__.py ├── utils │ ├── gevent_wrapper.py │ ├── hash.py │ ├── exceptions.py │ ├── log.py │ ├── datatypes.py │ └── __init__.py ├── spider.py ├── downloader_middlewares │ ├── cookies_middlewares.py │ ├── __init__.py │ └── middlewares.py ├── scheduler.py ├── engine.py └── downloader │ └── __init__.py ├── setup.py ├── examples ├── zhihu_spider.py └── jd_spider.py └── README.md /doc/_build/html/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/spider.rst: -------------------------------------------------------------------------------- 1 | spider:爬虫 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/downloader.rst: -------------------------------------------------------------------------------- 1 | downloader:下载器 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally 
left blank. */ 2 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/last.rst.txt: -------------------------------------------------------------------------------- 1 | 写在最后 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/spider.rst.txt: -------------------------------------------------------------------------------- 1 | spider:爬虫 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/downloader.rst.txt: -------------------------------------------------------------------------------- 1 | downloader:下载器 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/debug.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/debug.log -------------------------------------------------------------------------------- /doc/_build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/objects.inv -------------------------------------------------------------------------------- /doc/_static/structure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_static/structure.jpg -------------------------------------------------------------------------------- /doc/_static/structure.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_static/structure.vsdx -------------------------------------------------------------------------------- /doc/downloader_middlewares.rst: -------------------------------------------------------------------------------- 1 | downloader_middlewares:下载器中间件 2 | ======================================= 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/doctrees/http.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/http.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/last.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/last.doctree -------------------------------------------------------------------------------- /doc/_build/html/_static/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/down.png -------------------------------------------------------------------------------- /doc/_build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/file.png -------------------------------------------------------------------------------- /doc/_build/html/_static/plus.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/plus.png -------------------------------------------------------------------------------- /doc/_build/html/_static/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/up.png -------------------------------------------------------------------------------- /doc/_build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/index.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/intro.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/intro.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/spider.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/spider.doctree -------------------------------------------------------------------------------- /doc/_build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/minus.png -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' -------------------------------------------------------------------------------- /doc/_build/doctrees/prepare.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/prepare.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/schedular.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/schedular.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/settings.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/settings.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/ssettings.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/ssettings.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/structure.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/structure.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/tutorial.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/tutorial.doctree -------------------------------------------------------------------------------- /doc/_build/html/_images/structure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_images/structure.jpg -------------------------------------------------------------------------------- /doc/_build/html/_static/comment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/comment.png -------------------------------------------------------------------------------- /doc/_build/html/_static/structure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/structure.jpg -------------------------------------------------------------------------------- /tests/http/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' -------------------------------------------------------------------------------- /doc/_build/doctrees/downloader.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/downloader.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/environment.pickle -------------------------------------------------------------------------------- /doc/_build/html/_static/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/ajax-loader.gif -------------------------------------------------------------------------------- /doc/_build/html/_static/structure.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/structure.vsdx -------------------------------------------------------------------------------- /doc/_build/html/_static/up-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/up-pressed.png -------------------------------------------------------------------------------- /pycreeper/conf/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' -------------------------------------------------------------------------------- /doc/_build/html/_sources/downloader_middlewares.rst.txt: 
-------------------------------------------------------------------------------- 1 | downloader_middlewares:下载器中间件 2 | ======================================= 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/_static/comment-bright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/comment-bright.png -------------------------------------------------------------------------------- /doc/_build/html/_static/comment-close.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/comment-close.png -------------------------------------------------------------------------------- /doc/_build/html/_static/down-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/down-pressed.png -------------------------------------------------------------------------------- /doc/_build/html/_static/~$$structure.~vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/~$$structure.~vsdx -------------------------------------------------------------------------------- /pycreeper/http/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | -------------------------------------------------------------------------------- /tests/test_data/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | -------------------------------------------------------------------------------- /doc/_build/doctrees/downloader_middlewares.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/downloader_middlewares.doctree -------------------------------------------------------------------------------- /pycreeper/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | VERSION = (0, 0, 1) 6 | -------------------------------------------------------------------------------- /doc/_build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
3 | config: a4bea7a19f3fdfa82050b591f2231270 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /doc/last.rst: -------------------------------------------------------------------------------- 1 | 写在最后 2 | ============================ 3 | PyCreeper旨在提高爬虫爱好者爬取动态页面的效率,在使用时,如果您遇到各种问题,我们都欢迎您反馈给我们,您可以通过github, 4 | 项目主页:https://github.com/ZcyAndWt/pyCreeper,也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 5 | 6 | 未来我们将引入通过Docker安装的支持。 7 | 8 | 如果您觉得PyCreeper减少了您的工作量,提高了您的开发效率,希望您能在Github上给我们star。您的好评是我们前进的动力! 9 | -------------------------------------------------------------------------------- /pycreeper/utils/gevent_wrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import gevent 6 | 7 | 8 | def spawn(func, *args, **kwargs): 9 | return gevent.spawn(func, *args, **kwargs) 10 | 11 | 12 | def join_all(funcs): 13 | gevent.joinall(funcs) 14 | -------------------------------------------------------------------------------- /tests/test_data/test_settings_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ test settings """ 6 | 7 | TEST_INT = 10 8 | 9 | TEST_JSON = '{"foo": ["bar", "baz"]}' 10 | 11 | TEST_STR = 'foo,bar,baz' 12 | 13 | TEST_DICT = { 14 | "foo": "bar" 15 | } 16 | 17 | TEST_LIST = [ 18 | "foo", 19 | "bar", 20 | "baz" 21 | ] 22 | 23 | TEST_FLOAT = 9.11 24 | 25 | test_lowercase = True 26 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = PyCreeper 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
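# For example, "make html" (or "make.bat html" on Windows) is routed through
# the catch-all target below to sphinx-build's make mode, which renders the
# .rst sources into _build/html, the same HTML tree kept under
# doc/_build/html in this repository.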
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /pycreeper/utils/hash.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import urllib 6 | from urlparse import parse_qsl, urlparse, urlunparse 7 | import hashlib 8 | 9 | 10 | def request_fingerprint(request): 11 | """request fingerprint 12 | """ 13 | scheme, netloc, path, params, query, fragment = urlparse(request.url) 14 | keyvals = parse_qsl(query) 15 | keyvals.sort() 16 | query = urllib.urlencode(keyvals) 17 | canonicalize_url = urlunparse(( 18 | scheme, netloc.lower(), path, params, query, fragment)) 19 | fpr = hashlib.sha1() 20 | fpr.update(canonicalize_url) 21 | return fpr.hexdigest() -------------------------------------------------------------------------------- /doc/schedular.rst: -------------------------------------------------------------------------------- 1 | schedular:调度器 2 | ============================ 3 | 4 | 调度器实现的核心是gevent之中的Queue和布隆过滤器 5 | (Wiki: https://en.wikipedia.org/wiki/Bloom_filter)。 6 | 其中,Queue保证了多个Downloader协程读取队列时的协程安全,布隆过滤器则提供了url去重功能。 7 | 8 | 将请求入队:enqueue_request(request) 9 | -------------------------------------------------- 10 | 11 | request入队时,首先使用布隆过滤器检查url是否已经抓取过。如果没有抓取过则直接入队, 12 | 如果抓取过,则会输出一条logging.DEBUG信息,表示忽略了这个url。 13 | 14 | 取得队列中的请求:next_request() 15 | ----------------------------------------------- 16 | 17 | 这个方法将会从Queue中取出一条request。如果在 **custom_settings** 中设置了 **DOWNLOAD_DELAY** 18 | 项目的话,每次取出request会等待一个固定的时间。 19 | 20 | PyCreeper将 **TIMEOUT** 值的3倍作为检验爬虫结束的标志。具体是指,如果3*TIMEOUT时间之内Queue为空的话, 21 | 那么则认为爬取任务全部结束,爬虫退出。 22 | 23 | -------------------------------------------------------------------------------- /pycreeper/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | class PycreeperException(Exception): 6 | """ 7 | Base pycreeper exception. 
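    For illustration (hypothetical values): str(PycreeperException("boom",
    stacktrace=["frame 1", "frame 2"])) evaluates to a message whose first
    line is "Message: boom", followed by "Stacktrace:" and the two frames,
    as built by __str__ below.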
8 | """ 9 | 10 | def __init__(self, msg=None, stacktrace=None): 11 | self.msg = msg 12 | self.stacktrace = stacktrace 13 | 14 | def __str__(self): 15 | exception_msg = "Message: %s\n" % self.msg 16 | if self.stacktrace is not None: 17 | stacktrace = "\n".join(self.stacktrace) 18 | exception_msg += "Stacktrace:\n%s" % stacktrace 19 | return exception_msg 20 | 21 | 22 | class TimeoutException(PycreeperException): 23 | pass 24 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/schedular.rst.txt: -------------------------------------------------------------------------------- 1 | schedular:调度器 2 | ============================ 3 | 4 | 调度器实现的核心是gevent之中的Queue和布隆过滤器 5 | (Wiki: https://en.wikipedia.org/wiki/Bloom_filter)。 6 | 其中,Queue保证了多个Downloader协程读取队列时的协程安全,布隆过滤器则提供了url去重功能。 7 | 8 | 将请求入队:enqueue_request(request) 9 | -------------------------------------------------- 10 | 11 | request入队时,首先使用布隆过滤器检查url是否已经抓取过。如果没有抓取过则直接入队, 12 | 如果抓取过,则会输出一条logging.DEBUG信息,表示忽略了这个url。 13 | 14 | 取得队列中的请求:next_request() 15 | ----------------------------------------------- 16 | 17 | 这个方法将会从Queue中取出一条request。如果在 **custom_settings** 中设置了 **DOWNLOAD_DELAY** 18 | 项目的话,每次取出request会等待一个固定的时间。 19 | 20 | PyCreeper将 **TIMEOUT** 值的3倍作为检验爬虫结束的标志。具体是指,如果3*TIMEOUT时间之内Queue为空的话, 21 | 那么则认为爬取任务全部结束,爬虫退出。 22 | 23 | -------------------------------------------------------------------------------- /pycreeper/utils/log.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | 6 | import logging 7 | 8 | 9 | def get_logger(settings, name='pyCreeperLogger'): 10 | """Create a Logger 11 | """ 12 | log_level = getattr(logging, settings.get('LOG_LEVEL'), None) 13 | if not log_level: 14 | raise ValueError('Invaild LOG_LEVE. Please check your settings.py.') 15 | logger = logging.getLogger(name) 16 | logger.setLevel(log_level) 17 | stream = logging.StreamHandler() 18 | stream.setLevel(log_level) 19 | formatter = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s") 20 | stream.setFormatter(formatter) 21 | logger.addHandler(stream) 22 | return logger 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="pycreeper", 5 | version="1.0.0", 6 | description='''A web crawler that is able to crawl dynamic web page.''', 7 | author="zcy", 8 | author_email="zhengchenyu.backend@gmail.com", 9 | url="https://github.com/ZcyAndWt/pyCreeper", 10 | license="LGPL", 11 | packages=find_packages(exclude=('doc', 'doc.*', 'tests', 12 | 'tests.*', 'examples', 'examples.*')), 13 | install_requires=[ 14 | 'gevent>=1.2.1', 15 | 'importlib>=1.0.4', 16 | 'requests>=2.8.1', 17 | 'chardet>=2.3.0', 18 | 'w3lib>=1.16.0', 19 | 'six>=1.9.0', 20 | 'pybloom>=1.1', 21 | 'selenium>=2.48.0' 22 | ], 23 | ) 24 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. PyCreeper documentation master file, created by 2 | sphinx-quickstart on Sat Mar 18 20:46:54 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | PyCreeper: 抓取你能看到的一切! 
7 | ================================= 8 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目通过控制 **Selenium.WebDriver** 实现对网页的动态加载与控制, 9 | 希望可以减少爬虫爱好者分析网页源码,抓取http包,分析Cookies等诸多不便。 10 | 11 | 项目主页:https://github.com/ZcyAndWt/pyCreeper 12 | 13 | 作者邮箱:zhengchenyu.backend@gmail.com 14 | 15 | 项目使用过程中,当您发现任何问题或感受到任何不快,请及时联系我们! 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | 20 | 21 | tutorial 22 | prepare 23 | structure 24 | settings 25 | http 26 | downloader 27 | downloader_middlewares 28 | schedular 29 | spider 30 | last 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. PyCreeper documentation master file, created by 2 | sphinx-quickstart on Sat Mar 18 20:46:54 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | PyCreeper: 抓取你能看到的一切! 7 | ================================= 8 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目通过控制 **Selenium.WebDriver** 实现对网页的动态加载与控制, 9 | 希望可以减少爬虫爱好者分析网页源码,抓取http包,分析Cookies等诸多不便。 10 | 11 | 项目主页:https://github.com/ZcyAndWt/pyCreeper 12 | 13 | 作者邮箱:zhengchenyu.backend@gmail.com 14 | 15 | 项目使用过程中,当您发现任何问题或感受到任何不快,请及时联系我们! 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | 20 | 21 | tutorial 22 | prepare 23 | structure 24 | settings 25 | http 26 | downloader 27 | downloader_middlewares 28 | schedular 29 | spider 30 | last 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=PyCreeper 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /tests/utils/test_utils_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | 6 | import unittest 7 | import logging 8 | 9 | from pycreeper.conf.settings import Settings 10 | from pycreeper.utils.log import get_logger 11 | 12 | 13 | class SettingsTest(unittest.TestCase): 14 | 15 | def test_get_logger(self): 16 | settings = Settings() 17 | logger = get_logger(settings, 'testLogger') 18 | self.assertEqual(logger.level, logging.DEBUG) 19 | 20 | settings.set('LOG_LEVEL', 'INFO') 21 | logger = get_logger(settings, 'testLogger') 22 | self.assertEqual(logger.level, logging.INFO) 23 | 24 | settings.set('LOG_LEVEL', 'foo') 25 | self.assertRaises(ValueError, get_logger, settings, 'testLogger') 26 | 27 | self.assertEqual(logger.name, 'testLogger') 28 | 29 | 30 | 31 | if __name__ == "__main__": 32 | unittest.main() -------------------------------------------------------------------------------- /doc/structure.rst: -------------------------------------------------------------------------------- 1 | 架构概览 2 | ========== 3 | PyCreeper的整体架构可以分为引擎,下载器,下载器中间件,调度器,爬虫五个部分。 4 | 在各个部分之间传递的数据为Request/Response对象。 5 | 6 | 数据的流动方向如下图的绿色箭头所示。 7 | 8 | 各个部分的功能简述 9 | -------------------- 10 | 11 | .. image:: _static/structure.jpg 12 | 13 | ------------------------------------ 14 | 15 | **引擎** 是PyCreeper的核心部分,负责调度各个部分的工作。引擎在内部的实现为gevent.Pool。 16 | 17 | **下载器** 负责下载request请求,在这里将静态请求与动态请求分别处理,静态请求使用requests库实现, 18 | 动态请求使用selenium.webdriver实现。在请求完成后,将响应返回给引擎。 19 | 20 | **下载器中间件** 可以理解为存在于下载器和引擎之间的钩子系统,可以通过自定义下载器中间件完成对request和response的特殊处理。 21 | 22 | **调度器** 调度器实现的核心为gevent中的Queue和布隆过滤器,通过对requests进行判重,非重复请求入队,等待引擎取走处理。 23 | 24 | **爬虫** 爬虫相当于对用户定义的接口,由用户来定义起始的url,对于各个request的callback以及对于爬取结果的处理方法。 25 | 26 | 数据流动过程 27 | ------------- 28 | 29 | 数据流动的过程如下面各个步骤所示: 30 | 31 | #. 引擎启动,将爬虫中的start_urls加入到调度器中。 32 | 33 | #. 引擎从调度器中取得一个request。 34 | 35 | #. 引擎将请求交给下载器处理,中间经过了下载器中间件对于request的处理。 36 | 37 | #. 下载器根据request的类型分别操作,静态请求交给requests库,动态请求使用selenium.webdriver加载。 38 | 39 | #. 下载器将response返回给引擎,中间经过下载器中间件对response的处理。 40 | 41 | #. 引擎将response交给爬虫定义的处理方法。 42 | 43 | #. 爬虫的处理方法可能返回一个request(转2),或者返回一个包含爬取结果的字典(转下一个)。 44 | 45 | #. 引擎根据爬虫定义的对于爬取结果的处理方法,处理结果。 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/structure.rst.txt: -------------------------------------------------------------------------------- 1 | 架构概览 2 | ========== 3 | PyCreeper的整体架构可以分为引擎,下载器,下载器中间件,调度器,爬虫五个部分。 4 | 在各个部分之间传递的数据为Request/Response对象。 5 | 6 | 数据的流动方向如下图的绿色箭头所示。 7 | 8 | 各个部分的功能简述 9 | -------------------- 10 | 11 | .. 
image:: _static/structure.jpg 12 | 13 | ------------------------------------ 14 | 15 | **引擎** 是PyCreeper的核心部分,负责调度各个部分的工作。引擎在内部的实现为gevent.Pool。 16 | 17 | **下载器** 负责下载request请求,在这里将静态请求与动态请求分别处理,静态请求使用requests库实现, 18 | 动态请求使用selenium.webdriver实现。在请求完成后,将响应返回给引擎。 19 | 20 | **下载器中间件** 可以理解为存在于下载器和引擎之间的钩子系统,可以通过自定义下载器中间件完成对request和response的特殊处理。 21 | 22 | **调度器** 调度器实现的核心为gevent中的Queue和布隆过滤器,通过对requests进行判重,非重复请求入队,等待引擎取走处理。 23 | 24 | **爬虫** 爬虫相当于对用户定义的接口,由用户来定义起始的url,对于各个request的callback以及对于爬取结果的处理方法。 25 | 26 | 数据流动过程 27 | ------------- 28 | 29 | 数据流动的过程如下面各个步骤所示: 30 | 31 | #. 引擎启动,将爬虫中的start_urls加入到调度器中。 32 | 33 | #. 引擎从调度器中取得一个request。 34 | 35 | #. 引擎将请求交给下载器处理,中间经过了下载器中间件对于request的处理。 36 | 37 | #. 下载器根据request的类型分别操作,静态请求交给requests库,动态请求使用selenium.webdriver加载。 38 | 39 | #. 下载器将response返回给引擎,中间经过下载器中间件对response的处理。 40 | 41 | #. 引擎将response交给爬虫定义的处理方法。 42 | 43 | #. 爬虫的处理方法可能返回一个request(转2),或者返回一个包含爬取结果的字典(转下一个)。 44 | 45 | #. 引擎根据爬虫定义的对于爬取结果的处理方法,处理结果。 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /doc/http.rst: -------------------------------------------------------------------------------- 1 | request对象和response对象 2 | ============================ 3 | 4 | request对象和response对象负责在各个PyCreeper组件之间传递信息,您在使用爬虫的过程中,会经常需要对这两个对象进行操作。 5 | 6 | Request:自定义您的请求 7 | ----------------------------- 8 | 9 | 构造参数:: 10 | 11 | Request(url, callback=None, method='GET', headers=None,body=None, meta=None, 12 | encoding='utf-8', cookiejar=None,dynamic=False, browser_actions=None, wait=0) 13 | 14 | **url** 15 | 16 | 请求的url 17 | 18 | **callback** 19 | 20 | 请求的回调函数,如果未定义则使用Spider.parse方法处理响应。 21 | 22 | **method** 23 | 24 | 支持GET型和POST型请求方法,其中,POST方法只有当dynamic=False时才会被支持, 25 | 如果dynamic=True将会抛出一个AttributeError。 26 | 27 | **headers** 28 | 29 | 该参数可以传入一个字典(dict),用于静态请求的头部信息。 30 | 31 | **body** 32 | 33 | 该参数用于静态请求的请求体。 34 | 35 | **meta** 36 | 37 | 该参数为字典(dict)型,用于给request携带一些参数,这些参数可能在其他模块用到。 38 | 39 | **encoding** 40 | 41 | 请求的编码方式,用于给url和body编码。 42 | 43 | **cookiejar** 44 | 45 | 该参数用于取出request携带的cookiejar,在构造request对象时请不要向该参数传入值,传入的cookiejar不会被PyCreeper使用到。 46 | 47 | **dynamic** 48 | 49 | 该参数用于标记request是否是动态请求。 50 | 51 | **browser_actions** 52 | 53 | 该参数用于定义浏览器打开指定网址之后,到提取数据之前,执行的一系列操作。该参数可以传入一个函数列表。 54 | 55 | **wait** 56 | 57 | 该参数用于定义浏览器打开指定网址之后,到执行browser_actions中定义的函数之前,等待的时间。 58 | 当网页存在大量异步加载请求的时候,这个参数格外有用。 -------------------------------------------------------------------------------- /doc/_build/html/_sources/http.rst.txt: -------------------------------------------------------------------------------- 1 | request对象和response对象 2 | ============================ 3 | 4 | request对象和response对象负责在各个PyCreeper组件之间传递信息,您在使用爬虫的过程中,会经常需要对这两个对象进行操作。 5 | 6 | Request:自定义您的请求 7 | ----------------------------- 8 | 9 | 构造参数:: 10 | 11 | Request(url, callback=None, method='GET', headers=None,body=None, meta=None, 12 | encoding='utf-8', cookiejar=None,dynamic=False, browser_actions=None, wait=0) 13 | 14 | **url** 15 | 16 | 请求的url 17 | 18 | **callback** 19 | 20 | 请求的回调函数,如果未定义则使用Spider.parse方法处理响应。 21 | 22 | **method** 23 | 24 | 支持GET型和POST型请求方法,其中,POST方法只有当dynamic=False时才会被支持, 25 | 如果dynamic=True将会抛出一个AttributeError。 26 | 27 | **headers** 28 | 29 | 该参数可以传入一个字典(dict),用于静态请求的头部信息。 30 | 31 | **body** 32 | 33 | 该参数用于静态请求的请求体。 34 | 35 | **meta** 36 | 37 | 该参数为字典(dict)型,用于给request携带一些参数,这些参数可能在其他模块用到。 38 | 39 | **encoding** 40 | 41 | 请求的编码方式,用于给url和body编码。 42 | 43 | **cookiejar** 44 | 45 | 该参数用于取出request携带的cookiejar,在构造request对象时请不要向该参数传入值,传入的cookiejar不会被PyCreeper使用到。 46 | 47 | 
**dynamic** 48 | 49 | 该参数用于标记request是否是动态请求。 50 | 51 | **browser_actions** 52 | 53 | 该参数用于定义浏览器打开指定网址之后,到提取数据之前,执行的一系列操作。该参数可以传入一个函数列表。 54 | 55 | **wait** 56 | 57 | 该参数用于定义浏览器打开指定网址之后,到执行browser_actions中定义的函数之前,等待的时间。 58 | 当网页存在大量异步加载请求的时候,这个参数格外有用。 -------------------------------------------------------------------------------- /doc/prepare.rst: -------------------------------------------------------------------------------- 1 | 使用前的准备 2 | ============== 3 | 我们假定您已经安装了Python2.7及以上版本,若没有安装,请参考Python官网(https://www.python.org/)选择合适的版本进行安装。 4 | 5 | PyCreeper对于以下几个库存在依赖关系: 6 | 7 | * gevent 8 | * importlib 9 | * requests 10 | * chardet 11 | * w3lib 12 | * six 13 | * pybloom 14 | * Selenium 15 | 16 | 当然,如果您选择使用pip安装本项目,那么依赖库会自动安装到您的电脑内(至少理论上会是这样)。 17 | 18 | 使用pip安装项目:: 19 | 20 | pip install pycreeper 21 | 22 | 配置Selenium Driver 23 | --------------------- 24 | 当您希望调用指定的浏览器时,Selenium需要您安装指定浏览器的接口。 25 | 举例来说,如果您希望使用Chrome加载请求,您需要下载安装 *Chromedriver* (https://sites.google.com/a/chromium.org/chromedriver/downloads), 26 | 然后将该程序放在您的PATH之下,确保Python能访问到它。 27 | 28 | 几个常用的Driver: 29 | 30 | ============== ======================================================================= 31 | 名称 link 32 | ============== ======================================================================= 33 | Chrome https://sites.google.com/a/chromium.org/chromedriver/downloads 34 | Firefox https://github.com/mozilla/geckodriver/releases 35 | PhantomJS http://phantomjs.org/download.html 36 | ============== ======================================================================= 37 | 38 | 其中,PhantomJS是一款无界面化WebKit,当您在无GUI设备的情况下,该浏览器是您最好的选择。 39 | 40 | 对于Selenium更详细的配置,请参考 http://selenium-python.readthedocs.io/ 41 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/prepare.rst.txt: -------------------------------------------------------------------------------- 1 | 使用前的准备 2 | ============== 3 | 我们假定您已经安装了Python2.7及以上版本,若没有安装,请参考Python官网(https://www.python.org/)选择合适的版本进行安装。 4 | 5 | PyCreeper对于以下几个库存在依赖关系: 6 | 7 | * gevent 8 | * importlib 9 | * requests 10 | * chardet 11 | * w3lib 12 | * six 13 | * pybloom 14 | * Selenium 15 | 16 | 当然,如果您选择使用pip安装本项目,那么依赖库会自动安装到您的电脑内(至少理论上会是这样)。 17 | 18 | 使用pip安装项目:: 19 | 20 | pip install pycreeper 21 | 22 | 配置Selenium Driver 23 | --------------------- 24 | 当您希望调用指定的浏览器时,Selenium需要您安装指定浏览器的接口。 25 | 举例来说,如果您希望使用Chrome加载请求,您需要下载安装 *Chromedriver* (https://sites.google.com/a/chromium.org/chromedriver/downloads), 26 | 然后将该程序放在您的PATH之下,确保Python能访问到它。 27 | 28 | 几个常用的Driver: 29 | 30 | ============== ======================================================================= 31 | 名称 link 32 | ============== ======================================================================= 33 | Chrome https://sites.google.com/a/chromium.org/chromedriver/downloads 34 | Firefox https://github.com/mozilla/geckodriver/releases 35 | PhantomJS http://phantomjs.org/download.html 36 | ============== ======================================================================= 37 | 38 | 其中,PhantomJS是一款无界面化WebKit,当您在无GUI设备的情况下,该浏览器是您最好的选择。 39 | 40 | 对于Selenium更详细的配置,请参考 http://selenium-python.readthedocs.io/ 41 | -------------------------------------------------------------------------------- /tests/utils/test_utils_hash.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | 7 | from pycreeper.utils.hash import 
request_fingerprint 8 | from pycreeper.http.request import Request 9 | 10 | __doctests__ = ['pycreeper.utils.hash'] 11 | 12 | URLS = [ 13 | 'http://www.example.com/index.html#print', 14 | 'http://www.example.com/index.html', 15 | 'http://www.xxx.com/index.html?id=77&nameid=2905210001&page=1', 16 | 'http://www.xxxxx.com/index.html?id=77&nameid=2905210001&page=1', 17 | 'http://www.xxxxx.com/index.html?test123123', 18 | 'http://www.xxxxx.com/index.html', 19 | 'ftp://www.xxxxx.com/index.html' 20 | ] 21 | 22 | REQUEST = [Request(url) for url in URLS] 23 | 24 | 25 | class RequestFingerprintTest(unittest.TestCase): 26 | 27 | def test_basic(self): 28 | self.assertRaises(AttributeError, request_fingerprint, None) 29 | self.assertNotEqual(REQUEST[0], REQUEST[1]) 30 | 31 | def test_not_equal(self): 32 | self.assertNotEqual(REQUEST[2], REQUEST[3]) 33 | self.assertNotEqual(REQUEST[3], REQUEST[4]) 34 | self.assertNotEqual(REQUEST[3], REQUEST[4]) 35 | self.assertNotEqual(REQUEST[4], REQUEST[5]) 36 | self.assertNotEqual(REQUEST[5], REQUEST[6]) 37 | 38 | if __name__ == "__main__": 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /pycreeper/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Base Spider""" 6 | 7 | import json 8 | 9 | from pycreeper.conf.settings import Settings 10 | from pycreeper.http.request import Request 11 | from pycreeper.engine import Engine 12 | from pycreeper.utils.log import get_logger 13 | 14 | 15 | class Spider(object): 16 | """ Base Spider""" 17 | 18 | custom_settings = None 19 | 20 | def __init__(self): 21 | if not hasattr(self, "start_urls"): 22 | self.start_urls = [] 23 | # init settings 24 | self.settings = Settings(self.custom_settings) 25 | self.logger = get_logger(self.settings) 26 | self.initialize() 27 | 28 | def initialize(self): 29 | """initialize 30 | """ 31 | pass 32 | 33 | def start_requests(self): 34 | """start_requests 35 | """ 36 | for url in self.start_urls: 37 | yield Request(url) 38 | 39 | def start(self): 40 | """start 41 | """ 42 | engine = Engine(self) 43 | engine.start() 44 | 45 | def parse(self, response): 46 | """parse 47 | """ 48 | raise NotImplementedError 49 | 50 | def process_item(self, item): 51 | """process item 52 | """ 53 | self.logger.debug(json.dumps(item)) 54 | -------------------------------------------------------------------------------- /pycreeper/downloader_middlewares/cookies_middlewares.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | from pycreeper.downloader_middlewares import DownloaderMiddleware 6 | import six 7 | from collections import defaultdict 8 | from pycreeper.utils import _get_cookies_from_cookiejar 9 | from pycreeper.http.response import Response 10 | from cookielib import CookieJar 11 | 12 | 13 | class CookiesMiddleware(DownloaderMiddleware): 14 | """This middleware enables working with sites that need cookies""" 15 | 16 | def __init__(self, settings, logger): 17 | self.jars = defaultdict(CookieJar) 18 | self.settings = settings 19 | self.logger = logger 20 | 21 | def process_request(self, request): 22 | if not request.meta or request.meta.get("cookiejar", None) is None: 23 | return 24 | cookiejarkey = request.meta.get("cookiejar") 25 | jar = 
self.jars[cookiejarkey] 26 | # set CookieJar 27 | request.cookiejar = jar 28 | 29 | def process_response(self, request, response): 30 | if not request.meta or request.meta.get("cookiejar", None) is None: 31 | return response 32 | # extract cookies from response.cookiejar 33 | cookiejarkey = request.meta.get("cookiejar") 34 | jar = self.jars[cookiejarkey] 35 | cookies = _get_cookies_from_cookiejar(response.cookiejar) 36 | for cookie in cookies: 37 | jar.set_cookie(cookie) 38 | return response -------------------------------------------------------------------------------- /pycreeper/scheduler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Scheduler """ 6 | 7 | from gevent.queue import Queue 8 | from pybloom import ScalableBloomFilter 9 | import gevent 10 | from pycreeper.utils.hash import request_fingerprint 11 | 12 | 13 | class Scheduler(object): 14 | """ Scheduler """ 15 | 16 | def __init__(self, spider): 17 | self.request_filter = RequestFilter() 18 | self.queue = Queue() 19 | self.settings = spider.settings 20 | self.timeout = self.settings.get('TIMEOUT', 5) 21 | self.download_delay = self.settings.get('DOWNLOAD_DELAY', 0) 22 | self.logger = spider.logger 23 | 24 | def enqueue_request(self, request): 25 | """put request 26 | """ 27 | if self.request_filter.request_seen(request): 28 | self.logger.debug("ignore %s", request.url) 29 | return 30 | self.queue.put(request) 31 | 32 | def next_request(self): 33 | """next request 34 | """ 35 | gevent.sleep(self.download_delay) 36 | return self.queue.get(timeout=self.timeout * 3) 37 | 38 | def __len__(self): 39 | return self.queue.qsize() 40 | 41 | 42 | class RequestFilter(object): 43 | """ RequestFilter """ 44 | 45 | def __init__(self): 46 | self.sbf = ScalableBloomFilter( 47 | mode=ScalableBloomFilter.SMALL_SET_GROWTH) 48 | 49 | def request_seen(self, request): 50 | """request seen 51 | """ 52 | finger = request_fingerprint(request) 53 | if finger in self.sbf: 54 | return True 55 | self.sbf.add(finger) 56 | return False 57 | -------------------------------------------------------------------------------- /pycreeper/utils/datatypes.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | 6 | class CaselessDict(dict): 7 | __slots__ = () 8 | 9 | def __init__(self, seq=None): 10 | super(CaselessDict, self).__init__() 11 | if seq: 12 | self.update(seq) 13 | 14 | def __getitem__(self, key): 15 | return dict.__getitem__(self, self.normkey(key)) 16 | 17 | def __setitem__(self, key, value): 18 | dict.__setitem__(self, self.normkey(key), self.normvalue(value)) 19 | 20 | def __delitem__(self, key): 21 | dict.__delitem__(self, self.normkey(key)) 22 | 23 | def __contains__(self, key): 24 | return dict.__contains__(self, self.normkey(key)) 25 | 26 | has_key = __contains__ 27 | 28 | def __copy__(self): 29 | return self.__class__(self) 30 | 31 | copy = __copy__ 32 | 33 | def normkey(self, key): 34 | """Method to normalize dictionary key access""" 35 | return key.lower() 36 | 37 | def normvalue(self, value): 38 | """Method to normalize values prior to be setted""" 39 | return value 40 | 41 | def get(self, key, def_val=None): 42 | return dict.get(self, self.normkey(key), self.normvalue(def_val)) 43 | 44 | def setdefault(self, key, def_val=None): 45 | return 
dict.setdefault(self, self.normkey(key), self.normvalue(def_val)) 46 | 47 | def update(self, seq): 48 | seq = seq.items() 49 | iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq) 50 | super(CaselessDict, self).update(iseq) 51 | 52 | @classmethod 53 | def fromkeys(cls, keys, value=None): 54 | return cls((k, value) for k in keys) 55 | 56 | def pop(self, key, *args): 57 | return dict.pop(self, self.normkey(key), *args) 58 | -------------------------------------------------------------------------------- /doc/settings.rst: -------------------------------------------------------------------------------- 1 | settings:项目设置 2 | ===================== 3 | 4 | 这篇文档主要介绍项目的设定(settings)参数和其默认值。 5 | 6 | 如何覆盖项目的默认设定? 7 | -------------------------- 8 | 9 | 可以在您定义的爬虫中设置 **custom_settings** 属性,覆盖掉PyCreeper的默认设定。 10 | 11 | 示例:: 12 | 13 | custom_settings = { 14 | 'DOWNLOADER_MIDDLEWARES': { 15 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 16 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 17 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300, 18 | 'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400 19 | }, 20 | 'DRIVER': 'Chrome', 21 | 'DOWNLOAD_DELAY': 2, 22 | 'USER_AGENT_LIST': [ 23 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 24 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 25 | ] 26 | } 27 | 28 | 设定的可选参数和默认值 29 | --------------------------- 30 | 31 | **LOG_LEVEL** 32 | 33 | 该参数为byte类型,默认值为DEBUG,该参数控制PyCreeper日志的输出等级。 34 | 35 | **RETRY_COUNT** 36 | 37 | 该参数为数值型,默认值为3,表示对于失败请求的最大尝试次数(该参数只对静态请求有效)。 38 | 39 | **RETRY_STATUS_CODES** 40 | 41 | 该参数为list型,默认值为[500, 502, 503, 504, 400, 403, 408],表示返回码在列表中的请求将会被重发(该参数只对静态请求有效)。 42 | 43 | **TIMEOUT** 44 | 45 | 该参数为数值型,默认值为5,表示发出请求定义的超时时间(秒)。 46 | 47 | **MAX_REQUEST_SIZE** 48 | 49 | 该参数为int型,默认值为20,表示可以同时进行的静态请求个数(该参数只对静态请求有效)。 50 | 51 | **USER_AGENT_LIST** 52 | 53 | 该参数为list型,默认值为空列表,表示发送请求时可以携带的User-Agent(需要使用UserAgentMiddleware,该参数只对静态请求有效)。 54 | 55 | **DOWNLOADER_MIDDLEWARES** 56 | 57 | 该参数为dict型,默认值为空字典,表示使用的下载器中间件。字典的key值为希望使用的中间件的reference, 58 | value值为该中间件的优先级,优先级越高的中间件将会越先被使用。 59 | 60 | **DYNAMIC_CRAWL** 61 | 62 | 该参数为bool型,默认值为True,表示引擎是否加载WebDriver。如果在设为False的情况下发出了一系列动态请求,将会引发一系列异常。 63 | 64 | **DRIVER** 65 | 66 | 该参数为byte型,默认值为Firefox,表示PyCreeper使用的Driver类型。可以选择任意一种Selenium支持的Driver,前提是需要配置好Driver的相关环境。 67 | 68 | **DRIVER_INIT_KWARGS** 69 | 70 | 该参数为dict型,默认为空字典,表示启动Driver时传入的参数,您可以通过定义该值修改Driver的属性。 71 | 72 | **DOWNLOAD_DELAY** 73 | 74 | 该参数为数值型,默认值为0,表示下载延迟(秒)。 75 | 76 | **PROXY_INTERVAL** 77 | 78 | 该参数为数值型,默认值为3,表示每个代理使用的最大时间。使用proxy需要搭配ProxyMiddleware, 79 | 并且此处的proxy只对静态请求有效。如果您想配置动态请求的proxy,可以设置DRIVER_INIT_KWARGS参数,在Driver启动时传入配置信息。 80 | 81 | **PROXY_LIST** 82 | 83 | 该参数为list型,默认为空数组,表示请求可以用到的proxy。格式为'IP:端口号'。 84 | 85 | **STATIC_REQUEST_SSL_VERIFY** 86 | 87 | 该参数为bool型,默认值为True,表示发起静态请求是,是否进行ssl认证。 88 | 该参数用于在使用代理的情况下,https认证失败的情况。 -------------------------------------------------------------------------------- /doc/_build/html/_sources/settings.rst.txt: -------------------------------------------------------------------------------- 1 | settings:项目设置 2 | ===================== 3 | 4 | 这篇文档主要介绍项目的设定(settings)参数和其默认值。 5 | 6 | 如何覆盖项目的默认设定? 
7 | -------------------------- 8 | 9 | 可以在您定义的爬虫中设置 **custom_settings** 属性,覆盖掉PyCreeper的默认设定。 10 | 11 | 示例:: 12 | 13 | custom_settings = { 14 | 'DOWNLOADER_MIDDLEWARES': { 15 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 16 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 17 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300, 18 | 'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400 19 | }, 20 | 'DRIVER': 'Chrome', 21 | 'DOWNLOAD_DELAY': 2, 22 | 'USER_AGENT_LIST': [ 23 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 24 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 25 | ] 26 | } 27 | 28 | 设定的可选参数和默认值 29 | --------------------------- 30 | 31 | **LOG_LEVEL** 32 | 33 | 该参数为byte类型,默认值为DEBUG,该参数控制PyCreeper日志的输出等级。 34 | 35 | **RETRY_COUNT** 36 | 37 | 该参数为数值型,默认值为3,表示对于失败请求的最大尝试次数(该参数只对静态请求有效)。 38 | 39 | **RETRY_STATUS_CODES** 40 | 41 | 该参数为list型,默认值为[500, 502, 503, 504, 400, 403, 408],表示返回码在列表中的请求将会被重发(该参数只对静态请求有效)。 42 | 43 | **TIMEOUT** 44 | 45 | 该参数为数值型,默认值为5,表示发出请求定义的超时时间(秒)。 46 | 47 | **MAX_REQUEST_SIZE** 48 | 49 | 该参数为int型,默认值为20,表示可以同时进行的静态请求个数(该参数只对静态请求有效)。 50 | 51 | **USER_AGENT_LIST** 52 | 53 | 该参数为list型,默认值为空列表,表示发送请求时可以携带的User-Agent(需要使用UserAgentMiddleware,该参数只对静态请求有效)。 54 | 55 | **DOWNLOADER_MIDDLEWARES** 56 | 57 | 该参数为dict型,默认值为空字典,表示使用的下载器中间件。字典的key值为希望使用的中间件的reference, 58 | value值为该中间件的优先级,优先级越高的中间件将会越先被使用。 59 | 60 | **DYNAMIC_CRAWL** 61 | 62 | 该参数为bool型,默认值为True,表示引擎是否加载WebDriver。如果在设为False的情况下发出了一系列动态请求,将会引发一系列异常。 63 | 64 | **DRIVER** 65 | 66 | 该参数为byte型,默认值为Firefox,表示PyCreeper使用的Driver类型。可以选择任意一种Selenium支持的Driver,前提是需要配置好Driver的相关环境。 67 | 68 | **DRIVER_INIT_KWARGS** 69 | 70 | 该参数为dict型,默认为空字典,表示启动Driver时传入的参数,您可以通过定义该值修改Driver的属性。 71 | 72 | **DOWNLOAD_DELAY** 73 | 74 | 该参数为数值型,默认值为0,表示下载延迟(秒)。 75 | 76 | **PROXY_INTERVAL** 77 | 78 | 该参数为数值型,默认值为3,表示每个代理使用的最大时间。使用proxy需要搭配ProxyMiddleware, 79 | 并且此处的proxy只对静态请求有效。如果您想配置动态请求的proxy,可以设置DRIVER_INIT_KWARGS参数,在Driver启动时传入配置信息。 80 | 81 | **PROXY_LIST** 82 | 83 | 该参数为list型,默认为空数组,表示请求可以用到的proxy。格式为'IP:端口号'。 84 | 85 | **STATIC_REQUEST_SSL_VERIFY** 86 | 87 | 该参数为bool型,默认值为True,表示发起静态请求是,是否进行ssl认证。 88 | 该参数用于在使用代理的情况下,https认证失败的情况。 -------------------------------------------------------------------------------- /doc/_build/html/_sources/ssettings.rst.txt: -------------------------------------------------------------------------------- 1 | settings:项目设置 2 | ===================== 3 | 4 | 这篇文档主要介绍项目的设定(settings)参数和其默认值。 5 | 6 | 如何覆盖项目的默认设定? 
7 | -------------------------- 8 | 9 | 可以在您定义的爬虫中设置 **custom_settings** 属性,覆盖掉PyCreeper的默认设定。 10 | 11 | 示例:: 12 | 13 | custom_settings = { 14 | 'DOWNLOADER_MIDDLEWARES': { 15 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 16 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 17 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300, 18 | 'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400 19 | }, 20 | 'DRIVER': 'Chrome', 21 | 'DOWNLOAD_DELAY': 2, 22 | 'USER_AGENT_LIST': [ 23 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 24 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 25 | ] 26 | } 27 | 28 | 设定的可选参数和默认值 29 | --------------------------- 30 | 31 | **LOG_LEVEL** 32 | 33 | 该参数为byte类型,默认值为DEBUG,该参数控制PyCreeper日志的输出等级。 34 | 35 | **RETRY_COUNT** 36 | 37 | 该参数为数值型,默认值为3,表示对于失败请求的最大尝试次数(该参数只对静态请求有效)。 38 | 39 | **RETRY_STATUS_CODES** 40 | 41 | 该参数为list型,默认值为[500, 502, 503, 504, 400, 403, 408],表示返回码在列表中的请求将会被重发(该参数只对静态请求有效)。 42 | 43 | **TIMEOUT** 44 | 45 | 该参数为数值型,默认值为5,表示发出请求定义的超时时间(秒)。 46 | 47 | **MAX_REQUEST_SIZE** 48 | 49 | 该参数为int型,默认值为20,表示可以同时进行的静态请求个数(该参数只对静态请求有效)。 50 | 51 | **USER_AGENT_LIST** 52 | 53 | 该参数为list型,默认值为空列表,表示发送请求时可以携带的User-Agent(需要使用UserAgentMiddleware,该参数只对静态请求有效)。 54 | 55 | **DOWNLOADER_MIDDLEWARES** 56 | 57 | 该参数为dict型,默认值为空字典,表示使用的下载器中间件。字典的key值为希望使用的中间件的reference, 58 | value值为该中间件的优先级,优先级越高的中间件将会越先被使用。 59 | 60 | **DYNAMIC_CRAWL** 61 | 62 | 该参数为bool型,默认值为True,表示引擎是否加载WebDriver。如果在设为False的情况下发出了一系列动态请求,将会引发一系列异常。 63 | 64 | **DRIVER** 65 | 66 | 该参数为byte型,默认值为Firefox,表示PyCreeper使用的Driver类型。可以选择任意一种Selenium支持的Driver,前提是需要配置好Driver的相关环境。 67 | 68 | **DRIVER_INIT_KWARGS** 69 | 70 | 该参数为dict型,默认为空字典,表示启动Driver时传入的参数,您可以通过定义该值修改Driver的属性。 71 | 72 | **DOWNLOAD_DELAY** 73 | 74 | 该参数为数值型,默认值为0,表示下载延迟(秒)。 75 | 76 | **PROXY_INTERVAL** 77 | 78 | 该参数为数值型,默认值为3,表示每个代理使用的最大时间。使用proxy需要搭配ProxyMiddleware, 79 | 并且此处的proxy只对静态请求有效。如果您想配置动态请求的proxy,可以设置DRIVER_INIT_KWARGS参数,在Driver启动时传入配置信息。 80 | 81 | **PROXY_LIST** 82 | 83 | 该参数为list型,默认为空数组,表示请求可以用到的proxy。格式为'IP:端口号'。 84 | 85 | **STATIC_REQUEST_SSL_VERIFY** 86 | 87 | 该参数为bool型,默认值为True,表示发起静态请求是,是否进行ssl认证。 88 | 该参数用于在使用代理的情况下,https认证失败的情况。 -------------------------------------------------------------------------------- /examples/zhihu_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import json 6 | 7 | from pycreeper.spider import Spider 8 | from pycreeper.http.request import Request 9 | import gevent 10 | from lxml import etree 11 | 12 | class Zhihu_Spider(Spider): 13 | 14 | custom_settings = { 15 | 'DOWNLOADER_MIDDLEWARES': { 16 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 17 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 18 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 19 | }, 20 | 'DRIVER': 'Chrome', 21 | 'DOWNLOAD_DELAY': 2, 22 | 'STATIC_REQUEST_SSL_VERIFY': False, 23 | 'USER_AGENT_LIST': [ 24 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 25 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 26 | ] 27 | } 28 | 29 | def start_requests(self): 30 | 31 | def _login(driver): 32 | driver.find_element_by_name('account').send_keys("username") 33 | driver.find_element_by_name('password').send_keys("password") 34 | 
driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 35 | gevent.sleep(5) 36 | 37 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 38 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 39 | 40 | def after_login(self, response): 41 | html = response.body 42 | selector = etree.HTML(html) 43 | links = selector.xpath('//a[@class="question_link"]') 44 | for link in links: 45 | yield Request('https://www.zhihu.com' + link.attrib["href"], 46 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 47 | 48 | def get_item(self, response): 49 | html = response.body 50 | selector = etree.HTML(html) 51 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 52 | body = selector.xpath('//span[@class="RichText"]')[0].text 53 | yield { 54 | 'head': head, 55 | 'body': body 56 | } 57 | 58 | def process_item(self, item): 59 | print json.dumps(item, ensure_ascii=False).encode('GBK', 'ignore') 60 | 61 | if __name__ == "__main__": 62 | spider = Zhihu_Spider() 63 | spider.start() 64 | -------------------------------------------------------------------------------- /tests/http/test_http_request.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | 7 | from w3lib.url import safe_url_string 8 | 9 | from pycreeper.http.request import Request 10 | 11 | 12 | class RequestTest(unittest.TestCase): 13 | def test_init(self): 14 | self.assertRaises(Exception, Request) 15 | self.assertRaises(ValueError, Request, 'foo') 16 | request = Request('http://www.example.com/') 17 | assert request.url 18 | assert not request.body 19 | request = Request('http://www.example.com/', 20 | headers={'Content-Type': 'text/html', 21 | 'Content-Length': 1234 22 | }, 23 | method='get' 24 | ) 25 | self.assertEqual(request.method, 'GET') 26 | 27 | def test_copy(self): 28 | request1 = Request('http://www.example.com/', 29 | headers={'Content-Type': 'text/html', 30 | 'Content-Length': 1234 31 | }, 32 | method='get' 33 | ) 34 | request2 = request1.copy() 35 | assert request1.__dict__ == request2.__dict__ 36 | self.assertEqual(request1.headers, request2.headers) 37 | self.assertEqual(request1, request2) 38 | self.assertIsNot(request1, request2) 39 | 40 | def test_url(self): 41 | request = Request('http://www.example.com/') 42 | self.assertIsInstance(request.url, str) 43 | self.assertEqual(request.url, 'http://www.example.com/') 44 | request = Request(u'http://www.example.com?content=测试') 45 | self.assertEqual(request.url, 46 | safe_url_string('http://www.example.com?content=测试')) 47 | self.assertRaises(TypeError, Request, 123) 48 | 49 | def test_body(self): 50 | r1 = Request(url="http://www.example.com/") 51 | assert r1.body == b'' 52 | 53 | r2 = Request(url="http://www.example.com/", body=b"") 54 | assert isinstance(r2.body, bytes) 55 | self.assertEqual(r2.encoding, 'utf-8') # default encoding 56 | 57 | r3 = Request(url="http://www.example.com/", body=u"Price: \xa3100", encoding='utf-8') 58 | assert isinstance(r3.body, bytes) 59 | self.assertEqual(r3.body, b"Price: \xc2\xa3100") 60 | 61 | r4 = Request(url="http://www.example.com/", body=u"Price: \xa3100", encoding='latin1') 62 | assert isinstance(r4.body, bytes) 63 | self.assertEqual(r4.body, b"Price: \xa3100") 64 | -------------------------------------------------------------------------------- /pycreeper/conf/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """" Settings """ 6 | 7 | import json 8 | from importlib import import_module 9 | 10 | from pycreeper.conf import default_settings 11 | 12 | 13 | class Settings(object): 14 | """ Settings Object """ 15 | 16 | def __init__(self, values=None): 17 | self.attrs = {} 18 | self.load_config(default_settings) 19 | if values: 20 | self.load_config(values) 21 | 22 | def __getitem__(self, key): 23 | """__getitem__ 24 | 25 | @key, str, key 26 | """ 27 | return self.attrs[key] if key in self.attrs else None 28 | 29 | def load_config(self, module): 30 | """load config 31 | 32 | @module, module 33 | """ 34 | if isinstance(module, basestring): 35 | module = import_module(module) 36 | for key in module if isinstance(module, dict) else dir(module): 37 | if key.isupper(): 38 | self.set(key, module.get(key) \ 39 | if isinstance(module, dict) else getattr(module, key)) 40 | 41 | def set(self, key, value): 42 | """set 43 | 44 | @key, str, key 45 | @value, str/int/float value 46 | """ 47 | self.attrs[key] = value 48 | 49 | def set_dict(self, values): 50 | """set dict 51 | 52 | @values, dict, values 53 | """ 54 | for key, value in values.iteritems(): 55 | self.set(key, value) 56 | 57 | def get(self, key, default=None): 58 | """get 59 | 60 | @key, str, key 61 | @default, default 62 | """ 63 | return self[key] or default 64 | 65 | def get_int(self, key, default=0): 66 | """get int 67 | 68 | @key, str, key 69 | @default, int 70 | """ 71 | return int(self.get(key, default)) 72 | 73 | def get_float(self, key, default=0.0): 74 | """get float 75 | 76 | @key, str, key 77 | @default, float 78 | """ 79 | return float(self.get(key, default)) 80 | 81 | def get_list(self, key, default=None): 82 | """get list 83 | 84 | @key, str, key 85 | @default, list 86 | """ 87 | value = self.get(key, default or None) 88 | if isinstance(value, basestring): 89 | value = value.split(",") 90 | return value 91 | 92 | def get_dict(self, key, default=None): 93 | """get dict 94 | 95 | @key, str, key 96 | @default, dict 97 | """ 98 | value = self.get(key, default or None) 99 | if isinstance(value, basestring): 100 | value = json.loads(value) 101 | return value 102 | -------------------------------------------------------------------------------- /pycreeper/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import six 6 | 7 | def result2list(result): 8 | """result to list 9 | """ 10 | if result is None: 11 | return [] 12 | if isinstance(result, (dict, basestring)): 13 | return [result] 14 | if hasattr(result, "__iter__"): 15 | return result 16 | 17 | 18 | def call_func(func, errback=None, callback=None, *args, **kwargs): 19 | """执行某个函数,并自动包装异常和回调 20 | 21 | :param func: 22 | :param errback: 23 | :param callback: 24 | :param args: 25 | :param kwargs: 26 | """ 27 | try: 28 | result = func(*args, **kwargs) 29 | except Exception as exc: 30 | if errback: 31 | errback(exc) 32 | else: 33 | if callback: 34 | result = callback(result) 35 | return result 36 | 37 | 38 | def sorted_priority_dict(d): 39 | """Sort the priority dict to a ordered list. 40 | 41 | :param d: A priority dict. 42 | :return: Ordered list. 
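    Illustrative example added for clarity (the dotted paths below are hypothetical,
    not modules of this project). This is the ordering DownloaderMiddlewareManager
    relies on when it loads the DOWNLOADER_MIDDLEWARES setting: entries are returned
    smallest priority value first.

        >>> sorted_priority_dict({'pkg.MiddlewareB': 200, 'pkg.MiddlewareA': 100})
        ['pkg.MiddlewareA', 'pkg.MiddlewareB']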
43 | """ 44 | modules = sorted(d.items(), key=lambda x: x[1]) 45 | modules = [x[0] for x in modules] 46 | return modules 47 | 48 | 49 | def to_unicode(text, encoding=None, errors='strict'): 50 | """Return the unicode representation of a bytes object `text`. If `text` 51 | is already an unicode object, return it as-is.""" 52 | if isinstance(text, six.text_type): 53 | return text 54 | if not isinstance(text, (bytes, six.text_type)): 55 | raise TypeError('to_unicode must receive a bytes, str or unicode ' 56 | 'object, got %s' % type(text).__name__) 57 | if encoding is None: 58 | encoding = 'utf-8' 59 | return text.decode(encoding, errors) 60 | 61 | 62 | def to_bytes(text, encoding=None, errors='strict'): 63 | """Return the binary representation of `text`. If `text` 64 | is already a bytes object, return it as-is.""" 65 | if isinstance(text, bytes): 66 | return text 67 | if not isinstance(text, six.string_types): 68 | raise TypeError('to_bytes must receive a unicode, str or bytes ' 69 | 'object, got %s' % type(text).__name__) 70 | if encoding is None: 71 | encoding = 'utf-8' 72 | return text.encode(encoding, errors) 73 | 74 | 75 | def to_native_str(text, encoding=None, errors='strict'): 76 | """ Return str representation of `text` 77 | (bytes in Python 2.x and unicode in Python 3.x). """ 78 | if six.PY2: 79 | return to_bytes(text, encoding, errors) 80 | else: 81 | return to_unicode(text, encoding, errors) 82 | 83 | 84 | def _get_cookies_from_cookiejar(cj): 85 | result = [] 86 | for domain in cj._cookies.keys(): 87 | for path in cj._cookies[domain].keys(): 88 | for cookie in cj._cookies[domain][path].values(): 89 | result.append(cookie) 90 | return result 91 | -------------------------------------------------------------------------------- /pycreeper/conf/default_settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ default config settings """ 6 | 7 | LOG_LEVEL = 'DEBUG' 8 | 9 | RETRY_COUNT = 3 10 | 11 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408] 12 | 13 | TIMEOUT = 5 14 | 15 | MAX_REQUEST_SIZE = 20 16 | 17 | USER_AGENT_LIST = [ 18 | 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31', 19 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17', 20 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17', 21 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)', 22 | 'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)', 23 | 'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)', 24 | 'Mozilla/6.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1', 25 | 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1', 26 | 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:15.0) Gecko/20120910144328 Firefox/15.0.2', 27 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201', 28 | 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9a3pre) Gecko/20070330', 29 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.13; ) Gecko/20101203', 30 | 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14', 31 | 'Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50', 32 | 
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52', 33 | 'Mozilla/5.0 (Windows; U; Win 9x 4.90; SG; rv:1.9.2.4) Gecko/20101104 Netscape/9.1.0285', 34 | 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.1.7pre) Gecko/20070815 Firefox/2.0.0.6 Navigator/9.0b3', 35 | 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6', 36 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36", 37 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)", 38 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)", 39 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 40 | "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)", 41 | "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13", 42 | "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3", 43 | ] 44 | 45 | DOWNLOADER_MIDDLEWARES = {} 46 | 47 | DYNAMIC_CRAWL = True 48 | 49 | DRIVER = 'Firefox' 50 | 51 | DRIVER_INIT_KWARGS = {} 52 | 53 | DOWNLOAD_DELAY = 0 54 | 55 | PROXY_INTERVAL = 3 56 | 57 | PROXY_LIST = [] 58 | 59 | STATIC_REQUEST_SSL_VERIFY = True 60 | -------------------------------------------------------------------------------- /tests/test_conf_settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | 7 | from pycreeper.conf.settings import Settings 8 | from tests.test_data import test_settings_data 9 | 10 | CONF_PATH = 'tests.test_data.test_settings_data' 11 | 12 | 13 | class SettingsTest(unittest.TestCase): 14 | def test_basics(self): 15 | settings = Settings() 16 | self.assertEqual(settings['RETRY_COUNT'], 3) 17 | settings = Settings(test_settings_data) 18 | self.assertEqual(settings['TEST_INT'], 10) 19 | 20 | def test_get_item(self): 21 | settings = Settings(test_settings_data) 22 | self.assertEqual(settings['TEST_STR'], 'foo,bar,baz') 23 | self.assertEqual(settings['TEST_DICT'], {"foo": "bar"}) 24 | 25 | def test_load_config(self): 26 | settings = Settings(test_settings_data) 27 | self.assertEqual(settings['TEST_STR'], 'foo,bar,baz') 28 | settings = Settings(CONF_PATH) 29 | self.assertEqual(settings['TEST_STR'], 'foo,bar,baz') 30 | self.assertRaises(KeyError, settings['test_lowercase']) 31 | 32 | def test_set(self): 33 | settings = Settings(test_settings_data) 34 | self.assertRaises(KeyError, settings['TEST_SET']) 35 | settings.set('TEST_SET', True) 36 | self.assertEqual(settings['TEST_SET'], True) 37 | 38 | def test_set_dict(self): 39 | settings = Settings(test_settings_data) 40 | self.assertRaises(KeyError, settings['TEST_SET_1']) 41 | self.assertRaises(KeyError, settings['TEST_SET_2']) 42 | settings.set_dict( 43 | { 44 | 'TEST_SET_1': True, 45 | 'TEST_SET_2': False 46 | } 47 | ) 48 | self.assertEqual(settings['TEST_SET_1'], True) 49 | self.assertEqual(settings['TEST_SET_2'], False) 50 | 51 | def test_get(self): 52 | settings = Settings(test_settings_data) 53 | self.assertEqual(settings.get('TEST_GET'), None) 54 | self.assertEqual(settings.get('TEST_GET', 'foo'), 'foo') 55 | settings.set('TEST_GET', 'bar') 56 | self.assertEqual(settings.get('TEST_GET', 'foo'), 'bar') 57 | 58 | def test_get_int_and_float(self): 59 | settings = Settings(test_settings_data) 60 | 
self.assertIsInstance(settings.get_float('TEST_INT'), float) 61 | self.assertIsInstance(settings.get_int('TEST_FLOAT'), int) 62 | 63 | def test_get_list(self): 64 | settings = Settings(test_settings_data) 65 | self.assertIsInstance(settings.get_list('TEST_LIST'), list) 66 | self.assertIsInstance(settings.get_list('TEST_STR'), list) 67 | 68 | def test_get_dict(self): 69 | settings = Settings(test_settings_data) 70 | self.assertIsInstance(settings.get_dict('TEST_DICT'), dict) 71 | self.assertIsInstance(settings.get_dict('TEST_JSON'), dict) 72 | 73 | 74 | if __name__ == "__main__": 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /pycreeper/downloader_middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """Dowloader Midlleware""" 6 | 7 | from collections import defaultdict 8 | 9 | from importlib import import_module 10 | 11 | from pycreeper.utils import call_func, sorted_priority_dict 12 | from pycreeper.http.request import Request 13 | 14 | 15 | class DownloaderMiddleware(object): 16 | """ DownloaderMiddleware iterface """ 17 | 18 | pass 19 | 20 | 21 | class DownloaderMiddlewareManager(object): 22 | """ DownloaderMiddlewareManager """ 23 | 24 | def __init__(self, spider): 25 | self.settings = spider.settings 26 | self.logger = spider.logger 27 | self.methods = defaultdict(list) 28 | self.middlewares = self.load_middleware() 29 | for miw in self.middlewares: 30 | self._add_middleware(miw) 31 | 32 | def load_middleware(self): 33 | """load middleware 34 | """ 35 | middlewares = [] 36 | modules = sorted_priority_dict( 37 | self.settings.get('DOWNLOADER_MIDDLEWARES', {}) 38 | ) 39 | for module_name in modules: 40 | module = import_module('.'.join(module_name.split('.')[:-1])) 41 | middleware_class = getattr(module, module_name.split('.')[-1]) 42 | middlewares.append(middleware_class(self.settings, self.logger)) 43 | return middlewares 44 | 45 | def _add_middleware(self, miw): 46 | """add middleware 47 | """ 48 | if hasattr(miw, "process_request"): 49 | self.methods["process_request"].append(miw.process_request) 50 | if hasattr(miw, "process_response"): 51 | self.methods["process_response"].insert(0, miw.process_response) 52 | if hasattr(miw, "process_exception"): 53 | self.methods["process_exception"].insert(0, miw.process_exception) 54 | 55 | def download(self, download_func, request): 56 | """download 57 | """ 58 | 59 | def process_request(request): 60 | """ process request """ 61 | for method in self.methods["process_request"]: 62 | method(request) 63 | return download_func(request) 64 | 65 | def process_response(response): 66 | """ process response """ 67 | for method in self.methods["process_response"]: 68 | response = method(request, response) 69 | if isinstance(response, Request): 70 | return response 71 | return response 72 | 73 | def process_exception(exception): 74 | """ process exception """ 75 | for method in self.methods["process_exception"]: 76 | response = method(request, exception) 77 | if response: 78 | return response 79 | return exception 80 | 81 | return call_func(process_request, process_exception, 82 | process_response, request) 83 | -------------------------------------------------------------------------------- /tests/test_scheduler.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 
reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | import time 7 | from pycreeper.scheduler import RequestFilter, Scheduler 8 | from pycreeper.http.request import Request 9 | from pycreeper.spider import Spider 10 | from Queue import Empty 11 | 12 | __doctests__ = ['pycreeper.utils.scheduler'] 13 | 14 | URLS = [ 15 | 'http://www.example.com/index.html#print', 16 | 'http://www.example.com/index.html', 17 | 'http://www.xxx.com/index.html?id=77&nameid=2905210001&page=1', 18 | 'http://www.xxxxx.com/index.html?id=77&nameid=2905210001&page=1', 19 | 'http://www.xxxxx.com/index.html?test123123', 20 | 'http://www.xxxxx.com/index.html', 21 | 'ftp://www.xxxxx.com/index.html' 22 | ] 23 | 24 | REQUEST = [Request(url) for url in URLS] 25 | 26 | 27 | class RequestTest(unittest.TestCase): 28 | 29 | def test_basic(self): 30 | request_filter = RequestFilter() 31 | request_filter.request_seen(REQUEST[0]) 32 | self.assertEqual(request_filter.request_seen(REQUEST[0]), True) 33 | self.assertEqual(request_filter.request_seen(REQUEST[1]), False) 34 | self.assertEqual(request_filter.request_seen(REQUEST[1]), True) 35 | self.assertRaises(AttributeError, request_filter.request_seen, None) 36 | 37 | 38 | class SchedulerTest(unittest.TestCase): 39 | 40 | def setUp(self): 41 | self.spider = Spider() 42 | 43 | def test_basic(self): 44 | self.assertRaises(AttributeError, Scheduler, None) 45 | 46 | def test_enqueue(self): 47 | scheduler = Scheduler(self.spider) 48 | self.assertRaises(AttributeError, scheduler.enqueue_request, None) 49 | self.assertEqual(len(scheduler.queue), 0) 50 | scheduler.enqueue_request(REQUEST[0]) 51 | self.assertEqual(len(scheduler.queue), 1) 52 | scheduler.enqueue_request(REQUEST[0]) 53 | self.assertEqual(len(scheduler.queue), 1) 54 | scheduler.enqueue_request(REQUEST[1]) 55 | self.assertEqual(len(scheduler.queue), 2) 56 | scheduler.enqueue_request(REQUEST[0]) 57 | self.assertEqual(len(scheduler.queue), 2) 58 | 59 | def test_next_request(self): 60 | scheduler = Scheduler(self.spider) 61 | self.assertRaises(Empty, scheduler.next_request) 62 | scheduler.enqueue_request(REQUEST[0]) 63 | scheduler.enqueue_request(REQUEST[1]) 64 | scheduler.enqueue_request(REQUEST[2]) 65 | self.assertEqual(scheduler.next_request(), REQUEST[0]) 66 | self.assertEqual(scheduler.next_request(), REQUEST[1]) 67 | self.assertEqual(scheduler.next_request(), REQUEST[2]) 68 | self.assertRaises(Empty, scheduler.next_request) 69 | 70 | def test_download_delay(self): 71 | self.spider.settings.set('DOWNLOAD_DELAY', 5) 72 | scheduler = Scheduler(self.spider) 73 | scheduler.enqueue_request(REQUEST[0]) 74 | time1 = time.time() 75 | scheduler.next_request() 76 | self.assertGreater(time.time() - time1, 5) 77 | 78 | 79 | if __name__ == "__main__": 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /pycreeper/http/response.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Response Object """ 6 | 7 | import six 8 | from w3lib.url import safe_url_string 9 | from pycreeper.http.request import Request 10 | import copy 11 | 12 | 13 | class Response(object): 14 | """ Response """ 15 | 16 | def __init__(self, url, request, headers=None, status=200, 17 | cookiejar=None, body='', encoding='utf-8'): 18 | self._encoding = encoding 19 | self.headers = copy.deepcopy(headers) if headers else 
{} 20 | self.url = url 21 | self.status = int(status) 22 | self.cookiejar = cookiejar 23 | self.body = body 24 | self.request = request 25 | 26 | @property 27 | def encoding(self): 28 | return self._encoding 29 | 30 | @property 31 | def url(self): 32 | return self._url 33 | 34 | @url.setter 35 | def url(self, url): 36 | if isinstance(url, str): 37 | self._url = safe_url_string(url) 38 | elif isinstance(url, six.text_type): 39 | if self.encoding is None: 40 | raise TypeError('Cannot convert unicode url - %s has no encoding' % 41 | type(self).__name__) 42 | self._url = safe_url_string(url.encode(self.encoding)) 43 | else: 44 | raise TypeError('Response url must be str or unicode, got %s:' % type(url).__name__) 45 | if ':' not in self._url: 46 | raise ValueError('Missing scheme in request url: %s' % self._url) 47 | 48 | @property 49 | def body(self): 50 | return self._body 51 | 52 | @body.setter 53 | def body(self, body): 54 | if isinstance(body, str): 55 | self._body = body 56 | elif isinstance(body, six.text_type): 57 | if self.encoding is None: 58 | raise TypeError('Cannot convert unicode body - %s has no encoding' % 59 | type(self).__name__) 60 | self._body = body.encode(self.encoding) 61 | elif body is None: 62 | self._body = '' 63 | else: 64 | raise TypeError("Response body must either str or unicode. Got: '%s'" % type(body).__name__) 65 | 66 | @property 67 | def request(self): 68 | return self._request 69 | 70 | @request.setter 71 | def request(self, value): 72 | if isinstance(value, Request): 73 | self._request = value.copy() 74 | else: 75 | raise TypeError("Response request must be pycreeper.Request. Got: '%s'" % type(value).__name__) 76 | 77 | def copy(self, *args, **kwargs): 78 | """ copy """ 79 | for key in ["url", "status", "cookiejar", "body", "request", "encoding", "headers"]: 80 | kwargs.setdefault(key, getattr(self, key)) 81 | 82 | cls = kwargs.pop('cls', self.__class__) 83 | return cls(*args, **kwargs) 84 | 85 | def __str__(self): 86 | return "<%d %s>" % (self.status, self.url) 87 | 88 | __repr__ = __str__ 89 | 90 | def __eq__(self, other): 91 | return self.__dict__ == other.__dict__ 92 | 93 | def __ne__(self, other): 94 | return self.__dict__ != other.__dict__ 95 | -------------------------------------------------------------------------------- /doc/_build/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Index — PyCreeper 1.0.0 documentation 11 | 12 | 13 | 14 | 15 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 41 | 42 |
77 | 86 | 90 | 91 | -------------------------------------------------------------------------------- /pycreeper/http/request.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import six 6 | from w3lib.url import safe_url_string 7 | import copy 8 | 9 | 10 | class Request(object): 11 | """ Request """ 12 | 13 | def __init__(self, url, callback=None, method='GET', headers=None, 14 | body=None, meta=None, encoding='utf-8', cookiejar=None, 15 | dynamic=False, browser_actions=None, wait=0): 16 | self._encoding = encoding 17 | self.headers = copy.deepcopy(headers) if headers else {} 18 | self.cookiejar = cookiejar 19 | self.url = url 20 | self.body = body 21 | self.method = str(method).upper() 22 | self.callback = callback 23 | self.meta = dict(meta) if meta else {} 24 | self.dynamic = bool(dynamic) 25 | if self.dynamic: 26 | if self.method == 'POST': 27 | raise AttributeError('Pycreeper can\'t make a dynamic POST request.') 28 | self.browser_actions = browser_actions if browser_actions else [] 29 | self.wait = int(wait) 30 | else: 31 | self.browser_actions = [] 32 | self.wait = 0 33 | 34 | @property 35 | def encoding(self): 36 | return self._encoding 37 | 38 | @property 39 | def url(self): 40 | return self._url 41 | 42 | @url.setter 43 | def url(self, url): 44 | if isinstance(url, str): 45 | self._url = safe_url_string(url) 46 | elif isinstance(url, six.text_type): 47 | if self._encoding is None: 48 | raise TypeError('Cannot convert unicode url - %s has no encoding' % 49 | type(self).__name__) 50 | self._url = safe_url_string(url.encode(self._encoding)) 51 | else: 52 | raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__) 53 | if ':' not in self._url: 54 | raise ValueError('Missing scheme in request url: %s' % self._url) 55 | 56 | @property 57 | def body(self): 58 | return self._body 59 | 60 | @body.setter 61 | def body(self, body): 62 | if isinstance(body, str): 63 | self._body = body 64 | elif isinstance(body, six.text_type): 65 | if self._encoding is None: 66 | raise TypeError('Cannot convert unicode body - %s has no encoding' % 67 | type(self).__name__) 68 | self._body = body.encode(self._encoding) 69 | elif body is None: 70 | self._body = '' 71 | elif isinstance(body, dict): 72 | self._body = body 73 | else: 74 | raise TypeError("Request body must either str, unicode or dict. 
Got: '%s'" % type(body).__name__) 75 | 76 | def copy(self, *args, **kwargs): 77 | """ copy """ 78 | for key in ["encoding", "url", "method", "callback", 79 | "cookiejar", "body", "meta", "headers"]: 80 | kwargs.setdefault(key, getattr(self, key)) 81 | cls = kwargs.pop('cls', self.__class__) 82 | return cls(*args, **kwargs) 83 | 84 | def __str__(self): 85 | return "<%s %s>" % (self.method, self.url) 86 | 87 | __repr__ = __str__ 88 | 89 | def __eq__(self, other): 90 | return self.__dict__ == other.__dict__ 91 | 92 | def __ne__(self, other): 93 | return self.__dict__ != other.__dict__ 94 | -------------------------------------------------------------------------------- /examples/jd_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import json 6 | import HTMLParser 7 | from pycreeper.spider import Spider 8 | from pycreeper.http.request import Request 9 | from selenium.webdriver.common.keys import Keys 10 | import gevent 11 | from lxml import etree 12 | from selenium.common.exceptions import NoSuchElementException 13 | 14 | parser = HTMLParser.HTMLParser() 15 | 16 | class Jd_Spider(Spider): 17 | 18 | custom_settings = { 19 | 'DOWNLOADER_MIDDLEWARES': { 20 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 21 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 22 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300, 23 | 'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400 24 | }, 25 | 'DRIVER': 'Chrome', 26 | 'DOWNLOAD_DELAY': 2, 27 | 'USER_AGENT_LIST': [ 28 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 29 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 30 | ] 31 | } 32 | 33 | def start_requests(self): 34 | def _search(driver): 35 | driver.find_element_by_id('key').send_keys(u"联想笔记本", Keys.ENTER) 36 | gevent.sleep(3) 37 | self._jump_guide(driver) 38 | gevent.sleep(3) 39 | 40 | yield Request(url='https://www.jd.com/', 41 | meta={"cookiejar": "jd"}, 42 | callback=self.parse_list, 43 | dynamic=True, 44 | browser_actions=[_search] 45 | ) 46 | 47 | def _jump_guide(self, driver): 48 | try: 49 | driver.find_element_by_xpath('//*[@id="guide-price"]/div[2]/a').click() 50 | except NoSuchElementException as e: 51 | pass 52 | 53 | def parse_list(self, response): 54 | html = response.body 55 | selector = etree.HTML(html) 56 | links = selector.xpath('//div[@class="p-img"]/a') 57 | titles = selector.xpath('//div[@class="p-name p-name-type-2"]/a/em') 58 | imgs = selector.xpath('//div[@class="p-img"]/a/img') 59 | prices = selector.xpath('//div[@class="p-price"]/strong/i') 60 | for i in range(len(links)): 61 | try: 62 | yield { 63 | 'path': links[i].attrib["href"] if 'http' in links[i].attrib["href"] 64 | else 'http:' + links[i].attrib["href"], 65 | 'title': parser.unescape(etree.tostring(titles[i], pretty_print=True)), 66 | 'img': imgs[i].attrib["src"] if 'http' in imgs[i].attrib["src"] 67 | else 'http:' + imgs[i].attrib["src"], 68 | 'price': prices[i].text, 69 | } 70 | except Exception as e: 71 | pass 72 | 73 | url = response.url 74 | 75 | def _next_page(driver): 76 | self._jump_guide(driver) 77 | driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[9]').click() 78 | self._jump_guide(driver) 79 | 80 | yield Request(url=url, 81 | meta={"cookiejar": "jd"}, 82 | callback=self.parse_list, 83 | dynamic=True, 84 | 
browser_actions=[_next_page] 85 | ) 86 | 87 | def process_item(self, item): 88 | print json.dumps(item, ensure_ascii=False).encode('GBK', 'ignore') 89 | 90 | if __name__ == "__main__": 91 | spider = Jd_Spider() 92 | spider.start() 93 | -------------------------------------------------------------------------------- /doc/_build/html/search.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Search — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 48 | 49 |
87 | 96 | 100 | 101 | -------------------------------------------------------------------------------- /doc/_build/html/last.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 写在最后 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 44 | 45 |
84 | 96 | 100 | 101 | -------------------------------------------------------------------------------- /tests/utils/test_utils_datatypes.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import copy 6 | import unittest 7 | 8 | from pycreeper.utils.datatypes import CaselessDict 9 | 10 | __doctests__ = ['pycreeper.utils.datatypes'] 11 | 12 | 13 | class CaselessDictTest(unittest.TestCase): 14 | def test_init(self): 15 | seq = {'red': 1, 'black': 3} 16 | d = CaselessDict(seq) 17 | self.assertEqual(d['red'], 1) 18 | self.assertEqual(d['black'], 3) 19 | 20 | seq = (('red', 1), ('black', 3)) 21 | d = CaselessDict(seq) 22 | self.assertEqual(d['red'], 1) 23 | self.assertEqual(d['black'], 3) 24 | 25 | def test_caseless(self): 26 | d = CaselessDict() 27 | d['key_Lower'] = 1 28 | self.assertEqual(d['KEy_loWer'], 1) 29 | self.assertEqual(d.get('KEy_loWer'), 1) 30 | 31 | d['KEY_LOWER'] = 3 32 | self.assertEqual(d['key_Lower'], 3) 33 | self.assertEqual(d.get('key_Lower'), 3) 34 | 35 | def test_delete(self): 36 | d = CaselessDict({'key_lower': 1}) 37 | del d['key_LOWER'] 38 | self.assertRaises(KeyError, d.__getitem__, 'key_LOWER') 39 | self.assertRaises(KeyError, d.__getitem__, 'key_lower') 40 | 41 | def test_getdefault(self): 42 | d = CaselessDict() 43 | self.assertEqual(d.get('c', 5), 5) 44 | d['c'] = 10 45 | self.assertEqual(d.get('c', 5), 10) 46 | 47 | def test_setdefault(self): 48 | d = CaselessDict({'a': 1, 'b': 2}) 49 | 50 | r = d.setdefault('A', 5) 51 | self.assertEqual(r, 1) 52 | self.assertEqual(d['A'], 1) 53 | 54 | r = d.setdefault('c', 5) 55 | self.assertEqual(r, 5) 56 | self.assertEqual(d['C'], 5) 57 | 58 | def test_fromkeys(self): 59 | keys = ('a', 'b') 60 | 61 | d = CaselessDict.fromkeys(keys) 62 | self.assertEqual(d['A'], None) 63 | self.assertEqual(d['B'], None) 64 | 65 | d = CaselessDict.fromkeys(keys, 1) 66 | self.assertEqual(d['A'], 1) 67 | self.assertEqual(d['B'], 1) 68 | 69 | instance = CaselessDict() 70 | d = instance.fromkeys(keys) 71 | self.assertEqual(d['A'], None) 72 | self.assertEqual(d['B'], None) 73 | 74 | d = instance.fromkeys(keys, 1) 75 | self.assertEqual(d['A'], 1) 76 | self.assertEqual(d['B'], 1) 77 | 78 | def test_contains(self): 79 | d = CaselessDict() 80 | d['a'] = 1 81 | assert 'a' in d 82 | 83 | def test_pop(self): 84 | d = CaselessDict() 85 | d['a'] = 1 86 | self.assertEqual(d.pop('A'), 1) 87 | self.assertRaises(KeyError, d.pop, 'A') 88 | 89 | def test_normkey(self): 90 | class MyDict(CaselessDict): 91 | def normkey(self, key): 92 | return key.title() 93 | 94 | d = MyDict() 95 | d['key-one'] = 2 96 | self.assertEqual(list(d.keys()), ['Key-One']) 97 | 98 | def test_normvalue(self): 99 | class MyDict(CaselessDict): 100 | def normvalue(self, value): 101 | if value is not None: 102 | return value + 1 103 | 104 | d = MyDict({'key': 1}) 105 | self.assertEqual(d['key'], 2) 106 | self.assertEqual(d.get('key'), 2) 107 | 108 | d = MyDict() 109 | d['key'] = 1 110 | self.assertEqual(d['key'], 2) 111 | self.assertEqual(d.get('key'), 2) 112 | 113 | d = MyDict() 114 | d.setdefault('key', 1) 115 | self.assertEqual(d['key'], 2) 116 | self.assertEqual(d.get('key'), 2) 117 | 118 | d = MyDict() 119 | d.update({'key': 1}) 120 | self.assertEqual(d['key'], 2) 121 | self.assertEqual(d.get('key'), 2) 122 | 123 | d = MyDict.fromkeys(('key',), 1) 124 | self.assertEqual(d['key'], 2) 125 | self.assertEqual(d.get('key'), 2) 126 | 127 | def 
test_copy(self): 128 | h1 = CaselessDict({'header1': 'value'}) 129 | h2 = copy.copy(h1) 130 | self.assertEqual(h1, h2) 131 | self.assertEqual(h1.get('header1'), h2.get('header1')) 132 | assert isinstance(h2, CaselessDict) 133 | 134 | 135 | if __name__ == "__main__": 136 | unittest.main() 137 | -------------------------------------------------------------------------------- /doc/_build/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #eeffcc; } 3 | .highlight .c { color: #408090; font-style: italic } /* Comment */ 4 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 5 | .highlight .k { color: #007020; font-weight: bold } /* Keyword */ 6 | .highlight .o { color: #666666 } /* Operator */ 7 | .highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */ 8 | .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ 9 | .highlight .cp { color: #007020 } /* Comment.Preproc */ 10 | .highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */ 11 | .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ 12 | .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ 13 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 14 | .highlight .ge { font-style: italic } /* Generic.Emph */ 15 | .highlight .gr { color: #FF0000 } /* Generic.Error */ 16 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 17 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 18 | .highlight .go { color: #333333 } /* Generic.Output */ 19 | .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ 20 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 21 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 22 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 23 | .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ 24 | .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ 25 | .highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ 26 | .highlight .kp { color: #007020 } /* Keyword.Pseudo */ 27 | .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ 28 | .highlight .kt { color: #902000 } /* Keyword.Type */ 29 | .highlight .m { color: #208050 } /* Literal.Number */ 30 | .highlight .s { color: #4070a0 } /* Literal.String */ 31 | .highlight .na { color: #4070a0 } /* Name.Attribute */ 32 | .highlight .nb { color: #007020 } /* Name.Builtin */ 33 | .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ 34 | .highlight .no { color: #60add5 } /* Name.Constant */ 35 | .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ 36 | .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ 37 | .highlight .ne { color: #007020 } /* Name.Exception */ 38 | .highlight .nf { color: #06287e } /* Name.Function */ 39 | .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ 40 | .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ 41 | .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ 42 | .highlight .nv { color: #bb60d5 } /* Name.Variable */ 43 | .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ 44 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 45 | .highlight .mb { color: #208050 } /* 
Literal.Number.Bin */ 46 | .highlight .mf { color: #208050 } /* Literal.Number.Float */ 47 | .highlight .mh { color: #208050 } /* Literal.Number.Hex */ 48 | .highlight .mi { color: #208050 } /* Literal.Number.Integer */ 49 | .highlight .mo { color: #208050 } /* Literal.Number.Oct */ 50 | .highlight .sa { color: #4070a0 } /* Literal.String.Affix */ 51 | .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ 52 | .highlight .sc { color: #4070a0 } /* Literal.String.Char */ 53 | .highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */ 54 | .highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ 55 | .highlight .s2 { color: #4070a0 } /* Literal.String.Double */ 56 | .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ 57 | .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ 58 | .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ 59 | .highlight .sx { color: #c65d09 } /* Literal.String.Other */ 60 | .highlight .sr { color: #235388 } /* Literal.String.Regex */ 61 | .highlight .s1 { color: #4070a0 } /* Literal.String.Single */ 62 | .highlight .ss { color: #517918 } /* Literal.String.Symbol */ 63 | .highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ 64 | .highlight .fm { color: #06287e } /* Name.Function.Magic */ 65 | .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ 66 | .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ 67 | .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ 68 | .highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */ 69 | .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /doc/_build/html/spider.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | spider:爬虫 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
91 | 106 | 110 | 111 | -------------------------------------------------------------------------------- /doc/_build/html/downloader_middlewares.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | downloader_middlewares:下载器中间件 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
91 | 106 | 110 | 111 | -------------------------------------------------------------------------------- /doc/_build/html/downloader.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | downloader:下载器 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
91 | 106 | 110 | 111 | -------------------------------------------------------------------------------- /pycreeper/downloader_middlewares/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import random 6 | import time 7 | from urlparse import urlparse 8 | from logging import Logger 9 | import chardet 10 | import gevent 11 | from pycreeper.downloader_middlewares import DownloaderMiddleware 12 | from pycreeper.utils.exceptions import TimeoutException 13 | from collections import deque 14 | 15 | 16 | class RetryMiddleware(DownloaderMiddleware): 17 | """ Retry Middleware """ 18 | 19 | RETRY_EXCEPTIONS = TimeoutException 20 | 21 | def __init__(self, settings, logger): 22 | self.max_retry_count = settings.get_int("RETRY_COUNT") 23 | self.retry_status_codes = settings.get_list("RETRY_STATUS_CODES") 24 | if not isinstance(logger, Logger): 25 | raise AttributeError('logger must be instance of logging.Logger') 26 | self.logger = logger 27 | 28 | def process_response(self, request, response): 29 | """process response 30 | """ 31 | if request.meta.get("dont_retry", False): 32 | return response 33 | if response.status in self.retry_status_codes: 34 | return self._retry(request) or response 35 | return response 36 | 37 | def process_exception(self, request, exception): 38 | """process exception 39 | """ 40 | if isinstance(exception, self.RETRY_EXCEPTIONS) \ 41 | and request.meta.get("dont_retry", False): 42 | return self._retry(request) 43 | 44 | def _retry(self, request): 45 | """retry 46 | """ 47 | retry_count = request.meta.get("retry_count", 0) + 1 48 | if retry_count <= self.max_retry_count: 49 | retry_request = request.copy() 50 | retry_request.meta["retry_count"] = retry_count 51 | return retry_request 52 | 53 | 54 | class UserAgentMiddleware(DownloaderMiddleware): 55 | """ UserAgent Middleware """ 56 | 57 | def __init__(self, settings, logger): 58 | self.user_agent_list = settings.get_list("USER_AGENT_LIST") 59 | if not isinstance(logger, Logger): 60 | raise AttributeError('logger must be instance of logging.Logger') 61 | self.logger = logger 62 | 63 | def process_request(self, request): 64 | """process request 65 | 66 | static requests only. 67 | """ 68 | if not request.dynamic: 69 | request.headers["User-Agent"] = random.choice(self.user_agent_list) 70 | 71 | 72 | class ProxyMiddleware(DownloaderMiddleware): 73 | """ Proxy Middleware """ 74 | 75 | def __init__(self, settings, logger): 76 | self.host_time_queue = deque() 77 | self.proxy_interval = settings["PROXY_INTERVAL"] 78 | self.proxy_list = settings["PROXY_LIST"] 79 | for proxy in self.proxy_list: 80 | self.host_time_queue.append((proxy, 0)) 81 | if not isinstance(logger, Logger): 82 | raise AttributeError('logger must be instance of logging.Logger') 83 | self.logger = logger 84 | 85 | def process_request(self, request): 86 | """process request 87 | 88 | static requests only. 
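    For illustration (the address below is hypothetical, not part of the project):
    with PROXY_LIST = ['1.2.3.4:8080'], this middleware sets

        request.meta["proxy"] = {"http": "http://1.2.3.4:8080"}

    on each static request, and _get_proxy waits until PROXY_INTERVAL seconds have
    passed since that proxy was last used before handing it out again.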
89 | """ 90 | if not request.dynamic: 91 | request.meta["proxy"] = { 92 | "http": self._get_proxy(), 93 | } 94 | 95 | def _get_proxy(self): 96 | """get proxy 97 | """ 98 | proxy, latest = self.host_time_queue.popleft() 99 | interval = time.time() - latest 100 | if interval < self.proxy_interval: 101 | self.logger.info("Proxy %s waitting ...", proxy) 102 | gevent.sleep(self.proxy_interval - interval) 103 | self.host_time_queue.append((proxy, time.time())) 104 | return "http://%s" % proxy 105 | 106 | 107 | class EncodingDiscriminateMiddleware(DownloaderMiddleware): 108 | """ Encoding Discriminate Middleware """ 109 | 110 | ENCODING_MAP = {} 111 | 112 | def __init__(self, settings, logger): 113 | self.settings = settings 114 | if not isinstance(logger, Logger): 115 | raise AttributeError('logger must be instance of logging.Logger') 116 | self.logger = logger 117 | 118 | def process_response(self, request, response): 119 | """process respoonse 120 | :param request: 121 | :param response: 122 | """ 123 | netloc = urlparse(request.url).netloc 124 | content = response.body 125 | if self.ENCODING_MAP.get(netloc) is None: 126 | encoding = chardet.detect(content)["encoding"] 127 | encoding = "GB18030" \ 128 | if encoding.upper() in ("GBK", "GB2312") else encoding 129 | self.ENCODING_MAP[netloc] = encoding 130 | body = content.decode(self.ENCODING_MAP[netloc], "replace") 131 | return response.copy(body=body) 132 | -------------------------------------------------------------------------------- /tests/http/test_http_response.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | 7 | from w3lib.url import safe_url_string 8 | 9 | from pycreeper.http.request import Request 10 | from pycreeper.http.response import Response 11 | 12 | 13 | class ResponseTest(unittest.TestCase): 14 | def test_init(self): 15 | self.assertRaises(Exception, Response) 16 | self.assertRaises(Exception, Response, url='http://www.example.com/') 17 | self.assertRaises(Exception, Response, request=Request('http://www.example.com/')) 18 | self.assertRaises(ValueError, 19 | Response, 20 | url='foo', 21 | request=Request('http://www.example.com/') 22 | ) 23 | self.assertRaises(ValueError, 24 | Response, 25 | 'http://www.example.com/', 26 | status='foo', 27 | request=Request('http://www.example.com/') 28 | ) 29 | self.assertRaises(TypeError, 30 | Response, 31 | 'http://www.example.com/', 32 | request='foo' 33 | ) 34 | response = Response('http://www.example.com/', 35 | Request('http://www.example.com/') 36 | ) 37 | assert response.url 38 | assert not response.body 39 | response = Response('http://www.example.com/', 40 | Request('http://www.example.com/'), 41 | headers={'Content-Type': 'text/html', 42 | 'Content-Length': 1234 43 | } 44 | ) 45 | 46 | def test_copy(self): 47 | response1 = Response('http://www.example.com/', 48 | headers={'Content-Type': 'text/html', 49 | 'Content-Length': 1234 50 | }, 51 | request=Request('http://www.example.com/') 52 | ) 53 | response2 = response1.copy() 54 | assert response1.__dict__ == response2.__dict__ 55 | self.assertEqual(response1.headers, response2.headers) 56 | self.assertEqual(response1.request, response2.request) 57 | self.assertEqual(response1, response2) 58 | 59 | self.assertIsNot(response1.headers, response2.headers) 60 | self.assertIsNot(response1.request, response2.request) 61 | self.assertIsNot(response1, response2) 62 | 63 | 
def test_url(self): 64 | response = Response('http://www.example.com/', 65 | request=Request('http://www.example.com/') 66 | ) 67 | self.assertIsInstance(response.url, str) 68 | self.assertEqual(response.url, 'http://www.example.com/') 69 | response = Response(u'http://www.example.com?content=测试', 70 | request=Request('http://www.example.com/') 71 | ) 72 | self.assertEqual(response.url, 73 | safe_url_string('http://www.example.com?content=测试')) 74 | self.assertRaises(TypeError, Response, 123) 75 | 76 | def test_body(self): 77 | r1 = Response(url="http://www.example.com/", 78 | request=Request('http://www.example.com/') 79 | ) 80 | assert r1.body == b'' 81 | 82 | r2 = Response(url="http://www.example.com/", 83 | body=b"", 84 | request=Request('http://www.example.com/')) 85 | assert isinstance(r2.body, bytes) 86 | self.assertEqual(r2.encoding, 'utf-8') # default encoding 87 | 88 | r3 = Response(url="http://www.example.com/", 89 | body=u"Price: \xa3100", 90 | encoding='utf-8', 91 | request=Request('http://www.example.com/')) 92 | assert isinstance(r3.body, bytes) 93 | self.assertEqual(r3.body, b"Price: \xc2\xa3100") 94 | 95 | r4 = Response(url="http://www.example.com/", 96 | request=Request('http://www.example.com/'), 97 | body=u"Price: \xa3100", 98 | encoding='latin1' 99 | ) 100 | assert isinstance(r4.body, bytes) 101 | self.assertEqual(r4.body, b"Price: \xa3100") 102 | 103 | def test_request(self): 104 | response = Response('http://www.example.com/', 105 | request=Request('http://www.example.com/') 106 | ) 107 | self.assertIsInstance(response.request, Request) 108 | self.assertEqual(response.request, Request('http://www.example.com/')) 109 | -------------------------------------------------------------------------------- /pycreeper/engine.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Engine """ 6 | 7 | from gevent import monkey 8 | monkey.patch_all() 9 | 10 | import logging 11 | from gevent.lock import BoundedSemaphore 12 | from gevent.pool import Pool 13 | from importlib import import_module 14 | from pycreeper.scheduler import Scheduler 15 | from pycreeper.downloader import Downloader 16 | from pycreeper.utils.gevent_wrapper import spawn, join_all 17 | from pycreeper.utils import result2list 18 | from pycreeper.http.request import Request 19 | from Queue import Empty 20 | 21 | DRIVER_MODULE = 'selenium.webdriver' 22 | 23 | class Engine(object): 24 | """ Engine """ 25 | 26 | def __init__(self, spider): 27 | self.spider = spider 28 | self.logger = spider.logger 29 | self.scheduler = Scheduler(spider) 30 | self.settings = spider.settings 31 | max_request_size = self.settings["MAX_REQUEST_SIZE"] 32 | self.dynamic = self.settings["DYNAMIC_CRAWL"] 33 | if self.dynamic: 34 | module_path = DRIVER_MODULE 35 | module = import_module(module_path) 36 | init_kwargs = self.settings['DRIVER_INIT_KWARGS'] 37 | self.driver = getattr(module, 38 | self.settings.get('DRIVER').title())(**init_kwargs) 39 | else: 40 | self.driver = None 41 | self.driver_sem = BoundedSemaphore(1) 42 | self.downloader = Downloader(spider, self.driver, self.driver_sem) 43 | self.pool = Pool(size=max_request_size) 44 | 45 | def start(self): 46 | """start 47 | """ 48 | start_requests = iter(self.spider.start_requests()) 49 | self.execute(self.spider, start_requests) 50 | 51 | def execute(self, spider, start_requests): 52 | """execute 53 | """ 54 | self.start_requests = 
start_requests 55 | all_routines = [] 56 | all_routines.append(spawn(self._init_start_requests)) 57 | all_routines.append(spawn(self._next_request, spider)) 58 | join_all(all_routines) 59 | 60 | def _init_start_requests(self): 61 | """init start requests 62 | """ 63 | for req in self.start_requests: 64 | self.crawl(req) 65 | 66 | def _next_request(self, spider): 67 | """next request 68 | """ 69 | while True: 70 | try: 71 | request = self.scheduler.next_request() 72 | self.pool.spawn( 73 | self._process_request, request, spider) 74 | except Empty: 75 | self.logger.info('All requests are finished, program exit...') 76 | if self.driver: 77 | self.driver.close() 78 | return 79 | 80 | def _process_request(self, request, spider): 81 | """process request 82 | """ 83 | try: 84 | response = self.download(request, spider) 85 | except Exception as exc: 86 | logging.error("download error: %s", str(exc), exc_info=True) 87 | else: 88 | self._handle_downloader_output(response, request, spider) 89 | return response 90 | 91 | def download(self, request, spider): 92 | """ download 93 | 94 | Download a request, use self.downloader.fetch 95 | 96 | """ 97 | response = self.downloader.fetch(request, spider) 98 | #response.request = request 99 | return response 100 | 101 | def _handle_downloader_output(self, response, request, spider): 102 | """handle downloader output 103 | 104 | 105 | """ 106 | if isinstance(response, Request): 107 | self.crawl(response) 108 | return 109 | 110 | self.process_response(response, request, spider) 111 | 112 | def process_response(self, response, request, spider): 113 | """process response 114 | 115 | Use request.callback or spider.parse to process response 116 | 117 | """ 118 | callback = request.callback or spider.parse 119 | result = callback(response) 120 | ret = result2list(result) 121 | self.handle_spider_output(ret, spider) 122 | 123 | def handle_spider_output(self, result, spider): 124 | """handle spider output 125 | 126 | If a spider return a request, crawling it. 127 | Else if it's a dict, use self.process_item. 128 | 129 | """ 130 | for item in result: 131 | if item is None: 132 | continue 133 | elif isinstance(item, Request): 134 | self.crawl(item) 135 | elif isinstance(item, dict): 136 | self.process_item(item, spider) 137 | else: 138 | logging.error("Spider must return Request, dict or None") 139 | 140 | def process_item(self, item, spider): 141 | """handle item 142 | 143 | Use spider.process_item function. 144 | 145 | """ 146 | spider.process_item(item) 147 | 148 | def crawl(self, request): 149 | """crawl request 150 | 151 | Add request to scheduler's queue. 152 | 153 | """ 154 | self.scheduler.enqueue_request(request) 155 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # PyCreeper documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Mar 18 20:46:54 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. 
If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | # import os 20 | # import sys 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = ['sphinx.ext.autodoc', 34 | 'sphinx.ext.viewcode', 35 | 'sphinx.ext.githubpages'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = u'PyCreeper' 51 | copyright = u'2017, Jim Zheng' 52 | author = u'Jim Zheng' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = u'1.0.0' 60 | # The full version, including alpha/beta/rc tags. 61 | release = u'1.0.0' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | # If true, `todo` and `todoList` produce output, else they produce nothing. 79 | todo_include_todos = False 80 | 81 | 82 | # -- Options for HTML output ---------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 86 | # 87 | html_theme = 'nature' 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | 101 | # -- Options for HTMLHelp output ------------------------------------------ 102 | 103 | # Output file base name for HTML help builder. 104 | htmlhelp_basename = 'PyCreeperdoc' 105 | 106 | 107 | # -- Options for LaTeX output --------------------------------------------- 108 | 109 | latex_elements = { 110 | # The paper size ('letterpaper' or 'a4paper'). 111 | # 112 | # 'papersize': 'letterpaper', 113 | 114 | # The font size ('10pt', '11pt' or '12pt'). 
115 | # 116 | # 'pointsize': '10pt', 117 | 118 | # Additional stuff for the LaTeX preamble. 119 | # 120 | # 'preamble': '', 121 | 122 | # Latex figure (float) alignment 123 | # 124 | # 'figure_align': 'htbp', 125 | } 126 | 127 | # Grouping the document tree into LaTeX files. List of tuples 128 | # (source start file, target name, title, 129 | # author, documentclass [howto, manual, or own class]). 130 | latex_documents = [ 131 | (master_doc, 'PyCreeper.tex', u'PyCreeper Documentation', 132 | u'zcy', 'manual'), 133 | ] 134 | 135 | 136 | # -- Options for manual page output --------------------------------------- 137 | 138 | # One entry per manual page. List of tuples 139 | # (source start file, name, description, authors, manual section). 140 | man_pages = [ 141 | (master_doc, 'pycreeper', u'PyCreeper Documentation', 142 | [author], 1) 143 | ] 144 | 145 | 146 | # -- Options for Texinfo output ------------------------------------------- 147 | 148 | # Grouping the document tree into Texinfo files. List of tuples 149 | # (source start file, target name, title, author, 150 | # dir menu entry, description, category) 151 | texinfo_documents = [ 152 | (master_doc, 'PyCreeper', u'PyCreeper Documentation', 153 | author, 'PyCreeper', 'One line description of project.', 154 | 'Miscellaneous'), 155 | ] 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /tests/test_downloader_middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import time 6 | import unittest 7 | import json 8 | from pycreeper.downloader_middlewares import DownloaderMiddlewareManager 9 | from pycreeper.downloader_middlewares.middlewares import UserAgentMiddleware, RetryMiddleware, ProxyMiddleware 10 | from pycreeper.spider import Spider 11 | from pycreeper.http.request import Request 12 | from pycreeper.http.response import Response 13 | from pycreeper.downloader import DownloadHandler 14 | from gevent.lock import BoundedSemaphore 15 | 16 | 17 | class RetryMiddlewareTest(unittest.TestCase): 18 | def setUp(self): 19 | self.spider = Spider() 20 | 21 | def test_basic(self): 22 | self.assertRaises(AttributeError, RetryMiddleware, 23 | self.spider.settings, None) 24 | 25 | def test_process_response(self): 26 | request = Request('http://httpbin.org/') 27 | response = Response('http://httpbin.org/', request, status=500) 28 | rm = RetryMiddleware(self.spider.settings, self.spider.logger) 29 | request.meta["dont_retry"] = True 30 | self.assertEqual(rm.process_response(request, response), response) 31 | 32 | request.meta["dont_retry"] = False 33 | request = rm.process_response(request, response) 34 | self.assertIsInstance(request, Request) 35 | self.assertEqual(request.meta.get("retry_count"), 1) 36 | request = rm.process_response(request, response) 37 | self.assertIsInstance(request, Request) 38 | request = rm.process_response(request, response) 39 | self.assertIsInstance(request, Request) 40 | self.assertIsInstance(rm.process_response(request, response), Response) 41 | 42 | 43 | class UserAgentMiddlewareTest(unittest.TestCase): 44 | def setUp(self): 45 | self.spider = Spider() 46 | 47 | def test_basic(self): 48 | self.assertRaises(AttributeError, ProxyMiddleware, 49 | self.spider.settings, None) 50 | 51 | def test_process_request(self): 52 | self.spider.settings.set("PROXY_LIST", ['124.88.67.54:80']) 53 | request = 
Request('http://httpbin.org/get') 54 | pm = ProxyMiddleware(self.spider.settings, self.spider.logger) 55 | dh = DownloadHandler(self.spider, None, BoundedSemaphore(1)) 56 | pm.process_request(request) 57 | response = dh.fetch(request) 58 | assert response.body 59 | 60 | def test_process_request_interval(self): 61 | self.spider.settings.set("PROXY_LIST", ['218.76.106.78:3128']) 62 | request = Request('http://httpbin.org/get') 63 | pm = ProxyMiddleware(self.spider.settings, self.spider.logger) 64 | dh = DownloadHandler(self.spider, None, BoundedSemaphore(1)) 65 | pm.process_request(request) 66 | time1 = time.time() 67 | dh.fetch(request) 68 | 69 | request = Request('http://httpbin.org/get') 70 | pm.process_request(request) 71 | self.assertGreater(time.time() - time1, 3) 72 | 73 | 74 | class ProxyMiddlewareTest(unittest.TestCase): 75 | def setUp(self): 76 | self.spider = Spider() 77 | 78 | def test_basic(self): 79 | self.assertRaises(AttributeError, UserAgentMiddleware, 80 | self.spider.settings, None) 81 | 82 | def test_process_request(self): 83 | request = Request('http://httpbin.org/user-agent') 84 | self.assertIs(request.headers.get("User-Agent"), None) 85 | uam = UserAgentMiddleware(self.spider.settings, self.spider.logger) 86 | dh = DownloadHandler(self.spider, None, BoundedSemaphore(1)) 87 | uam.process_request(request) 88 | response = dh.fetch(request) 89 | self.assertEqual(json.loads(response.body)['user-agent'], request.headers['User-Agent']) 90 | 91 | 92 | class DownloaderMiddlewareManagerTest(unittest.TestCase): 93 | def setUp(self): 94 | self.spider = Spider() 95 | self.spider.settings.set('DOWNLOADER_MIDDLEWARES', 96 | { 97 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 98 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 99 | }) 100 | 101 | def test_methods(self): 102 | dmm = DownloaderMiddlewareManager(self.spider) 103 | rm = RetryMiddleware(self.spider.settings, self.spider.logger) 104 | uam = UserAgentMiddleware(self.spider.settings, self.spider.logger) 105 | process_request = [uam.process_request] 106 | process_response = [rm.process_response] 107 | process_exception = [rm.process_exception] 108 | self.assertEqual(len(dmm.methods['process_request']), len(process_request)) 109 | for i in range(len(process_request)): 110 | self.assertEqual(dmm.methods['process_request'][i].__name__, process_request[i].__name__) 111 | 112 | self.assertEqual(len(dmm.methods['process_response']), len(process_response)) 113 | for i in range(len(process_response)): 114 | self.assertEqual(dmm.methods['process_response'][i].__name__, process_response[i].__name__) 115 | 116 | self.assertEqual(len(dmm.methods['process_exception']), len(process_exception)) 117 | for i in range(len(process_exception)): 118 | self.assertEqual(dmm.methods['process_exception'][i].__name__, process_exception[i].__name__) 119 | 120 | 121 | if __name__ == "__main__": 122 | unittest.main() 123 | -------------------------------------------------------------------------------- /doc/_build/html/_static/nature.css: -------------------------------------------------------------------------------- 1 | /* 2 | * nature.css_t 3 | * ~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- nature theme. 6 | * 7 | * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 
9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: Arial, sans-serif; 18 | font-size: 100%; 19 | background-color: #111; 20 | color: #555; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.documentwrapper { 26 | float: left; 27 | width: 100%; 28 | } 29 | 30 | div.bodywrapper { 31 | margin: 0 0 0 230px; 32 | } 33 | 34 | hr { 35 | border: 1px solid #B1B4B6; 36 | } 37 | 38 | div.document { 39 | background-color: #eee; 40 | } 41 | 42 | div.body { 43 | background-color: #ffffff; 44 | color: #3E4349; 45 | padding: 0 30px 30px 30px; 46 | font-size: 0.9em; 47 | } 48 | 49 | div.footer { 50 | color: #555; 51 | width: 100%; 52 | padding: 13px 0; 53 | text-align: center; 54 | font-size: 75%; 55 | } 56 | 57 | div.footer a { 58 | color: #444; 59 | text-decoration: underline; 60 | } 61 | 62 | div.related { 63 | background-color: #6BA81E; 64 | line-height: 32px; 65 | color: #fff; 66 | text-shadow: 0px 1px 0 #444; 67 | font-size: 0.9em; 68 | } 69 | 70 | div.related a { 71 | color: #E2F3CC; 72 | } 73 | 74 | div.sphinxsidebar { 75 | font-size: 0.75em; 76 | line-height: 1.5em; 77 | } 78 | 79 | div.sphinxsidebarwrapper{ 80 | padding: 20px 0; 81 | } 82 | 83 | div.sphinxsidebar h3, 84 | div.sphinxsidebar h4 { 85 | font-family: Arial, sans-serif; 86 | color: #222; 87 | font-size: 1.2em; 88 | font-weight: normal; 89 | margin: 0; 90 | padding: 5px 10px; 91 | background-color: #ddd; 92 | text-shadow: 1px 1px 0 white 93 | } 94 | 95 | div.sphinxsidebar h4{ 96 | font-size: 1.1em; 97 | } 98 | 99 | div.sphinxsidebar h3 a { 100 | color: #444; 101 | } 102 | 103 | 104 | div.sphinxsidebar p { 105 | color: #888; 106 | padding: 5px 20px; 107 | } 108 | 109 | div.sphinxsidebar p.topless { 110 | } 111 | 112 | div.sphinxsidebar ul { 113 | margin: 10px 20px; 114 | padding: 0; 115 | color: #000; 116 | } 117 | 118 | div.sphinxsidebar a { 119 | color: #444; 120 | } 121 | 122 | div.sphinxsidebar input { 123 | border: 1px solid #ccc; 124 | font-family: sans-serif; 125 | font-size: 1em; 126 | } 127 | 128 | div.sphinxsidebar input[type=text]{ 129 | margin-left: 20px; 130 | } 131 | 132 | div.sphinxsidebar input[type=submit]{ 133 | margin-left: 20px; 134 | } 135 | 136 | /* -- body styles ----------------------------------------------------------- */ 137 | 138 | a { 139 | color: #005B81; 140 | text-decoration: none; 141 | } 142 | 143 | a:hover { 144 | color: #E32E00; 145 | text-decoration: underline; 146 | } 147 | 148 | div.body h1, 149 | div.body h2, 150 | div.body h3, 151 | div.body h4, 152 | div.body h5, 153 | div.body h6 { 154 | font-family: Arial, sans-serif; 155 | background-color: #BED4EB; 156 | font-weight: normal; 157 | color: #212224; 158 | margin: 30px 0px 10px 0px; 159 | padding: 5px 0 5px 10px; 160 | text-shadow: 0px 1px 0 white 161 | } 162 | 163 | div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 200%; } 164 | div.body h2 { font-size: 150%; background-color: #C8D5E3; } 165 | div.body h3 { font-size: 120%; background-color: #D8DEE3; } 166 | div.body h4 { font-size: 110%; background-color: #D8DEE3; } 167 | div.body h5 { font-size: 100%; background-color: #D8DEE3; } 168 | div.body h6 { font-size: 100%; background-color: #D8DEE3; } 169 | 170 | a.headerlink { 171 | color: #c60f0f; 172 | font-size: 0.8em; 173 | padding: 0 4px 0 4px; 174 | text-decoration: none; 175 | } 176 | 177 | a.headerlink:hover { 178 | background-color: #c60f0f; 179 | color: white; 180 | } 181 | 182 | div.body p, 
div.body dd, div.body li { 183 | line-height: 1.5em; 184 | } 185 | 186 | div.admonition p.admonition-title + p { 187 | display: inline; 188 | } 189 | 190 | div.highlight{ 191 | background-color: white; 192 | } 193 | 194 | div.note { 195 | background-color: #eee; 196 | border: 1px solid #ccc; 197 | } 198 | 199 | div.seealso { 200 | background-color: #ffc; 201 | border: 1px solid #ff6; 202 | } 203 | 204 | div.topic { 205 | background-color: #eee; 206 | } 207 | 208 | div.warning { 209 | background-color: #ffe4e4; 210 | border: 1px solid #f66; 211 | } 212 | 213 | p.admonition-title { 214 | display: inline; 215 | } 216 | 217 | p.admonition-title:after { 218 | content: ":"; 219 | } 220 | 221 | pre { 222 | padding: 10px; 223 | background-color: White; 224 | color: #222; 225 | line-height: 1.2em; 226 | border: 1px solid #C6C9CB; 227 | font-size: 1.1em; 228 | margin: 1.5em 0 1.5em 0; 229 | -webkit-box-shadow: 1px 1px 1px #d8d8d8; 230 | -moz-box-shadow: 1px 1px 1px #d8d8d8; 231 | } 232 | 233 | code { 234 | background-color: #ecf0f3; 235 | color: #222; 236 | /* padding: 1px 2px; */ 237 | font-size: 1.1em; 238 | font-family: monospace; 239 | } 240 | 241 | .viewcode-back { 242 | font-family: Arial, sans-serif; 243 | } 244 | 245 | div.viewcode-block:target { 246 | background-color: #f4debf; 247 | border-top: 1px solid #ac9; 248 | border-bottom: 1px solid #ac9; 249 | } 250 | 251 | div.code-block-caption { 252 | background-color: #ddd; 253 | color: #222; 254 | border: 1px solid #C6C9CB; 255 | } -------------------------------------------------------------------------------- /doc/_build/html/schedular.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | schedular:调度器 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
schedular: the scheduler
============================

The scheduler is built on gevent's Queue together with a Bloom filter
(Wiki: https://en.wikipedia.org/wiki/Bloom_filter). The Queue keeps reads greenlet-safe when several
downloader greenlets consume it concurrently, and the Bloom filter provides URL de-duplication.

Enqueueing a request: enqueue_request(request)
----------------------------------------------

When a request is enqueued, the Bloom filter is consulted first to check whether the URL has already
been crawled. If it has not, the request goes straight into the queue; if it has, a logging.DEBUG
message is emitted saying the URL was ignored.

Taking a request from the queue: next_request()
-----------------------------------------------

This method pops one request from the Queue. If DOWNLOAD_DELAY is set in custom_settings, every pop
waits for that fixed amount of time.

PyCreeper uses three times the TIMEOUT value as the end-of-crawl signal: if the Queue stays empty for
3 * TIMEOUT, the crawl is considered finished and the spider exits.
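A minimal sketch of the Queue-plus-Bloom-filter behaviour described above. This is an illustrative
stand-in, not the actual pycreeper scheduler: the ToyScheduler name, its constructor arguments and
the choice of ScalableBloomFilter are assumptions made for the example.

```
# -*- coding:utf-8 -*-
# Illustrative sketch only -- NOT the real pycreeper scheduler.
import logging

import gevent
from gevent.queue import Queue, Empty
from pybloom import ScalableBloomFilter


class ToyScheduler(object):
    """Queue + Bloom filter, as described above (simplified)."""

    def __init__(self, download_delay=0, timeout=10):
        self.queue = Queue()               # greenlet-safe for many downloaders
        self.seen = ScalableBloomFilter()  # URL de-duplication
        self.download_delay = download_delay
        self.timeout = timeout

    def enqueue_request(self, request):
        if request.url in self.seen:
            logging.debug("ignore duplicated url %s", request.url)
            return
        self.seen.add(request.url)
        self.queue.put(request)

    def next_request(self):
        if self.download_delay:
            gevent.sleep(self.download_delay)   # DOWNLOAD_DELAY behaviour
        try:
            # an empty queue for 3 * TIMEOUT means the crawl is finished
            return self.queue.get(timeout=3 * self.timeout)
        except Empty:
            return None
```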
115 | 130 | 134 | 135 | -------------------------------------------------------------------------------- /doc/_build/html/structure.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 架构概览 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
Architecture overview
============================

PyCreeper's overall architecture breaks down into five parts: the engine, the downloader, the
downloader middlewares, the scheduler and the spider. The data passed between these parts are
Request/Response objects.

The data flows along the green arrows in the figure below.

What each part does
--------------------

[figure: _images/structure.jpg]

Engine -- the core of PyCreeper, responsible for coordinating the work of all other parts.
Internally it is implemented on top of gevent.Pool.

Downloader -- downloads requests, handling static and dynamic requests separately: static requests
are served with the requests library, dynamic requests are loaded with selenium.webdriver. When a
request finishes, the response is handed back to the engine.

Downloader middlewares -- a hook system sitting between the downloader and the engine; by writing
custom downloader middlewares you can apply special processing to requests and responses.

Scheduler -- built around gevent's Queue and a Bloom filter; requests are de-duplicated and
non-duplicate requests are queued until the engine takes them for processing.

Spider -- the user-facing interface: the user defines the start URLs, the callback for each request
and how the crawl results are processed.

Data flow
---------

The data moves through the following steps (a compressed code sketch follows this list):

1. The engine starts and feeds the spider's start_urls into the scheduler.
2. The engine takes a request from the scheduler.
3. The engine hands the request to the downloader, passing through the downloader middlewares' request processing.
4. The downloader acts according to the request type: static requests go to the requests library, dynamic requests are loaded with selenium.webdriver.
5. The downloader returns the response to the engine, passing through the downloader middlewares' response processing.
6. The engine hands the response to the handler defined by the spider.
7. The spider's handler may return a new request (back to step 2) or a dict with crawl results (continue to the next step).
8. The engine processes the result with the result-handling method defined by the spider.
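The eight steps above can be compressed into one loop. The sketch below is only an illustration
under simplifying assumptions: the real engine runs work concurrently on a gevent.Pool, whereas here
everything is sequential, and the helper name crawl_once is made up. Only enqueue_request /
next_request, Downloader.fetch(request, spider), request.callback, Spider.parse and process_item
come from the code and docs in this repository.

```
# Sequential illustration of the data flow above -- NOT the real pycreeper engine.
def crawl_once(spider, scheduler, downloader):
    # step 1: seed the scheduler with the spider's start requests
    for request in spider.start_requests():
        scheduler.enqueue_request(request)

    while True:
        # step 2: take one request from the scheduler
        request = scheduler.next_request()
        if request is None:          # queue stayed empty long enough -> crawl finished
            break
        # steps 3-5: request middlewares, static/dynamic download, response middlewares
        response = downloader.fetch(request, spider)
        # step 6: hand the response to the spider's handler (default: Spider.parse)
        callback = request.callback or spider.parse
        for result in callback(response):
            if isinstance(result, dict):
                # steps 7-8: a result dict goes to the user-defined item processing
                spider.process_item(result)
            else:
                # a new Request goes back to the scheduler (step 2)
                scheduler.enqueue_request(result)
```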
128 | 143 | 147 | 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyCreeper 2 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目底层异步网络I/O使用 **Gevent** 协程库,将网络请求分为静态请求和动态请求, 3 | 静态请求交给 **Requests** 处理,动态请求则使用 **Selenium.Webdriver** 加载。 4 | 5 | 在设计这个项目的过程中,我参考了很多[Scrapy](https://scrapy.org/)的架构和实现方式。Scrapy是一个非常棒的爬虫框架, 6 | 我之前花了很多心血在Scrapy框架之上! 7 | 8 | 这篇PyCreeper初探会编写一个简单的爬虫例子,让您明白PyCreeper大致的工作流程,使您快速上手。 9 | 10 | ## 目标任务 11 | [知乎](https://www.zhihu.com/)与Quora类似,是一个分享知识提出问题的平台。我们的Demo任务是模拟登陆知乎,保存Cookie, 12 | 之后发出一系列静态请求,获取首页的问题题目与描述。 13 | 14 | 由于模拟登陆一步我们采用了基于Selenium.Webdriver的动态请求处理,所以你可以抛开复杂的抓包与分析代码,只需要点几个按钮, 15 | 就像在真实环境登录知乎一样简单便利! 16 | 17 | 18 | ## 定义一个爬虫 19 | 定义一个爬虫类需要需要继承Spider类,代码如下: 20 | 21 | ``` 22 | from pycreeper.spider import Spider 23 | 24 | class Zhihu_Spider(Spider): 25 | pass 26 | ``` 27 | 28 | ## 选择中间件MiddleWares 29 | 对于Spider的中间件选择,通过修改custom_settings对象实现: 30 | 31 | ``` 32 | custom_settings = { 33 | 'DOWNLOADER_MIDDLEWARES': { 34 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 35 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 36 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 37 | }, 38 | 'DRIVER': 'Chrome', 39 | 'DOWNLOAD_DELAY': 2, 40 | 'USER_AGENT_LIST': [ 41 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 42 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 43 | ] 44 | } 45 | ``` 46 | 47 | 其中,DOWNLOADER_MIDDLEWARES是这个爬虫爬取过程中使用的中间件。UserAgentMiddleware提供了一种简单的控制请求User-Agent的方式(只对静态请求有效, 48 | 动态请求的UA取决于使用的WebDriver)。RetryMiddleware对失败的请求(错误的返回码,超时等)进行多次重试。CookiesMiddleware在全体的请求之间共享CookieJar池, 49 | 一组请求可以共享一个CookieJar,CookiesMiddleware维护CookieJar的有效性与一致性。 50 | 51 | DRIVER表明了动态请求的浏览器,这里我们使用Chrome。 52 | 53 | DOWNLOAD_DELAY表明了下载之间的延迟时间(秒),这个选项当网站有某种防爬策略时还是很有用的。 54 | 55 | USER_AGENT_LIST中包含请求使用的User-Agent,UserAgentMiddleware会从中随机取出一个来使用。 56 | 57 | 58 | ## 最开始的请求 59 | 下面这段代码通过重写start_requests方法yield一个PyCreeper请求: 60 | 61 | ``` 62 | def start_requests(self): 63 | 64 | def _login(driver): 65 | driver.find_element_by_name('account').send_keys("username") 66 | driver.find_element_by_name('password').send_keys("password") 67 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 68 | gevent.sleep(5) 69 | 70 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 71 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 72 | ``` 73 | 74 | 在Request对象的参数中,dynamic=True表明这是一个动态请求,将会调用WebDriver加载, 75 | 而browser_actions=[_login]则定义了浏览器加载完成之后进行的动作。本例中输入了用户名与密码,然后点击登录。 76 | gevent.sleep(5)则是令爬虫等待浏览器加载完成。 77 | 78 | meta={"cookiejar": "zhihu"}这个选项表明本次请求产生的Cookie将会被存储在名为zhihu的CookieJar当中 79 | 80 | callback=self.after_login定义了本次响应的处理函数。 81 | 82 | ## 接下来? 
83 | 84 | 接下来一步将在知乎首页中提取问题链接,发出静态问题请求: 85 | 86 | ``` 87 | def after_login(self, response): 88 | html = response.body 89 | selector = etree.HTML(html) 90 | links = selector.xpath('//a[@class="question_link"]') 91 | for link in links: 92 | yield Request('https://www.zhihu.com' + link.attrib["href"], 93 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 94 | ``` 95 | 96 | response.body存储了响应的内容。我们使用了lxml提取html文本中的标签,然后发出一系列静态请求。 97 | 98 | 在获得问题页面的数据之后,我们需要做的是提取出其中的问题标题与详情: 99 | 100 | ``` 101 | def get_item(self, response): 102 | html = response.body 103 | selector = etree.HTML(html) 104 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 105 | body = selector.xpath('//span[@class="RichText"]')[0].text 106 | yield { 107 | 'head': head, 108 | 'body': body 109 | } 110 | ``` 111 | 112 | 过程与上个函数类似,通过xpath定位元素。 113 | 114 | ## 处理你获得的数据 115 | 处理数据通过重写process_item方法实现: 116 | 117 | ``` 118 | def process_item(self, item): 119 | print json.dumps(item, ensure_ascii=False) 120 | ```` 121 | 122 | 这里我们只是将结果打印。 123 | 124 | ## 运行你的爬虫 125 | 最后我们通过这样一段代码运行爬虫: 126 | 127 | ``` 128 | if __name__ == "__main__": 129 | spider = Zhihu_Spider() 130 | spider.start() 131 | ``` 132 | 133 | 完整的代码如下: 134 | 135 | ``` 136 | # -*- coding:utf-8 -*- 137 | 138 | from pycreeper.spider import Spider 139 | from pycreeper.http.request import Request 140 | from lxml import etree 141 | import json 142 | import gevent 143 | 144 | 145 | class Zhihu_Spider(Spider): 146 | 147 | custom_settings = { 148 | 'DOWNLOADER_MIDDLEWARES': { 149 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 150 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 151 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 152 | }, 153 | 'DRIVER': 'Chrome', 154 | 'DOWNLOAD_DELAY': 2, 155 | 'STATIC_REQUEST_SSL_VERIFY': False, 156 | 'USER_AGENT_LIST': [ 157 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 158 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 159 | ] 160 | } 161 | 162 | def start_requests(self): 163 | 164 | def _login(driver): 165 | driver.find_element_by_name('account').send_keys("username") 166 | driver.find_element_by_name('password').send_keys("password") 167 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 168 | gevent.sleep(5) 169 | 170 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 171 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 172 | 173 | def after_login(self, response): 174 | html = response.body 175 | selector = etree.HTML(html) 176 | links = selector.xpath('//a[@class="question_link"]') 177 | for link in links: 178 | yield Request('https://www.zhihu.com' + link.attrib["href"], 179 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 180 | 181 | def get_item(self, response): 182 | html = response.body 183 | selector = etree.HTML(html) 184 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 185 | body = selector.xpath('//span[@class="RichText"]')[0].text 186 | yield { 187 | 'head': head, 188 | 'body': body 189 | } 190 | 191 | def process_item(self, item): 192 | print json.dumps(item, ensure_ascii=False) 193 | 194 | if __name__ == "__main__": 195 | spider = Zhihu_Spider() 196 | spider.start() 197 | 198 | ``` 199 | 200 | ## 写在后面 201 | 项目已经通过PyPi发布,您可以通过以下命令下载: 202 | 203 | ``` 204 | pip install pycreeper 205 | ``` 206 | 207 | 未来我们将会引入Docker的支持。 208 | 209 | 
目前项目刚刚发布1.0.0版本,如果在使用时,遇到各种问题,我们都欢迎您反馈给我们,您可以通过github[项目主页](https://github.com/ZcyAndWt/pyCreeper),也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 210 | 211 | 如果您使用中,觉得本项目有可取之处,提高了您爬取数据的效率,希望您能在github上star本项目。 212 | 您的支持是我们前进最大的动力! 213 | -------------------------------------------------------------------------------- /doc/_build/html/prepare.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 使用前的准备 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
Before you start
============================

We assume you already have Python 2.7 or later installed; if not, please visit the Python website
(https://www.python.org/) and install a suitable version.

PyCreeper depends on the following libraries:

- gevent
- importlib
- requests
- chardet
- w3lib
- six
- pybloom
- Selenium

If you install the project with pip, the dependencies are installed on your machine automatically
(at least in theory).

Installing the project with pip::

    pip install pycreeper

Configuring a Selenium driver
------------------------------

When you want to drive a particular browser, Selenium needs that browser's driver to be installed.
For example, to load requests with Chrome you need to download and install Chromedriver
(https://sites.google.com/a/chromium.org/chromedriver/downloads) and put the executable on your
PATH so that Python can reach it.

A few commonly used drivers:

=========  ===============================================================
Name       Link
=========  ===============================================================
Chrome     https://sites.google.com/a/chromium.org/chromedriver/downloads
Firefox    https://github.com/mozilla/geckodriver/releases
PhantomJS  http://phantomjs.org/download.html
=========  ===============================================================

PhantomJS is a headless WebKit; on a machine without a GUI it is your best choice.

For more detailed Selenium configuration, see http://selenium-python.readthedocs.io/.
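A quick way to confirm that the driver you configured is actually reachable from Python -- a minimal
smoke test, assuming chromedriver is installed and on your PATH (swap in webdriver.Firefox() or
webdriver.PhantomJS() for the other rows of the table):

```
# -*- coding:utf-8 -*-
# Minimal smoke test for the Selenium driver configured above.
from selenium import webdriver

driver = webdriver.Chrome()   # raises immediately if chromedriver is not on PATH
try:
    driver.get("http://httpbin.org/get")
    print driver.title        # any output at all means the driver is working
finally:
    driver.quit()
```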
149 | 164 | 168 | 169 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/intro.rst.txt: -------------------------------------------------------------------------------- 1 | PyCreeper初探 2 | ============== 3 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目底层异步网络I/O使用 **Gevent** 协程库,将网络请求分为静态请求和动态请求, 4 | 静态请求交给 **Requests** 处理,动态请求则使用 **Selenium.Webdriver** 加载。 5 | 6 | 在设计这个项目的过程中,我参考了很多 **Scrapy** (项目网站: https://scrapy.org/)的架构和实现方式。Scrapy是一个非常棒的爬虫框架, 7 | 我之前花了很多心血在Scrapy框架之上! 8 | 9 | 这篇PyCreeper初探会编写一个简单的爬虫例子,让您明白PyCreeper大致的工作流程,使您快速上手。 10 | 11 | 目标任务 12 | --------- 13 | 知乎(https://www.zhihu.com/)与Quora类似,是一个分享知识提出问题的平台。我们的Demo任务是模拟登陆知乎,保存Cookie, 14 | 之后发出一系列静态请求,获取首页的问题题目与描述。 15 | 16 | 由于模拟登陆一步我们采用了基于Selenium.Webdriver的动态请求处理,所以你可以抛开复杂的抓包与分析代码,只需要点几个按钮, 17 | 就像在真实环境登录知乎一样简单便利! 18 | 19 | 20 | 定义一个爬虫 21 | ------------- 22 | 定义一个爬虫类需要需要继承Spider类,代码如下:: 23 | 24 | from pycreeper.spider import Spider 25 | 26 | class Zhihu_Spider(Spider): 27 | pass 28 | 29 | 选择中间件MiddleWares 30 | ---------------------- 31 | 对于Spider的中间件选择,通过修改custom_settings对象实现:: 32 | 33 | custom_settings = { 34 | 'DOWNLOADER_MIDDLEWARES': { 35 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 36 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 37 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 38 | }, 39 | 'DRIVER': 'Chrome', 40 | 'DOWNLOAD_DELAY': 2, 41 | 'USER_AGENT_LIST': [ 42 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 43 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 44 | ] 45 | } 46 | 47 | 其中,DOWNLOADER_MIDDLEWARES是这个爬虫爬取过程中使用的中间件。UserAgentMiddleware提供了一种简单的控制请求User-Agent的方式(只对静态请求有效, 48 | 动态请求的UA取决于使用的WebDriver)。RetryMiddleware对失败的请求(错误的返回码,超时等)进行多次重试。CookiesMiddleware在全体的请求之间共享CookieJar池, 49 | 一组请求可以共享一个CookieJar,CookiesMiddleware维护CookieJar的有效性与一致性。 50 | 51 | DRIVER表明了动态请求的浏览器,这里我们使用Chrome。 52 | 53 | DOWNLOAD_DELAY表明了下载之间的延迟时间(秒),这个选项当网站有某种防爬策略时还是很有用的。 54 | 55 | USER_AGENT_LIST中包含请求使用的User-Agent,UserAgentMiddleware会从中随机取出一个来使用。 56 | 57 | 58 | 最开始的请求 59 | ------------- 60 | 下面这段代码通过重写start_requests方法yield一个PyCreeper请求:: 61 | 62 | def start_requests(self): 63 | 64 | def _login(driver): 65 | driver.find_element_by_name('account').send_keys("username") 66 | driver.find_element_by_name('password').send_keys("password") 67 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 68 | gevent.sleep(5) 69 | 70 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 71 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 72 | 73 | 在Request对象的参数中,dynamic=True表明这是一个动态请求,将会调用WebDriver加载, 74 | 而browser_actions=[_login]则定义了浏览器加载完成之后进行的动作。本例中输入了用户名与密码,然后点击登录。 75 | gevent.sleep(5)则是令爬虫等待浏览器加载完成。 76 | 77 | meta={"cookiejar": "zhihu"}这个选项表明本次请求产生的Cookie将会被存储在名为zhihu的CookieJar当中 78 | 79 | callback=self.after_login定义了本次响应的处理函数。 80 | 81 | 接下来? 
82 | -------- 83 | 接下来一步将在知乎首页中提取问题链接,发出静态问题请求:: 84 | 85 | def after_login(self, response): 86 | html = response.body 87 | selector = etree.HTML(html) 88 | links = selector.xpath('//a[@class="question_link"]') 89 | for link in links: 90 | yield Request('https://www.zhihu.com' + link.attrib["href"], 91 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 92 | 93 | response.body存储了响应的内容。我们使用了lxml提取html文本中的标签,然后发出一系列静态请求。 94 | 95 | 在获得问题页面的数据之后,我们需要做的是提取出其中的问题标题与详情:: 96 | 97 | def get_item(self, response): 98 | html = response.body 99 | selector = etree.HTML(html) 100 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 101 | body = selector.xpath('//span[@class="RichText"]')[0].text 102 | yield { 103 | 'head': head, 104 | 'body': body 105 | } 106 | 107 | 过程与上个函数类似,通过xpath定位元素。 108 | 109 | 处理你获得的数据 110 | ----------------- 111 | 处理数据通过重写process_item方法实现:: 112 | 113 | def process_item(self, item): 114 | print json.dumps(item, ensure_ascii=False) 115 | 116 | 这里我们只是将结果打印。 117 | 118 | 运行你的爬虫 119 | ------------- 120 | 最后我们通过这样一段代码运行爬虫:: 121 | 122 | if __name__ == "__main__": 123 | spider = Zhihu_Spider() 124 | spider.start() 125 | 126 | 完整的代码如下:: 127 | 128 | # -*- coding:utf-8 -*- 129 | 130 | from pycreeper.spider import Spider 131 | from pycreeper.http.request import Request 132 | from lxml import etree 133 | import json 134 | import gevent 135 | 136 | 137 | class Zhihu_Spider(Spider): 138 | 139 | custom_settings = { 140 | 'DOWNLOADER_MIDDLEWARES': { 141 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 142 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 143 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 144 | }, 145 | 'DRIVER': 'Chrome', 146 | 'DOWNLOAD_DELAY': 2, 147 | 'STATIC_REQUEST_SSL_VERIFY': False, 148 | 'USER_AGENT_LIST': [ 149 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 150 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 151 | ] 152 | } 153 | 154 | def start_requests(self): 155 | 156 | def _login(driver): 157 | driver.find_element_by_name('account').send_keys("15501277123") 158 | driver.find_element_by_name('password').send_keys("zcymichael") 159 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 160 | gevent.sleep(5) 161 | 162 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 163 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 164 | 165 | def after_login(self, response): 166 | html = response.body 167 | selector = etree.HTML(html) 168 | links = selector.xpath('//a[@class="question_link"]') 169 | for link in links: 170 | yield Request('https://www.zhihu.com' + link.attrib["href"], 171 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 172 | 173 | def get_item(self, response): 174 | html = response.body 175 | selector = etree.HTML(html) 176 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 177 | body = selector.xpath('//span[@class="RichText"]')[0].text 178 | yield { 179 | 'head': head, 180 | 'body': body 181 | } 182 | 183 | def process_item(self, item): 184 | print json.dumps(item, ensure_ascii=False) 185 | 186 | if __name__ == "__main__": 187 | spider = Zhihu_Spider() 188 | spider.start() 189 | 190 | 191 | 写在后面 192 | --------- 193 | 项目已经通过PyPi发布,您可以通过以下命令下载:: 194 | 195 | pip install pycreeper 196 | 197 | 未来我们将会引入Docker的支持。 198 | 199 | 目前项目刚刚发布1.0.0版本,如果在使用时,遇到各种问题,我们都欢迎您反馈给我们,您可以通过github, 200 | 
项目主页:https://github.com/ZcyAndWt/pyCreeper,也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 201 | 202 | 如果您使用中,觉得本项目有可取之处,提高了您爬取数据的速度,希望您能在github上star本项目。 203 | 您的支持是我们前进最大的动力! 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /doc/tutorial.rst: -------------------------------------------------------------------------------- 1 | PyCreeper初探 2 | ============== 3 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目底层异步网络I/O使用 **Gevent** 协程库,将网络请求分为静态请求和动态请求, 4 | 静态请求交给 **Requests** 处理,动态请求则使用 **Selenium.Webdriver** 加载。 5 | 6 | 在设计这个项目的过程中,我参考了很多 **Scrapy** (项目网站: https://scrapy.org/)的架构和实现方式。Scrapy是一个非常棒的爬虫框架, 7 | 我之前花了很多心血在Scrapy框架之上! 8 | 9 | 这篇PyCreeper初探会编写一个简单的爬虫例子,让您明白PyCreeper大致的工作流程,使您快速上手。 10 | 11 | 如果您的PyCreeper还没有安装好,请参考: :doc:`prepare`。 12 | 13 | 目标任务 14 | --------- 15 | 知乎(https://www.zhihu.com/)与Quora类似,是一个分享知识提出问题的平台。我们的Demo任务是模拟登陆知乎,保存Cookie, 16 | 之后发出一系列静态请求,获取首页的问题题目与描述。 17 | 18 | 由于模拟登陆一步我们采用了基于Selenium.Webdriver的动态请求处理,所以你可以抛开复杂的抓包与分析代码,只需要点几个按钮, 19 | 就像在真实环境登录知乎一样简单便利! 20 | 21 | 22 | 定义一个爬虫 23 | ------------- 24 | 定义一个爬虫类需要需要继承Spider类,代码如下:: 25 | 26 | from pycreeper.spider import Spider 27 | 28 | class Zhihu_Spider(Spider): 29 | pass 30 | 31 | 选择中间件MiddleWares 32 | ---------------------- 33 | 对于Spider的中间件选择,通过修改custom_settings对象实现:: 34 | 35 | custom_settings = { 36 | 'DOWNLOADER_MIDDLEWARES': { 37 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 38 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 39 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 40 | }, 41 | 'DRIVER': 'Chrome', 42 | 'DOWNLOAD_DELAY': 2, 43 | 'USER_AGENT_LIST': [ 44 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 45 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 46 | ] 47 | } 48 | 49 | 其中,DOWNLOADER_MIDDLEWARES是这个爬虫爬取过程中使用的中间件。UserAgentMiddleware提供了一种简单的控制请求User-Agent的方式(只对静态请求有效, 50 | 动态请求的UA取决于使用的WebDriver)。RetryMiddleware对失败的请求(错误的返回码,超时等)进行多次重试。CookiesMiddleware在全体的请求之间共享CookieJar池, 51 | 一组请求可以共享一个CookieJar,CookiesMiddleware维护CookieJar的有效性与一致性。 52 | 53 | DRIVER表明了动态请求的浏览器,这里我们使用Chrome。 54 | 55 | DOWNLOAD_DELAY表明了下载之间的延迟时间(秒),这个选项当网站有某种防爬策略时还是很有用的。 56 | 57 | USER_AGENT_LIST中包含请求使用的User-Agent,UserAgentMiddleware会从中随机取出一个来使用。 58 | 59 | 60 | 最开始的请求 61 | ------------- 62 | 下面这段代码通过重写start_requests方法yield一个PyCreeper请求:: 63 | 64 | def start_requests(self): 65 | 66 | def _login(driver): 67 | driver.find_element_by_name('account').send_keys("username") 68 | driver.find_element_by_name('password').send_keys("password") 69 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 70 | gevent.sleep(5) 71 | 72 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 73 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 74 | 75 | 在Request对象的参数中,dynamic=True表明这是一个动态请求,将会调用WebDriver加载, 76 | 而browser_actions=[_login]则定义了浏览器加载完成之后进行的动作。本例中输入了用户名与密码,然后点击登录。 77 | gevent.sleep(5)则是令爬虫等待浏览器加载完成。 78 | 79 | meta={"cookiejar": "zhihu"}这个选项表明本次请求产生的Cookie将会被存储在名为zhihu的CookieJar当中 80 | 81 | callback=self.after_login定义了本次响应的处理函数。 82 | 83 | 接下来? 
84 | -------- 85 | 接下来一步将在知乎首页中提取问题链接,发出静态问题请求:: 86 | 87 | def after_login(self, response): 88 | html = response.body 89 | selector = etree.HTML(html) 90 | links = selector.xpath('//a[@class="question_link"]') 91 | for link in links: 92 | yield Request('https://www.zhihu.com' + link.attrib["href"], 93 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 94 | 95 | response.body存储了响应的内容。我们使用了lxml提取html文本中的标签,然后发出一系列静态请求。 96 | 97 | 在获得问题页面的数据之后,我们需要做的是提取出其中的问题标题与详情:: 98 | 99 | def get_item(self, response): 100 | html = response.body 101 | selector = etree.HTML(html) 102 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 103 | body = selector.xpath('//span[@class="RichText"]')[0].text 104 | yield { 105 | 'head': head, 106 | 'body': body 107 | } 108 | 109 | 过程与上个函数类似,通过xpath定位元素。 110 | 111 | 处理你获得的数据 112 | ----------------- 113 | 处理数据通过重写process_item方法实现:: 114 | 115 | def process_item(self, item): 116 | print json.dumps(item, ensure_ascii=False) 117 | 118 | 这里我们只是将结果打印。 119 | 120 | 运行你的爬虫 121 | ------------- 122 | 最后我们通过这样一段代码运行爬虫:: 123 | 124 | if __name__ == "__main__": 125 | spider = Zhihu_Spider() 126 | spider.start() 127 | 128 | 完整的代码如下:: 129 | 130 | # -*- coding:utf-8 -*- 131 | 132 | from pycreeper.spider import Spider 133 | from pycreeper.http.request import Request 134 | from lxml import etree 135 | import json 136 | import gevent 137 | 138 | 139 | class Zhihu_Spider(Spider): 140 | 141 | custom_settings = { 142 | 'DOWNLOADER_MIDDLEWARES': { 143 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 144 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 145 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 146 | }, 147 | 'DRIVER': 'Chrome', 148 | 'DOWNLOAD_DELAY': 2, 149 | 'STATIC_REQUEST_SSL_VERIFY': False, 150 | 'USER_AGENT_LIST': [ 151 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 152 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 153 | ] 154 | } 155 | 156 | def start_requests(self): 157 | 158 | def _login(driver): 159 | driver.find_element_by_name('account').send_keys("username") 160 | driver.find_element_by_name('password').send_keys("password") 161 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 162 | gevent.sleep(5) 163 | 164 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 165 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 166 | 167 | def after_login(self, response): 168 | html = response.body 169 | selector = etree.HTML(html) 170 | links = selector.xpath('//a[@class="question_link"]') 171 | for link in links: 172 | yield Request('https://www.zhihu.com' + link.attrib["href"], 173 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 174 | 175 | def get_item(self, response): 176 | html = response.body 177 | selector = etree.HTML(html) 178 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 179 | body = selector.xpath('//span[@class="RichText"]')[0].text 180 | yield { 181 | 'head': head, 182 | 'body': body 183 | } 184 | 185 | def process_item(self, item): 186 | print json.dumps(item, ensure_ascii=False) 187 | 188 | if __name__ == "__main__": 189 | spider = Zhihu_Spider() 190 | spider.start() 191 | 192 | 193 | 写在后面 194 | --------- 195 | 项目已经通过PyPi发布,您可以通过以下命令下载:: 196 | 197 | pip install pycreeper 198 | 199 | 未来我们将会引入Docker的支持。 200 | 201 | 目前项目刚刚发布1.0.0版本,如果在使用时,遇到各种问题,我们都欢迎您反馈给我们,您可以通过github, 202 | 
项目主页:https://github.com/ZcyAndWt/pyCreeper,也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 203 | 204 | 如果您使用中,觉得本项目有可取之处,提高了您爬取数据的效率,希望您能在github上star本项目。 205 | 您的支持是我们前进最大的动力! 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/tutorial.rst.txt: -------------------------------------------------------------------------------- 1 | PyCreeper初探 2 | ============== 3 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目底层异步网络I/O使用 **Gevent** 协程库,将网络请求分为静态请求和动态请求, 4 | 静态请求交给 **Requests** 处理,动态请求则使用 **Selenium.Webdriver** 加载。 5 | 6 | 在设计这个项目的过程中,我参考了很多 **Scrapy** (项目网站: https://scrapy.org/)的架构和实现方式。Scrapy是一个非常棒的爬虫框架, 7 | 我之前花了很多心血在Scrapy框架之上! 8 | 9 | 这篇PyCreeper初探会编写一个简单的爬虫例子,让您明白PyCreeper大致的工作流程,使您快速上手。 10 | 11 | 如果您的PyCreeper还没有安装好,请参考: :doc:`prepare`。 12 | 13 | 目标任务 14 | --------- 15 | 知乎(https://www.zhihu.com/)与Quora类似,是一个分享知识提出问题的平台。我们的Demo任务是模拟登陆知乎,保存Cookie, 16 | 之后发出一系列静态请求,获取首页的问题题目与描述。 17 | 18 | 由于模拟登陆一步我们采用了基于Selenium.Webdriver的动态请求处理,所以你可以抛开复杂的抓包与分析代码,只需要点几个按钮, 19 | 就像在真实环境登录知乎一样简单便利! 20 | 21 | 22 | 定义一个爬虫 23 | ------------- 24 | 定义一个爬虫类需要需要继承Spider类,代码如下:: 25 | 26 | from pycreeper.spider import Spider 27 | 28 | class Zhihu_Spider(Spider): 29 | pass 30 | 31 | 选择中间件MiddleWares 32 | ---------------------- 33 | 对于Spider的中间件选择,通过修改custom_settings对象实现:: 34 | 35 | custom_settings = { 36 | 'DOWNLOADER_MIDDLEWARES': { 37 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 38 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 39 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 40 | }, 41 | 'DRIVER': 'Chrome', 42 | 'DOWNLOAD_DELAY': 2, 43 | 'USER_AGENT_LIST': [ 44 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 45 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 46 | ] 47 | } 48 | 49 | 其中,DOWNLOADER_MIDDLEWARES是这个爬虫爬取过程中使用的中间件。UserAgentMiddleware提供了一种简单的控制请求User-Agent的方式(只对静态请求有效, 50 | 动态请求的UA取决于使用的WebDriver)。RetryMiddleware对失败的请求(错误的返回码,超时等)进行多次重试。CookiesMiddleware在全体的请求之间共享CookieJar池, 51 | 一组请求可以共享一个CookieJar,CookiesMiddleware维护CookieJar的有效性与一致性。 52 | 53 | DRIVER表明了动态请求的浏览器,这里我们使用Chrome。 54 | 55 | DOWNLOAD_DELAY表明了下载之间的延迟时间(秒),这个选项当网站有某种防爬策略时还是很有用的。 56 | 57 | USER_AGENT_LIST中包含请求使用的User-Agent,UserAgentMiddleware会从中随机取出一个来使用。 58 | 59 | 60 | 最开始的请求 61 | ------------- 62 | 下面这段代码通过重写start_requests方法yield一个PyCreeper请求:: 63 | 64 | def start_requests(self): 65 | 66 | def _login(driver): 67 | driver.find_element_by_name('account').send_keys("username") 68 | driver.find_element_by_name('password').send_keys("password") 69 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 70 | gevent.sleep(5) 71 | 72 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 73 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 74 | 75 | 在Request对象的参数中,dynamic=True表明这是一个动态请求,将会调用WebDriver加载, 76 | 而browser_actions=[_login]则定义了浏览器加载完成之后进行的动作。本例中输入了用户名与密码,然后点击登录。 77 | gevent.sleep(5)则是令爬虫等待浏览器加载完成。 78 | 79 | meta={"cookiejar": "zhihu"}这个选项表明本次请求产生的Cookie将会被存储在名为zhihu的CookieJar当中 80 | 81 | callback=self.after_login定义了本次响应的处理函数。 82 | 83 | 接下来? 
84 | -------- 85 | 接下来一步将在知乎首页中提取问题链接,发出静态问题请求:: 86 | 87 | def after_login(self, response): 88 | html = response.body 89 | selector = etree.HTML(html) 90 | links = selector.xpath('//a[@class="question_link"]') 91 | for link in links: 92 | yield Request('https://www.zhihu.com' + link.attrib["href"], 93 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 94 | 95 | response.body存储了响应的内容。我们使用了lxml提取html文本中的标签,然后发出一系列静态请求。 96 | 97 | 在获得问题页面的数据之后,我们需要做的是提取出其中的问题标题与详情:: 98 | 99 | def get_item(self, response): 100 | html = response.body 101 | selector = etree.HTML(html) 102 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 103 | body = selector.xpath('//span[@class="RichText"]')[0].text 104 | yield { 105 | 'head': head, 106 | 'body': body 107 | } 108 | 109 | 过程与上个函数类似,通过xpath定位元素。 110 | 111 | 处理你获得的数据 112 | ----------------- 113 | 处理数据通过重写process_item方法实现:: 114 | 115 | def process_item(self, item): 116 | print json.dumps(item, ensure_ascii=False) 117 | 118 | 这里我们只是将结果打印。 119 | 120 | 运行你的爬虫 121 | ------------- 122 | 最后我们通过这样一段代码运行爬虫:: 123 | 124 | if __name__ == "__main__": 125 | spider = Zhihu_Spider() 126 | spider.start() 127 | 128 | 完整的代码如下:: 129 | 130 | # -*- coding:utf-8 -*- 131 | 132 | from pycreeper.spider import Spider 133 | from pycreeper.http.request import Request 134 | from lxml import etree 135 | import json 136 | import gevent 137 | 138 | 139 | class Zhihu_Spider(Spider): 140 | 141 | custom_settings = { 142 | 'DOWNLOADER_MIDDLEWARES': { 143 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 144 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 145 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 146 | }, 147 | 'DRIVER': 'Chrome', 148 | 'DOWNLOAD_DELAY': 2, 149 | 'STATIC_REQUEST_SSL_VERIFY': False, 150 | 'USER_AGENT_LIST': [ 151 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 152 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 153 | ] 154 | } 155 | 156 | def start_requests(self): 157 | 158 | def _login(driver): 159 | driver.find_element_by_name('account').send_keys("username") 160 | driver.find_element_by_name('password').send_keys("password") 161 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 162 | gevent.sleep(5) 163 | 164 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 165 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 166 | 167 | def after_login(self, response): 168 | html = response.body 169 | selector = etree.HTML(html) 170 | links = selector.xpath('//a[@class="question_link"]') 171 | for link in links: 172 | yield Request('https://www.zhihu.com' + link.attrib["href"], 173 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 174 | 175 | def get_item(self, response): 176 | html = response.body 177 | selector = etree.HTML(html) 178 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 179 | body = selector.xpath('//span[@class="RichText"]')[0].text 180 | yield { 181 | 'head': head, 182 | 'body': body 183 | } 184 | 185 | def process_item(self, item): 186 | print json.dumps(item, ensure_ascii=False) 187 | 188 | if __name__ == "__main__": 189 | spider = Zhihu_Spider() 190 | spider.start() 191 | 192 | 193 | 写在后面 194 | --------- 195 | 项目已经通过PyPi发布,您可以通过以下命令下载:: 196 | 197 | pip install pycreeper 198 | 199 | 未来我们将会引入Docker的支持。 200 | 201 | 目前项目刚刚发布1.0.0版本,如果在使用时,遇到各种问题,我们都欢迎您反馈给我们,您可以通过github, 202 | 
项目主页:https://github.com/ZcyAndWt/pyCreeper,也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 203 | 204 | 如果您使用中,觉得本项目有可取之处,提高了您爬取数据的效率,希望您能在github上star本项目。 205 | 您的支持是我们前进最大的动力! 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /doc/_build/html/http.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | request对象和response对象 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
Request and Response objects
============================

Request and Response objects carry information between the PyCreeper components; while using the
crawler you will often need to work with both of them.

Request: customise your request
--------------------------------

Constructor::

    Request(url, callback=None, method='GET', headers=None, body=None, meta=None,
            encoding='utf-8', cookiejar=None, dynamic=False, browser_actions=None, wait=0)

url
    The URL to request.

callback
    The callback for this request; if it is not given, Spider.parse handles the response.

method
    GET and POST are supported. POST is only accepted while dynamic=False; with dynamic=True an
    AttributeError is raised.

headers
    A dict holding the header information for a static request.

body
    The request body of a static request.

meta
    A dict used to attach extra parameters to the request; other modules may read them.

encoding
    The encoding of the request, used to encode the url and body.

cookiejar
    Used to read the cookiejar carried by the request. Do not pass a value for this parameter when
    constructing a Request; a cookiejar passed in this way will not be used by PyCreeper.

dynamic
    Marks whether this request is a dynamic request.

browser_actions
    A list of functions executed after the browser has opened the given URL and before the page
    data is extracted.

wait
    How long to wait after the browser has opened the given URL and before the functions defined in
    browser_actions are executed. Particularly useful when the page issues a large number of
    asynchronous requests.
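A few examples put together from the constructor arguments documented above. The URLs are just the
httpbin endpoints already used in the test suite, and the _scroll helper is a made-up browser
action; callbacks are omitted, so responses would fall back to Spider.parse.

```
# -*- coding:utf-8 -*-
from pycreeper.http.request import Request

# static GET with custom headers, carrying a named cookiejar in meta
req_get = Request("http://httpbin.org/get",
                  headers={"Referer": "http://httpbin.org/"},
                  meta={"cookiejar": "demo"})

# static POST -- POST is only accepted while dynamic=False
req_post = Request("http://httpbin.org/post", method="POST",
                   body={"text": "pycreeper"})

# dynamic request: loaded by the WebDriver, waits 3 seconds, then runs
# each function in browser_actions against the driver
def _scroll(driver):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

req_dyn = Request("http://httpbin.org/html", dynamic=True, wait=3,
                  browser_actions=[_scroll])
```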
132 | 147 | 151 | 152 | -------------------------------------------------------------------------------- /pycreeper/downloader/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Downloader """ 6 | import cookielib 7 | import requests 8 | from pycreeper.http.response import Response 9 | from pycreeper.downloader_middlewares import DownloaderMiddlewareManager 10 | from selenium.common.exceptions import TimeoutException as _TimeoutException 11 | from pycreeper.utils.exceptions import TimeoutException 12 | from requests.exceptions import Timeout 13 | import six 14 | from pycreeper.utils import _get_cookies_from_cookiejar 15 | import gevent 16 | import traceback 17 | 18 | 19 | class DownloadHandler(object): 20 | """ DownloadHandler """ 21 | 22 | def __init__(self, spider, driver, driver_sem, **kwargs): 23 | self.settings = spider.settings 24 | self.logger = spider.logger 25 | self.session_map = {} 26 | self.kwargs = kwargs 27 | self.driver = driver 28 | self.driver_sem = driver_sem 29 | 30 | def fetch(self, request): 31 | """fetch 32 | """ 33 | url = request.url 34 | if request.dynamic: 35 | return self._fetch_dynamic(request, url) 36 | else: 37 | return self._fetch_static(request, url) 38 | 39 | def _fetch_static(self, request, url): 40 | self.logger.info("processing static page %s", url) 41 | kwargs = { 42 | "timeout": self.settings["TIMEOUT"], 43 | "headers": request.headers, 44 | "verify": self.settings["STATIC_REQUEST_SSL_VERIFY"], 45 | } 46 | if "proxy" in request.meta and request.meta["proxy"]: 47 | kwargs.update(proxies=request.meta["proxy"]) 48 | try: 49 | session = requests.Session() 50 | if request.cookiejar: 51 | session.cookies = request.cookiejar 52 | if request.method == 'GET': 53 | response = session.get(url, **kwargs) 54 | elif request.method == 'POST': 55 | if request.body: 56 | kwargs.update(data=request.body) 57 | response = session.post(url, **kwargs) 58 | else: 59 | raise ValueError('Unacceptable HTTP verb %s' % request.method) 60 | return Response(response.url, request, status=response.status_code, 61 | cookiejar=response.cookies, body=response.content) 62 | except Timeout as e: 63 | raise TimeoutException(e.message) 64 | except Exception as e: 65 | self.logger.error("download error: %s", str(e), exc_info=True) 66 | raise e 67 | 68 | 69 | def _fetch_dynamic(self, request, url): 70 | self.logger.info("processing dynamic page %s", url) 71 | try: 72 | self.driver_sem.acquire() 73 | if request.cookiejar: 74 | cookies = _get_cookies_from_cookiejar(request.cookiejar) 75 | cookies = self._covert_cookies_to_dict(cookies) 76 | #self._removed_first_dot_in_front_of_domain(cookies) 77 | command_list = self._get_command_list(cookies) 78 | # make the current page to have the same domain with cookies 79 | self.driver.get(url) 80 | # load cookies 81 | for command in command_list: 82 | self.driver.execute_script(command) 83 | 84 | self.driver.set_page_load_timeout(self.settings["TIMEOUT"]) 85 | self.driver.get(url) 86 | gevent.sleep(request.wait) 87 | for func in request.browser_actions: 88 | func(self.driver) 89 | url = self.driver.current_url 90 | html = self.driver.page_source 91 | 92 | # generate cookies 93 | all_cookies = self.driver.get_cookies() 94 | self.driver.delete_all_cookies() 95 | self.driver_sem.release() 96 | 97 | all_cookies = self._to_byte(all_cookies) 98 | cookies = [self._make_cookie(**d) for d in 
all_cookies] 99 | 100 | # set cookies to cookiejar 101 | cj = cookielib.CookieJar() 102 | for cookie in cookies: 103 | cj.set_cookie(cookie) 104 | return Response(url, request, cookiejar=cj, body=html) 105 | except _TimeoutException as e: 106 | raise TimeoutException(e.message) 107 | except Exception as e: 108 | self.logger.error("download error: %s", str(e), exc_info=True) 109 | raise e 110 | 111 | def _removed_first_dot_in_front_of_domain(self, cookies): 112 | for cookie in cookies: 113 | for k in cookie: 114 | if k == 'domain' and str(cookie[k]).startswith('.'): 115 | cookie[k] = cookie[k][1:] 116 | 117 | def _get_command_list(self, cookies): 118 | js_list = [] 119 | for cookie in cookies: 120 | item_list = [cookie['name'] + '=' + cookie['value']] 121 | for k in ('domain', 'path', 'expiry'): 122 | if k in cookie and not (cookie[k] is None): 123 | item_list.append(str(k) + '=' + str(cookie[k])) 124 | js_list.append("document.cookie = '%s';\n" % ('; '.join(item_list))) 125 | return js_list 126 | 127 | def _make_cookie(self, **kwargs): 128 | return cookielib.Cookie( 129 | version=0, 130 | name=kwargs.get('name', None), 131 | value=kwargs.get('value', None), 132 | port=None, 133 | port_specified=False, 134 | domain=kwargs.get('domain', None), 135 | domain_specified=True, 136 | domain_initial_dot=False, 137 | path=kwargs.get('path', None), 138 | path_specified=True, 139 | secure=False, 140 | expires=kwargs.get('expires', None), 141 | discard=False, 142 | comment=None, 143 | comment_url=None, 144 | rest=None 145 | ) 146 | 147 | def _covert_cookies_to_dict(self, cookies): 148 | result = [] 149 | for cookie in cookies: 150 | cookie_dict = {} 151 | for key in ['name', 'value', 'domain', 'path', 'expires']: 152 | if getattr(cookie, key): 153 | cookie_dict[key] = getattr(cookie, key) 154 | result.append(cookie_dict) 155 | return result 156 | 157 | def _to_byte(self, cookies): 158 | result = [] 159 | for cookie in cookies: 160 | temp = {} 161 | for key in cookie.keys(): 162 | temp[key.encode('utf-8') if isinstance(key, six.text_type) else key] = \ 163 | cookie[key].encode('utf-8') if isinstance(cookie[key], six.text_type) else cookie[key] 164 | result.append(temp) 165 | return result 166 | 167 | 168 | 169 | 170 | class Downloader(object): 171 | """ Downloader """ 172 | 173 | def __init__(self, spider, driver, driver_sem): 174 | self.hanlder = DownloadHandler(spider, driver, driver_sem) 175 | self.middleware = DownloaderMiddlewareManager(spider) 176 | 177 | def fetch(self, request, spider): 178 | """fetch 179 | 180 | @request, Request, 请求 181 | """ 182 | return self.middleware.download(self._download, request) 183 | 184 | def _download(self, request): 185 | """download 186 | """ 187 | return self.hanlder.fetch(request) 188 | -------------------------------------------------------------------------------- /doc/_build/html/index.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | PyCreeper: 抓取你能看到的一切! — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 44 | 45 |
PyCreeper: scrape everything you can see!
=========================================

PyCreeper is a crawling framework for quickly extracting web content. By driving Selenium.WebDriver
it loads and controls pages dynamically, hoping to spare crawler enthusiasts much of the trouble of
reading page source, capturing HTTP packets and analysing cookies.

Project homepage: https://github.com/ZcyAndWt/pyCreeper

Author's e-mail: zhengchenyu.backend@gmail.com

If you hit any problem, or anything about the project annoys you, please get in touch with us!
131 | 143 | 147 | 148 | -------------------------------------------------------------------------------- /tests/test_downloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | import time 7 | import json 8 | 9 | from pycreeper.utils.exceptions import TimeoutException 10 | import gevent 11 | 12 | from gevent.pool import Pool 13 | from pycreeper.downloader_middlewares.cookies_middlewares import CookiesMiddleware 14 | from pycreeper.downloader import DownloadHandler 15 | from pycreeper.spider import Spider 16 | from pycreeper.http.request import Request 17 | from pycreeper.http.response import Response 18 | from selenium import webdriver 19 | from gevent.lock import BoundedSemaphore 20 | 21 | HTTPBIN_URL = 'http://httpbin.org' 22 | 23 | 24 | 25 | 26 | class DownloadHandlerTest(unittest.TestCase): 27 | def setUp(self): 28 | self.spider = Spider() 29 | self.spider.settings.set('TIMEOUT', 15) 30 | self.driver = None 31 | self.driver_sem = BoundedSemaphore(1) 32 | 33 | def test_concurrency_with_delayed_url(self): 34 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 35 | n = 5 36 | pool = Pool(n) 37 | urls = [] 38 | for i in range(n): 39 | urls.append(HTTPBIN_URL + '/delay/1') 40 | time_start = time.time() 41 | pool.map(dh.fetch, [Request(url) for url in urls]) 42 | time_total = time.time() - time_start 43 | self.assertLess(time_total, n) 44 | 45 | def test_timeout_static(self): 46 | self.spider.settings.set('TIMEOUT', 5) 47 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 48 | self.assertRaises(TimeoutException, dh.fetch, Request(HTTPBIN_URL + '/delay/10')) 49 | 50 | def test_timeout_dynamic(self): 51 | self.driver = webdriver.PhantomJS() 52 | self.spider.settings.set('TIMEOUT', 5) 53 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 54 | self.assertRaises(TimeoutException, dh.fetch, Request(HTTPBIN_URL + '/delay/10', dynamic=True)) 55 | self.driver.close() 56 | 57 | def test_post_data_static(self): 58 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 59 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST')) 60 | self.assertIsInstance(response, Response) 61 | self.assertEqual(response.status, 200) 62 | 63 | def test_post_data_content_static(self): 64 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 65 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST', body={'text': 'pycreeper'})) 66 | self.assertIsInstance(response, Response) 67 | self.assertEqual(json.loads(response.body)['form'], {'text': 'pycreeper'}) 68 | 69 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST', body=u'Unicode测试')) 70 | self.assertEqual(json.loads(response.body)['data'], 'Unicode测试') 71 | 72 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST', body='中文测试')) 73 | self.assertEqual(json.loads(response.body)['data'], '中文测试') 74 | self.assertEqual(response.status, 200) 75 | 76 | def test_get_data(self): 77 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 78 | response = dh.fetch(Request(HTTPBIN_URL + '/get')) 79 | self.assertIsInstance(response, Response) 80 | self.assertEqual(response.status, 200) 81 | 82 | def test_dynamic_request(self): 83 | self.driver = webdriver.PhantomJS() 84 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 85 | request = Request(HTTPBIN_URL + '/get', dynamic=True) 
86 | dh.fetch(request) 87 | self.driver.close() 88 | 89 | def test_dynamic_request_wait(self): 90 | self.driver = webdriver.PhantomJS() 91 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 92 | request = Request(HTTPBIN_URL + '/get', dynamic=True, wait=3) 93 | dh.fetch(request) 94 | self.driver.close() 95 | 96 | def test_dynamic_request_timeout(self): 97 | self.driver = webdriver.PhantomJS() 98 | self.spider.settings.set('TIMEOUT', 5) 99 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 100 | request = Request(HTTPBIN_URL + '/delay/10', dynamic=True) 101 | self.assertRaises(TimeoutException, dh.fetch, request) 102 | self.driver.close() 103 | 104 | def test_dynamic_request_concurrency(self): 105 | self.driver = webdriver.PhantomJS() 106 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 107 | n = 5 108 | pool = Pool(n) 109 | urls = [] 110 | for i in range(n): 111 | urls.append(HTTPBIN_URL + '/delay/1') 112 | time1 = time.time() 113 | pool.map(dh.fetch, [Request(url, dynamic=True, wait=5) for url in urls]) 114 | self.assertGreater(time.time() - time1, n) 115 | self.driver.close() 116 | 117 | def test_dynamic_request_cookie_between_static_and_dynamic(self): 118 | cm = CookiesMiddleware(self.spider, self.spider.settings) 119 | self.driver = webdriver.PhantomJS() 120 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 121 | request = Request(HTTPBIN_URL + '/cookies/set?key1=val1&key2=val2', 122 | dynamic=True, meta={'cookiejar': 'test'}) 123 | response = dh.fetch(request) 124 | cm.process_response(request, response) 125 | request = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test'}) 126 | cm.process_request(request) 127 | response = dh.fetch(request) 128 | self.assertEqual(json.loads(response.body)['cookies'], 129 | {u'key1': u'val1', u'key2': u'val2'}) 130 | self.driver.close() 131 | 132 | def test_dynamic_request_multi_cookiejar(self): 133 | cm = CookiesMiddleware(self.spider, self.spider.settings) 134 | self.driver = webdriver.PhantomJS() 135 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 136 | 137 | # jar 1 138 | request = Request(HTTPBIN_URL + '/cookies/set?key1=val1', 139 | dynamic=True, meta={'cookiejar': 'test1'}) 140 | cm.process_request(request) 141 | response = dh.fetch(request) 142 | cm.process_response(request, response) 143 | 144 | # jar 2 145 | request = Request(HTTPBIN_URL + '/cookies/set?key2=val2', 146 | dynamic=True, meta={'cookiejar': 'test2'}) 147 | cm.process_request(request) 148 | response = dh.fetch(request) 149 | cm.process_response(request, response) 150 | 151 | # test jar2 152 | request = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test2'}) 153 | cm.process_request(request) 154 | response = dh.fetch(request) 155 | cm.process_response(request, response) 156 | self.assertEqual(json.loads(response.body)['cookies'], {u'key2': u'val2'}) 157 | 158 | # test jar1 159 | request = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test1'}) 160 | cm.process_request(request) 161 | response = dh.fetch(request) 162 | cm.process_response(request, response) 163 | self.assertEqual(json.loads(response.body)['cookies'], {u'key1': u'val1'}) 164 | self.driver.close() 165 | 166 | def test_dynamic_request_browser_actions(self): 167 | cm = CookiesMiddleware(self.spider, self.spider.settings) 168 | self.driver = webdriver.Chrome() 169 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 170 | 171 | def _actions(driver): 172 | 
driver.find_element_by_name('account').send_keys("username") 173 | driver.find_element_by_name('password').send_keys("pwd") 174 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 175 | gevent.sleep(5) 176 | 177 | request = Request('https://www.zhihu.com/#signin', 178 | dynamic=True, meta={'cookiejar': 'test'}, 179 | browser_actions=[_actions], 180 | ) 181 | cm.process_request(request) 182 | response = dh.fetch(request) 183 | cm.process_response(request, response) 184 | 185 | request = Request('https://www.zhihu.com', dynamic=True, meta={'cookiejar': 'test'}) 186 | cm.process_request(request) 187 | response = dh.fetch(request) 188 | cm.process_response(request, response) 189 | print response.body 190 | self.driver.close() 191 | 192 | 193 | class DownloadTest(unittest.TestCase): 194 | pass 195 | 196 | 197 | if __name__ == "__main__": 198 | unittest.main() 199 | --------------------------------------------------------------------------------