├── doc ├── _build │ ├── html │ │ ├── .nojekyll │ │ ├── _static │ │ │ ├── custom.css │ │ │ ├── down.png │ │ │ ├── file.png │ │ │ ├── plus.png │ │ │ ├── up.png │ │ │ ├── minus.png │ │ │ ├── comment.png │ │ │ ├── structure.jpg │ │ │ ├── ajax-loader.gif │ │ │ ├── structure.vsdx │ │ │ ├── up-pressed.png │ │ │ ├── comment-bright.png │ │ │ ├── comment-close.png │ │ │ ├── down-pressed.png │ │ │ ├── ~$$structure.~vsdx │ │ │ ├── pygments.css │ │ │ └── nature.css │ │ ├── _sources │ │ │ ├── last.rst.txt │ │ │ ├── spider.rst.txt │ │ │ ├── downloader.rst.txt │ │ │ ├── downloader_middlewares.rst.txt │ │ │ ├── schedular.rst.txt │ │ │ ├── index.rst.txt │ │ │ ├── structure.rst.txt │ │ │ ├── http.rst.txt │ │ │ ├── prepare.rst.txt │ │ │ ├── settings.rst.txt │ │ │ ├── ssettings.rst.txt │ │ │ ├── intro.rst.txt │ │ │ └── tutorial.rst.txt │ │ ├── debug.log │ │ ├── objects.inv │ │ ├── _images │ │ │ └── structure.jpg │ │ ├── .buildinfo │ │ ├── genindex.html │ │ ├── search.html │ │ ├── last.html │ │ ├── spider.html │ │ ├── downloader_middlewares.html │ │ ├── downloader.html │ │ ├── schedular.html │ │ ├── structure.html │ │ ├── prepare.html │ │ ├── http.html │ │ └── index.html │ └── doctrees │ │ ├── http.doctree │ │ ├── last.doctree │ │ ├── index.doctree │ │ ├── intro.doctree │ │ ├── spider.doctree │ │ ├── prepare.doctree │ │ ├── schedular.doctree │ │ ├── settings.doctree │ │ ├── ssettings.doctree │ │ ├── structure.doctree │ │ ├── tutorial.doctree │ │ ├── downloader.doctree │ │ ├── environment.pickle │ │ └── downloader_middlewares.doctree ├── spider.rst ├── downloader.rst ├── _static │ ├── structure.jpg │ └── structure.vsdx ├── downloader_middlewares.rst ├── last.rst ├── Makefile ├── schedular.rst ├── index.rst ├── make.bat ├── structure.rst ├── http.rst ├── prepare.rst ├── settings.rst ├── conf.py └── tutorial.rst ├── tests ├── __init__.py ├── http │ ├── __init__.py │ ├── test_http_request.py │ └── test_http_response.py ├── utils │ ├── __init__.py │ ├── test_utils_log.py │ ├── test_utils_hash.py │ └── test_utils_datatypes.py ├── test_data │ ├── __init__.py │ └── test_settings_data.py ├── test_conf_settings.py ├── test_scheduler.py ├── test_downloader_middlewares.py └── test_downloader.py ├── pycreeper ├── conf │ ├── __init__.py │ ├── settings.py │ └── default_settings.py ├── http │ ├── __init__.py │ ├── response.py │ └── request.py ├── __init__.py ├── utils │ ├── gevent_wrapper.py │ ├── hash.py │ ├── exceptions.py │ ├── log.py │ ├── datatypes.py │ └── __init__.py ├── spider.py ├── downloader_middlewares │ ├── cookies_middlewares.py │ ├── __init__.py │ └── middlewares.py ├── scheduler.py ├── engine.py └── downloader │ └── __init__.py ├── setup.py ├── examples ├── zhihu_spider.py └── jd_spider.py └── README.md /doc/_build/html/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/spider.rst: -------------------------------------------------------------------------------- 1 | spider:爬虫 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/downloader.rst: -------------------------------------------------------------------------------- 1 | downloader:下载器 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/_static/custom.css: -------------------------------------------------------------------------------- 1 | /* This file intentionally 
left blank. */ 2 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/last.rst.txt: -------------------------------------------------------------------------------- 1 | 写在最后 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/spider.rst.txt: -------------------------------------------------------------------------------- 1 | spider:爬虫 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/downloader.rst.txt: -------------------------------------------------------------------------------- 1 | downloader:下载器 2 | ============================ 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/debug.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/debug.log -------------------------------------------------------------------------------- /doc/_build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/objects.inv -------------------------------------------------------------------------------- /doc/_static/structure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_static/structure.jpg -------------------------------------------------------------------------------- /doc/_static/structure.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_static/structure.vsdx -------------------------------------------------------------------------------- /doc/downloader_middlewares.rst: -------------------------------------------------------------------------------- 1 | downloader_middlewares:下载器中间件 2 | ======================================= 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/doctrees/http.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/http.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/last.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/last.doctree -------------------------------------------------------------------------------- /doc/_build/html/_static/down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/down.png -------------------------------------------------------------------------------- /doc/_build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/file.png -------------------------------------------------------------------------------- /doc/_build/html/_static/plus.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/plus.png -------------------------------------------------------------------------------- /doc/_build/html/_static/up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/up.png -------------------------------------------------------------------------------- /doc/_build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/index.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/intro.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/intro.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/spider.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/spider.doctree -------------------------------------------------------------------------------- /doc/_build/html/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/minus.png -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' -------------------------------------------------------------------------------- /doc/_build/doctrees/prepare.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/prepare.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/schedular.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/schedular.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/settings.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/settings.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/ssettings.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/ssettings.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/structure.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/structure.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/tutorial.doctree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/tutorial.doctree -------------------------------------------------------------------------------- /doc/_build/html/_images/structure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_images/structure.jpg -------------------------------------------------------------------------------- /doc/_build/html/_static/comment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/comment.png -------------------------------------------------------------------------------- /doc/_build/html/_static/structure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/structure.jpg -------------------------------------------------------------------------------- /tests/http/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' -------------------------------------------------------------------------------- /doc/_build/doctrees/downloader.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/downloader.doctree -------------------------------------------------------------------------------- /doc/_build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/environment.pickle -------------------------------------------------------------------------------- /doc/_build/html/_static/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/ajax-loader.gif -------------------------------------------------------------------------------- /doc/_build/html/_static/structure.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/structure.vsdx -------------------------------------------------------------------------------- /doc/_build/html/_static/up-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/up-pressed.png -------------------------------------------------------------------------------- /pycreeper/conf/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' -------------------------------------------------------------------------------- /doc/_build/html/_sources/downloader_middlewares.rst.txt: 
-------------------------------------------------------------------------------- 1 | downloader_middlewares:下载器中间件 2 | ======================================= 3 | 4 | -------------------------------------------------------------------------------- /doc/_build/html/_static/comment-bright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/comment-bright.png -------------------------------------------------------------------------------- /doc/_build/html/_static/comment-close.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/comment-close.png -------------------------------------------------------------------------------- /doc/_build/html/_static/down-pressed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/down-pressed.png -------------------------------------------------------------------------------- /doc/_build/html/_static/~$$structure.~vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/~$$structure.~vsdx -------------------------------------------------------------------------------- /pycreeper/http/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | -------------------------------------------------------------------------------- /tests/test_data/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | -------------------------------------------------------------------------------- /doc/_build/doctrees/downloader_middlewares.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/downloader_middlewares.doctree -------------------------------------------------------------------------------- /pycreeper/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | VERSION = (0, 0, 1) 6 | -------------------------------------------------------------------------------- /doc/_build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
3 | config: a4bea7a19f3fdfa82050b591f2231270 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /doc/last.rst: -------------------------------------------------------------------------------- 1 | 写在最后 2 | ============================ 3 | PyCreeper旨在提高爬虫爱好者爬取动态页面的效率,在使用时,如果您遇到各种问题,我们都欢迎您反馈给我们,您可以通过github, 4 | 项目主页:https://github.com/ZcyAndWt/pyCreeper,也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 5 | 6 | 未来我们将引入通过Docker安装的支持。 7 | 8 | 如果您觉得PyCreeper减少了您的工作量,提高了您的开发效率,希望您能在Github上给我们star。您的好评是我们前进的动力! 9 | -------------------------------------------------------------------------------- /pycreeper/utils/gevent_wrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import gevent 6 | 7 | 8 | def spawn(func, *args, **kwargs): 9 | return gevent.spawn(func, *args, **kwargs) 10 | 11 | 12 | def join_all(funcs): 13 | gevent.joinall(funcs) 14 | -------------------------------------------------------------------------------- /tests/test_data/test_settings_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ test settings """ 6 | 7 | TEST_INT = 10 8 | 9 | TEST_JSON = '{"foo": ["bar", "baz"]}' 10 | 11 | TEST_STR = 'foo,bar,baz' 12 | 13 | TEST_DICT = { 14 | "foo": "bar" 15 | } 16 | 17 | TEST_LIST = [ 18 | "foo", 19 | "bar", 20 | "baz" 21 | ] 22 | 23 | TEST_FLOAT = 9.11 24 | 25 | test_lowercase = True 26 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = PyCreeper 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
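# For example, "make html" (or "make.bat html" on Windows) is routed through
# the catch-all target below to sphinx-build's make mode, which renders the
# .rst sources into _build/html, the same HTML tree kept under
# doc/_build/html in this repository.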
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /pycreeper/utils/hash.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import urllib 6 | from urlparse import parse_qsl, urlparse, urlunparse 7 | import hashlib 8 | 9 | 10 | def request_fingerprint(request): 11 | """request fingerprint 12 | """ 13 | scheme, netloc, path, params, query, fragment = urlparse(request.url) 14 | keyvals = parse_qsl(query) 15 | keyvals.sort() 16 | query = urllib.urlencode(keyvals) 17 | canonicalize_url = urlunparse(( 18 | scheme, netloc.lower(), path, params, query, fragment)) 19 | fpr = hashlib.sha1() 20 | fpr.update(canonicalize_url) 21 | return fpr.hexdigest() -------------------------------------------------------------------------------- /doc/schedular.rst: -------------------------------------------------------------------------------- 1 | schedular:调度器 2 | ============================ 3 | 4 | 调度器实现的核心是gevent之中的Queue和布隆过滤器 5 | (Wiki: https://en.wikipedia.org/wiki/Bloom_filter)。 6 | 其中,Queue保证了多个Downloader协程读取队列时的协程安全,布隆过滤器则提供了url去重功能。 7 | 8 | 将请求入队:enqueue_request(request) 9 | -------------------------------------------------- 10 | 11 | request入队时,首先使用布隆过滤器检查url是否已经抓取过。如果没有抓取过则直接入队, 12 | 如果抓取过,则会输出一条logging.DEBUG信息,表示忽略了这个url。 13 | 14 | 取得队列中的请求:next_request() 15 | ----------------------------------------------- 16 | 17 | 这个方法将会从Queue中取出一条request。如果在 **custom_settings** 中设置了 **DOWNLOAD_DELAY** 18 | 项目的话,每次取出request会等待一个固定的时间。 19 | 20 | PyCreeper将 **TIMEOUT** 值的3倍作为检验爬虫结束的标志。具体是指,如果3*TIMEOUT时间之内Queue为空的话, 21 | 那么则认为爬取任务全部结束,爬虫退出。 22 | 23 | -------------------------------------------------------------------------------- /pycreeper/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | class PycreeperException(Exception): 6 | """ 7 | Base pycreeper exception. 
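    For illustration (hypothetical values): str(PycreeperException("boom",
    stacktrace=["frame 1", "frame 2"])) evaluates to a message whose first
    line is "Message: boom", followed by "Stacktrace:" and the two frames,
    as built by __str__ below.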
8 | """ 9 | 10 | def __init__(self, msg=None, stacktrace=None): 11 | self.msg = msg 12 | self.stacktrace = stacktrace 13 | 14 | def __str__(self): 15 | exception_msg = "Message: %s\n" % self.msg 16 | if self.stacktrace is not None: 17 | stacktrace = "\n".join(self.stacktrace) 18 | exception_msg += "Stacktrace:\n%s" % stacktrace 19 | return exception_msg 20 | 21 | 22 | class TimeoutException(PycreeperException): 23 | pass 24 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/schedular.rst.txt: -------------------------------------------------------------------------------- 1 | schedular:调度器 2 | ============================ 3 | 4 | 调度器实现的核心是gevent之中的Queue和布隆过滤器 5 | (Wiki: https://en.wikipedia.org/wiki/Bloom_filter)。 6 | 其中,Queue保证了多个Downloader协程读取队列时的协程安全,布隆过滤器则提供了url去重功能。 7 | 8 | 将请求入队:enqueue_request(request) 9 | -------------------------------------------------- 10 | 11 | request入队时,首先使用布隆过滤器检查url是否已经抓取过。如果没有抓取过则直接入队, 12 | 如果抓取过,则会输出一条logging.DEBUG信息,表示忽略了这个url。 13 | 14 | 取得队列中的请求:next_request() 15 | ----------------------------------------------- 16 | 17 | 这个方法将会从Queue中取出一条request。如果在 **custom_settings** 中设置了 **DOWNLOAD_DELAY** 18 | 项目的话,每次取出request会等待一个固定的时间。 19 | 20 | PyCreeper将 **TIMEOUT** 值的3倍作为检验爬虫结束的标志。具体是指,如果3*TIMEOUT时间之内Queue为空的话, 21 | 那么则认为爬取任务全部结束,爬虫退出。 22 | 23 | -------------------------------------------------------------------------------- /pycreeper/utils/log.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | 6 | import logging 7 | 8 | 9 | def get_logger(settings, name='pyCreeperLogger'): 10 | """Create a Logger 11 | """ 12 | log_level = getattr(logging, settings.get('LOG_LEVEL'), None) 13 | if not log_level: 14 | raise ValueError('Invaild LOG_LEVE. Please check your settings.py.') 15 | logger = logging.getLogger(name) 16 | logger.setLevel(log_level) 17 | stream = logging.StreamHandler() 18 | stream.setLevel(log_level) 19 | formatter = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s") 20 | stream.setFormatter(formatter) 21 | logger.addHandler(stream) 22 | return logger 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="pycreeper", 5 | version="1.0.0", 6 | description='''A web crawler that is able to crawl dynamic web page.''', 7 | author="zcy", 8 | author_email="zhengchenyu.backend@gmail.com", 9 | url="https://github.com/ZcyAndWt/pyCreeper", 10 | license="LGPL", 11 | packages=find_packages(exclude=('doc', 'doc.*', 'tests', 12 | 'tests.*', 'examples', 'examples.*')), 13 | install_requires=[ 14 | 'gevent>=1.2.1', 15 | 'importlib>=1.0.4', 16 | 'requests>=2.8.1', 17 | 'chardet>=2.3.0', 18 | 'w3lib>=1.16.0', 19 | 'six>=1.9.0', 20 | 'pybloom>=1.1', 21 | 'selenium>=2.48.0' 22 | ], 23 | ) 24 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. PyCreeper documentation master file, created by 2 | sphinx-quickstart on Sat Mar 18 20:46:54 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | PyCreeper: 抓取你能看到的一切! 
7 | ================================= 8 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目通过控制 **Selenium.WebDriver** 实现对网页的动态加载与控制, 9 | 希望可以减少爬虫爱好者分析网页源码,抓取http包,分析Cookies等诸多不便。 10 | 11 | 项目主页:https://github.com/ZcyAndWt/pyCreeper 12 | 13 | 作者邮箱:zhengchenyu.backend@gmail.com 14 | 15 | 项目使用过程中,当您发现任何问题或感受到任何不快,请及时联系我们! 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | 20 | 21 | tutorial 22 | prepare 23 | structure 24 | settings 25 | http 26 | downloader 27 | downloader_middlewares 28 | schedular 29 | spider 30 | last 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/index.rst.txt: -------------------------------------------------------------------------------- 1 | .. PyCreeper documentation master file, created by 2 | sphinx-quickstart on Sat Mar 18 20:46:54 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | PyCreeper: 抓取你能看到的一切! 7 | ================================= 8 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目通过控制 **Selenium.WebDriver** 实现对网页的动态加载与控制, 9 | 希望可以减少爬虫爱好者分析网页源码,抓取http包,分析Cookies等诸多不便。 10 | 11 | 项目主页:https://github.com/ZcyAndWt/pyCreeper 12 | 13 | 作者邮箱:zhengchenyu.backend@gmail.com 14 | 15 | 项目使用过程中,当您发现任何问题或感受到任何不快,请及时联系我们! 16 | 17 | .. toctree:: 18 | :maxdepth: 2 19 | 20 | 21 | tutorial 22 | prepare 23 | structure 24 | settings 25 | http 26 | downloader 27 | downloader_middlewares 28 | schedular 29 | spider 30 | last 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=PyCreeper 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /tests/utils/test_utils_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | 6 | import unittest 7 | import logging 8 | 9 | from pycreeper.conf.settings import Settings 10 | from pycreeper.utils.log import get_logger 11 | 12 | 13 | class SettingsTest(unittest.TestCase): 14 | 15 | def test_get_logger(self): 16 | settings = Settings() 17 | logger = get_logger(settings, 'testLogger') 18 | self.assertEqual(logger.level, logging.DEBUG) 19 | 20 | settings.set('LOG_LEVEL', 'INFO') 21 | logger = get_logger(settings, 'testLogger') 22 | self.assertEqual(logger.level, logging.INFO) 23 | 24 | settings.set('LOG_LEVEL', 'foo') 25 | self.assertRaises(ValueError, get_logger, settings, 'testLogger') 26 | 27 | self.assertEqual(logger.name, 'testLogger') 28 | 29 | 30 | 31 | if __name__ == "__main__": 32 | unittest.main() -------------------------------------------------------------------------------- /doc/structure.rst: -------------------------------------------------------------------------------- 1 | 架构概览 2 | ========== 3 | PyCreeper的整体架构可以分为引擎,下载器,下载器中间件,调度器,爬虫五个部分。 4 | 在各个部分之间传递的数据为Request/Response对象。 5 | 6 | 数据的流动方向如下图的绿色箭头所示。 7 | 8 | 各个部分的功能简述 9 | -------------------- 10 | 11 | .. image:: _static/structure.jpg 12 | 13 | ------------------------------------ 14 | 15 | **引擎** 是PyCreeper的核心部分,负责调度各个部分的工作。引擎在内部的实现为gevent.Pool。 16 | 17 | **下载器** 负责下载request请求,在这里将静态请求与动态请求分别处理,静态请求使用requests库实现, 18 | 动态请求使用selenium.webdriver实现。在请求完成后,将响应返回给引擎。 19 | 20 | **下载器中间件** 可以理解为存在于下载器和引擎之间的钩子系统,可以通过自定义下载器中间件完成对request和response的特殊处理。 21 | 22 | **调度器** 调度器实现的核心为gevent中的Queue和布隆过滤器,通过对requests进行判重,非重复请求入队,等待引擎取走处理。 23 | 24 | **爬虫** 爬虫相当于对用户定义的接口,由用户来定义起始的url,对于各个request的callback以及对于爬取结果的处理方法。 25 | 26 | 数据流动过程 27 | ------------- 28 | 29 | 数据流动的过程如下面各个步骤所示: 30 | 31 | #. 引擎启动,将爬虫中的start_urls加入到调度器中。 32 | 33 | #. 引擎从调度器中取得一个request。 34 | 35 | #. 引擎将请求交给下载器处理,中间经过了下载器中间件对于request的处理。 36 | 37 | #. 下载器根据request的类型分别操作,静态请求交给requests库,动态请求使用selenium.webdriver加载。 38 | 39 | #. 下载器将response返回给引擎,中间经过下载器中间件对response的处理。 40 | 41 | #. 引擎将response交给爬虫定义的处理方法。 42 | 43 | #. 爬虫的处理方法可能返回一个request(转2),或者返回一个包含爬取结果的字典(转下一个)。 44 | 45 | #. 引擎根据爬虫定义的对于爬取结果的处理方法,处理结果。 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/structure.rst.txt: -------------------------------------------------------------------------------- 1 | 架构概览 2 | ========== 3 | PyCreeper的整体架构可以分为引擎,下载器,下载器中间件,调度器,爬虫五个部分。 4 | 在各个部分之间传递的数据为Request/Response对象。 5 | 6 | 数据的流动方向如下图的绿色箭头所示。 7 | 8 | 各个部分的功能简述 9 | -------------------- 10 | 11 | .. 
image:: _static/structure.jpg 12 | 13 | ------------------------------------ 14 | 15 | **引擎** 是PyCreeper的核心部分,负责调度各个部分的工作。引擎在内部的实现为gevent.Pool。 16 | 17 | **下载器** 负责下载request请求,在这里将静态请求与动态请求分别处理,静态请求使用requests库实现, 18 | 动态请求使用selenium.webdriver实现。在请求完成后,将响应返回给引擎。 19 | 20 | **下载器中间件** 可以理解为存在于下载器和引擎之间的钩子系统,可以通过自定义下载器中间件完成对request和response的特殊处理。 21 | 22 | **调度器** 调度器实现的核心为gevent中的Queue和布隆过滤器,通过对requests进行判重,非重复请求入队,等待引擎取走处理。 23 | 24 | **爬虫** 爬虫相当于对用户定义的接口,由用户来定义起始的url,对于各个request的callback以及对于爬取结果的处理方法。 25 | 26 | 数据流动过程 27 | ------------- 28 | 29 | 数据流动的过程如下面各个步骤所示: 30 | 31 | #. 引擎启动,将爬虫中的start_urls加入到调度器中。 32 | 33 | #. 引擎从调度器中取得一个request。 34 | 35 | #. 引擎将请求交给下载器处理,中间经过了下载器中间件对于request的处理。 36 | 37 | #. 下载器根据request的类型分别操作,静态请求交给requests库,动态请求使用selenium.webdriver加载。 38 | 39 | #. 下载器将response返回给引擎,中间经过下载器中间件对response的处理。 40 | 41 | #. 引擎将response交给爬虫定义的处理方法。 42 | 43 | #. 爬虫的处理方法可能返回一个request(转2),或者返回一个包含爬取结果的字典(转下一个)。 44 | 45 | #. 引擎根据爬虫定义的对于爬取结果的处理方法,处理结果。 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /doc/http.rst: -------------------------------------------------------------------------------- 1 | request对象和response对象 2 | ============================ 3 | 4 | request对象和response对象负责在各个PyCreeper组件之间传递信息,您在使用爬虫的过程中,会经常需要对这两个对象进行操作。 5 | 6 | Request:自定义您的请求 7 | ----------------------------- 8 | 9 | 构造参数:: 10 | 11 | Request(url, callback=None, method='GET', headers=None,body=None, meta=None, 12 | encoding='utf-8', cookiejar=None,dynamic=False, browser_actions=None, wait=0) 13 | 14 | **url** 15 | 16 | 请求的url 17 | 18 | **callback** 19 | 20 | 请求的回调函数,如果未定义则使用Spider.parse方法处理响应。 21 | 22 | **method** 23 | 24 | 支持GET型和POST型请求方法,其中,POST方法只有当dynamic=False时才会被支持, 25 | 如果dynamic=True将会抛出一个AttributeError。 26 | 27 | **headers** 28 | 29 | 该参数可以传入一个字典(dict),用于静态请求的头部信息。 30 | 31 | **body** 32 | 33 | 该参数用于静态请求的请求体。 34 | 35 | **meta** 36 | 37 | 该参数为字典(dict)型,用于给request携带一些参数,这些参数可能在其他模块用到。 38 | 39 | **encoding** 40 | 41 | 请求的编码方式,用于给url和body编码。 42 | 43 | **cookiejar** 44 | 45 | 该参数用于取出request携带的cookiejar,在构造request对象时请不要向该参数传入值,传入的cookiejar不会被PyCreeper使用到。 46 | 47 | **dynamic** 48 | 49 | 该参数用于标记request是否是动态请求。 50 | 51 | **browser_actions** 52 | 53 | 该参数用于定义浏览器打开指定网址之后,到提取数据之前,执行的一系列操作。该参数可以传入一个函数列表。 54 | 55 | **wait** 56 | 57 | 该参数用于定义浏览器打开指定网址之后,到执行browser_actions中定义的函数之前,等待的时间。 58 | 当网页存在大量异步加载请求的时候,这个参数格外有用。 -------------------------------------------------------------------------------- /doc/_build/html/_sources/http.rst.txt: -------------------------------------------------------------------------------- 1 | request对象和response对象 2 | ============================ 3 | 4 | request对象和response对象负责在各个PyCreeper组件之间传递信息,您在使用爬虫的过程中,会经常需要对这两个对象进行操作。 5 | 6 | Request:自定义您的请求 7 | ----------------------------- 8 | 9 | 构造参数:: 10 | 11 | Request(url, callback=None, method='GET', headers=None,body=None, meta=None, 12 | encoding='utf-8', cookiejar=None,dynamic=False, browser_actions=None, wait=0) 13 | 14 | **url** 15 | 16 | 请求的url 17 | 18 | **callback** 19 | 20 | 请求的回调函数,如果未定义则使用Spider.parse方法处理响应。 21 | 22 | **method** 23 | 24 | 支持GET型和POST型请求方法,其中,POST方法只有当dynamic=False时才会被支持, 25 | 如果dynamic=True将会抛出一个AttributeError。 26 | 27 | **headers** 28 | 29 | 该参数可以传入一个字典(dict),用于静态请求的头部信息。 30 | 31 | **body** 32 | 33 | 该参数用于静态请求的请求体。 34 | 35 | **meta** 36 | 37 | 该参数为字典(dict)型,用于给request携带一些参数,这些参数可能在其他模块用到。 38 | 39 | **encoding** 40 | 41 | 请求的编码方式,用于给url和body编码。 42 | 43 | **cookiejar** 44 | 45 | 该参数用于取出request携带的cookiejar,在构造request对象时请不要向该参数传入值,传入的cookiejar不会被PyCreeper使用到。 46 | 47 | 
**dynamic** 48 | 49 | 该参数用于标记request是否是动态请求。 50 | 51 | **browser_actions** 52 | 53 | 该参数用于定义浏览器打开指定网址之后,到提取数据之前,执行的一系列操作。该参数可以传入一个函数列表。 54 | 55 | **wait** 56 | 57 | 该参数用于定义浏览器打开指定网址之后,到执行browser_actions中定义的函数之前,等待的时间。 58 | 当网页存在大量异步加载请求的时候,这个参数格外有用。 -------------------------------------------------------------------------------- /doc/prepare.rst: -------------------------------------------------------------------------------- 1 | 使用前的准备 2 | ============== 3 | 我们假定您已经安装了Python2.7及以上版本,若没有安装,请参考Python官网(https://www.python.org/)选择合适的版本进行安装。 4 | 5 | PyCreeper对于以下几个库存在依赖关系: 6 | 7 | * gevent 8 | * importlib 9 | * requests 10 | * chardet 11 | * w3lib 12 | * six 13 | * pybloom 14 | * Selenium 15 | 16 | 当然,如果您选择使用pip安装本项目,那么依赖库会自动安装到您的电脑内(至少理论上会是这样)。 17 | 18 | 使用pip安装项目:: 19 | 20 | pip install pycreeper 21 | 22 | 配置Selenium Driver 23 | --------------------- 24 | 当您希望调用指定的浏览器时,Selenium需要您安装指定浏览器的接口。 25 | 举例来说,如果您希望使用Chrome加载请求,您需要下载安装 *Chromedriver* (https://sites.google.com/a/chromium.org/chromedriver/downloads), 26 | 然后将该程序放在您的PATH之下,确保Python能访问到它。 27 | 28 | 几个常用的Driver: 29 | 30 | ============== ======================================================================= 31 | 名称 link 32 | ============== ======================================================================= 33 | Chrome https://sites.google.com/a/chromium.org/chromedriver/downloads 34 | Firefox https://github.com/mozilla/geckodriver/releases 35 | PhantomJS http://phantomjs.org/download.html 36 | ============== ======================================================================= 37 | 38 | 其中,PhantomJS是一款无界面化WebKit,当您在无GUI设备的情况下,该浏览器是您最好的选择。 39 | 40 | 对于Selenium更详细的配置,请参考 http://selenium-python.readthedocs.io/ 41 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/prepare.rst.txt: -------------------------------------------------------------------------------- 1 | 使用前的准备 2 | ============== 3 | 我们假定您已经安装了Python2.7及以上版本,若没有安装,请参考Python官网(https://www.python.org/)选择合适的版本进行安装。 4 | 5 | PyCreeper对于以下几个库存在依赖关系: 6 | 7 | * gevent 8 | * importlib 9 | * requests 10 | * chardet 11 | * w3lib 12 | * six 13 | * pybloom 14 | * Selenium 15 | 16 | 当然,如果您选择使用pip安装本项目,那么依赖库会自动安装到您的电脑内(至少理论上会是这样)。 17 | 18 | 使用pip安装项目:: 19 | 20 | pip install pycreeper 21 | 22 | 配置Selenium Driver 23 | --------------------- 24 | 当您希望调用指定的浏览器时,Selenium需要您安装指定浏览器的接口。 25 | 举例来说,如果您希望使用Chrome加载请求,您需要下载安装 *Chromedriver* (https://sites.google.com/a/chromium.org/chromedriver/downloads), 26 | 然后将该程序放在您的PATH之下,确保Python能访问到它。 27 | 28 | 几个常用的Driver: 29 | 30 | ============== ======================================================================= 31 | 名称 link 32 | ============== ======================================================================= 33 | Chrome https://sites.google.com/a/chromium.org/chromedriver/downloads 34 | Firefox https://github.com/mozilla/geckodriver/releases 35 | PhantomJS http://phantomjs.org/download.html 36 | ============== ======================================================================= 37 | 38 | 其中,PhantomJS是一款无界面化WebKit,当您在无GUI设备的情况下,该浏览器是您最好的选择。 39 | 40 | 对于Selenium更详细的配置,请参考 http://selenium-python.readthedocs.io/ 41 | -------------------------------------------------------------------------------- /tests/utils/test_utils_hash.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | 7 | from pycreeper.utils.hash import 
request_fingerprint 8 | from pycreeper.http.request import Request 9 | 10 | __doctests__ = ['pycreeper.utils.hash'] 11 | 12 | URLS = [ 13 | 'http://www.example.com/index.html#print', 14 | 'http://www.example.com/index.html', 15 | 'http://www.xxx.com/index.html?id=77&nameid=2905210001&page=1', 16 | 'http://www.xxxxx.com/index.html?id=77&nameid=2905210001&page=1', 17 | 'http://www.xxxxx.com/index.html?test123123', 18 | 'http://www.xxxxx.com/index.html', 19 | 'ftp://www.xxxxx.com/index.html' 20 | ] 21 | 22 | REQUEST = [Request(url) for url in URLS] 23 | 24 | 25 | class RequestFingerprintTest(unittest.TestCase): 26 | 27 | def test_basic(self): 28 | self.assertRaises(AttributeError, request_fingerprint, None) 29 | self.assertNotEqual(REQUEST[0], REQUEST[1]) 30 | 31 | def test_not_equal(self): 32 | self.assertNotEqual(REQUEST[2], REQUEST[3]) 33 | self.assertNotEqual(REQUEST[3], REQUEST[4]) 34 | self.assertNotEqual(REQUEST[3], REQUEST[4]) 35 | self.assertNotEqual(REQUEST[4], REQUEST[5]) 36 | self.assertNotEqual(REQUEST[5], REQUEST[6]) 37 | 38 | if __name__ == "__main__": 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /pycreeper/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Base Spider""" 6 | 7 | import json 8 | 9 | from pycreeper.conf.settings import Settings 10 | from pycreeper.http.request import Request 11 | from pycreeper.engine import Engine 12 | from pycreeper.utils.log import get_logger 13 | 14 | 15 | class Spider(object): 16 | """ Base Spider""" 17 | 18 | custom_settings = None 19 | 20 | def __init__(self): 21 | if not hasattr(self, "start_urls"): 22 | self.start_urls = [] 23 | # init settings 24 | self.settings = Settings(self.custom_settings) 25 | self.logger = get_logger(self.settings) 26 | self.initialize() 27 | 28 | def initialize(self): 29 | """initialize 30 | """ 31 | pass 32 | 33 | def start_requests(self): 34 | """start_requests 35 | """ 36 | for url in self.start_urls: 37 | yield Request(url) 38 | 39 | def start(self): 40 | """start 41 | """ 42 | engine = Engine(self) 43 | engine.start() 44 | 45 | def parse(self, response): 46 | """parse 47 | """ 48 | raise NotImplementedError 49 | 50 | def process_item(self, item): 51 | """process item 52 | """ 53 | self.logger.debug(json.dumps(item)) 54 | -------------------------------------------------------------------------------- /pycreeper/downloader_middlewares/cookies_middlewares.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | from pycreeper.downloader_middlewares import DownloaderMiddleware 6 | import six 7 | from collections import defaultdict 8 | from pycreeper.utils import _get_cookies_from_cookiejar 9 | from pycreeper.http.response import Response 10 | from cookielib import CookieJar 11 | 12 | 13 | class CookiesMiddleware(DownloaderMiddleware): 14 | """This middleware enables working with sites that need cookies""" 15 | 16 | def __init__(self, settings, logger): 17 | self.jars = defaultdict(CookieJar) 18 | self.settings = settings 19 | self.logger = logger 20 | 21 | def process_request(self, request): 22 | if not request.meta or request.meta.get("cookiejar", None) is None: 23 | return 24 | cookiejarkey = request.meta.get("cookiejar") 25 | jar = 
self.jars[cookiejarkey] 26 | # set CookieJar 27 | request.cookiejar = jar 28 | 29 | def process_response(self, request, response): 30 | if not request.meta or request.meta.get("cookiejar", None) is None: 31 | return response 32 | # extract cookies from response.cookiejar 33 | cookiejarkey = request.meta.get("cookiejar") 34 | jar = self.jars[cookiejarkey] 35 | cookies = _get_cookies_from_cookiejar(response.cookiejar) 36 | for cookie in cookies: 37 | jar.set_cookie(cookie) 38 | return response -------------------------------------------------------------------------------- /pycreeper/scheduler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Scheduler """ 6 | 7 | from gevent.queue import Queue 8 | from pybloom import ScalableBloomFilter 9 | import gevent 10 | from pycreeper.utils.hash import request_fingerprint 11 | 12 | 13 | class Scheduler(object): 14 | """ Scheduler """ 15 | 16 | def __init__(self, spider): 17 | self.request_filter = RequestFilter() 18 | self.queue = Queue() 19 | self.settings = spider.settings 20 | self.timeout = self.settings.get('TIMEOUT', 5) 21 | self.download_delay = self.settings.get('DOWNLOAD_DELAY', 0) 22 | self.logger = spider.logger 23 | 24 | def enqueue_request(self, request): 25 | """put request 26 | """ 27 | if self.request_filter.request_seen(request): 28 | self.logger.debug("ignore %s", request.url) 29 | return 30 | self.queue.put(request) 31 | 32 | def next_request(self): 33 | """next request 34 | """ 35 | gevent.sleep(self.download_delay) 36 | return self.queue.get(timeout=self.timeout * 3) 37 | 38 | def __len__(self): 39 | return self.queue.qsize() 40 | 41 | 42 | class RequestFilter(object): 43 | """ RequestFilter """ 44 | 45 | def __init__(self): 46 | self.sbf = ScalableBloomFilter( 47 | mode=ScalableBloomFilter.SMALL_SET_GROWTH) 48 | 49 | def request_seen(self, request): 50 | """request seen 51 | """ 52 | finger = request_fingerprint(request) 53 | if finger in self.sbf: 54 | return True 55 | self.sbf.add(finger) 56 | return False 57 | -------------------------------------------------------------------------------- /pycreeper/utils/datatypes.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | 6 | class CaselessDict(dict): 7 | __slots__ = () 8 | 9 | def __init__(self, seq=None): 10 | super(CaselessDict, self).__init__() 11 | if seq: 12 | self.update(seq) 13 | 14 | def __getitem__(self, key): 15 | return dict.__getitem__(self, self.normkey(key)) 16 | 17 | def __setitem__(self, key, value): 18 | dict.__setitem__(self, self.normkey(key), self.normvalue(value)) 19 | 20 | def __delitem__(self, key): 21 | dict.__delitem__(self, self.normkey(key)) 22 | 23 | def __contains__(self, key): 24 | return dict.__contains__(self, self.normkey(key)) 25 | 26 | has_key = __contains__ 27 | 28 | def __copy__(self): 29 | return self.__class__(self) 30 | 31 | copy = __copy__ 32 | 33 | def normkey(self, key): 34 | """Method to normalize dictionary key access""" 35 | return key.lower() 36 | 37 | def normvalue(self, value): 38 | """Method to normalize values prior to be setted""" 39 | return value 40 | 41 | def get(self, key, def_val=None): 42 | return dict.get(self, self.normkey(key), self.normvalue(def_val)) 43 | 44 | def setdefault(self, key, def_val=None): 45 | return 
dict.setdefault(self, self.normkey(key), self.normvalue(def_val)) 46 | 47 | def update(self, seq): 48 | seq = seq.items() 49 | iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq) 50 | super(CaselessDict, self).update(iseq) 51 | 52 | @classmethod 53 | def fromkeys(cls, keys, value=None): 54 | return cls((k, value) for k in keys) 55 | 56 | def pop(self, key, *args): 57 | return dict.pop(self, self.normkey(key), *args) 58 | -------------------------------------------------------------------------------- /doc/settings.rst: -------------------------------------------------------------------------------- 1 | settings:项目设置 2 | ===================== 3 | 4 | 这篇文档主要介绍项目的设定(settings)参数和其默认值。 5 | 6 | 如何覆盖项目的默认设定? 7 | -------------------------- 8 | 9 | 可以在您定义的爬虫中设置 **custom_settings** 属性,覆盖掉PyCreeper的默认设定。 10 | 11 | 示例:: 12 | 13 | custom_settings = { 14 | 'DOWNLOADER_MIDDLEWARES': { 15 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 16 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 17 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300, 18 | 'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400 19 | }, 20 | 'DRIVER': 'Chrome', 21 | 'DOWNLOAD_DELAY': 2, 22 | 'USER_AGENT_LIST': [ 23 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 24 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 25 | ] 26 | } 27 | 28 | 设定的可选参数和默认值 29 | --------------------------- 30 | 31 | **LOG_LEVEL** 32 | 33 | 该参数为byte类型,默认值为DEBUG,该参数控制PyCreeper日志的输出等级。 34 | 35 | **RETRY_COUNT** 36 | 37 | 该参数为数值型,默认值为3,表示对于失败请求的最大尝试次数(该参数只对静态请求有效)。 38 | 39 | **RETRY_STATUS_CODES** 40 | 41 | 该参数为list型,默认值为[500, 502, 503, 504, 400, 403, 408],表示返回码在列表中的请求将会被重发(该参数只对静态请求有效)。 42 | 43 | **TIMEOUT** 44 | 45 | 该参数为数值型,默认值为5,表示发出请求定义的超时时间(秒)。 46 | 47 | **MAX_REQUEST_SIZE** 48 | 49 | 该参数为int型,默认值为20,表示可以同时进行的静态请求个数(该参数只对静态请求有效)。 50 | 51 | **USER_AGENT_LIST** 52 | 53 | 该参数为list型,默认值为空列表,表示发送请求时可以携带的User-Agent(需要使用UserAgentMiddleware,该参数只对静态请求有效)。 54 | 55 | **DOWNLOADER_MIDDLEWARES** 56 | 57 | 该参数为dict型,默认值为空字典,表示使用的下载器中间件。字典的key值为希望使用的中间件的reference, 58 | value值为该中间件的优先级,优先级越高的中间件将会越先被使用。 59 | 60 | **DYNAMIC_CRAWL** 61 | 62 | 该参数为bool型,默认值为True,表示引擎是否加载WebDriver。如果在设为False的情况下发出了一系列动态请求,将会引发一系列异常。 63 | 64 | **DRIVER** 65 | 66 | 该参数为byte型,默认值为Firefox,表示PyCreeper使用的Driver类型。可以选择任意一种Selenium支持的Driver,前提是需要配置好Driver的相关环境。 67 | 68 | **DRIVER_INIT_KWARGS** 69 | 70 | 该参数为dict型,默认为空字典,表示启动Driver时传入的参数,您可以通过定义该值修改Driver的属性。 71 | 72 | **DOWNLOAD_DELAY** 73 | 74 | 该参数为数值型,默认值为0,表示下载延迟(秒)。 75 | 76 | **PROXY_INTERVAL** 77 | 78 | 该参数为数值型,默认值为3,表示每个代理使用的最大时间。使用proxy需要搭配ProxyMiddleware, 79 | 并且此处的proxy只对静态请求有效。如果您想配置动态请求的proxy,可以设置DRIVER_INIT_KWARGS参数,在Driver启动时传入配置信息。 80 | 81 | **PROXY_LIST** 82 | 83 | 该参数为list型,默认为空数组,表示请求可以用到的proxy。格式为'IP:端口号'。 84 | 85 | **STATIC_REQUEST_SSL_VERIFY** 86 | 87 | 该参数为bool型,默认值为True,表示发起静态请求是,是否进行ssl认证。 88 | 该参数用于在使用代理的情况下,https认证失败的情况。 -------------------------------------------------------------------------------- /doc/_build/html/_sources/settings.rst.txt: -------------------------------------------------------------------------------- 1 | settings:项目设置 2 | ===================== 3 | 4 | 这篇文档主要介绍项目的设定(settings)参数和其默认值。 5 | 6 | 如何覆盖项目的默认设定? 
7 | -------------------------- 8 | 9 | 可以在您定义的爬虫中设置 **custom_settings** 属性,覆盖掉PyCreeper的默认设定。 10 | 11 | 示例:: 12 | 13 | custom_settings = { 14 | 'DOWNLOADER_MIDDLEWARES': { 15 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 16 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 17 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300, 18 | 'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400 19 | }, 20 | 'DRIVER': 'Chrome', 21 | 'DOWNLOAD_DELAY': 2, 22 | 'USER_AGENT_LIST': [ 23 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 24 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 25 | ] 26 | } 27 | 28 | 设定的可选参数和默认值 29 | --------------------------- 30 | 31 | **LOG_LEVEL** 32 | 33 | 该参数为byte类型,默认值为DEBUG,该参数控制PyCreeper日志的输出等级。 34 | 35 | **RETRY_COUNT** 36 | 37 | 该参数为数值型,默认值为3,表示对于失败请求的最大尝试次数(该参数只对静态请求有效)。 38 | 39 | **RETRY_STATUS_CODES** 40 | 41 | 该参数为list型,默认值为[500, 502, 503, 504, 400, 403, 408],表示返回码在列表中的请求将会被重发(该参数只对静态请求有效)。 42 | 43 | **TIMEOUT** 44 | 45 | 该参数为数值型,默认值为5,表示发出请求定义的超时时间(秒)。 46 | 47 | **MAX_REQUEST_SIZE** 48 | 49 | 该参数为int型,默认值为20,表示可以同时进行的静态请求个数(该参数只对静态请求有效)。 50 | 51 | **USER_AGENT_LIST** 52 | 53 | 该参数为list型,默认值为空列表,表示发送请求时可以携带的User-Agent(需要使用UserAgentMiddleware,该参数只对静态请求有效)。 54 | 55 | **DOWNLOADER_MIDDLEWARES** 56 | 57 | 该参数为dict型,默认值为空字典,表示使用的下载器中间件。字典的key值为希望使用的中间件的reference, 58 | value值为该中间件的优先级,优先级越高的中间件将会越先被使用。 59 | 60 | **DYNAMIC_CRAWL** 61 | 62 | 该参数为bool型,默认值为True,表示引擎是否加载WebDriver。如果在设为False的情况下发出了一系列动态请求,将会引发一系列异常。 63 | 64 | **DRIVER** 65 | 66 | 该参数为byte型,默认值为Firefox,表示PyCreeper使用的Driver类型。可以选择任意一种Selenium支持的Driver,前提是需要配置好Driver的相关环境。 67 | 68 | **DRIVER_INIT_KWARGS** 69 | 70 | 该参数为dict型,默认为空字典,表示启动Driver时传入的参数,您可以通过定义该值修改Driver的属性。 71 | 72 | **DOWNLOAD_DELAY** 73 | 74 | 该参数为数值型,默认值为0,表示下载延迟(秒)。 75 | 76 | **PROXY_INTERVAL** 77 | 78 | 该参数为数值型,默认值为3,表示每个代理使用的最大时间。使用proxy需要搭配ProxyMiddleware, 79 | 并且此处的proxy只对静态请求有效。如果您想配置动态请求的proxy,可以设置DRIVER_INIT_KWARGS参数,在Driver启动时传入配置信息。 80 | 81 | **PROXY_LIST** 82 | 83 | 该参数为list型,默认为空数组,表示请求可以用到的proxy。格式为'IP:端口号'。 84 | 85 | **STATIC_REQUEST_SSL_VERIFY** 86 | 87 | 该参数为bool型,默认值为True,表示发起静态请求是,是否进行ssl认证。 88 | 该参数用于在使用代理的情况下,https认证失败的情况。 -------------------------------------------------------------------------------- /doc/_build/html/_sources/ssettings.rst.txt: -------------------------------------------------------------------------------- 1 | settings:项目设置 2 | ===================== 3 | 4 | 这篇文档主要介绍项目的设定(settings)参数和其默认值。 5 | 6 | 如何覆盖项目的默认设定? 
7 | -------------------------- 8 | 9 | 可以在您定义的爬虫中设置 **custom_settings** 属性,覆盖掉PyCreeper的默认设定。 10 | 11 | 示例:: 12 | 13 | custom_settings = { 14 | 'DOWNLOADER_MIDDLEWARES': { 15 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 16 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 17 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300, 18 | 'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400 19 | }, 20 | 'DRIVER': 'Chrome', 21 | 'DOWNLOAD_DELAY': 2, 22 | 'USER_AGENT_LIST': [ 23 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 24 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 25 | ] 26 | } 27 | 28 | 设定的可选参数和默认值 29 | --------------------------- 30 | 31 | **LOG_LEVEL** 32 | 33 | 该参数为byte类型,默认值为DEBUG,该参数控制PyCreeper日志的输出等级。 34 | 35 | **RETRY_COUNT** 36 | 37 | 该参数为数值型,默认值为3,表示对于失败请求的最大尝试次数(该参数只对静态请求有效)。 38 | 39 | **RETRY_STATUS_CODES** 40 | 41 | 该参数为list型,默认值为[500, 502, 503, 504, 400, 403, 408],表示返回码在列表中的请求将会被重发(该参数只对静态请求有效)。 42 | 43 | **TIMEOUT** 44 | 45 | 该参数为数值型,默认值为5,表示发出请求定义的超时时间(秒)。 46 | 47 | **MAX_REQUEST_SIZE** 48 | 49 | 该参数为int型,默认值为20,表示可以同时进行的静态请求个数(该参数只对静态请求有效)。 50 | 51 | **USER_AGENT_LIST** 52 | 53 | 该参数为list型,默认值为空列表,表示发送请求时可以携带的User-Agent(需要使用UserAgentMiddleware,该参数只对静态请求有效)。 54 | 55 | **DOWNLOADER_MIDDLEWARES** 56 | 57 | 该参数为dict型,默认值为空字典,表示使用的下载器中间件。字典的key值为希望使用的中间件的reference, 58 | value值为该中间件的优先级,优先级越高的中间件将会越先被使用。 59 | 60 | **DYNAMIC_CRAWL** 61 | 62 | 该参数为bool型,默认值为True,表示引擎是否加载WebDriver。如果在设为False的情况下发出了一系列动态请求,将会引发一系列异常。 63 | 64 | **DRIVER** 65 | 66 | 该参数为byte型,默认值为Firefox,表示PyCreeper使用的Driver类型。可以选择任意一种Selenium支持的Driver,前提是需要配置好Driver的相关环境。 67 | 68 | **DRIVER_INIT_KWARGS** 69 | 70 | 该参数为dict型,默认为空字典,表示启动Driver时传入的参数,您可以通过定义该值修改Driver的属性。 71 | 72 | **DOWNLOAD_DELAY** 73 | 74 | 该参数为数值型,默认值为0,表示下载延迟(秒)。 75 | 76 | **PROXY_INTERVAL** 77 | 78 | 该参数为数值型,默认值为3,表示每个代理使用的最大时间。使用proxy需要搭配ProxyMiddleware, 79 | 并且此处的proxy只对静态请求有效。如果您想配置动态请求的proxy,可以设置DRIVER_INIT_KWARGS参数,在Driver启动时传入配置信息。 80 | 81 | **PROXY_LIST** 82 | 83 | 该参数为list型,默认为空数组,表示请求可以用到的proxy。格式为'IP:端口号'。 84 | 85 | **STATIC_REQUEST_SSL_VERIFY** 86 | 87 | 该参数为bool型,默认值为True,表示发起静态请求是,是否进行ssl认证。 88 | 该参数用于在使用代理的情况下,https认证失败的情况。 -------------------------------------------------------------------------------- /examples/zhihu_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import json 6 | 7 | from pycreeper.spider import Spider 8 | from pycreeper.http.request import Request 9 | import gevent 10 | from lxml import etree 11 | 12 | class Zhihu_Spider(Spider): 13 | 14 | custom_settings = { 15 | 'DOWNLOADER_MIDDLEWARES': { 16 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 17 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 18 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 19 | }, 20 | 'DRIVER': 'Chrome', 21 | 'DOWNLOAD_DELAY': 2, 22 | 'STATIC_REQUEST_SSL_VERIFY': False, 23 | 'USER_AGENT_LIST': [ 24 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 25 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 26 | ] 27 | } 28 | 29 | def start_requests(self): 30 | 31 | def _login(driver): 32 | driver.find_element_by_name('account').send_keys("username") 33 | driver.find_element_by_name('password').send_keys("password") 34 | 
driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 35 | gevent.sleep(5) 36 | 37 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 38 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 39 | 40 | def after_login(self, response): 41 | html = response.body 42 | selector = etree.HTML(html) 43 | links = selector.xpath('//a[@class="question_link"]') 44 | for link in links: 45 | yield Request('https://www.zhihu.com' + link.attrib["href"], 46 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 47 | 48 | def get_item(self, response): 49 | html = response.body 50 | selector = etree.HTML(html) 51 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 52 | body = selector.xpath('//span[@class="RichText"]')[0].text 53 | yield { 54 | 'head': head, 55 | 'body': body 56 | } 57 | 58 | def process_item(self, item): 59 | print json.dumps(item, ensure_ascii=False).encode('GBK', 'ignore') 60 | 61 | if __name__ == "__main__": 62 | spider = Zhihu_Spider() 63 | spider.start() 64 | -------------------------------------------------------------------------------- /tests/http/test_http_request.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | 7 | from w3lib.url import safe_url_string 8 | 9 | from pycreeper.http.request import Request 10 | 11 | 12 | class RequestTest(unittest.TestCase): 13 | def test_init(self): 14 | self.assertRaises(Exception, Request) 15 | self.assertRaises(ValueError, Request, 'foo') 16 | request = Request('http://www.example.com/') 17 | assert request.url 18 | assert not request.body 19 | request = Request('http://www.example.com/', 20 | headers={'Content-Type': 'text/html', 21 | 'Content-Length': 1234 22 | }, 23 | method='get' 24 | ) 25 | self.assertEqual(request.method, 'GET') 26 | 27 | def test_copy(self): 28 | request1 = Request('http://www.example.com/', 29 | headers={'Content-Type': 'text/html', 30 | 'Content-Length': 1234 31 | }, 32 | method='get' 33 | ) 34 | request2 = request1.copy() 35 | assert request1.__dict__ == request2.__dict__ 36 | self.assertEqual(request1.headers, request2.headers) 37 | self.assertEqual(request1, request2) 38 | self.assertIsNot(request1, request2) 39 | 40 | def test_url(self): 41 | request = Request('http://www.example.com/') 42 | self.assertIsInstance(request.url, str) 43 | self.assertEqual(request.url, 'http://www.example.com/') 44 | request = Request(u'http://www.example.com?content=测试') 45 | self.assertEqual(request.url, 46 | safe_url_string('http://www.example.com?content=测试')) 47 | self.assertRaises(TypeError, Request, 123) 48 | 49 | def test_body(self): 50 | r1 = Request(url="http://www.example.com/") 51 | assert r1.body == b'' 52 | 53 | r2 = Request(url="http://www.example.com/", body=b"") 54 | assert isinstance(r2.body, bytes) 55 | self.assertEqual(r2.encoding, 'utf-8') # default encoding 56 | 57 | r3 = Request(url="http://www.example.com/", body=u"Price: \xa3100", encoding='utf-8') 58 | assert isinstance(r3.body, bytes) 59 | self.assertEqual(r3.body, b"Price: \xc2\xa3100") 60 | 61 | r4 = Request(url="http://www.example.com/", body=u"Price: \xa3100", encoding='latin1') 62 | assert isinstance(r4.body, bytes) 63 | self.assertEqual(r4.body, b"Price: \xa3100") 64 | -------------------------------------------------------------------------------- /pycreeper/conf/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """" Settings """ 6 | 7 | import json 8 | from importlib import import_module 9 | 10 | from pycreeper.conf import default_settings 11 | 12 | 13 | class Settings(object): 14 | """ Settings Object """ 15 | 16 | def __init__(self, values=None): 17 | self.attrs = {} 18 | self.load_config(default_settings) 19 | if values: 20 | self.load_config(values) 21 | 22 | def __getitem__(self, key): 23 | """__getitem__ 24 | 25 | @key, str, key 26 | """ 27 | return self.attrs[key] if key in self.attrs else None 28 | 29 | def load_config(self, module): 30 | """load config 31 | 32 | @module, module 33 | """ 34 | if isinstance(module, basestring): 35 | module = import_module(module) 36 | for key in module if isinstance(module, dict) else dir(module): 37 | if key.isupper(): 38 | self.set(key, module.get(key) \ 39 | if isinstance(module, dict) else getattr(module, key)) 40 | 41 | def set(self, key, value): 42 | """set 43 | 44 | @key, str, key 45 | @value, str/int/float value 46 | """ 47 | self.attrs[key] = value 48 | 49 | def set_dict(self, values): 50 | """set dict 51 | 52 | @values, dict, values 53 | """ 54 | for key, value in values.iteritems(): 55 | self.set(key, value) 56 | 57 | def get(self, key, default=None): 58 | """get 59 | 60 | @key, str, key 61 | @default, default 62 | """ 63 | return self[key] or default 64 | 65 | def get_int(self, key, default=0): 66 | """get int 67 | 68 | @key, str, key 69 | @default, int 70 | """ 71 | return int(self.get(key, default)) 72 | 73 | def get_float(self, key, default=0.0): 74 | """get float 75 | 76 | @key, str, key 77 | @default, float 78 | """ 79 | return float(self.get(key, default)) 80 | 81 | def get_list(self, key, default=None): 82 | """get list 83 | 84 | @key, str, key 85 | @default, list 86 | """ 87 | value = self.get(key, default or None) 88 | if isinstance(value, basestring): 89 | value = value.split(",") 90 | return value 91 | 92 | def get_dict(self, key, default=None): 93 | """get dict 94 | 95 | @key, str, key 96 | @default, dict 97 | """ 98 | value = self.get(key, default or None) 99 | if isinstance(value, basestring): 100 | value = json.loads(value) 101 | return value 102 | -------------------------------------------------------------------------------- /pycreeper/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import six 6 | 7 | def result2list(result): 8 | """result to list 9 | """ 10 | if result is None: 11 | return [] 12 | if isinstance(result, (dict, basestring)): 13 | return [result] 14 | if hasattr(result, "__iter__"): 15 | return result 16 | 17 | 18 | def call_func(func, errback=None, callback=None, *args, **kwargs): 19 | """执行某个函数,并自动包装异常和回调 20 | 21 | :param func: 22 | :param errback: 23 | :param callback: 24 | :param args: 25 | :param kwargs: 26 | """ 27 | try: 28 | result = func(*args, **kwargs) 29 | except Exception as exc: 30 | if errback: 31 | errback(exc) 32 | else: 33 | if callback: 34 | result = callback(result) 35 | return result 36 | 37 | 38 | def sorted_priority_dict(d): 39 | """Sort the priority dict to a ordered list. 40 | 41 | :param d: A priority dict. 42 | :return: Ordered list. 
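    Illustrative example added for clarity (the dotted paths below are hypothetical,
    not modules of this project). This is the ordering DownloaderMiddlewareManager
    relies on when it loads the DOWNLOADER_MIDDLEWARES setting: entries are returned
    smallest priority value first.

        >>> sorted_priority_dict({'pkg.MiddlewareB': 200, 'pkg.MiddlewareA': 100})
        ['pkg.MiddlewareA', 'pkg.MiddlewareB']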
43 | """ 44 | modules = sorted(d.items(), key=lambda x: x[1]) 45 | modules = [x[0] for x in modules] 46 | return modules 47 | 48 | 49 | def to_unicode(text, encoding=None, errors='strict'): 50 | """Return the unicode representation of a bytes object `text`. If `text` 51 | is already an unicode object, return it as-is.""" 52 | if isinstance(text, six.text_type): 53 | return text 54 | if not isinstance(text, (bytes, six.text_type)): 55 | raise TypeError('to_unicode must receive a bytes, str or unicode ' 56 | 'object, got %s' % type(text).__name__) 57 | if encoding is None: 58 | encoding = 'utf-8' 59 | return text.decode(encoding, errors) 60 | 61 | 62 | def to_bytes(text, encoding=None, errors='strict'): 63 | """Return the binary representation of `text`. If `text` 64 | is already a bytes object, return it as-is.""" 65 | if isinstance(text, bytes): 66 | return text 67 | if not isinstance(text, six.string_types): 68 | raise TypeError('to_bytes must receive a unicode, str or bytes ' 69 | 'object, got %s' % type(text).__name__) 70 | if encoding is None: 71 | encoding = 'utf-8' 72 | return text.encode(encoding, errors) 73 | 74 | 75 | def to_native_str(text, encoding=None, errors='strict'): 76 | """ Return str representation of `text` 77 | (bytes in Python 2.x and unicode in Python 3.x). """ 78 | if six.PY2: 79 | return to_bytes(text, encoding, errors) 80 | else: 81 | return to_unicode(text, encoding, errors) 82 | 83 | 84 | def _get_cookies_from_cookiejar(cj): 85 | result = [] 86 | for domain in cj._cookies.keys(): 87 | for path in cj._cookies[domain].keys(): 88 | for cookie in cj._cookies[domain][path].values(): 89 | result.append(cookie) 90 | return result 91 | -------------------------------------------------------------------------------- /pycreeper/conf/default_settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ default config settings """ 6 | 7 | LOG_LEVEL = 'DEBUG' 8 | 9 | RETRY_COUNT = 3 10 | 11 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408] 12 | 13 | TIMEOUT = 5 14 | 15 | MAX_REQUEST_SIZE = 20 16 | 17 | USER_AGENT_LIST = [ 18 | 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31', 19 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17', 20 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17', 21 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)', 22 | 'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)', 23 | 'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)', 24 | 'Mozilla/6.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1', 25 | 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1', 26 | 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:15.0) Gecko/20120910144328 Firefox/15.0.2', 27 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201', 28 | 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9a3pre) Gecko/20070330', 29 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.13; ) Gecko/20101203', 30 | 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14', 31 | 'Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50', 32 | 
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52', 33 | 'Mozilla/5.0 (Windows; U; Win 9x 4.90; SG; rv:1.9.2.4) Gecko/20101104 Netscape/9.1.0285', 34 | 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.1.7pre) Gecko/20070815 Firefox/2.0.0.6 Navigator/9.0b3', 35 | 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6', 36 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36", 37 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)", 38 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)", 39 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 40 | "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)", 41 | "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13", 42 | "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3", 43 | ] 44 | 45 | DOWNLOADER_MIDDLEWARES = {} 46 | 47 | DYNAMIC_CRAWL = True 48 | 49 | DRIVER = 'Firefox' 50 | 51 | DRIVER_INIT_KWARGS = {} 52 | 53 | DOWNLOAD_DELAY = 0 54 | 55 | PROXY_INTERVAL = 3 56 | 57 | PROXY_LIST = [] 58 | 59 | STATIC_REQUEST_SSL_VERIFY = True 60 | -------------------------------------------------------------------------------- /tests/test_conf_settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | 7 | from pycreeper.conf.settings import Settings 8 | from tests.test_data import test_settings_data 9 | 10 | CONF_PATH = 'tests.test_data.test_settings_data' 11 | 12 | 13 | class SettingsTest(unittest.TestCase): 14 | def test_basics(self): 15 | settings = Settings() 16 | self.assertEqual(settings['RETRY_COUNT'], 3) 17 | settings = Settings(test_settings_data) 18 | self.assertEqual(settings['TEST_INT'], 10) 19 | 20 | def test_get_item(self): 21 | settings = Settings(test_settings_data) 22 | self.assertEqual(settings['TEST_STR'], 'foo,bar,baz') 23 | self.assertEqual(settings['TEST_DICT'], {"foo": "bar"}) 24 | 25 | def test_load_config(self): 26 | settings = Settings(test_settings_data) 27 | self.assertEqual(settings['TEST_STR'], 'foo,bar,baz') 28 | settings = Settings(CONF_PATH) 29 | self.assertEqual(settings['TEST_STR'], 'foo,bar,baz') 30 | self.assertRaises(KeyError, settings['test_lowercase']) 31 | 32 | def test_set(self): 33 | settings = Settings(test_settings_data) 34 | self.assertRaises(KeyError, settings['TEST_SET']) 35 | settings.set('TEST_SET', True) 36 | self.assertEqual(settings['TEST_SET'], True) 37 | 38 | def test_set_dict(self): 39 | settings = Settings(test_settings_data) 40 | self.assertRaises(KeyError, settings['TEST_SET_1']) 41 | self.assertRaises(KeyError, settings['TEST_SET_2']) 42 | settings.set_dict( 43 | { 44 | 'TEST_SET_1': True, 45 | 'TEST_SET_2': False 46 | } 47 | ) 48 | self.assertEqual(settings['TEST_SET_1'], True) 49 | self.assertEqual(settings['TEST_SET_2'], False) 50 | 51 | def test_get(self): 52 | settings = Settings(test_settings_data) 53 | self.assertEqual(settings.get('TEST_GET'), None) 54 | self.assertEqual(settings.get('TEST_GET', 'foo'), 'foo') 55 | settings.set('TEST_GET', 'bar') 56 | self.assertEqual(settings.get('TEST_GET', 'foo'), 'bar') 57 | 58 | def test_get_int_and_float(self): 59 | settings = Settings(test_settings_data) 60 | 
self.assertIsInstance(settings.get_float('TEST_INT'), float) 61 | self.assertIsInstance(settings.get_int('TEST_FLOAT'), int) 62 | 63 | def test_get_list(self): 64 | settings = Settings(test_settings_data) 65 | self.assertIsInstance(settings.get_list('TEST_LIST'), list) 66 | self.assertIsInstance(settings.get_list('TEST_STR'), list) 67 | 68 | def test_get_dict(self): 69 | settings = Settings(test_settings_data) 70 | self.assertIsInstance(settings.get_dict('TEST_DICT'), dict) 71 | self.assertIsInstance(settings.get_dict('TEST_JSON'), dict) 72 | 73 | 74 | if __name__ == "__main__": 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /pycreeper/downloader_middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """Dowloader Midlleware""" 6 | 7 | from collections import defaultdict 8 | 9 | from importlib import import_module 10 | 11 | from pycreeper.utils import call_func, sorted_priority_dict 12 | from pycreeper.http.request import Request 13 | 14 | 15 | class DownloaderMiddleware(object): 16 | """ DownloaderMiddleware iterface """ 17 | 18 | pass 19 | 20 | 21 | class DownloaderMiddlewareManager(object): 22 | """ DownloaderMiddlewareManager """ 23 | 24 | def __init__(self, spider): 25 | self.settings = spider.settings 26 | self.logger = spider.logger 27 | self.methods = defaultdict(list) 28 | self.middlewares = self.load_middleware() 29 | for miw in self.middlewares: 30 | self._add_middleware(miw) 31 | 32 | def load_middleware(self): 33 | """load middleware 34 | """ 35 | middlewares = [] 36 | modules = sorted_priority_dict( 37 | self.settings.get('DOWNLOADER_MIDDLEWARES', {}) 38 | ) 39 | for module_name in modules: 40 | module = import_module('.'.join(module_name.split('.')[:-1])) 41 | middleware_class = getattr(module, module_name.split('.')[-1]) 42 | middlewares.append(middleware_class(self.settings, self.logger)) 43 | return middlewares 44 | 45 | def _add_middleware(self, miw): 46 | """add middleware 47 | """ 48 | if hasattr(miw, "process_request"): 49 | self.methods["process_request"].append(miw.process_request) 50 | if hasattr(miw, "process_response"): 51 | self.methods["process_response"].insert(0, miw.process_response) 52 | if hasattr(miw, "process_exception"): 53 | self.methods["process_exception"].insert(0, miw.process_exception) 54 | 55 | def download(self, download_func, request): 56 | """download 57 | """ 58 | 59 | def process_request(request): 60 | """ process request """ 61 | for method in self.methods["process_request"]: 62 | method(request) 63 | return download_func(request) 64 | 65 | def process_response(response): 66 | """ process response """ 67 | for method in self.methods["process_response"]: 68 | response = method(request, response) 69 | if isinstance(response, Request): 70 | return response 71 | return response 72 | 73 | def process_exception(exception): 74 | """ process exception """ 75 | for method in self.methods["process_exception"]: 76 | response = method(request, exception) 77 | if response: 78 | return response 79 | return exception 80 | 81 | return call_func(process_request, process_exception, 82 | process_response, request) 83 | -------------------------------------------------------------------------------- /tests/test_scheduler.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 
reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | import time 7 | from pycreeper.scheduler import RequestFilter, Scheduler 8 | from pycreeper.http.request import Request 9 | from pycreeper.spider import Spider 10 | from Queue import Empty 11 | 12 | __doctests__ = ['pycreeper.utils.scheduler'] 13 | 14 | URLS = [ 15 | 'http://www.example.com/index.html#print', 16 | 'http://www.example.com/index.html', 17 | 'http://www.xxx.com/index.html?id=77&nameid=2905210001&page=1', 18 | 'http://www.xxxxx.com/index.html?id=77&nameid=2905210001&page=1', 19 | 'http://www.xxxxx.com/index.html?test123123', 20 | 'http://www.xxxxx.com/index.html', 21 | 'ftp://www.xxxxx.com/index.html' 22 | ] 23 | 24 | REQUEST = [Request(url) for url in URLS] 25 | 26 | 27 | class RequestTest(unittest.TestCase): 28 | 29 | def test_basic(self): 30 | request_filter = RequestFilter() 31 | request_filter.request_seen(REQUEST[0]) 32 | self.assertEqual(request_filter.request_seen(REQUEST[0]), True) 33 | self.assertEqual(request_filter.request_seen(REQUEST[1]), False) 34 | self.assertEqual(request_filter.request_seen(REQUEST[1]), True) 35 | self.assertRaises(AttributeError, request_filter.request_seen, None) 36 | 37 | 38 | class SchedulerTest(unittest.TestCase): 39 | 40 | def setUp(self): 41 | self.spider = Spider() 42 | 43 | def test_basic(self): 44 | self.assertRaises(AttributeError, Scheduler, None) 45 | 46 | def test_enqueue(self): 47 | scheduler = Scheduler(self.spider) 48 | self.assertRaises(AttributeError, scheduler.enqueue_request, None) 49 | self.assertEqual(len(scheduler.queue), 0) 50 | scheduler.enqueue_request(REQUEST[0]) 51 | self.assertEqual(len(scheduler.queue), 1) 52 | scheduler.enqueue_request(REQUEST[0]) 53 | self.assertEqual(len(scheduler.queue), 1) 54 | scheduler.enqueue_request(REQUEST[1]) 55 | self.assertEqual(len(scheduler.queue), 2) 56 | scheduler.enqueue_request(REQUEST[0]) 57 | self.assertEqual(len(scheduler.queue), 2) 58 | 59 | def test_next_request(self): 60 | scheduler = Scheduler(self.spider) 61 | self.assertRaises(Empty, scheduler.next_request) 62 | scheduler.enqueue_request(REQUEST[0]) 63 | scheduler.enqueue_request(REQUEST[1]) 64 | scheduler.enqueue_request(REQUEST[2]) 65 | self.assertEqual(scheduler.next_request(), REQUEST[0]) 66 | self.assertEqual(scheduler.next_request(), REQUEST[1]) 67 | self.assertEqual(scheduler.next_request(), REQUEST[2]) 68 | self.assertRaises(Empty, scheduler.next_request) 69 | 70 | def test_download_delay(self): 71 | self.spider.settings.set('DOWNLOAD_DELAY', 5) 72 | scheduler = Scheduler(self.spider) 73 | scheduler.enqueue_request(REQUEST[0]) 74 | time1 = time.time() 75 | scheduler.next_request() 76 | self.assertGreater(time.time() - time1, 5) 77 | 78 | 79 | if __name__ == "__main__": 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /pycreeper/http/response.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Response Object """ 6 | 7 | import six 8 | from w3lib.url import safe_url_string 9 | from pycreeper.http.request import Request 10 | import copy 11 | 12 | 13 | class Response(object): 14 | """ Response """ 15 | 16 | def __init__(self, url, request, headers=None, status=200, 17 | cookiejar=None, body='', encoding='utf-8'): 18 | self._encoding = encoding 19 | self.headers = copy.deepcopy(headers) if headers else 
{} 20 | self.url = url 21 | self.status = int(status) 22 | self.cookiejar = cookiejar 23 | self.body = body 24 | self.request = request 25 | 26 | @property 27 | def encoding(self): 28 | return self._encoding 29 | 30 | @property 31 | def url(self): 32 | return self._url 33 | 34 | @url.setter 35 | def url(self, url): 36 | if isinstance(url, str): 37 | self._url = safe_url_string(url) 38 | elif isinstance(url, six.text_type): 39 | if self.encoding is None: 40 | raise TypeError('Cannot convert unicode url - %s has no encoding' % 41 | type(self).__name__) 42 | self._url = safe_url_string(url.encode(self.encoding)) 43 | else: 44 | raise TypeError('Response url must be str or unicode, got %s:' % type(url).__name__) 45 | if ':' not in self._url: 46 | raise ValueError('Missing scheme in request url: %s' % self._url) 47 | 48 | @property 49 | def body(self): 50 | return self._body 51 | 52 | @body.setter 53 | def body(self, body): 54 | if isinstance(body, str): 55 | self._body = body 56 | elif isinstance(body, six.text_type): 57 | if self.encoding is None: 58 | raise TypeError('Cannot convert unicode body - %s has no encoding' % 59 | type(self).__name__) 60 | self._body = body.encode(self.encoding) 61 | elif body is None: 62 | self._body = '' 63 | else: 64 | raise TypeError("Response body must either str or unicode. Got: '%s'" % type(body).__name__) 65 | 66 | @property 67 | def request(self): 68 | return self._request 69 | 70 | @request.setter 71 | def request(self, value): 72 | if isinstance(value, Request): 73 | self._request = value.copy() 74 | else: 75 | raise TypeError("Response request must be pycreeper.Request. Got: '%s'" % type(value).__name__) 76 | 77 | def copy(self, *args, **kwargs): 78 | """ copy """ 79 | for key in ["url", "status", "cookiejar", "body", "request", "encoding", "headers"]: 80 | kwargs.setdefault(key, getattr(self, key)) 81 | 82 | cls = kwargs.pop('cls', self.__class__) 83 | return cls(*args, **kwargs) 84 | 85 | def __str__(self): 86 | return "<%d %s>" % (self.status, self.url) 87 | 88 | __repr__ = __str__ 89 | 90 | def __eq__(self, other): 91 | return self.__dict__ == other.__dict__ 92 | 93 | def __ne__(self, other): 94 | return self.__dict__ != other.__dict__ 95 | -------------------------------------------------------------------------------- /doc/_build/html/genindex.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | Index — PyCreeper 1.0.0 documentation 11 | 12 | 13 | 14 | 15 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 41 | 42 |
77 | 86 | 90 | 91 | -------------------------------------------------------------------------------- /pycreeper/http/request.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import six 6 | from w3lib.url import safe_url_string 7 | import copy 8 | 9 | 10 | class Request(object): 11 | """ Request """ 12 | 13 | def __init__(self, url, callback=None, method='GET', headers=None, 14 | body=None, meta=None, encoding='utf-8', cookiejar=None, 15 | dynamic=False, browser_actions=None, wait=0): 16 | self._encoding = encoding 17 | self.headers = copy.deepcopy(headers) if headers else {} 18 | self.cookiejar = cookiejar 19 | self.url = url 20 | self.body = body 21 | self.method = str(method).upper() 22 | self.callback = callback 23 | self.meta = dict(meta) if meta else {} 24 | self.dynamic = bool(dynamic) 25 | if self.dynamic: 26 | if self.method == 'POST': 27 | raise AttributeError('Pycreeper can\'t make a dynamic POST request.') 28 | self.browser_actions = browser_actions if browser_actions else [] 29 | self.wait = int(wait) 30 | else: 31 | self.browser_actions = [] 32 | self.wait = 0 33 | 34 | @property 35 | def encoding(self): 36 | return self._encoding 37 | 38 | @property 39 | def url(self): 40 | return self._url 41 | 42 | @url.setter 43 | def url(self, url): 44 | if isinstance(url, str): 45 | self._url = safe_url_string(url) 46 | elif isinstance(url, six.text_type): 47 | if self._encoding is None: 48 | raise TypeError('Cannot convert unicode url - %s has no encoding' % 49 | type(self).__name__) 50 | self._url = safe_url_string(url.encode(self._encoding)) 51 | else: 52 | raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__) 53 | if ':' not in self._url: 54 | raise ValueError('Missing scheme in request url: %s' % self._url) 55 | 56 | @property 57 | def body(self): 58 | return self._body 59 | 60 | @body.setter 61 | def body(self, body): 62 | if isinstance(body, str): 63 | self._body = body 64 | elif isinstance(body, six.text_type): 65 | if self._encoding is None: 66 | raise TypeError('Cannot convert unicode body - %s has no encoding' % 67 | type(self).__name__) 68 | self._body = body.encode(self._encoding) 69 | elif body is None: 70 | self._body = '' 71 | elif isinstance(body, dict): 72 | self._body = body 73 | else: 74 | raise TypeError("Request body must either str, unicode or dict. 
Got: '%s'" % type(body).__name__) 75 | 76 | def copy(self, *args, **kwargs): 77 | """ copy """ 78 | for key in ["encoding", "url", "method", "callback", 79 | "cookiejar", "body", "meta", "headers"]: 80 | kwargs.setdefault(key, getattr(self, key)) 81 | cls = kwargs.pop('cls', self.__class__) 82 | return cls(*args, **kwargs) 83 | 84 | def __str__(self): 85 | return "<%s %s>" % (self.method, self.url) 86 | 87 | __repr__ = __str__ 88 | 89 | def __eq__(self, other): 90 | return self.__dict__ == other.__dict__ 91 | 92 | def __ne__(self, other): 93 | return self.__dict__ != other.__dict__ 94 | -------------------------------------------------------------------------------- /examples/jd_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import json 6 | import HTMLParser 7 | from pycreeper.spider import Spider 8 | from pycreeper.http.request import Request 9 | from selenium.webdriver.common.keys import Keys 10 | import gevent 11 | from lxml import etree 12 | from selenium.common.exceptions import NoSuchElementException 13 | 14 | parser = HTMLParser.HTMLParser() 15 | 16 | class Jd_Spider(Spider): 17 | 18 | custom_settings = { 19 | 'DOWNLOADER_MIDDLEWARES': { 20 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 21 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 22 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300, 23 | 'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400 24 | }, 25 | 'DRIVER': 'Chrome', 26 | 'DOWNLOAD_DELAY': 2, 27 | 'USER_AGENT_LIST': [ 28 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 29 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 30 | ] 31 | } 32 | 33 | def start_requests(self): 34 | def _search(driver): 35 | driver.find_element_by_id('key').send_keys(u"联想笔记本", Keys.ENTER) 36 | gevent.sleep(3) 37 | self._jump_guide(driver) 38 | gevent.sleep(3) 39 | 40 | yield Request(url='https://www.jd.com/', 41 | meta={"cookiejar": "jd"}, 42 | callback=self.parse_list, 43 | dynamic=True, 44 | browser_actions=[_search] 45 | ) 46 | 47 | def _jump_guide(self, driver): 48 | try: 49 | driver.find_element_by_xpath('//*[@id="guide-price"]/div[2]/a').click() 50 | except NoSuchElementException as e: 51 | pass 52 | 53 | def parse_list(self, response): 54 | html = response.body 55 | selector = etree.HTML(html) 56 | links = selector.xpath('//div[@class="p-img"]/a') 57 | titles = selector.xpath('//div[@class="p-name p-name-type-2"]/a/em') 58 | imgs = selector.xpath('//div[@class="p-img"]/a/img') 59 | prices = selector.xpath('//div[@class="p-price"]/strong/i') 60 | for i in range(len(links)): 61 | try: 62 | yield { 63 | 'path': links[i].attrib["href"] if 'http' in links[i].attrib["href"] 64 | else 'http:' + links[i].attrib["href"], 65 | 'title': parser.unescape(etree.tostring(titles[i], pretty_print=True)), 66 | 'img': imgs[i].attrib["src"] if 'http' in imgs[i].attrib["src"] 67 | else 'http:' + imgs[i].attrib["src"], 68 | 'price': prices[i].text, 69 | } 70 | except Exception as e: 71 | pass 72 | 73 | url = response.url 74 | 75 | def _next_page(driver): 76 | self._jump_guide(driver) 77 | driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[9]').click() 78 | self._jump_guide(driver) 79 | 80 | yield Request(url=url, 81 | meta={"cookiejar": "jd"}, 82 | callback=self.parse_list, 83 | dynamic=True, 84 | 
browser_actions=[_next_page] 85 | ) 86 | 87 | def process_item(self, item): 88 | print json.dumps(item, ensure_ascii=False).encode('GBK', 'ignore') 89 | 90 | if __name__ == "__main__": 91 | spider = Jd_Spider() 92 | spider.start() 93 | -------------------------------------------------------------------------------- /doc/_build/html/search.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Search — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 48 | 49 |
87 | 96 | 100 | 101 | -------------------------------------------------------------------------------- /doc/_build/html/last.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 写在最后 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 44 | 45 |
84 | 96 | 100 | 101 | -------------------------------------------------------------------------------- /tests/utils/test_utils_datatypes.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import copy 6 | import unittest 7 | 8 | from pycreeper.utils.datatypes import CaselessDict 9 | 10 | __doctests__ = ['pycreeper.utils.datatypes'] 11 | 12 | 13 | class CaselessDictTest(unittest.TestCase): 14 | def test_init(self): 15 | seq = {'red': 1, 'black': 3} 16 | d = CaselessDict(seq) 17 | self.assertEqual(d['red'], 1) 18 | self.assertEqual(d['black'], 3) 19 | 20 | seq = (('red', 1), ('black', 3)) 21 | d = CaselessDict(seq) 22 | self.assertEqual(d['red'], 1) 23 | self.assertEqual(d['black'], 3) 24 | 25 | def test_caseless(self): 26 | d = CaselessDict() 27 | d['key_Lower'] = 1 28 | self.assertEqual(d['KEy_loWer'], 1) 29 | self.assertEqual(d.get('KEy_loWer'), 1) 30 | 31 | d['KEY_LOWER'] = 3 32 | self.assertEqual(d['key_Lower'], 3) 33 | self.assertEqual(d.get('key_Lower'), 3) 34 | 35 | def test_delete(self): 36 | d = CaselessDict({'key_lower': 1}) 37 | del d['key_LOWER'] 38 | self.assertRaises(KeyError, d.__getitem__, 'key_LOWER') 39 | self.assertRaises(KeyError, d.__getitem__, 'key_lower') 40 | 41 | def test_getdefault(self): 42 | d = CaselessDict() 43 | self.assertEqual(d.get('c', 5), 5) 44 | d['c'] = 10 45 | self.assertEqual(d.get('c', 5), 10) 46 | 47 | def test_setdefault(self): 48 | d = CaselessDict({'a': 1, 'b': 2}) 49 | 50 | r = d.setdefault('A', 5) 51 | self.assertEqual(r, 1) 52 | self.assertEqual(d['A'], 1) 53 | 54 | r = d.setdefault('c', 5) 55 | self.assertEqual(r, 5) 56 | self.assertEqual(d['C'], 5) 57 | 58 | def test_fromkeys(self): 59 | keys = ('a', 'b') 60 | 61 | d = CaselessDict.fromkeys(keys) 62 | self.assertEqual(d['A'], None) 63 | self.assertEqual(d['B'], None) 64 | 65 | d = CaselessDict.fromkeys(keys, 1) 66 | self.assertEqual(d['A'], 1) 67 | self.assertEqual(d['B'], 1) 68 | 69 | instance = CaselessDict() 70 | d = instance.fromkeys(keys) 71 | self.assertEqual(d['A'], None) 72 | self.assertEqual(d['B'], None) 73 | 74 | d = instance.fromkeys(keys, 1) 75 | self.assertEqual(d['A'], 1) 76 | self.assertEqual(d['B'], 1) 77 | 78 | def test_contains(self): 79 | d = CaselessDict() 80 | d['a'] = 1 81 | assert 'a' in d 82 | 83 | def test_pop(self): 84 | d = CaselessDict() 85 | d['a'] = 1 86 | self.assertEqual(d.pop('A'), 1) 87 | self.assertRaises(KeyError, d.pop, 'A') 88 | 89 | def test_normkey(self): 90 | class MyDict(CaselessDict): 91 | def normkey(self, key): 92 | return key.title() 93 | 94 | d = MyDict() 95 | d['key-one'] = 2 96 | self.assertEqual(list(d.keys()), ['Key-One']) 97 | 98 | def test_normvalue(self): 99 | class MyDict(CaselessDict): 100 | def normvalue(self, value): 101 | if value is not None: 102 | return value + 1 103 | 104 | d = MyDict({'key': 1}) 105 | self.assertEqual(d['key'], 2) 106 | self.assertEqual(d.get('key'), 2) 107 | 108 | d = MyDict() 109 | d['key'] = 1 110 | self.assertEqual(d['key'], 2) 111 | self.assertEqual(d.get('key'), 2) 112 | 113 | d = MyDict() 114 | d.setdefault('key', 1) 115 | self.assertEqual(d['key'], 2) 116 | self.assertEqual(d.get('key'), 2) 117 | 118 | d = MyDict() 119 | d.update({'key': 1}) 120 | self.assertEqual(d['key'], 2) 121 | self.assertEqual(d.get('key'), 2) 122 | 123 | d = MyDict.fromkeys(('key',), 1) 124 | self.assertEqual(d['key'], 2) 125 | self.assertEqual(d.get('key'), 2) 126 | 127 | def 
test_copy(self): 128 | h1 = CaselessDict({'header1': 'value'}) 129 | h2 = copy.copy(h1) 130 | self.assertEqual(h1, h2) 131 | self.assertEqual(h1.get('header1'), h2.get('header1')) 132 | assert isinstance(h2, CaselessDict) 133 | 134 | 135 | if __name__ == "__main__": 136 | unittest.main() 137 | -------------------------------------------------------------------------------- /doc/_build/html/_static/pygments.css: -------------------------------------------------------------------------------- 1 | .highlight .hll { background-color: #ffffcc } 2 | .highlight { background: #eeffcc; } 3 | .highlight .c { color: #408090; font-style: italic } /* Comment */ 4 | .highlight .err { border: 1px solid #FF0000 } /* Error */ 5 | .highlight .k { color: #007020; font-weight: bold } /* Keyword */ 6 | .highlight .o { color: #666666 } /* Operator */ 7 | .highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */ 8 | .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */ 9 | .highlight .cp { color: #007020 } /* Comment.Preproc */ 10 | .highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */ 11 | .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */ 12 | .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */ 13 | .highlight .gd { color: #A00000 } /* Generic.Deleted */ 14 | .highlight .ge { font-style: italic } /* Generic.Emph */ 15 | .highlight .gr { color: #FF0000 } /* Generic.Error */ 16 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ 17 | .highlight .gi { color: #00A000 } /* Generic.Inserted */ 18 | .highlight .go { color: #333333 } /* Generic.Output */ 19 | .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ 20 | .highlight .gs { font-weight: bold } /* Generic.Strong */ 21 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ 22 | .highlight .gt { color: #0044DD } /* Generic.Traceback */ 23 | .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ 24 | .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ 25 | .highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ 26 | .highlight .kp { color: #007020 } /* Keyword.Pseudo */ 27 | .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */ 28 | .highlight .kt { color: #902000 } /* Keyword.Type */ 29 | .highlight .m { color: #208050 } /* Literal.Number */ 30 | .highlight .s { color: #4070a0 } /* Literal.String */ 31 | .highlight .na { color: #4070a0 } /* Name.Attribute */ 32 | .highlight .nb { color: #007020 } /* Name.Builtin */ 33 | .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ 34 | .highlight .no { color: #60add5 } /* Name.Constant */ 35 | .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ 36 | .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ 37 | .highlight .ne { color: #007020 } /* Name.Exception */ 38 | .highlight .nf { color: #06287e } /* Name.Function */ 39 | .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ 40 | .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ 41 | .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ 42 | .highlight .nv { color: #bb60d5 } /* Name.Variable */ 43 | .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ 44 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */ 45 | .highlight .mb { color: #208050 } /* 
Literal.Number.Bin */ 46 | .highlight .mf { color: #208050 } /* Literal.Number.Float */ 47 | .highlight .mh { color: #208050 } /* Literal.Number.Hex */ 48 | .highlight .mi { color: #208050 } /* Literal.Number.Integer */ 49 | .highlight .mo { color: #208050 } /* Literal.Number.Oct */ 50 | .highlight .sa { color: #4070a0 } /* Literal.String.Affix */ 51 | .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ 52 | .highlight .sc { color: #4070a0 } /* Literal.String.Char */ 53 | .highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */ 54 | .highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ 55 | .highlight .s2 { color: #4070a0 } /* Literal.String.Double */ 56 | .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ 57 | .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ 58 | .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */ 59 | .highlight .sx { color: #c65d09 } /* Literal.String.Other */ 60 | .highlight .sr { color: #235388 } /* Literal.String.Regex */ 61 | .highlight .s1 { color: #4070a0 } /* Literal.String.Single */ 62 | .highlight .ss { color: #517918 } /* Literal.String.Symbol */ 63 | .highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ 64 | .highlight .fm { color: #06287e } /* Name.Function.Magic */ 65 | .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ 66 | .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ 67 | .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ 68 | .highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */ 69 | .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */ -------------------------------------------------------------------------------- /doc/_build/html/spider.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | spider:爬虫 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
91 | 106 | 110 | 111 | -------------------------------------------------------------------------------- /doc/_build/html/downloader_middlewares.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | downloader_middlewares:下载器中间件 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
91 | 106 | 110 | 111 | -------------------------------------------------------------------------------- /doc/_build/html/downloader.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | downloader:下载器 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
91 | 106 | 110 | 111 | -------------------------------------------------------------------------------- /pycreeper/downloader_middlewares/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import random 6 | import time 7 | from urlparse import urlparse 8 | from logging import Logger 9 | import chardet 10 | import gevent 11 | from pycreeper.downloader_middlewares import DownloaderMiddleware 12 | from pycreeper.utils.exceptions import TimeoutException 13 | from collections import deque 14 | 15 | 16 | class RetryMiddleware(DownloaderMiddleware): 17 | """ Retry Middleware """ 18 | 19 | RETRY_EXCEPTIONS = TimeoutException 20 | 21 | def __init__(self, settings, logger): 22 | self.max_retry_count = settings.get_int("RETRY_COUNT") 23 | self.retry_status_codes = settings.get_list("RETRY_STATUS_CODES") 24 | if not isinstance(logger, Logger): 25 | raise AttributeError('logger must be instance of logging.Logger') 26 | self.logger = logger 27 | 28 | def process_response(self, request, response): 29 | """process response 30 | """ 31 | if request.meta.get("dont_retry", False): 32 | return response 33 | if response.status in self.retry_status_codes: 34 | return self._retry(request) or response 35 | return response 36 | 37 | def process_exception(self, request, exception): 38 | """process exception 39 | """ 40 | if isinstance(exception, self.RETRY_EXCEPTIONS) \ 41 | and request.meta.get("dont_retry", False): 42 | return self._retry(request) 43 | 44 | def _retry(self, request): 45 | """retry 46 | """ 47 | retry_count = request.meta.get("retry_count", 0) + 1 48 | if retry_count <= self.max_retry_count: 49 | retry_request = request.copy() 50 | retry_request.meta["retry_count"] = retry_count 51 | return retry_request 52 | 53 | 54 | class UserAgentMiddleware(DownloaderMiddleware): 55 | """ UserAgent Middleware """ 56 | 57 | def __init__(self, settings, logger): 58 | self.user_agent_list = settings.get_list("USER_AGENT_LIST") 59 | if not isinstance(logger, Logger): 60 | raise AttributeError('logger must be instance of logging.Logger') 61 | self.logger = logger 62 | 63 | def process_request(self, request): 64 | """process request 65 | 66 | static requests only. 67 | """ 68 | if not request.dynamic: 69 | request.headers["User-Agent"] = random.choice(self.user_agent_list) 70 | 71 | 72 | class ProxyMiddleware(DownloaderMiddleware): 73 | """ Proxy Middleware """ 74 | 75 | def __init__(self, settings, logger): 76 | self.host_time_queue = deque() 77 | self.proxy_interval = settings["PROXY_INTERVAL"] 78 | self.proxy_list = settings["PROXY_LIST"] 79 | for proxy in self.proxy_list: 80 | self.host_time_queue.append((proxy, 0)) 81 | if not isinstance(logger, Logger): 82 | raise AttributeError('logger must be instance of logging.Logger') 83 | self.logger = logger 84 | 85 | def process_request(self, request): 86 | """process request 87 | 88 | static requests only. 
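    For illustration (the address below is hypothetical, not part of the project):
    with PROXY_LIST = ['1.2.3.4:8080'], this middleware sets

        request.meta["proxy"] = {"http": "http://1.2.3.4:8080"}

    on each static request, and _get_proxy waits until PROXY_INTERVAL seconds have
    passed since that proxy was last used before handing it out again.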
89 | """ 90 | if not request.dynamic: 91 | request.meta["proxy"] = { 92 | "http": self._get_proxy(), 93 | } 94 | 95 | def _get_proxy(self): 96 | """get proxy 97 | """ 98 | proxy, latest = self.host_time_queue.popleft() 99 | interval = time.time() - latest 100 | if interval < self.proxy_interval: 101 | self.logger.info("Proxy %s waitting ...", proxy) 102 | gevent.sleep(self.proxy_interval - interval) 103 | self.host_time_queue.append((proxy, time.time())) 104 | return "http://%s" % proxy 105 | 106 | 107 | class EncodingDiscriminateMiddleware(DownloaderMiddleware): 108 | """ Encoding Discriminate Middleware """ 109 | 110 | ENCODING_MAP = {} 111 | 112 | def __init__(self, settings, logger): 113 | self.settings = settings 114 | if not isinstance(logger, Logger): 115 | raise AttributeError('logger must be instance of logging.Logger') 116 | self.logger = logger 117 | 118 | def process_response(self, request, response): 119 | """process respoonse 120 | :param request: 121 | :param response: 122 | """ 123 | netloc = urlparse(request.url).netloc 124 | content = response.body 125 | if self.ENCODING_MAP.get(netloc) is None: 126 | encoding = chardet.detect(content)["encoding"] 127 | encoding = "GB18030" \ 128 | if encoding.upper() in ("GBK", "GB2312") else encoding 129 | self.ENCODING_MAP[netloc] = encoding 130 | body = content.decode(self.ENCODING_MAP[netloc], "replace") 131 | return response.copy(body=body) 132 | -------------------------------------------------------------------------------- /tests/http/test_http_response.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | 7 | from w3lib.url import safe_url_string 8 | 9 | from pycreeper.http.request import Request 10 | from pycreeper.http.response import Response 11 | 12 | 13 | class ResponseTest(unittest.TestCase): 14 | def test_init(self): 15 | self.assertRaises(Exception, Response) 16 | self.assertRaises(Exception, Response, url='http://www.example.com/') 17 | self.assertRaises(Exception, Response, request=Request('http://www.example.com/')) 18 | self.assertRaises(ValueError, 19 | Response, 20 | url='foo', 21 | request=Request('http://www.example.com/') 22 | ) 23 | self.assertRaises(ValueError, 24 | Response, 25 | 'http://www.example.com/', 26 | status='foo', 27 | request=Request('http://www.example.com/') 28 | ) 29 | self.assertRaises(TypeError, 30 | Response, 31 | 'http://www.example.com/', 32 | request='foo' 33 | ) 34 | response = Response('http://www.example.com/', 35 | Request('http://www.example.com/') 36 | ) 37 | assert response.url 38 | assert not response.body 39 | response = Response('http://www.example.com/', 40 | Request('http://www.example.com/'), 41 | headers={'Content-Type': 'text/html', 42 | 'Content-Length': 1234 43 | } 44 | ) 45 | 46 | def test_copy(self): 47 | response1 = Response('http://www.example.com/', 48 | headers={'Content-Type': 'text/html', 49 | 'Content-Length': 1234 50 | }, 51 | request=Request('http://www.example.com/') 52 | ) 53 | response2 = response1.copy() 54 | assert response1.__dict__ == response2.__dict__ 55 | self.assertEqual(response1.headers, response2.headers) 56 | self.assertEqual(response1.request, response2.request) 57 | self.assertEqual(response1, response2) 58 | 59 | self.assertIsNot(response1.headers, response2.headers) 60 | self.assertIsNot(response1.request, response2.request) 61 | self.assertIsNot(response1, response2) 62 | 63 | 
def test_url(self): 64 | response = Response('http://www.example.com/', 65 | request=Request('http://www.example.com/') 66 | ) 67 | self.assertIsInstance(response.url, str) 68 | self.assertEqual(response.url, 'http://www.example.com/') 69 | response = Response(u'http://www.example.com?content=测试', 70 | request=Request('http://www.example.com/') 71 | ) 72 | self.assertEqual(response.url, 73 | safe_url_string('http://www.example.com?content=测试')) 74 | self.assertRaises(TypeError, Response, 123) 75 | 76 | def test_body(self): 77 | r1 = Response(url="http://www.example.com/", 78 | request=Request('http://www.example.com/') 79 | ) 80 | assert r1.body == b'' 81 | 82 | r2 = Response(url="http://www.example.com/", 83 | body=b"", 84 | request=Request('http://www.example.com/')) 85 | assert isinstance(r2.body, bytes) 86 | self.assertEqual(r2.encoding, 'utf-8') # default encoding 87 | 88 | r3 = Response(url="http://www.example.com/", 89 | body=u"Price: \xa3100", 90 | encoding='utf-8', 91 | request=Request('http://www.example.com/')) 92 | assert isinstance(r3.body, bytes) 93 | self.assertEqual(r3.body, b"Price: \xc2\xa3100") 94 | 95 | r4 = Response(url="http://www.example.com/", 96 | request=Request('http://www.example.com/'), 97 | body=u"Price: \xa3100", 98 | encoding='latin1' 99 | ) 100 | assert isinstance(r4.body, bytes) 101 | self.assertEqual(r4.body, b"Price: \xa3100") 102 | 103 | def test_request(self): 104 | response = Response('http://www.example.com/', 105 | request=Request('http://www.example.com/') 106 | ) 107 | self.assertIsInstance(response.request, Request) 108 | self.assertEqual(response.request, Request('http://www.example.com/')) 109 | -------------------------------------------------------------------------------- /pycreeper/engine.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Engine """ 6 | 7 | from gevent import monkey 8 | monkey.patch_all() 9 | 10 | import logging 11 | from gevent.lock import BoundedSemaphore 12 | from gevent.pool import Pool 13 | from importlib import import_module 14 | from pycreeper.scheduler import Scheduler 15 | from pycreeper.downloader import Downloader 16 | from pycreeper.utils.gevent_wrapper import spawn, join_all 17 | from pycreeper.utils import result2list 18 | from pycreeper.http.request import Request 19 | from Queue import Empty 20 | 21 | DRIVER_MODULE = 'selenium.webdriver' 22 | 23 | class Engine(object): 24 | """ Engine """ 25 | 26 | def __init__(self, spider): 27 | self.spider = spider 28 | self.logger = spider.logger 29 | self.scheduler = Scheduler(spider) 30 | self.settings = spider.settings 31 | max_request_size = self.settings["MAX_REQUEST_SIZE"] 32 | self.dynamic = self.settings["DYNAMIC_CRAWL"] 33 | if self.dynamic: 34 | module_path = DRIVER_MODULE 35 | module = import_module(module_path) 36 | init_kwargs = self.settings['DRIVER_INIT_KWARGS'] 37 | self.driver = getattr(module, 38 | self.settings.get('DRIVER').title())(**init_kwargs) 39 | else: 40 | self.driver = None 41 | self.driver_sem = BoundedSemaphore(1) 42 | self.downloader = Downloader(spider, self.driver, self.driver_sem) 43 | self.pool = Pool(size=max_request_size) 44 | 45 | def start(self): 46 | """start 47 | """ 48 | start_requests = iter(self.spider.start_requests()) 49 | self.execute(self.spider, start_requests) 50 | 51 | def execute(self, spider, start_requests): 52 | """execute 53 | """ 54 | self.start_requests = 
start_requests 55 | all_routines = [] 56 | all_routines.append(spawn(self._init_start_requests)) 57 | all_routines.append(spawn(self._next_request, spider)) 58 | join_all(all_routines) 59 | 60 | def _init_start_requests(self): 61 | """init start requests 62 | """ 63 | for req in self.start_requests: 64 | self.crawl(req) 65 | 66 | def _next_request(self, spider): 67 | """next request 68 | """ 69 | while True: 70 | try: 71 | request = self.scheduler.next_request() 72 | self.pool.spawn( 73 | self._process_request, request, spider) 74 | except Empty: 75 | self.logger.info('All requests are finished, program exit...') 76 | if self.driver: 77 | self.driver.close() 78 | return 79 | 80 | def _process_request(self, request, spider): 81 | """process request 82 | """ 83 | try: 84 | response = self.download(request, spider) 85 | except Exception as exc: 86 | logging.error("download error: %s", str(exc), exc_info=True) 87 | else: 88 | self._handle_downloader_output(response, request, spider) 89 | return response 90 | 91 | def download(self, request, spider): 92 | """ download 93 | 94 | Download a request, use self.downloader.fetch 95 | 96 | """ 97 | response = self.downloader.fetch(request, spider) 98 | #response.request = request 99 | return response 100 | 101 | def _handle_downloader_output(self, response, request, spider): 102 | """handle downloader output 103 | 104 | 105 | """ 106 | if isinstance(response, Request): 107 | self.crawl(response) 108 | return 109 | 110 | self.process_response(response, request, spider) 111 | 112 | def process_response(self, response, request, spider): 113 | """process response 114 | 115 | Use request.callback or spider.parse to process response 116 | 117 | """ 118 | callback = request.callback or spider.parse 119 | result = callback(response) 120 | ret = result2list(result) 121 | self.handle_spider_output(ret, spider) 122 | 123 | def handle_spider_output(self, result, spider): 124 | """handle spider output 125 | 126 | If a spider return a request, crawling it. 127 | Else if it's a dict, use self.process_item. 128 | 129 | """ 130 | for item in result: 131 | if item is None: 132 | continue 133 | elif isinstance(item, Request): 134 | self.crawl(item) 135 | elif isinstance(item, dict): 136 | self.process_item(item, spider) 137 | else: 138 | logging.error("Spider must return Request, dict or None") 139 | 140 | def process_item(self, item, spider): 141 | """handle item 142 | 143 | Use spider.process_item function. 144 | 145 | """ 146 | spider.process_item(item) 147 | 148 | def crawl(self, request): 149 | """crawl request 150 | 151 | Add request to scheduler's queue. 152 | 153 | """ 154 | self.scheduler.enqueue_request(request) 155 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # PyCreeper documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Mar 18 20:46:54 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. 
If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | # import os 20 | # import sys 21 | # sys.path.insert(0, os.path.abspath('.')) 22 | 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = ['sphinx.ext.autodoc', 34 | 'sphinx.ext.viewcode', 35 | 'sphinx.ext.githubpages'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = u'PyCreeper' 51 | copyright = u'2017, Jim Zheng' 52 | author = u'Jim Zheng' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = u'1.0.0' 60 | # The full version, including alpha/beta/rc tags. 61 | release = u'1.0.0' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | # If true, `todo` and `todoList` produce output, else they produce nothing. 79 | todo_include_todos = False 80 | 81 | 82 | # -- Options for HTML output ---------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 86 | # 87 | html_theme = 'nature' 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | 101 | # -- Options for HTMLHelp output ------------------------------------------ 102 | 103 | # Output file base name for HTML help builder. 104 | htmlhelp_basename = 'PyCreeperdoc' 105 | 106 | 107 | # -- Options for LaTeX output --------------------------------------------- 108 | 109 | latex_elements = { 110 | # The paper size ('letterpaper' or 'a4paper'). 111 | # 112 | # 'papersize': 'letterpaper', 113 | 114 | # The font size ('10pt', '11pt' or '12pt'). 
115 | # 116 | # 'pointsize': '10pt', 117 | 118 | # Additional stuff for the LaTeX preamble. 119 | # 120 | # 'preamble': '', 121 | 122 | # Latex figure (float) alignment 123 | # 124 | # 'figure_align': 'htbp', 125 | } 126 | 127 | # Grouping the document tree into LaTeX files. List of tuples 128 | # (source start file, target name, title, 129 | # author, documentclass [howto, manual, or own class]). 130 | latex_documents = [ 131 | (master_doc, 'PyCreeper.tex', u'PyCreeper Documentation', 132 | u'zcy', 'manual'), 133 | ] 134 | 135 | 136 | # -- Options for manual page output --------------------------------------- 137 | 138 | # One entry per manual page. List of tuples 139 | # (source start file, name, description, authors, manual section). 140 | man_pages = [ 141 | (master_doc, 'pycreeper', u'PyCreeper Documentation', 142 | [author], 1) 143 | ] 144 | 145 | 146 | # -- Options for Texinfo output ------------------------------------------- 147 | 148 | # Grouping the document tree into Texinfo files. List of tuples 149 | # (source start file, target name, title, author, 150 | # dir menu entry, description, category) 151 | texinfo_documents = [ 152 | (master_doc, 'PyCreeper', u'PyCreeper Documentation', 153 | author, 'PyCreeper', 'One line description of project.', 154 | 'Miscellaneous'), 155 | ] 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /tests/test_downloader_middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import time 6 | import unittest 7 | import json 8 | from pycreeper.downloader_middlewares import DownloaderMiddlewareManager 9 | from pycreeper.downloader_middlewares.middlewares import UserAgentMiddleware, RetryMiddleware, ProxyMiddleware 10 | from pycreeper.spider import Spider 11 | from pycreeper.http.request import Request 12 | from pycreeper.http.response import Response 13 | from pycreeper.downloader import DownloadHandler 14 | from gevent.lock import BoundedSemaphore 15 | 16 | 17 | class RetryMiddlewareTest(unittest.TestCase): 18 | def setUp(self): 19 | self.spider = Spider() 20 | 21 | def test_basic(self): 22 | self.assertRaises(AttributeError, RetryMiddleware, 23 | self.spider.settings, None) 24 | 25 | def test_process_response(self): 26 | request = Request('http://httpbin.org/') 27 | response = Response('http://httpbin.org/', request, status=500) 28 | rm = RetryMiddleware(self.spider.settings, self.spider.logger) 29 | request.meta["dont_retry"] = True 30 | self.assertEqual(rm.process_response(request, response), response) 31 | 32 | request.meta["dont_retry"] = False 33 | request = rm.process_response(request, response) 34 | self.assertIsInstance(request, Request) 35 | self.assertEqual(request.meta.get("retry_count"), 1) 36 | request = rm.process_response(request, response) 37 | self.assertIsInstance(request, Request) 38 | request = rm.process_response(request, response) 39 | self.assertIsInstance(request, Request) 40 | self.assertIsInstance(rm.process_response(request, response), Response) 41 | 42 | 43 | class UserAgentMiddlewareTest(unittest.TestCase): 44 | def setUp(self): 45 | self.spider = Spider() 46 | 47 | def test_basic(self): 48 | self.assertRaises(AttributeError, ProxyMiddleware, 49 | self.spider.settings, None) 50 | 51 | def test_process_request(self): 52 | self.spider.settings.set("PROXY_LIST", ['124.88.67.54:80']) 53 | request = 
Request('http://httpbin.org/get') 54 | pm = ProxyMiddleware(self.spider.settings, self.spider.logger) 55 | dh = DownloadHandler(self.spider, None, BoundedSemaphore(1)) 56 | pm.process_request(request) 57 | response = dh.fetch(request) 58 | assert response.body 59 | 60 | def test_process_request_interval(self): 61 | self.spider.settings.set("PROXY_LIST", ['218.76.106.78:3128']) 62 | request = Request('http://httpbin.org/get') 63 | pm = ProxyMiddleware(self.spider.settings, self.spider.logger) 64 | dh = DownloadHandler(self.spider, None, BoundedSemaphore(1)) 65 | pm.process_request(request) 66 | time1 = time.time() 67 | dh.fetch(request) 68 | 69 | request = Request('http://httpbin.org/get') 70 | pm.process_request(request) 71 | self.assertGreater(time.time() - time1, 3) 72 | 73 | 74 | class ProxyMiddlewareTest(unittest.TestCase): 75 | def setUp(self): 76 | self.spider = Spider() 77 | 78 | def test_basic(self): 79 | self.assertRaises(AttributeError, UserAgentMiddleware, 80 | self.spider.settings, None) 81 | 82 | def test_process_request(self): 83 | request = Request('http://httpbin.org/user-agent') 84 | self.assertIs(request.headers.get("User-Agent"), None) 85 | uam = UserAgentMiddleware(self.spider.settings, self.spider.logger) 86 | dh = DownloadHandler(self.spider, None, BoundedSemaphore(1)) 87 | uam.process_request(request) 88 | response = dh.fetch(request) 89 | self.assertEqual(json.loads(response.body)['user-agent'], request.headers['User-Agent']) 90 | 91 | 92 | class DownloaderMiddlewareManagerTest(unittest.TestCase): 93 | def setUp(self): 94 | self.spider = Spider() 95 | self.spider.settings.set('DOWNLOADER_MIDDLEWARES', 96 | { 97 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 98 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 99 | }) 100 | 101 | def test_methods(self): 102 | dmm = DownloaderMiddlewareManager(self.spider) 103 | rm = RetryMiddleware(self.spider.settings, self.spider.logger) 104 | uam = UserAgentMiddleware(self.spider.settings, self.spider.logger) 105 | process_request = [uam.process_request] 106 | process_response = [rm.process_response] 107 | process_exception = [rm.process_exception] 108 | self.assertEqual(len(dmm.methods['process_request']), len(process_request)) 109 | for i in range(len(process_request)): 110 | self.assertEqual(dmm.methods['process_request'][i].__name__, process_request[i].__name__) 111 | 112 | self.assertEqual(len(dmm.methods['process_response']), len(process_response)) 113 | for i in range(len(process_response)): 114 | self.assertEqual(dmm.methods['process_response'][i].__name__, process_response[i].__name__) 115 | 116 | self.assertEqual(len(dmm.methods['process_exception']), len(process_exception)) 117 | for i in range(len(process_exception)): 118 | self.assertEqual(dmm.methods['process_exception'][i].__name__, process_exception[i].__name__) 119 | 120 | 121 | if __name__ == "__main__": 122 | unittest.main() 123 | -------------------------------------------------------------------------------- /doc/_build/html/_static/nature.css: -------------------------------------------------------------------------------- 1 | /* 2 | * nature.css_t 3 | * ~~~~~~~~~~~~ 4 | * 5 | * Sphinx stylesheet -- nature theme. 6 | * 7 | * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS. 8 | * :license: BSD, see LICENSE for details. 
9 | * 10 | */ 11 | 12 | @import url("basic.css"); 13 | 14 | /* -- page layout ----------------------------------------------------------- */ 15 | 16 | body { 17 | font-family: Arial, sans-serif; 18 | font-size: 100%; 19 | background-color: #111; 20 | color: #555; 21 | margin: 0; 22 | padding: 0; 23 | } 24 | 25 | div.documentwrapper { 26 | float: left; 27 | width: 100%; 28 | } 29 | 30 | div.bodywrapper { 31 | margin: 0 0 0 230px; 32 | } 33 | 34 | hr { 35 | border: 1px solid #B1B4B6; 36 | } 37 | 38 | div.document { 39 | background-color: #eee; 40 | } 41 | 42 | div.body { 43 | background-color: #ffffff; 44 | color: #3E4349; 45 | padding: 0 30px 30px 30px; 46 | font-size: 0.9em; 47 | } 48 | 49 | div.footer { 50 | color: #555; 51 | width: 100%; 52 | padding: 13px 0; 53 | text-align: center; 54 | font-size: 75%; 55 | } 56 | 57 | div.footer a { 58 | color: #444; 59 | text-decoration: underline; 60 | } 61 | 62 | div.related { 63 | background-color: #6BA81E; 64 | line-height: 32px; 65 | color: #fff; 66 | text-shadow: 0px 1px 0 #444; 67 | font-size: 0.9em; 68 | } 69 | 70 | div.related a { 71 | color: #E2F3CC; 72 | } 73 | 74 | div.sphinxsidebar { 75 | font-size: 0.75em; 76 | line-height: 1.5em; 77 | } 78 | 79 | div.sphinxsidebarwrapper{ 80 | padding: 20px 0; 81 | } 82 | 83 | div.sphinxsidebar h3, 84 | div.sphinxsidebar h4 { 85 | font-family: Arial, sans-serif; 86 | color: #222; 87 | font-size: 1.2em; 88 | font-weight: normal; 89 | margin: 0; 90 | padding: 5px 10px; 91 | background-color: #ddd; 92 | text-shadow: 1px 1px 0 white 93 | } 94 | 95 | div.sphinxsidebar h4{ 96 | font-size: 1.1em; 97 | } 98 | 99 | div.sphinxsidebar h3 a { 100 | color: #444; 101 | } 102 | 103 | 104 | div.sphinxsidebar p { 105 | color: #888; 106 | padding: 5px 20px; 107 | } 108 | 109 | div.sphinxsidebar p.topless { 110 | } 111 | 112 | div.sphinxsidebar ul { 113 | margin: 10px 20px; 114 | padding: 0; 115 | color: #000; 116 | } 117 | 118 | div.sphinxsidebar a { 119 | color: #444; 120 | } 121 | 122 | div.sphinxsidebar input { 123 | border: 1px solid #ccc; 124 | font-family: sans-serif; 125 | font-size: 1em; 126 | } 127 | 128 | div.sphinxsidebar input[type=text]{ 129 | margin-left: 20px; 130 | } 131 | 132 | div.sphinxsidebar input[type=submit]{ 133 | margin-left: 20px; 134 | } 135 | 136 | /* -- body styles ----------------------------------------------------------- */ 137 | 138 | a { 139 | color: #005B81; 140 | text-decoration: none; 141 | } 142 | 143 | a:hover { 144 | color: #E32E00; 145 | text-decoration: underline; 146 | } 147 | 148 | div.body h1, 149 | div.body h2, 150 | div.body h3, 151 | div.body h4, 152 | div.body h5, 153 | div.body h6 { 154 | font-family: Arial, sans-serif; 155 | background-color: #BED4EB; 156 | font-weight: normal; 157 | color: #212224; 158 | margin: 30px 0px 10px 0px; 159 | padding: 5px 0 5px 10px; 160 | text-shadow: 0px 1px 0 white 161 | } 162 | 163 | div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 200%; } 164 | div.body h2 { font-size: 150%; background-color: #C8D5E3; } 165 | div.body h3 { font-size: 120%; background-color: #D8DEE3; } 166 | div.body h4 { font-size: 110%; background-color: #D8DEE3; } 167 | div.body h5 { font-size: 100%; background-color: #D8DEE3; } 168 | div.body h6 { font-size: 100%; background-color: #D8DEE3; } 169 | 170 | a.headerlink { 171 | color: #c60f0f; 172 | font-size: 0.8em; 173 | padding: 0 4px 0 4px; 174 | text-decoration: none; 175 | } 176 | 177 | a.headerlink:hover { 178 | background-color: #c60f0f; 179 | color: white; 180 | } 181 | 182 | div.body p, 
div.body dd, div.body li { 183 | line-height: 1.5em; 184 | } 185 | 186 | div.admonition p.admonition-title + p { 187 | display: inline; 188 | } 189 | 190 | div.highlight{ 191 | background-color: white; 192 | } 193 | 194 | div.note { 195 | background-color: #eee; 196 | border: 1px solid #ccc; 197 | } 198 | 199 | div.seealso { 200 | background-color: #ffc; 201 | border: 1px solid #ff6; 202 | } 203 | 204 | div.topic { 205 | background-color: #eee; 206 | } 207 | 208 | div.warning { 209 | background-color: #ffe4e4; 210 | border: 1px solid #f66; 211 | } 212 | 213 | p.admonition-title { 214 | display: inline; 215 | } 216 | 217 | p.admonition-title:after { 218 | content: ":"; 219 | } 220 | 221 | pre { 222 | padding: 10px; 223 | background-color: White; 224 | color: #222; 225 | line-height: 1.2em; 226 | border: 1px solid #C6C9CB; 227 | font-size: 1.1em; 228 | margin: 1.5em 0 1.5em 0; 229 | -webkit-box-shadow: 1px 1px 1px #d8d8d8; 230 | -moz-box-shadow: 1px 1px 1px #d8d8d8; 231 | } 232 | 233 | code { 234 | background-color: #ecf0f3; 235 | color: #222; 236 | /* padding: 1px 2px; */ 237 | font-size: 1.1em; 238 | font-family: monospace; 239 | } 240 | 241 | .viewcode-back { 242 | font-family: Arial, sans-serif; 243 | } 244 | 245 | div.viewcode-block:target { 246 | background-color: #f4debf; 247 | border-top: 1px solid #ac9; 248 | border-bottom: 1px solid #ac9; 249 | } 250 | 251 | div.code-block-caption { 252 | background-color: #ddd; 253 | color: #222; 254 | border: 1px solid #C6C9CB; 255 | } -------------------------------------------------------------------------------- /doc/_build/html/schedular.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | schedular:调度器 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
schedular: the scheduler
============================

The scheduler is built on gevent's Queue together with a Bloom filter
(Wiki: https://en.wikipedia.org/wiki/Bloom_filter). The Queue keeps reads greenlet-safe when several
downloader greenlets consume it concurrently, and the Bloom filter provides URL de-duplication.

Enqueueing a request: enqueue_request(request)
----------------------------------------------

When a request is enqueued, the Bloom filter is consulted first to check whether the URL has already
been crawled. If it has not, the request goes straight into the queue; if it has, a logging.DEBUG
message is emitted saying the URL was ignored.

Taking a request from the queue: next_request()
-----------------------------------------------

This method pops one request from the Queue. If DOWNLOAD_DELAY is set in custom_settings, every pop
waits for that fixed amount of time.

PyCreeper uses three times the TIMEOUT value as the end-of-crawl signal: if the Queue stays empty for
3 * TIMEOUT, the crawl is considered finished and the spider exits.
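A minimal sketch of the Queue-plus-Bloom-filter behaviour described above. This is an illustrative
stand-in, not the actual pycreeper scheduler: the ToyScheduler name, its constructor arguments and
the choice of ScalableBloomFilter are assumptions made for the example.

```
# -*- coding:utf-8 -*-
# Illustrative sketch only -- NOT the real pycreeper scheduler.
import logging

import gevent
from gevent.queue import Queue, Empty
from pybloom import ScalableBloomFilter


class ToyScheduler(object):
    """Queue + Bloom filter, as described above (simplified)."""

    def __init__(self, download_delay=0, timeout=10):
        self.queue = Queue()               # greenlet-safe for many downloaders
        self.seen = ScalableBloomFilter()  # URL de-duplication
        self.download_delay = download_delay
        self.timeout = timeout

    def enqueue_request(self, request):
        if request.url in self.seen:
            logging.debug("ignore duplicated url %s", request.url)
            return
        self.seen.add(request.url)
        self.queue.put(request)

    def next_request(self):
        if self.download_delay:
            gevent.sleep(self.download_delay)   # DOWNLOAD_DELAY behaviour
        try:
            # an empty queue for 3 * TIMEOUT means the crawl is finished
            return self.queue.get(timeout=3 * self.timeout)
        except Empty:
            return None
```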
115 | 130 | 134 | 135 | -------------------------------------------------------------------------------- /doc/_build/html/structure.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 架构概览 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
Architecture overview
============================

PyCreeper's overall architecture breaks down into five parts: the engine, the downloader, the
downloader middlewares, the scheduler and the spider. The data passed between these parts are
Request/Response objects.

The data flows along the green arrows in the figure below.

What each part does
--------------------

[figure: _images/structure.jpg]

Engine -- the core of PyCreeper, responsible for coordinating the work of all other parts.
Internally it is implemented on top of gevent.Pool.

Downloader -- downloads requests, handling static and dynamic requests separately: static requests
are served with the requests library, dynamic requests are loaded with selenium.webdriver. When a
request finishes, the response is handed back to the engine.

Downloader middlewares -- a hook system sitting between the downloader and the engine; by writing
custom downloader middlewares you can apply special processing to requests and responses.

Scheduler -- built around gevent's Queue and a Bloom filter; requests are de-duplicated and
non-duplicate requests are queued until the engine takes them for processing.

Spider -- the user-facing interface: the user defines the start URLs, the callback for each request
and how the crawl results are processed.

Data flow
---------

The data moves through the following steps (a compressed code sketch follows this list):

1. The engine starts and feeds the spider's start_urls into the scheduler.
2. The engine takes a request from the scheduler.
3. The engine hands the request to the downloader, passing through the downloader middlewares' request processing.
4. The downloader acts according to the request type: static requests go to the requests library, dynamic requests are loaded with selenium.webdriver.
5. The downloader returns the response to the engine, passing through the downloader middlewares' response processing.
6. The engine hands the response to the handler defined by the spider.
7. The spider's handler may return a new request (back to step 2) or a dict with crawl results (continue to the next step).
8. The engine processes the result with the result-handling method defined by the spider.
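The eight steps above can be compressed into one loop. The sketch below is only an illustration
under simplifying assumptions: the real engine runs work concurrently on a gevent.Pool, whereas here
everything is sequential, and the helper name crawl_once is made up. Only enqueue_request /
next_request, Downloader.fetch(request, spider), request.callback, Spider.parse and process_item
come from the code and docs in this repository.

```
# Sequential illustration of the data flow above -- NOT the real pycreeper engine.
def crawl_once(spider, scheduler, downloader):
    # step 1: seed the scheduler with the spider's start requests
    for request in spider.start_requests():
        scheduler.enqueue_request(request)

    while True:
        # step 2: take one request from the scheduler
        request = scheduler.next_request()
        if request is None:          # queue stayed empty long enough -> crawl finished
            break
        # steps 3-5: request middlewares, static/dynamic download, response middlewares
        response = downloader.fetch(request, spider)
        # step 6: hand the response to the spider's handler (default: Spider.parse)
        callback = request.callback or spider.parse
        for result in callback(response):
            if isinstance(result, dict):
                # steps 7-8: a result dict goes to the user-defined item processing
                spider.process_item(result)
            else:
                # a new Request goes back to the scheduler (step 2)
                scheduler.enqueue_request(result)
```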
128 | 143 | 147 | 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyCreeper 2 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目底层异步网络I/O使用 **Gevent** 协程库,将网络请求分为静态请求和动态请求, 3 | 静态请求交给 **Requests** 处理,动态请求则使用 **Selenium.Webdriver** 加载。 4 | 5 | 在设计这个项目的过程中,我参考了很多[Scrapy](https://scrapy.org/)的架构和实现方式。Scrapy是一个非常棒的爬虫框架, 6 | 我之前花了很多心血在Scrapy框架之上! 7 | 8 | 这篇PyCreeper初探会编写一个简单的爬虫例子,让您明白PyCreeper大致的工作流程,使您快速上手。 9 | 10 | ## 目标任务 11 | [知乎](https://www.zhihu.com/)与Quora类似,是一个分享知识提出问题的平台。我们的Demo任务是模拟登陆知乎,保存Cookie, 12 | 之后发出一系列静态请求,获取首页的问题题目与描述。 13 | 14 | 由于模拟登陆一步我们采用了基于Selenium.Webdriver的动态请求处理,所以你可以抛开复杂的抓包与分析代码,只需要点几个按钮, 15 | 就像在真实环境登录知乎一样简单便利! 16 | 17 | 18 | ## 定义一个爬虫 19 | 定义一个爬虫类需要需要继承Spider类,代码如下: 20 | 21 | ``` 22 | from pycreeper.spider import Spider 23 | 24 | class Zhihu_Spider(Spider): 25 | pass 26 | ``` 27 | 28 | ## 选择中间件MiddleWares 29 | 对于Spider的中间件选择,通过修改custom_settings对象实现: 30 | 31 | ``` 32 | custom_settings = { 33 | 'DOWNLOADER_MIDDLEWARES': { 34 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 35 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 36 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 37 | }, 38 | 'DRIVER': 'Chrome', 39 | 'DOWNLOAD_DELAY': 2, 40 | 'USER_AGENT_LIST': [ 41 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 42 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 43 | ] 44 | } 45 | ``` 46 | 47 | 其中,DOWNLOADER_MIDDLEWARES是这个爬虫爬取过程中使用的中间件。UserAgentMiddleware提供了一种简单的控制请求User-Agent的方式(只对静态请求有效, 48 | 动态请求的UA取决于使用的WebDriver)。RetryMiddleware对失败的请求(错误的返回码,超时等)进行多次重试。CookiesMiddleware在全体的请求之间共享CookieJar池, 49 | 一组请求可以共享一个CookieJar,CookiesMiddleware维护CookieJar的有效性与一致性。 50 | 51 | DRIVER表明了动态请求的浏览器,这里我们使用Chrome。 52 | 53 | DOWNLOAD_DELAY表明了下载之间的延迟时间(秒),这个选项当网站有某种防爬策略时还是很有用的。 54 | 55 | USER_AGENT_LIST中包含请求使用的User-Agent,UserAgentMiddleware会从中随机取出一个来使用。 56 | 57 | 58 | ## 最开始的请求 59 | 下面这段代码通过重写start_requests方法yield一个PyCreeper请求: 60 | 61 | ``` 62 | def start_requests(self): 63 | 64 | def _login(driver): 65 | driver.find_element_by_name('account').send_keys("username") 66 | driver.find_element_by_name('password').send_keys("password") 67 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 68 | gevent.sleep(5) 69 | 70 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 71 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 72 | ``` 73 | 74 | 在Request对象的参数中,dynamic=True表明这是一个动态请求,将会调用WebDriver加载, 75 | 而browser_actions=[_login]则定义了浏览器加载完成之后进行的动作。本例中输入了用户名与密码,然后点击登录。 76 | gevent.sleep(5)则是令爬虫等待浏览器加载完成。 77 | 78 | meta={"cookiejar": "zhihu"}这个选项表明本次请求产生的Cookie将会被存储在名为zhihu的CookieJar当中 79 | 80 | callback=self.after_login定义了本次响应的处理函数。 81 | 82 | ## 接下来? 
83 | 84 | 接下来一步将在知乎首页中提取问题链接,发出静态问题请求: 85 | 86 | ``` 87 | def after_login(self, response): 88 | html = response.body 89 | selector = etree.HTML(html) 90 | links = selector.xpath('//a[@class="question_link"]') 91 | for link in links: 92 | yield Request('https://www.zhihu.com' + link.attrib["href"], 93 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 94 | ``` 95 | 96 | response.body存储了响应的内容。我们使用了lxml提取html文本中的标签,然后发出一系列静态请求。 97 | 98 | 在获得问题页面的数据之后,我们需要做的是提取出其中的问题标题与详情: 99 | 100 | ``` 101 | def get_item(self, response): 102 | html = response.body 103 | selector = etree.HTML(html) 104 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 105 | body = selector.xpath('//span[@class="RichText"]')[0].text 106 | yield { 107 | 'head': head, 108 | 'body': body 109 | } 110 | ``` 111 | 112 | 过程与上个函数类似,通过xpath定位元素。 113 | 114 | ## 处理你获得的数据 115 | 处理数据通过重写process_item方法实现: 116 | 117 | ``` 118 | def process_item(self, item): 119 | print json.dumps(item, ensure_ascii=False) 120 | ```` 121 | 122 | 这里我们只是将结果打印。 123 | 124 | ## 运行你的爬虫 125 | 最后我们通过这样一段代码运行爬虫: 126 | 127 | ``` 128 | if __name__ == "__main__": 129 | spider = Zhihu_Spider() 130 | spider.start() 131 | ``` 132 | 133 | 完整的代码如下: 134 | 135 | ``` 136 | # -*- coding:utf-8 -*- 137 | 138 | from pycreeper.spider import Spider 139 | from pycreeper.http.request import Request 140 | from lxml import etree 141 | import json 142 | import gevent 143 | 144 | 145 | class Zhihu_Spider(Spider): 146 | 147 | custom_settings = { 148 | 'DOWNLOADER_MIDDLEWARES': { 149 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 150 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 151 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 152 | }, 153 | 'DRIVER': 'Chrome', 154 | 'DOWNLOAD_DELAY': 2, 155 | 'STATIC_REQUEST_SSL_VERIFY': False, 156 | 'USER_AGENT_LIST': [ 157 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 158 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 159 | ] 160 | } 161 | 162 | def start_requests(self): 163 | 164 | def _login(driver): 165 | driver.find_element_by_name('account').send_keys("username") 166 | driver.find_element_by_name('password').send_keys("password") 167 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 168 | gevent.sleep(5) 169 | 170 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 171 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 172 | 173 | def after_login(self, response): 174 | html = response.body 175 | selector = etree.HTML(html) 176 | links = selector.xpath('//a[@class="question_link"]') 177 | for link in links: 178 | yield Request('https://www.zhihu.com' + link.attrib["href"], 179 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 180 | 181 | def get_item(self, response): 182 | html = response.body 183 | selector = etree.HTML(html) 184 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 185 | body = selector.xpath('//span[@class="RichText"]')[0].text 186 | yield { 187 | 'head': head, 188 | 'body': body 189 | } 190 | 191 | def process_item(self, item): 192 | print json.dumps(item, ensure_ascii=False) 193 | 194 | if __name__ == "__main__": 195 | spider = Zhihu_Spider() 196 | spider.start() 197 | 198 | ``` 199 | 200 | ## 写在后面 201 | 项目已经通过PyPi发布,您可以通过以下命令下载: 202 | 203 | ``` 204 | pip install pycreeper 205 | ``` 206 | 207 | 未来我们将会引入Docker的支持。 208 | 209 | 
目前项目刚刚发布1.0.0版本,如果在使用时,遇到各种问题,我们都欢迎您反馈给我们,您可以通过github[项目主页](https://github.com/ZcyAndWt/pyCreeper),也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 210 | 211 | 如果您使用中,觉得本项目有可取之处,提高了您爬取数据的效率,希望您能在github上star本项目。 212 | 您的支持是我们前进最大的动力! 213 | -------------------------------------------------------------------------------- /doc/_build/html/prepare.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 使用前的准备 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
Before you start
============================

We assume you already have Python 2.7 or later installed; if not, please visit the Python website
(https://www.python.org/) and install a suitable version.

PyCreeper depends on the following libraries:

- gevent
- importlib
- requests
- chardet
- w3lib
- six
- pybloom
- Selenium

If you install the project with pip, the dependencies are installed on your machine automatically
(at least in theory).

Installing the project with pip::

    pip install pycreeper

Configuring a Selenium driver
------------------------------

When you want to drive a particular browser, Selenium needs that browser's driver to be installed.
For example, to load requests with Chrome you need to download and install Chromedriver
(https://sites.google.com/a/chromium.org/chromedriver/downloads) and put the executable on your
PATH so that Python can reach it.

A few commonly used drivers:

=========  ===============================================================
Name       Link
=========  ===============================================================
Chrome     https://sites.google.com/a/chromium.org/chromedriver/downloads
Firefox    https://github.com/mozilla/geckodriver/releases
PhantomJS  http://phantomjs.org/download.html
=========  ===============================================================

PhantomJS is a headless WebKit; on a machine without a GUI it is your best choice.

For more detailed Selenium configuration, see http://selenium-python.readthedocs.io/.
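A quick way to confirm that the driver you configured is actually reachable from Python -- a minimal
smoke test, assuming chromedriver is installed and on your PATH (swap in webdriver.Firefox() or
webdriver.PhantomJS() for the other rows of the table):

```
# -*- coding:utf-8 -*-
# Minimal smoke test for the Selenium driver configured above.
from selenium import webdriver

driver = webdriver.Chrome()   # raises immediately if chromedriver is not on PATH
try:
    driver.get("http://httpbin.org/get")
    print driver.title        # any output at all means the driver is working
finally:
    driver.quit()
```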
149 | 164 | 168 | 169 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/intro.rst.txt: -------------------------------------------------------------------------------- 1 | PyCreeper初探 2 | ============== 3 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目底层异步网络I/O使用 **Gevent** 协程库,将网络请求分为静态请求和动态请求, 4 | 静态请求交给 **Requests** 处理,动态请求则使用 **Selenium.Webdriver** 加载。 5 | 6 | 在设计这个项目的过程中,我参考了很多 **Scrapy** (项目网站: https://scrapy.org/)的架构和实现方式。Scrapy是一个非常棒的爬虫框架, 7 | 我之前花了很多心血在Scrapy框架之上! 8 | 9 | 这篇PyCreeper初探会编写一个简单的爬虫例子,让您明白PyCreeper大致的工作流程,使您快速上手。 10 | 11 | 目标任务 12 | --------- 13 | 知乎(https://www.zhihu.com/)与Quora类似,是一个分享知识提出问题的平台。我们的Demo任务是模拟登陆知乎,保存Cookie, 14 | 之后发出一系列静态请求,获取首页的问题题目与描述。 15 | 16 | 由于模拟登陆一步我们采用了基于Selenium.Webdriver的动态请求处理,所以你可以抛开复杂的抓包与分析代码,只需要点几个按钮, 17 | 就像在真实环境登录知乎一样简单便利! 18 | 19 | 20 | 定义一个爬虫 21 | ------------- 22 | 定义一个爬虫类需要需要继承Spider类,代码如下:: 23 | 24 | from pycreeper.spider import Spider 25 | 26 | class Zhihu_Spider(Spider): 27 | pass 28 | 29 | 选择中间件MiddleWares 30 | ---------------------- 31 | 对于Spider的中间件选择,通过修改custom_settings对象实现:: 32 | 33 | custom_settings = { 34 | 'DOWNLOADER_MIDDLEWARES': { 35 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 36 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 37 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 38 | }, 39 | 'DRIVER': 'Chrome', 40 | 'DOWNLOAD_DELAY': 2, 41 | 'USER_AGENT_LIST': [ 42 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 43 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 44 | ] 45 | } 46 | 47 | 其中,DOWNLOADER_MIDDLEWARES是这个爬虫爬取过程中使用的中间件。UserAgentMiddleware提供了一种简单的控制请求User-Agent的方式(只对静态请求有效, 48 | 动态请求的UA取决于使用的WebDriver)。RetryMiddleware对失败的请求(错误的返回码,超时等)进行多次重试。CookiesMiddleware在全体的请求之间共享CookieJar池, 49 | 一组请求可以共享一个CookieJar,CookiesMiddleware维护CookieJar的有效性与一致性。 50 | 51 | DRIVER表明了动态请求的浏览器,这里我们使用Chrome。 52 | 53 | DOWNLOAD_DELAY表明了下载之间的延迟时间(秒),这个选项当网站有某种防爬策略时还是很有用的。 54 | 55 | USER_AGENT_LIST中包含请求使用的User-Agent,UserAgentMiddleware会从中随机取出一个来使用。 56 | 57 | 58 | 最开始的请求 59 | ------------- 60 | 下面这段代码通过重写start_requests方法yield一个PyCreeper请求:: 61 | 62 | def start_requests(self): 63 | 64 | def _login(driver): 65 | driver.find_element_by_name('account').send_keys("username") 66 | driver.find_element_by_name('password').send_keys("password") 67 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 68 | gevent.sleep(5) 69 | 70 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 71 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 72 | 73 | 在Request对象的参数中,dynamic=True表明这是一个动态请求,将会调用WebDriver加载, 74 | 而browser_actions=[_login]则定义了浏览器加载完成之后进行的动作。本例中输入了用户名与密码,然后点击登录。 75 | gevent.sleep(5)则是令爬虫等待浏览器加载完成。 76 | 77 | meta={"cookiejar": "zhihu"}这个选项表明本次请求产生的Cookie将会被存储在名为zhihu的CookieJar当中 78 | 79 | callback=self.after_login定义了本次响应的处理函数。 80 | 81 | 接下来? 
82 | -------- 83 | 接下来一步将在知乎首页中提取问题链接,发出静态问题请求:: 84 | 85 | def after_login(self, response): 86 | html = response.body 87 | selector = etree.HTML(html) 88 | links = selector.xpath('//a[@class="question_link"]') 89 | for link in links: 90 | yield Request('https://www.zhihu.com' + link.attrib["href"], 91 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 92 | 93 | response.body存储了响应的内容。我们使用了lxml提取html文本中的标签,然后发出一系列静态请求。 94 | 95 | 在获得问题页面的数据之后,我们需要做的是提取出其中的问题标题与详情:: 96 | 97 | def get_item(self, response): 98 | html = response.body 99 | selector = etree.HTML(html) 100 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 101 | body = selector.xpath('//span[@class="RichText"]')[0].text 102 | yield { 103 | 'head': head, 104 | 'body': body 105 | } 106 | 107 | 过程与上个函数类似,通过xpath定位元素。 108 | 109 | 处理你获得的数据 110 | ----------------- 111 | 处理数据通过重写process_item方法实现:: 112 | 113 | def process_item(self, item): 114 | print json.dumps(item, ensure_ascii=False) 115 | 116 | 这里我们只是将结果打印。 117 | 118 | 运行你的爬虫 119 | ------------- 120 | 最后我们通过这样一段代码运行爬虫:: 121 | 122 | if __name__ == "__main__": 123 | spider = Zhihu_Spider() 124 | spider.start() 125 | 126 | 完整的代码如下:: 127 | 128 | # -*- coding:utf-8 -*- 129 | 130 | from pycreeper.spider import Spider 131 | from pycreeper.http.request import Request 132 | from lxml import etree 133 | import json 134 | import gevent 135 | 136 | 137 | class Zhihu_Spider(Spider): 138 | 139 | custom_settings = { 140 | 'DOWNLOADER_MIDDLEWARES': { 141 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 142 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 143 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 144 | }, 145 | 'DRIVER': 'Chrome', 146 | 'DOWNLOAD_DELAY': 2, 147 | 'STATIC_REQUEST_SSL_VERIFY': False, 148 | 'USER_AGENT_LIST': [ 149 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 150 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 151 | ] 152 | } 153 | 154 | def start_requests(self): 155 | 156 | def _login(driver): 157 | driver.find_element_by_name('account').send_keys("15501277123") 158 | driver.find_element_by_name('password').send_keys("zcymichael") 159 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 160 | gevent.sleep(5) 161 | 162 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 163 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 164 | 165 | def after_login(self, response): 166 | html = response.body 167 | selector = etree.HTML(html) 168 | links = selector.xpath('//a[@class="question_link"]') 169 | for link in links: 170 | yield Request('https://www.zhihu.com' + link.attrib["href"], 171 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 172 | 173 | def get_item(self, response): 174 | html = response.body 175 | selector = etree.HTML(html) 176 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 177 | body = selector.xpath('//span[@class="RichText"]')[0].text 178 | yield { 179 | 'head': head, 180 | 'body': body 181 | } 182 | 183 | def process_item(self, item): 184 | print json.dumps(item, ensure_ascii=False) 185 | 186 | if __name__ == "__main__": 187 | spider = Zhihu_Spider() 188 | spider.start() 189 | 190 | 191 | 写在后面 192 | --------- 193 | 项目已经通过PyPi发布,您可以通过以下命令下载:: 194 | 195 | pip install pycreeper 196 | 197 | 未来我们将会引入Docker的支持。 198 | 199 | 目前项目刚刚发布1.0.0版本,如果在使用时,遇到各种问题,我们都欢迎您反馈给我们,您可以通过github, 200 | 
项目主页:https://github.com/ZcyAndWt/pyCreeper,也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 201 | 202 | 如果您使用中,觉得本项目有可取之处,提高了您爬取数据的速度,希望您能在github上star本项目。 203 | 您的支持是我们前进最大的动力! 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /doc/tutorial.rst: -------------------------------------------------------------------------------- 1 | PyCreeper初探 2 | ============== 3 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目底层异步网络I/O使用 **Gevent** 协程库,将网络请求分为静态请求和动态请求, 4 | 静态请求交给 **Requests** 处理,动态请求则使用 **Selenium.Webdriver** 加载。 5 | 6 | 在设计这个项目的过程中,我参考了很多 **Scrapy** (项目网站: https://scrapy.org/)的架构和实现方式。Scrapy是一个非常棒的爬虫框架, 7 | 我之前花了很多心血在Scrapy框架之上! 8 | 9 | 这篇PyCreeper初探会编写一个简单的爬虫例子,让您明白PyCreeper大致的工作流程,使您快速上手。 10 | 11 | 如果您的PyCreeper还没有安装好,请参考: :doc:`prepare`。 12 | 13 | 目标任务 14 | --------- 15 | 知乎(https://www.zhihu.com/)与Quora类似,是一个分享知识提出问题的平台。我们的Demo任务是模拟登陆知乎,保存Cookie, 16 | 之后发出一系列静态请求,获取首页的问题题目与描述。 17 | 18 | 由于模拟登陆一步我们采用了基于Selenium.Webdriver的动态请求处理,所以你可以抛开复杂的抓包与分析代码,只需要点几个按钮, 19 | 就像在真实环境登录知乎一样简单便利! 20 | 21 | 22 | 定义一个爬虫 23 | ------------- 24 | 定义一个爬虫类需要需要继承Spider类,代码如下:: 25 | 26 | from pycreeper.spider import Spider 27 | 28 | class Zhihu_Spider(Spider): 29 | pass 30 | 31 | 选择中间件MiddleWares 32 | ---------------------- 33 | 对于Spider的中间件选择,通过修改custom_settings对象实现:: 34 | 35 | custom_settings = { 36 | 'DOWNLOADER_MIDDLEWARES': { 37 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 38 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 39 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 40 | }, 41 | 'DRIVER': 'Chrome', 42 | 'DOWNLOAD_DELAY': 2, 43 | 'USER_AGENT_LIST': [ 44 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 45 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 46 | ] 47 | } 48 | 49 | 其中,DOWNLOADER_MIDDLEWARES是这个爬虫爬取过程中使用的中间件。UserAgentMiddleware提供了一种简单的控制请求User-Agent的方式(只对静态请求有效, 50 | 动态请求的UA取决于使用的WebDriver)。RetryMiddleware对失败的请求(错误的返回码,超时等)进行多次重试。CookiesMiddleware在全体的请求之间共享CookieJar池, 51 | 一组请求可以共享一个CookieJar,CookiesMiddleware维护CookieJar的有效性与一致性。 52 | 53 | DRIVER表明了动态请求的浏览器,这里我们使用Chrome。 54 | 55 | DOWNLOAD_DELAY表明了下载之间的延迟时间(秒),这个选项当网站有某种防爬策略时还是很有用的。 56 | 57 | USER_AGENT_LIST中包含请求使用的User-Agent,UserAgentMiddleware会从中随机取出一个来使用。 58 | 59 | 60 | 最开始的请求 61 | ------------- 62 | 下面这段代码通过重写start_requests方法yield一个PyCreeper请求:: 63 | 64 | def start_requests(self): 65 | 66 | def _login(driver): 67 | driver.find_element_by_name('account').send_keys("username") 68 | driver.find_element_by_name('password').send_keys("password") 69 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 70 | gevent.sleep(5) 71 | 72 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 73 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 74 | 75 | 在Request对象的参数中,dynamic=True表明这是一个动态请求,将会调用WebDriver加载, 76 | 而browser_actions=[_login]则定义了浏览器加载完成之后进行的动作。本例中输入了用户名与密码,然后点击登录。 77 | gevent.sleep(5)则是令爬虫等待浏览器加载完成。 78 | 79 | meta={"cookiejar": "zhihu"}这个选项表明本次请求产生的Cookie将会被存储在名为zhihu的CookieJar当中 80 | 81 | callback=self.after_login定义了本次响应的处理函数。 82 | 83 | 接下来? 
84 | -------- 85 | 接下来一步将在知乎首页中提取问题链接,发出静态问题请求:: 86 | 87 | def after_login(self, response): 88 | html = response.body 89 | selector = etree.HTML(html) 90 | links = selector.xpath('//a[@class="question_link"]') 91 | for link in links: 92 | yield Request('https://www.zhihu.com' + link.attrib["href"], 93 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 94 | 95 | response.body存储了响应的内容。我们使用了lxml提取html文本中的标签,然后发出一系列静态请求。 96 | 97 | 在获得问题页面的数据之后,我们需要做的是提取出其中的问题标题与详情:: 98 | 99 | def get_item(self, response): 100 | html = response.body 101 | selector = etree.HTML(html) 102 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 103 | body = selector.xpath('//span[@class="RichText"]')[0].text 104 | yield { 105 | 'head': head, 106 | 'body': body 107 | } 108 | 109 | 过程与上个函数类似,通过xpath定位元素。 110 | 111 | 处理你获得的数据 112 | ----------------- 113 | 处理数据通过重写process_item方法实现:: 114 | 115 | def process_item(self, item): 116 | print json.dumps(item, ensure_ascii=False) 117 | 118 | 这里我们只是将结果打印。 119 | 120 | 运行你的爬虫 121 | ------------- 122 | 最后我们通过这样一段代码运行爬虫:: 123 | 124 | if __name__ == "__main__": 125 | spider = Zhihu_Spider() 126 | spider.start() 127 | 128 | 完整的代码如下:: 129 | 130 | # -*- coding:utf-8 -*- 131 | 132 | from pycreeper.spider import Spider 133 | from pycreeper.http.request import Request 134 | from lxml import etree 135 | import json 136 | import gevent 137 | 138 | 139 | class Zhihu_Spider(Spider): 140 | 141 | custom_settings = { 142 | 'DOWNLOADER_MIDDLEWARES': { 143 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 144 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 145 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 146 | }, 147 | 'DRIVER': 'Chrome', 148 | 'DOWNLOAD_DELAY': 2, 149 | 'STATIC_REQUEST_SSL_VERIFY': False, 150 | 'USER_AGENT_LIST': [ 151 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 152 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 153 | ] 154 | } 155 | 156 | def start_requests(self): 157 | 158 | def _login(driver): 159 | driver.find_element_by_name('account').send_keys("username") 160 | driver.find_element_by_name('password').send_keys("password") 161 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 162 | gevent.sleep(5) 163 | 164 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 165 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 166 | 167 | def after_login(self, response): 168 | html = response.body 169 | selector = etree.HTML(html) 170 | links = selector.xpath('//a[@class="question_link"]') 171 | for link in links: 172 | yield Request('https://www.zhihu.com' + link.attrib["href"], 173 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 174 | 175 | def get_item(self, response): 176 | html = response.body 177 | selector = etree.HTML(html) 178 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 179 | body = selector.xpath('//span[@class="RichText"]')[0].text 180 | yield { 181 | 'head': head, 182 | 'body': body 183 | } 184 | 185 | def process_item(self, item): 186 | print json.dumps(item, ensure_ascii=False) 187 | 188 | if __name__ == "__main__": 189 | spider = Zhihu_Spider() 190 | spider.start() 191 | 192 | 193 | 写在后面 194 | --------- 195 | 项目已经通过PyPi发布,您可以通过以下命令下载:: 196 | 197 | pip install pycreeper 198 | 199 | 未来我们将会引入Docker的支持。 200 | 201 | 目前项目刚刚发布1.0.0版本,如果在使用时,遇到各种问题,我们都欢迎您反馈给我们,您可以通过github, 202 | 
项目主页:https://github.com/ZcyAndWt/pyCreeper,也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 203 | 204 | 如果您使用中,觉得本项目有可取之处,提高了您爬取数据的效率,希望您能在github上star本项目。 205 | 您的支持是我们前进最大的动力! 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /doc/_build/html/_sources/tutorial.rst.txt: -------------------------------------------------------------------------------- 1 | PyCreeper初探 2 | ============== 3 | PyCreeper是一个用来快速提取网页内容的信息采集(爬虫)框架。项目底层异步网络I/O使用 **Gevent** 协程库,将网络请求分为静态请求和动态请求, 4 | 静态请求交给 **Requests** 处理,动态请求则使用 **Selenium.Webdriver** 加载。 5 | 6 | 在设计这个项目的过程中,我参考了很多 **Scrapy** (项目网站: https://scrapy.org/)的架构和实现方式。Scrapy是一个非常棒的爬虫框架, 7 | 我之前花了很多心血在Scrapy框架之上! 8 | 9 | 这篇PyCreeper初探会编写一个简单的爬虫例子,让您明白PyCreeper大致的工作流程,使您快速上手。 10 | 11 | 如果您的PyCreeper还没有安装好,请参考: :doc:`prepare`。 12 | 13 | 目标任务 14 | --------- 15 | 知乎(https://www.zhihu.com/)与Quora类似,是一个分享知识提出问题的平台。我们的Demo任务是模拟登陆知乎,保存Cookie, 16 | 之后发出一系列静态请求,获取首页的问题题目与描述。 17 | 18 | 由于模拟登陆一步我们采用了基于Selenium.Webdriver的动态请求处理,所以你可以抛开复杂的抓包与分析代码,只需要点几个按钮, 19 | 就像在真实环境登录知乎一样简单便利! 20 | 21 | 22 | 定义一个爬虫 23 | ------------- 24 | 定义一个爬虫类需要需要继承Spider类,代码如下:: 25 | 26 | from pycreeper.spider import Spider 27 | 28 | class Zhihu_Spider(Spider): 29 | pass 30 | 31 | 选择中间件MiddleWares 32 | ---------------------- 33 | 对于Spider的中间件选择,通过修改custom_settings对象实现:: 34 | 35 | custom_settings = { 36 | 'DOWNLOADER_MIDDLEWARES': { 37 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 38 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 39 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 40 | }, 41 | 'DRIVER': 'Chrome', 42 | 'DOWNLOAD_DELAY': 2, 43 | 'USER_AGENT_LIST': [ 44 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 45 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 46 | ] 47 | } 48 | 49 | 其中,DOWNLOADER_MIDDLEWARES是这个爬虫爬取过程中使用的中间件。UserAgentMiddleware提供了一种简单的控制请求User-Agent的方式(只对静态请求有效, 50 | 动态请求的UA取决于使用的WebDriver)。RetryMiddleware对失败的请求(错误的返回码,超时等)进行多次重试。CookiesMiddleware在全体的请求之间共享CookieJar池, 51 | 一组请求可以共享一个CookieJar,CookiesMiddleware维护CookieJar的有效性与一致性。 52 | 53 | DRIVER表明了动态请求的浏览器,这里我们使用Chrome。 54 | 55 | DOWNLOAD_DELAY表明了下载之间的延迟时间(秒),这个选项当网站有某种防爬策略时还是很有用的。 56 | 57 | USER_AGENT_LIST中包含请求使用的User-Agent,UserAgentMiddleware会从中随机取出一个来使用。 58 | 59 | 60 | 最开始的请求 61 | ------------- 62 | 下面这段代码通过重写start_requests方法yield一个PyCreeper请求:: 63 | 64 | def start_requests(self): 65 | 66 | def _login(driver): 67 | driver.find_element_by_name('account').send_keys("username") 68 | driver.find_element_by_name('password').send_keys("password") 69 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 70 | gevent.sleep(5) 71 | 72 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 73 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 74 | 75 | 在Request对象的参数中,dynamic=True表明这是一个动态请求,将会调用WebDriver加载, 76 | 而browser_actions=[_login]则定义了浏览器加载完成之后进行的动作。本例中输入了用户名与密码,然后点击登录。 77 | gevent.sleep(5)则是令爬虫等待浏览器加载完成。 78 | 79 | meta={"cookiejar": "zhihu"}这个选项表明本次请求产生的Cookie将会被存储在名为zhihu的CookieJar当中 80 | 81 | callback=self.after_login定义了本次响应的处理函数。 82 | 83 | 接下来? 
84 | -------- 85 | 接下来一步将在知乎首页中提取问题链接,发出静态问题请求:: 86 | 87 | def after_login(self, response): 88 | html = response.body 89 | selector = etree.HTML(html) 90 | links = selector.xpath('//a[@class="question_link"]') 91 | for link in links: 92 | yield Request('https://www.zhihu.com' + link.attrib["href"], 93 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 94 | 95 | response.body存储了响应的内容。我们使用了lxml提取html文本中的标签,然后发出一系列静态请求。 96 | 97 | 在获得问题页面的数据之后,我们需要做的是提取出其中的问题标题与详情:: 98 | 99 | def get_item(self, response): 100 | html = response.body 101 | selector = etree.HTML(html) 102 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 103 | body = selector.xpath('//span[@class="RichText"]')[0].text 104 | yield { 105 | 'head': head, 106 | 'body': body 107 | } 108 | 109 | 过程与上个函数类似,通过xpath定位元素。 110 | 111 | 处理你获得的数据 112 | ----------------- 113 | 处理数据通过重写process_item方法实现:: 114 | 115 | def process_item(self, item): 116 | print json.dumps(item, ensure_ascii=False) 117 | 118 | 这里我们只是将结果打印。 119 | 120 | 运行你的爬虫 121 | ------------- 122 | 最后我们通过这样一段代码运行爬虫:: 123 | 124 | if __name__ == "__main__": 125 | spider = Zhihu_Spider() 126 | spider.start() 127 | 128 | 完整的代码如下:: 129 | 130 | # -*- coding:utf-8 -*- 131 | 132 | from pycreeper.spider import Spider 133 | from pycreeper.http.request import Request 134 | from lxml import etree 135 | import json 136 | import gevent 137 | 138 | 139 | class Zhihu_Spider(Spider): 140 | 141 | custom_settings = { 142 | 'DOWNLOADER_MIDDLEWARES': { 143 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100, 144 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200, 145 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300 146 | }, 147 | 'DRIVER': 'Chrome', 148 | 'DOWNLOAD_DELAY': 2, 149 | 'STATIC_REQUEST_SSL_VERIFY': False, 150 | 'USER_AGENT_LIST': [ 151 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 152 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''', 153 | ] 154 | } 155 | 156 | def start_requests(self): 157 | 158 | def _login(driver): 159 | driver.find_element_by_name('account').send_keys("username") 160 | driver.find_element_by_name('password').send_keys("password") 161 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 162 | gevent.sleep(5) 163 | 164 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"}, 165 | callback=self.after_login, dynamic=True, browser_actions=[_login]) 166 | 167 | def after_login(self, response): 168 | html = response.body 169 | selector = etree.HTML(html) 170 | links = selector.xpath('//a[@class="question_link"]') 171 | for link in links: 172 | yield Request('https://www.zhihu.com' + link.attrib["href"], 173 | meta={"cookiejar": "zhihu"}, callback=self.get_item) 174 | 175 | def get_item(self, response): 176 | html = response.body 177 | selector = etree.HTML(html) 178 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text 179 | body = selector.xpath('//span[@class="RichText"]')[0].text 180 | yield { 181 | 'head': head, 182 | 'body': body 183 | } 184 | 185 | def process_item(self, item): 186 | print json.dumps(item, ensure_ascii=False) 187 | 188 | if __name__ == "__main__": 189 | spider = Zhihu_Spider() 190 | spider.start() 191 | 192 | 193 | 写在后面 194 | --------- 195 | 项目已经通过PyPi发布,您可以通过以下命令下载:: 196 | 197 | pip install pycreeper 198 | 199 | 未来我们将会引入Docker的支持。 200 | 201 | 目前项目刚刚发布1.0.0版本,如果在使用时,遇到各种问题,我们都欢迎您反馈给我们,您可以通过github, 202 | 
项目主页:https://github.com/ZcyAndWt/pyCreeper,也可以通过邮件,作者的邮箱:zhengchenyu.backend@gmail.com。 203 | 204 | 如果您使用中,觉得本项目有可取之处,提高了您爬取数据的效率,希望您能在github上star本项目。 205 | 您的支持是我们前进最大的动力! 206 | 207 | 208 | 209 | 210 | 211 | -------------------------------------------------------------------------------- /doc/_build/html/http.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | request对象和response对象 — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 48 | 49 |
Request and Response objects
============================

Request and Response objects carry information between the PyCreeper components; while using the
crawler you will often need to work with both of them.

Request: customise your request
--------------------------------

Constructor::

    Request(url, callback=None, method='GET', headers=None, body=None, meta=None,
            encoding='utf-8', cookiejar=None, dynamic=False, browser_actions=None, wait=0)

url
    The URL to request.

callback
    The callback for this request; if it is not given, Spider.parse handles the response.

method
    GET and POST are supported. POST is only accepted while dynamic=False; with dynamic=True an
    AttributeError is raised.

headers
    A dict holding the header information for a static request.

body
    The request body of a static request.

meta
    A dict used to attach extra parameters to the request; other modules may read them.

encoding
    The encoding of the request, used to encode the url and body.

cookiejar
    Used to read the cookiejar carried by the request. Do not pass a value for this parameter when
    constructing a Request; a cookiejar passed in this way will not be used by PyCreeper.

dynamic
    Marks whether this request is a dynamic request.

browser_actions
    A list of functions executed after the browser has opened the given URL and before the page
    data is extracted.

wait
    How long to wait after the browser has opened the given URL and before the functions defined in
    browser_actions are executed. Particularly useful when the page issues a large number of
    asynchronous requests.
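A few examples put together from the constructor arguments documented above. The URLs are just the
httpbin endpoints already used in the test suite, and the _scroll helper is a made-up browser
action; callbacks are omitted, so responses would fall back to Spider.parse.

```
# -*- coding:utf-8 -*-
from pycreeper.http.request import Request

# static GET with custom headers, carrying a named cookiejar in meta
req_get = Request("http://httpbin.org/get",
                  headers={"Referer": "http://httpbin.org/"},
                  meta={"cookiejar": "demo"})

# static POST -- POST is only accepted while dynamic=False
req_post = Request("http://httpbin.org/post", method="POST",
                   body={"text": "pycreeper"})

# dynamic request: loaded by the WebDriver, waits 3 seconds, then runs
# each function in browser_actions against the driver
def _scroll(driver):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

req_dyn = Request("http://httpbin.org/html", dynamic=True, wait=3,
                  browser_actions=[_scroll])
```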
132 | 147 | 151 | 152 | -------------------------------------------------------------------------------- /pycreeper/downloader/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | """ Downloader """ 6 | import cookielib 7 | import requests 8 | from pycreeper.http.response import Response 9 | from pycreeper.downloader_middlewares import DownloaderMiddlewareManager 10 | from selenium.common.exceptions import TimeoutException as _TimeoutException 11 | from pycreeper.utils.exceptions import TimeoutException 12 | from requests.exceptions import Timeout 13 | import six 14 | from pycreeper.utils import _get_cookies_from_cookiejar 15 | import gevent 16 | import traceback 17 | 18 | 19 | class DownloadHandler(object): 20 | """ DownloadHandler """ 21 | 22 | def __init__(self, spider, driver, driver_sem, **kwargs): 23 | self.settings = spider.settings 24 | self.logger = spider.logger 25 | self.session_map = {} 26 | self.kwargs = kwargs 27 | self.driver = driver 28 | self.driver_sem = driver_sem 29 | 30 | def fetch(self, request): 31 | """fetch 32 | """ 33 | url = request.url 34 | if request.dynamic: 35 | return self._fetch_dynamic(request, url) 36 | else: 37 | return self._fetch_static(request, url) 38 | 39 | def _fetch_static(self, request, url): 40 | self.logger.info("processing static page %s", url) 41 | kwargs = { 42 | "timeout": self.settings["TIMEOUT"], 43 | "headers": request.headers, 44 | "verify": self.settings["STATIC_REQUEST_SSL_VERIFY"], 45 | } 46 | if "proxy" in request.meta and request.meta["proxy"]: 47 | kwargs.update(proxies=request.meta["proxy"]) 48 | try: 49 | session = requests.Session() 50 | if request.cookiejar: 51 | session.cookies = request.cookiejar 52 | if request.method == 'GET': 53 | response = session.get(url, **kwargs) 54 | elif request.method == 'POST': 55 | if request.body: 56 | kwargs.update(data=request.body) 57 | response = session.post(url, **kwargs) 58 | else: 59 | raise ValueError('Unacceptable HTTP verb %s' % request.method) 60 | return Response(response.url, request, status=response.status_code, 61 | cookiejar=response.cookies, body=response.content) 62 | except Timeout as e: 63 | raise TimeoutException(e.message) 64 | except Exception as e: 65 | self.logger.error("download error: %s", str(e), exc_info=True) 66 | raise e 67 | 68 | 69 | def _fetch_dynamic(self, request, url): 70 | self.logger.info("processing dynamic page %s", url) 71 | try: 72 | self.driver_sem.acquire() 73 | if request.cookiejar: 74 | cookies = _get_cookies_from_cookiejar(request.cookiejar) 75 | cookies = self._covert_cookies_to_dict(cookies) 76 | #self._removed_first_dot_in_front_of_domain(cookies) 77 | command_list = self._get_command_list(cookies) 78 | # make the current page to have the same domain with cookies 79 | self.driver.get(url) 80 | # load cookies 81 | for command in command_list: 82 | self.driver.execute_script(command) 83 | 84 | self.driver.set_page_load_timeout(self.settings["TIMEOUT"]) 85 | self.driver.get(url) 86 | gevent.sleep(request.wait) 87 | for func in request.browser_actions: 88 | func(self.driver) 89 | url = self.driver.current_url 90 | html = self.driver.page_source 91 | 92 | # generate cookies 93 | all_cookies = self.driver.get_cookies() 94 | self.driver.delete_all_cookies() 95 | self.driver_sem.release() 96 | 97 | all_cookies = self._to_byte(all_cookies) 98 | cookies = [self._make_cookie(**d) for d in 
all_cookies] 99 | 100 | # set cookies to cookiejar 101 | cj = cookielib.CookieJar() 102 | for cookie in cookies: 103 | cj.set_cookie(cookie) 104 | return Response(url, request, cookiejar=cj, body=html) 105 | except _TimeoutException as e: 106 | raise TimeoutException(e.message) 107 | except Exception as e: 108 | self.logger.error("download error: %s", str(e), exc_info=True) 109 | raise e 110 | 111 | def _removed_first_dot_in_front_of_domain(self, cookies): 112 | for cookie in cookies: 113 | for k in cookie: 114 | if k == 'domain' and str(cookie[k]).startswith('.'): 115 | cookie[k] = cookie[k][1:] 116 | 117 | def _get_command_list(self, cookies): 118 | js_list = [] 119 | for cookie in cookies: 120 | item_list = [cookie['name'] + '=' + cookie['value']] 121 | for k in ('domain', 'path', 'expiry'): 122 | if k in cookie and not (cookie[k] is None): 123 | item_list.append(str(k) + '=' + str(cookie[k])) 124 | js_list.append("document.cookie = '%s';\n" % ('; '.join(item_list))) 125 | return js_list 126 | 127 | def _make_cookie(self, **kwargs): 128 | return cookielib.Cookie( 129 | version=0, 130 | name=kwargs.get('name', None), 131 | value=kwargs.get('value', None), 132 | port=None, 133 | port_specified=False, 134 | domain=kwargs.get('domain', None), 135 | domain_specified=True, 136 | domain_initial_dot=False, 137 | path=kwargs.get('path', None), 138 | path_specified=True, 139 | secure=False, 140 | expires=kwargs.get('expires', None), 141 | discard=False, 142 | comment=None, 143 | comment_url=None, 144 | rest=None 145 | ) 146 | 147 | def _covert_cookies_to_dict(self, cookies): 148 | result = [] 149 | for cookie in cookies: 150 | cookie_dict = {} 151 | for key in ['name', 'value', 'domain', 'path', 'expires']: 152 | if getattr(cookie, key): 153 | cookie_dict[key] = getattr(cookie, key) 154 | result.append(cookie_dict) 155 | return result 156 | 157 | def _to_byte(self, cookies): 158 | result = [] 159 | for cookie in cookies: 160 | temp = {} 161 | for key in cookie.keys(): 162 | temp[key.encode('utf-8') if isinstance(key, six.text_type) else key] = \ 163 | cookie[key].encode('utf-8') if isinstance(cookie[key], six.text_type) else cookie[key] 164 | result.append(temp) 165 | return result 166 | 167 | 168 | 169 | 170 | class Downloader(object): 171 | """ Downloader """ 172 | 173 | def __init__(self, spider, driver, driver_sem): 174 | self.hanlder = DownloadHandler(spider, driver, driver_sem) 175 | self.middleware = DownloaderMiddlewareManager(spider) 176 | 177 | def fetch(self, request, spider): 178 | """fetch 179 | 180 | @request, Request, 请求 181 | """ 182 | return self.middleware.download(self._download, request) 183 | 184 | def _download(self, request): 185 | """download 186 | """ 187 | return self.hanlder.fetch(request) 188 | -------------------------------------------------------------------------------- /doc/_build/html/index.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | PyCreeper: 抓取你能看到的一切! — PyCreeper 1.0.0 documentation 10 | 11 | 12 | 13 | 14 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 44 | 45 |
PyCreeper: scrape everything you can see!
=========================================

PyCreeper is a crawling framework for quickly extracting web content. By driving Selenium.WebDriver
it loads and controls pages dynamically, hoping to spare crawler enthusiasts much of the trouble of
reading page source, capturing HTTP packets and analysing cookies.

Project homepage: https://github.com/ZcyAndWt/pyCreeper

Author's e-mail: zhengchenyu.backend@gmail.com

If you hit any problem, or anything about the project annoys you, please get in touch with us!
131 | 143 | 147 | 148 | -------------------------------------------------------------------------------- /tests/test_downloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | reload(__import__('sys')).setdefaultencoding('utf-8') 3 | __author__ = 'zcy' 4 | 5 | import unittest 6 | import time 7 | import json 8 | 9 | from pycreeper.utils.exceptions import TimeoutException 10 | import gevent 11 | 12 | from gevent.pool import Pool 13 | from pycreeper.downloader_middlewares.cookies_middlewares import CookiesMiddleware 14 | from pycreeper.downloader import DownloadHandler 15 | from pycreeper.spider import Spider 16 | from pycreeper.http.request import Request 17 | from pycreeper.http.response import Response 18 | from selenium import webdriver 19 | from gevent.lock import BoundedSemaphore 20 | 21 | HTTPBIN_URL = 'http://httpbin.org' 22 | 23 | 24 | 25 | 26 | class DownloadHandlerTest(unittest.TestCase): 27 | def setUp(self): 28 | self.spider = Spider() 29 | self.spider.settings.set('TIMEOUT', 15) 30 | self.driver = None 31 | self.driver_sem = BoundedSemaphore(1) 32 | 33 | def test_concurrency_with_delayed_url(self): 34 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 35 | n = 5 36 | pool = Pool(n) 37 | urls = [] 38 | for i in range(n): 39 | urls.append(HTTPBIN_URL + '/delay/1') 40 | time_start = time.time() 41 | pool.map(dh.fetch, [Request(url) for url in urls]) 42 | time_total = time.time() - time_start 43 | self.assertLess(time_total, n) 44 | 45 | def test_timeout_static(self): 46 | self.spider.settings.set('TIMEOUT', 5) 47 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 48 | self.assertRaises(TimeoutException, dh.fetch, Request(HTTPBIN_URL + '/delay/10')) 49 | 50 | def test_timeout_dynamic(self): 51 | self.driver = webdriver.PhantomJS() 52 | self.spider.settings.set('TIMEOUT', 5) 53 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 54 | self.assertRaises(TimeoutException, dh.fetch, Request(HTTPBIN_URL + '/delay/10', dynamic=True)) 55 | self.driver.close() 56 | 57 | def test_post_data_static(self): 58 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 59 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST')) 60 | self.assertIsInstance(response, Response) 61 | self.assertEqual(response.status, 200) 62 | 63 | def test_post_data_content_static(self): 64 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 65 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST', body={'text': 'pycreeper'})) 66 | self.assertIsInstance(response, Response) 67 | self.assertEqual(json.loads(response.body)['form'], {'text': 'pycreeper'}) 68 | 69 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST', body=u'Unicode测试')) 70 | self.assertEqual(json.loads(response.body)['data'], 'Unicode测试') 71 | 72 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST', body='中文测试')) 73 | self.assertEqual(json.loads(response.body)['data'], '中文测试') 74 | self.assertEqual(response.status, 200) 75 | 76 | def test_get_data(self): 77 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 78 | response = dh.fetch(Request(HTTPBIN_URL + '/get')) 79 | self.assertIsInstance(response, Response) 80 | self.assertEqual(response.status, 200) 81 | 82 | def test_dynamic_request(self): 83 | self.driver = webdriver.PhantomJS() 84 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 85 | request = Request(HTTPBIN_URL + '/get', dynamic=True) 
86 | dh.fetch(request) 87 | self.driver.close() 88 | 89 | def test_dynamic_request_wait(self): 90 | self.driver = webdriver.PhantomJS() 91 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 92 | request = Request(HTTPBIN_URL + '/get', dynamic=True, wait=3) 93 | dh.fetch(request) 94 | self.driver.close() 95 | 96 | def test_dynamic_request_timeout(self): 97 | self.driver = webdriver.PhantomJS() 98 | self.spider.settings.set('TIMEOUT', 5) 99 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 100 | request = Request(HTTPBIN_URL + '/delay/10', dynamic=True) 101 | self.assertRaises(TimeoutException, dh.fetch, request) 102 | self.driver.close() 103 | 104 | def test_dynamic_request_concurrency(self): 105 | self.driver = webdriver.PhantomJS() 106 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 107 | n = 5 108 | pool = Pool(n) 109 | urls = [] 110 | for i in range(n): 111 | urls.append(HTTPBIN_URL + '/delay/1') 112 | time1 = time.time() 113 | pool.map(dh.fetch, [Request(url, dynamic=True, wait=5) for url in urls]) 114 | self.assertGreater(time.time() - time1, n) 115 | self.driver.close() 116 | 117 | def test_dynamic_request_cookie_between_static_and_dynamic(self): 118 | cm = CookiesMiddleware(self.spider, self.spider.settings) 119 | self.driver = webdriver.PhantomJS() 120 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 121 | request = Request(HTTPBIN_URL + '/cookies/set?key1=val1&key2=val2', 122 | dynamic=True, meta={'cookiejar': 'test'}) 123 | response = dh.fetch(request) 124 | cm.process_response(request, response) 125 | request = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test'}) 126 | cm.process_request(request) 127 | response = dh.fetch(request) 128 | self.assertEqual(json.loads(response.body)['cookies'], 129 | {u'key1': u'val1', u'key2': u'val2'}) 130 | self.driver.close() 131 | 132 | def test_dynamic_request_multi_cookiejar(self): 133 | cm = CookiesMiddleware(self.spider, self.spider.settings) 134 | self.driver = webdriver.PhantomJS() 135 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 136 | 137 | # jar 1 138 | request = Request(HTTPBIN_URL + '/cookies/set?key1=val1', 139 | dynamic=True, meta={'cookiejar': 'test1'}) 140 | cm.process_request(request) 141 | response = dh.fetch(request) 142 | cm.process_response(request, response) 143 | 144 | # jar 2 145 | request = Request(HTTPBIN_URL + '/cookies/set?key2=val2', 146 | dynamic=True, meta={'cookiejar': 'test2'}) 147 | cm.process_request(request) 148 | response = dh.fetch(request) 149 | cm.process_response(request, response) 150 | 151 | # test jar2 152 | request = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test2'}) 153 | cm.process_request(request) 154 | response = dh.fetch(request) 155 | cm.process_response(request, response) 156 | self.assertEqual(json.loads(response.body)['cookies'], {u'key2': u'val2'}) 157 | 158 | # test jar1 159 | request = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test1'}) 160 | cm.process_request(request) 161 | response = dh.fetch(request) 162 | cm.process_response(request, response) 163 | self.assertEqual(json.loads(response.body)['cookies'], {u'key1': u'val1'}) 164 | self.driver.close() 165 | 166 | def test_dynamic_request_browser_actions(self): 167 | cm = CookiesMiddleware(self.spider, self.spider.settings) 168 | self.driver = webdriver.Chrome() 169 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem) 170 | 171 | def _actions(driver): 172 | 
driver.find_element_by_name('account').send_keys("username") 173 | driver.find_element_by_name('password').send_keys("pwd") 174 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click() 175 | gevent.sleep(5) 176 | 177 | request = Request('https://www.zhihu.com/#signin', 178 | dynamic=True, meta={'cookiejar': 'test'}, 179 | browser_actions=[_actions], 180 | ) 181 | cm.process_request(request) 182 | response = dh.fetch(request) 183 | cm.process_response(request, response) 184 | 185 | request = Request('https://www.zhihu.com', dynamic=True, meta={'cookiejar': 'test'}) 186 | cm.process_request(request) 187 | response = dh.fetch(request) 188 | cm.process_response(request, response) 189 | print response.body 190 | self.driver.close() 191 | 192 | 193 | class DownloadTest(unittest.TestCase): 194 | pass 195 | 196 | 197 | if __name__ == "__main__": 198 | unittest.main() 199 | --------------------------------------------------------------------------------