├── doc
├── _build
│ ├── html
│ │ ├── .nojekyll
│ │ ├── _static
│ │ │ ├── custom.css
│ │ │ ├── down.png
│ │ │ ├── file.png
│ │ │ ├── plus.png
│ │ │ ├── up.png
│ │ │ ├── minus.png
│ │ │ ├── comment.png
│ │ │ ├── structure.jpg
│ │ │ ├── ajax-loader.gif
│ │ │ ├── structure.vsdx
│ │ │ ├── up-pressed.png
│ │ │ ├── comment-bright.png
│ │ │ ├── comment-close.png
│ │ │ ├── down-pressed.png
│ │ │ ├── ~$$structure.~vsdx
│ │ │ ├── pygments.css
│ │ │ └── nature.css
│ │ ├── _sources
│ │ │ ├── last.rst.txt
│ │ │ ├── spider.rst.txt
│ │ │ ├── downloader.rst.txt
│ │ │ ├── downloader_middlewares.rst.txt
│ │ │ ├── schedular.rst.txt
│ │ │ ├── index.rst.txt
│ │ │ ├── structure.rst.txt
│ │ │ ├── http.rst.txt
│ │ │ ├── prepare.rst.txt
│ │ │ ├── settings.rst.txt
│ │ │ ├── ssettings.rst.txt
│ │ │ ├── intro.rst.txt
│ │ │ └── tutorial.rst.txt
│ │ ├── debug.log
│ │ ├── objects.inv
│ │ ├── _images
│ │ │ └── structure.jpg
│ │ ├── .buildinfo
│ │ ├── genindex.html
│ │ ├── search.html
│ │ ├── last.html
│ │ ├── spider.html
│ │ ├── downloader_middlewares.html
│ │ ├── downloader.html
│ │ ├── schedular.html
│ │ ├── structure.html
│ │ ├── prepare.html
│ │ ├── http.html
│ │ └── index.html
│ └── doctrees
│ │ ├── http.doctree
│ │ ├── last.doctree
│ │ ├── index.doctree
│ │ ├── intro.doctree
│ │ ├── spider.doctree
│ │ ├── prepare.doctree
│ │ ├── schedular.doctree
│ │ ├── settings.doctree
│ │ ├── ssettings.doctree
│ │ ├── structure.doctree
│ │ ├── tutorial.doctree
│ │ ├── downloader.doctree
│ │ ├── environment.pickle
│ │ └── downloader_middlewares.doctree
├── spider.rst
├── downloader.rst
├── _static
│ ├── structure.jpg
│ └── structure.vsdx
├── downloader_middlewares.rst
├── last.rst
├── Makefile
├── schedular.rst
├── index.rst
├── make.bat
├── structure.rst
├── http.rst
├── prepare.rst
├── settings.rst
├── conf.py
└── tutorial.rst
├── tests
├── __init__.py
├── http
│ ├── __init__.py
│ ├── test_http_request.py
│ └── test_http_response.py
├── utils
│ ├── __init__.py
│ ├── test_utils_log.py
│ ├── test_utils_hash.py
│ └── test_utils_datatypes.py
├── test_data
│ ├── __init__.py
│ └── test_settings_data.py
├── test_conf_settings.py
├── test_scheduler.py
├── test_downloader_middlewares.py
└── test_downloader.py
├── pycreeper
├── conf
│ ├── __init__.py
│ ├── settings.py
│ └── default_settings.py
├── http
│ ├── __init__.py
│ ├── response.py
│ └── request.py
├── __init__.py
├── utils
│ ├── gevent_wrapper.py
│ ├── hash.py
│ ├── exceptions.py
│ ├── log.py
│ ├── datatypes.py
│ └── __init__.py
├── spider.py
├── downloader_middlewares
│ ├── cookies_middlewares.py
│ ├── __init__.py
│ └── middlewares.py
├── scheduler.py
├── engine.py
└── downloader
│ └── __init__.py
├── setup.py
├── examples
├── zhihu_spider.py
└── jd_spider.py
└── README.md
/doc/_build/html/.nojekyll:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/doc/spider.rst:
--------------------------------------------------------------------------------
1 | spider: Spider
2 | ============================
3 |
4 |
--------------------------------------------------------------------------------
/doc/downloader.rst:
--------------------------------------------------------------------------------
1 | downloader: Downloader
2 | ============================
3 |
4 |
--------------------------------------------------------------------------------
/doc/_build/html/_static/custom.css:
--------------------------------------------------------------------------------
1 | /* This file intentionally left blank. */
2 |
--------------------------------------------------------------------------------
/doc/_build/html/_sources/last.rst.txt:
--------------------------------------------------------------------------------
1 | Final words
2 | ============================
3 |
4 |
--------------------------------------------------------------------------------
/doc/_build/html/debug.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/debug.log
--------------------------------------------------------------------------------
/doc/_build/html/objects.inv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/objects.inv
--------------------------------------------------------------------------------
/doc/_static/structure.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_static/structure.jpg
--------------------------------------------------------------------------------
/doc/_static/structure.vsdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_static/structure.vsdx
--------------------------------------------------------------------------------
/doc/downloader_middlewares.rst:
--------------------------------------------------------------------------------
1 | downloader_middlewares: Downloader middlewares
2 | ================================================
3 |
4 |
--------------------------------------------------------------------------------
/doc/_build/doctrees/http.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/http.doctree
--------------------------------------------------------------------------------
/doc/_build/doctrees/last.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/last.doctree
--------------------------------------------------------------------------------
/doc/_build/html/_static/down.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/down.png
--------------------------------------------------------------------------------
/doc/_build/html/_static/file.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/file.png
--------------------------------------------------------------------------------
/doc/_build/html/_static/plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/plus.png
--------------------------------------------------------------------------------
/doc/_build/html/_static/up.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/up.png
--------------------------------------------------------------------------------
/doc/_build/doctrees/index.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/index.doctree
--------------------------------------------------------------------------------
/doc/_build/doctrees/intro.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/intro.doctree
--------------------------------------------------------------------------------
/doc/_build/doctrees/spider.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/spider.doctree
--------------------------------------------------------------------------------
/doc/_build/html/_static/minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/minus.png
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
--------------------------------------------------------------------------------
/doc/_build/doctrees/prepare.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/prepare.doctree
--------------------------------------------------------------------------------
/doc/_build/doctrees/schedular.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/schedular.doctree
--------------------------------------------------------------------------------
/doc/_build/doctrees/settings.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/settings.doctree
--------------------------------------------------------------------------------
/doc/_build/doctrees/ssettings.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/ssettings.doctree
--------------------------------------------------------------------------------
/doc/_build/doctrees/structure.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/structure.doctree
--------------------------------------------------------------------------------
/doc/_build/doctrees/tutorial.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/tutorial.doctree
--------------------------------------------------------------------------------
/doc/_build/html/_images/structure.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_images/structure.jpg
--------------------------------------------------------------------------------
/doc/_build/html/_static/comment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/comment.png
--------------------------------------------------------------------------------
/doc/_build/html/_static/structure.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/structure.jpg
--------------------------------------------------------------------------------
/tests/http/__init__.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
--------------------------------------------------------------------------------
/doc/_build/doctrees/downloader.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/downloader.doctree
--------------------------------------------------------------------------------
/doc/_build/doctrees/environment.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/environment.pickle
--------------------------------------------------------------------------------
/doc/_build/html/_static/ajax-loader.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/ajax-loader.gif
--------------------------------------------------------------------------------
/doc/_build/html/_static/structure.vsdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/structure.vsdx
--------------------------------------------------------------------------------
/doc/_build/html/_static/up-pressed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/up-pressed.png
--------------------------------------------------------------------------------
/pycreeper/conf/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
--------------------------------------------------------------------------------
/doc/_build/html/_static/comment-bright.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/comment-bright.png
--------------------------------------------------------------------------------
/doc/_build/html/_static/comment-close.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/comment-close.png
--------------------------------------------------------------------------------
/doc/_build/html/_static/down-pressed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/down-pressed.png
--------------------------------------------------------------------------------
/doc/_build/html/_static/~$$structure.~vsdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/html/_static/~$$structure.~vsdx
--------------------------------------------------------------------------------
/pycreeper/http/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
--------------------------------------------------------------------------------
/tests/test_data/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
--------------------------------------------------------------------------------
/doc/_build/doctrees/downloader_middlewares.doctree:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cy-zheng/pyCreeper/HEAD/doc/_build/doctrees/downloader_middlewares.doctree
--------------------------------------------------------------------------------
/pycreeper/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | VERSION = (0, 0, 1)
6 |
--------------------------------------------------------------------------------
/doc/_build/html/.buildinfo:
--------------------------------------------------------------------------------
1 | # Sphinx build info version 1
2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3 | config: a4bea7a19f3fdfa82050b591f2231270
4 | tags: 645f666f9bcd5a90fca523b33c5a78b7
5 |
--------------------------------------------------------------------------------
/doc/last.rst:
--------------------------------------------------------------------------------
1 | Final words
2 | ============================
3 | PyCreeper aims to make crawling dynamic pages easier for crawler enthusiasts. If you run into any problem while
4 | using it, we welcome your feedback, either on GitHub (project page: https://github.com/ZcyAndWt/pyCreeper) or by
5 | e-mail to the author: zhengchenyu.backend@gmail.com.
6 |
7 | We plan to add support for installation via Docker in the future.
8 |
9 | If PyCreeper saves you work and speeds up your development, please give us a star on GitHub. Your support keeps us going!
--------------------------------------------------------------------------------
/pycreeper/utils/gevent_wrapper.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import gevent
6 |
7 |
8 | def spawn(func, *args, **kwargs):
9 | return gevent.spawn(func, *args, **kwargs)
10 |
11 |
12 | def join_all(funcs):
13 | gevent.joinall(funcs)
14 |
--------------------------------------------------------------------------------
/tests/test_data/test_settings_data.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | """ test settings """
6 |
7 | TEST_INT = 10
8 |
9 | TEST_JSON = '{"foo": ["bar", "baz"]}'
10 |
11 | TEST_STR = 'foo,bar,baz'
12 |
13 | TEST_DICT = {
14 | "foo": "bar"
15 | }
16 |
17 | TEST_LIST = [
18 | "foo",
19 | "bar",
20 | "baz"
21 | ]
22 |
23 | TEST_FLOAT = 9.11
24 |
25 | test_lowercase = True
26 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = PyCreeper
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/pycreeper/utils/hash.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import urllib
6 | from urlparse import parse_qsl, urlparse, urlunparse
7 | import hashlib
8 |
9 |
10 | def request_fingerprint(request):
11 | """request fingerprint
12 | """
13 | scheme, netloc, path, params, query, fragment = urlparse(request.url)
14 | keyvals = parse_qsl(query)
15 | keyvals.sort()
16 | query = urllib.urlencode(keyvals)
17 | canonicalize_url = urlunparse((
18 | scheme, netloc.lower(), path, params, query, fragment))
19 | fpr = hashlib.sha1()
20 | fpr.update(canonicalize_url)
21 | return fpr.hexdigest()
--------------------------------------------------------------------------------
/doc/schedular.rst:
--------------------------------------------------------------------------------
1 | schedular: Scheduler
2 | ============================
3 |
4 | The scheduler is built around gevent's Queue and a Bloom filter
5 | (Wiki: https://en.wikipedia.org/wiki/Bloom_filter).
6 | The Queue keeps reads safe when several downloader greenlets consume it concurrently, while the Bloom filter provides URL de-duplication.
7 |
8 | Enqueuing a request: enqueue_request(request)
9 | --------------------------------------------------
10 |
11 | When a request is enqueued, the Bloom filter is first consulted to check whether the URL has already been crawled.
12 | If it has not, the request goes straight into the queue; if it has, a logging.DEBUG message is emitted saying the URL was ignored.
13 |
14 | Fetching the next request: next_request()
15 | -----------------------------------------------
16 |
17 | This method pops one request from the Queue. If **DOWNLOAD_DELAY** is set in
18 | **custom_settings**, each call waits a fixed amount of time before returning a request.
19 |
20 | PyCreeper uses three times the **TIMEOUT** value as the end-of-crawl signal: if the Queue stays empty
21 | for 3*TIMEOUT seconds, the crawl is considered finished and the spider exits.
22 |
23 |
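24 | A minimal sketch of the scheduler API described above (the throwaway DemoSpider subclass and the
25 | example.com URLs are purely illustrative; defaults come from pycreeper.conf.default_settings)::
26 |
27 |     from pycreeper.spider import Spider
28 |     from pycreeper.http.request import Request
29 |     from pycreeper.scheduler import Scheduler
30 |
31 |     class DemoSpider(Spider):
32 |         start_urls = ['http://www.example.com/']
33 |
34 |     spider = DemoSpider()
35 |     scheduler = Scheduler(spider)
36 |     scheduler.enqueue_request(Request('http://www.example.com/'))
37 |     # same fingerprint: filtered by the Bloom filter and logged at DEBUG level
38 |     scheduler.enqueue_request(Request('http://www.example.com/'))
39 |     print len(scheduler)            # -> 1
40 |     request = scheduler.next_request()
41 |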
--------------------------------------------------------------------------------
/pycreeper/utils/exceptions.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | class PycreeperException(Exception):
6 | """
7 | Base pycreeper exception.
8 | """
9 |
10 | def __init__(self, msg=None, stacktrace=None):
11 | self.msg = msg
12 | self.stacktrace = stacktrace
13 |
14 | def __str__(self):
15 | exception_msg = "Message: %s\n" % self.msg
16 | if self.stacktrace is not None:
17 | stacktrace = "\n".join(self.stacktrace)
18 | exception_msg += "Stacktrace:\n%s" % stacktrace
19 | return exception_msg
20 |
21 |
22 | class TimeoutException(PycreeperException):
23 | pass
24 |
--------------------------------------------------------------------------------
/pycreeper/utils/log.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 |
6 | import logging
7 |
8 |
9 | def get_logger(settings, name='pyCreeperLogger'):
10 | """Create a Logger
11 | """
12 | log_level = getattr(logging, settings.get('LOG_LEVEL'), None)
13 | if not log_level:
14 |         raise ValueError('Invalid LOG_LEVEL. Please check your settings.py.')
15 | logger = logging.getLogger(name)
16 | logger.setLevel(log_level)
17 | stream = logging.StreamHandler()
18 | stream.setLevel(log_level)
19 | formatter = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s")
20 | stream.setFormatter(formatter)
21 | logger.addHandler(stream)
22 | return logger
23 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | name="pycreeper",
5 | version="1.0.0",
6 |     description='''A web crawler that is able to crawl dynamic web pages.''',
7 | author="zcy",
8 | author_email="zhengchenyu.backend@gmail.com",
9 | url="https://github.com/ZcyAndWt/pyCreeper",
10 | license="LGPL",
11 | packages=find_packages(exclude=('doc', 'doc.*', 'tests',
12 | 'tests.*', 'examples', 'examples.*')),
13 | install_requires=[
14 | 'gevent>=1.2.1',
15 | 'importlib>=1.0.4',
16 | 'requests>=2.8.1',
17 | 'chardet>=2.3.0',
18 | 'w3lib>=1.16.0',
19 | 'six>=1.9.0',
20 | 'pybloom>=1.1',
21 | 'selenium>=2.48.0'
22 | ],
23 | )
24 |
--------------------------------------------------------------------------------
/doc/index.rst:
--------------------------------------------------------------------------------
1 | .. PyCreeper documentation master file, created by
2 | sphinx-quickstart on Sat Mar 18 20:46:54 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | PyCreeper: crawl everything you can see!
7 | ==========================================
8 | PyCreeper is a web-scraping (crawler) framework for extracting page content quickly. By driving **Selenium.WebDriver**
9 | it loads and controls dynamic pages, sparing crawler enthusiasts much of the hassle of dissecting page source, capturing HTTP traffic and analysing cookies.
10 |
11 | Project page: https://github.com/ZcyAndWt/pyCreeper
12 |
13 | Author's e-mail: zhengchenyu.backend@gmail.com
14 |
15 | If you run into any problem, or anything just feels wrong while using the project, please let us know!
16 |
17 | .. toctree::
18 | :maxdepth: 2
19 |
20 |
21 | tutorial
22 | prepare
23 | structure
24 | settings
25 | http
26 | downloader
27 | downloader_middlewares
28 | schedular
29 | spider
30 | last
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=PyCreeper
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/tests/utils/test_utils_log.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 |
6 | import unittest
7 | import logging
8 |
9 | from pycreeper.conf.settings import Settings
10 | from pycreeper.utils.log import get_logger
11 |
12 |
13 | class SettingsTest(unittest.TestCase):
14 |
15 | def test_get_logger(self):
16 | settings = Settings()
17 | logger = get_logger(settings, 'testLogger')
18 | self.assertEqual(logger.level, logging.DEBUG)
19 |
20 | settings.set('LOG_LEVEL', 'INFO')
21 | logger = get_logger(settings, 'testLogger')
22 | self.assertEqual(logger.level, logging.INFO)
23 |
24 | settings.set('LOG_LEVEL', 'foo')
25 | self.assertRaises(ValueError, get_logger, settings, 'testLogger')
26 |
27 | self.assertEqual(logger.name, 'testLogger')
28 |
29 |
30 |
31 | if __name__ == "__main__":
32 | unittest.main()
--------------------------------------------------------------------------------
/doc/structure.rst:
--------------------------------------------------------------------------------
1 | Architecture overview
2 | ======================
3 | PyCreeper's architecture consists of five parts: the engine, the downloader, the downloader middlewares,
4 | the scheduler and the spider. The data passed between these parts are Request/Response objects.
5 |
6 | Data flows in the direction of the green arrows in the figure below.
7 |
8 | What each part does
9 | --------------------
10 |
11 | .. image:: _static/structure.jpg
12 |
13 | ------------------------------------
14 |
15 | The **engine** is the core of PyCreeper and coordinates the work of all the other parts. Internally it is built on gevent.Pool.
16 |
17 | The **downloader** downloads requests, handling static and dynamic requests separately: static requests go through the requests library,
18 | dynamic requests through selenium.webdriver. Once a request completes, the response is returned to the engine.
19 |
20 | The **downloader middlewares** can be thought of as a hook system sitting between the downloader and the engine; custom middlewares let you apply special processing to requests and responses.
21 |
22 | The **scheduler** is built around gevent's Queue and a Bloom filter: requests are checked for duplicates, and non-duplicate requests are queued until the engine takes them for processing.
23 |
24 | The **spider** is the user-facing interface: the user defines the start URLs, a callback for each request, and how the scraped results are handled.
25 |
26 | Data flow
27 | -------------
28 |
29 | The data flows through the steps below (a minimal spider exercising this loop is sketched after the list):
30 |
31 | #. The engine starts and feeds the spider's start_urls into the scheduler.
32 |
33 | #. The engine takes a request from the scheduler.
34 |
35 | #. The engine hands the request to the downloader, passing through the downloader middlewares' request processing on the way.
36 |
37 | #. The downloader acts according to the request type: static requests go to the requests library, dynamic requests are loaded with selenium.webdriver.
38 |
39 | #. The downloader returns the response to the engine, passing through the downloader middlewares' response processing.
40 |
41 | #. The engine hands the response to the handler defined by the spider.
42 |
43 | #. The spider's handler may return a request (back to step 2) or a dict containing a scraped result (on to the next step).
44 |
45 | #. The engine processes the result with the result handler defined by the spider.
46 |
47 |
48 |
49 |
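50 | A minimal sketch of a spider driving this loop (DemoSpider, the URLs, and the use of response.body
51 | as in examples/zhihu_spider.py are illustrative only)::
52 |
53 |     from pycreeper.spider import Spider
54 |     from pycreeper.http.request import Request
55 |
56 |     class DemoSpider(Spider):
57 |         start_urls = ['http://www.example.com/']
58 |
59 |         def parse(self, response):
60 |             # step 7: hand back another request ...
61 |             yield Request('http://www.example.com/next', callback=self.parse_item)
62 |
63 |         def parse_item(self, response):
64 |             # ... or a dict with the scraped result (step 8: Spider.process_item)
65 |             yield {'size': len(response.body)}
66 |
67 |     if __name__ == '__main__':
68 |         DemoSpider().start()
69 |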
--------------------------------------------------------------------------------
/doc/http.rst:
--------------------------------------------------------------------------------
1 | Request and Response objects
2 | ============================
3 |
4 | Request and Response objects carry information between the PyCreeper components; you will work with both of them constantly while writing a spider.
5 |
6 | Request: customise your request
7 | -----------------------------------
8 |
9 | Constructor::
10 |
11 |     Request(url, callback=None, method='GET', headers=None, body=None, meta=None,
12 |             encoding='utf-8', cookiejar=None, dynamic=False, browser_actions=None, wait=0)
13 |
14 | **url**
15 |
16 | The URL to request.
17 |
18 | **callback**
19 |
20 | The callback for this request; if not given, the response is handled by Spider.parse.
21 |
22 | **method**
23 |
24 | GET and POST are supported. POST is only accepted when dynamic=False;
25 | with dynamic=True it raises an AttributeError.
26 |
27 | **headers**
28 |
29 | A dict of header fields used for static requests.
30 |
31 | **body**
32 |
33 | The request body, used for static requests.
34 |
35 | **meta**
36 |
37 | A dict of extra values carried along with the request; other components may read them.
38 |
39 | **encoding**
40 |
41 | The encoding of the request, used to encode the url and body.
42 |
43 | **cookiejar**
44 |
45 | Used to read the cookiejar carried by the request. Do not pass a value here when constructing a request; a cookiejar passed in is never used by PyCreeper.
46 |
47 | **dynamic**
48 |
49 | Marks whether the request is a dynamic request.
50 |
51 | **browser_actions**
52 |
53 | A list of functions to run after the browser has opened the URL and before the data is extracted.
54 |
55 | **wait**
56 |
57 | How long to wait after the browser has opened the URL before the functions in browser_actions are run.
58 | This is especially useful when the page fires a lot of asynchronous requests.
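59 |
60 | A short sketch of both kinds of request (the scroll_down helper and the URLs are illustrative;
61 | driver.execute_script is the standard Selenium WebDriver call)::
62 |
63 |     from pycreeper.http.request import Request
64 |
65 |     # static GET, handled by Spider.parse unless a callback is given
66 |     req = Request('http://www.example.com/list?page=1',
67 |                   headers={'Referer': 'http://www.example.com/'},
68 |                   meta={'cookiejar': 'session-1'})
69 |
70 |     def scroll_down(driver):
71 |         driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
72 |
73 |     # dynamic request: open the page in the WebDriver, wait 3 seconds,
74 |     # then run the browser actions before the page content is extracted
75 |     dyn = Request('http://www.example.com/feed', dynamic=True,
76 |                   browser_actions=[scroll_down], wait=3)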
--------------------------------------------------------------------------------
/doc/prepare.rst:
--------------------------------------------------------------------------------
1 | Before you start
2 | ==================
3 | We assume you already have Python 2.7 (or a later 2.x release) installed; if not, please visit the Python website (https://www.python.org/) and install a suitable version.
4 |
5 | PyCreeper depends on the following libraries:
6 |
7 | * gevent
8 | * importlib
9 | * requests
10 | * chardet
11 | * w3lib
12 | * six
13 | * pybloom
14 | * Selenium
15 |
16 | If you install the project with pip, the dependencies are installed automatically (at least in theory).
17 |
18 | Install with pip::
19 |
20 |     pip install pycreeper
21 |
22 | Configuring a Selenium driver
23 | ------------------------------
24 | To drive a particular browser, Selenium needs the driver program for that browser to be installed.
25 | For example, to load requests with Chrome you need to download and install *Chromedriver* (https://sites.google.com/a/chromium.org/chromedriver/downloads)
26 | and put it on your PATH so that Python can find it.
27 |
28 | Commonly used drivers:
29 |
30 | ============== =======================================================================
31 | Name           Link
32 | ============== =======================================================================
33 | Chrome         https://sites.google.com/a/chromium.org/chromedriver/downloads
34 | Firefox        https://github.com/mozilla/geckodriver/releases
35 | PhantomJS      http://phantomjs.org/download.html
36 | ============== =======================================================================
37 |
38 | PhantomJS is a headless WebKit build and is your best choice on machines without a GUI.
39 |
40 | For more detail on configuring Selenium, see http://selenium-python.readthedocs.io/
41 |
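42 | A quick way to check that the driver is reachable (a small smoke test of our own, not part of PyCreeper;
43 | webdriver.Chrome, webdriver.Firefox and webdriver.PhantomJS are the standard Selenium entry points)::
44 |
45 |     from selenium import webdriver
46 |
47 |     driver = webdriver.Chrome()        # or webdriver.Firefox() / webdriver.PhantomJS()
48 |     driver.get('http://www.example.com/')
49 |     print driver.title                 # the page title prints if the driver works
50 |     driver.quit()
51 |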
--------------------------------------------------------------------------------
/tests/utils/test_utils_hash.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import unittest
6 |
7 | from pycreeper.utils.hash import request_fingerprint
8 | from pycreeper.http.request import Request
9 |
10 | __doctests__ = ['pycreeper.utils.hash']
11 |
12 | URLS = [
13 | 'http://www.example.com/index.html#print',
14 | 'http://www.example.com/index.html',
15 | 'http://www.xxx.com/index.html?id=77&nameid=2905210001&page=1',
16 | 'http://www.xxxxx.com/index.html?id=77&nameid=2905210001&page=1',
17 | 'http://www.xxxxx.com/index.html?test123123',
18 | 'http://www.xxxxx.com/index.html',
19 | 'ftp://www.xxxxx.com/index.html'
20 | ]
21 |
22 | REQUEST = [Request(url) for url in URLS]
23 |
24 |
25 | class RequestFingerprintTest(unittest.TestCase):
26 |
27 | def test_basic(self):
28 | self.assertRaises(AttributeError, request_fingerprint, None)
29 | self.assertNotEqual(REQUEST[0], REQUEST[1])
30 |
31 | def test_not_equal(self):
32 | self.assertNotEqual(REQUEST[2], REQUEST[3])
33 | self.assertNotEqual(REQUEST[3], REQUEST[4])
34 |         self.assertNotEqual(REQUEST[4], REQUEST[5])
35 |         self.assertNotEqual(REQUEST[5], REQUEST[6])
36 |
37 | if __name__ == "__main__":
38 |     unittest.main()
39 |
--------------------------------------------------------------------------------
/pycreeper/spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | """ Base Spider"""
6 |
7 | import json
8 |
9 | from pycreeper.conf.settings import Settings
10 | from pycreeper.http.request import Request
11 | from pycreeper.engine import Engine
12 | from pycreeper.utils.log import get_logger
13 |
14 |
15 | class Spider(object):
16 | """ Base Spider"""
17 |
18 | custom_settings = None
19 |
20 | def __init__(self):
21 | if not hasattr(self, "start_urls"):
22 | self.start_urls = []
23 | # init settings
24 | self.settings = Settings(self.custom_settings)
25 | self.logger = get_logger(self.settings)
26 | self.initialize()
27 |
28 | def initialize(self):
29 | """initialize
30 | """
31 | pass
32 |
33 | def start_requests(self):
34 | """start_requests
35 | """
36 | for url in self.start_urls:
37 | yield Request(url)
38 |
39 | def start(self):
40 | """start
41 | """
42 | engine = Engine(self)
43 | engine.start()
44 |
45 | def parse(self, response):
46 | """parse
47 | """
48 | raise NotImplementedError
49 |
50 | def process_item(self, item):
51 | """process item
52 | """
53 | self.logger.debug(json.dumps(item))
54 |
--------------------------------------------------------------------------------
/pycreeper/downloader_middlewares/cookies_middlewares.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | from pycreeper.downloader_middlewares import DownloaderMiddleware
6 | import six
7 | from collections import defaultdict
8 | from pycreeper.utils import _get_cookies_from_cookiejar
9 | from pycreeper.http.response import Response
10 | from cookielib import CookieJar
11 |
12 |
13 | class CookiesMiddleware(DownloaderMiddleware):
14 | """This middleware enables working with sites that need cookies"""
15 |
16 | def __init__(self, settings, logger):
17 | self.jars = defaultdict(CookieJar)
18 | self.settings = settings
19 | self.logger = logger
20 |
21 | def process_request(self, request):
22 | if not request.meta or request.meta.get("cookiejar", None) is None:
23 | return
24 | cookiejarkey = request.meta.get("cookiejar")
25 | jar = self.jars[cookiejarkey]
26 | # set CookieJar
27 | request.cookiejar = jar
28 |
29 | def process_response(self, request, response):
30 | if not request.meta or request.meta.get("cookiejar", None) is None:
31 | return response
32 | # extract cookies from response.cookiejar
33 | cookiejarkey = request.meta.get("cookiejar")
34 | jar = self.jars[cookiejarkey]
35 | cookies = _get_cookies_from_cookiejar(response.cookiejar)
36 | for cookie in cookies:
37 | jar.set_cookie(cookie)
38 | return response
--------------------------------------------------------------------------------
/pycreeper/scheduler.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | """ Scheduler """
6 |
7 | from gevent.queue import Queue
8 | from pybloom import ScalableBloomFilter
9 | import gevent
10 | from pycreeper.utils.hash import request_fingerprint
11 |
12 |
13 | class Scheduler(object):
14 | """ Scheduler """
15 |
16 | def __init__(self, spider):
17 | self.request_filter = RequestFilter()
18 | self.queue = Queue()
19 | self.settings = spider.settings
20 | self.timeout = self.settings.get('TIMEOUT', 5)
21 | self.download_delay = self.settings.get('DOWNLOAD_DELAY', 0)
22 | self.logger = spider.logger
23 |
24 | def enqueue_request(self, request):
25 | """put request
26 | """
27 | if self.request_filter.request_seen(request):
28 | self.logger.debug("ignore %s", request.url)
29 | return
30 | self.queue.put(request)
31 |
32 | def next_request(self):
33 | """next request
34 | """
35 | gevent.sleep(self.download_delay)
36 | return self.queue.get(timeout=self.timeout * 3)
37 |
38 | def __len__(self):
39 | return self.queue.qsize()
40 |
41 |
42 | class RequestFilter(object):
43 | """ RequestFilter """
44 |
45 | def __init__(self):
46 | self.sbf = ScalableBloomFilter(
47 | mode=ScalableBloomFilter.SMALL_SET_GROWTH)
48 |
49 | def request_seen(self, request):
50 | """request seen
51 | """
52 | finger = request_fingerprint(request)
53 | if finger in self.sbf:
54 | return True
55 | self.sbf.add(finger)
56 | return False
57 |
--------------------------------------------------------------------------------
/pycreeper/utils/datatypes.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 |
6 | class CaselessDict(dict):
7 | __slots__ = ()
8 |
9 | def __init__(self, seq=None):
10 | super(CaselessDict, self).__init__()
11 | if seq:
12 | self.update(seq)
13 |
14 | def __getitem__(self, key):
15 | return dict.__getitem__(self, self.normkey(key))
16 |
17 | def __setitem__(self, key, value):
18 | dict.__setitem__(self, self.normkey(key), self.normvalue(value))
19 |
20 | def __delitem__(self, key):
21 | dict.__delitem__(self, self.normkey(key))
22 |
23 | def __contains__(self, key):
24 | return dict.__contains__(self, self.normkey(key))
25 |
26 | has_key = __contains__
27 |
28 | def __copy__(self):
29 | return self.__class__(self)
30 |
31 | copy = __copy__
32 |
33 | def normkey(self, key):
34 | """Method to normalize dictionary key access"""
35 | return key.lower()
36 |
37 | def normvalue(self, value):
38 |         """Method to normalize values prior to being set"""
39 | return value
40 |
41 | def get(self, key, def_val=None):
42 | return dict.get(self, self.normkey(key), self.normvalue(def_val))
43 |
44 | def setdefault(self, key, def_val=None):
45 | return dict.setdefault(self, self.normkey(key), self.normvalue(def_val))
46 |
47 | def update(self, seq):
48 | seq = seq.items()
49 | iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
50 | super(CaselessDict, self).update(iseq)
51 |
52 | @classmethod
53 | def fromkeys(cls, keys, value=None):
54 | return cls((k, value) for k in keys)
55 |
56 | def pop(self, key, *args):
57 | return dict.pop(self, self.normkey(key), *args)
58 |
--------------------------------------------------------------------------------
/doc/settings.rst:
--------------------------------------------------------------------------------
1 | settings: project settings
2 | ===========================
3 |
4 | This document describes the project settings and their default values.
5 |
6 | How do I override the default settings?
7 | -----------------------------------------
8 |
9 | Set the **custom_settings** attribute on your spider to override PyCreeper's defaults.
10 |
11 | Example::
12 |
13 |     custom_settings = {
14 |         'DOWNLOADER_MIDDLEWARES': {
15 |             'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100,
16 |             'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200,
17 |             'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300,
18 |             'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400
19 |         },
20 |         'DRIVER': 'Chrome',
21 |         'DOWNLOAD_DELAY': 2,
22 |         'USER_AGENT_LIST': [
23 |             '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36
24 |             (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''',
25 |         ]
26 |     }
27 |
28 | Available settings and their defaults
29 | ---------------------------------------
30 |
31 | **LOG_LEVEL**
32 |
33 | String, default DEBUG. Controls the level of PyCreeper's log output.
34 |
35 | **RETRY_COUNT**
36 |
37 | Number, default 3. Maximum number of attempts for a failed request (static requests only).
38 |
39 | **RETRY_STATUS_CODES**
40 |
41 | List, default [500, 502, 503, 504, 400, 403, 408]. Requests whose status code is in this list are retried (static requests only).
42 |
43 | **TIMEOUT**
44 |
45 | Number, default 5. Timeout for outgoing requests, in seconds.
46 |
47 | **MAX_REQUEST_SIZE**
48 |
49 | Int, default 20. Number of static requests that may run concurrently (static requests only).
50 |
51 | **USER_AGENT_LIST**
52 |
53 | List, default empty. User-Agent strings that may be attached to outgoing requests (requires UserAgentMiddleware; static requests only).
54 |
55 | **DOWNLOADER_MIDDLEWARES**
56 |
57 | Dict, default empty. The downloader middlewares to use. Keys are the references of the middlewares you want enabled,
58 | values are their priorities; middlewares with a higher priority are applied earlier.
59 |
60 | **DYNAMIC_CRAWL**
61 |
62 | Bool, default True. Whether the engine loads a WebDriver. If it is set to False and dynamic requests are issued, a series of exceptions will be raised.
63 |
64 | **DRIVER**
65 |
66 | String, default Firefox. The Driver type PyCreeper uses. Any Driver supported by Selenium may be chosen, provided its environment is set up.
67 |
68 | **DRIVER_INIT_KWARGS**
69 |
70 | Dict, default empty. Arguments passed when the Driver is started; use it to customise the Driver's behaviour.
71 |
72 | **DOWNLOAD_DELAY**
73 |
74 | Number, default 0. Download delay, in seconds.
75 |
76 | **PROXY_INTERVAL**
77 |
78 | Number, default 3. Maximum time each proxy is used for. Proxies require ProxyMiddleware,
79 | and they only apply to static requests. To use a proxy for dynamic requests, set DRIVER_INIT_KWARGS and pass the configuration to the Driver at start-up.
80 |
81 | **PROXY_LIST**
82 |
83 | List, default empty. Proxies that requests may use, in the form 'IP:port'.
84 |
85 | **STATIC_REQUEST_SSL_VERIFY**
86 |
87 | Bool, default True. Whether SSL verification is performed when a static request is made.
88 | Useful when HTTPS verification fails while going through a proxy.
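89 |
90 | Inside a running spider the merged settings are available as self.settings. A minimal sketch of reading
91 | them (the MySpider class is illustrative; the accessors are those of pycreeper.conf.settings.Settings)::
92 |
93 |     from pycreeper.spider import Spider
94 |
95 |     class MySpider(Spider):
96 |         custom_settings = {'DOWNLOAD_DELAY': 2}
97 |
98 |         def initialize(self):
99 |             delay = self.settings.get_float('DOWNLOAD_DELAY')          # 2.0
100 |             retries = self.settings.get_int('RETRY_COUNT', 3)
101 |             middlewares = self.settings.get_dict('DOWNLOADER_MIDDLEWARES')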
--------------------------------------------------------------------------------
/examples/zhihu_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import json
6 |
7 | from pycreeper.spider import Spider
8 | from pycreeper.http.request import Request
9 | import gevent
10 | from lxml import etree
11 |
12 | class Zhihu_Spider(Spider):
13 |
14 | custom_settings = {
15 | 'DOWNLOADER_MIDDLEWARES': {
16 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100,
17 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200,
18 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300
19 | },
20 | 'DRIVER': 'Chrome',
21 | 'DOWNLOAD_DELAY': 2,
22 | 'STATIC_REQUEST_SSL_VERIFY': False,
23 | 'USER_AGENT_LIST': [
24 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36
25 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''',
26 | ]
27 | }
28 |
29 | def start_requests(self):
30 |
31 | def _login(driver):
32 | driver.find_element_by_name('account').send_keys("username")
33 | driver.find_element_by_name('password').send_keys("password")
34 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click()
35 | gevent.sleep(5)
36 |
37 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"},
38 | callback=self.after_login, dynamic=True, browser_actions=[_login])
39 |
40 | def after_login(self, response):
41 | html = response.body
42 | selector = etree.HTML(html)
43 | links = selector.xpath('//a[@class="question_link"]')
44 | for link in links:
45 | yield Request('https://www.zhihu.com' + link.attrib["href"],
46 | meta={"cookiejar": "zhihu"}, callback=self.get_item)
47 |
48 | def get_item(self, response):
49 | html = response.body
50 | selector = etree.HTML(html)
51 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text
52 | body = selector.xpath('//span[@class="RichText"]')[0].text
53 | yield {
54 | 'head': head,
55 | 'body': body
56 | }
57 |
58 | def process_item(self, item):
59 | print json.dumps(item, ensure_ascii=False).encode('GBK', 'ignore')
60 |
61 | if __name__ == "__main__":
62 | spider = Zhihu_Spider()
63 | spider.start()
64 |
--------------------------------------------------------------------------------
/tests/http/test_http_request.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import unittest
6 |
7 | from w3lib.url import safe_url_string
8 |
9 | from pycreeper.http.request import Request
10 |
11 |
12 | class RequestTest(unittest.TestCase):
13 | def test_init(self):
14 | self.assertRaises(Exception, Request)
15 | self.assertRaises(ValueError, Request, 'foo')
16 | request = Request('http://www.example.com/')
17 | assert request.url
18 | assert not request.body
19 | request = Request('http://www.example.com/',
20 | headers={'Content-Type': 'text/html',
21 | 'Content-Length': 1234
22 | },
23 | method='get'
24 | )
25 | self.assertEqual(request.method, 'GET')
26 |
27 | def test_copy(self):
28 | request1 = Request('http://www.example.com/',
29 | headers={'Content-Type': 'text/html',
30 | 'Content-Length': 1234
31 | },
32 | method='get'
33 | )
34 | request2 = request1.copy()
35 | assert request1.__dict__ == request2.__dict__
36 | self.assertEqual(request1.headers, request2.headers)
37 | self.assertEqual(request1, request2)
38 | self.assertIsNot(request1, request2)
39 |
40 | def test_url(self):
41 | request = Request('http://www.example.com/')
42 | self.assertIsInstance(request.url, str)
43 | self.assertEqual(request.url, 'http://www.example.com/')
44 | request = Request(u'http://www.example.com?content=测试')
45 | self.assertEqual(request.url,
46 | safe_url_string('http://www.example.com?content=测试'))
47 | self.assertRaises(TypeError, Request, 123)
48 |
49 | def test_body(self):
50 | r1 = Request(url="http://www.example.com/")
51 | assert r1.body == b''
52 |
53 | r2 = Request(url="http://www.example.com/", body=b"")
54 | assert isinstance(r2.body, bytes)
55 | self.assertEqual(r2.encoding, 'utf-8') # default encoding
56 |
57 | r3 = Request(url="http://www.example.com/", body=u"Price: \xa3100", encoding='utf-8')
58 | assert isinstance(r3.body, bytes)
59 | self.assertEqual(r3.body, b"Price: \xc2\xa3100")
60 |
61 | r4 = Request(url="http://www.example.com/", body=u"Price: \xa3100", encoding='latin1')
62 | assert isinstance(r4.body, bytes)
63 | self.assertEqual(r4.body, b"Price: \xa3100")
64 |
--------------------------------------------------------------------------------
/pycreeper/conf/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | """ Settings """
6 |
7 | import json
8 | from importlib import import_module
9 |
10 | from pycreeper.conf import default_settings
11 |
12 |
13 | class Settings(object):
14 | """ Settings Object """
15 |
16 | def __init__(self, values=None):
17 | self.attrs = {}
18 | self.load_config(default_settings)
19 | if values:
20 | self.load_config(values)
21 |
22 | def __getitem__(self, key):
23 | """__getitem__
24 |
25 | @key, str, key
26 | """
27 | return self.attrs[key] if key in self.attrs else None
28 |
29 | def load_config(self, module):
30 | """load config
31 |
32 | @module, module
33 | """
34 | if isinstance(module, basestring):
35 | module = import_module(module)
36 | for key in module if isinstance(module, dict) else dir(module):
37 | if key.isupper():
38 | self.set(key, module.get(key) \
39 | if isinstance(module, dict) else getattr(module, key))
40 |
41 | def set(self, key, value):
42 | """set
43 |
44 | @key, str, key
45 | @value, str/int/float value
46 | """
47 | self.attrs[key] = value
48 |
49 | def set_dict(self, values):
50 | """set dict
51 |
52 | @values, dict, values
53 | """
54 | for key, value in values.iteritems():
55 | self.set(key, value)
56 |
57 | def get(self, key, default=None):
58 | """get
59 |
60 | @key, str, key
61 | @default, default
62 | """
63 |         return default if self[key] is None else self[key]
64 |
65 | def get_int(self, key, default=0):
66 | """get int
67 |
68 | @key, str, key
69 | @default, int
70 | """
71 | return int(self.get(key, default))
72 |
73 | def get_float(self, key, default=0.0):
74 | """get float
75 |
76 | @key, str, key
77 | @default, float
78 | """
79 | return float(self.get(key, default))
80 |
81 | def get_list(self, key, default=None):
82 | """get list
83 |
84 | @key, str, key
85 | @default, list
86 | """
87 | value = self.get(key, default or None)
88 | if isinstance(value, basestring):
89 | value = value.split(",")
90 | return value
91 |
92 | def get_dict(self, key, default=None):
93 | """get dict
94 |
95 | @key, str, key
96 | @default, dict
97 | """
98 | value = self.get(key, default or None)
99 | if isinstance(value, basestring):
100 | value = json.loads(value)
101 | return value
102 |
--------------------------------------------------------------------------------
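
A minimal usage sketch of the Settings class above (illustrative only, not part of the repository; the key names come from pycreeper.conf.default_settings):

    # -*- coding:utf-8 -*-
    from pycreeper.conf.settings import Settings

    settings = Settings()                          # starts from default_settings
    settings.load_config({'DOWNLOAD_DELAY': 2})    # dicts, modules or dotted paths all work
    settings.set('PROXY_LIST', '1.2.3.4:80,5.6.7.8:80')

    print settings.get_int('RETRY_COUNT')          # 3, taken from the defaults
    print settings.get_float('DOWNLOAD_DELAY')     # 2.0
    print settings.get_list('PROXY_LIST')          # comma-separated strings are split
    print settings['NO_SUCH_KEY']                  # None -- __getitem__ never raises
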
/pycreeper/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import six
6 |
7 | def result2list(result):
8 | """result to list
9 | """
10 | if result is None:
11 | return []
12 | if isinstance(result, (dict, basestring)):
13 | return [result]
14 |     if hasattr(result, "__iter__"):
15 |         return result
16 |     return [result]
17 |
18 | def call_func(func, errback=None, callback=None, *args, **kwargs):
19 |     """Call func(*args, **kwargs); route exceptions to errback and the
20 |     return value to callback (both optional).
21 |
22 |     :param func: callable to invoke
23 |     :param errback: called with the exception if func raises
24 |     :param callback: called with the result on success
25 |     """
26 |     try:
27 |         result = func(*args, **kwargs)
28 |     except Exception as exc:
29 |         if errback:
30 |             return errback(exc)
31 |         raise  # no errback supplied: let the exception propagate
32 |     else:
33 |         if callback:
34 |             result = callback(result)
35 |         return result
36 |
37 |
38 | def sorted_priority_dict(d):
39 | """Sort the priority dict to a ordered list.
40 |
41 | :param d: A priority dict.
42 | :return: Ordered list.
43 | """
44 | modules = sorted(d.items(), key=lambda x: x[1])
45 | modules = [x[0] for x in modules]
46 | return modules
47 |
48 |
49 | def to_unicode(text, encoding=None, errors='strict'):
50 | """Return the unicode representation of a bytes object `text`. If `text`
51 | is already an unicode object, return it as-is."""
52 | if isinstance(text, six.text_type):
53 | return text
54 | if not isinstance(text, (bytes, six.text_type)):
55 | raise TypeError('to_unicode must receive a bytes, str or unicode '
56 | 'object, got %s' % type(text).__name__)
57 | if encoding is None:
58 | encoding = 'utf-8'
59 | return text.decode(encoding, errors)
60 |
61 |
62 | def to_bytes(text, encoding=None, errors='strict'):
63 | """Return the binary representation of `text`. If `text`
64 | is already a bytes object, return it as-is."""
65 | if isinstance(text, bytes):
66 | return text
67 | if not isinstance(text, six.string_types):
68 | raise TypeError('to_bytes must receive a unicode, str or bytes '
69 | 'object, got %s' % type(text).__name__)
70 | if encoding is None:
71 | encoding = 'utf-8'
72 | return text.encode(encoding, errors)
73 |
74 |
75 | def to_native_str(text, encoding=None, errors='strict'):
76 | """ Return str representation of `text`
77 | (bytes in Python 2.x and unicode in Python 3.x). """
78 | if six.PY2:
79 | return to_bytes(text, encoding, errors)
80 | else:
81 | return to_unicode(text, encoding, errors)
82 |
83 |
84 | def _get_cookies_from_cookiejar(cj):
85 | result = []
86 | for domain in cj._cookies.keys():
87 | for path in cj._cookies[domain].keys():
88 | for cookie in cj._cookies[domain][path].values():
89 | result.append(cookie)
90 | return result
91 |
--------------------------------------------------------------------------------
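
A short, illustrative sketch of the helpers above (not part of the repository; the 'mw.A'/'mw.B' keys are placeholders):

    # -*- coding:utf-8 -*-
    from pycreeper.utils import (result2list, sorted_priority_dict,
                                 to_bytes, to_unicode, call_func)

    assert result2list(None) == []
    assert result2list({'k': 'v'}) == [{'k': 'v'}]

    # Lower priority values sort first, as used for DOWNLOADER_MIDDLEWARES.
    assert sorted_priority_dict({'mw.B': 200, 'mw.A': 100}) == ['mw.A', 'mw.B']

    assert isinstance(to_bytes(u'price'), bytes)
    assert isinstance(to_unicode('price'), unicode)

    # call_func(int, ...) parses '41', then the callback adds one.
    assert call_func(int, None, lambda n: n + 1, '41') == 42
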
/pycreeper/conf/default_settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | """ default config settings """
6 |
7 | LOG_LEVEL = 'DEBUG'
8 |
9 | RETRY_COUNT = 3
10 |
11 | RETRY_STATUS_CODES = [500, 502, 503, 504, 400, 403, 408]
12 |
13 | TIMEOUT = 5
14 |
15 | MAX_REQUEST_SIZE = 20
16 |
17 | USER_AGENT_LIST = [
18 | 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31',
19 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17',
20 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17',
21 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
22 | 'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
23 | 'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
24 | 'Mozilla/6.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1',
25 | 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
26 | 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:15.0) Gecko/20120910144328 Firefox/15.0.2',
27 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
28 | 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9a3pre) Gecko/20070330',
29 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.13; ) Gecko/20101203',
30 | 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
31 | 'Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50',
32 | 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52',
33 | 'Mozilla/5.0 (Windows; U; Win 9x 4.90; SG; rv:1.9.2.4) Gecko/20101104 Netscape/9.1.0285',
34 | 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.1.7pre) Gecko/20070815 Firefox/2.0.0.6 Navigator/9.0b3',
35 | 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
36 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
37 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)",
38 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)",
39 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
40 | "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)",
41 | "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
42 | "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 Safari/419.3",
43 | ]
44 |
45 | DOWNLOADER_MIDDLEWARES = {}
46 |
47 | DYNAMIC_CRAWL = True
48 |
49 | DRIVER = 'Firefox'
50 |
51 | DRIVER_INIT_KWARGS = {}
52 |
53 | DOWNLOAD_DELAY = 0
54 |
55 | PROXY_INTERVAL = 3
56 |
57 | PROXY_LIST = []
58 |
59 | STATIC_REQUEST_SSL_VERIFY = True
60 |
--------------------------------------------------------------------------------
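
Any of the defaults above can be overridden per spider through custom_settings, as the bundled examples do; a sketch (illustrative only, MySpider is a made-up name):

    # -*- coding:utf-8 -*-
    from pycreeper.spider import Spider

    class MySpider(Spider):
        custom_settings = {
            'DRIVER': 'Chrome',                  # use selenium.webdriver.Chrome
            'DOWNLOAD_DELAY': 1,                 # seconds between dequeued requests
            'RETRY_COUNT': 5,
            'STATIC_REQUEST_SSL_VERIFY': False,
        }
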
/tests/test_conf_settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import unittest
6 |
7 | from pycreeper.conf.settings import Settings
8 | from tests.test_data import test_settings_data
9 |
10 | CONF_PATH = 'tests.test_data.test_settings_data'
11 |
12 |
13 | class SettingsTest(unittest.TestCase):
14 | def test_basics(self):
15 | settings = Settings()
16 | self.assertEqual(settings['RETRY_COUNT'], 3)
17 | settings = Settings(test_settings_data)
18 | self.assertEqual(settings['TEST_INT'], 10)
19 |
20 | def test_get_item(self):
21 | settings = Settings(test_settings_data)
22 | self.assertEqual(settings['TEST_STR'], 'foo,bar,baz')
23 | self.assertEqual(settings['TEST_DICT'], {"foo": "bar"})
24 |
25 | def test_load_config(self):
26 | settings = Settings(test_settings_data)
27 | self.assertEqual(settings['TEST_STR'], 'foo,bar,baz')
28 | settings = Settings(CONF_PATH)
29 | self.assertEqual(settings['TEST_STR'], 'foo,bar,baz')
30 |         self.assertIsNone(settings['test_lowercase'])  # lowercase keys are never loaded
31 |
32 | def test_set(self):
33 | settings = Settings(test_settings_data)
34 |         self.assertIsNone(settings['TEST_SET'])
35 | settings.set('TEST_SET', True)
36 | self.assertEqual(settings['TEST_SET'], True)
37 |
38 | def test_set_dict(self):
39 | settings = Settings(test_settings_data)
40 |         self.assertIsNone(settings['TEST_SET_1'])
41 |         self.assertIsNone(settings['TEST_SET_2'])
42 | settings.set_dict(
43 | {
44 | 'TEST_SET_1': True,
45 | 'TEST_SET_2': False
46 | }
47 | )
48 | self.assertEqual(settings['TEST_SET_1'], True)
49 | self.assertEqual(settings['TEST_SET_2'], False)
50 |
51 | def test_get(self):
52 | settings = Settings(test_settings_data)
53 | self.assertEqual(settings.get('TEST_GET'), None)
54 | self.assertEqual(settings.get('TEST_GET', 'foo'), 'foo')
55 | settings.set('TEST_GET', 'bar')
56 | self.assertEqual(settings.get('TEST_GET', 'foo'), 'bar')
57 |
58 | def test_get_int_and_float(self):
59 | settings = Settings(test_settings_data)
60 | self.assertIsInstance(settings.get_float('TEST_INT'), float)
61 | self.assertIsInstance(settings.get_int('TEST_FLOAT'), int)
62 |
63 | def test_get_list(self):
64 | settings = Settings(test_settings_data)
65 | self.assertIsInstance(settings.get_list('TEST_LIST'), list)
66 | self.assertIsInstance(settings.get_list('TEST_STR'), list)
67 |
68 | def test_get_dict(self):
69 | settings = Settings(test_settings_data)
70 | self.assertIsInstance(settings.get_dict('TEST_DICT'), dict)
71 | self.assertIsInstance(settings.get_dict('TEST_JSON'), dict)
72 |
73 |
74 | if __name__ == "__main__":
75 | unittest.main()
76 |
--------------------------------------------------------------------------------
/pycreeper/downloader_middlewares/__init__.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | """Downloader Middleware"""
6 |
7 | from collections import defaultdict
8 |
9 | from importlib import import_module
10 |
11 | from pycreeper.utils import call_func, sorted_priority_dict
12 | from pycreeper.http.request import Request
13 |
14 |
15 | class DownloaderMiddleware(object):
16 |     """ DownloaderMiddleware interface """
17 |
18 | pass
19 |
20 |
21 | class DownloaderMiddlewareManager(object):
22 | """ DownloaderMiddlewareManager """
23 |
24 | def __init__(self, spider):
25 | self.settings = spider.settings
26 | self.logger = spider.logger
27 | self.methods = defaultdict(list)
28 | self.middlewares = self.load_middleware()
29 | for miw in self.middlewares:
30 | self._add_middleware(miw)
31 |
32 | def load_middleware(self):
33 | """load middleware
34 | """
35 | middlewares = []
36 | modules = sorted_priority_dict(
37 | self.settings.get('DOWNLOADER_MIDDLEWARES', {})
38 | )
39 | for module_name in modules:
40 | module = import_module('.'.join(module_name.split('.')[:-1]))
41 | middleware_class = getattr(module, module_name.split('.')[-1])
42 | middlewares.append(middleware_class(self.settings, self.logger))
43 | return middlewares
44 |
45 | def _add_middleware(self, miw):
46 | """add middleware
47 | """
48 | if hasattr(miw, "process_request"):
49 | self.methods["process_request"].append(miw.process_request)
50 | if hasattr(miw, "process_response"):
51 | self.methods["process_response"].insert(0, miw.process_response)
52 | if hasattr(miw, "process_exception"):
53 | self.methods["process_exception"].insert(0, miw.process_exception)
54 |
55 | def download(self, download_func, request):
56 | """download
57 | """
58 |
59 | def process_request(request):
60 | """ process request """
61 | for method in self.methods["process_request"]:
62 | method(request)
63 | return download_func(request)
64 |
65 | def process_response(response):
66 | """ process response """
67 | for method in self.methods["process_response"]:
68 | response = method(request, response)
69 | if isinstance(response, Request):
70 | return response
71 | return response
72 |
73 | def process_exception(exception):
74 | """ process exception """
75 | for method in self.methods["process_exception"]:
76 | response = method(request, exception)
77 | if response:
78 | return response
79 | return exception
80 |
81 | return call_func(process_request, process_exception,
82 | process_response, request)
83 |
--------------------------------------------------------------------------------
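
A sketch of a custom middleware compatible with the manager above (illustrative; the 'myproject.middlewares' path in the comment is hypothetical). process_request hooks run in ascending priority order, process_response/process_exception hooks in the reverse order:

    # -*- coding:utf-8 -*-
    from pycreeper.downloader_middlewares import DownloaderMiddleware

    class DefaultHeadersMiddleware(DownloaderMiddleware):
        """Attach a fixed header to every request."""

        def __init__(self, settings, logger):
            self.settings = settings
            self.logger = logger

        def process_request(self, request):
            request.headers.setdefault('Accept-Language', 'en-US,en;q=0.8')

        def process_response(self, request, response):
            self.logger.debug("got %s for %s", response.status, request.url)
            return response

    # Enabled through the priority dict, e.g. in a spider's custom_settings:
    # 'DOWNLOADER_MIDDLEWARES': {'myproject.middlewares.DefaultHeadersMiddleware': 50}
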
/tests/test_scheduler.py:
--------------------------------------------------------------------------------
1 | #-*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import unittest
6 | import time
7 | from pycreeper.scheduler import RequestFilter, Scheduler
8 | from pycreeper.http.request import Request
9 | from pycreeper.spider import Spider
10 | from Queue import Empty
11 |
12 | __doctests__ = ['pycreeper.scheduler']
13 |
14 | URLS = [
15 | 'http://www.example.com/index.html#print',
16 | 'http://www.example.com/index.html',
17 | 'http://www.xxx.com/index.html?id=77&nameid=2905210001&page=1',
18 | 'http://www.xxxxx.com/index.html?id=77&nameid=2905210001&page=1',
19 | 'http://www.xxxxx.com/index.html?test123123',
20 | 'http://www.xxxxx.com/index.html',
21 | 'ftp://www.xxxxx.com/index.html'
22 | ]
23 |
24 | REQUEST = [Request(url) for url in URLS]
25 |
26 |
27 | class RequestTest(unittest.TestCase):
28 |
29 | def test_basic(self):
30 | request_filter = RequestFilter()
31 | request_filter.request_seen(REQUEST[0])
32 | self.assertEqual(request_filter.request_seen(REQUEST[0]), True)
33 | self.assertEqual(request_filter.request_seen(REQUEST[1]), False)
34 | self.assertEqual(request_filter.request_seen(REQUEST[1]), True)
35 | self.assertRaises(AttributeError, request_filter.request_seen, None)
36 |
37 |
38 | class SchedulerTest(unittest.TestCase):
39 |
40 | def setUp(self):
41 | self.spider = Spider()
42 |
43 | def test_basic(self):
44 | self.assertRaises(AttributeError, Scheduler, None)
45 |
46 | def test_enqueue(self):
47 | scheduler = Scheduler(self.spider)
48 | self.assertRaises(AttributeError, scheduler.enqueue_request, None)
49 | self.assertEqual(len(scheduler.queue), 0)
50 | scheduler.enqueue_request(REQUEST[0])
51 | self.assertEqual(len(scheduler.queue), 1)
52 | scheduler.enqueue_request(REQUEST[0])
53 | self.assertEqual(len(scheduler.queue), 1)
54 | scheduler.enqueue_request(REQUEST[1])
55 | self.assertEqual(len(scheduler.queue), 2)
56 | scheduler.enqueue_request(REQUEST[0])
57 | self.assertEqual(len(scheduler.queue), 2)
58 |
59 | def test_next_request(self):
60 | scheduler = Scheduler(self.spider)
61 | self.assertRaises(Empty, scheduler.next_request)
62 | scheduler.enqueue_request(REQUEST[0])
63 | scheduler.enqueue_request(REQUEST[1])
64 | scheduler.enqueue_request(REQUEST[2])
65 | self.assertEqual(scheduler.next_request(), REQUEST[0])
66 | self.assertEqual(scheduler.next_request(), REQUEST[1])
67 | self.assertEqual(scheduler.next_request(), REQUEST[2])
68 | self.assertRaises(Empty, scheduler.next_request)
69 |
70 | def test_download_delay(self):
71 | self.spider.settings.set('DOWNLOAD_DELAY', 5)
72 | scheduler = Scheduler(self.spider)
73 | scheduler.enqueue_request(REQUEST[0])
74 | time1 = time.time()
75 | scheduler.next_request()
76 | self.assertGreater(time.time() - time1, 5)
77 |
78 |
79 | if __name__ == "__main__":
80 | unittest.main()
81 |
--------------------------------------------------------------------------------
/pycreeper/http/response.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | """ Response Object """
6 |
7 | import six
8 | from w3lib.url import safe_url_string
9 | from pycreeper.http.request import Request
10 | import copy
11 |
12 |
13 | class Response(object):
14 | """ Response """
15 |
16 | def __init__(self, url, request, headers=None, status=200,
17 | cookiejar=None, body='', encoding='utf-8'):
18 | self._encoding = encoding
19 | self.headers = copy.deepcopy(headers) if headers else {}
20 | self.url = url
21 | self.status = int(status)
22 | self.cookiejar = cookiejar
23 | self.body = body
24 | self.request = request
25 |
26 | @property
27 | def encoding(self):
28 | return self._encoding
29 |
30 | @property
31 | def url(self):
32 | return self._url
33 |
34 | @url.setter
35 | def url(self, url):
36 | if isinstance(url, str):
37 | self._url = safe_url_string(url)
38 | elif isinstance(url, six.text_type):
39 | if self.encoding is None:
40 | raise TypeError('Cannot convert unicode url - %s has no encoding' %
41 | type(self).__name__)
42 | self._url = safe_url_string(url.encode(self.encoding))
43 | else:
44 | raise TypeError('Response url must be str or unicode, got %s:' % type(url).__name__)
45 | if ':' not in self._url:
46 | raise ValueError('Missing scheme in request url: %s' % self._url)
47 |
48 | @property
49 | def body(self):
50 | return self._body
51 |
52 | @body.setter
53 | def body(self, body):
54 | if isinstance(body, str):
55 | self._body = body
56 | elif isinstance(body, six.text_type):
57 | if self.encoding is None:
58 | raise TypeError('Cannot convert unicode body - %s has no encoding' %
59 | type(self).__name__)
60 | self._body = body.encode(self.encoding)
61 | elif body is None:
62 | self._body = ''
63 | else:
64 |             raise TypeError("Response body must be either str or unicode. Got: '%s'" % type(body).__name__)
65 |
66 | @property
67 | def request(self):
68 | return self._request
69 |
70 | @request.setter
71 | def request(self, value):
72 | if isinstance(value, Request):
73 | self._request = value.copy()
74 | else:
75 | raise TypeError("Response request must be pycreeper.Request. Got: '%s'" % type(value).__name__)
76 |
77 | def copy(self, *args, **kwargs):
78 | """ copy """
79 | for key in ["url", "status", "cookiejar", "body", "request", "encoding", "headers"]:
80 | kwargs.setdefault(key, getattr(self, key))
81 |
82 | cls = kwargs.pop('cls', self.__class__)
83 | return cls(*args, **kwargs)
84 |
85 | def __str__(self):
86 | return "<%d %s>" % (self.status, self.url)
87 |
88 | __repr__ = __str__
89 |
90 | def __eq__(self, other):
91 | return self.__dict__ == other.__dict__
92 |
93 | def __ne__(self, other):
94 | return self.__dict__ != other.__dict__
95 |
--------------------------------------------------------------------------------
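
A construction sketch for the Response class above (illustrative only, not part of the repository):

    # -*- coding:utf-8 -*-
    from pycreeper.http.request import Request
    from pycreeper.http.response import Response

    req = Request('http://www.example.com/')
    resp = Response('http://www.example.com/', req,
                    headers={'Content-Type': 'text/html'},
                    status=200, body=u'<html>ok</html>', encoding='utf-8')

    assert resp.status == 200
    assert isinstance(resp.body, str)     # unicode bodies are encoded with `encoding`
    assert resp.request is not req        # the request is defensively copied
    print resp                            # <200 http://www.example.com/>
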
/doc/_build/html/genindex.html:
--------------------------------------------------------------------------------
[generated Sphinx HTML: "Index — PyCreeper 1.0.0 documentation" (markup stripped in this dump)]
--------------------------------------------------------------------------------
/pycreeper/http/request.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import six
6 | from w3lib.url import safe_url_string
7 | import copy
8 |
9 |
10 | class Request(object):
11 | """ Request """
12 |
13 | def __init__(self, url, callback=None, method='GET', headers=None,
14 | body=None, meta=None, encoding='utf-8', cookiejar=None,
15 | dynamic=False, browser_actions=None, wait=0):
16 | self._encoding = encoding
17 | self.headers = copy.deepcopy(headers) if headers else {}
18 | self.cookiejar = cookiejar
19 | self.url = url
20 | self.body = body
21 | self.method = str(method).upper()
22 | self.callback = callback
23 | self.meta = dict(meta) if meta else {}
24 | self.dynamic = bool(dynamic)
25 | if self.dynamic:
26 | if self.method == 'POST':
27 | raise AttributeError('Pycreeper can\'t make a dynamic POST request.')
28 | self.browser_actions = browser_actions if browser_actions else []
29 | self.wait = int(wait)
30 | else:
31 | self.browser_actions = []
32 | self.wait = 0
33 |
34 | @property
35 | def encoding(self):
36 | return self._encoding
37 |
38 | @property
39 | def url(self):
40 | return self._url
41 |
42 | @url.setter
43 | def url(self, url):
44 | if isinstance(url, str):
45 | self._url = safe_url_string(url)
46 | elif isinstance(url, six.text_type):
47 | if self._encoding is None:
48 | raise TypeError('Cannot convert unicode url - %s has no encoding' %
49 | type(self).__name__)
50 | self._url = safe_url_string(url.encode(self._encoding))
51 | else:
52 | raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
53 | if ':' not in self._url:
54 | raise ValueError('Missing scheme in request url: %s' % self._url)
55 |
56 | @property
57 | def body(self):
58 | return self._body
59 |
60 | @body.setter
61 | def body(self, body):
62 | if isinstance(body, str):
63 | self._body = body
64 | elif isinstance(body, six.text_type):
65 | if self._encoding is None:
66 | raise TypeError('Cannot convert unicode body - %s has no encoding' %
67 | type(self).__name__)
68 | self._body = body.encode(self._encoding)
69 | elif body is None:
70 | self._body = ''
71 | elif isinstance(body, dict):
72 | self._body = body
73 | else:
74 |             raise TypeError("Request body must be either str, unicode or dict. Got: '%s'" % type(body).__name__)
75 |
76 | def copy(self, *args, **kwargs):
77 | """ copy """
78 |         for key in ["encoding", "url", "method", "callback", "cookiejar",
79 |                     "body", "meta", "headers", "dynamic", "browser_actions", "wait"]:
80 | kwargs.setdefault(key, getattr(self, key))
81 | cls = kwargs.pop('cls', self.__class__)
82 | return cls(*args, **kwargs)
83 |
84 | def __str__(self):
85 | return "<%s %s>" % (self.method, self.url)
86 |
87 | __repr__ = __str__
88 |
89 | def __eq__(self, other):
90 | return self.__dict__ == other.__dict__
91 |
92 | def __ne__(self, other):
93 | return self.__dict__ != other.__dict__
94 |
--------------------------------------------------------------------------------
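
Two construction sketches for the Request class above, one static and one dynamic (illustrative only; _scroll is a made-up example of a browser_actions callable, which receives the selenium driver, as the spiders in examples/ show):

    # -*- coding:utf-8 -*-
    import gevent
    from pycreeper.http.request import Request

    # Static request: fetched directly by the download handler.
    req = Request('http://www.example.com/search?q=pycreeper',
                  method='get',
                  headers={'Accept': 'text/html'},
                  meta={'cookiejar': 'example'})
    assert req.method == 'GET'

    # Dynamic request: handled through the selenium driver; each action
    # is called with the driver before the page body is collected.
    def _scroll(driver):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        gevent.sleep(1)

    dyn = Request('http://www.example.com/', dynamic=True,
                  browser_actions=[_scroll], wait=2)
    assert dyn.browser_actions and dyn.wait == 2
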
/examples/jd_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import json
6 | import HTMLParser
7 | from pycreeper.spider import Spider
8 | from pycreeper.http.request import Request
9 | from selenium.webdriver.common.keys import Keys
10 | import gevent
11 | from lxml import etree
12 | from selenium.common.exceptions import NoSuchElementException
13 |
14 | parser = HTMLParser.HTMLParser()
15 |
16 | class Jd_Spider(Spider):
17 |
18 | custom_settings = {
19 | 'DOWNLOADER_MIDDLEWARES': {
20 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100,
21 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200,
22 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300,
23 | 'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400
24 | },
25 | 'DRIVER': 'Chrome',
26 | 'DOWNLOAD_DELAY': 2,
27 | 'USER_AGENT_LIST': [
28 |             'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
29 |             '(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36',
30 | ]
31 | }
32 |
33 | def start_requests(self):
34 | def _search(driver):
35 | driver.find_element_by_id('key').send_keys(u"联想笔记本", Keys.ENTER)
36 | gevent.sleep(3)
37 | self._jump_guide(driver)
38 | gevent.sleep(3)
39 |
40 | yield Request(url='https://www.jd.com/',
41 | meta={"cookiejar": "jd"},
42 | callback=self.parse_list,
43 | dynamic=True,
44 | browser_actions=[_search]
45 | )
46 |
47 | def _jump_guide(self, driver):
48 | try:
49 | driver.find_element_by_xpath('//*[@id="guide-price"]/div[2]/a').click()
50 | except NoSuchElementException as e:
51 | pass
52 |
53 | def parse_list(self, response):
54 | html = response.body
55 | selector = etree.HTML(html)
56 | links = selector.xpath('//div[@class="p-img"]/a')
57 | titles = selector.xpath('//div[@class="p-name p-name-type-2"]/a/em')
58 | imgs = selector.xpath('//div[@class="p-img"]/a/img')
59 | prices = selector.xpath('//div[@class="p-price"]/strong/i')
60 | for i in range(len(links)):
61 | try:
62 | yield {
63 | 'path': links[i].attrib["href"] if 'http' in links[i].attrib["href"]
64 | else 'http:' + links[i].attrib["href"],
65 | 'title': parser.unescape(etree.tostring(titles[i], pretty_print=True)),
66 | 'img': imgs[i].attrib["src"] if 'http' in imgs[i].attrib["src"]
67 | else 'http:' + imgs[i].attrib["src"],
68 | 'price': prices[i].text,
69 | }
70 | except Exception as e:
71 | pass
72 |
73 | url = response.url
74 |
75 | def _next_page(driver):
76 | self._jump_guide(driver)
77 | driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[9]').click()
78 | self._jump_guide(driver)
79 |
80 | yield Request(url=url,
81 | meta={"cookiejar": "jd"},
82 | callback=self.parse_list,
83 | dynamic=True,
84 | browser_actions=[_next_page]
85 | )
86 |
87 | def process_item(self, item):
88 | print json.dumps(item, ensure_ascii=False).encode('GBK', 'ignore')
89 |
90 | if __name__ == "__main__":
91 | spider = Jd_Spider()
92 | spider.start()
93 |
--------------------------------------------------------------------------------
/doc/_build/html/search.html:
--------------------------------------------------------------------------------
[generated Sphinx HTML: "Search — PyCreeper 1.0.0 documentation" (markup stripped in this dump)]
--------------------------------------------------------------------------------
/doc/_build/html/last.html:
--------------------------------------------------------------------------------
[generated Sphinx HTML: "写在最后 (Closing Remarks) — PyCreeper 1.0.0 documentation" (markup stripped in this dump)]
--------------------------------------------------------------------------------
/tests/utils/test_utils_datatypes.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import copy
6 | import unittest
7 |
8 | from pycreeper.utils.datatypes import CaselessDict
9 |
10 | __doctests__ = ['pycreeper.utils.datatypes']
11 |
12 |
13 | class CaselessDictTest(unittest.TestCase):
14 | def test_init(self):
15 | seq = {'red': 1, 'black': 3}
16 | d = CaselessDict(seq)
17 | self.assertEqual(d['red'], 1)
18 | self.assertEqual(d['black'], 3)
19 |
20 | seq = (('red', 1), ('black', 3))
21 | d = CaselessDict(seq)
22 | self.assertEqual(d['red'], 1)
23 | self.assertEqual(d['black'], 3)
24 |
25 | def test_caseless(self):
26 | d = CaselessDict()
27 | d['key_Lower'] = 1
28 | self.assertEqual(d['KEy_loWer'], 1)
29 | self.assertEqual(d.get('KEy_loWer'), 1)
30 |
31 | d['KEY_LOWER'] = 3
32 | self.assertEqual(d['key_Lower'], 3)
33 | self.assertEqual(d.get('key_Lower'), 3)
34 |
35 | def test_delete(self):
36 | d = CaselessDict({'key_lower': 1})
37 | del d['key_LOWER']
38 | self.assertRaises(KeyError, d.__getitem__, 'key_LOWER')
39 | self.assertRaises(KeyError, d.__getitem__, 'key_lower')
40 |
41 | def test_getdefault(self):
42 | d = CaselessDict()
43 | self.assertEqual(d.get('c', 5), 5)
44 | d['c'] = 10
45 | self.assertEqual(d.get('c', 5), 10)
46 |
47 | def test_setdefault(self):
48 | d = CaselessDict({'a': 1, 'b': 2})
49 |
50 | r = d.setdefault('A', 5)
51 | self.assertEqual(r, 1)
52 | self.assertEqual(d['A'], 1)
53 |
54 | r = d.setdefault('c', 5)
55 | self.assertEqual(r, 5)
56 | self.assertEqual(d['C'], 5)
57 |
58 | def test_fromkeys(self):
59 | keys = ('a', 'b')
60 |
61 | d = CaselessDict.fromkeys(keys)
62 | self.assertEqual(d['A'], None)
63 | self.assertEqual(d['B'], None)
64 |
65 | d = CaselessDict.fromkeys(keys, 1)
66 | self.assertEqual(d['A'], 1)
67 | self.assertEqual(d['B'], 1)
68 |
69 | instance = CaselessDict()
70 | d = instance.fromkeys(keys)
71 | self.assertEqual(d['A'], None)
72 | self.assertEqual(d['B'], None)
73 |
74 | d = instance.fromkeys(keys, 1)
75 | self.assertEqual(d['A'], 1)
76 | self.assertEqual(d['B'], 1)
77 |
78 | def test_contains(self):
79 | d = CaselessDict()
80 | d['a'] = 1
81 | assert 'a' in d
82 |
83 | def test_pop(self):
84 | d = CaselessDict()
85 | d['a'] = 1
86 | self.assertEqual(d.pop('A'), 1)
87 | self.assertRaises(KeyError, d.pop, 'A')
88 |
89 | def test_normkey(self):
90 | class MyDict(CaselessDict):
91 | def normkey(self, key):
92 | return key.title()
93 |
94 | d = MyDict()
95 | d['key-one'] = 2
96 | self.assertEqual(list(d.keys()), ['Key-One'])
97 |
98 | def test_normvalue(self):
99 | class MyDict(CaselessDict):
100 | def normvalue(self, value):
101 | if value is not None:
102 | return value + 1
103 |
104 | d = MyDict({'key': 1})
105 | self.assertEqual(d['key'], 2)
106 | self.assertEqual(d.get('key'), 2)
107 |
108 | d = MyDict()
109 | d['key'] = 1
110 | self.assertEqual(d['key'], 2)
111 | self.assertEqual(d.get('key'), 2)
112 |
113 | d = MyDict()
114 | d.setdefault('key', 1)
115 | self.assertEqual(d['key'], 2)
116 | self.assertEqual(d.get('key'), 2)
117 |
118 | d = MyDict()
119 | d.update({'key': 1})
120 | self.assertEqual(d['key'], 2)
121 | self.assertEqual(d.get('key'), 2)
122 |
123 | d = MyDict.fromkeys(('key',), 1)
124 | self.assertEqual(d['key'], 2)
125 | self.assertEqual(d.get('key'), 2)
126 |
127 | def test_copy(self):
128 | h1 = CaselessDict({'header1': 'value'})
129 | h2 = copy.copy(h1)
130 | self.assertEqual(h1, h2)
131 | self.assertEqual(h1.get('header1'), h2.get('header1'))
132 | assert isinstance(h2, CaselessDict)
133 |
134 |
135 | if __name__ == "__main__":
136 | unittest.main()
137 |
--------------------------------------------------------------------------------
/doc/_build/html/_static/pygments.css:
--------------------------------------------------------------------------------
1 | .highlight .hll { background-color: #ffffcc }
2 | .highlight { background: #eeffcc; }
3 | .highlight .c { color: #408090; font-style: italic } /* Comment */
4 | .highlight .err { border: 1px solid #FF0000 } /* Error */
5 | .highlight .k { color: #007020; font-weight: bold } /* Keyword */
6 | .highlight .o { color: #666666 } /* Operator */
7 | .highlight .ch { color: #408090; font-style: italic } /* Comment.Hashbang */
8 | .highlight .cm { color: #408090; font-style: italic } /* Comment.Multiline */
9 | .highlight .cp { color: #007020 } /* Comment.Preproc */
10 | .highlight .cpf { color: #408090; font-style: italic } /* Comment.PreprocFile */
11 | .highlight .c1 { color: #408090; font-style: italic } /* Comment.Single */
12 | .highlight .cs { color: #408090; background-color: #fff0f0 } /* Comment.Special */
13 | .highlight .gd { color: #A00000 } /* Generic.Deleted */
14 | .highlight .ge { font-style: italic } /* Generic.Emph */
15 | .highlight .gr { color: #FF0000 } /* Generic.Error */
16 | .highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
17 | .highlight .gi { color: #00A000 } /* Generic.Inserted */
18 | .highlight .go { color: #333333 } /* Generic.Output */
19 | .highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */
20 | .highlight .gs { font-weight: bold } /* Generic.Strong */
21 | .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
22 | .highlight .gt { color: #0044DD } /* Generic.Traceback */
23 | .highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */
24 | .highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */
25 | .highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */
26 | .highlight .kp { color: #007020 } /* Keyword.Pseudo */
27 | .highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */
28 | .highlight .kt { color: #902000 } /* Keyword.Type */
29 | .highlight .m { color: #208050 } /* Literal.Number */
30 | .highlight .s { color: #4070a0 } /* Literal.String */
31 | .highlight .na { color: #4070a0 } /* Name.Attribute */
32 | .highlight .nb { color: #007020 } /* Name.Builtin */
33 | .highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */
34 | .highlight .no { color: #60add5 } /* Name.Constant */
35 | .highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */
36 | .highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */
37 | .highlight .ne { color: #007020 } /* Name.Exception */
38 | .highlight .nf { color: #06287e } /* Name.Function */
39 | .highlight .nl { color: #002070; font-weight: bold } /* Name.Label */
40 | .highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */
41 | .highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */
42 | .highlight .nv { color: #bb60d5 } /* Name.Variable */
43 | .highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */
44 | .highlight .w { color: #bbbbbb } /* Text.Whitespace */
45 | .highlight .mb { color: #208050 } /* Literal.Number.Bin */
46 | .highlight .mf { color: #208050 } /* Literal.Number.Float */
47 | .highlight .mh { color: #208050 } /* Literal.Number.Hex */
48 | .highlight .mi { color: #208050 } /* Literal.Number.Integer */
49 | .highlight .mo { color: #208050 } /* Literal.Number.Oct */
50 | .highlight .sa { color: #4070a0 } /* Literal.String.Affix */
51 | .highlight .sb { color: #4070a0 } /* Literal.String.Backtick */
52 | .highlight .sc { color: #4070a0 } /* Literal.String.Char */
53 | .highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */
54 | .highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */
55 | .highlight .s2 { color: #4070a0 } /* Literal.String.Double */
56 | .highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */
57 | .highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */
58 | .highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */
59 | .highlight .sx { color: #c65d09 } /* Literal.String.Other */
60 | .highlight .sr { color: #235388 } /* Literal.String.Regex */
61 | .highlight .s1 { color: #4070a0 } /* Literal.String.Single */
62 | .highlight .ss { color: #517918 } /* Literal.String.Symbol */
63 | .highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */
64 | .highlight .fm { color: #06287e } /* Name.Function.Magic */
65 | .highlight .vc { color: #bb60d5 } /* Name.Variable.Class */
66 | .highlight .vg { color: #bb60d5 } /* Name.Variable.Global */
67 | .highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */
68 | .highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */
69 | .highlight .il { color: #208050 } /* Literal.Number.Integer.Long */
--------------------------------------------------------------------------------
/doc/_build/html/spider.html:
--------------------------------------------------------------------------------
[generated Sphinx HTML: "spider:爬虫 — PyCreeper 1.0.0 documentation" (markup stripped in this dump)]
--------------------------------------------------------------------------------
/doc/_build/html/downloader_middlewares.html:
--------------------------------------------------------------------------------
[generated Sphinx HTML: "downloader_middlewares:下载器中间件 — PyCreeper 1.0.0 documentation" (markup stripped in this dump)]
--------------------------------------------------------------------------------
/doc/_build/html/downloader.html:
--------------------------------------------------------------------------------
[generated Sphinx HTML: "downloader:下载器 — PyCreeper 1.0.0 documentation" (markup stripped in this dump)]
111 |
--------------------------------------------------------------------------------
/pycreeper/downloader_middlewares/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import random
6 | import time
7 | from urlparse import urlparse
8 | from logging import Logger
9 | import chardet
10 | import gevent
11 | from pycreeper.downloader_middlewares import DownloaderMiddleware
12 | from pycreeper.utils.exceptions import TimeoutException
13 | from collections import deque
14 |
15 |
16 | class RetryMiddleware(DownloaderMiddleware):
17 | """ Retry Middleware """
18 |
19 | RETRY_EXCEPTIONS = TimeoutException
20 |
21 | def __init__(self, settings, logger):
22 | self.max_retry_count = settings.get_int("RETRY_COUNT")
23 | self.retry_status_codes = settings.get_list("RETRY_STATUS_CODES")
24 | if not isinstance(logger, Logger):
25 | raise AttributeError('logger must be instance of logging.Logger')
26 | self.logger = logger
27 |
28 | def process_response(self, request, response):
29 | """process response
30 | """
31 | if request.meta.get("dont_retry", False):
32 | return response
33 | if response.status in self.retry_status_codes:
34 | return self._retry(request) or response
35 | return response
36 |
37 | def process_exception(self, request, exception):
38 | """process exception
39 | """
40 | if isinstance(exception, self.RETRY_EXCEPTIONS) \
41 |                 and not request.meta.get("dont_retry", False):
42 | return self._retry(request)
43 |
44 | def _retry(self, request):
45 | """retry
46 | """
47 | retry_count = request.meta.get("retry_count", 0) + 1
48 | if retry_count <= self.max_retry_count:
49 | retry_request = request.copy()
50 | retry_request.meta["retry_count"] = retry_count
51 | return retry_request
52 |
53 |
54 | class UserAgentMiddleware(DownloaderMiddleware):
55 | """ UserAgent Middleware """
56 |
57 | def __init__(self, settings, logger):
58 | self.user_agent_list = settings.get_list("USER_AGENT_LIST")
59 | if not isinstance(logger, Logger):
60 | raise AttributeError('logger must be instance of logging.Logger')
61 | self.logger = logger
62 |
63 | def process_request(self, request):
64 | """process request
65 |
66 | static requests only.
67 | """
68 | if not request.dynamic:
69 | request.headers["User-Agent"] = random.choice(self.user_agent_list)
70 |
71 |
72 | class ProxyMiddleware(DownloaderMiddleware):
73 | """ Proxy Middleware """
74 |
75 | def __init__(self, settings, logger):
76 | self.host_time_queue = deque()
77 | self.proxy_interval = settings["PROXY_INTERVAL"]
78 | self.proxy_list = settings["PROXY_LIST"]
79 | for proxy in self.proxy_list:
80 | self.host_time_queue.append((proxy, 0))
81 | if not isinstance(logger, Logger):
82 | raise AttributeError('logger must be instance of logging.Logger')
83 | self.logger = logger
84 |
85 | def process_request(self, request):
86 | """process request
87 |
88 | static requests only.
89 | """
90 | if not request.dynamic:
91 | request.meta["proxy"] = {
92 | "http": self._get_proxy(),
93 | }
94 |
95 | def _get_proxy(self):
96 | """get proxy
97 | """
98 | proxy, latest = self.host_time_queue.popleft()
99 | interval = time.time() - latest
100 | if interval < self.proxy_interval:
101 |             self.logger.info("Proxy %s waiting ...", proxy)
102 | gevent.sleep(self.proxy_interval - interval)
103 | self.host_time_queue.append((proxy, time.time()))
104 | return "http://%s" % proxy
105 |
106 |
107 | class EncodingDiscriminateMiddleware(DownloaderMiddleware):
108 | """ Encoding Discriminate Middleware """
109 |
110 | ENCODING_MAP = {}
111 |
112 | def __init__(self, settings, logger):
113 | self.settings = settings
114 | if not isinstance(logger, Logger):
115 | raise AttributeError('logger must be instance of logging.Logger')
116 | self.logger = logger
117 |
118 | def process_response(self, request, response):
119 |         """process response
120 |         :param request:
121 |         :param response:
122 |         """
123 |         netloc = urlparse(request.url).netloc
124 |         content = response.body
125 |         if self.ENCODING_MAP.get(netloc) is None:
126 |             encoding = chardet.detect(content)["encoding"] or "utf-8"
127 |             encoding = "GB18030" \
128 |                 if encoding.upper() in ("GBK", "GB2312") else encoding
129 | self.ENCODING_MAP[netloc] = encoding
130 | body = content.decode(self.ENCODING_MAP[netloc], "replace")
131 | return response.copy(body=body)
132 |
--------------------------------------------------------------------------------
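
A settings fragment that enables the middlewares above, mirroring the bundled examples (illustrative; the proxy address is a placeholder, and CookiesMiddleware lives in the sibling cookies_middlewares module of this package):

    # -*- coding:utf-8 -*-
    DOWNLOADER_MIDDLEWARES = {
        'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100,
        'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200,
        'pycreeper.downloader_middlewares.middlewares.ProxyMiddleware': 250,
        'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300,
        'pycreeper.downloader_middlewares.middlewares.EncodingDiscriminateMiddleware': 400,
    }

    PROXY_LIST = ['127.0.0.1:8080']   # consumed by ProxyMiddleware
    PROXY_INTERVAL = 3                # minimum seconds between reuses of one proxy
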
/tests/http/test_http_response.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import unittest
6 |
7 | from w3lib.url import safe_url_string
8 |
9 | from pycreeper.http.request import Request
10 | from pycreeper.http.response import Response
11 |
12 |
13 | class ResponseTest(unittest.TestCase):
14 | def test_init(self):
15 | self.assertRaises(Exception, Response)
16 | self.assertRaises(Exception, Response, url='http://www.example.com/')
17 | self.assertRaises(Exception, Response, request=Request('http://www.example.com/'))
18 | self.assertRaises(ValueError,
19 | Response,
20 | url='foo',
21 | request=Request('http://www.example.com/')
22 | )
23 | self.assertRaises(ValueError,
24 | Response,
25 | 'http://www.example.com/',
26 | status='foo',
27 | request=Request('http://www.example.com/')
28 | )
29 | self.assertRaises(TypeError,
30 | Response,
31 | 'http://www.example.com/',
32 | request='foo'
33 | )
34 | response = Response('http://www.example.com/',
35 | Request('http://www.example.com/')
36 | )
37 | assert response.url
38 | assert not response.body
39 | response = Response('http://www.example.com/',
40 | Request('http://www.example.com/'),
41 | headers={'Content-Type': 'text/html',
42 | 'Content-Length': 1234
43 | }
44 | )
45 |
46 | def test_copy(self):
47 | response1 = Response('http://www.example.com/',
48 | headers={'Content-Type': 'text/html',
49 | 'Content-Length': 1234
50 | },
51 | request=Request('http://www.example.com/')
52 | )
53 | response2 = response1.copy()
54 | assert response1.__dict__ == response2.__dict__
55 | self.assertEqual(response1.headers, response2.headers)
56 | self.assertEqual(response1.request, response2.request)
57 | self.assertEqual(response1, response2)
58 |
59 | self.assertIsNot(response1.headers, response2.headers)
60 | self.assertIsNot(response1.request, response2.request)
61 | self.assertIsNot(response1, response2)
62 |
63 | def test_url(self):
64 | response = Response('http://www.example.com/',
65 | request=Request('http://www.example.com/')
66 | )
67 | self.assertIsInstance(response.url, str)
68 | self.assertEqual(response.url, 'http://www.example.com/')
69 | response = Response(u'http://www.example.com?content=测试',
70 | request=Request('http://www.example.com/')
71 | )
72 | self.assertEqual(response.url,
73 | safe_url_string('http://www.example.com?content=测试'))
74 | self.assertRaises(TypeError, Response, 123)
75 |
76 | def test_body(self):
77 | r1 = Response(url="http://www.example.com/",
78 | request=Request('http://www.example.com/')
79 | )
80 | assert r1.body == b''
81 |
82 | r2 = Response(url="http://www.example.com/",
83 | body=b"",
84 | request=Request('http://www.example.com/'))
85 | assert isinstance(r2.body, bytes)
86 | self.assertEqual(r2.encoding, 'utf-8') # default encoding
87 |
88 | r3 = Response(url="http://www.example.com/",
89 | body=u"Price: \xa3100",
90 | encoding='utf-8',
91 | request=Request('http://www.example.com/'))
92 | assert isinstance(r3.body, bytes)
93 | self.assertEqual(r3.body, b"Price: \xc2\xa3100")
94 |
95 | r4 = Response(url="http://www.example.com/",
96 | request=Request('http://www.example.com/'),
97 | body=u"Price: \xa3100",
98 | encoding='latin1'
99 | )
100 | assert isinstance(r4.body, bytes)
101 | self.assertEqual(r4.body, b"Price: \xa3100")
102 |
103 | def test_request(self):
104 | response = Response('http://www.example.com/',
105 | request=Request('http://www.example.com/')
106 | )
107 | self.assertIsInstance(response.request, Request)
108 | self.assertEqual(response.request, Request('http://www.example.com/'))
109 |
--------------------------------------------------------------------------------
/pycreeper/engine.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | """ Engine """
6 |
7 | from gevent import monkey
8 | monkey.patch_all()
9 |
10 | import logging
11 | from gevent.lock import BoundedSemaphore
12 | from gevent.pool import Pool
13 | from importlib import import_module
14 | from pycreeper.scheduler import Scheduler
15 | from pycreeper.downloader import Downloader
16 | from pycreeper.utils.gevent_wrapper import spawn, join_all
17 | from pycreeper.utils import result2list
18 | from pycreeper.http.request import Request
19 | from Queue import Empty
20 |
21 | DRIVER_MODULE = 'selenium.webdriver'
22 |
23 | class Engine(object):
24 | """ Engine """
25 |
26 | def __init__(self, spider):
27 | self.spider = spider
28 | self.logger = spider.logger
29 | self.scheduler = Scheduler(spider)
30 | self.settings = spider.settings
31 | max_request_size = self.settings["MAX_REQUEST_SIZE"]
32 | self.dynamic = self.settings["DYNAMIC_CRAWL"]
33 | if self.dynamic:
34 | module_path = DRIVER_MODULE
35 | module = import_module(module_path)
36 | init_kwargs = self.settings['DRIVER_INIT_KWARGS']
37 | self.driver = getattr(module,
38 | self.settings.get('DRIVER').title())(**init_kwargs)
39 | else:
40 | self.driver = None
41 | self.driver_sem = BoundedSemaphore(1)
42 | self.downloader = Downloader(spider, self.driver, self.driver_sem)
43 | self.pool = Pool(size=max_request_size)
44 |
45 | def start(self):
46 | """start
47 | """
48 | start_requests = iter(self.spider.start_requests())
49 | self.execute(self.spider, start_requests)
50 |
51 | def execute(self, spider, start_requests):
52 | """execute
53 | """
54 | self.start_requests = start_requests
55 | all_routines = []
56 | all_routines.append(spawn(self._init_start_requests))
57 | all_routines.append(spawn(self._next_request, spider))
58 | join_all(all_routines)
59 |
60 | def _init_start_requests(self):
61 | """init start requests
62 | """
63 | for req in self.start_requests:
64 | self.crawl(req)
65 |
66 | def _next_request(self, spider):
67 | """next request
68 | """
69 | while True:
70 | try:
71 | request = self.scheduler.next_request()
72 | self.pool.spawn(
73 | self._process_request, request, spider)
74 | except Empty:
75 | self.logger.info('All requests are finished, program exit...')
76 | if self.driver:
77 | self.driver.close()
78 | return
79 |
80 | def _process_request(self, request, spider):
81 | """process request
82 | """
83 | try:
84 | response = self.download(request, spider)
85 | except Exception as exc:
86 | logging.error("download error: %s", str(exc), exc_info=True)
87 | else:
88 | self._handle_downloader_output(response, request, spider)
89 | return response
90 |
91 | def download(self, request, spider):
92 | """ download
93 |
94 | Download a request, use self.downloader.fetch
95 |
96 | """
97 | response = self.downloader.fetch(request, spider)
98 | #response.request = request
99 | return response
100 |
101 | def _handle_downloader_output(self, response, request, spider):
102 | """handle downloader output
103 |
104 |
105 | """
106 | if isinstance(response, Request):
107 | self.crawl(response)
108 | return
109 |
110 | self.process_response(response, request, spider)
111 |
112 | def process_response(self, response, request, spider):
113 | """process response
114 |
115 | Use request.callback or spider.parse to process response
116 |
117 | """
118 | callback = request.callback or spider.parse
119 | result = callback(response)
120 | ret = result2list(result)
121 | self.handle_spider_output(ret, spider)
122 |
123 | def handle_spider_output(self, result, spider):
124 | """handle spider output
125 |
126 | If a spider return a request, crawling it.
127 | Else if it's a dict, use self.process_item.
128 |
129 | """
130 | for item in result:
131 | if item is None:
132 | continue
133 | elif isinstance(item, Request):
134 | self.crawl(item)
135 | elif isinstance(item, dict):
136 | self.process_item(item, spider)
137 | else:
138 | logging.error("Spider must return Request, dict or None")
139 |
140 | def process_item(self, item, spider):
141 | """handle item
142 |
143 | Use spider.process_item function.
144 |
145 | """
146 | spider.process_item(item)
147 |
148 | def crawl(self, request):
149 | """crawl request
150 |
151 | Add request to scheduler's queue.
152 |
153 | """
154 | self.scheduler.enqueue_request(request)
155 |
--------------------------------------------------------------------------------
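
A sketch of driving the Engine above directly with a tiny spider (illustrative only; the bundled examples trigger the same flow via spider.start(), which presumably wires up the engine internally). DYNAMIC_CRAWL is switched off so no selenium driver is started, and httpbin.org is the same test endpoint used by the repository's tests:

    # -*- coding:utf-8 -*-
    from pycreeper.engine import Engine
    from pycreeper.http.request import Request
    from pycreeper.spider import Spider

    class OneShotSpider(Spider):
        custom_settings = {'DYNAMIC_CRAWL': False}

        def start_requests(self):
            yield Request('http://httpbin.org/get', callback=self.parse_page)

        def parse_page(self, response):
            yield {'status': response.status, 'url': response.url}

        def process_item(self, item):
            print item

    engine = Engine(OneShotSpider())
    engine.start()
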
/doc/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # PyCreeper documentation build configuration file, created by
4 | # sphinx-quickstart on Sat Mar 18 20:46:54 2017.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | # If extensions (or modules to document with autodoc) are in another directory,
16 | # add these directories to sys.path here. If the directory is relative to the
17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
18 | #
19 | # import os
20 | # import sys
21 | # sys.path.insert(0, os.path.abspath('.'))
22 |
23 |
24 | # -- General configuration ------------------------------------------------
25 |
26 | # If your documentation needs a minimal Sphinx version, state it here.
27 | #
28 | # needs_sphinx = '1.0'
29 |
30 | # Add any Sphinx extension module names here, as strings. They can be
31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 | # ones.
33 | extensions = ['sphinx.ext.autodoc',
34 | 'sphinx.ext.viewcode',
35 | 'sphinx.ext.githubpages']
36 |
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ['_templates']
39 |
40 | # The suffix(es) of source filenames.
41 | # You can specify multiple suffix as a list of string:
42 | #
43 | # source_suffix = ['.rst', '.md']
44 | source_suffix = '.rst'
45 |
46 | # The master toctree document.
47 | master_doc = 'index'
48 |
49 | # General information about the project.
50 | project = u'PyCreeper'
51 | copyright = u'2017, Jim Zheng'
52 | author = u'Jim Zheng'
53 |
54 | # The version info for the project you're documenting, acts as replacement for
55 | # |version| and |release|, also used in various other places throughout the
56 | # built documents.
57 | #
58 | # The short X.Y version.
59 | version = u'1.0.0'
60 | # The full version, including alpha/beta/rc tags.
61 | release = u'1.0.0'
62 |
63 | # The language for content autogenerated by Sphinx. Refer to documentation
64 | # for a list of supported languages.
65 | #
66 | # This is also used if you do content translation via gettext catalogs.
67 | # Usually you set "language" from the command line for these cases.
68 | language = None
69 |
70 | # List of patterns, relative to source directory, that match files and
71 | # directories to ignore when looking for source files.
72 | # This patterns also effect to html_static_path and html_extra_path
73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
74 |
75 | # The name of the Pygments (syntax highlighting) style to use.
76 | pygments_style = 'sphinx'
77 |
78 | # If true, `todo` and `todoList` produce output, else they produce nothing.
79 | todo_include_todos = False
80 |
81 |
82 | # -- Options for HTML output ----------------------------------------------
83 |
84 | # The theme to use for HTML and HTML Help pages. See the documentation for
85 | # a list of builtin themes.
86 | #
87 | html_theme = 'nature'
88 |
89 | # Theme options are theme-specific and customize the look and feel of a theme
90 | # further. For a list of options available for each theme, see the
91 | # documentation.
92 | #
93 | # html_theme_options = {}
94 |
95 | # Add any paths that contain custom static files (such as style sheets) here,
96 | # relative to this directory. They are copied after the builtin static files,
97 | # so a file named "default.css" will overwrite the builtin "default.css".
98 | html_static_path = ['_static']
99 |
100 |
101 | # -- Options for HTMLHelp output ------------------------------------------
102 |
103 | # Output file base name for HTML help builder.
104 | htmlhelp_basename = 'PyCreeperdoc'
105 |
106 |
107 | # -- Options for LaTeX output ---------------------------------------------
108 |
109 | latex_elements = {
110 | # The paper size ('letterpaper' or 'a4paper').
111 | #
112 | # 'papersize': 'letterpaper',
113 |
114 | # The font size ('10pt', '11pt' or '12pt').
115 | #
116 | # 'pointsize': '10pt',
117 |
118 | # Additional stuff for the LaTeX preamble.
119 | #
120 | # 'preamble': '',
121 |
122 | # Latex figure (float) alignment
123 | #
124 | # 'figure_align': 'htbp',
125 | }
126 |
127 | # Grouping the document tree into LaTeX files. List of tuples
128 | # (source start file, target name, title,
129 | # author, documentclass [howto, manual, or own class]).
130 | latex_documents = [
131 | (master_doc, 'PyCreeper.tex', u'PyCreeper Documentation',
132 | u'zcy', 'manual'),
133 | ]
134 |
135 |
136 | # -- Options for manual page output ---------------------------------------
137 |
138 | # One entry per manual page. List of tuples
139 | # (source start file, name, description, authors, manual section).
140 | man_pages = [
141 | (master_doc, 'pycreeper', u'PyCreeper Documentation',
142 | [author], 1)
143 | ]
144 |
145 |
146 | # -- Options for Texinfo output -------------------------------------------
147 |
148 | # Grouping the document tree into Texinfo files. List of tuples
149 | # (source start file, target name, title, author,
150 | # dir menu entry, description, category)
151 | texinfo_documents = [
152 | (master_doc, 'PyCreeper', u'PyCreeper Documentation',
153 | author, 'PyCreeper', 'One line description of project.',
154 | 'Miscellaneous'),
155 | ]
156 |
157 |
158 |
159 |
--------------------------------------------------------------------------------
/tests/test_downloader_middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import time
6 | import unittest
7 | import json
8 | from pycreeper.downloader_middlewares import DownloaderMiddlewareManager
9 | from pycreeper.downloader_middlewares.middlewares import UserAgentMiddleware, RetryMiddleware, ProxyMiddleware
10 | from pycreeper.spider import Spider
11 | from pycreeper.http.request import Request
12 | from pycreeper.http.response import Response
13 | from pycreeper.downloader import DownloadHandler
14 | from gevent.lock import BoundedSemaphore
15 |
16 |
17 | class RetryMiddlewareTest(unittest.TestCase):
18 | def setUp(self):
19 | self.spider = Spider()
20 |
21 | def test_basic(self):
22 | self.assertRaises(AttributeError, RetryMiddleware,
23 | self.spider.settings, None)
24 |
25 | def test_process_response(self):
26 | request = Request('http://httpbin.org/')
27 | response = Response('http://httpbin.org/', request, status=500)
28 | rm = RetryMiddleware(self.spider.settings, self.spider.logger)
29 | request.meta["dont_retry"] = True
30 | self.assertEqual(rm.process_response(request, response), response)
31 |
32 | request.meta["dont_retry"] = False
33 | request = rm.process_response(request, response)
34 | self.assertIsInstance(request, Request)
35 | self.assertEqual(request.meta.get("retry_count"), 1)
36 | request = rm.process_response(request, response)
37 | self.assertIsInstance(request, Request)
38 | request = rm.process_response(request, response)
39 | self.assertIsInstance(request, Request)
40 | self.assertIsInstance(rm.process_response(request, response), Response)
41 |
42 |
43 | class ProxyMiddlewareTest(unittest.TestCase):
44 | def setUp(self):
45 | self.spider = Spider()
46 |
47 | def test_basic(self):
48 | self.assertRaises(AttributeError, ProxyMiddleware,
49 | self.spider.settings, None)
50 |
51 | def test_process_request(self):
52 | self.spider.settings.set("PROXY_LIST", ['124.88.67.54:80'])
53 | request = Request('http://httpbin.org/get')
54 | pm = ProxyMiddleware(self.spider.settings, self.spider.logger)
55 | dh = DownloadHandler(self.spider, None, BoundedSemaphore(1))
56 | pm.process_request(request)
57 | response = dh.fetch(request)
58 | assert response.body
59 |
60 | def test_process_request_interval(self):
61 | self.spider.settings.set("PROXY_LIST", ['218.76.106.78:3128'])
62 | request = Request('http://httpbin.org/get')
63 | pm = ProxyMiddleware(self.spider.settings, self.spider.logger)
64 | dh = DownloadHandler(self.spider, None, BoundedSemaphore(1))
65 | pm.process_request(request)
66 | time1 = time.time()
67 | dh.fetch(request)
68 |
69 | request = Request('http://httpbin.org/get')
70 | pm.process_request(request)
71 | self.assertGreater(time.time() - time1, 3)
72 |
73 |
74 | class UserAgentMiddlewareTest(unittest.TestCase):
75 | def setUp(self):
76 | self.spider = Spider()
77 |
78 | def test_basic(self):
79 | self.assertRaises(AttributeError, UserAgentMiddleware,
80 | self.spider.settings, None)
81 |
82 | def test_process_request(self):
83 | request = Request('http://httpbin.org/user-agent')
84 | self.assertIs(request.headers.get("User-Agent"), None)
85 | uam = UserAgentMiddleware(self.spider.settings, self.spider.logger)
86 | dh = DownloadHandler(self.spider, None, BoundedSemaphore(1))
87 | uam.process_request(request)
88 | response = dh.fetch(request)
89 | self.assertEqual(json.loads(response.body)['user-agent'], request.headers['User-Agent'])
90 |
91 |
92 | class DownloaderMiddlewareManagerTest(unittest.TestCase):
93 | def setUp(self):
94 | self.spider = Spider()
95 | self.spider.settings.set('DOWNLOADER_MIDDLEWARES',
96 | {
97 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100,
98 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200,
99 | })
100 |
101 | def test_methods(self):
102 | dmm = DownloaderMiddlewareManager(self.spider)
103 | rm = RetryMiddleware(self.spider.settings, self.spider.logger)
104 | uam = UserAgentMiddleware(self.spider.settings, self.spider.logger)
105 | process_request = [uam.process_request]
106 | process_response = [rm.process_response]
107 | process_exception = [rm.process_exception]
108 | self.assertEqual(len(dmm.methods['process_request']), len(process_request))
109 | for i in range(len(process_request)):
110 | self.assertEqual(dmm.methods['process_request'][i].__name__, process_request[i].__name__)
111 |
112 | self.assertEqual(len(dmm.methods['process_response']), len(process_response))
113 | for i in range(len(process_response)):
114 | self.assertEqual(dmm.methods['process_response'][i].__name__, process_response[i].__name__)
115 |
116 | self.assertEqual(len(dmm.methods['process_exception']), len(process_exception))
117 | for i in range(len(process_exception)):
118 | self.assertEqual(dmm.methods['process_exception'][i].__name__, process_exception[i].__name__)
119 |
120 |
121 | if __name__ == "__main__":
122 | unittest.main()
123 |
--------------------------------------------------------------------------------
/doc/_build/html/_static/nature.css:
--------------------------------------------------------------------------------
1 | /*
2 | * nature.css_t
3 | * ~~~~~~~~~~~~
4 | *
5 | * Sphinx stylesheet -- nature theme.
6 | *
7 | * :copyright: Copyright 2007-2016 by the Sphinx team, see AUTHORS.
8 | * :license: BSD, see LICENSE for details.
9 | *
10 | */
11 |
12 | @import url("basic.css");
13 |
14 | /* -- page layout ----------------------------------------------------------- */
15 |
16 | body {
17 | font-family: Arial, sans-serif;
18 | font-size: 100%;
19 | background-color: #111;
20 | color: #555;
21 | margin: 0;
22 | padding: 0;
23 | }
24 |
25 | div.documentwrapper {
26 | float: left;
27 | width: 100%;
28 | }
29 |
30 | div.bodywrapper {
31 | margin: 0 0 0 230px;
32 | }
33 |
34 | hr {
35 | border: 1px solid #B1B4B6;
36 | }
37 |
38 | div.document {
39 | background-color: #eee;
40 | }
41 |
42 | div.body {
43 | background-color: #ffffff;
44 | color: #3E4349;
45 | padding: 0 30px 30px 30px;
46 | font-size: 0.9em;
47 | }
48 |
49 | div.footer {
50 | color: #555;
51 | width: 100%;
52 | padding: 13px 0;
53 | text-align: center;
54 | font-size: 75%;
55 | }
56 |
57 | div.footer a {
58 | color: #444;
59 | text-decoration: underline;
60 | }
61 |
62 | div.related {
63 | background-color: #6BA81E;
64 | line-height: 32px;
65 | color: #fff;
66 | text-shadow: 0px 1px 0 #444;
67 | font-size: 0.9em;
68 | }
69 |
70 | div.related a {
71 | color: #E2F3CC;
72 | }
73 |
74 | div.sphinxsidebar {
75 | font-size: 0.75em;
76 | line-height: 1.5em;
77 | }
78 |
79 | div.sphinxsidebarwrapper{
80 | padding: 20px 0;
81 | }
82 |
83 | div.sphinxsidebar h3,
84 | div.sphinxsidebar h4 {
85 | font-family: Arial, sans-serif;
86 | color: #222;
87 | font-size: 1.2em;
88 | font-weight: normal;
89 | margin: 0;
90 | padding: 5px 10px;
91 | background-color: #ddd;
92 | text-shadow: 1px 1px 0 white
93 | }
94 |
95 | div.sphinxsidebar h4{
96 | font-size: 1.1em;
97 | }
98 |
99 | div.sphinxsidebar h3 a {
100 | color: #444;
101 | }
102 |
103 |
104 | div.sphinxsidebar p {
105 | color: #888;
106 | padding: 5px 20px;
107 | }
108 |
109 | div.sphinxsidebar p.topless {
110 | }
111 |
112 | div.sphinxsidebar ul {
113 | margin: 10px 20px;
114 | padding: 0;
115 | color: #000;
116 | }
117 |
118 | div.sphinxsidebar a {
119 | color: #444;
120 | }
121 |
122 | div.sphinxsidebar input {
123 | border: 1px solid #ccc;
124 | font-family: sans-serif;
125 | font-size: 1em;
126 | }
127 |
128 | div.sphinxsidebar input[type=text]{
129 | margin-left: 20px;
130 | }
131 |
132 | div.sphinxsidebar input[type=submit]{
133 | margin-left: 20px;
134 | }
135 |
136 | /* -- body styles ----------------------------------------------------------- */
137 |
138 | a {
139 | color: #005B81;
140 | text-decoration: none;
141 | }
142 |
143 | a:hover {
144 | color: #E32E00;
145 | text-decoration: underline;
146 | }
147 |
148 | div.body h1,
149 | div.body h2,
150 | div.body h3,
151 | div.body h4,
152 | div.body h5,
153 | div.body h6 {
154 | font-family: Arial, sans-serif;
155 | background-color: #BED4EB;
156 | font-weight: normal;
157 | color: #212224;
158 | margin: 30px 0px 10px 0px;
159 | padding: 5px 0 5px 10px;
160 | text-shadow: 0px 1px 0 white
161 | }
162 |
163 | div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 200%; }
164 | div.body h2 { font-size: 150%; background-color: #C8D5E3; }
165 | div.body h3 { font-size: 120%; background-color: #D8DEE3; }
166 | div.body h4 { font-size: 110%; background-color: #D8DEE3; }
167 | div.body h5 { font-size: 100%; background-color: #D8DEE3; }
168 | div.body h6 { font-size: 100%; background-color: #D8DEE3; }
169 |
170 | a.headerlink {
171 | color: #c60f0f;
172 | font-size: 0.8em;
173 | padding: 0 4px 0 4px;
174 | text-decoration: none;
175 | }
176 |
177 | a.headerlink:hover {
178 | background-color: #c60f0f;
179 | color: white;
180 | }
181 |
182 | div.body p, div.body dd, div.body li {
183 | line-height: 1.5em;
184 | }
185 |
186 | div.admonition p.admonition-title + p {
187 | display: inline;
188 | }
189 |
190 | div.highlight{
191 | background-color: white;
192 | }
193 |
194 | div.note {
195 | background-color: #eee;
196 | border: 1px solid #ccc;
197 | }
198 |
199 | div.seealso {
200 | background-color: #ffc;
201 | border: 1px solid #ff6;
202 | }
203 |
204 | div.topic {
205 | background-color: #eee;
206 | }
207 |
208 | div.warning {
209 | background-color: #ffe4e4;
210 | border: 1px solid #f66;
211 | }
212 |
213 | p.admonition-title {
214 | display: inline;
215 | }
216 |
217 | p.admonition-title:after {
218 | content: ":";
219 | }
220 |
221 | pre {
222 | padding: 10px;
223 | background-color: White;
224 | color: #222;
225 | line-height: 1.2em;
226 | border: 1px solid #C6C9CB;
227 | font-size: 1.1em;
228 | margin: 1.5em 0 1.5em 0;
229 | -webkit-box-shadow: 1px 1px 1px #d8d8d8;
230 | -moz-box-shadow: 1px 1px 1px #d8d8d8;
231 | }
232 |
233 | code {
234 | background-color: #ecf0f3;
235 | color: #222;
236 | /* padding: 1px 2px; */
237 | font-size: 1.1em;
238 | font-family: monospace;
239 | }
240 |
241 | .viewcode-back {
242 | font-family: Arial, sans-serif;
243 | }
244 |
245 | div.viewcode-block:target {
246 | background-color: #f4debf;
247 | border-top: 1px solid #ac9;
248 | border-bottom: 1px solid #ac9;
249 | }
250 |
251 | div.code-block-caption {
252 | background-color: #ddd;
253 | color: #222;
254 | border: 1px solid #C6C9CB;
255 | }
--------------------------------------------------------------------------------
/doc/_build/html/schedular.html:
--------------------------------------------------------------------------------
schedular: Scheduler — PyCreeper 1.0.0 documentation

The core of the scheduler is gevent's Queue together with a Bloom filter
(Wiki: https://en.wikipedia.org/wiki/Bloom_filter ).
The Queue keeps reads by multiple Downloader coroutines coroutine-safe, while the Bloom filter provides URL de-duplication.

Enqueue a request: enqueue_request(request)

When a request is enqueued, the Bloom filter is consulted first to check whether the url has already been crawled.
If it has not, the request is enqueued directly; if it has, a logging.DEBUG message is emitted stating that the url was ignored.

Take a request from the queue: next_request()

This method takes one request out of the Queue. If DOWNLOAD_DELAY is set in custom_settings,
a fixed delay is observed each time a request is taken out.

PyCreeper uses three times the TIMEOUT value as the signal that the crawl has finished: if the Queue stays empty for
3 * TIMEOUT, the crawl is considered complete and the spider exits.
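The behaviour described above can be summed up in a short sketch. This is an illustration built on gevent.queue and pybloom, not the actual pycreeper.scheduler source; the capacity and error rate are arbitrary::

    import gevent
    from gevent.queue import Queue, Empty
    from pybloom import BloomFilter

    class SchedulerSketch(object):
        def __init__(self, download_delay, timeout, logger):
            self.queue = Queue()                   # coroutine-safe FIFO shared by the downloader coroutines
            self.seen = BloomFilter(capacity=1000000, error_rate=0.001)
            self.download_delay = download_delay   # DOWNLOAD_DELAY from custom_settings, 0 if unset
            self.timeout = timeout                 # the TIMEOUT setting
            self.logger = logger

        def enqueue_request(self, request):
            if request.url in self.seen:
                # already crawled -> ignore, as the documentation describes
                self.logger.debug("ignored duplicated url %s", request.url)
                return
            self.seen.add(request.url)
            self.queue.put(request)

        def next_request(self):
            if self.download_delay:
                gevent.sleep(self.download_delay)  # fixed wait between two requests
            try:
                # an empty queue for 3 * TIMEOUT is taken as "the crawl is finished"
                return self.queue.get(timeout=3 * self.timeout)
            except Empty:
                return None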
--------------------------------------------------------------------------------
/doc/_build/html/structure.html:
--------------------------------------------------------------------------------
Architecture overview — PyCreeper 1.0.0 documentation

PyCreeper's overall architecture is made up of five parts: the engine, the downloader, the downloader middlewares, the scheduler and the spider.
The data passed between these parts are Request/Response objects.

The direction of the data flow is shown by the green arrows in the architecture diagram (structure.jpg).

What each part does

The engine is the core of PyCreeper and coordinates the work of all the other parts. Internally it is implemented on top of a gevent.Pool.

The downloader downloads requests. Static and dynamic requests are handled separately: static requests are made with the requests library,
while dynamic requests are loaded with selenium.webdriver. When a request completes, the response is returned to the engine.

The downloader middlewares can be thought of as a hook system sitting between the downloader and the engine; by writing custom downloader middlewares you can apply special handling to requests and responses.

The scheduler is built around gevent's Queue and a Bloom filter: requests are de-duplicated, and non-duplicate requests are enqueued until the engine takes them for processing.

The spider is the user-facing interface: the user defines the start urls, the callback of each request and how the scraped results are processed.

Data flow

The data moves through the following steps:

1. The engine starts and adds the spider's start_urls to the scheduler.
2. The engine takes a request from the scheduler.
3. The engine hands the request to the downloader; on the way the downloader middlewares process the request.
4. The downloader acts according to the request type: static requests go to the requests library, dynamic requests are loaded with selenium.webdriver.
5. The downloader returns the response to the engine; on the way the downloader middlewares process the response.
6. The engine hands the response to the handler defined in the spider.
7. The spider's handler may return a request (go back to step 2) or a dict containing the scraped result (go to the next step).
8. The engine processes the result with the result handler defined in the spider.
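The numbered steps can be condensed into a small loop sketch. This is illustrative only: the real engine drives the callbacks through a gevent.Pool, and the names below are simplified::

    def crawl_loop(spider, scheduler, downloader):
        # step 1: seed the scheduler with the spider's initial requests (built from start_urls)
        for request in spider.start_requests():
            scheduler.enqueue_request(request)

        while True:
            request = scheduler.next_request()              # step 2
            if request is None:                             # queue stayed empty -> crawl finished
                break
            response = downloader.fetch(request, spider)    # steps 3-5, middlewares included
            callback = request.callback or spider.parse     # step 6
            for result in callback(response):
                if isinstance(result, dict):
                    spider.process_item(result)             # step 8: a scraped item
                else:
                    scheduler.enqueue_request(result)       # step 7: a new Request, back to step 2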
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pyCreeper
2 | PyCreeper is a web-scraping (crawler) framework for quickly extracting page content. Its asynchronous network I/O is built on the **Gevent** coroutine library, and requests are split into static and dynamic ones:
3 | static requests are handled by **Requests**, while dynamic requests are loaded with **Selenium.Webdriver**.
4 |
5 | While designing this project I borrowed a lot from the architecture and implementation of [Scrapy](https://scrapy.org/). Scrapy is an excellent crawler framework,
6 | and I have spent a great deal of effort working with it!
7 |
8 | This first look at PyCreeper walks through a simple spider example so that you understand PyCreeper's overall workflow and can get started quickly.
9 |
10 | ## Target task
11 | [Zhihu](https://www.zhihu.com/) is a knowledge-sharing Q&A platform similar to Quora. Our demo task is to simulate logging in to Zhihu, save the cookies,
12 | and then issue a series of static requests to fetch the question titles and descriptions on the home page.
13 |
14 | Because the login step uses dynamic request handling based on Selenium.Webdriver, you can skip the complicated packet capturing and analysis code: a few clicks
15 | are all it takes, just as convenient as logging in to Zhihu in a real browser!
16 |
17 |
18 | ## Define a spider
19 | A spider class must inherit from the Spider class, as shown below:
20 |
21 | ```
22 | from pycreeper.spider import Spider
23 |
24 | class Zhihu_Spider(Spider):
25 | pass
26 | ```
27 |
28 | ## Choosing middlewares
29 | The middlewares used by a Spider are selected by modifying its custom_settings object:
30 |
31 | ```
32 | custom_settings = {
33 | 'DOWNLOADER_MIDDLEWARES': {
34 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100,
35 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200,
36 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300
37 | },
38 | 'DRIVER': 'Chrome',
39 | 'DOWNLOAD_DELAY': 2,
40 | 'USER_AGENT_LIST': [
41 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36
42 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''',
43 | ]
44 | }
45 | ```
46 |
47 | Here, DOWNLOADER_MIDDLEWARES lists the middlewares this spider uses while crawling. UserAgentMiddleware provides a simple way to control the request User-Agent (it only affects static requests;
48 | the UA of dynamic requests depends on the WebDriver in use). RetryMiddleware retries failed requests (bad status codes, timeouts, and so on) several times. CookiesMiddleware shares a pool of CookieJars across all requests:
49 | a group of requests can share one CookieJar, and CookiesMiddleware keeps each CookieJar valid and consistent.
50 |
51 | DRIVER specifies the browser used for dynamic requests; here we use Chrome.
52 |
53 | DOWNLOAD_DELAY specifies the delay (in seconds) between downloads, which is quite useful when a site has some kind of anti-crawling policy.
54 |
55 | USER_AGENT_LIST contains the User-Agent strings available to requests; UserAgentMiddleware picks one of them at random.
56 |
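If the built-in middlewares are not enough, you can write your own. The sketch below is only an illustration (RefererMiddleware and my_middlewares are made-up names, not part of PyCreeper); it assumes, like the built-in middlewares, a constructor taking the spider's settings and logger plus optional process_request / process_response hooks:

```
# my_middlewares.py -- illustrative sketch, not shipped with PyCreeper
class RefererMiddleware(object):
    """Attach a fixed Referer header to every static request."""

    def __init__(self, settings, logger):
        self.logger = logger

    def process_request(self, request):
        # headers only affect static requests; dynamic requests go through the WebDriver
        request.headers["Referer"] = "https://www.zhihu.com/"

    def process_response(self, request, response):
        self.logger.debug("got status %s for %s", response.status, request.url)
        return response
```

It would then be registered in DOWNLOADER_MIDDLEWARES with a priority number, e.g. `'my_middlewares.RefererMiddleware': 150`.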
57 |
58 | ## The initial request
59 | The following code overrides start_requests to yield a PyCreeper request:
60 |
61 | ```
62 | def start_requests(self):
63 |
64 | def _login(driver):
65 | driver.find_element_by_name('account').send_keys("username")
66 | driver.find_element_by_name('password').send_keys("password")
67 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click()
68 | gevent.sleep(5)
69 |
70 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"},
71 | callback=self.after_login, dynamic=True, browser_actions=[_login])
72 | ```
73 |
74 | Among the Request parameters, dynamic=True marks this as a dynamic request, which will be loaded through the WebDriver,
75 | while browser_actions=[_login] defines the actions performed after the browser has finished loading. In this example we type in the username and password and then click the login button.
76 | gevent.sleep(5) makes the spider wait for the browser to finish loading.
77 |
78 | meta={"cookiejar": "zhihu"} means the cookies produced by this request are stored in the CookieJar named zhihu.
79 |
80 | callback=self.after_login defines the handler for this response.
81 |
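Each function in browser_actions receives the live WebDriver, so several of them can be chained. As a purely illustrative extra action (not part of the demo), one could scroll to the bottom of the page so lazily loaded content is rendered before the page source is read:

```
def _scroll_to_bottom(driver):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    gevent.sleep(2)  # give the page a moment to render the newly loaded content

yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"},
              callback=self.after_login, dynamic=True,
              browser_actions=[_login, _scroll_to_bottom])
```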
82 | ## What's next?
83 |
84 | The next step extracts the question links from the Zhihu home page and issues static question requests:
85 |
86 | ```
87 | def after_login(self, response):
88 | html = response.body
89 | selector = etree.HTML(html)
90 | links = selector.xpath('//a[@class="question_link"]')
91 | for link in links:
92 | yield Request('https://www.zhihu.com' + link.attrib["href"],
93 | meta={"cookiejar": "zhihu"}, callback=self.get_item)
94 | ```
95 |
96 | response.body holds the content of the response. We use lxml to extract the tags from the html text and then issue a series of static requests.
97 |
98 | Once we have the data of a question page, what we need to do is extract the question title and its description:
99 |
100 | ```
101 | def get_item(self, response):
102 | html = response.body
103 | selector = etree.HTML(html)
104 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text
105 | body = selector.xpath('//span[@class="RichText"]')[0].text
106 | yield {
107 | 'head': head,
108 | 'body': body
109 | }
110 | ```
111 |
112 | The process is similar to the previous function: elements are located with xpath.
113 |
114 | ## Processing the data you get
115 | Data processing is done by overriding the process_item method:
116 |
117 | ```
118 | def process_item(self, item):
119 | print json.dumps(item, ensure_ascii=False)
120 | ```
121 |
122 | Here we simply print the result.
123 |
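If you want to keep the items rather than just print them, process_item is the place to do it. A minimal sketch of an override that appends every item to a JSON-lines file (the file name results.jl is arbitrary):

```
import io
import json

def process_item(self, item):
    # one JSON object per line, UTF-8 encoded
    with io.open('results.jl', 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(item, ensure_ascii=False) + u'\n')
```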
124 | ## Run your spider
125 | Finally, we run the spider with this piece of code:
126 |
127 | ```
128 | if __name__ == "__main__":
129 | spider = Zhihu_Spider()
130 | spider.start()
131 | ```
132 |
133 | The complete code is as follows:
134 |
135 | ```
136 | # -*- coding:utf-8 -*-
137 |
138 | from pycreeper.spider import Spider
139 | from pycreeper.http.request import Request
140 | from lxml import etree
141 | import json
142 | import gevent
143 |
144 |
145 | class Zhihu_Spider(Spider):
146 |
147 | custom_settings = {
148 | 'DOWNLOADER_MIDDLEWARES': {
149 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100,
150 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200,
151 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300
152 | },
153 | 'DRIVER': 'Chrome',
154 | 'DOWNLOAD_DELAY': 2,
155 | 'STATIC_REQUEST_SSL_VERIFY': False,
156 | 'USER_AGENT_LIST': [
157 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36
158 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''',
159 | ]
160 | }
161 |
162 | def start_requests(self):
163 |
164 | def _login(driver):
165 | driver.find_element_by_name('account').send_keys("username")
166 | driver.find_element_by_name('password').send_keys("password")
167 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click()
168 | gevent.sleep(5)
169 |
170 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"},
171 | callback=self.after_login, dynamic=True, browser_actions=[_login])
172 |
173 | def after_login(self, response):
174 | html = response.body
175 | selector = etree.HTML(html)
176 | links = selector.xpath('//a[@class="question_link"]')
177 | for link in links:
178 | yield Request('https://www.zhihu.com' + link.attrib["href"],
179 | meta={"cookiejar": "zhihu"}, callback=self.get_item)
180 |
181 | def get_item(self, response):
182 | html = response.body
183 | selector = etree.HTML(html)
184 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text
185 | body = selector.xpath('//span[@class="RichText"]')[0].text
186 | yield {
187 | 'head': head,
188 | 'body': body
189 | }
190 |
191 | def process_item(self, item):
192 | print json.dumps(item, ensure_ascii=False)
193 |
194 | if __name__ == "__main__":
195 | spider = Zhihu_Spider()
196 | spider.start()
197 |
198 | ```
199 |
200 | ## Closing remarks
201 | The project has been published on PyPI; you can install it with:
202 |
203 | ```
204 | pip install pycreeper
205 | ```
206 |
207 | Docker support will be introduced in the future.
208 |
209 | The project has just released version 1.0.0. If you run into any problems while using it, we welcome your feedback, either through the GitHub [project page](https://github.com/ZcyAndWt/pyCreeper) or by e-mail to the author: zhengchenyu.backend@gmail.com.
210 |
211 | If you find this project useful and it improves the efficiency of your scraping, please star the project on GitHub.
212 | Your support is our greatest motivation!
213 |
--------------------------------------------------------------------------------
/doc/_build/html/prepare.html:
--------------------------------------------------------------------------------
Before you start — PyCreeper 1.0.0 documentation

We assume Python 2.7 or a later version is already installed. If it is not, please visit the Python website ( https://www.python.org/ ) and install a suitable version.

PyCreeper depends on the following libraries:

- gevent
- importlib
- requests
- chardet
- w3lib
- six
- pybloom
- Selenium

Of course, if you install the project with pip, the dependencies are installed automatically (at least in theory).

Install the project with pip::

    pip install pycreeper
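After installation, an optional sanity check makes sure the package and its main dependencies import cleanly::

    python -c "import pycreeper, gevent, requests, selenium"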
--------------------------------------------------------------------------------
/doc/tutorial.rst:
--------------------------------------------------------------------------------
1 | A first look at PyCreeper
2 | ==========================
3 | PyCreeper is a web-scraping (crawler) framework for quickly extracting page content. Its asynchronous network I/O is built on the **Gevent** coroutine library, and requests are split into static and dynamic ones:
4 | static requests are handled by **Requests**, while dynamic requests are loaded with **Selenium.Webdriver**.
5 |
6 | While designing this project I borrowed a lot from the architecture and implementation of **Scrapy** (project site: https://scrapy.org/). Scrapy is an excellent crawler framework,
7 | and I have spent a great deal of effort working with it!
8 |
9 | This first look at PyCreeper walks through a simple spider example so that you understand PyCreeper's overall workflow and can get started quickly.
10 |
11 | If PyCreeper is not installed yet, please see :doc:`prepare`.
12 |
13 | Target task
14 | ------------
15 | Zhihu ( https://www.zhihu.com/ ) is a knowledge-sharing Q&A platform similar to Quora. Our demo task is to simulate logging in to Zhihu, save the cookies,
16 | and then issue a series of static requests to fetch the question titles and descriptions on the home page.
17 |
18 | Because the login step uses dynamic request handling based on Selenium.Webdriver, you can skip the complicated packet capturing and analysis code: a few clicks
19 | are all it takes, just as convenient as logging in to Zhihu in a real browser!
20 |
21 |
22 | Define a spider
23 | ----------------
24 | A spider class must inherit from the Spider class, as shown below::
25 |
26 | from pycreeper.spider import Spider
27 |
28 | class Zhihu_Spider(Spider):
29 | pass
30 |
31 | Choosing middlewares
32 | ---------------------
33 | The middlewares used by a Spider are selected by modifying its custom_settings object::
34 |
35 | custom_settings = {
36 | 'DOWNLOADER_MIDDLEWARES': {
37 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100,
38 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200,
39 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300
40 | },
41 | 'DRIVER': 'Chrome',
42 | 'DOWNLOAD_DELAY': 2,
43 | 'USER_AGENT_LIST': [
44 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36
45 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''',
46 | ]
47 | }
48 |
49 | Here, DOWNLOADER_MIDDLEWARES lists the middlewares this spider uses while crawling. UserAgentMiddleware provides a simple way to control the request User-Agent (it only affects static requests;
50 | the UA of dynamic requests depends on the WebDriver in use). RetryMiddleware retries failed requests (bad status codes, timeouts, and so on) several times. CookiesMiddleware shares a pool of CookieJars across all requests:
51 | a group of requests can share one CookieJar, and CookiesMiddleware keeps each CookieJar valid and consistent.
52 |
53 | DRIVER specifies the browser used for dynamic requests; here we use Chrome.
54 |
55 | DOWNLOAD_DELAY specifies the delay (in seconds) between downloads, which is quite useful when a site has some kind of anti-crawling policy.
56 |
57 | USER_AGENT_LIST contains the User-Agent strings available to requests; UserAgentMiddleware picks one of them at random.
58 |
59 |
60 | The initial request
61 | --------------------
62 | The following code overrides start_requests to yield a PyCreeper request::
63 |
64 | def start_requests(self):
65 |
66 | def _login(driver):
67 | driver.find_element_by_name('account').send_keys("username")
68 | driver.find_element_by_name('password').send_keys("password")
69 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click()
70 | gevent.sleep(5)
71 |
72 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"},
73 | callback=self.after_login, dynamic=True, browser_actions=[_login])
74 |
75 | Among the Request parameters, dynamic=True marks this as a dynamic request, which will be loaded through the WebDriver,
76 | while browser_actions=[_login] defines the actions performed after the browser has finished loading. In this example we type in the username and password and then click the login button.
77 | gevent.sleep(5) makes the spider wait for the browser to finish loading.
78 |
79 | meta={"cookiejar": "zhihu"} means the cookies produced by this request are stored in the CookieJar named zhihu.
80 |
81 | callback=self.after_login defines the handler for this response.
82 |
83 | What's next?
84 | -------------
85 | The next step extracts the question links from the Zhihu home page and issues static question requests::
86 |
87 | def after_login(self, response):
88 | html = response.body
89 | selector = etree.HTML(html)
90 | links = selector.xpath('//a[@class="question_link"]')
91 | for link in links:
92 | yield Request('https://www.zhihu.com' + link.attrib["href"],
93 | meta={"cookiejar": "zhihu"}, callback=self.get_item)
94 |
95 | response.body holds the content of the response. We use lxml to extract the tags from the html text and then issue a series of static requests.
96 |
97 | Once we have the data of a question page, what we need to do is extract the question title and its description::
98 |
99 | def get_item(self, response):
100 | html = response.body
101 | selector = etree.HTML(html)
102 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text
103 | body = selector.xpath('//span[@class="RichText"]')[0].text
104 | yield {
105 | 'head': head,
106 | 'body': body
107 | }
108 |
109 | The process is similar to the previous function: elements are located with xpath.
110 |
111 | Processing the data you get
112 | ----------------------------
113 | Data processing is done by overriding the process_item method::
114 |
115 | def process_item(self, item):
116 | print json.dumps(item, ensure_ascii=False)
117 |
118 | Here we simply print the result.
119 |
120 | Run your spider
121 | ----------------
122 | Finally, we run the spider with this piece of code::
123 |
124 | if __name__ == "__main__":
125 | spider = Zhihu_Spider()
126 | spider.start()
127 |
128 | The complete code is as follows::
129 |
130 | # -*- coding:utf-8 -*-
131 |
132 | from pycreeper.spider import Spider
133 | from pycreeper.http.request import Request
134 | from lxml import etree
135 | import json
136 | import gevent
137 |
138 |
139 | class Zhihu_Spider(Spider):
140 |
141 | custom_settings = {
142 | 'DOWNLOADER_MIDDLEWARES': {
143 | 'pycreeper.downloader_middlewares.middlewares.UserAgentMiddleware': 100,
144 | 'pycreeper.downloader_middlewares.middlewares.RetryMiddleware': 200,
145 | 'pycreeper.downloader_middlewares.cookies_middlewares.CookiesMiddleware': 300
146 | },
147 | 'DRIVER': 'Chrome',
148 | 'DOWNLOAD_DELAY': 2,
149 | 'STATIC_REQUEST_SSL_VERIFY': False,
150 | 'USER_AGENT_LIST': [
151 | '''Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36
152 | (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36''',
153 | ]
154 | }
155 |
156 | def start_requests(self):
157 |
158 | def _login(driver):
159 | driver.find_element_by_name('account').send_keys("username")
160 | driver.find_element_by_name('password').send_keys("password")
161 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click()
162 | gevent.sleep(5)
163 |
164 | yield Request(url='https://www.zhihu.com/#signin', meta={"cookiejar": "zhihu"},
165 | callback=self.after_login, dynamic=True, browser_actions=[_login])
166 |
167 | def after_login(self, response):
168 | html = response.body
169 | selector = etree.HTML(html)
170 | links = selector.xpath('//a[@class="question_link"]')
171 | for link in links:
172 | yield Request('https://www.zhihu.com' + link.attrib["href"],
173 | meta={"cookiejar": "zhihu"}, callback=self.get_item)
174 |
175 | def get_item(self, response):
176 | html = response.body
177 | selector = etree.HTML(html)
178 | head = selector.xpath('//h1[@class="QuestionHeader-title"]')[0].text
179 | body = selector.xpath('//span[@class="RichText"]')[0].text
180 | yield {
181 | 'head': head,
182 | 'body': body
183 | }
184 |
185 | def process_item(self, item):
186 | print json.dumps(item, ensure_ascii=False)
187 |
188 | if __name__ == "__main__":
189 | spider = Zhihu_Spider()
190 | spider.start()
191 |
192 |
193 | Closing remarks
194 | ----------------
195 | The project has been published on PyPI; you can install it with::
196 |
197 | pip install pycreeper
198 |
199 | Docker support will be introduced in the future.
200 |
201 | The project has just released version 1.0.0. If you run into any problems while using it, we welcome your feedback, either on GitHub
202 | (project page: https://github.com/ZcyAndWt/pyCreeper ) or by e-mail to the author: zhengchenyu.backend@gmail.com.
203 |
204 | If you find this project useful and it improves the efficiency of your scraping, please star the project on GitHub.
205 | Your support is our greatest motivation!
206 |
207 |
208 |
209 |
210 |
211 |
--------------------------------------------------------------------------------
/doc/_build/html/http.html:
--------------------------------------------------------------------------------
Request and Response objects — PyCreeper 1.0.0 documentation

Request and Response objects carry information between the PyCreeper components; while using the spider you will frequently need to work with both of them.

Request: customize your request

Constructor::

    Request(url, callback=None, method='GET', headers=None, body=None, meta=None,
            encoding='utf-8', cookiejar=None, dynamic=False, browser_actions=None, wait=0)

url
    The url to request.

callback
    The callback for this request. If it is not defined, the response is handled by Spider.parse.

method
    GET and POST requests are supported. POST is only supported when dynamic=False;
    with dynamic=True it raises an AttributeError.

headers
    A dict of header fields used for static requests.

body
    The request body, used for static requests.

meta
    A dict of extra parameters carried by the request; other modules may read them.

encoding
    The encoding of the request, used to encode the url and the body.

cookiejar
    Used to read the cookiejar carried by the request. Do not pass a value for this parameter when constructing a Request; a cookiejar passed here is not used by PyCreeper.

dynamic
    Marks whether the request is a dynamic request.

browser_actions
    A list of functions to run after the browser has opened the url and before the data is extracted.

wait
    How long to wait after the browser opens the url and before the functions in browser_actions run.
    This parameter is especially useful when the page issues many asynchronous requests.
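Two example constructions under these parameters (the urls are placeholders, and my_parse stands for whatever callback you define)::

    from pycreeper.http.request import Request

    def my_parse(response):          # placeholder callback for the example
        pass

    # a static POST request with its own headers, extra data carried in meta
    req = Request('http://httpbin.org/post', method='POST',
                  headers={'User-Agent': 'PyCreeper'}, body={'text': 'pycreeper'},
                  meta={'cookiejar': 'session-1'}, callback=my_parse)

    # a dynamic request: the WebDriver loads the page, waits 3 seconds,
    # then runs every function in browser_actions against the driver
    req = Request('https://www.zhihu.com/#signin', dynamic=True, wait=3,
                  browser_actions=[lambda driver: driver.refresh()])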
--------------------------------------------------------------------------------
/pycreeper/downloader/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | """ Downloader """
6 | import cookielib
7 | import requests
8 | from pycreeper.http.response import Response
9 | from pycreeper.downloader_middlewares import DownloaderMiddlewareManager
10 | from selenium.common.exceptions import TimeoutException as _TimeoutException
11 | from pycreeper.utils.exceptions import TimeoutException
12 | from requests.exceptions import Timeout
13 | import six
14 | from pycreeper.utils import _get_cookies_from_cookiejar
15 | import gevent
16 | import traceback
17 |
18 |
19 | class DownloadHandler(object):
20 | """ DownloadHandler """
21 |
22 | def __init__(self, spider, driver, driver_sem, **kwargs):
23 | self.settings = spider.settings
24 | self.logger = spider.logger
25 | self.session_map = {}
26 | self.kwargs = kwargs
27 | self.driver = driver
28 | self.driver_sem = driver_sem
29 |
30 | def fetch(self, request):
31 | """fetch
32 | """
33 | url = request.url
34 | if request.dynamic:
35 | return self._fetch_dynamic(request, url)
36 | else:
37 | return self._fetch_static(request, url)
38 |
39 | def _fetch_static(self, request, url):
40 | self.logger.info("processing static page %s", url)
41 | kwargs = {
42 | "timeout": self.settings["TIMEOUT"],
43 | "headers": request.headers,
44 | "verify": self.settings["STATIC_REQUEST_SSL_VERIFY"],
45 | }
46 | if "proxy" in request.meta and request.meta["proxy"]:
47 | kwargs.update(proxies=request.meta["proxy"])
48 | try:
49 | session = requests.Session()
50 | if request.cookiejar:
51 | session.cookies = request.cookiejar
52 | if request.method == 'GET':
53 | response = session.get(url, **kwargs)
54 | elif request.method == 'POST':
55 | if request.body:
56 | kwargs.update(data=request.body)
57 | response = session.post(url, **kwargs)
58 | else:
59 | raise ValueError('Unacceptable HTTP verb %s' % request.method)
60 | return Response(response.url, request, status=response.status_code,
61 | cookiejar=response.cookies, body=response.content)
62 | except Timeout as e:
63 | raise TimeoutException(e.message)
64 | except Exception as e:
65 | self.logger.error("download error: %s", str(e), exc_info=True)
66 | raise e
67 |
68 |
69 | def _fetch_dynamic(self, request, url):
70 | self.logger.info("processing dynamic page %s", url)
71 | try:
72 | self.driver_sem.acquire()
73 | if request.cookiejar:
74 | cookies = _get_cookies_from_cookiejar(request.cookiejar)
75 | cookies = self._convert_cookies_to_dict(cookies)
76 | #self._removed_first_dot_in_front_of_domain(cookies)
77 | command_list = self._get_command_list(cookies)
78 | # load the url first so the current page shares the cookies' domain before they are injected
79 | self.driver.get(url)
80 | # load cookies
81 | for command in command_list:
82 | self.driver.execute_script(command)
83 |
84 | self.driver.set_page_load_timeout(self.settings["TIMEOUT"])
85 | self.driver.get(url)
86 | gevent.sleep(request.wait)
87 | for func in request.browser_actions:
88 | func(self.driver)
89 | url = self.driver.current_url
90 | html = self.driver.page_source
91 |
92 | # generate cookies
93 | all_cookies = self.driver.get_cookies()
94 | self.driver.delete_all_cookies()
95 | self.driver_sem.release()
96 |
97 | all_cookies = self._to_byte(all_cookies)
98 | cookies = [self._make_cookie(**d) for d in all_cookies]
99 |
100 | # set cookies to cookiejar
101 | cj = cookielib.CookieJar()
102 | for cookie in cookies:
103 | cj.set_cookie(cookie)
104 | return Response(url, request, cookiejar=cj, body=html)
105 | except _TimeoutException as e:
106 | raise TimeoutException(e.message)
107 | except Exception as e:
108 | self.logger.error("download error: %s", str(e), exc_info=True)
109 | raise e
110 |
111 | def _removed_first_dot_in_front_of_domain(self, cookies):
112 | for cookie in cookies:
113 | for k in cookie:
114 | if k == 'domain' and str(cookie[k]).startswith('.'):
115 | cookie[k] = cookie[k][1:]
116 |
117 | def _get_command_list(self, cookies):
118 | js_list = []
119 | for cookie in cookies:
120 | item_list = [cookie['name'] + '=' + cookie['value']]
121 | for k in ('domain', 'path', 'expiry'):
122 | if k in cookie and not (cookie[k] is None):
123 | item_list.append(str(k) + '=' + str(cookie[k]))
124 | js_list.append("document.cookie = '%s';\n" % ('; '.join(item_list)))
125 | return js_list
126 |
127 | def _make_cookie(self, **kwargs):
128 | return cookielib.Cookie(
129 | version=0,
130 | name=kwargs.get('name', None),
131 | value=kwargs.get('value', None),
132 | port=None,
133 | port_specified=False,
134 | domain=kwargs.get('domain', None),
135 | domain_specified=True,
136 | domain_initial_dot=False,
137 | path=kwargs.get('path', None),
138 | path_specified=True,
139 | secure=False,
140 | expires=kwargs.get('expires', None),
141 | discard=False,
142 | comment=None,
143 | comment_url=None,
144 | rest=None
145 | )
146 |
147 | def _convert_cookies_to_dict(self, cookies):
148 | result = []
149 | for cookie in cookies:
150 | cookie_dict = {}
151 | for key in ['name', 'value', 'domain', 'path', 'expires']:
152 | if getattr(cookie, key):
153 | cookie_dict[key] = getattr(cookie, key)
154 | result.append(cookie_dict)
155 | return result
156 |
157 | def _to_byte(self, cookies):
158 | result = []
159 | for cookie in cookies:
160 | temp = {}
161 | for key in cookie.keys():
162 | temp[key.encode('utf-8') if isinstance(key, six.text_type) else key] = \
163 | cookie[key].encode('utf-8') if isinstance(cookie[key], six.text_type) else cookie[key]
164 | result.append(temp)
165 | return result
166 |
167 |
168 |
169 |
170 | class Downloader(object):
171 | """ Downloader """
172 |
173 | def __init__(self, spider, driver, driver_sem):
174 | self.handler = DownloadHandler(spider, driver, driver_sem)
175 | self.middleware = DownloaderMiddlewareManager(spider)
176 |
177 | def fetch(self, request, spider):
178 | """fetch
179 |
180 | @request, Request, the request to download
181 | """
182 | return self.middleware.download(self._download, request)
183 |
184 | def _download(self, request):
185 | """download
186 | """
187 | return self.handler.fetch(request)
188 |
--------------------------------------------------------------------------------
/doc/_build/html/index.html:
--------------------------------------------------------------------------------
PyCreeper: crawl everything you can see! — PyCreeper 1.0.0 documentation
--------------------------------------------------------------------------------
/tests/test_downloader.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | reload(__import__('sys')).setdefaultencoding('utf-8')
3 | __author__ = 'zcy'
4 |
5 | import unittest
6 | import time
7 | import json
8 |
9 | from pycreeper.utils.exceptions import TimeoutException
10 | import gevent
11 |
12 | from gevent.pool import Pool
13 | from pycreeper.downloader_middlewares.cookies_middlewares import CookiesMiddleware
14 | from pycreeper.downloader import DownloadHandler
15 | from pycreeper.spider import Spider
16 | from pycreeper.http.request import Request
17 | from pycreeper.http.response import Response
18 | from selenium import webdriver
19 | from gevent.lock import BoundedSemaphore
20 |
21 | HTTPBIN_URL = 'http://httpbin.org'
22 |
23 |
24 |
25 |
26 | class DownloadHandlerTest(unittest.TestCase):
27 | def setUp(self):
28 | self.spider = Spider()
29 | self.spider.settings.set('TIMEOUT', 15)
30 | self.driver = None
31 | self.driver_sem = BoundedSemaphore(1)
32 |
33 | def test_concurrency_with_delayed_url(self):
34 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
35 | n = 5
36 | pool = Pool(n)
37 | urls = []
38 | for i in range(n):
39 | urls.append(HTTPBIN_URL + '/delay/1')
40 | time_start = time.time()
41 | pool.map(dh.fetch, [Request(url) for url in urls])
42 | time_total = time.time() - time_start
43 | self.assertLess(time_total, n)
44 |
45 | def test_timeout_static(self):
46 | self.spider.settings.set('TIMEOUT', 5)
47 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
48 | self.assertRaises(TimeoutException, dh.fetch, Request(HTTPBIN_URL + '/delay/10'))
49 |
50 | def test_timeout_dynamic(self):
51 | self.driver = webdriver.PhantomJS()
52 | self.spider.settings.set('TIMEOUT', 5)
53 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
54 | self.assertRaises(TimeoutException, dh.fetch, Request(HTTPBIN_URL + '/delay/10', dynamic=True))
55 | self.driver.close()
56 |
57 | def test_post_data_static(self):
58 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
59 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST'))
60 | self.assertIsInstance(response, Response)
61 | self.assertEqual(response.status, 200)
62 |
63 | def test_post_data_content_static(self):
64 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
65 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST', body={'text': 'pycreeper'}))
66 | self.assertIsInstance(response, Response)
67 | self.assertEqual(json.loads(response.body)['form'], {'text': 'pycreeper'})
68 |
69 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST', body=u'Unicode测试'))
70 | self.assertEqual(json.loads(response.body)['data'], 'Unicode测试')
71 |
72 | response = dh.fetch(Request(HTTPBIN_URL + '/post', method='POST', body='中文测试'))
73 | self.assertEqual(json.loads(response.body)['data'], '中文测试')
74 | self.assertEqual(response.status, 200)
75 |
76 | def test_get_data(self):
77 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
78 | response = dh.fetch(Request(HTTPBIN_URL + '/get'))
79 | self.assertIsInstance(response, Response)
80 | self.assertEqual(response.status, 200)
81 |
82 | def test_dynamic_request(self):
83 | self.driver = webdriver.PhantomJS()
84 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
85 | request = Request(HTTPBIN_URL + '/get', dynamic=True)
86 | dh.fetch(request)
87 | self.driver.close()
88 |
89 | def test_dynamic_request_wait(self):
90 | self.driver = webdriver.PhantomJS()
91 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
92 | request = Request(HTTPBIN_URL + '/get', dynamic=True, wait=3)
93 | dh.fetch(request)
94 | self.driver.close()
95 |
96 | def test_dynamic_request_timeout(self):
97 | self.driver = webdriver.PhantomJS()
98 | self.spider.settings.set('TIMEOUT', 5)
99 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
100 | request = Request(HTTPBIN_URL + '/delay/10', dynamic=True)
101 | self.assertRaises(TimeoutException, dh.fetch, request)
102 | self.driver.close()
103 |
104 | def test_dynamic_request_concurrency(self):
105 | self.driver = webdriver.PhantomJS()
106 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
107 | n = 5
108 | pool = Pool(n)
109 | urls = []
110 | for i in range(n):
111 | urls.append(HTTPBIN_URL + '/delay/1')
112 | time1 = time.time()
113 | pool.map(dh.fetch, [Request(url, dynamic=True, wait=5) for url in urls])
114 | self.assertGreater(time.time() - time1, n)
115 | self.driver.close()
116 |
117 | def test_dynamic_request_cookie_between_static_and_dynamic(self):
118 | cm = CookiesMiddleware(self.spider, self.spider.settings)
119 | self.driver = webdriver.PhantomJS()
120 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
121 | request = Request(HTTPBIN_URL + '/cookies/set?key1=val1&key2=val2',
122 | dynamic=True, meta={'cookiejar': 'test'})
123 | response = dh.fetch(request)
124 | cm.process_response(request, response)
125 | request = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test'})
126 | cm.process_request(request)
127 | response = dh.fetch(request)
128 | self.assertEqual(json.loads(response.body)['cookies'],
129 | {u'key1': u'val1', u'key2': u'val2'})
130 | self.driver.close()
131 |
132 | def test_dynamic_request_multi_cookiejar(self):
133 | cm = CookiesMiddleware(self.spider, self.spider.settings)
134 | self.driver = webdriver.PhantomJS()
135 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
136 |
137 | # jar 1
138 | request = Request(HTTPBIN_URL + '/cookies/set?key1=val1',
139 | dynamic=True, meta={'cookiejar': 'test1'})
140 | cm.process_request(request)
141 | response = dh.fetch(request)
142 | cm.process_response(request, response)
143 |
144 | # jar 2
145 | request = Request(HTTPBIN_URL + '/cookies/set?key2=val2',
146 | dynamic=True, meta={'cookiejar': 'test2'})
147 | cm.process_request(request)
148 | response = dh.fetch(request)
149 | cm.process_response(request, response)
150 |
151 | # test jar2
152 | request = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test2'})
153 | cm.process_request(request)
154 | response = dh.fetch(request)
155 | cm.process_response(request, response)
156 | self.assertEqual(json.loads(response.body)['cookies'], {u'key2': u'val2'})
157 |
158 | # test jar1
159 | request = Request(HTTPBIN_URL + '/cookies', meta={'cookiejar': 'test1'})
160 | cm.process_request(request)
161 | response = dh.fetch(request)
162 | cm.process_response(request, response)
163 | self.assertEqual(json.loads(response.body)['cookies'], {u'key1': u'val1'})
164 | self.driver.close()
165 |
166 | def test_dynamic_request_browser_actions(self):
167 | cm = CookiesMiddleware(self.spider, self.spider.settings)
168 | self.driver = webdriver.Chrome()
169 | dh = DownloadHandler(self.spider, self.driver, self.driver_sem)
170 |
171 | def _actions(driver):
172 | driver.find_element_by_name('account').send_keys("username")
173 | driver.find_element_by_name('password').send_keys("pwd")
174 | driver.find_element_by_xpath('/html/body/div[1]/div/div[2]/div[2]/form/div[2]/button').click()
175 | gevent.sleep(5)
176 |
177 | request = Request('https://www.zhihu.com/#signin',
178 | dynamic=True, meta={'cookiejar': 'test'},
179 | browser_actions=[_actions],
180 | )
181 | cm.process_request(request)
182 | response = dh.fetch(request)
183 | cm.process_response(request, response)
184 |
185 | request = Request('https://www.zhihu.com', dynamic=True, meta={'cookiejar': 'test'})
186 | cm.process_request(request)
187 | response = dh.fetch(request)
188 | cm.process_response(request, response)
189 | print response.body
190 | self.driver.close()
191 |
192 |
193 | class DownloadTest(unittest.TestCase):
194 | pass
195 |
196 |
197 | if __name__ == "__main__":
198 | unittest.main()
199 |
--------------------------------------------------------------------------------