├── scrapyz
│   ├── __init__.py
│   ├── examples
│   │   ├── __init__.py
│   │   └── reddit_spider.py
│   ├── test
│   │   ├── __init__.py
│   │   ├── util.py
│   │   ├── spiders.py
│   │   ├── basic_some_missing.html
│   │   ├── basic_parse.html
│   │   └── test_generic_spider.py
│   ├── util.py
│   ├── pipelines.py
│   ├── settings.py
│   └── core.py
├── setup.cfg
├── scrapy.cfg
├── .gitignore
├── setup.py
└── README.md
/scrapyz/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | default = scrapyz.settings
--------------------------------------------------------------------------------
/scrapyz/examples/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'shane'
2 |
--------------------------------------------------------------------------------
/scrapyz/test/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'shane'
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | build
3 | dist
4 | Include
5 | Lib
6 | Scripts
7 | .idea
8 | scrapyz/.idea
9 | *.egg-info
10 |
--------------------------------------------------------------------------------
/scrapyz/test/util.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from scrapy.http import HtmlResponse, Request
4 |
5 |
6 | def fake_response(file_name, url=None):
7 |     """Build an HtmlResponse from a local HTML fixture so spiders can be tested offline."""
8 |     if not url:
9 |         url = "http://www.test.com/offers"
10 |     path = os.path.join(os.path.dirname(__file__), file_name)
11 |     with open(path, 'r') as f:
12 |         return HtmlResponse(url=url, request=Request(url=url), body=f.read())
--------------------------------------------------------------------------------
/scrapyz/util.py:
--------------------------------------------------------------------------------
1 | from scrapy import Item
2 | from scrapy.http.request import Request
3 | from scrapy.utils.response import get_base_url
4 | import urlparse
5 |
6 |
7 | def gen_item(fields):
8 |     """Dynamically create a scrapy Item subclass from a dict of field name -> Field."""
9 |     return type("GenericItem", (Item,), fields)
10 | 
11 | 
12 | def gen_request(url, callback, item=None):
13 |     """Create a Request, optionally carrying a partially built item in request.meta."""
14 |     r = Request(url, callback=callback)
15 |     if item:
16 |         r.meta['item'] = item
17 |     return r
18 | 
19 | 
20 | def absolute_url(link, response):
21 |     """Processor: resolve an extracted (possibly relative) link against the response's base url."""
22 |     if isinstance(link, list):
23 |         link = link[0] if len(link) else ""
24 |     return urlparse.urljoin(get_base_url(response), link)
25 | 
26 | 
27 | def nth(n):
28 |     """Return a processor that picks the n-th value from an extracted list."""
29 |     def processor(field, response):
30 |         return field[n]
31 |     return processor
32 | 
33 | 
34 | def strip(field, response):
35 |     """Processor: whitespace-strip the first extracted value."""
36 |     if isinstance(field, list) and field:
37 |         field = field[0]
38 |     return field.strip()
39 | 
--------------------------------------------------------------------------------
/scrapyz/test/spiders.py:
--------------------------------------------------------------------------------
1 | from scrapyz.core import GenericSpider, CssTarget
2 | from scrapyz.util import nth, strip, absolute_url
3 |
4 |
5 | class BasicParseFirstElementTestSpider(GenericSpider):
6 | name = "test"
7 | start_urls = ["http://www.test.com"]
8 |
9 | class Meta:
10 | items = CssTarget("items", ".offer")
11 | targets = [
12 | CssTarget("title", ".title::text", [nth(0), strip]),
13 | CssTarget("discount", ".discount::text", [nth(0), strip]),
14 | CssTarget("disclaimer", ".disclaimer::text", [nth(0), strip]),
15 | CssTarget("offer_url", ".offer_url::attr(href)", [nth(0), strip, absolute_url]),
16 | CssTarget("image_url", ".image::attr(src)", [nth(0), strip])
17 | ]
18 |
19 | class BasicParseTestSpider(GenericSpider):
20 | name = "test"
21 | start_urls = ["http://www.test.com"]
22 |
23 | class Meta:
24 | items = CssTarget("items", ".offer")
25 | targets = [
26 | CssTarget("title", ".title::text"),
27 | CssTarget("discount", ".discount::text"),
28 | CssTarget("disclaimer", ".disclaimer::text"),
29 | CssTarget("offer_url", ".offer_url::attr(href)"),
30 |             CssTarget("image_url", ".image::attr(src)")
31 | ]
32 |
33 |
34 | class NoMetaSpider(GenericSpider):
35 | pass
36 |
37 |
38 | class NoStartSpider(GenericSpider):
39 | name = "nostart"
40 |
41 | class Meta:
42 | pass
43 |
44 |
45 | class GoodSpider(GenericSpider):
46 | name = "good"
47 | start_urls = []
48 |
49 | class Meta:
50 | items = ""
51 | targets = []
52 |
--------------------------------------------------------------------------------
/scrapyz/test/basic_some_missing.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
21 |
59 |
60 |
--------------------------------------------------------------------------------
/scrapyz/test/basic_parse.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
21 |
65 |
66 |
--------------------------------------------------------------------------------
/scrapyz/pipelines.py:
--------------------------------------------------------------------------------
1 | import functools
2 | from scrapy.exceptions import DropItem
3 |
4 | """
5 | Helper Functions
6 | """
7 |
8 |
9 | def check_spider_pipelines(process_item_method):
10 | """
11 |     Runs the wrapped process_item only when the spider has opted into this pipeline via its pipelines attribute; otherwise the item passes through untouched.
12 | """
13 |
14 | @functools.wraps(process_item_method)
15 | def wrapper(self, item, spider):
16 | if hasattr(spider, "pipelines") and self.__class__ in spider.pipelines:
17 | return process_item_method(self, item, spider)
18 | else:
19 | return item
20 |
21 | return wrapper
22 |
23 | def get_scrapyz_pipelines():
24 | return {
25 | 'scrapyz.pipelines.RequiredFields': 300,
26 | 'scrapyz.pipelines.MinTargets': 300,
27 | }
28 |
29 | """
30 | Pipeline Classes
31 | """
32 |
33 |
34 | class FilterBase(object):
35 | """
36 | Abstract class. Override the validate function to suit your needs.
37 | """
38 |
39 | @check_spider_pipelines
40 | def process_item(self, item, spider):
41 | if self.validate(item, spider):
42 | return item
43 | raise DropItem("Item failed in filter pipeline.")
44 |
45 | def validate(self, item, spider):
46 | """
47 | Override this function to return true if an item passes the filter and false otherwise.
48 | You can use attributes on the spider or the item for your filtering.
49 | """
50 | return item
51 |
52 |
53 | class RequiredFields(FilterBase):
54 | """
55 | Requires the spider to implement Meta.required_fields. Drops any item that doesn't have a value for each
56 | required field.
57 | """
58 |
59 | def validate(self, item, spider):
60 | missing = []
61 | for field in spider.Meta.required_fields:
62 | if field not in item:
63 | missing.append(field)
64 | return not missing
65 |
66 |
67 | class MinTargets(FilterBase):
68 | """
69 |     Requires the spider to implement Meta.min_targets. Drops any item that extracted fewer than min_targets values.
70 | """
71 |
72 | def validate(self, item, spider):
73 | return len(item.keys()) >= spider.Meta.min_targets
74 |
75 |
--------------------------------------------------------------------------------
/scrapyz/examples/reddit_spider.py:
--------------------------------------------------------------------------------
1 | from scrapyz.core import *
2 | from scrapyz.pipelines import RequiredFields
3 | from scrapyz.util import absolute_url
4 |
5 |
6 | def join(value, response):
7 | if isinstance(value, (list, tuple)):
8 | value = " ".join(value)
9 | return value
10 |
11 |
12 | class RedditSpider(GenericSpider):
13 | name = "reddit"
14 | start_urls = ["https://www.reddit.com/"]
15 |
16 | class Meta:
17 | items = CssTarget("items", ".thing")
18 | targets = [
19 | CssTarget("rank", ".rank::text"),
20 | CssTarget("upvoted", ".upvoted::text"),
21 | CssTarget("dislikes", ".dislikes::text"),
22 | CssTarget("likes", ".likes::text"),
23 | CssTarget("title", "a.title::text"),
24 | CssTarget("domain", ".domain > a::text"),
25 | CssTarget("datetime", ".tagline > time::attr(datetime)"),
26 | CssTarget("author", ".tagline > .author::text"),
27 | CssTarget("subreddit", ".tagline > .subreddit::text"),
28 | CssTarget("comments", ".comments::text")
29 | ]
30 |
31 |
32 | class RedditSpider2(IndexDetailSpider):
33 | name = "reddit2"
34 | start_urls = ["https://www.reddit.com/"]
35 |
36 | class Meta:
37 | detail_path = [
38 | CssTarget("detail_path", ".title > a::attr(href)", [absolute_url])
39 | ]
40 | detail_targets = [
41 | CssTarget("content", ".usertext-body > div > p::text", [join]),
42 | ]
43 | items = CssTarget("items", ".thing")
44 | targets = [
45 | CssTarget("rank", ".rank::text"),
46 | CssTarget("upvoted", ".upvoted::text"),
47 | CssTarget("dislikes", ".dislikes::text"),
48 | CssTarget("likes", ".likes::text"),
49 | CssTarget("title", "a.title::text"),
50 | CssTarget("domain", ".domain > a::text"),
51 | CssTarget("datetime", ".tagline > time::attr(datetime)"),
52 | CssTarget("author", ".tagline > .author::text"),
53 | CssTarget("subreddit", ".tagline > .subreddit::text"),
54 | CssTarget("comments", ".comments::text")
55 | ]
56 |
--------------------------------------------------------------------------------
/scrapyz/settings.py:
--------------------------------------------------------------------------------
1 | from scrapyz import pipelines
2 |
3 | BOT_NAME = '{your_bot}'
4 |
5 | SPIDER_MODULES = ['scrapyz.examples']
6 | NEWSPIDER_MODULE = 'scrapyz.examples'
7 |
8 |
9 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
10 | #USER_AGENT = 'scrapyz (+http://www.yourdomain.com)'
11 |
12 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
13 | #CONCURRENT_REQUESTS=32
14 |
15 | # Configure a delay for requests for the same website (default: 0)
16 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
17 | # See also autothrottle settings and docs
18 | #DOWNLOAD_DELAY=3
19 | # The download delay setting will honor only one of:
20 | #CONCURRENT_REQUESTS_PER_DOMAIN=16
21 | #CONCURRENT_REQUESTS_PER_IP=16
22 |
23 | # Disable cookies (enabled by default)
24 | #COOKIES_ENABLED=False
25 |
26 | # Disable Telnet Console (enabled by default)
27 | #TELNETCONSOLE_ENABLED=False
28 |
29 | # Override the default request headers:
30 | #DEFAULT_REQUEST_HEADERS = {
31 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
32 | # 'Accept-Language': 'en',
33 | #}
34 |
35 | # Enable or disable spider middlewares
36 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
37 | #SPIDER_MIDDLEWARES = {
38 | # 'scrapyz.middlewares.MyCustomSpiderMiddleware': 543,
39 | #}
40 |
41 | # Enable or disable downloader middlewares
42 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
43 | #DOWNLOADER_MIDDLEWARES = {
44 | # 'scrapyz.middlewares.MyCustomDownloaderMiddleware': 543,
45 | #}
46 |
47 | # Enable or disable extensions
48 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
49 | #EXTENSIONS = {
50 | # 'scrapy.telnet.TelnetConsole': None,
51 | #}
52 |
53 | # Configure item pipelines
54 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
55 |
56 | ITEM_PIPELINES = pipelines.get_scrapyz_pipelines()
57 |
58 | # Enable and configure the AutoThrottle extension (disabled by default)
59 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
60 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay
61 | #AUTOTHROTTLE_ENABLED=True
62 | # The initial download delay
63 | #AUTOTHROTTLE_START_DELAY=5
64 | # The maximum download delay to be set in case of high latencies
65 | #AUTOTHROTTLE_MAX_DELAY=60
66 | # Enable showing throttling stats for every response received:
67 | #AUTOTHROTTLE_DEBUG=False
68 |
69 | # Enable and configure HTTP caching (disabled by default)
70 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
71 | #HTTPCACHE_ENABLED=True
72 | #HTTPCACHE_EXPIRATION_SECS=0
73 | #HTTPCACHE_DIR='httpcache'
74 | #HTTPCACHE_IGNORE_HTTP_CODES=[]
75 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
76 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """A setuptools based setup module.
2 | See:
3 | https://packaging.python.org/en/latest/distributing.html
4 | https://github.com/pypa/sampleproject
5 | """
6 |
7 | # Always prefer setuptools over distutils
8 | from setuptools import setup, find_packages
9 | # To use a consistent encoding
10 | from codecs import open
11 | from os import path
12 |
13 | here = path.abspath(path.dirname(__file__))
14 |
15 | # Get the long description from the relevant file
16 | # with open(path.join(here, 'DESCRIPTION.rst'), encoding='utf-8') as f:
17 | # long_description = f.read()
18 |
19 | setup(
20 | name='scrapyz',
21 |
22 | # Versions should comply with PEP440. For a discussion on single-sourcing
23 | # the version across setup.py and the project code, see
24 | # https://packaging.python.org/en/latest/single_source_version.html
25 | version='0.3.2',
26 |
27 | description='Scrape Easy',
28 | long_description='Scrapyz is a scrapy extension.',
29 |
30 | # The project's main homepage.
31 | url='https://github.com/ssteuteville/scrapyz',
32 |
33 | # Author details
34 |     author='ssteuteville',
35 | author_email='ssteuteville@gmail.com',
36 |
37 | # Choose your license
38 | license='MIT',
39 |
40 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
41 | classifiers=[
42 | # How mature is this project? Common values are
43 | # 3 - Alpha
44 | # 4 - Beta
45 | # 5 - Production/Stable
46 | 'Development Status :: 3 - Alpha',
47 |
48 | # Indicate who your project is intended for
49 | 'Intended Audience :: Developers',
50 | 'Topic :: Software Development :: Build Tools',
51 |
52 | # Pick your license as you wish (should match "license" above)
53 | 'License :: OSI Approved :: MIT License',
54 |
55 | # Specify the Python versions you support here. In particular, ensure
56 | # that you indicate whether you support Python 2, Python 3 or both.
57 | 'Programming Language :: Python :: 2.7',
58 |
59 | ],
60 |
61 | # What does your project relate to?
62 | keywords='scrapy scraping web extraction',
63 |
64 | # You can just specify the packages manually here if your project is
65 | # simple. Or you can use find_packages().
66 | packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
67 |
68 | # List run-time dependencies here. These will be installed by pip when
69 | # your project is installed. For an analysis of "install_requires" vs pip's
70 | # requirements files see:
71 | # https://packaging.python.org/en/latest/requirements.html
72 | install_requires=['scrapy'],
73 |
74 | # List additional groups of dependencies here (e.g. development
75 | # dependencies). You can install these using the following syntax,
76 | # for example:
77 | # $ pip install -e .[dev,test]
78 | extras_require={
79 | # 'dev': ['check-manifest'],
80 | # 'test': ['coverage'],
81 | },
82 |
83 | # If there are data files included in your packages that need to be
84 | # installed, specify them here. If using Python 2.6 or less, then these
85 | # have to be included in MANIFEST.in as well.
86 | package_data={
87 | # 'sample': ['package_data.dat'],
88 | },
89 |
90 | # Although 'package_data' is the preferred approach, in some case you may
91 | # need to place data files outside of your packages. See:
92 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
93 | # In this case, 'data_file' will be installed into '/my_data'
94 | data_files=[
95 | # ('my_data', ['data/data_file'])
96 | ],
97 |
98 | # To provide executable scripts, use entry points in preference to the
99 | # "scripts" keyword. Entry points provide cross-platform support and allow
100 | # pip to create the appropriate form of executable for the target platform.
101 | entry_points={
102 | # 'console_scripts': [
103 | # 'sample=sample:main',
104 | # ],
105 | },
106 | )
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Scrapyz
3 | ##### Scrapyz ("scrape easy") is an extension for the Python web scraping framework Scrapy. Its aim is to cut down on the amount of code needed to write simple Scrapy spiders.
4 | ---
5 | ### Installation:
6 |
7 | pip install scrapyz
8 |
9 | ![PyPI version](https://img.shields.io/pypi/v/scrapyz.svg)
10 | ![PyPI status](https://img.shields.io/pypi/status/scrapyz.svg)
11 | ![Python versions](https://img.shields.io/pypi/pyversions/scrapyz.svg)
12 | # Usage:
13 | ##### These examples apply to the current version released to PyPI. See the examples and tests for up-to-date usage, core.py for the target classes, and util.py for helper processors.
14 | To scrape items from a single page:
15 | ```python
16 |
17 | class RedditSpider(GenericSpider):
18 | name = "reddit"
19 | start_urls = ["https://www.reddit.com/"]
20 |
21 | class Meta:
22 | items = CssTarget("items", ".thing")
23 |         targets = [  # scrapyz also has XPathTarget and RegexTarget classes for extraction
24 | CssTarget("rank", ".rank::text"),
25 | CssTarget("upvoted", ".upvoted::text"),
26 | CssTarget("dislikes", ".dislikes::text"),
27 | CssTarget("likes", ".likes::text"),
28 | CssTarget("title", "a.title::text"),
29 | CssTarget("domain", ".domain > a::text"),
30 | CssTarget("datetime", ".tagline > time::attr(datetime)"),
31 | CssTarget("author", ".tagline > .author::text"),
32 | CssTarget("subreddit", ".tagline > .subreddit::text"),
33 | CssTarget("comments", ".comments::text")
34 | ]
35 | ```
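
Once defined, this runs like any other Scrapy spider (for example `scrapy crawl reddit -o reddit.json` inside a project). Below is a minimal sketch of running it programmatically instead; the runner script is not part of this package, it simply imports the example spider and uses Scrapy's standard `CrawlerProcess` with the feed-export settings.

```python
# Hypothetical runner script: crawl RedditSpider and dump items to reddit.json.
from scrapy.crawler import CrawlerProcess

from scrapyz.examples.reddit_spider import RedditSpider

process = CrawlerProcess({
    "FEED_FORMAT": "json",    # standard Scrapy feed-export settings
    "FEED_URI": "reddit.json",
})
process.crawl(RedditSpider)   # pass the spider class, not an instance
process.start()               # blocks until the crawl finishes
```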
36 |
37 | To scrape data from an index page, then follow a link from each item and collect more data from its detail page:
38 | ```python
39 |
40 | class RedditSpider2(IndexDetailSpider):
41 | name = "reddit2"
42 | start_urls = ["https://www.reddit.com/"]
43 |
44 | class Meta:
45 | detail_path = [CssTarget("detail_path", ".title > a::attr(href)", [absolute_url])]
46 | detail_targets = [
47 | CssTarget("content", ".usertext-body > div > p::text", [join]),
48 | ]
49 | items = CssTarget("items", ".thing")
50 | targets = [
51 | CssTarget("rank", ".rank::text"),
52 | CssTarget("upvoted", ".upvoted::text"),
53 | CssTarget("dislikes", ".dislikes::text"),
54 | CssTarget("likes", ".likes::text"),
55 | CssTarget("title", "a.title::text"),
56 | CssTarget("domain", ".domain > a::text"),
57 | CssTarget("datetime", ".tagline > time::attr(datetime)"),
58 | CssTarget("author", ".tagline > .author::text"),
59 | CssTarget("subreddit", ".tagline > .subreddit::text"),
60 | CssTarget("comments", ".comments::text")
61 | ]
62 | ```
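
The `join` processor used in `detail_targets` above is not part of scrapyz itself; it is the small helper defined next to this example in `scrapyz/examples/reddit_spider.py`:

```python
def join(value, response):
    # Target processors receive the extracted value plus the response.
    # This one collapses a list of text nodes into one space-separated string.
    if isinstance(value, (list, tuple)):
        value = " ".join(value)
    return value
```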
63 |
64 | To attach pipelines to a spider:
65 | ```python
66 | class RedditSpider(GenericSpider):
67 | name = "reddit"
68 | start_urls = ["https://www.reddit.com/"]
69 | pipelines = [scrapyz.pipelines.RequiredFields]
70 |
71 | class Meta:
72 | items = CssTarget("items", ".thing")
73 | required_fields = ["rank", "author", "domain", "comments"]
74 | targets = [
75 | CssTarget("rank", ".rank::text"),
76 | CssTarget("upvoted", ".upvoted::text"),
77 | CssTarget("dislikes", ".dislikes::text"),
78 | CssTarget("likes", ".likes::text"),
79 | CssTarget("title", "a.title::text"),
80 | CssTarget("domain", ".domain > a::text"),
81 | CssTarget("datetime", ".tagline > time::attr(datetime)"),
82 | CssTarget("author", ".tagline > .author::text"),
83 | CssTarget("subreddit", ".tagline > .subreddit::text"),
84 | CssTarget("comments", ".comments::text")
85 | ]
86 | ```
87 | To enable all of the scrapyz pipelines in your project, add this to the bottom of your project's settings.py (this assumes `from scrapyz import pipelines` at the top of the file, as in scrapyz/settings.py, and an existing ITEM_PIPELINES dict to update):
88 | ```python
89 | ITEM_PIPELINES.update(pipelines.get_scrapyz_pipelines())
90 | ```
91 | Note: a scrapyz pipeline only processes items from spiders that list it in a `pipelines` attribute and define the Meta fields it needs (`required_fields` for RequiredFields, `min_targets` for MinTargets). There is no separate documentation yet; see the code and comments.
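
For example, MinTargets drops any item that extracted fewer than `Meta.min_targets` values. A sketch of opting in (the spider name and selectors here are illustrative; the attribute names mirror how the test suite configures this pipeline in `scrapyz/test/test_generic_spider.py`):

```python
import scrapyz.pipelines
from scrapyz.core import CssTarget, GenericSpider


class SparseRedditSpider(GenericSpider):
    name = "reddit_sparse"
    start_urls = ["https://www.reddit.com/"]
    pipelines = [scrapyz.pipelines.MinTargets]  # opt this spider into MinTargets only

    class Meta:
        items = CssTarget("items", ".thing")
        min_targets = 3  # drop items that filled fewer than 3 of the targets below
        targets = [
            CssTarget("rank", ".rank::text"),
            CssTarget("title", "a.title::text"),
            CssTarget("author", ".tagline > .author::text"),
            CssTarget("comments", ".comments::text"),
        ]
```
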
92 | Contribution
93 | -----------
94 | Please feel free to submit pull requests or open issues. If you have a feature request, create an issue with the label "Feature Request".
95 |
--------------------------------------------------------------------------------
/scrapyz/core.py:
--------------------------------------------------------------------------------
1 | from scrapy.loader.processors import TakeFirst
2 | from scrapy.spiders import Spider
3 | from scrapy.http import Request
4 | from scrapy.selector import Selector
5 | from scrapy.item import Field
6 | from scrapy.loader import ItemLoader
7 | from scrapyz.util import gen_item, gen_request
8 |
9 |
10 | class GenericSpider(Spider):
11 |
12 | def __init__(self, name=None, **kwargs):
13 | if not hasattr(self, "Meta"):
14 | raise AttributeError("GenericSpider must implement a Meta inner class.")
15 | if not hasattr(self, "start_urls"):
16 | raise AttributeError("Generic spider must implement start_urls.")
17 |
18 | super(GenericSpider, self).__init__(name, **kwargs)
19 | self.item_class = self.get_item_class()
20 | self._items = None
21 |
22 | def start_requests(self):
23 | for url in self.start_urls:
24 | yield Request(url=url, callback=self.get_parse())
25 |
26 | def parse(self, response):
27 | for item in self.find_items(response):
28 | loader = ItemLoader(item=self.item_class())
29 | for target in self.get_targets():
30 | loader.add_value(target.name, target.get_value(item, response))
31 | yield loader.load_item()
32 |
33 |     def find_items(self, response):
34 |         # Re-select item nodes for each response rather than caching them on the
35 |         # spider, so spiders with multiple start_urls don't reuse the first page's matches.
36 |         return self.Meta.items.select(Selector(response), extract=False)
37 |
38 | def get_targets(self):
39 | return self.Meta.targets
40 |
41 | def get_parse(self):
42 | return self.parse
43 |
44 | def get_item_class(self):
45 | return gen_item(self.gen_fields())
46 |
47 | def gen_fields(self):
48 | fields = {target.name: target.field_class(output_processor=TakeFirst()) for target in self.get_targets()}
49 | if hasattr(self.Meta, "extra_fields"):
50 | fields.update(self.Meta.extra_fields)
51 | return fields
52 |
53 |
54 | class IndexDetailSpider(GenericSpider):
55 |
56 | def __init__(self, name=None, **kwargs):
57 | super(IndexDetailSpider, self).__init__(name, **kwargs)
58 |
59 | if not hasattr(self.Meta, "detail_path"):
60 | raise AttributeError("IndexDetailSpider's Meta class must implement detail_path")
61 | if not hasattr(self.Meta, "detail_targets"):
62 | raise AttributeError("IndexDetailSpider's Meta class must implement detail_targets")
63 |
64 | def parse(self, response):
65 | for item in self.find_items(response):
66 | loader = ItemLoader(item=self.item_class())
67 | for target in self.get_targets():
68 | loader.add_value(target.name, target.get_value(item, response))
69 |
70 | for target in self.Meta.detail_path:
71 | val = target.get_value(item, response)
72 | yield gen_request(val, self.parse_details, loader.load_item())
73 |
74 | def parse_details(self, response):
75 | dom = Selector(response)
76 | loader = ItemLoader(item=response.meta['item'])
77 | for target in self.Meta.detail_targets:
78 | loader.add_value(target.name, target.get_value(dom, response))
79 | yield loader.load_item()
80 |
81 | def gen_fields(self):
82 | fields = super(IndexDetailSpider, self).gen_fields()
83 | fields.update({target.name: target.field_class(output_processor=TakeFirst()) for target in self.Meta.detail_targets})
84 | return fields
85 |
86 |
87 | class Target(object):
88 |
89 | def __init__(self, name, path, processors=None, field_class=Field):
90 | self.name = name
91 | self.path = path
92 | self.processors = processors if processors else []
93 | self.field_class = field_class
94 |
95 | def get_value(self, selector, response):
96 | if isinstance(self.path, (list, tuple)):
97 |             return self.process(" ".join(v for p in self.path for v in selector.css(p).extract()), response)
98 | return self.process(self.select(selector), response)
99 |
100 | def process(self, value, response):
101 | for processor in self.processors:
102 | value = processor(value, response)
103 | return value
104 |
105 | def select(self, selector, extract=False):
106 | raise NotImplementedError("Target is meant as a base class. Use CssTarget, RegexTarget,"
107 | " or XPathTarget instead.")
108 |
109 |
110 | class RegexTarget(Target):
111 |
112 | def select(self, selector, extract=True):
113 | """
114 |         The extract flag has no effect: Selector.re() always returns extracted strings.
115 | """
116 | return selector.re(self.path)
117 |
118 |
119 | class XPathTarget(Target):
120 |
121 | def select(self, selector, extract=True):
122 | sel = selector.xpath(self.path)
123 | return sel.extract() if extract else sel
124 |
125 |
126 | class CssTarget(Target):
127 |
128 | def select(self, selector, extract=True):
129 | sel = selector.css(self.path)
130 | return sel.extract() if extract else sel
131 |
--------------------------------------------------------------------------------
/scrapyz/test/test_generic_spider.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from scrapy.exceptions import DropItem
3 | from scrapyz.pipelines import RequiredFields, MinTargets
4 | from util import fake_response
5 | from spiders import *
6 |
7 |
8 | class TestSpiders(unittest.TestCase):
9 | """
10 | Tests the basic functionality of GenericSpider.
11 | """
12 | expected_items = [
13 | {
14 | 'disclaimer': u'Disclaimer One',
15 | 'discount': u'Discount One',
16 | 'image_url': u'Image One',
17 | 'offer_url': 'http://www.test.com/offer_1',
18 | 'title': u'Title One'
19 | },
20 | {
21 | 'disclaimer': u'Disclaimer Two',
22 | 'discount': u'Discount Two',
23 | 'image_url': u'Image Two',
24 | 'offer_url': 'http://www.test.com/offer_2',
25 | 'title': u'Title Two'
26 | },
27 | {
28 | 'disclaimer': u'Disclaimer Three',
29 | 'discount': u'Discount Three',
30 | 'image_url': u'Image Three',
31 | 'offer_url': 'http://www.test.com/offer_3',
32 | 'title': u'Title Three'
33 | }
34 | ]
35 |
36 | """
37 | Test a full parse on a static html document.
38 | """
39 | def test_basic_parse(self):
40 | spider = BasicParseFirstElementTestSpider()
41 | response = fake_response("basic_parse.html")
42 | results = [item for item in spider.parse(response)]
43 | self.assertEqual(len(results), 3)
44 | for result, expected in zip(results, self.expected_items):
45 | for key in result.keys():
46 | self.assertEqual(result[key], expected[key])
47 |
48 | """
49 | Test GenericSpider's helper function.
50 | """
51 | def test_start_requests(self):
52 | spider = BasicParseFirstElementTestSpider()
53 | spider.start_urls = ["http://abc.com", "http://123.com", "http://abc.com"]
54 | for i, request in enumerate(spider.start_requests()):
55 | self.assertEqual(request.url, spider.start_urls[i])
56 |
57 | """
58 | Test that the proper exceptions are raised in the right situations.
59 | """
60 | def test_bad_spider(self):
61 | classes = [NoMetaSpider, NoStartSpider]
62 | for cls in classes:
63 | with self.assertRaises(AttributeError):
64 | spider = cls()
65 |
66 | def test_good_spider(self):
67 | try:
68 | spider = GoodSpider()
69 | except Exception:
70 | self.fail()
71 |
72 |
73 | class TestPipelines(unittest.TestCase):
74 | """
75 | Test that pipelines.RequiredFields functions properly
76 | """
77 | def test_required_fields_fail(self):
78 | spider = BasicParseTestSpider()
79 | spider.Meta.required_fields = ["disclaimer", "discount", "image_url", "offer_url", "title"]
80 | spider.pipelines = [RequiredFields]
81 | pipeline = RequiredFields()
82 | response = fake_response("basic_some_missing.html")
83 | with self.assertRaises(DropItem):
84 | for item in spider.parse(response):
85 | pipeline.process_item(item, spider)
86 |
87 |
88 | def test_required_fields_success(self):
89 | spider = BasicParseTestSpider()
90 | spider.Meta.required_fields = ["disclaimer", "discount", "image_url", "offer_url", "title"]
91 | spider.pipelines = [RequiredFields]
92 | response = fake_response("basic_parse.html")
93 | pipeline = RequiredFields()
94 | try:
95 | results = [pipeline.process_item(item, spider) for item in spider.parse(response)]
96 | except DropItem:
97 |             self.fail("RequiredFields dropped an item that had all required fields.")
98 | self.assertEqual(len(results), 3)
99 |
100 | def test_required_fields_attribute_exception(self):
101 | spider = BasicParseTestSpider()
102 | spider.pipelines = [RequiredFields]
103 | pipeline = RequiredFields()
104 | response = fake_response("basic_parse.html")
105 | with self.assertRaises(AttributeError):
106 | results = [pipeline.process_item(item, spider) for item in spider.parse(response)]
107 |
108 | def test_min_target_fail(self):
109 | spider = BasicParseTestSpider()
110 | spider.pipelines = [MinTargets]
111 | spider.Meta.min_targets = 4
112 | pipeline = MinTargets()
113 | response = fake_response("basic_some_missing.html")
114 | with self.assertRaises(DropItem):
115 | results = [pipeline.process_item(item, spider) for item in spider.parse(response)]
116 |
117 | def test_min_target_success(self):
118 | spider = BasicParseTestSpider()
119 | spider.pipelines = [MinTargets]
120 | spider.Meta.min_targets = 4
121 | pipeline = MinTargets()
122 | response = fake_response("basic_parse.html")
123 | try:
124 | results = [pipeline.process_item(item, spider) for item in spider.parse(response)]
125 | except DropItem:
126 |             self.fail("MinTargets dropped an item when it shouldn't have.")
127 |
128 | class TestUtil(unittest.TestCase):
129 | pass
130 |
--------------------------------------------------------------------------------