├── scrapyz
│   ├── __init__.py
│   ├── examples
│   │   ├── __init__.py
│   │   └── reddit_spider.py
│   ├── test
│   │   ├── __init__.py
│   │   ├── util.py
│   │   ├── spiders.py
│   │   ├── basic_some_missing.html
│   │   ├── basic_parse.html
│   │   └── test_generic_spider.py
│   ├── util.py
│   ├── pipelines.py
│   ├── settings.py
│   └── core.py
├── setup.cfg
├── scrapy.cfg
├── .gitignore
├── setup.py
└── README.md

/scrapyz/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | default = scrapyz.settings
--------------------------------------------------------------------------------
/scrapyz/examples/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'shane'
2 | 
--------------------------------------------------------------------------------
/scrapyz/test/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'shane'
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | build
3 | dist
4 | Include
5 | Lib
6 | Scripts
7 | .idea
8 | scrapyz/.idea
9 | *.egg-info
10 | 
--------------------------------------------------------------------------------
/scrapyz/test/util.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from scrapy.http import HtmlResponse, Request
4 | 
5 | 
6 | def fake_response(file, url=None):
7 |     if not url:
8 |         url = "http://www.test.com/offers"
9 |     file = os.path.join(os.path.dirname(__file__), file)
10 |     return HtmlResponse(url=url, request=Request(url=url), body=open(file, 'r').read())
--------------------------------------------------------------------------------
/scrapyz/util.py:
--------------------------------------------------------------------------------
1 | from scrapy import Item
2 | from scrapy.http.request import Request
3 | from scrapy.utils.response import get_base_url
4 | import urlparse
5 | 
6 | 
7 | def gen_item(fields):
8 |     return type("GenericItem", (Item,), fields)
9 | 
10 | 
11 | def gen_request(url, callback, item=None):
12 |     r = Request(url, callback=callback)
13 |     if item:
14 |         r.meta['item'] = item
15 |     return r
16 | 
17 | 
18 | def absolute_url(link, response):
19 |     if isinstance(link, list):
20 |         link = link[0] if len(link) else ""
21 |     return urlparse.urljoin(get_base_url(response), link)
22 | 
23 | # Processors: each takes (value, response) and returns the processed value.
24 | def nth(n):
25 |     def processor(field, response):
26 |         return field[n]
27 |     return processor
28 | 
29 | 
30 | def strip(field, response):
31 |     if isinstance(field, list) and field:
32 |         field = field[0]
33 |     return field.strip()
34 | 
--------------------------------------------------------------------------------
/scrapyz/test/spiders.py:
--------------------------------------------------------------------------------
1 | from scrapyz.core import GenericSpider, CssTarget
2 | from scrapyz.util import nth, strip, absolute_url
3 | 
4 | 
5 | class BasicParseFirstElementTestSpider(GenericSpider):
6 |     name = "test"
7 |     start_urls = ["http://www.test.com"]
8 | 
9 |     class Meta:
CssTarget("items", ".offer") 11 | targets = [ 12 | CssTarget("title", ".title::text", [nth(0), strip]), 13 | CssTarget("discount", ".discount::text", [nth(0), strip]), 14 | CssTarget("disclaimer", ".disclaimer::text", [nth(0), strip]), 15 | CssTarget("offer_url", ".offer_url::attr(href)", [nth(0), strip, absolute_url]), 16 | CssTarget("image_url", ".image::attr(src)", [nth(0), strip]) 17 | ] 18 | 19 | class BasicParseTestSpider(GenericSpider): 20 | name = "test" 21 | start_urls = ["http://www.test.com"] 22 | 23 | class Meta: 24 | items = CssTarget("items", ".offer") 25 | targets = [ 26 | CssTarget("title", ".title::text"), 27 | CssTarget("discount", ".discount::text"), 28 | CssTarget("disclaimer", ".disclaimer::text"), 29 | CssTarget("offer_url", ".offer_url::attr(href)"), 30 | CssTarget("image_url", ".image::attr(src)",) 31 | ] 32 | 33 | 34 | class NoMetaSpider(GenericSpider): 35 | pass 36 | 37 | 38 | class NoStartSpider(GenericSpider): 39 | name = "nostart" 40 | 41 | class Meta: 42 | pass 43 | 44 | 45 | class GoodSpider(GenericSpider): 46 | name = "good" 47 | start_urls = [] 48 | 49 | class Meta: 50 | items = "" 51 | targets = [] 52 | -------------------------------------------------------------------------------- /scrapyz/test/basic_some_missing.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 21 | 59 | 60 | -------------------------------------------------------------------------------- /scrapyz/test/basic_parse.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 21 | 65 | 66 | -------------------------------------------------------------------------------- /scrapyz/pipelines.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from scrapy.exceptions import DropItem 3 | 4 | """ 5 | Helper Functions 6 | """ 7 | 8 | 9 | def check_spider_pipelines(process_item_method): 10 | """ 11 | I can't remember where I found this but 12 | """ 13 | 14 | @functools.wraps(process_item_method) 15 | def wrapper(self, item, spider): 16 | if hasattr(spider, "pipelines") and self.__class__ in spider.pipelines: 17 | return process_item_method(self, item, spider) 18 | else: 19 | return item 20 | 21 | return wrapper 22 | 23 | def get_scrapyz_pipelines(): 24 | return { 25 | 'scrapyz.pipelines.RequiredFields': 300, 26 | 'scrapyz.pipelines.MinTargets': 300, 27 | } 28 | 29 | """ 30 | Pipeline Classes 31 | """ 32 | 33 | 34 | class FilterBase(object): 35 | """ 36 | Abstract class. Override the validate function to suit your needs. 37 | """ 38 | 39 | @check_spider_pipelines 40 | def process_item(self, item, spider): 41 | if self.validate(item, spider): 42 | return item 43 | raise DropItem("Item failed in filter pipeline.") 44 | 45 | def validate(self, item, spider): 46 | """ 47 | Override this function to return true if an item passes the filter and false otherwise. 48 | You can use attributes on the spider or the item for your filtering. 49 | """ 50 | return item 51 | 52 | 53 | class RequiredFields(FilterBase): 54 | """ 55 | Requires the spider to implement Meta.required_fields. Drops any item that doesn't have a value for each 56 | required field. 
57 | """ 58 | 59 | def validate(self, item, spider): 60 | missing = [] 61 | for field in spider.Meta.required_fields: 62 | if field not in item: 63 | missing.append(field) 64 | return not missing 65 | 66 | 67 | class MinTargets(FilterBase): 68 | """ 69 | Filters out items that weren't able to hit a minimum number for Targets. 70 | """ 71 | 72 | def validate(self, item, spider): 73 | return len(item.keys()) >= spider.Meta.min_targets 74 | 75 | -------------------------------------------------------------------------------- /scrapyz/examples/reddit_spider.py: -------------------------------------------------------------------------------- 1 | from scrapyz.core import * 2 | from scrapyz.pipelines import RequiredFields 3 | from scrapyz.util import absolute_url 4 | 5 | 6 | def join(value, response): 7 | if isinstance(value, (list, tuple)): 8 | value = " ".join(value) 9 | return value 10 | 11 | 12 | class RedditSpider(GenericSpider): 13 | name = "reddit" 14 | start_urls = ["https://www.reddit.com/"] 15 | 16 | class Meta: 17 | items = CssTarget("items", ".thing") 18 | targets = [ 19 | CssTarget("rank", ".rank::text"), 20 | CssTarget("upvoted", ".upvoted::text"), 21 | CssTarget("dislikes", ".dislikes::text"), 22 | CssTarget("likes", ".likes::text"), 23 | CssTarget("title", "a.title::text"), 24 | CssTarget("domain", ".domain > a::text"), 25 | CssTarget("datetime", ".tagline > time::attr(datetime)"), 26 | CssTarget("author", ".tagline > .author::text"), 27 | CssTarget("subreddit", ".tagline > .subreddit::text"), 28 | CssTarget("comments", ".comments::text") 29 | ] 30 | 31 | 32 | class RedditSpider2(IndexDetailSpider): 33 | name = "reddit2" 34 | start_urls = ["https://www.reddit.com/"] 35 | 36 | class Meta: 37 | detail_path = [ 38 | CssTarget("detail_path", ".title > a::attr(href)", [absolute_url]) 39 | ] 40 | detail_targets = [ 41 | CssTarget("content", ".usertext-body > div > p::text", [join]), 42 | ] 43 | items = CssTarget("items", ".thing") 44 | targets = [ 45 | CssTarget("rank", ".rank::text"), 46 | CssTarget("upvoted", ".upvoted::text"), 47 | CssTarget("dislikes", ".dislikes::text"), 48 | CssTarget("likes", ".likes::text"), 49 | CssTarget("title", "a.title::text"), 50 | CssTarget("domain", ".domain > a::text"), 51 | CssTarget("datetime", ".tagline > time::attr(datetime)"), 52 | CssTarget("author", ".tagline > .author::text"), 53 | CssTarget("subreddit", ".tagline > .subreddit::text"), 54 | CssTarget("comments", ".comments::text") 55 | ] 56 | -------------------------------------------------------------------------------- /scrapyz/settings.py: -------------------------------------------------------------------------------- 1 | from scrapyz import pipelines 2 | 3 | BOT_NAME = '{your_bot}' 4 | 5 | SPIDER_MODULES = ['scrapyz.examples'] 6 | NEWSPIDER_MODULE = 'scrapyz.examples' 7 | 8 | 9 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 10 | #USER_AGENT = 'scrapyz (+http://www.yourdomain.com)' 11 | 12 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 13 | #CONCURRENT_REQUESTS=32 14 | 15 | # Configure a delay for requests for the same website (default: 0) 16 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 17 | # See also autothrottle settings and docs 18 | #DOWNLOAD_DELAY=3 19 | # The download delay setting will honor only one of: 20 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 21 | #CONCURRENT_REQUESTS_PER_IP=16 22 | 23 | # Disable cookies (enabled by default) 24 | #COOKIES_ENABLED=False 25 | 26 | # Disable Telnet 
Console (enabled by default) 27 | #TELNETCONSOLE_ENABLED=False 28 | 29 | # Override the default request headers: 30 | #DEFAULT_REQUEST_HEADERS = { 31 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 32 | # 'Accept-Language': 'en', 33 | #} 34 | 35 | # Enable or disable spider middlewares 36 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 37 | #SPIDER_MIDDLEWARES = { 38 | # 'scrapyz.middlewares.MyCustomSpiderMiddleware': 543, 39 | #} 40 | 41 | # Enable or disable downloader middlewares 42 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 43 | #DOWNLOADER_MIDDLEWARES = { 44 | # 'scrapyz.middlewares.MyCustomDownloaderMiddleware': 543, 45 | #} 46 | 47 | # Enable or disable extensions 48 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 49 | #EXTENSIONS = { 50 | # 'scrapy.telnet.TelnetConsole': None, 51 | #} 52 | 53 | # Configure item pipelines 54 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 55 | 56 | ITEM_PIPELINES = pipelines.get_scrapyz_pipelines() 57 | 58 | # Enable and configure the AutoThrottle extension (disabled by default) 59 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 60 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 61 | #AUTOTHROTTLE_ENABLED=True 62 | # The initial download delay 63 | #AUTOTHROTTLE_START_DELAY=5 64 | # The maximum download delay to be set in case of high latencies 65 | #AUTOTHROTTLE_MAX_DELAY=60 66 | # Enable showing throttling stats for every response received: 67 | #AUTOTHROTTLE_DEBUG=False 68 | 69 | # Enable and configure HTTP caching (disabled by default) 70 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 71 | #HTTPCACHE_ENABLED=True 72 | #HTTPCACHE_EXPIRATION_SECS=0 73 | #HTTPCACHE_DIR='httpcache' 74 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 75 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 76 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """A setuptools based setup module. 2 | See: 3 | https://packaging.python.org/en/latest/distributing.html 4 | https://github.com/pypa/sampleproject 5 | """ 6 | 7 | # Always prefer setuptools over distutils 8 | from setuptools import setup, find_packages 9 | # To use a consistent encoding 10 | from codecs import open 11 | from os import path 12 | 13 | here = path.abspath(path.dirname(__file__)) 14 | 15 | # Get the long description from the relevant file 16 | # with open(path.join(here, 'DESCRIPTION.rst'), encoding='utf-8') as f: 17 | # long_description = f.read() 18 | 19 | setup( 20 | name='scrapyz', 21 | 22 | # Versions should comply with PEP440. For a discussion on single-sourcing 23 | # the version across setup.py and the project code, see 24 | # https://packaging.python.org/en/latest/single_source_version.html 25 | version='0.3.2', 26 | 27 | description='Scrape Easy', 28 | long_description='Scrapyz is a scrapy extension.', 29 | 30 | # The project's main homepage. 31 | url='https://github.com/ssteuteville/scrapyz', 32 | 33 | # Author details 34 | author='ssteutevile', 35 | author_email='ssteuteville@gmail.com', 36 | 37 | # Choose your license 38 | license='MIT', 39 | 40 | # See https://pypi.python.org/pypi?%3Aaction=list_classifiers 41 | classifiers=[ 42 | # How mature is this project? 
Common values are 43 | # 3 - Alpha 44 | # 4 - Beta 45 | # 5 - Production/Stable 46 | 'Development Status :: 3 - Alpha', 47 | 48 | # Indicate who your project is intended for 49 | 'Intended Audience :: Developers', 50 | 'Topic :: Software Development :: Build Tools', 51 | 52 | # Pick your license as you wish (should match "license" above) 53 | 'License :: OSI Approved :: MIT License', 54 | 55 | # Specify the Python versions you support here. In particular, ensure 56 | # that you indicate whether you support Python 2, Python 3 or both. 57 | 'Programming Language :: Python :: 2.7', 58 | 59 | ], 60 | 61 | # What does your project relate to? 62 | keywords='scrapy scraping web extraction', 63 | 64 | # You can just specify the packages manually here if your project is 65 | # simple. Or you can use find_packages(). 66 | packages=find_packages(exclude=['contrib', 'docs', 'tests*']), 67 | 68 | # List run-time dependencies here. These will be installed by pip when 69 | # your project is installed. For an analysis of "install_requires" vs pip's 70 | # requirements files see: 71 | # https://packaging.python.org/en/latest/requirements.html 72 | install_requires=['scrapy'], 73 | 74 | # List additional groups of dependencies here (e.g. development 75 | # dependencies). You can install these using the following syntax, 76 | # for example: 77 | # $ pip install -e .[dev,test] 78 | extras_require={ 79 | # 'dev': ['check-manifest'], 80 | # 'test': ['coverage'], 81 | }, 82 | 83 | # If there are data files included in your packages that need to be 84 | # installed, specify them here. If using Python 2.6 or less, then these 85 | # have to be included in MANIFEST.in as well. 86 | package_data={ 87 | # 'sample': ['package_data.dat'], 88 | }, 89 | 90 | # Although 'package_data' is the preferred approach, in some case you may 91 | # need to place data files outside of your packages. See: 92 | # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa 93 | # In this case, 'data_file' will be installed into '/my_data' 94 | data_files=[ 95 | # ('my_data', ['data/data_file']) 96 | ], 97 | 98 | # To provide executable scripts, use entry points in preference to the 99 | # "scripts" keyword. Entry points provide cross-platform support and allow 100 | # pip to create the appropriate form of executable for the target platform. 101 | entry_points={ 102 | # 'console_scripts': [ 103 | # 'sample=sample:main', 104 | # ], 105 | }, 106 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Scrapyz 3 | ##### "scrape easy" is an extension for the python web scraping framework scrapy. The aim of this package is to cut down on the amount of code needed to create simple spiders with scrapy. 4 | --- 5 | ###Installation: 6 | 7 | pip install scrapyz 8 | 9 | [![PyPI version](https://img.shields.io/pypi/v/scrapyz.svg)](https://img.shields.io/pypi/v/scrapyz.svg) 10 | [![PyPi status](https://img.shields.io/pypi/status/scrapyz.svg)](https://img.shields.io/pypi/status/scrapyz.svg) 11 | [![PyPi python version](https://img.shields.io/pypi/pyversions/scrapyz.svg)](https://img.shields.io/pypi/pyversions/scrapyz.svg) 12 | # Usage: 13 | ##### These examples apply to the current version released to Pypi. See examples/tests for updated usage. See core.py for target classes and util.py for helpers. 
14 | For scraping items off a single page:
15 | ```python
16 | 
17 | class RedditSpider(GenericSpider):
18 |     name = "reddit"
19 |     start_urls = ["https://www.reddit.com/"]
20 | 
21 |     class Meta:
22 |         items = CssTarget("items", ".thing")
23 |         targets = [  # scrapyz also has XPathTarget and RegexTarget classes for extraction
24 |             CssTarget("rank", ".rank::text"),
25 |             CssTarget("upvoted", ".upvoted::text"),
26 |             CssTarget("dislikes", ".dislikes::text"),
27 |             CssTarget("likes", ".likes::text"),
28 |             CssTarget("title", "a.title::text"),
29 |             CssTarget("domain", ".domain > a::text"),
30 |             CssTarget("datetime", ".tagline > time::attr(datetime)"),
31 |             CssTarget("author", ".tagline > .author::text"),
32 |             CssTarget("subreddit", ".tagline > .subreddit::text"),
33 |             CssTarget("comments", ".comments::text")
34 |         ]
35 | ```
36 | 
37 | For scraping data from an index page, then following a link to collect more data from a detail page:
38 | ```python
39 | 
40 | class RedditSpider2(IndexDetailSpider):
41 |     name = "reddit2"
42 |     start_urls = ["https://www.reddit.com/"]
43 | 
44 |     class Meta:
45 |         detail_path = [CssTarget("detail_path", ".title > a::attr(href)", [absolute_url])]
46 |         detail_targets = [
47 |             CssTarget("content", ".usertext-body > div > p::text", [join]),
48 |         ]
49 |         items = CssTarget("items", ".thing")
50 |         targets = [
51 |             CssTarget("rank", ".rank::text"),
52 |             CssTarget("upvoted", ".upvoted::text"),
53 |             CssTarget("dislikes", ".dislikes::text"),
54 |             CssTarget("likes", ".likes::text"),
55 |             CssTarget("title", "a.title::text"),
56 |             CssTarget("domain", ".domain > a::text"),
57 |             CssTarget("datetime", ".tagline > time::attr(datetime)"),
58 |             CssTarget("author", ".tagline > .author::text"),
59 |             CssTarget("subreddit", ".tagline > .subreddit::text"),
60 |             CssTarget("comments", ".comments::text")
61 |         ]
62 | ```
63 | 
64 | Using pipelines:
65 | ```python
66 | class RedditSpider(GenericSpider):
67 |     name = "reddit"
68 |     start_urls = ["https://www.reddit.com/"]
69 |     pipelines = [scrapyz.pipelines.RequiredFields]
70 | 
71 |     class Meta:
72 |         items = CssTarget("items", ".thing")
73 |         required_fields = ["rank", "author", "domain", "comments"]
74 |         targets = [
75 |             CssTarget("rank", ".rank::text"),
76 |             CssTarget("upvoted", ".upvoted::text"),
77 |             CssTarget("dislikes", ".dislikes::text"),
78 |             CssTarget("likes", ".likes::text"),
79 |             CssTarget("title", "a.title::text"),
80 |             CssTarget("domain", ".domain > a::text"),
81 |             CssTarget("datetime", ".tagline > time::attr(datetime)"),
82 |             CssTarget("author", ".tagline > .author::text"),
83 |             CssTarget("subreddit", ".tagline > .subreddit::text"),
84 |             CssTarget("comments", ".comments::text")
85 |         ]
86 | ```
87 | To enable all scrapyz pipelines in your project, add this to the bottom of your project's settings.py:
88 | ```python
89 | ITEM_PIPELINES.update(pipelines.get_scrapyz_pipelines())
90 | ```
91 | Note: scrapyz pipelines only run for spiders that define a `pipelines` attribute listing them, together with the matching `Meta` fields (for example `required_fields` for `RequiredFields` and `min_targets` for `MinTargets`). Full documentation may come later; for now, see the code and comments in pipelines.py.
92 | Contribution
93 | -----------
94 | Please feel free to submit pull requests or create issues. If you have a feature request, create an issue with the label Feature Request.
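#### Custom processors

Targets accept an optional list of processors: callables with the signature `(value, response)` that are applied in order to the extracted value. `scrapyz.util` provides `nth`, `strip` and `absolute_url`, and the reddit example defines its own `join`. Below is a minimal sketch of writing your own; the `to_int` helper and `RankSpider` are illustrative names, not part of the package:

```python
from scrapyz.core import GenericSpider, CssTarget
from scrapyz.util import nth, strip


def to_int(value, response):
    # CssTarget.select returns a list of strings; take the first entry before converting.
    if isinstance(value, list):
        value = value[0] if value else ""
    try:
        return int(value)
    except (TypeError, ValueError):
        return None


class RankSpider(GenericSpider):
    name = "rank_example"
    start_urls = ["https://www.reddit.com/"]

    class Meta:
        items = CssTarget("items", ".thing")
        targets = [
            CssTarget("rank", ".rank::text", [to_int]),
            CssTarget("title", "a.title::text", [nth(0), strip]),
        ]
```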
95 | -------------------------------------------------------------------------------- /scrapyz/core.py: -------------------------------------------------------------------------------- 1 | from scrapy.loader.processors import TakeFirst 2 | from scrapy.spiders import Spider 3 | from scrapy.http import Request 4 | from scrapy.selector import Selector 5 | from scrapy.item import Field 6 | from scrapy.loader import ItemLoader 7 | from scrapyz.util import gen_item, gen_request 8 | 9 | 10 | class GenericSpider(Spider): 11 | 12 | def __init__(self, name=None, **kwargs): 13 | if not hasattr(self, "Meta"): 14 | raise AttributeError("GenericSpider must implement a Meta inner class.") 15 | if not hasattr(self, "start_urls"): 16 | raise AttributeError("Generic spider must implement start_urls.") 17 | 18 | super(GenericSpider, self).__init__(name, **kwargs) 19 | self.item_class = self.get_item_class() 20 | self._items = None 21 | 22 | def start_requests(self): 23 | for url in self.start_urls: 24 | yield Request(url=url, callback=self.get_parse()) 25 | 26 | def parse(self, response): 27 | for item in self.find_items(response): 28 | loader = ItemLoader(item=self.item_class()) 29 | for target in self.get_targets(): 30 | loader.add_value(target.name, target.get_value(item, response)) 31 | yield loader.load_item() 32 | 33 | def find_items(self, response): 34 | if not self._items: 35 | self._items = self.Meta.items.select(Selector(response), extract=False) 36 | return self._items 37 | 38 | def get_targets(self): 39 | return self.Meta.targets 40 | 41 | def get_parse(self): 42 | return self.parse 43 | 44 | def get_item_class(self): 45 | return gen_item(self.gen_fields()) 46 | 47 | def gen_fields(self): 48 | fields = {target.name: target.field_class(output_processor=TakeFirst()) for target in self.get_targets()} 49 | if hasattr(self.Meta, "extra_fields"): 50 | fields.update(self.Meta.extra_fields) 51 | return fields 52 | 53 | 54 | class IndexDetailSpider(GenericSpider): 55 | 56 | def __init__(self, name=None, **kwargs): 57 | super(IndexDetailSpider, self).__init__(name, **kwargs) 58 | 59 | if not hasattr(self.Meta, "detail_path"): 60 | raise AttributeError("IndexDetailSpider's Meta class must implement detail_path") 61 | if not hasattr(self.Meta, "detail_targets"): 62 | raise AttributeError("IndexDetailSpider's Meta class must implement detail_targets") 63 | 64 | def parse(self, response): 65 | for item in self.find_items(response): 66 | loader = ItemLoader(item=self.item_class()) 67 | for target in self.get_targets(): 68 | loader.add_value(target.name, target.get_value(item, response)) 69 | 70 | for target in self.Meta.detail_path: 71 | val = target.get_value(item, response) 72 | yield gen_request(val, self.parse_details, loader.load_item()) 73 | 74 | def parse_details(self, response): 75 | dom = Selector(response) 76 | loader = ItemLoader(item=response.meta['item']) 77 | for target in self.Meta.detail_targets: 78 | loader.add_value(target.name, target.get_value(dom, response)) 79 | yield loader.load_item() 80 | 81 | def gen_fields(self): 82 | fields = super(IndexDetailSpider, self).gen_fields() 83 | fields.update({target.name: target.field_class(output_processor=TakeFirst()) for target in self.Meta.detail_targets}) 84 | return fields 85 | 86 | 87 | class Target(object): 88 | 89 | def __init__(self, name, path, processors=None, field_class=Field): 90 | self.name = name 91 | self.path = path 92 | self.processors = processors if processors else [] 93 | self.field_class = field_class 94 | 95 | def get_value(self, 
selector, response): 96 | if isinstance(self.path, (list, tuple)): 97 | return self.process(" ".join(selector.css(_).extract() for _ in self.path), response) 98 | return self.process(self.select(selector), response) 99 | 100 | def process(self, value, response): 101 | for processor in self.processors: 102 | value = processor(value, response) 103 | return value 104 | 105 | def select(self, selector, extract=False): 106 | raise NotImplementedError("Target is meant as a base class. Use CssTarget, RegexTarget," 107 | " or XPathTarget instead.") 108 | 109 | 110 | class RegexTarget(Target): 111 | 112 | def select(self, selector, extract=True): 113 | """ 114 | Extract has no effect. 115 | """ 116 | return selector.re(self.path) 117 | 118 | 119 | class XPathTarget(Target): 120 | 121 | def select(self, selector, extract=True): 122 | sel = selector.xpath(self.path) 123 | return sel.extract() if extract else sel 124 | 125 | 126 | class CssTarget(Target): 127 | 128 | def select(self, selector, extract=True): 129 | sel = selector.css(self.path) 130 | return sel.extract() if extract else sel 131 | -------------------------------------------------------------------------------- /scrapyz/test/test_generic_spider.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from scrapy.exceptions import DropItem 3 | from scrapyz.pipelines import RequiredFields, MinTargets 4 | from util import fake_response 5 | from spiders import * 6 | 7 | 8 | class TestSpiders(unittest.TestCase): 9 | """ 10 | Tests the basic functionality of GenericSpider. 11 | """ 12 | expected_items = [ 13 | { 14 | 'disclaimer': u'Disclaimer One', 15 | 'discount': u'Discount One', 16 | 'image_url': u'Image One', 17 | 'offer_url': 'http://www.test.com/offer_1', 18 | 'title': u'Title One' 19 | }, 20 | { 21 | 'disclaimer': u'Disclaimer Two', 22 | 'discount': u'Discount Two', 23 | 'image_url': u'Image Two', 24 | 'offer_url': 'http://www.test.com/offer_2', 25 | 'title': u'Title Two' 26 | }, 27 | { 28 | 'disclaimer': u'Disclaimer Three', 29 | 'discount': u'Discount Three', 30 | 'image_url': u'Image Three', 31 | 'offer_url': 'http://www.test.com/offer_3', 32 | 'title': u'Title Three' 33 | } 34 | ] 35 | 36 | """ 37 | Test a full parse on a static html document. 38 | """ 39 | def test_basic_parse(self): 40 | spider = BasicParseFirstElementTestSpider() 41 | response = fake_response("basic_parse.html") 42 | results = [item for item in spider.parse(response)] 43 | self.assertEqual(len(results), 3) 44 | for result, expected in zip(results, self.expected_items): 45 | for key in result.keys(): 46 | self.assertEqual(result[key], expected[key]) 47 | 48 | """ 49 | Test GenericSpider's helper function. 50 | """ 51 | def test_start_requests(self): 52 | spider = BasicParseFirstElementTestSpider() 53 | spider.start_urls = ["http://abc.com", "http://123.com", "http://abc.com"] 54 | for i, request in enumerate(spider.start_requests()): 55 | self.assertEqual(request.url, spider.start_urls[i]) 56 | 57 | """ 58 | Test that the proper exceptions are raised in the right situations. 
59 | """ 60 | def test_bad_spider(self): 61 | classes = [NoMetaSpider, NoStartSpider] 62 | for cls in classes: 63 | with self.assertRaises(AttributeError): 64 | spider = cls() 65 | 66 | def test_good_spider(self): 67 | try: 68 | spider = GoodSpider() 69 | except Exception: 70 | self.fail() 71 | 72 | 73 | class TestPipelines(unittest.TestCase): 74 | """ 75 | Test that pipelines.RequiredFields functions properly 76 | """ 77 | def test_required_fields_fail(self): 78 | spider = BasicParseTestSpider() 79 | spider.Meta.required_fields = ["disclaimer", "discount", "image_url", "offer_url", "title"] 80 | spider.pipelines = [RequiredFields] 81 | pipeline = RequiredFields() 82 | response = fake_response("basic_some_missing.html") 83 | with self.assertRaises(DropItem): 84 | for item in spider.parse(response): 85 | pipeline.process_item(item, spider) 86 | 87 | 88 | def test_required_fields_success(self): 89 | spider = BasicParseTestSpider() 90 | spider.Meta.required_fields = ["disclaimer", "discount", "image_url", "offer_url", "title"] 91 | spider.pipelines = [RequiredFields] 92 | response = fake_response("basic_parse.html") 93 | pipeline = RequiredFields() 94 | try: 95 | results = [pipeline.process_item(item, spider) for item in spider.parse(response)] 96 | except DropItem: 97 | self.fail("Valid required fields dropped item.") 98 | self.assertEqual(len(results), 3) 99 | 100 | def test_required_fields_attribute_exception(self): 101 | spider = BasicParseTestSpider() 102 | spider.pipelines = [RequiredFields] 103 | pipeline = RequiredFields() 104 | response = fake_response("basic_parse.html") 105 | with self.assertRaises(AttributeError): 106 | results = [pipeline.process_item(item, spider) for item in spider.parse(response)] 107 | 108 | def test_min_target_fail(self): 109 | spider = BasicParseTestSpider() 110 | spider.pipelines = [MinTargets] 111 | spider.Meta.min_targets = 4 112 | pipeline = MinTargets() 113 | response = fake_response("basic_some_missing.html") 114 | with self.assertRaises(DropItem): 115 | results = [pipeline.process_item(item, spider) for item in spider.parse(response)] 116 | 117 | def test_min_target_success(self): 118 | spider = BasicParseTestSpider() 119 | spider.pipelines = [MinTargets] 120 | spider.Meta.min_targets = 4 121 | pipeline = MinTargets() 122 | response = fake_response("basic_parse.html") 123 | try: 124 | results = [pipeline.process_item(item, spider) for item in spider.parse(response)] 125 | except DropItem: 126 | self.fail("min_target dropeed item when it shouldn't.") 127 | 128 | class TestUtil(unittest.TestCase): 129 | pass 130 | --------------------------------------------------------------------------------