├── MANIFEST.in
├── .gitignore
├── setup.py
├── examples
│   ├── users.py
│   ├── search.py
│   ├── featured.py
│   ├── top.py
│   ├── crawler.py
│   └── top_inline.py
├── README.rst
└── scrapy_boilerplate.py

/MANIFEST.in:
--------------------------------------------------------------------------------
include README.rst
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os
from setuptools import setup

LONG_DESC = open(os.path.join(os.path.dirname(__file__), 'README.rst')).read()


setup(
    name='scrapy-boilerplate',
    version='0.2.1',
    description='Small set of utilities to simplify writing Scrapy spiders.',
    long_description=LONG_DESC,
    author='Rolando Espinoza La fuente',
    author_email='darkrho@gmail.com',
    url='https://github.com/darkrho/scrapy-boilerplate',
    py_modules=['scrapy_boilerplate'],
    license='BSD',
    install_requires=['Scrapy>=0.16'],
    classifiers=[
        'Programming Language :: Python',
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
    ],
)
--------------------------------------------------------------------------------
/examples/users.py:
--------------------------------------------------------------------------------
"""Stackoverflow's users scraper.

Usage: scrapy runspider users.py
"""
from scrapy_boilerplate import NewCrawlSpider, NewItem
from pyquery import PyQuery


UserItem = NewItem('name about location website url')

UsersSpider = NewCrawlSpider('users', start_urls=[
    'http://stackoverflow.com/users?tab=reputation&filter=week',
])

UsersSpider.follow(r'/users\?page=\d+')


@UsersSpider.rule(r'/users/\d+/\w+')
def parse_user(spider, response):
    d = PyQuery(response.body)
    yield UserItem(
        name=d('#user-displayname').text(),
        about=d('#large-user-info .user-about-me').text(),
        location=d('#large-user-info .adr').text(),
        website=d('#large-user-info .url').attr('href'),
        url=response.url,
    )


if __name__ == '__main__':
    print __doc__
--------------------------------------------------------------------------------
/examples/search.py:
--------------------------------------------------------------------------------
"""Displays Stackoverflow's search results for a given query.

Usage: python search.py <query>
"""
import sys
from scrapy_boilerplate import NewSpider, run_spider
from pyquery import PyQuery
from urlparse import urljoin
from urllib import quote_plus


SearchSpider = NewSpider('search')


if __name__ == '__main__':
    if not sys.argv[1:]:
        print __doc__
        sys.exit(2)

    query = ' '.join(sys.argv[1:])
    # escape the query so spaces and characters like '&' don't break the URL
    url = 'http://stackoverflow.com/search?q=%s' % quote_plus(query)
    results = []

    # register scrape function dynamically to perform the query
    @SearchSpider.scrape(url)
    def parse(spider, response):
        d = PyQuery(response.body)
        results.extend(
            (d(e).text(), urljoin(response.url, d(e).attr('href')))
            for e in d('.result-link a')
        )

    # run spider to collect results
    run_spider(SearchSpider(), {
        'LOG_ENABLED': False,  # shut up log
    })

    # output results
    for i, (title, url) in enumerate(results, 1):
        print '%2d. %s\n %s\n' % (i, title, url)
--------------------------------------------------------------------------------
/examples/featured.py:
--------------------------------------------------------------------------------
"""Stackoverflow's featured questions scraper.

Usage: scrapy runspider featured.py
"""
from scrapy_boilerplate import NewSpider, NewItem
from pyquery import PyQuery
from urlparse import urljoin


FeaturedItem = NewItem('title tags stats url')

MySpider = NewSpider('featured')


@MySpider.scrape('http://www.stackoverflow.com/?tab=featured')
def parse(spider, response):
    """Scrapes featured questions."""
    d = PyQuery(response.body)
    for summary in d('.question-summary'):
        el = d(summary)
        yield FeaturedItem(
            title=el.find('h3').text(),
            url=urljoin(response.url, el.find('h3 a').attr('href')),
            tags=[d(a).text() for a in el.find('.tags a')],
            stats={
                'votes': el.find('.votes .mini-counts').text(),
                'answers': el.find('.status .mini-counts').text(),
                'views': el.find('.views .mini-counts').text(),
            }
        )
    spider.log("Finished extracting featured questions")


if __name__ == '__main__':
    print __doc__
--------------------------------------------------------------------------------
/examples/top.py:
--------------------------------------------------------------------------------
"""Stackoverflow's top questions scraper.

Usage: scrapy runspider top.py

"""
from scrapy_boilerplate import NewCrawlSpider, NewItem
from pyquery import PyQuery
from urlparse import urljoin
from scrapy.http import Request


FeaturedItem = NewItem('title body tags user url')

TopSpider = NewCrawlSpider('top', start_urls=[
    'http://stackoverflow.com/?tab=hot',
])


@TopSpider.rule(r'/questions/\d+/[\w\-]+$')
def parse_question(spider, response):
    d = PyQuery(response.body)
    item = FeaturedItem(
        title=d('h1:first').text(),
        tags=[d(a).text() for a in d('.post-taglist a')],
        url=response.url,
    )
    # user page
    user_link = d('.post-signature .user-details a').attr('href')
    callback = spider.func_callback(parse_user, item=item)

    return Request(urljoin(response.url, user_link), callback=callback)


def parse_user(spider, response, item):
    # extract user info
    d = PyQuery(response.body)
    item['user'] = {
        'name': d('#user-displayname').text(),
        'about': d('#large-user-info .user-about-me').text(),
        'location': d('#large-user-info .adr').text(),
        'website': d('#large-user-info .url').attr('href'),
        'url': response.url,
    }
    yield item


if __name__ == '__main__':
    print __doc__
--------------------------------------------------------------------------------
/examples/crawler.py:
--------------------------------------------------------------------------------
"""Script that illustrates the use of the `run_crawler` helper.

Usage::

    python crawler.py -h

    python crawler.py -l

    python crawler.py spider_one

"""
from scrapy_boilerplate import (run_crawler, SpiderManager, NewItem,
                                NewSpider, NewCrawlSpider)
from scrapy.spider import BaseSpider


BaseItem = NewItem('spider url')

ItemOne = NewItem('title', base_cls=BaseItem)
ItemTwo = NewItem('name', base_cls=BaseItem)
ItemThree = NewItem('data', base_cls=BaseItem)


class SpiderOne(BaseSpider):
    name = 'spider_one'
    start_urls = ['http://example.org']

    def parse(self, response):
        return ItemOne(
            title='welcome to example.org',
            spider=self.name,
            url=response.url,
        )


SpiderTwo = NewSpider('spider_two')


@SpiderTwo.scrape('http://example.net')
def parse_net(spider, response):
    return ItemTwo(
        name='foo',
        spider=spider.name,
        url=response.url,
    )


SpiderThree = NewCrawlSpider('spider_three')


if __name__ == '__main__':
    # register spider classes in the spider manager so
    # the crawler knows which ones are available
    SpiderManager.register(SpiderOne)
    SpiderManager.register(SpiderTwo)
    #SpiderManager.register(SpiderThree)

    run_crawler()
--------------------------------------------------------------------------------
/examples/top_inline.py:
--------------------------------------------------------------------------------
"""Stackoverflow's top questions scraper.
Requires the latest version of scrapy-inline-requests (https://github.com/darkrho/scrapy-inline-requests).

Usage: scrapy runspider top_inline.py

"""
from inline_requests import inline_requests
from scrapy_boilerplate import NewCrawlSpider, NewItem
from pyquery import PyQuery
from urlparse import urljoin
from scrapy.http import Request


FeaturedItem = NewItem('title body tags user url')

TopSpider = NewCrawlSpider('top', start_urls=[
    'http://stackoverflow.com/?tab=hot',
])


@TopSpider.rule(r'/questions/\d+/[\w\-]+$')
@inline_requests
def parse_question(spider, response):
    d = PyQuery(response.body)
    item = FeaturedItem(
        title=d('h1:first').text(),
        tags=[d(a).text() for a in d('.post-taglist a')],
        url=response.url,
    )
    # user page
    user_link = d('.post-signature .user-details a').attr('href')
    response = yield Request(urljoin(response.url, user_link))

    # extract user info
    d = PyQuery(response.body)
    item['user'] = {
        'name': d('#user-displayname').text(),
        'about': d('#large-user-info .user-about-me').text(),
        'location': d('#large-user-info .adr').text(),
        'website': d('#large-user-info .url').attr('href'),
        'url': response.url,
    }
    yield item


if __name__ == '__main__':
    print __doc__
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
==================
scrapy-boilerplate
==================

`scrapy-boilerplate` is a small set of utilities for `Scrapy`_ to simplify
writing low-complexity spiders that are very common in small and one-off projects.

It requires `Scrapy`_ `(>= 0.16)` and has been tested using `Python 2.7`.
Additionally, `PyQuery`_ is required to run the scripts in the `examples`_
directory.

.. note::

    The code is experimental, includes some magic under the hood and might be
    hard to debug. If you are new to `Scrapy`_, don't use this code unless
    you are ready to debug errors that nobody has seen before.


-----------
Usage Guide
-----------

Items
=====

Standard item definition:

.. code:: python

    from scrapy.item import Item, Field

    class BaseItem(Item):
        url = Field()
        crawled = Field()

    class UserItem(BaseItem):
        name = Field()
        about = Field()
        location = Field()

    class StoryItem(BaseItem):
        title = Field()
        body = Field()
        user = Field()

Becomes:

.. code:: python

    from scrapy_boilerplate import NewItem

    BaseItem = NewItem('url crawled')

    UserItem = NewItem('name about location', base_cls=BaseItem)

    StoryItem = NewItem('title body user', base_cls=BaseItem)

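The classes created by `NewItem` are plain `scrapy.item.Item` subclasses, so
instances are built and filled like any other Scrapy item. A minimal sketch,
reusing the `UserItem` defined above (the values are made up):

.. code:: python

    item = UserItem(name='John Doe', location='Somewhere')
    item['about'] = 'Short bio'
    item['url'] = 'http://example.com/users/john-doe'
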

BaseSpider
==========

Standard spider definition:

.. code:: python

    from scrapy.spider import BaseSpider

    class MySpider(BaseSpider):
        name = 'my_spider'
        start_urls = ['http://example.com/latest']

        def parse(self, response):
            # do stuff


Becomes:

.. code:: python

    from scrapy_boilerplate import NewSpider

    MySpider = NewSpider('my_spider')

    @MySpider.scrape('http://example.com/latest')
    def parse(spider, response):
        # do stuff

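The `scrape` decorator can be applied more than once to register several pages
on the same spider, each with its own callback. A sketch along the lines of the
`BaseSpider` docstring in `scrapy_boilerplate.py` (the URL and callback name
are placeholders):

.. code:: python

    @MySpider.scrape('http://example.com/about')
    def parse_about(spider, response):
        # do stuff
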
24 | """ 25 | 26 | _spiders = {} 27 | 28 | def __init__(self): 29 | pass 30 | 31 | @classmethod 32 | def register(cls, spider_cls): 33 | """Register a spider class.""" 34 | cls._spiders[spider_cls.name] = spider_cls 35 | 36 | @classmethod 37 | def from_crawler(cls, crawler): 38 | return cls() 39 | 40 | 41 | class CallbackMixin(object): 42 | """Shared methods to register urls and callbacks. 43 | 44 | Usage:: 45 | 46 | from scrapy.spider import BaseSpider 47 | 48 | class MySpider(CallbackMixin, BaseSpider): 49 | 50 | # override start requests to return only registered 51 | # requests with `scrape` decorator. 52 | def start_requests(self): 53 | return self.scrape_requests() 54 | 55 | 56 | # register an url with a callback for crawling 57 | @MySpider.scrape('http://example.com') 58 | def parse_item(spider, response): 59 | # do stuff 60 | 61 | """ 62 | 63 | _scrape_urls = () 64 | 65 | @classmethod 66 | def scrape(cls, url, **func_kwargs): 67 | """Decorator to specify to crawl an url with the decorated function.""" 68 | 69 | def register_func(func): 70 | cls._scrape_urls += ((url, func, func_kwargs),) 71 | return func 72 | 73 | return register_func 74 | 75 | def scrape_requests(self): 76 | """Returns requests registered by `scrape` decorator.""" 77 | for url, func, kwargs in self._scrape_urls: 78 | req = self.make_requests_from_url(url) 79 | req.callback = self.func_callback(func, **kwargs) 80 | yield req 81 | 82 | def func_callback(self, func, **kwargs): 83 | """Bind a function to this spider instance. 84 | 85 | Usage:: 86 | 87 | from scrapy.spider import BaseSpider 88 | 89 | class MySpider(CallbackMixin, BaseSpider): 90 | 91 | # ... 92 | 93 | def parse(self, response): 94 | # do stuff... 95 | callback = self.func_callback(external_func) 96 | return Request(url, callback=callback) 97 | 98 | """ 99 | @functools.wraps(func) 100 | def callback(response): 101 | return func(spider=self, response=response, **kwargs) 102 | return callback 103 | 104 | 105 | class BaseSpider(CallbackMixin, _BaseSpider): 106 | """Spider base class. 107 | 108 | Usage:: 109 | 110 | class MySpider(BaseSpider): 111 | 112 | name = 'my_spider' 113 | start_urls = ['http://example.com'] 114 | 115 | def parse(self, response): 116 | # do stuff 117 | 118 | 119 | # register additional pages 120 | 121 | @MySpider.scrape('http://example.org/company_info') 122 | def parse_info(spider, response): 123 | # do stuff 124 | 125 | 126 | @MySpider.scrape('http://example.org/gallery') 127 | def parse_images(spider, response): 128 | # do stuff 129 | 130 | """ 131 | 132 | def start_requests(self): 133 | """Combine scrape and start requests.""" 134 | return itertools.chain(CallbackMixin.scrape_requests(self), 135 | _BaseSpider.start_requests(self)) 136 | 137 | 138 | class CrawlSpider(CallbackMixin, _CrawlSpider): 139 | """Spider class with syntatic sugar to register rules and callbacks. 

Running Helpers
===============

Single-spider running script:

.. code:: python

    # file: my-spider.py
    # imports omitted ...

    class MySpider(BaseSpider):
        # spider code ...

    if __name__ == '__main__':
        from scrapy_boilerplate import run_spider
        custom_settings = {
            # ...
        }
        spider = MySpider()

        run_spider(spider, custom_settings)


Multi-spider script with standard crawl command line options:

.. code:: python

    # file: my-crawler.py
    # imports omitted ...


    class MySpider(BaseSpider):
        name = 'my_spider'
        # spider code ...


    class OtherSpider(CrawlSpider):
        name = 'other_spider'
        # spider code ...


    if __name__ == '__main__':
        from scrapy_boilerplate import run_crawler, SpiderManager
        custom_settings = {
            # ...
        }

        SpiderManager.register(MySpider)
        SpiderManager.register(OtherSpider)

        run_crawler(settings=custom_settings)


.. note:: See the `examples`_ directory for working code examples.


.. _`Scrapy`: http://www.scrapy.org
.. _`PyQuery`: http://pypi.python.org/pypi/pyquery
.. _`examples`: https://github.com/darkrho/scrapy-boilerplate/tree/master/examples
--------------------------------------------------------------------------------
/scrapy_boilerplate.py:
--------------------------------------------------------------------------------
"""Small set of utilities to simplify writing Scrapy spiders."""
import inspect
import itertools
import functools
import optparse
import sys

from scrapy import cmdline, log
from scrapy.commands.crawl import Command as CrawlCommand
from scrapy.commands.list import Command as ListCommand
from scrapy.contrib.spiders import Rule, CrawlSpider as _CrawlSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.item import Item, Field
from scrapy.settings import CrawlerSettings
from scrapy.spider import BaseSpider as _BaseSpider
from scrapy.spidermanager import SpiderManager as _SpiderManager


class SpiderManager(_SpiderManager):
    """Spider manager that allows registering spiders at runtime.

    This class is intended to be used along with the `run_crawler` function.
    """

    _spiders = {}

    def __init__(self):
        pass

    @classmethod
    def register(cls, spider_cls):
        """Register a spider class."""
        cls._spiders[spider_cls.name] = spider_cls

    @classmethod
    def from_crawler(cls, crawler):
        return cls()


class CallbackMixin(object):
    """Shared methods to register URLs and callbacks.

    Usage::

        from scrapy.spider import BaseSpider

        class MySpider(CallbackMixin, BaseSpider):

            # override start_requests to return only the requests
            # registered with the `scrape` decorator.
            def start_requests(self):
                return self.scrape_requests()


        # register a URL with a callback for crawling
        @MySpider.scrape('http://example.com')
        def parse_item(spider, response):
            # do stuff

    """

    _scrape_urls = ()

    @classmethod
    def scrape(cls, url, **func_kwargs):
        """Decorator to register a URL to be crawled by the decorated function."""

        def register_func(func):
            cls._scrape_urls += ((url, func, func_kwargs),)
            return func

        return register_func

    def scrape_requests(self):
        """Return the requests registered via the `scrape` decorator."""
        for url, func, kwargs in self._scrape_urls:
            req = self.make_requests_from_url(url)
            req.callback = self.func_callback(func, **kwargs)
            yield req

    def func_callback(self, func, **kwargs):
        """Bind a function to this spider instance.

        Usage::

            from scrapy.spider import BaseSpider

            class MySpider(CallbackMixin, BaseSpider):

                # ...

                def parse(self, response):
                    # do stuff...
                    callback = self.func_callback(external_func)
                    return Request(url, callback=callback)

        """
        @functools.wraps(func)
        def callback(response):
            return func(spider=self, response=response, **kwargs)
        return callback


class BaseSpider(CallbackMixin, _BaseSpider):
    """Spider base class.

    Usage::

        class MySpider(BaseSpider):

            name = 'my_spider'
            start_urls = ['http://example.com']

            def parse(self, response):
                # do stuff


        # register additional pages

        @MySpider.scrape('http://example.org/company_info')
        def parse_info(spider, response):
            # do stuff


        @MySpider.scrape('http://example.org/gallery')
        def parse_images(spider, response):
            # do stuff

    """

    def start_requests(self):
        """Combine scrape and start requests."""
        return itertools.chain(CallbackMixin.scrape_requests(self),
                               _BaseSpider.start_requests(self))


class CrawlSpider(CallbackMixin, _CrawlSpider):
    """Spider class with syntactic sugar to register rules and callbacks.

    Usage::

        class MySpider(CrawlSpider):
            name = 'my_spider'
            start_urls = ['http://example.com']


        MySpider.follow('next-page')


        @MySpider.rule('item\.php')
        def parse_item(spider, response):
            # do stuff

    """

    # ensure an immutable type for rules to avoid sharing the same
    # attribute with other instances/subclasses of this class
    rules = ()

    def _call_func(self, response, _func, **kwargs):
        """Simple callback helper to pass the spider instance to an external function."""
        return _func(spider=self, response=response, **kwargs)

    @classmethod
    def rule(cls, link_extractor, **params):
        """Decorator to associate a function as callback for the given rule."""
        if isinstance(link_extractor, basestring):
            link_extractor = SgmlLinkExtractor(allow=link_extractor)

        def decorator(func):
            params['callback'] = '_call_func'
            params.setdefault('cb_kwargs', {})['_func'] = func
            cls.rules += (Rule(link_extractor, **params),)
            return func

        return decorator

    @classmethod
    def follow(cls, link_extractor, **params):
        """Register a callback-less follow rule."""
        params['follow'] = True
        if isinstance(link_extractor, basestring):
            link_extractor = SgmlLinkExtractor(allow=link_extractor)
        cls.rules += (Rule(link_extractor, **params),)

    def start_requests(self):
        """Combine scrape and start requests."""
        return itertools.chain(CallbackMixin.scrape_requests(self),
                               _CrawlSpider.start_requests(self))


def NewSpider(name, **params):
    """Create a new subclass of BaseSpider or the given base_cls.

    Usage::

        MySpider = NewSpider('my_spider')
    """
    base_cls = params.pop('base_cls', BaseSpider)
    attrs = dict(base_cls.__dict__)
    attrs.update(name=name, **params)
    Spider = type('%s[%s]' % (base_cls.__name__, name), (base_cls,), attrs)
    # XXX: modify the class's __module__ so scrapy can find it when using
    # the default spider manager.
    Spider.__module__ = inspect.currentframe().f_back.f_globals['__name__']
    return Spider


NewCrawlSpider = functools.partial(NewSpider, base_cls=CrawlSpider)
NewCrawlSpider.__doc__ = """Create a new subclass of CrawlSpider.

Usage::

    MySpider = NewCrawlSpider('my_spider')
"""


def NewItem(names, base_cls=Item):
    """Create an Item class with the given fields specification.

    Usage::

        BaseItem = NewItem('title body url')

        QuestionItem = NewItem('tags status', base_cls=BaseItem)

        AnswerItem = NewItem('user', base_cls=BaseItem)

    """
    if isinstance(names, basestring):
        names = names.split()
    attrs = dict((name, Field()) for name in names)
    return type('%s[%s]' % (base_cls.__name__, ' '.join(names)),
                (base_cls,), attrs)


def run_spider(spider, settings=None):
    """Run a spider instance through the scrapy crawler.

    This function is suitable for standalone scripts.
    """
    crawler = CrawlerProcess(_build_settings(settings))
    crawler.install()
    crawler.configure()
    log.start_from_crawler(crawler)
    crawler.crawl(spider)
    crawler.start()

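# Illustrative sketch (MySpider and the settings dict are placeholders): a
# standalone script would typically call `run_spider` as done in
# examples/search.py, e.g.:
#
#   if __name__ == '__main__':
#       run_spider(MySpider(), {'LOG_ENABLED': False})
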

def run_crawler(argv=None, settings=None):
    """Run the scrapy crawler bound to the registered spiders.

    This function is suitable for standalone scripts.

    Usage::

        # mimic 'scrapy crawl' command having these two spiders available
        SpiderManager.register(FooSpider)
        SpiderManager.register(BarSpider)

        run_crawler()

    """
    argv = argv or sys.argv
    settings = _build_settings(settings)

    # load the spider manager from this module
    settings.overrides.update({
        'SPIDER_MANAGER_CLASS': '%s.%s' % (__name__, SpiderManager.__name__),
    })

    crawler = CrawlerProcess(settings)
    crawler.install()

    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter())
    parser.add_option('-l', '--list', action='store_true',
                      help="List available spiders")

    cmd = CrawlCommand()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)

    parser.usage = "%s %s" % (argv[0], cmd.syntax())
    # parse options from the argv that was passed in (defaults to sys.argv)
    opts, args = parser.parse_args(argv[1:])
    if opts.list:
        settings.defaults.update(ListCommand.default_settings)
        listcmd = ListCommand()
        listcmd.set_crawler(crawler)
        listcmd.run(args, opts)
        sys.exit(listcmd.exitcode)
    else:
        cmdline._run_print_help(parser, cmd.process_options, args, opts)
        cmd.set_crawler(crawler)
        cmdline._run_print_help(parser, cmdline._run_command, cmd, args, opts)
        sys.exit(cmd.exitcode)


def _build_settings(settings=None):
    if settings is None:
        settings = CrawlerSettings()
    elif isinstance(settings, dict):
        values = settings
        settings = CrawlerSettings()
        settings.defaults.update(values)
    return settings
--------------------------------------------------------------------------------