├── Dockerfile
├── bin
│   └── testargs.py
├── requirements.txt
├── scrapy.cfg
├── setup.py
└── testspiders
    ├── __init__.py
    ├── items.py
    ├── middleware.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── broken_link.py
        ├── dummy.py
        ├── followall.py
        ├── justfollow.py
        ├── localinfo.py
        ├── loremipsum.py
        ├── mad.py
        ├── noop.py
        ├── timed.py
        └── timewaste.py

/Dockerfile:
--------------------------------------------------------------------------------
FROM scrapinghub/scrapinghub-stack-scrapy:1.4
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install -r /app/requirements.txt
ENV SCRAPY_SETTINGS_MODULE testspiders.settings
COPY . /app
RUN python setup.py install
--------------------------------------------------------------------------------

/bin/testargs.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from __future__ import print_function

import sys
import logging
from argparse import ArgumentParser


def main():
    ap = ArgumentParser()
    ap.add_argument('--debug', action='store_true')
    ap.add_argument('--loglevel', default=logging.INFO)
    ap.add_argument('others', nargs='*')
    args = ap.parse_args()

    #logging.basicConfig(level=args.loglevel)

    print('SYS ARGV', sys.argv, file=sys.stderr)
    print('ARGS', args)
    logger = logging.getLogger('testargs')
    logger.setLevel(args.loglevel)
    logger.error('testargs logger ERROR level')
    logger.info('testargs logger INFO level')
    logger.debug('testargs logger DEBUG level')
    # Root logger
    #logging.root.setLevel(args.loglevel)
    logging.error('root ERROR level')
    logging.info('root INFO level')
    logging.debug('root DEBUG level')

if __name__ == '__main__':
    sys.exit(main())
--------------------------------------------------------------------------------

/requirements.txt:
--------------------------------------------------------------------------------
Scrapy>=1.0.2
--------------------------------------------------------------------------------

/scrapy.cfg:
--------------------------------------------------------------------------------
[settings]
default = testspiders.settings
--------------------------------------------------------------------------------

/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(
    name = 'testspiders',
    version = '1.0',
    packages = find_packages(),
    entry_points = {'scrapy': ['settings = testspiders.settings']},
    scripts = ['bin/testargs.py']
)
--------------------------------------------------------------------------------

/testspiders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/testspiders/c7008e50e320b43445e333db7387b29be8aa7cf9/testspiders/__init__.py
--------------------------------------------------------------------------------

/testspiders/items.py:
--------------------------------------------------------------------------------
import scrapy


class Page(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    size = scrapy.Field()
    referer = scrapy.Field()
    newcookies = scrapy.Field()
    body = scrapy.Field()
--------------------------------------------------------------------------------
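Note: a minimal sketch (not part of the repo) of how the spiders below fill this item; the field values are made-up placeholders.

from testspiders.items import Page

# Page behaves like a dict restricted to the fields declared above;
# assigning an undeclared key raises KeyError
page = Page(url='http://www.example.com/', title='Example Domain', size='1256')
page['referer'] = None
print(dict(page))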
/testspiders/middleware.py:
--------------------------------------------------------------------------------
import random
from scrapy.exceptions import IgnoreRequest


class RandomUserAgent(object):
    """Randomly rotate user agents based on a list of predefined ones"""

    def __init__(self, agents):
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        request.headers.setdefault('User-Agent', random.choice(self.agents))


class ErrorMonkeyMiddleware(object):

    def process_request(self, request, spider):
        if 'x-ignore-request' in request.url:
            raise IgnoreRequest()
        elif 'x-error-request' in request.url:
            _ = 1 / 0

    def process_response(self, request, response, spider):
        if 'x-ignore-response' in request.url:
            raise IgnoreRequest()
        elif 'x-error-response' in request.url:
            _ = 1 / 0
        else:
            return response
--------------------------------------------------------------------------------
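Note: ErrorMonkeyMiddleware is triggered purely by marker substrings in the request URL. A sketch of a hypothetical spider (not part of the repo) that exercises each branch; the URLs are placeholders, only the x-... markers matter.

import scrapy


class MonkeyCheckSpider(scrapy.Spider):
    name = 'monkeycheck'  # hypothetical spider name

    def start_requests(self):
        # dropped in process_request via IgnoreRequest
        yield scrapy.Request('http://example.com/?x-ignore-request')
        # raises ZeroDivisionError in process_request
        yield scrapy.Request('http://example.com/?x-error-request')
        # downloaded, then process_response raises ZeroDivisionError
        yield scrapy.Request('http://example.com/?x-error-response')

    def parse(self, response):
        pass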
/testspiders/settings.py:
--------------------------------------------------------------------------------
BOT_NAME = 'testspiders'

SPIDER_MODULES = ['testspiders.spiders']
NEWSPIDER_MODULE = 'testspiders.spiders'

# some sane limits by default (override if needed)
CLOSESPIDER_PAGECOUNT = 1000
CLOSESPIDER_TIMEOUT = 3600

RETRY_ENABLED = False
COOKIES_ENABLED = False

DOWNLOADER_MIDDLEWARES = {
    'testspiders.middleware.RandomUserAgent': 1,
    'testspiders.middleware.ErrorMonkeyMiddleware': 2,
}

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

try:
    from local_settings import *
except ImportError:
    pass
--------------------------------------------------------------------------------

/testspiders/spiders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scrapinghub/testspiders/c7008e50e320b43445e333db7387b29be8aa7cf9/testspiders/spiders/__init__.py
--------------------------------------------------------------------------------

/testspiders/spiders/broken_link.py:
--------------------------------------------------------------------------------
# coding: utf8

import scrapy

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


class BrokenLink(scrapy.Spider):
    """
    Spider arguments:
    - input_url: URL to start the crawl from.
    - allowed_domains (optional): Comma-separated list of domains to restrict the crawl to. If not specified, it is inferred from the input URL, e.g. http://doc.scrapy.org/en/latest/intro/overview.html -> doc.scrapy.org

    Settings:
    - DEPTH_LIMIT: Controls the maximum depth (defaults to 50).
    - MAX_REQUESTS: Controls the maximum number of requests (defaults to 100000). The actual number may be slightly higher, e.g. with MAX_REQUESTS=1000 the spider may stop only after sending 1008 requests.
    """
    name = 'broken_link'
    custom_settings = {
        'HTTPERROR_ALLOW_ALL': True,
        'DEPTH_LIMIT': 50,
        'MAX_REQUESTS': 100000,
        'RETRY_HTTP_CODES': [],
    }

    def __init__(self, input_url, allowed_domains=None, *args, **kwargs):
        """Initializes the instance"""
        super(BrokenLink, self).__init__(*args, **kwargs)
        self.start_urls = [input_url]
        if allowed_domains:
            self.allowed_domains = allowed_domains.split(',')
        else:
            netloc = urlparse(input_url).netloc
            domain = netloc.split('@')[-1].split(':')[0]
            self.allowed_domains = [domain]

    def start_requests(self):
        """Generates initial requests"""
        for url in self.start_urls:
            # Explicitly set the errback handler
            yield scrapy.Request(
                url,
                dont_filter=True,
                callback=self.parse,
                errback=self.errback
            )

    def parse(self, response):
        """Parses a default response"""
        if not isinstance(response, scrapy.http.TextResponse):
            self.crawler.stats.inc_value('non_text_response')
            return
        if response.status >= 400 and response.status <= 599:
            yield {
                'url': response.url,
                'status': 'invalid_http_status',
                'http_status': response.status,
            }
        max_reqs = self.settings.getint('MAX_REQUESTS', 0)
        stats = self.crawler.stats
        for href in response.css('a::attr(href)').extract():
            if max_reqs and max_reqs < stats.get_value('scheduler/enqueued'):
                break
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse,
                errback=self.errback
            )

    def errback(self, err):
        """Handles an error"""
        return {
            'url': err.request.url,
            'status': 'error_downloading_http_response',
            'message': str(err.value),
        }
--------------------------------------------------------------------------------
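Note: a hedged usage sketch for the spider arguments and settings documented in the docstring above; the URLs and the CrawlerProcess wrapper are illustrative, not part of the repo (the same run can be started with scrapy crawl broken_link -a input_url=...).

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from testspiders.spiders.broken_link import BrokenLink

process = CrawlerProcess(get_project_settings())
process.crawl(BrokenLink,
              input_url='http://doc.scrapy.org/en/latest/',
              allowed_domains='doc.scrapy.org')
# DEPTH_LIMIT and MAX_REQUESTS come from custom_settings unless overridden
process.start()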
/testspiders/spiders/dummy.py:
--------------------------------------------------------------------------------
import scrapy

class DummySpider(scrapy.Spider):
    name = "dummy"
    allowed_domains = ["example.com", "iana.org"]
    start_urls = (
        'http://www.example.com/',
    )

    def parse(self, response):
        pass
--------------------------------------------------------------------------------

/testspiders/spiders/followall.py:
--------------------------------------------------------------------------------
import re
from six.moves.urllib.parse import urlparse

import scrapy
from scrapy.http import Request, HtmlResponse
from scrapy.linkextractors import LinkExtractor

from testspiders.items import Page


class FollowAllSpider(scrapy.Spider):

    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse, dont_filter=True)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = Page(
            url=response.url,
            size=str(len(response.body)),
            referer=response.request.headers.get('Referer'),
        )
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = response.xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(b';', 1)[0] for x in
                       response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
--------------------------------------------------------------------------------

/testspiders/spiders/justfollow.py:
--------------------------------------------------------------------------------
import scrapy
from scrapy.http import TextResponse


class Spider(scrapy.Spider):

    name = 'justfollow'

    def start_requests(self):
        url = getattr(self, 'url', 'http://scrapinghub.com')
        yield scrapy.Request(url, dont_filter=True)

    def parse(self, response):
        if not isinstance(response, TextResponse):
            return

        if response.xpath('//form'):
            yield scrapy.FormRequest.from_response(response,
                                                   callback=self.parse)

        for href in response.xpath('//a/@href').extract():
            yield scrapy.Request(response.urljoin(href), self.parse)
--------------------------------------------------------------------------------
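Note: a standalone sketch (not part of the repo) of the url/domain normalization FollowAllSpider applies in __init__, useful when deciding what to pass as -a url=... or -a domain=...

import re
from six.moves.urllib.parse import urlparse

def normalize(url):
    # same normalization FollowAllSpider.__init__ applies to its url/domain argument
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://%s/' % url
    return url, re.sub(r'^www\.', '', urlparse(url).hostname)

print(normalize('www.example.com'))           # ('http://www.example.com/', 'example.com')
print(normalize('https://scrapinghub.com/'))  # ('https://scrapinghub.com/', 'scrapinghub.com')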
/testspiders/spiders/localinfo.py:
--------------------------------------------------------------------------------
import os
import sys
import tempfile
import platform

import twisted
import OpenSSL
import lxml.etree
import scrapy


class LocalInfo(scrapy.Spider):
    name = 'localinfo'
    start_urls = ('https://example.com',)

    def parse(self, response):
        item = {
            '__file__': __file__,
            '__name__': __name__,
            'cwd': os.path.abspath(os.path.curdir),
            'tmpdir': tempfile.gettempdir(),
        }
        item['versions'] = _versions()
        item['environ'] = os.environ.copy()
        return item


def _versions():
    lxml_version = ".".join(map(str, lxml.etree.LXML_VERSION))
    libxml2_version = ".".join(map(str, lxml.etree.LIBXML_VERSION))
    return {
        "Scrapy": scrapy.__version__,
        "lxml": lxml_version,
        "libxml2": libxml2_version,
        "Twisted": twisted.version.short(),
        "Python": sys.version.replace("\n", "- "),
        "pyOpenSSL": _get_openssl_version(),
        "Platform": platform.platform(),
    }


def _get_openssl_version():
    try:
        openssl = OpenSSL.SSL.SSLeay_version(OpenSSL.SSL.SSLEAY_VERSION)\
            .decode('ascii', errors='replace')
    # pyOpenSSL 0.12 does not expose openssl version
    except AttributeError:
        openssl = 'Unknown OpenSSL version'

    return '{} ({})'.format(OpenSSL.version.__version__, openssl)
--------------------------------------------------------------------------------
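Note: the _versions() helper above can also be exercised outside a crawl, which is handy for checking what a given image (e.g. the Dockerfile's stack) ships; a minimal sketch assuming the package is installed.

from testspiders.spiders.localinfo import _versions

for name, version in sorted(_versions().items()):
    print('{}: {}'.format(name, version))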
/testspiders/spiders/loremipsum.py:
--------------------------------------------------------------------------------
import logging
import tempfile
import scrapy
from testspiders.items import Page


LOREMIPSUM = b'''\
Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed
diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat
volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper
suscipit lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum
iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum
dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio
dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te
feugait nulla facilisi. Nam liber tempor cum soluta nobis eleifend option
congue nihil imperdiet doming id quod mazim placerat facer possim assum. Typi
non habent claritatem insitam; est usus legentis in iis qui facit eorum
claritatem. Investigationes demonstraverunt lectores legere me lius quod ii
legunt saepius. Claritas est etiam processus dynamicus, qui sequitur mutationem
consuetudium lectorum. Mirum est notare quam littera gothica, quam nunc putamus
parum claram, anteposuerit litterarum formas humanitatis per seacula quarta
decima et quinta decima. Eodem modo typi, qui nunc nobis videntur parum clari,
fiant sollemnes in futurum.'''


class LoremipsumSpider(scrapy.Spider):
    name = "loremipsum"
    loremfile = None

    def start_requests(self):
        self.loremfile = tempfile.NamedTemporaryFile()
        self.loremfile.write(LOREMIPSUM)
        # flush so the content is on disk before the file:// request is downloaded
        self.loremfile.flush()
        yield scrapy.Request('file://{0}'.format(self.loremfile.name))

    def parse(self, response):
        """Extract lorem ipsum text

        @url http://es.lipsum.com/
        @returns items 1 1
        @scrapes url title body
        """
        self.log(LOREMIPSUM[:30], level=logging.DEBUG)
        self.log(LOREMIPSUM[30:60], level=logging.INFO)
        self.log(LOREMIPSUM[60:90], level=logging.WARNING)
        self.log(LOREMIPSUM[90:120], level=logging.ERROR)
        yield Page(url=response.url, title=LOREMIPSUM[:20], body=LOREMIPSUM)
        if self.loremfile:
            url = 'file://{0}?x-error-response'.format(self.loremfile.name)
            yield scrapy.Request(url, callback=self.parse, errback=self.recover)

    def recover(self, failure):
        raise ValueError('hoho')
--------------------------------------------------------------------------------

/testspiders/spiders/mad.py:
--------------------------------------------------------------------------------
"""
Spider that blocks, logs a warning and raises an error randomly
"""
import time
import random
import logging
from testspiders.spiders.followall import FollowAllSpider


class MadSpider(FollowAllSpider):

    name = 'mad'
    url = None
    timeout_choices = range(10)

    def _get_item(self, response):
        # simulate a blocking call
        timeout = random.choice(self.timeout_choices)
        time.sleep(timeout)

        # simulate warnings and errors
        if timeout % 3:
            self.log("something happened", level=logging.WARNING)
        else:
            raise Exception("something bad happened")

        return super(MadSpider, self)._get_item(response)
--------------------------------------------------------------------------------

/testspiders/spiders/noop.py:
--------------------------------------------------------------------------------
import scrapy


class NoopSpider(scrapy.Spider):
    name = "noop"

    def parse(self, response):
        pass
--------------------------------------------------------------------------------

/testspiders/spiders/timed.py:
--------------------------------------------------------------------------------
"""
Crawl-all spider without domain restriction
"""
from testspiders.spiders.followall import FollowAllSpider


class TimedSpider(FollowAllSpider):

    name = 'timed'
    url = None

    def __init__(self, **kw):
        self.timeout = int(kw.pop('timeout', '60'))
        super(TimedSpider, self).__init__(**kw)

    def start_requests(self):
        from twisted.internet import reactor
        reactor.callLater(self.timeout, self.stop)
        return super(TimedSpider, self).start_requests()

    def stop(self):
        self.crawler.engine.close_spider(self, 'timeout')
--------------------------------------------------------------------------------
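Note: a sketch of driving TimedSpider from Python; the url value is only an example, and the same arguments can be passed on the command line as scrapy crawl timed -a timeout=120 -a url=example.com.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from testspiders.spiders.timed import TimedSpider

process = CrawlerProcess(get_project_settings())
# 'timeout' is consumed by TimedSpider.__init__; 'url' is forwarded to FollowAllSpider
process.crawl(TimedSpider, timeout=120, url='http://example.com/')
process.start()  # the engine closes the spider with reason 'timeout' after ~120 seconds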
/testspiders/spiders/timewaste.py:
--------------------------------------------------------------------------------
import scrapy


class Spider(scrapy.Spider):
    name = 'timewaste'
    start_urls = ('https://example.com',)

    def __init__(self, **kw):
        self.timeout = int(kw.pop('timeout', '600'))
        super(Spider, self).__init__(**kw)

    def parse(self, response):
        from twisted.internet import reactor, defer
        self.log('I will waste your time for {} seconds'.format(self.timeout))
        dfd = defer.Deferred()
        reactor.callLater(self.timeout, dfd.callback, None)
        return dfd

    def stop(self):
        self.crawler.engine.close_spider(self, 'timeout')
--------------------------------------------------------------------------------