├── .gitignore
├── README.md
├── scrapy.cfg
└── scrapysplashtest
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        └── taobao.py

/.gitignore:
--------------------------------------------------------------------------------
.idea
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ScrapySplashTest

Scraping Taobao product listings with Scrapy and Splash.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = scrapysplashtest.settings

[deploy]
#url = http://localhost:6800/
project = scrapysplashtest
--------------------------------------------------------------------------------
/scrapysplashtest/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/ScrapySplashTest/f568af46f6de1e5b5cfcb73d2fe38ed4cb7828df/scrapysplashtest/__init__.py
--------------------------------------------------------------------------------
/scrapysplashtest/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html


from scrapy import Item, Field


class ProductItem(Item):
    collection = 'products'

    image = Field()
    price = Field()
    deal = Field()
    title = Field()
    shop = Field()
    location = Field()
--------------------------------------------------------------------------------
/scrapysplashtest/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ScrapysplashtestSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/scrapysplashtest/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'), mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # insert_one() replaces the deprecated Collection.insert(), which was removed in pymongo 4.x
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
--------------------------------------------------------------------------------
/scrapysplashtest/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for scrapysplashtest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapysplashtest'

SPIDER_MODULES = ['scrapysplashtest.spiders']
NEWSPIDER_MODULE = 'scrapysplashtest.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'scrapysplashtest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapysplashtest.pipelines.MongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

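# Project-specific settings below (not built-in Scrapy options): KEYWORDS and
# MAX_PAGE are read by TaobaoSpider.start_requests() via self.settings.get();
# SPLASH_URL, DUPEFILTER_CLASS and HTTPCACHE_STORAGE wire in scrapy-splash;
# MONGO_URI and MONGO_DB are consumed by MongoPipeline.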
KEYWORDS = ['iPad']

MAX_PAGE = 100

SPLASH_URL = 'http://localhost:8050'

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

MONGO_URI = 'localhost'
MONGO_DB = 'taobao'
--------------------------------------------------------------------------------
/scrapysplashtest/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapysplashtest/spiders/taobao.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy import Spider
from urllib.parse import quote
from scrapysplashtest.items import ProductItem
from scrapy_splash import SplashRequest

script = """
function main(splash, args)
    splash.images_enabled = false
    assert(splash:go(args.url))
    assert(splash:wait(args.wait))
    js = string.format("document.querySelector('#mainsrp-pager div.form > input').value=%d;document.querySelector('#mainsrp-pager div.form > span.btn.J_Submit').click()", args.page)
    splash:evaljs(js)
    assert(splash:wait(args.wait))
    return splash:html()
end
"""


class TaobaoSpider(Spider):
    name = 'taobao'
    allowed_domains = ['www.taobao.com']
    base_url = 'https://s.taobao.com/search?q='

    def start_requests(self):
        for keyword in self.settings.get('KEYWORDS'):
            for page in range(1, self.settings.get('MAX_PAGE') + 1):
                url = self.base_url + quote(keyword)
                yield SplashRequest(url, callback=self.parse, endpoint='execute',
                                    args={'lua_source': script, 'page': page, 'wait': 7})

    def parse(self, response):
        products = response.xpath(
            '//div[@id="mainsrp-itemlist"]//div[@class="items"][1]//div[contains(@class, "item")]')
        for product in products:
            item = ProductItem()
            item['price'] = ''.join(product.xpath('.//div[contains(@class, "price")]//text()').extract()).strip()
            item['title'] = ''.join(product.xpath('.//div[contains(@class, "title")]//text()').extract()).strip()
            item['shop'] = ''.join(product.xpath('.//div[contains(@class, "shop")]//text()').extract()).strip()
            item['image'] = ''.join(
                product.xpath('.//div[@class="pic"]//img[contains(@class, "img")]/@data-src').extract()).strip()
            item['deal'] = product.xpath('.//div[contains(@class, "deal-cnt")]//text()').extract_first()
            item['location'] = product.xpath('.//div[contains(@class, "location")]//text()').extract_first()
            yield item
--------------------------------------------------------------------------------
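
Running the project requires a Splash instance at SPLASH_URL (http://localhost:8050, e.g. started with `docker run -p 8050:8050 scrapinghub/splash`) and a MongoDB server reachable at MONGO_URI; with both available, `scrapy crawl taobao` from the project root starts the crawl. The snippet below is a minimal, hypothetical sketch (it is not a file in the repository) of launching the same spider programmatically through Scrapy's CrawlerProcess under those assumptions:

# Hypothetical launcher sketch; assumes it is run from the project root so that
# scrapy.cfg and scrapysplashtest.settings are found, Splash is listening on
# SPLASH_URL and MongoDB is reachable at MONGO_URI.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads scrapysplashtest.settings
process.crawl('taobao')                           # spider name from TaobaoSpider.name
process.start()                                   # blocks until the crawl finishes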