├── .gitignore
├── README.md
├── quotes.js.disable.html
├── quotes.js.enable.html
├── scrapy.cfg
├── scrapypyppeteer
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── quotes.py
│       └── taobao.py
└── taobao.html

/.gitignore:
--------------------------------------------------------------------------------
/.idea
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ScrapyPyppeteer
Scrapy Pyppeteer Demo

## Run

```
scrapy crawl quotes
```
--------------------------------------------------------------------------------
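Note: the PyppeteerMiddleware defined later in this project only renders a request through Pyppeteer when that request carries a truthy `render` key in its meta; requests without it are downloaded by Scrapy as usual, and the bundled quotes and taobao spiders do not set the flag themselves. A minimal sketch of a spider opting into rendering (a hypothetical example, not a file in this repository):

```
# -*- coding: utf-8 -*-
# Hypothetical spider: opts into Pyppeteer rendering via the middleware's
# `render` meta flag (see scrapypyppeteer/middlewares.py).
import scrapy


class RenderedQuotesSpider(scrapy.Spider):
    name = 'quotes_rendered'  # hypothetical name, not one of the project's spiders
    start_urls = ['http://quotes.toscrape.com/js/']

    def start_requests(self):
        for url in self.start_urls:
            # Without meta={'render': True} the middleware passes the request
            # through unchanged and the JS-injected quotes never appear.
            yield scrapy.Request(url, callback=self.parse, meta={'render': True})

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
            }
```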
/quotes.js.disable.html:
--------------------------------------------------------------------------------
[Saved snapshot of http://quotes.toscrape.com/js/ fetched without JavaScript
execution: only the page chrome is present (the "Quotes to Scrape" header, a
Login link and the page's script tags); no quotes are rendered. The file's
markup did not survive this dump, so only this summary is kept.]
--------------------------------------------------------------------------------
/quotes.js.enable.html:
--------------------------------------------------------------------------------
[Saved snapshot of the same page after JavaScript execution, as written out by
the quotes spider: the page header and Login link plus the ten quotes injected
by the page script. The markup did not survive this dump; the visible quote
text is kept below.]
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” by Albert Einstein
“It is our choices, Harry, that show what we truly are, far more than our abilities.” by J.K. Rowling
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” by Albert Einstein
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.” by Jane Austen
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.” by Marilyn Monroe
“Try not to become a man of success. Rather become a man of value.” by Albert Einstein
“It is better to be hated for what you are than to be loved for what you are not.” by André Gide
Tags: life love
“I have not failed. I've just found 10,000 ways that won't work.” by Thomas A. Edison
“A woman is like a tea bag; you never know how strong it is until it's in hot water.” by Eleanor Roosevelt
“A day without sunshine is like, you know, night.” by Steve Martin
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = scrapypyppeteer.settings

[deploy]
#url = http://localhost:6800/
project = scrapypyppeteer

--------------------------------------------------------------------------------
/scrapypyppeteer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/ScrapyPyppeteerDeprecated/f64eece1e04a42265a95fff65a1261ae2c10af69/scrapypyppeteer/__init__.py
--------------------------------------------------------------------------------
/scrapypyppeteer/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapypyppeteerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

--------------------------------------------------------------------------------
/scrapypyppeteer/middlewares.py:
--------------------------------------------------------------------------------
import websockets
from scrapy.http import HtmlResponse
from logging import getLogger
import asyncio
import pyppeteer
import logging
from concurrent.futures._base import TimeoutError

pyppeteer_level = logging.WARNING
logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
logging.getLogger('pyppeteer').setLevel(pyppeteer_level)


class PyppeteerMiddleware():
    def __init__(self, **args):
        """
        init logger, loop, browser
        :param args: middleware arguments, taken from the PYPPETEER_ARGS setting
        """
        self.logger = getLogger(__name__)
        self.loop = asyncio.get_event_loop()
        self.browser = self.loop.run_until_complete(
            pyppeteer.launch(headless=True))
        self.args = args

    def __del__(self):
        """
        close loop
        :return:
        """
        self.loop.close()

    def render(self, url, retries=1, script=None, wait=0.3, scrolldown=False, sleep=0,
               timeout=8.0, keep_page=False):
        """
        render page with pyppeteer
        :param url: page url
        :param retries: max retry times
        :param script: js script to evaluate in the page
        :param wait: number of seconds to wait before loading the page, preventing timeouts
        :param scrolldown: how many times to page down
        :param sleep: how long to sleep after the initial render and between scrolls
        :param timeout: the longest wait time, otherwise raise a timeout error
        :param keep_page: keep the page open instead of closing it after rendering
        :return: content, result, status
        """

        # define async render
        async def async_render(url, script, scrolldown, sleep, wait, timeout, keep_page):
            try:
                # basic render
                page = await self.browser.newPage()
                await asyncio.sleep(wait)
                response = await page.goto(url, options={'timeout': int(timeout * 1000)})
                if response.status != 200:
                    return None, None, response.status
                result = None
                # evaluate with script
                if script:
                    result = await page.evaluate(script)

                # scroll down for {scrolldown} times
                if scrolldown:
                    for _ in range(scrolldown):
                        await page._keyboard.down('PageDown')
                        await asyncio.sleep(sleep)
                else:
                    await asyncio.sleep(sleep)
                if scrolldown:
                    await page._keyboard.up('PageDown')

                # get html of page
                content = await page.content()

                return content, result, response.status
            except TimeoutError:
                return None, None, 500
            finally:
                # if keep page, do not close it
                if not keep_page:
                    await page.close()

        content, result, status = [None] * 3

        # retry for {retries} times
        for i in range(retries):
            if not content:
                content, result, status = self.loop.run_until_complete(
                    async_render(url=url, script=script, sleep=sleep, wait=wait,
                                 scrolldown=scrolldown, timeout=timeout, keep_page=keep_page))
            else:
                break

        # return the html, the js evaluation result and the http status
        return content, result, status

    def process_request(self, request, spider):
        """
        :param request: request object
        :param spider: spider object
        :return: HtmlResponse
        """
        if request.meta.get('render'):
            try:
                self.logger.debug('rendering %s', request.url)
                html, result, status = self.render(request.url)
                return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8',
                                    status=status)
            except websockets.exceptions.ConnectionClosed:
                pass

    @classmethod
    def from_crawler(cls, crawler):
        return cls(**crawler.settings.get('PYPPETEER_ARGS', {}))
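The `render()` method can also be exercised on its own, outside `process_request()`; a rough sketch of standalone use with the documented options (an assumption about constructing the class directly, not code from the repository):

```
# Hypothetical standalone use of PyppeteerMiddleware.render(); constructing the
# middleware launches a headless browser, so running this hits the network.
from scrapypyppeteer.middlewares import PyppeteerMiddleware

if __name__ == '__main__':
    mw = PyppeteerMiddleware()
    content, result, status = mw.render(
        'http://quotes.toscrape.com/js/',
        script='() => document.title',  # evaluated in the page, returned as `result`
        scrolldown=2,                   # press PageDown twice
        sleep=1,                        # seconds to sleep after the render / between scrolls
        timeout=8.0,                    # navigation timeout in seconds
    )
    print(status, result, len(content or ''))
```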
--------------------------------------------------------------------------------
/scrapypyppeteer/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ScrapypyppeteerPipeline(object):
    def process_item(self, item, spider):
        return item

--------------------------------------------------------------------------------
/scrapypyppeteer/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for scrapypyppeteer project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapypyppeteer'

SPIDER_MODULES = ['scrapypyppeteer.spiders']
NEWSPIDER_MODULE = 'scrapypyppeteer.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'scrapypyppeteer (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'scrapypyppeteer.middlewares.ScrapypyppeteerSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapypyppeteer.middlewares.PyppeteerMiddleware': 543,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'scrapypyppeteer.pipelines.ScrapypyppeteerPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


PYPPETEER_ARGS = {
    'timeout': 8
}

LOG_LEVEL = 'INFO'
--------------------------------------------------------------------------------
/scrapypyppeteer/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/scrapypyppeteer/spiders/quotes.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/js/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags > a.tag::text').extract()
            }
        with open('quotes.js.enable.html', 'w', encoding='utf-8') as f:
            f.write(response.text)

--------------------------------------------------------------------------------
/scrapypyppeteer/spiders/taobao.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy import Spider, Request


class TaobaoSpider(Spider):
    name = 'taobao'
    allowed_domains = ['s.taobao.com']
    start_url = 'http://s.taobao.com/search?q={keyword}'
    keywords = ['ipad']

    def start_requests(self):
        for keyword in self.keywords:
            url = self.start_url.format(keyword=keyword)
            yield Request(url, callback=self.parse_list)

    def parse_list(self, response):
        with open('taobao.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
--------------------------------------------------------------------------------
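For completeness, the spiders can also be driven from a plain Python script instead of the `scrapy crawl` command; a minimal sketch using Scrapy's CrawlerProcess (an assumed usage pattern, not part of the repository):

```
# Hypothetical run script (e.g. run.py placed at the project root, next to scrapy.cfg).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # picks up settings.py
    process.crawl('quotes')  # spider name as defined in spiders/quotes.py
    process.start()          # blocks until the crawl finishes
```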