├── .gitignore
├── README.md
├── quotes.js.disable.html
├── quotes.js.enable.html
├── scrapy.cfg
├── scrapypyppeteer
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── quotes.py
│       └── taobao.py
└── taobao.html
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea
2 | *.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ScrapyPyppeteer
2 | A demo of rendering JavaScript pages in Scrapy with Pyppeteer via a downloader middleware.
3 |
4 | ## Run
5 |
6 | ```
7 | scrapy crawl quotes
8 | ```
9 |
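10 | ## Notes
11 | 
12 | The `PyppeteerMiddleware` is already enabled in `scrapypyppeteer/settings.py`
13 | via `DOWNLOADER_MIDDLEWARES`. It only renders requests whose `meta` contains a
14 | truthy `render` key; other requests go through Scrapy's default downloader.
15 | A spider would opt in per request roughly like this (illustrative sketch, not
16 | part of the bundled spiders):
17 | 
18 | ```
19 | yield scrapy.Request(url, callback=self.parse, meta={'render': True})
20 | ```
21 | 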
--------------------------------------------------------------------------------
/quotes.js.disable.html:
--------------------------------------------------------------------------------
[Saved HTML of the quotes page with JavaScript disabled: page title "Quotes to Scrape", no quote content rendered; markup omitted from this dump.]
--------------------------------------------------------------------------------
/quotes.js.enable.html:
--------------------------------------------------------------------------------
[Saved HTML of the quotes page with JavaScript enabled: page title "Quotes to Scrape"; rendered quote text below, markup omitted from this dump.]

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” by Albert Einstein
“It is our choices, Harry, that show what we truly are, far more than our abilities.” by J.K. Rowling
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” by Albert Einstein
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.” by Jane Austen
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.” by Marilyn Monroe
“Try not to become a man of success. Rather become a man of value.” by Albert Einstein
“It is better to be hated for what you are than to be loved for what you are not.” by André Gide
“I have not failed. I've just found 10,000 ways that won't work.” by Thomas A. Edison
“A woman is like a tea bag; you never know how strong it is until it's in hot water.” by Eleanor Roosevelt
“A day without sunshine is like, you know, night.” by Steve Martin
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = scrapypyppeteer.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = scrapypyppeteer
12 |
--------------------------------------------------------------------------------
/scrapypyppeteer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/ScrapyPyppeteerDeprecated/f64eece1e04a42265a95fff65a1261ae2c10af69/scrapypyppeteer/__init__.py
--------------------------------------------------------------------------------
/scrapypyppeteer/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class ScrapypyppeteerItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/scrapypyppeteer/middlewares.py:
--------------------------------------------------------------------------------
1 | import websockets
2 | from scrapy.http import HtmlResponse
3 | from logging import getLogger
4 | import asyncio
5 | import pyppeteer
6 | import logging
7 | from concurrent.futures._base import TimeoutError
8 |
9 | pyppeteer_level = logging.WARNING
10 | logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
11 | logging.getLogger('pyppeteer').setLevel(pyppeteer_level)
12 |
13 |
14 | class PyppeteerMiddleware():
15 | def __init__(self, **args):
16 | """
17 | init logger, loop, browser
18 | :param args:
19 | """
20 | self.logger = getLogger(__name__)
21 | self.loop = asyncio.get_event_loop()
22 | self.browser = self.loop.run_until_complete(
23 | pyppeteer.launch(headless=True))
24 | self.args = args
25 |
26 | def __del__(self):
27 | """
28 | close loop
29 | :return:
30 | """
31 | self.loop.close()
32 |
33 | def render(self, url, retries=1, script=None, wait=0.3, scrolldown=False, sleep=0,
34 | timeout=8.0, keep_page=False):
35 | """
36 | render page with pyppeteer
37 | :param url: page url
38 | :param retries: max retry times
39 | :param script: js script to evaluate
40 |         :param wait: number of seconds to wait after opening a new page, before loading the url
41 |         :param scrolldown: how many times to page down
42 |         :param sleep: how long to sleep after the initial render
43 |         :param timeout: the longest wait time, otherwise raise timeout error
44 |         :param keep_page: keep the page open instead of closing it after rendering
45 |         (rendering always uses the shared self.browser instance)
46 |         (the js evaluation result, if any, is returned alongside the page content)
47 |         :return: content, result, status
48 | """
49 |
50 | # define async render
51 | async def async_render(url, script, scrolldown, sleep, wait, timeout, keep_page):
52 | try:
53 | # basic render
54 | page = await self.browser.newPage()
55 | await asyncio.sleep(wait)
56 | response = await page.goto(url, options={'timeout': int(timeout * 1000)})
57 | if response.status != 200:
58 | return None, None, response.status
59 | result = None
60 | # evaluate with script
61 | if script:
62 | result = await page.evaluate(script)
63 |
64 | # scroll down for {scrolldown} times
65 | if scrolldown:
66 | for _ in range(scrolldown):
67 | await page._keyboard.down('PageDown')
68 | await asyncio.sleep(sleep)
69 | else:
70 | await asyncio.sleep(sleep)
71 | if scrolldown:
72 | await page._keyboard.up('PageDown')
73 |
74 | # get html of page
75 | content = await page.content()
76 |
77 | return content, result, response.status
78 | except TimeoutError:
79 | return None, None, 500
80 | finally:
81 | # if keep page, do not close it
82 | if not keep_page:
83 | await page.close()
84 |
85 | content, result, status = [None] * 3
86 |
87 | # retry for {retries} times
88 | for i in range(retries):
89 | if not content:
90 | content, result, status = self.loop.run_until_complete(
91 | async_render(url=url, script=script, sleep=sleep, wait=wait,
92 | scrolldown=scrolldown, timeout=timeout, keep_page=keep_page))
93 | else:
94 | break
95 |
96 | # if need to return js evaluation result
97 | return content, result, status
98 |
99 | def process_request(self, request, spider):
100 | """
101 | :param request: request object
102 | :param spider: spider object
103 | :return: HtmlResponse
104 | """
105 | if request.meta.get('render'):
106 | try:
107 | self.logger.debug('rendering %s', request.url)
108 | html, result, status = self.render(request.url)
109 | return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8',
110 | status=status)
111 | except websockets.exceptions.ConnectionClosed:
112 | pass
113 |
114 | @classmethod
115 | def from_crawler(cls, crawler):
116 | return cls(**crawler.settings.get('PYPPETEER_ARGS', {}))
117 |
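118 | # Note: rendering is opt-in. process_request only returns a rendered
119 | # HtmlResponse for requests whose meta contains a truthy 'render' key; all
120 | # other requests fall through to Scrapy's default downloader.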
--------------------------------------------------------------------------------
/scrapypyppeteer/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class ScrapypyppeteerPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/scrapypyppeteer/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for scrapypyppeteer project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'scrapypyppeteer'
13 |
14 | SPIDER_MODULES = ['scrapypyppeteer.spiders']
15 | NEWSPIDER_MODULE = 'scrapypyppeteer.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'scrapypyppeteer (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | # DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | # DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | # }
45 |
46 | # Enable or disable spider middlewares
47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
48 | # SPIDER_MIDDLEWARES = {
49 | # 'scrapypyppeteer.middlewares.ScrapypyppeteerSpiderMiddleware': 543,
50 | # }
51 |
52 | # Enable or disable downloader middlewares
53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
54 | DOWNLOADER_MIDDLEWARES = {
55 | 'scrapypyppeteer.middlewares.PyppeteerMiddleware': 543,
56 | }
57 |
58 | # Enable or disable extensions
59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
60 | # EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | # }
63 |
64 | # Configure item pipelines
65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
66 | # ITEM_PIPELINES = {
67 | # 'scrapypyppeteer.pipelines.ScrapypyppeteerPipeline': 300,
68 | # }
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
72 | # AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | # AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | # AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | # AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | # HTTPCACHE_ENABLED = True
86 | # HTTPCACHE_EXPIRATION_SECS = 0
87 | # HTTPCACHE_DIR = 'httpcache'
88 | # HTTPCACHE_IGNORE_HTTP_CODES = []
89 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 |
91 |
92 | PYPPETEER_ARGS = {
93 | 'timeout': 8
94 | }
95 |
96 | LOG_LEVEL = 'INFO'
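97 | 
98 | # PYPPETEER_ARGS is passed as keyword arguments to PyppeteerMiddleware
99 | # through its from_crawler classmethod (see middlewares.py).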
--------------------------------------------------------------------------------
/scrapypyppeteer/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapypyppeteer/spiders/quotes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 |
5 | class QuotesSpider(scrapy.Spider):
6 | name = 'quotes'
7 | allowed_domains = ['quotes.toscrape.com']
8 | start_urls = ['http://quotes.toscrape.com/js/']
9 |
10 | def parse(self, response):
11 | for quote in response.css('div.quote'):
12 | yield {
13 | 'text': quote.css('span.text::text').extract_first(),
14 | 'author': quote.css('small.author::text').extract_first(),
15 | 'tags': quote.css('div.tags > a.tag::text').extract()
16 | }
17 | with open('quotes.js.enable.html', 'w', encoding='utf-8') as f:
18 | f.write(response.text)
19 |
--------------------------------------------------------------------------------
/scrapypyppeteer/spiders/taobao.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Spider, Request
3 |
4 |
5 | class TaobaoSpider(Spider):
6 | name = 'taobao'
7 | allowed_domains = ['s.taobao.com']
8 | start_url = 'http://s.taobao.com/search?q={keyword}'
9 | keywords = ['ipad']
10 |
11 | def start_requests(self):
12 | for keyword in self.keywords:
13 | url = self.start_url.format(keyword=keyword)
14 | yield Request(url, callback=self.parse_list)
15 |
16 | def parse_list(self, response):
17 | with open('taobao.html', 'w', encoding='utf-8') as f:
18 | f.write(response.text)
19 |
--------------------------------------------------------------------------------