├── .env ├── .gitignore ├── .pylintrc ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── Pipfile ├── README.md ├── example ├── jianshu_js_example.py └── request_example.py ├── ruia_pyppeteer ├── __init__.py ├── request.py ├── response.py └── spider.py ├── setup.cfg └── setup.py /.env: -------------------------------------------------------------------------------- 1 | PYTHONPATH=${PYTHONPATH}:${PWD} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | env.bak/ 90 | venv.bak/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | .idea/ 105 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [FORMAT] 2 | max-line-length=160 3 | [MESSAGES CONTROL] 4 | disable=C0103,C0330,W0221,R0913,R0914,R0903,R0902,W1202,W0703,W1203,C0209 5 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [{ 7 | "name": "Python: Current File", 8 | "type": "python", 9 | "request": "launch", 10 | "program": "${file}", 11 | "console": "integratedTerminal" 12 | }] 13 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.sortImports.args": [ 3 | "--settings-path=${workspaceFolder}/setup.cfg" 4 | ], 5 | "python.linting.pylintArgs": [ 6 | "--rcfile=${workspaceFolder}/.pylintrc" 7 | ], 8 | "editor.codeActionsOnSave": { 9 | "source.organizeImports": true 10 | }, 11 | "python.formatting.provider": "black", 12 | "python.linting.pylintEnabled": true, 13 | "python.testing.pytestEnabled": true, 14 | "python.envFile": "${workspaceFolder}/.env", 15 | "python.testing.pytestArgs": [ 16 | "tests" 17 | ], 18 | "python.testing.unittestEnabled": false, 19 | "python.linting.enabled": true, 20 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | ruia = ">=0.6.7" 8 | async-timeout = "*" 9 | cchardet = "*" 10 | pyppeteer = "*" 11 | 12 | [dev-packages] 13 | black = "*" 14 | isort = "*" 15 | pylint = "*" 16 | 17 | [requires] 18 | python_version = "3.6" 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## ruia-pyppeteer 2 | 3 | A [Ruia](https://github.com/howie6879/ruia) plugin for loading javascript 4 | 5 | > Notice: Works on ruia >= 0.8.0 6 | 7 | ### Installation 8 | 9 | ```shell 10 | pip install ruia_pyppeteer 11 | # New features 12 | pip install git+https://github.com/ruia-plugins/ruia-pyppeteer 13 | ``` 14 | 15 | ### Usage 16 | 17 | `ruia_pyppeteer` will load js by using pyppeteer. 18 | 19 | You need to pay attention when you use load_js, it will download a recent version of Chromium (~100MB). This only happens once. 
20 | 21 | **Load JavaScript** 22 | 23 | ```python 24 | import asyncio 25 | 26 | from ruia_pyppeteer import PyppeteerRequest as Request 27 | 28 | request = Request("https://www.jianshu.com/", load_js=True) 29 | response = asyncio.get_event_loop().run_until_complete(request.fetch()) 30 | print(response) 31 | ``` 32 | 33 | **Complete example** 34 | 35 | ```python 36 | from ruia import AttrField, Item, TextField 37 | 38 | from ruia_pyppeteer import PyppeteerSpider as Spider 39 | 40 | 41 | class JianshuItem(Item): 42 | target_item = TextField(css_select="ul.list>li") 43 | author_name = TextField(css_select="a.name") 44 | author_url = AttrField(attr="href", css_select="a.name") 45 | 46 | async def clean_author_name(self, author_name): 47 | return author_name.strip() 48 | 49 | async def clean_author_url(self, author_url): 50 | return f"https://www.jianshu.com{author_url}" 51 | 52 | 53 | class JianshuSpider(Spider): 54 | start_urls = ["https://www.jianshu.com/"] 55 | concurrency = 10 56 | 57 | async def parse(self, response): 58 | html = await response.page.content() 59 | async for item in JianshuItem.get_items(html=html): 60 | # Loading js by using PyppeteerRequest 61 | print(item) 62 | await response.browser.close() 63 | 64 | 65 | if __name__ == "__main__": 66 | JianshuSpider.start() 67 | 68 | ``` 69 | 70 | Enjoy it :) 71 | -------------------------------------------------------------------------------- /example/jianshu_js_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Created by howie.hu at 2018/9/8. 
# example/jianshu_js_example.py
from ruia import AttrField, Item, TextField

from ruia_pyppeteer import PyppeteerSpider as Spider


class JianshuItem(Item):
    """Fields extracted from each ``ul.list>li`` node of the Jianshu front page."""

    # NOTE(review): ``target_item`` is presumably the per-item root selector
    # used by ruia's ``Item.get_items`` -- confirm against the ruia docs.
    target_item = TextField(css_select="ul.list>li")
    author_name = TextField(css_select="a.name")
    author_url = AttrField(attr="href", css_select="a.name")

    async def clean_author_name(self, author_name):
        """Return the author name without surrounding whitespace."""
        return author_name.strip()

    async def clean_author_url(self, author_url):
        """Prefix the extracted href with the Jianshu site root."""
        return f"https://www.jianshu.com{author_url}"


class JianshuSpider(Spider):
    """Example spider that prints every ``JianshuItem`` found on the front page."""

    start_urls = ["https://www.jianshu.com/"]
    concurrency = 10

    async def parse(self, response):
        """Print the items parsed from the JavaScript-rendered page.

        The browser attached to the response is closed in ``finally`` so that
        it is released even when reading or parsing the page raises.
        """
        try:
            # The page content is taken from the pyppeteer page, i.e. after
            # the browser has loaded the URL.
            html = await response.page.content()
            async for item in JianshuItem.get_items(html=html):
                print(item)
        finally:
            await response.browser.close()


if __name__ == "__main__":
    JianshuSpider.start()
# example/request_example.py
import asyncio

from ruia_pyppeteer import PyppeteerRequest as Request

# Extra Chromium command-line flags, e.g. ["--proxy-server=127.0.0.1:1087"]
# to route traffic through a local proxy.
pyppeteer_args = []
# Options handed to pyppeteer's ``page.goto``.
pyppeteer_page_options = {
    "waitUntil": "networkidle0",
}


async def load_js_script():
    """Fetch the Jianshu front page in a browser and print its viewport metrics.

    The module-level ``pyppeteer_args`` and ``pyppeteer_page_options`` are
    passed to the request so that editing them actually changes the run.
    The browser is closed once the page has been evaluated.
    """
    request = Request(
        "https://www.jianshu.com/",
        pyppeteer_args=pyppeteer_args,
        pyppeteer_page_options=pyppeteer_page_options,
    )
    response = await request.fetch()
    try:
        # A screenshot could be taken here with
        # ``response.page.screenshot(path="example.png")``.
        dimensions = await response.page.evaluate(
            """() => {
                return {
                    width: document.documentElement.clientWidth,
                    height: document.documentElement.clientHeight,
                    deviceScaleFactor: window.devicePixelRatio,
                }
            }"""
        )
        print(dimensions)
    finally:
        # Release the browser even if evaluating the page fails.
        await response.browser.close()


if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(load_js_script())
# ruia_pyppeteer/request.py
import asyncio

from typing import Optional

import async_timeout
import pyppeteer

from ruia import Request
from ruia.response import Response

from ruia_pyppeteer.response import PyppeteerResponse


class PyppeteerRequest(Request):
    """A ruia ``Request`` that can load the URL in a pyppeteer browser.

    With ``load_js=True`` (the default) the URL is opened in a browser page
    and a ``PyppeteerResponse`` carrying the page and browser is returned.
    With ``load_js=False`` the request goes through the parent class's
    ``_make_request`` and a plain ``Response`` is returned.
    """

    def __init__(
        self,
        url: str,
        method: str = "GET",
        *,
        callback=None,
        encoding: Optional[str] = None,
        headers: Optional[dict] = None,
        metadata: Optional[dict] = None,
        request_config: Optional[dict] = None,
        request_session=None,
        load_js: bool = True,
        pyppeteer_args: Optional[list] = None,
        pyppeteer_launch_options: Optional[dict] = None,
        pyppeteer_page_options: Optional[dict] = None,
        pyppeteer_viewport: Optional[dict] = None,
        close_pyppeteer_browser=False,
        **kwargs,
    ):
        """Store the pyppeteer settings and initialise the parent request.

        :param load_js: open the URL in a browser instead of a plain request.
        :param pyppeteer_args: extra command-line flags for the browser.
        :param pyppeteer_launch_options: passed as ``options`` to ``pyppeteer.launch``.
        :param pyppeteer_page_options: passed as ``options`` to ``page.goto``.
        :param pyppeteer_viewport: viewport for the page; defaults to 1080x900.
        :param close_pyppeteer_browser: close the browser when ``fetch`` finishes.

        All other parameters are forwarded unchanged to the parent class.
        """
        super().__init__(
            url,
            method,
            callback=callback,
            encoding=encoding,
            headers=headers,
            metadata=metadata,
            request_config=request_config,
            request_session=request_session,
            **kwargs,
        )
        self.load_js = load_js
        # Copy the containers so that nothing done later by this request can
        # leak back into objects owned by the caller.
        self.pyppeteer_args = list(pyppeteer_args or [])
        self.pyppeteer_launch_options = dict(pyppeteer_launch_options or {})
        self.pyppeteer_page_options = dict(pyppeteer_page_options or {})
        self.pyppeteer_viewport = dict(
            pyppeteer_viewport or {"width": 1080, "height": 900}
        )
        self.close_pyppeteer_browser = close_pyppeteer_browser

    async def fetch(self, delay=True) -> Response:
        """Fetch the URL with pyppeteer (``load_js``) or a plain request.

        :param delay: honour the ``DELAY`` entry of ``request_config`` before
            fetching.
        :return: a ``PyppeteerResponse`` when ``load_js`` is set, otherwise a
            ``Response``; on a non-ok status or a timeout, whatever
            ``self._retry`` returns.
        """
        if delay and self.request_config.get("DELAY", 0) > 0:
            await asyncio.sleep(self.request_config["DELAY"])

        timeout = self.request_config.get("TIMEOUT", 10)
        try:
            if self.load_js:
                if not hasattr(self, "browser"):
                    # Build the launch flags locally instead of extending
                    # self.pyppeteer_args, so relaunching never duplicates them.
                    launch_args = [
                        *self.pyppeteer_args,
                        "--no-sandbox",
                        "--disable-infobars",
                    ]
                    self.browser = await pyppeteer.launch(
                        headless=True,
                        args=launch_args,
                        options=self.pyppeteer_launch_options,
                        dumpio=True,
                    )
                page = await self.browser.newPage()
                # TIMEOUT is configured in seconds; page.goto is given
                # milliseconds. It always overrides a user-supplied "timeout".
                page_options = {
                    **self.pyppeteer_page_options,
                    "timeout": int(timeout * 1000),
                }

                resp = await page.goto(self.url, options=page_options)
                await page.setViewport(self.pyppeteer_viewport)

                response = PyppeteerResponse(
                    url=self.url,
                    method=self.method,
                    encoding=self.encoding,
                    page=page,
                    browser=self.browser,
                    metadata=self.metadata,
                    cookies=await page.cookies(),
                    headers=resp.headers,
                    history=(),
                    status=resp.status,
                    aws_json=resp.json,
                    aws_text=resp.text,
                    aws_read=resp.buffer,
                )
            else:
                async with async_timeout.timeout(timeout):
                    resp = await self._make_request()
                response = Response(
                    url=self.url,
                    method=self.method,
                    encoding=resp.get_encoding(),
                    metadata=self.metadata,
                    cookies=resp.cookies,
                    headers=resp.headers,
                    history=resp.history,
                    status=resp.status,
                    aws_json=resp.json,
                    aws_text=resp.text,
                    aws_read=resp.read,
                )
            if not response.ok:
                return await self._retry(
                    error_msg=f"Request url failed with status {response.status}!"
                )
            return response
        except asyncio.TimeoutError:
            # Retry for timeout
            return await self._retry("timeout")
        finally:
            # Close client session
            if not self.load_js:
                await self._close_request()
            if self.close_pyppeteer_browser:
                # The browser is missing when load_js is False or the launch
                # itself failed. Dropping the attribute before closing lets a
                # later fetch relaunch instead of reusing a closed browser,
                # and keeps an outer fetch from closing it a second time.
                browser = getattr(self, "browser", None)
                if browser is not None:
                    del self.browser
                    await browser.close()


# ruia_pyppeteer/response.py
from typing import Callable, Optional

from ruia.response import Response


class PyppeteerResponse(Response):
    """A ruia ``Response`` that also exposes the pyppeteer page and browser."""

    def __init__(
        self,
        url: str,
        method: str,
        *,
        encoding: str = "",
        page,
        browser,
        metadata: dict,
        cookies,
        history,
        headers: Optional[dict] = None,
        status: int = -1,
        aws_json: Optional[Callable] = None,
        aws_read: Optional[Callable] = None,
        aws_text: Optional[Callable] = None,
    ):
        """Initialise the parent response and remember ``page`` and ``browser``."""
        super().__init__(
            url=url,
            method=method,
            encoding=encoding,
            metadata=metadata,
            cookies=cookies,
            history=history,
            headers=headers,
            status=status,
            aws_json=aws_json,
            aws_read=aws_read,
            aws_text=aws_text,
        )
        self._page = page
        self._browser = browser

    @property
    def page(self):
        """The pyppeteer page the URL was loaded in."""
        return self._page

    @property
    def browser(self):
        """The pyppeteer browser that owns ``page``."""
        return self._browser

    async def json(self, **kwargs):
        """Read and decodes JSON response."""
        return await self._aws_json(**kwargs)

    async def read(self, **kwargs):
        """Read response payload."""
        return await self._aws_read(**kwargs)

    async def text(self, **kwargs):
        """Read response payload and decode."""
        return await self._aws_text(**kwargs)

    def __str__(self):
        """Return a short description instead of an empty string."""
        # NOTE(review): relies on the ``url``, ``method`` and ``status``
        # properties of ruia's Response -- confirm for the supported versions.
        return (
            f"<PyppeteerResponse url[{self.method}]: {self.url} "
            f"status:{self.status}>"
        )
# ruia_pyppeteer/spider.py
from typing import Optional

from ruia import Spider

from ruia_pyppeteer.request import PyppeteerRequest as Request


class PyppeteerSpider(Spider):
    """A ruia ``Spider`` whose requests are ``PyppeteerRequest`` objects.

    The class attributes below are spider-wide pyppeteer settings that
    ``request`` combines with the per-request arguments.
    """

    # Spider-wide settings; ``request`` reads them but never modifies them.
    pyppeteer_args = []
    pyppeteer_launch_options = {}
    pyppeteer_page_options = {}
    pyppeteer_viewport = {}
    close_pyppeteer_browser = False

    def request(
        self,
        url: str,
        method: str = "GET",
        *,
        callback=None,
        encoding: Optional[str] = None,
        headers: Optional[dict] = None,
        metadata: Optional[dict] = None,
        request_config: Optional[dict] = None,
        request_session=None,
        load_js: bool = True,
        pyppeteer_args: Optional[list] = None,
        pyppeteer_launch_options: Optional[dict] = None,
        pyppeteer_page_options: Optional[dict] = None,
        pyppeteer_viewport: Optional[dict] = None,
        close_pyppeteer_browser: Optional[bool] = None,
        **kwargs
    ):
        """Build a ``PyppeteerRequest`` combining request and spider settings.

        For every dict setting the spider-level value wins over the
        per-request value on a key clash. Spider-level ``pyppeteer_args`` are
        appended after the per-request ones. The merged containers are new
        objects, so the caller's arguments are left untouched.

        :param close_pyppeteer_browser: ``None`` (default) uses the spider's
            ``close_pyppeteer_browser``; an explicit ``True``/``False`` is
            honoured as given.
        :param kwargs: forwarded to ``PyppeteerRequest``.
        :return: the configured ``PyppeteerRequest``.
        """
        merged_headers = {**(headers or {}), **self.headers}
        merged_config = {**(request_config or {}), **self.request_config}
        merged_args = [*(pyppeteer_args or []), *self.pyppeteer_args]
        merged_launch_options = {
            **(pyppeteer_launch_options or {}),
            **self.pyppeteer_launch_options,
        }
        merged_page_options = {
            **(pyppeteer_page_options or {}),
            **self.pyppeteer_page_options,
        }
        merged_viewport = {**(pyppeteer_viewport or {}), **self.pyppeteer_viewport}

        # ``or`` would turn an explicit False back into the spider default.
        if close_pyppeteer_browser is None:
            close_pyppeteer_browser = self.close_pyppeteer_browser

        return Request(
            url,
            method,
            callback=callback,
            encoding=encoding,
            headers=merged_headers,
            metadata=metadata or {},
            request_config=merged_config,
            request_session=request_session or self.request_session,
            load_js=load_js,
            pyppeteer_args=merged_args,
            pyppeteer_launch_options=merged_launch_options,
            pyppeteer_page_options=merged_page_options,
            pyppeteer_viewport=merged_viewport,
            close_pyppeteer_browser=close_pyppeteer_browser,
            **kwargs
        )
# setup.py
# Release steps:
# - python setup.py sdist
# - twine upload dist/*

import os

from setuptools import find_packages, setup


def read(file_name):
    """Return the text of ``file_name`` located next to this script.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the locale of the machine running the build.
    """
    path = os.path.join(os.path.dirname(__file__), file_name)
    with open(path, encoding="utf-8") as f:
        return f.read()


setup(
    name="ruia_pyppeteer",
    version="0.0.8",
    author="Howie Hu",
    description="ruia_pyppeteer - A Ruia plugin for loading javascript - pyppeteer.",
    long_description=read("README.md"),
    long_description_content_type="text/markdown",
    author_email="xiaozizayang@gmail.com",
    # async-timeout is imported directly by ruia_pyppeteer/request.py, so it
    # is declared here rather than relied on as a transitive dependency.
    install_requires=["ruia>=0.6.7", "pyppeteer", "async-timeout"],
    url="https://github.com/ruia-plugins/ruia-pyppeteer",
    packages=find_packages(),
    license="MIT",
    classifiers=[
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    project_urls={
        "Documentation": "https://github.com/python-ruia/ruia-pyppeteer",
        "Source": "https://github.com/python-ruia/ruia-pyppeteer",
    },
)