├── .gitattributes ├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── Pipfile ├── Pipfile.lock ├── README.rst ├── docs ├── Makefile ├── make.bat └── source │ ├── _static │ └── requests-html-logo.png │ ├── _templates │ ├── hacks.html │ ├── sidebarintro.html │ └── sidebarlogo.html │ ├── conf.py │ └── index.rst ├── ext └── requests-html-logo.ai ├── pytest.ini ├── requests_html.py ├── setup.py └── tests ├── python.html ├── test_internet.py └── test_requests_html.py /.gitattributes: -------------------------------------------------------------------------------- 1 | docs/source/_templates/*.html linguist-vendored 2 | tests/*.html linguist-vendored 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.DS_Store 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | .static_storage/ 58 | .media/ 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # Visual Studio Code 109 | .vscode -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.7" 5 | - "3.8" 6 | - "3.9-dev" 7 | 8 | matrix: 9 | allow_failures: 10 | - python: "3.9-dev" 11 | 12 | # command to install dependencies 13 | install: 14 | - "pip install pipenv --upgrade-strategy=only-if-needed" 15 | - "pipenv install --dev" 16 | 17 | # command to run tests 18 | script: 19 | - "pipenv run tests" 20 | 21 | # optional Flake8 lint stage (currently disabled) 22 | # jobs: 23 | # include: 24 | # - stage: "✨ Flake8 Nit–Picking ✨" 25 | # python: "3.6" 26 | # script: "pipenv run flake8" 27 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright 2018 Kenneth Reitz 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | documentation: 2 | cd docs && make html 3 | cd docs/build/html && git add -A && git commit -m 'updates' 4 | cd docs/build/html && git push origin gh-pages 5 | 6 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | requests = "*" 8 | pyquery = "*" 9 | fake-useragent = "*" 10 | parse = "*" 11 | "bs4" = "*" 12 | "w3lib" = "*" 13 | pyppeteer = "*" 14 | "rfc3986" = "*" 15 | 16 | [dev-packages] 17 | twine = "*" 18 | requests-file = "*" 19 | pytest = "*" 20 | e1839a8 = {path = ".",editable = true} 21 | sphinx = "*" 22 | mypy = "*" 23 | pytest-asyncio = "*" 24 | white = "*" 25 | 26 | [scripts] 27 | tests = "pytest -v -m 'not internet' " 28 | 29 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Requests-HTML: HTML Parsing for Humans™ 2 | ======================================= 3 | 4 | .. image:: https://farm5.staticflickr.com/4695/39152770914_a3ab8af40d_k_d.jpg 5 | 6 | .. image:: https://travis-ci.com/psf/requests-html.svg?branch=master 7 | :target: https://travis-ci.com/psf/requests-html 8 | 9 | This library intends to make parsing HTML (e.g. scraping the web) as 10 | simple and intuitive as possible. 11 | 12 | When using this library you automatically get: 13 | 14 | - **Full JavaScript support**! (Using Chromium, thanks to pyppeteer) 15 | - *CSS Selectors* (a.k.a jQuery-style, thanks to PyQuery). 16 | - *XPath Selectors*, for the faint of heart. 17 | - Mocked user-agent (like a real web browser). 
18 | - Automatic following of redirects. 19 | - Connection–pooling and cookie persistence. 20 | - The Requests experience you know and love, with magical parsing abilities. 21 | - **Async Support** 22 | 23 | .. Other nice features include: 24 | 25 | - Markdown export of pages and elements. 26 | 27 | 28 | Tutorial & Usage 29 | ================ 30 | 31 | Make a GET request to 'python.org', using Requests: 32 | 33 | .. code-block:: pycon 34 | 35 | >>> from requests_html import HTMLSession 36 | >>> session = HTMLSession() 37 | >>> r = session.get('https://python.org/') 38 | 39 | Try async and get some sites at the same time: 40 | 41 | .. code-block:: pycon 42 | 43 | >>> from requests_html import AsyncHTMLSession 44 | >>> asession = AsyncHTMLSession() 45 | >>> async def get_pythonorg(): 46 | ... r = await asession.get('https://python.org/') 47 | ... return r 48 | ... 49 | >>> async def get_reddit(): 50 | ... r = await asession.get('https://reddit.com/') 51 | ... return r 52 | ... 53 | >>> async def get_google(): 54 | ... r = await asession.get('https://google.com/') 55 | ... return r 56 | ... 57 | >>> results = asession.run(get_pythonorg, get_reddit, get_google) 58 | >>> results # check the requests all returned a 200 (success) code 59 | [<Response [200]>, <Response [200]>, <Response [200]>] 60 | >>> # Each item in the results list is a response object and can be interacted with as such 61 | >>> for result in results: 62 | ... print(result.html.url) 63 | ... 64 | https://www.python.org/ 65 | https://www.google.com/ 66 | https://www.reddit.com/ 67 | 68 | Note that the objects in the results list appear in the order the responses were returned, not in the order the coroutines were passed to the ``run`` method; this is why the order differs in the example above. 69 | 70 | Grab a list of all links on the page, as–is (anchors excluded): 71 | 72 | .. 
code-block:: pycon 73 | 74 | >>> r.html.links 75 | {'//docs.python.org/3/tutorial/', '/about/apps/', 'https://github.com/python/pythondotorg/issues', '/accounts/login/', '/dev/peps/', '/about/legal/', '//docs.python.org/3/tutorial/introduction.html#lists', '/download/alternatives', 'http://feedproxy.google.com/~r/PythonInsider/~3/kihd2DW98YY/python-370a4-is-available-for-testing.html', '/download/other/', '/downloads/windows/', 'https://mail.python.org/mailman/listinfo/python-dev', '/doc/av', 'https://devguide.python.org/', '/about/success/#engineering', 'https://wiki.python.org/moin/PythonEventsCalendar#Submitting_an_Event', 'https://www.openstack.org', '/about/gettingstarted/', 'http://feedproxy.google.com/~r/PythonInsider/~3/AMoBel8b8Mc/python-3.html', '/success-stories/industrial-light-magic-runs-python/', 'http://docs.python.org/3/tutorial/introduction.html#using-python-as-a-calculator', '/', 'http://pyfound.blogspot.com/', '/events/python-events/past/', '/downloads/release/python-2714/', 'https://wiki.python.org/moin/PythonBooks', 'http://plus.google.com/+Python', 'https://wiki.python.org/moin/', 'https://status.python.org/', '/community/workshops/', '/community/lists/', 'http://buildbot.net/', '/community/awards', 'http://twitter.com/ThePSF', 'https://docs.python.org/3/license.html', '/psf/donations/', 'http://wiki.python.org/moin/Languages', '/dev/', '/events/python-user-group/', 'https://wiki.qt.io/PySide', '/community/sigs/', 'https://wiki.gnome.org/Projects/PyGObject', 'http://www.ansible.com', 'http://www.saltstack.com', 'http://planetpython.org/', '/events/python-events', '/about/help/', '/events/python-user-group/past/', '/about/success/', '/psf-landing/', '/about/apps', '/about/', 'http://www.wxpython.org/', '/events/python-user-group/665/', 'https://www.python.org/psf/codeofconduct/', '/dev/peps/peps.rss', '/downloads/source/', '/psf/sponsorship/sponsors/', 'http://bottlepy.org', 'http://roundup.sourceforge.net/', 'http://pandas.pydata.org/', 
'http://brochure.getpython.info/', 'https://bugs.python.org/', '/community/merchandise/', 'http://tornadoweb.org', '/events/python-user-group/650/', 'http://flask.pocoo.org/', '/downloads/release/python-364/', '/events/python-user-group/660/', '/events/python-user-group/638/', '/psf/', '/doc/', 'http://blog.python.org', '/events/python-events/604/', '/about/success/#government', 'http://python.org/dev/peps/', 'https://docs.python.org', 'http://feedproxy.google.com/~r/PythonInsider/~3/zVC80sq9s00/python-364-is-now-available.html', '/users/membership/', '/about/success/#arts', 'https://wiki.python.org/moin/Python2orPython3', '/downloads/', '/jobs/', 'http://trac.edgewall.org/', 'http://feedproxy.google.com/~r/PythonInsider/~3/wh73_1A-N7Q/python-355rc1-and-python-348rc1-are-now.html', '/privacy/', 'https://pypi.python.org/', 'http://www.riverbankcomputing.co.uk/software/pyqt/intro', 'http://www.scipy.org', '/community/forums/', '/about/success/#scientific', '/about/success/#software-development', '/shell/', '/accounts/signup/', 'http://www.facebook.com/pythonlang?fref=ts', '/community/', 'https://kivy.org/', '/about/quotes/', 'http://www.web2py.com/', '/community/logos/', '/community/diversity/', '/events/calendars/', 'https://wiki.python.org/moin/BeginnersGuide', '/success-stories/', '/doc/essays/', '/dev/core-mentorship/', 'http://ipython.org', '/events/', '//docs.python.org/3/tutorial/controlflow.html', '/about/success/#education', '/blogs/', '/community/irc/', 'http://pycon.blogspot.com/', '//jobs.python.org', 'http://www.pylonsproject.org/', 'http://www.djangoproject.com/', '/downloads/mac-osx/', '/about/success/#business', 'http://feedproxy.google.com/~r/PythonInsider/~3/x_c9D0S-4C4/python-370b1-is-now-available-for.html', 'http://wiki.python.org/moin/TkInter', 'https://docs.python.org/faq/', '//docs.python.org/3/tutorial/controlflow.html#defining-functions'} 76 | 77 | Grab a list of all links on the page, in absolute form (anchors excluded): 78 | 79 | .. 
code-block:: pycon 80 | 81 | >>> r.html.absolute_links 82 | {'https://github.com/python/pythondotorg/issues', 'https://docs.python.org/3/tutorial/', 'https://www.python.org/about/success/', 'http://feedproxy.google.com/~r/PythonInsider/~3/kihd2DW98YY/python-370a4-is-available-for-testing.html', 'https://www.python.org/dev/peps/', 'https://mail.python.org/mailman/listinfo/python-dev', 'https://www.python.org/doc/', 'https://www.python.org/', 'https://www.python.org/about/', 'https://www.python.org/events/python-events/past/', 'https://devguide.python.org/', 'https://wiki.python.org/moin/PythonEventsCalendar#Submitting_an_Event', 'https://www.openstack.org', 'http://feedproxy.google.com/~r/PythonInsider/~3/AMoBel8b8Mc/python-3.html', 'https://docs.python.org/3/tutorial/introduction.html#lists', 'http://docs.python.org/3/tutorial/introduction.html#using-python-as-a-calculator', 'http://pyfound.blogspot.com/', 'https://wiki.python.org/moin/PythonBooks', 'http://plus.google.com/+Python', 'https://wiki.python.org/moin/', 'https://www.python.org/events/python-events', 'https://status.python.org/', 'https://www.python.org/about/apps', 'https://www.python.org/downloads/release/python-2714/', 'https://www.python.org/psf/donations/', 'http://buildbot.net/', 'http://twitter.com/ThePSF', 'https://docs.python.org/3/license.html', 'http://wiki.python.org/moin/Languages', 'https://docs.python.org/faq/', 'https://jobs.python.org', 'https://www.python.org/about/success/#software-development', 'https://www.python.org/about/success/#education', 'https://www.python.org/community/logos/', 'https://www.python.org/doc/av', 'https://wiki.qt.io/PySide', 'https://www.python.org/events/python-user-group/660/', 'https://wiki.gnome.org/Projects/PyGObject', 'http://www.ansible.com', 'http://www.saltstack.com', 'https://www.python.org/dev/peps/peps.rss', 'http://planetpython.org/', 'https://www.python.org/events/python-user-group/past/', 
'https://docs.python.org/3/tutorial/controlflow.html#defining-functions', 'https://www.python.org/community/diversity/', 'https://docs.python.org/3/tutorial/controlflow.html', 'https://www.python.org/community/awards', 'https://www.python.org/events/python-user-group/638/', 'https://www.python.org/about/legal/', 'https://www.python.org/dev/', 'https://www.python.org/download/alternatives', 'https://www.python.org/downloads/', 'https://www.python.org/community/lists/', 'http://www.wxpython.org/', 'https://www.python.org/about/success/#government', 'https://www.python.org/psf/', 'https://www.python.org/psf/codeofconduct/', 'http://bottlepy.org', 'http://roundup.sourceforge.net/', 'http://pandas.pydata.org/', 'http://brochure.getpython.info/', 'https://www.python.org/downloads/source/', 'https://bugs.python.org/', 'https://www.python.org/downloads/mac-osx/', 'https://www.python.org/about/help/', 'http://tornadoweb.org', 'http://flask.pocoo.org/', 'https://www.python.org/users/membership/', 'http://blog.python.org', 'https://www.python.org/privacy/', 'https://www.python.org/about/gettingstarted/', 'http://python.org/dev/peps/', 'https://www.python.org/about/apps/', 'https://docs.python.org', 'https://www.python.org/success-stories/', 'https://www.python.org/community/forums/', 'http://feedproxy.google.com/~r/PythonInsider/~3/zVC80sq9s00/python-364-is-now-available.html', 'https://www.python.org/community/merchandise/', 'https://www.python.org/about/success/#arts', 'https://wiki.python.org/moin/Python2orPython3', 'http://trac.edgewall.org/', 'http://feedproxy.google.com/~r/PythonInsider/~3/wh73_1A-N7Q/python-355rc1-and-python-348rc1-are-now.html', 'https://pypi.python.org/', 'https://www.python.org/events/python-user-group/650/', 'http://www.riverbankcomputing.co.uk/software/pyqt/intro', 'https://www.python.org/about/quotes/', 'https://www.python.org/downloads/windows/', 'https://www.python.org/events/calendars/', 'http://www.scipy.org', 
'https://www.python.org/community/workshops/', 'https://www.python.org/blogs/', 'https://www.python.org/accounts/signup/', 'https://www.python.org/events/', 'https://kivy.org/', 'http://www.facebook.com/pythonlang?fref=ts', 'http://www.web2py.com/', 'https://www.python.org/psf/sponsorship/sponsors/', 'https://www.python.org/community/', 'https://www.python.org/download/other/', 'https://www.python.org/psf-landing/', 'https://www.python.org/events/python-user-group/665/', 'https://wiki.python.org/moin/BeginnersGuide', 'https://www.python.org/accounts/login/', 'https://www.python.org/downloads/release/python-364/', 'https://www.python.org/dev/core-mentorship/', 'https://www.python.org/about/success/#business', 'https://www.python.org/community/sigs/', 'https://www.python.org/events/python-user-group/', 'http://ipython.org', 'https://www.python.org/shell/', 'https://www.python.org/community/irc/', 'https://www.python.org/about/success/#engineering', 'http://www.pylonsproject.org/', 'http://pycon.blogspot.com/', 'https://www.python.org/about/success/#scientific', 'https://www.python.org/doc/essays/', 'http://www.djangoproject.com/', 'https://www.python.org/success-stories/industrial-light-magic-runs-python/', 'http://feedproxy.google.com/~r/PythonInsider/~3/x_c9D0S-4C4/python-370b1-is-now-available-for.html', 'http://wiki.python.org/moin/TkInter', 'https://www.python.org/jobs/', 'https://www.python.org/events/python-events/604/'} 83 | 84 | Select an element with a CSS Selector: 85 | 86 | .. code-block:: pycon 87 | 88 | >>> about = r.html.find('#about', first=True) 89 | 90 | Grab an element's text contents: 91 | 92 | .. code-block:: pycon 93 | 94 | >>> print(about.text) 95 | About 96 | Applications 97 | Quotes 98 | Getting Started 99 | Help 100 | Python Brochure 101 | 102 | Introspect an Element's attributes: 103 | 104 | .. 
code-block:: pycon 105 | 106 | >>> about.attrs 107 | {'id': 'about', 'class': ('tier-1', 'element-1'), 'aria-haspopup': 'true'} 108 | 109 | Render out an Element's HTML: 110 | 111 | .. code-block:: pycon 112 | 113 | >>> about.html 114 | '
  • \nAbout\n\n
  • ' 115 | 116 | 117 | 118 | Select Elements within Elements: 119 | 120 | .. code-block:: pycon 121 | 122 | >>> about.find('a') 123 | [, , , , , ] 124 | 125 | Search for links within an element: 126 | 127 | .. code-block:: pycon 128 | 129 | >>> about.absolute_links 130 | {'http://brochure.getpython.info/', 'https://www.python.org/about/gettingstarted/', 'https://www.python.org/about/', 'https://www.python.org/about/quotes/', 'https://www.python.org/about/help/', 'https://www.python.org/about/apps/'} 131 | 132 | 133 | Search for text on the page: 134 | 135 | .. code-block:: pycon 136 | 137 | >>> r.html.search('Python is a {} language')[0] 138 | programming 139 | 140 | More complex CSS Selector example (copied from Chrome dev tools): 141 | 142 | .. code-block:: pycon 143 | 144 | >>> r = session.get('https://github.com/') 145 | >>> sel = 'body > div.application-main > div.jumbotron.jumbotron-codelines > div > div > div.col-md-7.text-center.text-md-left > p' 146 | >>> print(r.html.find(sel, first=True).text) 147 | GitHub is a development platform inspired by the way you work. From open source to business, you can host and review code, manage projects, and build software alongside millions of other developers. 148 | 149 | XPath is also supported: 150 | 151 | .. code-block:: pycon 152 | 153 | >>> r.html.xpath('/html/body/div[1]/a') 154 | [] 155 | 156 | 157 | JavaScript Support 158 | ================== 159 | 160 | Let's grab some text that's rendered by JavaScript. Until 2020, the Python 2.7 countdown clock (https://pythonclock.org) will serve as a good test page: 161 | 162 | .. code-block:: pycon 163 | 164 | >>> r = session.get('https://pythonclock.org') 165 | 166 | Let's try and see the dynamically rendered code (The countdown clock). To do that quickly at first, we'll search between the last text we see before it ('Python 2.7 will retire in...') and the first text we see after it ('Enable Guido Mode'). 167 | 168 | .. 
code-block:: pycon 169 | 170 | >>> r.html.search('Python 2.7 will retire in...{}Enable Guido Mode')[0] 171 | '\n \n
    \n
    \n
    \n
    \n
    1Year2Months28Days16Hours52Minutes46Seconds
    \n
    \n
    \n
    \n' 189 | '
    1Year2Months28Days16Hours52Minutes46Seconds
    \n' 203 | '
    \n' 204 | '
    \n' 205 | ' 198 | 199 | 200 | 201 | 202 | 203 |
    215 | 226 |
    239 | 240 |
    241 | 242 | 449 | 450 |
    451 | 452 |
    453 | 454 | 461 | 462 | 536 |
    537 | 538 | 539 |
    540 | 541 | 542 |
    543 |

    Python is a programming language that lets you work quickly and integrate systems more effectively. Learn More

    544 |
    545 | 546 | 547 |
    548 | 549 | 550 |
    551 | 552 |
    553 | 554 |
    555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 |
    564 | 565 |
    566 |

    Get Started

    567 |

    Whether you're new to programming or an experienced developer, it's easy to learn and use Python.

    568 |

    Start with our Beginner’s Guide

    569 |
    570 | 571 |
    572 |

    Download

    573 |

    Python source code and installers are available for download for all versions! Not sure which version to use? Check here.

    574 |

    Latest: Python 3.6.4 - Python 2.7.14

    575 |
    576 | 577 |
    578 |

    Docs

    579 |

    Documentation for Python's standard library, along with tutorials and guides, are available online.

    580 |

    docs.python.org

    581 |
    582 | 583 |
    584 |

    Jobs

    585 |

    Looking for work or have a Python related position that you're trying to hire for? Our relaunched community-run job board is the place to go.

    586 |

    jobs.python.org

    587 |
    588 | 589 |
    590 | 591 |
    592 | 593 |
    594 | 595 |
    596 | 597 |

    Latest News

    598 |

    More

    599 | 600 | 624 |
    625 | 626 |
    627 | 628 |
    629 | 630 |
    631 | 632 |

    Upcoming Events

    633 |

    More

    634 | 635 | 669 |
    670 | 671 |
    672 | 673 |
    674 | 675 |
    676 | 677 | 709 | 710 |
    711 |
    712 |

    Use Python for…

    713 |

    More

    714 | 715 | 728 | 729 |
    730 |
    731 | 732 |
    733 | 734 | 735 |
    736 | 737 |

    738 | >>> Python Enhancement Proposals (PEPs): The future of Python is discussed here. 739 | 740 |

    741 | 742 | 743 | 744 | 745 |
    746 | 747 |
    748 | 749 | 750 | 751 |

    752 | >>> Python Software Foundation 753 |

    754 |

    The mission of the Python Software Foundation is to promote, protect, and advance the Python programming language, and to support and facilitate the growth of a diverse and international community of Python programmers. Learn more

    755 |

    756 | Become a Member 757 | Donate to the PSF 758 |

    759 |
    760 | 761 | 762 | 763 | 764 |
    765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 |
    774 |
    775 | 776 | 777 | 1023 | 1024 |
    1025 | 1026 | 1027 | 1028 | 1029 | 1030 | 1031 | 1032 | 1033 | 1034 | 1035 | 1040 | 1041 | 1046 | 1047 | 1048 | 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | -------------------------------------------------------------------------------- /tests/test_internet.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from requests_html import HTMLSession, AsyncHTMLSession, HTMLResponse 3 | 4 | 5 | urls = [ 6 | 'https://xkcd.com/1957/', 7 | 'https://www.reddit.com/', 8 | 'https://github.com/psf/requests-html/issues', 9 | 'https://discord.com/category/engineering', 10 | 'https://stackoverflow.com/', 11 | 'https://www.frontiersin.org/', 12 | 'https://azure.microsoft.com/en-us' 13 | ] 14 | 15 | 16 | @pytest.mark.parametrize('url', urls) 17 | @pytest.mark.internet 18 | def test_pagination(url: str): 19 | session = HTMLSession() 20 | r = session.get(url) 21 | assert next(r.html) 22 | 23 | 24 | @pytest.mark.parametrize('url', urls) 25 | @pytest.mark.internet 26 | @pytest.mark.asyncio 27 | async def test_async_pagination(event_loop, url): 28 | asession = AsyncHTMLSession() 29 | 30 | r = await asession.get(url) 31 | assert await r.html.__anext__() 32 | 33 | 34 | @pytest.mark.internet 35 | def test_async_run(): 36 | asession = AsyncHTMLSession() 37 | 38 | async_list = [] 39 | for url in urls: 40 | async def _test(): 41 | return await asession.get(url) 42 | async_list.append(_test) 43 | 44 | r = asession.run(*async_list) 45 | 46 | assert len(r) == len(urls) 47 | assert isinstance(r[0], HTMLResponse) 48 | -------------------------------------------------------------------------------- /tests/test_requests_html.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import partial 3 | 4 | import pytest 5 | from pyppeteer.browser import Browser 6 | from pyppeteer.page import Page 7 | from requests_html import HTMLSession, AsyncHTMLSession, HTML 8 | from requests_file 
import FileAdapter 9 | 10 | session = HTMLSession() 11 | session.mount('file://', FileAdapter()) 12 | 13 | 14 | def get(): 15 | path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html')) 16 | url = f'file://{path}' 17 | 18 | return session.get(url) 19 | 20 | 21 | @pytest.fixture 22 | def async_get(event_loop): 23 | """AsyncSession cannot be created global since it will create 24 | a different loop from pytest-asyncio. """ 25 | async_session = AsyncHTMLSession() 26 | async_session.mount('file://', FileAdapter()) 27 | path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html')) 28 | url = 'file://{}'.format(path) 29 | 30 | return partial(async_session.get, url) 31 | 32 | 33 | def test_file_get(): 34 | r = get() 35 | assert r.status_code == 200 36 | 37 | 38 | @pytest.mark.asyncio 39 | async def test_async_file_get(async_get): 40 | r = await async_get() 41 | assert r.status_code == 200 42 | 43 | 44 | def test_class_seperation(): 45 | r = get() 46 | 47 | about = r.html.find('#about', first=True) 48 | assert len(about.attrs['class']) == 2 49 | 50 | 51 | def test_css_selector(): 52 | r = get() 53 | 54 | about = r.html.find('#about', first=True) 55 | 56 | for menu_item in ( 57 | 'About', 'Applications', 'Quotes', 'Getting Started', 'Help', 58 | 'Python Brochure' 59 | ): 60 | assert menu_item in about.text.split('\n') 61 | assert menu_item in about.full_text.split('\n') 62 | 63 | 64 | def test_containing(): 65 | r = get() 66 | 67 | python = r.html.find(containing='python') 68 | assert len(python) == 192 69 | 70 | for e in python: 71 | assert 'python' in e.full_text.lower() 72 | 73 | 74 | def test_attrs(): 75 | r = get() 76 | about = r.html.find('#about', first=True) 77 | 78 | assert 'aria-haspopup' in about.attrs 79 | assert len(about.attrs['class']) == 2 80 | 81 | 82 | def test_links(): 83 | r = get() 84 | about = r.html.find('#about', first=True) 85 | 86 | assert len(about.links) == 6 87 | assert len(about.absolute_links) 
== 6 88 | 89 | 90 | @pytest.mark.asyncio 91 | async def test_async_links(async_get): 92 | r = await async_get() 93 | about = r.html.find('#about', first=True) 94 | 95 | assert len(about.links) == 6 96 | assert len(about.absolute_links) == 6 97 | 98 | 99 | def test_search(): 100 | r = get() 101 | style = r.html.search('Python is a {} language')[0] 102 | assert style == 'programming' 103 | 104 | 105 | def test_xpath(): 106 | r = get() 107 | html = r.html.xpath('/html', first=True) 108 | assert 'no-js' in html.attrs['class'] 109 | 110 | a_hrefs = r.html.xpath('//a/@href') 111 | assert '#site-map' in a_hrefs 112 | 113 | 114 | def test_html_loading(): 115 | doc = """""" 116 | html = HTML(html=doc) 117 | 118 | assert 'https://httpbin.org' in html.links 119 | assert isinstance(html.raw_html, bytes) 120 | assert isinstance(html.html, str) 121 | 122 | 123 | def test_anchor_links(): 124 | r = get() 125 | r.html.skip_anchors = False 126 | 127 | assert '#site-map' in r.html.links 128 | 129 | 130 | @pytest.mark.parametrize('url,link,expected', [ 131 | ('http://example.com/', 'test.html', 'http://example.com/test.html'), 132 | ('http://example.com', 'test.html', 'http://example.com/test.html'), 133 | ('http://example.com/foo/', 'test.html', 'http://example.com/foo/test.html'), 134 | ('http://example.com/foo/bar', 'test.html', 'http://example.com/foo/test.html'), 135 | ('http://example.com/foo/', '/test.html', 'http://example.com/test.html'), 136 | ('http://example.com/', 'http://xkcd.com/about/', 'http://xkcd.com/about/'), 137 | ('http://example.com/', '//xkcd.com/about/', 'http://xkcd.com/about/'), 138 | ]) 139 | def test_absolute_links(url, link, expected): 140 | head_template = """""" 141 | body_template = """Next""" 142 | 143 | # Test without `` tag (url is base) 144 | html = HTML(html=body_template.format(link), url=url) 145 | assert html.absolute_links.pop() == expected 146 | 147 | # Test with `` tag (url is other) 148 | html = HTML( 149 | html=head_template.format(url) + 
body_template.format(link), 150 | url='http://example.com/foobar/') 151 | assert html.absolute_links.pop() == expected 152 | 153 | 154 | def test_parser(): 155 | doc = """httpbin.org\n""" 156 | html = HTML(html=doc) 157 | 158 | assert html.find('html') 159 | assert html.element('a').text().strip() == 'httpbin.org' 160 | 161 | 162 | @pytest.mark.render 163 | def test_render(): 164 | r = get() 165 | script = """ 166 | () => { 167 | return { 168 | width: document.documentElement.clientWidth, 169 | height: document.documentElement.clientHeight, 170 | deviceScaleFactor: window.devicePixelRatio, 171 | } 172 | } 173 | """ 174 | val = r.html.render(script=script) 175 | for value in ('width', 'height', 'deviceScaleFactor'): 176 | assert value in val 177 | 178 | about = r.html.find('#about', first=True) 179 | assert len(about.links) == 6 180 | 181 | 182 | @pytest.mark.render 183 | @pytest.mark.asyncio 184 | async def test_async_render(async_get): 185 | r = await async_get() 186 | script = """ 187 | () => { 188 | return { 189 | width: document.documentElement.clientWidth, 190 | height: document.documentElement.clientHeight, 191 | deviceScaleFactor: window.devicePixelRatio, 192 | } 193 | } 194 | """ 195 | val = await r.html.arender(script=script) 196 | for value in ('width', 'height', 'deviceScaleFactor'): 197 | assert value in val 198 | 199 | about = r.html.find('#about', first=True) 200 | assert len(about.links) == 6 201 | await r.html.browser.close() 202 | 203 | 204 | @pytest.mark.render 205 | def test_bare_render(): 206 | doc = """""" 207 | html = HTML(html=doc) 208 | script = """ 209 | () => { 210 | return { 211 | width: document.documentElement.clientWidth, 212 | height: document.documentElement.clientHeight, 213 | deviceScaleFactor: window.devicePixelRatio, 214 | } 215 | } 216 | """ 217 | val = html.render(script=script, reload=False) 218 | for value in ('width', 'height', 'deviceScaleFactor'): 219 | assert value in val 220 | 221 | assert html.find('html') 222 | assert 
'https://httpbin.org' in html.links 223 | 224 | 225 | @pytest.mark.render 226 | @pytest.mark.asyncio 227 | async def test_bare_arender(): 228 | doc = """""" 229 | html = HTML(html=doc, async_=True) 230 | script = """ 231 | () => { 232 | return { 233 | width: document.documentElement.clientWidth, 234 | height: document.documentElement.clientHeight, 235 | deviceScaleFactor: window.devicePixelRatio, 236 | } 237 | } 238 | """ 239 | val = await html.arender(script=script, reload=False) 240 | for value in ('width', 'height', 'deviceScaleFactor'): 241 | assert value in val 242 | 243 | assert html.find('html') 244 | assert 'https://httpbin.org' in html.links 245 | await html.browser.close() 246 | 247 | 248 | @pytest.mark.render 249 | def test_bare_js_eval(): 250 | doc = """ 251 | 252 | 253 | 254 |
    This gets replaced
    255 | 256 | 259 | 260 | 261 | """ 262 | 263 | html = HTML(html=doc) 264 | html.render() 265 | 266 | assert html.find('#replace', first=True).text == 'yolo' 267 | 268 | 269 | @pytest.mark.render 270 | @pytest.mark.asyncio 271 | async def test_bare_js_async_eval(): 272 | doc = """ 273 | 274 | 275 | 276 |
    This gets replaced
    277 | 278 | 281 | 282 | 283 | """ 284 | 285 | html = HTML(html=doc, async_=True) 286 | await html.arender() 287 | 288 | assert html.find('#replace', first=True).text == 'yolo' 289 | await html.browser.close() 290 | 291 | 292 | def test_browser_session(): 293 | """ Test that a browser instance is created and properly closed when the session is closed. 294 | Note: the session.close method needs to be tested together with browser creation, 295 | since not doing so will leave the browser running. """ 296 | session = HTMLSession() 297 | assert isinstance(session.browser, Browser) 298 | assert hasattr(session, "loop") 299 | session.close() 300 | # assert count_chromium_process() == 0 301 | 302 | 303 | def test_browser_process(): 304 | for _ in range(3): 305 | r = get() 306 | r.html.render() 307 | 308 | assert r.html.page is None 309 | 310 | 311 | @pytest.mark.asyncio 312 | async def test_browser_session_fail(): 313 | """ HTMLSession.browser should not be called within an existing event loop. """ 314 | session = HTMLSession() 315 | with pytest.raises(RuntimeError): 316 | session.browser 317 | 318 | 319 | @pytest.mark.asyncio 320 | async def test_async_browser_session(): 321 | session = AsyncHTMLSession() 322 | browser = await session.browser 323 | assert isinstance(browser, Browser) 324 | await session.close() 325 | --------------------------------------------------------------------------------