├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_special_cases.py ├── test_url.py ├── test_urllib.py ├── test_web_platform_tests.py └── testdata.json ├── tox.ini └── whatwg_url.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 
| # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | 4 | branches: 5 | only: 6 | - master 7 | 8 | cache: 9 | - pip 10 | 11 | install: 12 | - "python -m pip install -U pip" 13 | - "python -m pip install -U setuptools" 14 | - "python -m pip install -U tox" 15 | 16 | script: 17 | - "tox" 18 | 19 | after_success: 20 | - "python -m pip install -U codecov" 21 | - "codecov" 22 | 23 | matrix: 24 | include: 25 | - env: TOXENV=py27 26 | python: 2.7 27 | - env: TOXENV=py34 28 | python: 3.4 29 | - env: TOXENV=py35 30 | python: 3.5 31 | - env: TOXENV=py35 32 | python: 3.5-dev 33 | - env: TOXENV=py36 34 | python: 3.6 35 | - env: TOXENV=py36 36 | python: 3.6-dev 37 | - env: TOXENV=py37 38 | python: 3.7-dev 39 | dist: xenial 40 | 41 | - env: TOXENV=lint 42 | python: 3.6 43 | 44 | - env: TOXENV=dist 45 | python: 3.6 46 | 47 | deploy: 48 | - provider: "pypi" 49 | user: "SethMichaelLarson" 50 | password: 51 | secure: "ZW+MxzTZ1K/CK5MyDR2CHS8EhS3/XGagI5rK+AVUx87U+0dcv5iTeLpU/L5Iu6kOajYuGeOFcyHuwFa8ILbGqDeMcR0bp3i6Z4AicGQVK1xcTC8C3/nkxGU62Er4iWYpl3oiPN75xhNvfamU4ZYro672Mx0ebAuFJXxmPzfj1YggKmTB7kNv1AUV94S85bwsb4Vmgbd0H5ie74uzaSR3GrLUunT0mqtiJzOAwyGBXAFRjvAX8cBY4bQ6axNnFTyj+84m5S2T/4BnMTg/Y/VwNBkP2rJ3FlhqgxVWoQwvpOMrkqXPiRzojJF6frthBJXC4/L4/luaeDOIk6gaWBlzwpAnQnatVZBQqIJHzVjSnefMk8v5Sh5D3QWlacJK/eU7gXqxJbGj+uV6MY+5dDYG4dJ7G1y9A/vbmVoHBv/0IkMo+yh0KrSHIO4/DTmp0irSqeQr71iCECNxmdlxv6o6JmVgcjIqj56nsvHaQEFB/ylZdhWI2PYA+cIE6sqeVaBE2e0mq7WuYLb4tNYTK1hcRp5dqKoKu7f4W9N6/yWhqxm9iN86LQG0Uq4wwscCdkohMZgkUNvuUQH6r9tSbHIdUpQUD46OkkM/aIzfw2dMVdbC2s3e8XyRaIiKctRrtcktb1YMZgfanv0BbeLH6vBcqfxguML84SCfgGgGzTi4KC8=" 52 | on: 53 | branch: "master" 54 | tags: true 55 | distributions: "sdist" 56 | 
-------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 2018.8.26 4 | 5 | ### Added 6 | 7 | - Added `UrlParser` and `Url` 8 | - Added `UrlParser.parse_host()` 9 | - Added `UrlParser.parse_ipv4_host()` 10 | - Added `Url.origin` 11 | - Added `Url.authority` 12 | - Added `urlparse` and `urljoin` to be compatible with 13 | [`urllib.parse.urlparse`](https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlparse) 14 | and [`urllib.parse.urljoin`](https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin) 15 | - Added support for Python 2.7, 3.4, and 3.5 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.md CHANGELOG.md tox.ini setup.cfg 2 | recursive-include tests *.json 3 | recursive-include tests *.py 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # whatwg-url 2 | 3 | [![No Maintenance Intended](http://unmaintained.tech/badge.svg)](http://unmaintained.tech/) 4 | 5 | Python implementation of the [WHATWG URL Living Standard](https://url.spec.whatwg.org/). 6 | 7 | The latest revision of the standard that this package implements is from August 7th, 2018 ([`commit 49060c7`](https://github.com/whatwg/url/commit/49060c74d3047602a572f9e88a6a1101f4fd32f3)). 8 | 9 | ## Getting Started 10 | 11 | Install the `whatwg-url` package using `pip`. 
12 | 13 | `python -m pip install whatwg-url` 14 | 15 | And use the module like so: 16 | 17 | ```python 18 | import whatwg_url 19 | 20 | url = whatwg_url.parse_url("https://www.google.com") 21 | print(url) 22 | # Url(scheme='https', hostname='www.google.com', port=None, path='', query='', fragment='') 23 | ``` 24 | 25 | ## Features 26 | 27 | ### Compatibility with `urllib.parse.urlparse()` 28 | 29 | ```python 30 | import whatwg_url 31 | 32 | parseresult = whatwg_url.urlparse("https://seth:larson@www.google.com:1234/maps?query=string#fragment") 33 | 34 | print(parseresult.scheme) # 'https' 35 | print(parseresult.netloc) # 'www.google.com:1234' 36 | print(parseresult.userinfo) # 'seth:larson' 37 | print(parseresult.path) # '/maps' 38 | print(parseresult.params) # '' 39 | print(parseresult.query) # 'query=string' 40 | print(parseresult.fragment) # 'fragment' 41 | print(parseresult.username) # 'seth' 42 | print(parseresult.password) # 'larson' 43 | print(parseresult.hostname) # 'www.google.com' 44 | print(parseresult.port) # 1234 45 | print(parseresult.geturl()) # 'https://seth:larson@www.google.com:1234/maps?query=string#fragment' 46 | ``` 47 | 48 | ### URL Normalization 49 | 50 | The WHATWG URL specification describes methods of normalizing URL inputs to usable URLs. 51 | It handles percent-encodings, default ports, paths, IPv4 and IPv6 addresses, IDNA (2008 and 2003), multiple slashes after scheme, etc. 
52 | 53 | ```python 54 | import whatwg_url 55 | 56 | print(whatwg_url.normalize_url("https://////www.google.com")) # https://www.google.com 57 | print(whatwg_url.normalize_url("https://www.google.com/dir1/../dir2")) # https://www.google.com/dir2 58 | print(whatwg_url.normalize_url("https://你好你好")) # https://xn--6qqa088eba/ 59 | print(whatwg_url.normalize_url("https://0Xc0.0250.01")) # https://192.168.0.1/ 60 | ``` 61 | 62 | ### URL Validation 63 | 64 | ```python 65 | print(whatwg_url.is_valid_url("https://www.google.com")) # True 66 | print(whatwg_url.is_valid_url("https://www .google.com")) # False 67 | ``` 68 | 69 | ### Relative URLs 70 | 71 | HTTP redirects often contain relative URLs (via the `Location` header) that need to be applied to the current URL location. 72 | Specifying the `base` parameter allows relative URLs to be given as input; the relative URL is resolved against the base and returned as a new `URL` object. 73 | 74 | ```python 75 | import whatwg_url 76 | 77 | url = whatwg_url.parse_url("../dev?a=1#f", base="https://www.google.com/maps") 78 | print(url.href) # https://www.google.com/dev?a=1#f 79 | ``` 80 | 81 | ### URL Property Mutators 82 | 83 | Modifying properties on a `URL` object uses the parser and "state overrides" to properly mutate the `URL` object. 84 | 85 | ```python 86 | url = whatwg_url.parse_url("http://www.google.com:443") 87 | 88 | print(url.scheme) # 'http' 89 | print(url.port) # 443 90 | 91 | url.scheme = 'https' 92 | 93 | print(url.scheme) # 'https' 94 | print(url.port) # None 95 | ``` 96 | 97 | ### "Splatable" 98 | 99 | The module is a single file which allows for easy vendoring into projects. 
100 | 101 | ## License 102 | 103 | [Apache-2.0](https://github.com/SethMichaelLarson/whatwg-url/blob/master/LICENSE) 104 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503 3 | max-line-length = 80 4 | max-complexity = 18 5 | select = B,C,E,F,W,T4,B9 6 | 7 | [check-manifest] 8 | ignore = 9 | .travis.yml 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import re 4 | from setuptools import setup 5 | 6 | base_dir = os.path.dirname(os.path.abspath(__file__)) 7 | 8 | version = None 9 | 10 | with io.open(os.path.join(base_dir, "whatwg_url.py"), encoding="utf-8") as f: 11 | for line in f: 12 | match = re.search(r"^__version__\s+=\s+\"([^\"]+)\"$", line) 13 | if match: 14 | version = match.group(1) 15 | break 16 | else: 17 | raise ValueError("Could not find __version__ in whatwg_url.py") 18 | 19 | 20 | def get_long_description(): 21 | with io.open(os.path.join(base_dir, "README.md"), encoding="utf-8") as f: 22 | data = f.read() 23 | data += "\n\n" 24 | with io.open(os.path.join(base_dir, "CHANGELOG.md"), encoding="utf-8") as f: 25 | data += f.read() 26 | return data 27 | 28 | 29 | setup( 30 | name="whatwg-url", 31 | version=version, 32 | description="Python implementation of the WHATWG URL Living Standard", 33 | long_description=get_long_description(), 34 | long_description_content_type="text/markdown", 35 | author="Seth Michael Larson", 36 | author_email="sethmichaellarson@gmail.com", 37 | url="https://github.com/SethMichaelLarson/whatwg-url", 38 | license="Apache-2.0", 39 | py_modules=["whatwg_url"], 40 | install_requires=["idna", "six", "ipaddress"], 41 | classifiers=[ 42 | "Development Status :: 4 - Beta", 43 | "Intended Audience :: 
Developers", 44 | "Natural Language :: English", 45 | "License :: OSI Approved :: Apache Software License", 46 | "Programming Language :: Python :: 2", 47 | "Programming Language :: Python :: 2.7", 48 | "Programming Language :: Python :: 3", 49 | "Programming Language :: Python :: 3.4", 50 | "Programming Language :: Python :: 3.5", 51 | "Programming Language :: Python :: 3.6", 52 | "Programming Language :: Python :: 3.7", 53 | "Topic :: Internet", 54 | ], 55 | ) 56 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sethmlarson/whatwg-url/84be3cb71c327944a57422746d862270a09327e7/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_special_cases.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Source for these URLs: 3 | # https://www.blackhat.com/docs/us-17/thursday/ 4 | # us-17-Tsai-A-New-Era-Of-SSRF-Exploiting-URL- 5 | # Parser-In-Trending-Programming-Languages.pdf 6 | 7 | import pytest 8 | import whatwg_url 9 | 10 | 11 | def test_spaces_with_multiple_ipv4_addresses(): 12 | url = whatwg_url.parse_url("http://1.1.1.1 &@2.2.2.2# @3.3.3.3") 13 | 14 | assert url.username == "1.1.1.1%20&" 15 | assert url.password is None 16 | assert url.hostname == "2.2.2.2" 17 | assert url.fragment == "%20@3.3.3.3" 18 | 19 | 20 | def test_fragment_with_hostname(): 21 | url = whatwg_url.parse_url("http://google.com#@evil.com/") 22 | 23 | assert url.hostname == "google.com" 24 | assert url.fragment == "@evil.com/" 25 | 26 | 27 | def test_multiple_ats_within_authority(): 28 | url = whatwg_url.parse_url("http://foo@evil.com:80@google.com/") 29 | 30 | assert url.hostname == "google.com" 31 | assert url.username == "foo%40evil.com" 32 | assert url.password == "80" 33 | 34 | 35 | def 
test_multiple_ats_and_space_within_authority(): 36 | url = whatwg_url.parse_url("http://foo@evil.com:80 @google.com/") 37 | 38 | assert url.hostname == "google.com" 39 | assert url.username == "foo%40evil.com" 40 | assert url.password == "80%20" 41 | 42 | 43 | def test_unicode_double_dot_if_stripped_bom(): 44 | url = whatwg_url.parse_url("http://orange.tw/sandbox/NN/passwd") 45 | 46 | assert url.hostname == "orange.tw" 47 | assert url.path == "/sandbox/%EF%BC%AE%EF%BC%AE/passwd" 48 | 49 | 50 | def test_host_contains_tab_in_authority(): 51 | url = whatwg_url.parse_url("http://127.0.0.1\tfoo.google.com") 52 | 53 | assert url.host == "127.0.0.1foo.google.com" 54 | 55 | 56 | def test_host_contains_tab_in_authority_single_or_double_encoded(): 57 | with pytest.raises(whatwg_url.UrlParserError): 58 | whatwg_url.parse_url("http://127.0.0.1%09foo.google.com") 59 | 60 | with pytest.raises(whatwg_url.UrlParserError): 61 | whatwg_url.parse_url("http://127.0.0.1%2509foo.google.com") 62 | 63 | 64 | def test_injection_within_authority(): 65 | with pytest.raises(whatwg_url.UrlParserError): 66 | whatwg_url.parse_url("https://127.0.0.1\r\nSET foo 0 60 5\r\n:443/") 67 | 68 | 69 | def test_backslash_within_authority(): 70 | url = whatwg_url.parse_url("http://localhost\\@google.com:12345") 71 | 72 | assert url.hostname == "localhost" 73 | assert url.port is None 74 | assert url.path == "/@google.com:12345" 75 | 76 | 77 | def test_relative_url_with_url_contained(): 78 | url = whatwg_url.parse_url( 79 | url="/redirect?target=http://localhost:61020/", base="https://www.google.com" 80 | ) 81 | 82 | assert url.scheme == "https" 83 | assert url.hostname == "www.google.com" 84 | assert url.path == "/redirect" 85 | assert url.query == "target=http://localhost:61020/" 86 | -------------------------------------------------------------------------------- /tests/test_url.py: -------------------------------------------------------------------------------- 1 | import whatwg_url 2 | 3 | 4 | def 
test_url_scheme(): 5 | url = whatwg_url.parse_url("http://www.google.com:443") 6 | url.scheme = "https" 7 | 8 | assert url.scheme == "https" 9 | assert url.port is None 10 | assert url.href == "https://www.google.com/" 11 | 12 | url.scheme = "http" 13 | 14 | assert url.scheme == "http" 15 | assert url.port is None 16 | assert url.href == "http://www.google.com/" 17 | 18 | 19 | def test_url_host(): 20 | url = whatwg_url.parse_url("https://www.google.com") 21 | url.hostname = "example.com" 22 | 23 | assert url.hostname == "example.com" 24 | assert url.href == "https://example.com/" 25 | 26 | 27 | def test_url_port(): 28 | url = whatwg_url.parse_url("https://www.example.com") 29 | url.port = 123 30 | 31 | assert url.port == 123 32 | assert url.host == "www.example.com:123" 33 | assert url.href == "https://www.example.com:123/" 34 | 35 | url.port = 443 36 | 37 | assert url.port is None 38 | assert url.host == "www.example.com" 39 | assert url.href == "https://www.example.com/" 40 | 41 | 42 | def test_url_user_info(): 43 | url = whatwg_url.parse_url("https://github.com") 44 | 45 | url.username = "username" 46 | 47 | assert url.username == "username" 48 | assert url.password is None 49 | assert url.href == "https://username@github.com/" 50 | 51 | url.password = "password" 52 | 53 | assert url.username == "username" 54 | assert url.password == "password" 55 | assert url.href == "https://username:password@github.com/" 56 | 57 | url.username = None 58 | 59 | assert url.username is None 60 | assert url.password == "password" 61 | assert url.href == "https://:password@github.com/" 62 | 63 | url.password = None 64 | 65 | assert url.username is None 66 | assert url.password is None 67 | assert url.href == "https://github.com/" 68 | 69 | 70 | def test_url_query(): 71 | url = whatwg_url.parse_url("https://www.google.com") 72 | url.query = "?a=1" 73 | 74 | assert url.query == "a=1" 75 | assert url.href == "https://www.google.com/?a=1" 76 | 77 | url.query = "" 78 | 79 | assert 
url.query == "" 80 | assert url.href == "https://www.google.com/?" 81 | 82 | url.query = None 83 | 84 | assert url.query is None 85 | assert url.href == "https://www.google.com/" 86 | 87 | 88 | def test_url_fragment(): 89 | url = whatwg_url.parse_url("https://www.google.com") 90 | url.fragment = "abc" 91 | 92 | assert url.fragment == "abc" 93 | assert url.href == "https://www.google.com/#abc" 94 | 95 | url.fragment = "" 96 | 97 | assert url.fragment == "" 98 | assert url.href == "https://www.google.com/#" 99 | 100 | url.fragment = None 101 | 102 | assert url.fragment is None 103 | assert url.href == "https://www.google.com/" 104 | 105 | 106 | def test_url_origin(): 107 | url = whatwg_url.parse_url("https://www.google.com") 108 | 109 | assert url.origin == ("https", "www.google.com", None, None) 110 | 111 | 112 | def test_url_opaque_origin(): 113 | url = whatwg_url.parse_url("file:///var/tmp/file") 114 | 115 | assert tuple(url.origin) == (None, None, None, None) 116 | assert not url.origin == url.origin 117 | assert url.origin != url.origin 118 | 119 | 120 | def test_url_blob_origin(): 121 | url = whatwg_url.parse_url("blob:https://www.google.com") 122 | 123 | assert url.origin == whatwg_url.parse_url("https://www.google.com").origin 124 | -------------------------------------------------------------------------------- /tests/test_urllib.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from whatwg_url import urlparse as whatwg_urlparse, urljoin as whatwg_urljoin 3 | 4 | try: 5 | from urllib.parse import urlparse as urllib_urlparse, urljoin as urllib_urljoin 6 | except ImportError: 7 | from urlparse import urlparse as urllib_urlparse, urljoin as urllib_urljoin 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "url", 12 | [ 13 | "https://www.google.com/", 14 | "http://user:pass@www.example.com/", 15 | "http://:pass@www.example.com/", 16 | "http://user@www.example.com/", 17 | "http://www.example.com:432/", 18 | 
"http://www.example.com/?a=1;B=c", 19 | "http://www.example.com/#Fragment", 20 | "http://username:password@www.example.com:1234/?query=string#fragment", 21 | ], 22 | ) 23 | def test_assert_same_urlparse_result(url): 24 | urllib_result = urllib_urlparse(url) 25 | whatwg_result = whatwg_urlparse(url) 26 | 27 | assert urllib_result.netloc == whatwg_result.netloc 28 | assert urllib_result.hostname == whatwg_result.hostname 29 | assert urllib_result.port == whatwg_result.port 30 | assert urllib_result.path == whatwg_result.path 31 | assert urllib_result.query == whatwg_result.query 32 | assert urllib_result.fragment == whatwg_result.fragment 33 | assert urllib_result.username == whatwg_result.username 34 | assert urllib_result.password == whatwg_result.password 35 | assert tuple(urllib_result) == tuple(whatwg_result) 36 | 37 | 38 | @pytest.mark.parametrize( 39 | ["base", "url", "expected"], 40 | [ 41 | ("http://www.google.com/", "", "http://www.google.com/"), 42 | ("http://www.google.com/", "/", "http://www.google.com/"), 43 | ("http://www.google.com/", "maps/", "http://www.google.com/maps/"), 44 | ("http://www.google.com/", "one/two/", "http://www.google.com/one/two/"), 45 | ("http://www.google.com/mail", "/maps/", "http://www.google.com/maps/"), 46 | ("http://www.google.com/", "./", "http://www.google.com/"), 47 | ("http://www.google.com/maps", "..", "http://www.google.com/"), 48 | ( 49 | "http://www.google.com/", 50 | "https://www.google.com/", 51 | "https://www.google.com/", 52 | ), 53 | ( 54 | "http://www.google.com/", 55 | "https://maps.google.com/", 56 | "https://maps.google.com/", 57 | ), 58 | ( 59 | "https://www.google.com/", 60 | "https://www.google.com:1234/", 61 | "https://www.google.com:1234/", 62 | ), 63 | ( 64 | "https://www.google.com/", 65 | "?query=string", 66 | "https://www.google.com/?query=string", 67 | ), 68 | ("https://www.google.com/", "#fragment", "https://www.google.com/#fragment"), 69 | ( 70 | "http://www.google.com/", 71 | 
"http://user:pass@www.google.com/", 72 | "http://user:pass@www.google.com/", 73 | ), 74 | ( 75 | "http://www.google.com/", 76 | "http://user@www.google.com/", 77 | "http://user@www.google.com/", 78 | ), 79 | ( 80 | "http://www.google.com/", 81 | "http://:pass@www.google.com/", 82 | "http://:pass@www.google.com/", 83 | ), 84 | ], 85 | ) 86 | def test_assert_same_urljoin_result(base, url, expected): 87 | urllib_result = urllib_urljoin(base, url) 88 | whatwg_result = whatwg_urljoin(base, url) 89 | 90 | assert urllib_result == expected 91 | assert whatwg_result == expected 92 | -------------------------------------------------------------------------------- /tests/test_web_platform_tests.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import pytest 4 | import os 5 | import whatwg_url 6 | 7 | 8 | with io.open( 9 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata.json"), "rb" 10 | ) as f: 11 | testdata = f.read() 12 | if not isinstance(testdata, str): 13 | testdata = testdata.decode("utf-8") 14 | testdata = json.loads(testdata, encoding="utf-8") 15 | testdata = [x for x in testdata if isinstance(x, dict)] 16 | 17 | 18 | def assert_with_empty(a, b): 19 | def f(x): 20 | return "" if x is None else x 21 | 22 | assert f(a) == f(b) 23 | 24 | 25 | @pytest.mark.parametrize("testdata", testdata) 26 | def test_web_platform_tests(testdata): 27 | if testdata["input"].startswith("blob:"): 28 | pytest.skip("blob") 29 | 30 | if "href" in testdata and "about:blank" in testdata["href"]: 31 | pytest.skip("about:blank") 32 | 33 | base = testdata.get("base", None) 34 | if base == "about:blank": 35 | base = None 36 | else: 37 | base = whatwg_url.parse_url(base) 38 | 39 | if testdata.get("failure", False): 40 | with pytest.raises(whatwg_url.UrlParserError): 41 | whatwg_url.parse_url(testdata["input"], base=base) 42 | 43 | else: 44 | url = whatwg_url.parse_url(testdata["input"], base=base) 45 | 46 | 
assert_with_empty(url._username, testdata.get("username", None)) 47 | assert_with_empty(url._password, testdata.get("password", None)) 48 | assert_with_empty(url.path, testdata.get("pathname", None)) 49 | 50 | port = testdata.get("port", None) 51 | if port is not None and port != "": 52 | port = int(port) 53 | if port == "": 54 | port = None 55 | 56 | assert_with_empty(url._port, port) 57 | assert url.href == testdata.get("href", None) 58 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = dist,lint,py27,py34,py35,py36,py37 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-cov 8 | commands = 9 | python --version 10 | python -m pip --version 11 | pytest -v --cov whatwg_url tests/ 12 | coverage report 13 | 14 | [testenv:lint] 15 | basepython = python3.6 16 | deps = 17 | black 18 | flake8 19 | flake8-bugbear 20 | commands = 21 | python -m black --check setup.py whatwg_url.py tests/ 22 | python -m flake8 setup.py whatwg_url.py tests/ 23 | 24 | [testenv:dist] 25 | basepython = python3.6 26 | deps = 27 | docutils 28 | check-manifest 29 | readme 30 | usedevelop = true 31 | commands = 32 | python setup.py check --strict --metadata 33 | check-manifest {toxinidir} 34 | -------------------------------------------------------------------------------- /whatwg_url.py: -------------------------------------------------------------------------------- 1 | """Python implementation of the WHATWG URL Living Standard""" 2 | 3 | import string 4 | import re 5 | import ipaddress 6 | import collections 7 | import encodings.idna as idna2003 8 | import idna 9 | import six 10 | 11 | 12 | __all__ = [ 13 | "parse_url", 14 | "normalize_url", 15 | "is_valid_url", 16 | "UrlParser", 17 | "Url", 18 | "UrlParserError", 19 | "urlparse", 20 | "urljoin", 21 | "ParseResult", 22 | ] 23 | __version__ = "2018.8.26" 24 | __license__ = "Apache-2.0" 25 | 26 | 27 | 
def parse_url(url, base=None, encoding="utf-8"):
    """Parse ``url`` and return the resulting :class:`Url`.

    A relative ``url`` is resolved against ``base`` when one is supplied.

    :param str url: URL input string.
    :param str base: Optional base URL to parse relative to.
    :param str encoding: Character encoding handed to the parser.
        Defaults to UTF-8.
    :rtype: Url
    :raises: UrlParserError
    :return: The parsed URL.
    """
    return UrlParser().parse(url, base=base, encoding=encoding)


def normalize_url(url, base=None, encoding="utf-8"):
    """Normalize a URL input with an optional base URL.

    :param str url: URL input to normalize.
    :param str base: Optional base URL to parse relative to.
    :param str encoding: Character encoding to parse with. Defaults to UTF-8.
    :rtype: str
    :raises: UrlParserError
    :return: The ``href`` of the parsed URL.
    """
    parsed = parse_url(url, base=base, encoding=encoding)
    return parsed.href


def is_valid_url(url, base=None, encoding="utf-8"):
    """Report whether ``url`` parses without a :class:`UrlParserError`.

    :param str url: URL input to validate.
    :param str base: Optional base URL to parse relative to.
    :param str encoding: Character encoding to parse with. Defaults to UTF-8.
    :rtype: bool
    :return: True if parsing succeeded, False if it raised UrlParserError.
    """
    try:
        parse_url(url, base=base, encoding=encoding)
    except UrlParserError:
        return False
    return True


class _OpaqueOrigin(tuple):
    """Tuple whose instances never compare equal, not even to themselves."""

    def __eq__(self, other):
        return False

    def __ne__(self, other):
        return True


def b(x, encoding="ascii"):
    """Encode text to bytes with ``encoding``; return any other value as-is."""
    return x.encode(encoding) if isinstance(x, six.text_type) else x


# Character classes used by the parser state handlers.
ASCII_ALPHA = set(string.ascii_letters)
ASCII_DIGITS = set(string.digits)
ASCII_ALPHANUMERIC = ASCII_ALPHA.union(ASCII_DIGITS)
TWO_ASCII_HEX = re.compile(r"^[a-fA-F0-9]{2}")
URL_CODEPOINTS = ASCII_ALPHANUMERIC.union("!$&'()*+,-./:;=?@_~")
SCHEME_CHARS = ASCII_ALPHANUMERIC.union("+-.")

# U+FDD0..U+FDEF plus the last two code points (xFFFE, xFFFF) of each of
# the 17 planes: 32 + 34 = 66 values.
NONCHARACTERS = set(range(0xFDD0, 0xFDF0)) | {
    (plane << 16) | low for plane in range(0x11) for low in (0xFFFE, 0xFFFF)
}

SINGLE_DOT_PATH_SEGMENTS = {".", "%2e", "%2E"}
# Every two-part combination of the single-dot spellings ("..", ".%2e", ...).
DOUBLE_DOT_PATH_SEGMENTS = {
    first + second
    for first in SINGLE_DOT_PATH_SEGMENTS
    for second in SINGLE_DOT_PATH_SEGMENTS
}

C0_PERCENT_ENCODE = {chr(code) for code in range(0x20)}
# Percent-encode sets; each one extends the previous set.
FRAGMENT_PERCENT_ENCODE = C0_PERCENT_ENCODE.union(' "<>`')
PATH_PERCENT_ENCODE = FRAGMENT_PERCENT_ENCODE.union("#?{}")
USERINFO_PERCENT_ENCODE = PATH_PERCENT_ENCODE.union("/:;=@[\\]^|")

# NUL, tab, LF, CR, space and the delimiter characters rejected in hosts.
FORBIDDEN_HOST_CODE_POINTS = set("\x00\t\x0a\x0d #%/:?@[\\]")

WINDOWS_DRIVE_LETTER = re.compile(r"^([a-zA-Z][:|])(?:[/\\?#]|$)")
NORMALIZED_WINDOWS_DRIVE_LETTER = re.compile(r"^[a-zA-Z][:]$")

# The empty string stands for end-of-input in the state handlers.
AUTHORITY_DELIMITERS = {"", "/", "?", "#"}
PATH_DELIMITERS = AUTHORITY_DELIMITERS | {"\\"}

# Maps every two-hex-digit byte string (any letter case) to the byte it names.
HEX_CHAR_MAP = {
    b(pair): b(chr(int(pair, 16)), "charmap")
    for pair in (
        high + low for high in string.hexdigits for low in string.hexdigits
    )
}

IDNA_DOTS_REGEX = re.compile(u"[\u002e\u3002\uff0e\uff61]")


# Special schemes and their default ports ("file" has none).
SPECIAL_SCHEMES = dict(
    ftp=21, gopher=70, http=80, https=443, ws=80, wss=443, file=None
)


# Parser states, numbered 1..21 in declaration order.
(
    PARSER_STATE_SCHEME_START,
    PARSER_STATE_SCHEME,
    PARSER_STATE_NO_SCHEME,
    PARSER_STATE_SPECIAL_RELATIVE_OR_AUTHORITY,
    PARSER_STATE_PATH_OR_AUTHORITY,
    PARSER_STATE_RELATIVE,
    PARSER_STATE_RELATIVE_SLASH,
    PARSER_STATE_SPECIAL_AUTHORITY_SLASHES,
    PARSER_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES,
    PARSER_STATE_AUTHORITY,
    PARSER_STATE_HOST,
    PARSER_STATE_HOSTNAME,
    PARSER_STATE_PORT,
    PARSER_STATE_FILE,
    PARSER_STATE_FILE_SLASH,
    PARSER_STATE_FILE_HOST,
    PARSER_STATE_PATH_START,
    PARSER_STATE_PATH,
    PARSER_STATE_CANNOT_BE_BASE_URL,
    PARSER_STATE_QUERY,
    PARSER_STATE_FRAGMENT,
) = range(1, 22)


class UrlParserError(ValueError):
    """Raised by the parser when the input cannot be parsed as a URL."""


class _UrlParserReturn(Exception):
    """Raised by state handlers to stop parsing early.

    ``UrlParser.parse`` catches it and returns the URL built so far.
    """
class Url(object):
    """Parsed URL record exposing each component as a property.

    Components live on private attributes (``_scheme``, ``_hostname``,
    ``_port``, ``_username``, ``_password``, ``_query``, ``_fragment`` and
    ``_path``, a list of path segments).  The ``scheme``, ``hostname``,
    ``port``, ``path``, ``query`` and ``fragment`` setters feed the new value
    to a :class:`UrlParser` bound to this object with a state override;
    ``username`` and ``password`` are stored exactly as given.

    :param path: List of path segments; a new empty list when omitted.
    :param bool cannot_be_base_url: When true, ``path`` and ``href`` use
        ``_path[0]`` verbatim instead of joining segments with ``/``.
    :param encoding: Encoding name the setters hand to the parser.
    """

    def __init__(
        self,
        scheme=None,
        hostname=None,
        port=None,
        username=None,
        password=None,
        query=None,
        fragment=None,
        path=None,
        cannot_be_base_url=False,
        encoding="utf-8",
    ):
        # A fresh list per instance; a shared mutable default would leak
        # path segments between URLs.
        if path is None:
            path = []

        self._scheme = scheme
        self._hostname = hostname
        self._port = port
        self._username = username
        self._password = password
        self._query = query
        self._fragment = fragment
        self._path = path

        self.encoding = encoding
        self.cannot_be_base_url = cannot_be_base_url

    @property
    def scheme(self):
        return self._scheme

    @property
    def hostname(self):
        return self._hostname

    @property
    def port(self):
        return self._port

    @property
    def username(self):
        return self._username

    @property
    def password(self):
        return self._password

    @property
    def query(self):
        return self._query

    @property
    def fragment(self):
        return self._fragment

    @property
    def host(self):
        """Hostname, followed by ``:port`` when a port is set."""
        if self._port is None:
            return self._hostname
        return "%s:%s" % (self._hostname, self._port)

    @property
    def path(self):
        """Serialized path: ``_path[0]`` for cannot-be-a-base URLs,
        otherwise every segment prefixed with ``/``."""
        if self.cannot_be_base_url:
            return self._path[0]
        else:
            return "".join(["/%s" % x for x in self._path])

    @scheme.setter
    def scheme(self, scheme):
        parser = UrlParser(self)
        parser.parse(
            scheme + ":",
            encoding=self.encoding,
            state_override=PARSER_STATE_SCHEME_START,
        )

    @username.setter
    def username(self, username):
        self._username = username

    @password.setter
    def password(self, password):
        self._password = password

    @hostname.setter
    def hostname(self, hostname):
        parser = UrlParser(self)
        parser.parse(
            hostname, encoding=self.encoding, state_override=PARSER_STATE_HOSTNAME
        )

    @port.setter
    def port(self, port):
        """Set the port; ``None`` or ``""`` removes it.

        Previously ``None`` was stringified to ``"None"`` and silently
        ignored by the parser, so an explicit port could never be cleared.
        This mirrors how the ``query`` and ``fragment`` setters treat
        ``None``.
        """
        if port is None or port == "":
            self._port = None
            return

        parser = UrlParser(self)
        parser.parse(str(port), state_override=PARSER_STATE_PORT)

    @path.setter
    def path(self, path):
        # The path of a cannot-be-a-base URL is left untouched.
        if self.cannot_be_base_url:
            return

        self._path = []
        parser = UrlParser(self)
        parser.parse(path, state_override=PARSER_STATE_PATH_START)

    @query.setter
    def query(self, query):
        if query is None:
            self._query = None
            return

        if query.startswith("?"):
            query = query[1:]

        self._query = ""
        parser = UrlParser(self)
        parser.parse(query, encoding=self.encoding, state_override=PARSER_STATE_QUERY)

    @fragment.setter
    def fragment(self, fragment):
        if fragment is None:
            self._fragment = None
            return

        if fragment.startswith("#"):
            fragment = fragment[1:]

        self._fragment = ""
        parser = UrlParser(self)
        parser.parse(
            fragment, encoding=self.encoding, state_override=PARSER_STATE_FRAGMENT
        )

    @property
    def includes_credentials(self):
        """Determines if a URL includes credentials"""
        return bool(self._username) or bool(self._password)

    @property
    def origin(self):
        """Origin of the URL.

        * ``blob`` scheme: the origin of the URL parsed from ``_path[0]``,
          or an opaque origin when that fails to parse.
        * special schemes other than ``file``:
          ``(scheme, hostname, port, None)``.
        * anything else: an opaque origin (never equal to any value).
        """
        if self.scheme == "blob":
            try:
                url = parse_url(self._path[0], encoding=self.encoding)
            except UrlParserError:
                return _OpaqueOrigin((None, None, None, None))
            return url.origin

        elif self.scheme in SPECIAL_SCHEMES and self.scheme != "file":
            return self.scheme, self.hostname, self.port, None

        else:
            return _OpaqueOrigin((None, None, None, None))

    @property
    def authority(self):
        """``[username[:password]@]hostname[:port]``.

        Returns ``""`` when the URL has no hostname; joining a ``None``
        hostname used to raise ``TypeError``.
        """
        if self._hostname is None:
            return ""

        output = []
        if self.includes_credentials:
            if self._username:
                output.append(self._username)
            if self._password:
                output.append(":" + self._password)
            output.append("@")

        output.append(self._hostname)
        if self._port is not None:
            output.append(":%s" % self._port)
        return "".join(output)

    @property
    def href(self):
        """Full serialization of the URL."""
        output = [self._scheme + ":"]

        if self._hostname is not None:
            output.append("//" + self.authority)
        elif self._scheme == "file":
            # A host-less file URL still serializes as "file://...".
            output.append("//")

        # ``path`` already handles the cannot-be-a-base case.
        output.append(self.path)

        if self._query is not None:
            output.append("?" + self._query)

        if self._fragment is not None:
            output.append("#" + self._fragment)

        return "".join(output)

    def __repr__(self):
        return ("<%s scheme=%r hostname=%r port=%r path=%r query=%r fragment=%r>") % (
            self.__class__.__name__,
            self._scheme,
            self._hostname,
            self._port,
            self.path,
            self._query,
            self._fragment,
        )

    def __str__(self):
        return self.href
def parse(self, data, base=None, encoding=None, state_override=None):
    """Run the state machine over ``data`` and return ``self.url``.

    :param data: URL text (or, with ``state_override``, a single component).
    :param base: Optional base URL; a string is parsed with a fresh
        :class:`UrlParser` first, any other value is used as given.
    :param encoding: Encoding name; when ``None`` the URL's current
        ``encoding`` is used, falling back to ``"utf-8"``.
    :param state_override: Parser state to start in instead of the
        scheme-start state; used by the ``Url`` setters.
    :raises: UrlParserError (propagated from the state handlers)
    :return: The ``Url`` object this parser is bound to.
    """
    self.reset()

    # ``six.string_types`` also matches ``unicode`` on Python 2, where a
    # plain ``str`` check would leave a text base unparsed.
    if base is not None and isinstance(base, six.string_types):
        base = UrlParser().parse(base, encoding=encoding)
    self.base = base

    self.state_override = state_override
    self._state = state_override or PARSER_STATE_SCHEME_START

    if encoding is None:
        self.encoding = self.url.encoding or "utf-8"
    else:
        self.encoding = encoding

    self.url.encoding = self.encoding

    # Leading/trailing characters flagged by ``_is_c0_control_or_space``
    # are dropped and recorded as a validation error.
    while data and _is_c0_control_or_space(data[0]):
        self.validation_error = True
        data = data[1:]

    while data and _is_c0_control_or_space(data[-1]):
        self.validation_error = True
        data = data[:-1]

    # Tabs and newlines anywhere in the input are removed as well.
    before_len = len(data)
    data = data.replace("\t", "").replace("\n", "").replace("\r", "")

    if len(data) < before_len:
        self.validation_error = True

    try:
        end_pointer = len(data)

        # The second clause lets empty input still reach the end-of-input
        # step below exactly once.
        while self._pointer < end_pointer or (
            end_pointer == 0 and self._pointer == 0
        ):
            if end_pointer > 0:
                self._call_state_handler(
                    self._state, data[self._pointer], data[self._pointer + 1 :]
                )

            # End of input is signalled with c == "".  A handler may move
            # the pointer back, in which case the new state is invoked
            # with the same end-of-input marker again.
            while self._pointer == end_pointer:
                self._call_state_handler(self._state, "", "")

    except _UrlParserReturn:
        # A handler asked to stop early; the URL built so far is the result.
        pass

    return self.url
def shorten_url_path(self):
    """Drop the last segment of the URL's path, in place.

    Nothing happens when the path is empty, or when the URL has the
    ``file`` scheme and its only segment is a normalized Windows drive
    letter.
    """
    segments = self.url._path
    if not segments:
        return

    lone_drive_letter = (
        self.url.scheme == "file"
        and len(segments) == 1
        and NORMALIZED_WINDOWS_DRIVE_LETTER.match(segments[0]) is not None
    )
    if not lone_drive_letter:
        del segments[-1]
def _on_scheme(self, c, remaining):
    """Handles the SCHEME state.

    Collects scheme characters into the buffer until ``:`` is seen, then
    stores the scheme and selects the next state from it.  Any other
    character restarts parsing in the NO SCHEME state, or fails when a
    state override is active.
    """
    if c in SCHEME_CHARS:
        self._buffer += c.lower()

    elif c == ":":
        # Scheme setter: refuse changes that would leave the URL
        # inconsistent, by returning without touching the scheme.
        if self.state_override is not None:
            # Special <-> non-special switches are not allowed.
            if (self._buffer in SPECIAL_SCHEMES) != (
                self.url.scheme in SPECIAL_SCHEMES
            ):
                raise _UrlParserReturn()

            # A URL with credentials or a port cannot become "file".
            elif (
                self.url.includes_credentials or self.url.port is not None
            ) and self._buffer == "file":
                raise _UrlParserReturn()

            # A "file" URL with no (or an empty) host keeps its scheme.
            elif self.url.scheme == "file" and (
                self.url.hostname is None or self.url.hostname == ""
            ):
                raise _UrlParserReturn()

        self.url._scheme = self._buffer

        if self.state_override is not None:
            # Drop the port when it equals the new scheme's default port.
            if (
                self.url.scheme in SPECIAL_SCHEMES
                and SPECIAL_SCHEMES[self.url.scheme] == self.url.port
            ):
                self.url._port = None
            raise _UrlParserReturn()

        self._buffer = ""

        if self.url.scheme == "file":
            if not remaining.startswith("//"):
                self.validation_error = True
            self._state = PARSER_STATE_FILE

        elif (
            self.url.scheme in SPECIAL_SCHEMES
            and self.base is not None
            and self.base.scheme == self.url.scheme
        ):
            self._state = PARSER_STATE_SPECIAL_RELATIVE_OR_AUTHORITY

        elif self.url.scheme in SPECIAL_SCHEMES:
            self._state = PARSER_STATE_SPECIAL_AUTHORITY_SLASHES

        elif remaining.startswith("/"):
            self._state = PARSER_STATE_PATH_OR_AUTHORITY
            # Skip the "/" that was just looked at.
            self._pointer += 1

        else:
            # No "/" after the scheme: cannot-be-a-base URL whose path is
            # a single string, started here as "".
            self.url.cannot_be_base_url = True
            self.url._path.append("")
            self._state = PARSER_STATE_CANNOT_BE_BASE_URL

    elif self.state_override is None:
        # Not a scheme after all: start over from the first character.
        # _call_state_handler adds 1 after this handler returns, so -1
        # puts the pointer back at 0.
        self._buffer = ""
        self._state = PARSER_STATE_NO_SCHEME
        self._pointer = -1

    else:
        self.validation_error = True
        raise UrlParserError()
"""Handles the NO SCHEME state""" 766 | if self.base is None or (self.base.cannot_be_base_url and c != "#"): 767 | self.validation_error = True 768 | raise UrlParserError() 769 | 770 | elif self.base.cannot_be_base_url and c == "#": 771 | self.url._scheme = self.base.scheme 772 | self.url._path = self.base._path[:] 773 | self.url._query = self.base.query 774 | self.url._fragment = "" 775 | self.url.cannot_be_base_url = True 776 | self._state = PARSER_STATE_FRAGMENT 777 | 778 | elif self.base.scheme != "file": 779 | self._state = PARSER_STATE_RELATIVE 780 | self._pointer -= 1 781 | 782 | else: 783 | self._state = PARSER_STATE_FILE 784 | self._pointer -= 1 785 | 786 | def _on_special_relative_or_authority(self, c, remaining): 787 | """Handles the SPECIAL RELATIVE OR AUTHORITY state""" 788 | if c == "/" and remaining.startswith("/"): 789 | self._state = PARSER_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES 790 | self._pointer += 1 791 | 792 | else: 793 | self.validation_error = True 794 | self._state = PARSER_STATE_RELATIVE 795 | self._pointer -= 1 796 | 797 | def _on_path_or_authority(self, c, _): 798 | """Handles the PATH OR AUTHORITY state""" 799 | if c == "/": 800 | self._state = PARSER_STATE_AUTHORITY 801 | else: 802 | self._state = PARSER_STATE_PATH 803 | self._pointer -= 1 804 | 805 | def _on_relative(self, c, _): 806 | """Handles the RELATIVE state""" 807 | self.url._scheme = self.base.scheme 808 | 809 | if c == "": 810 | self.url._username = self.base.username 811 | self.url._password = self.base.password 812 | self.url._hostname = self.base.hostname 813 | self.url._port = self.base.port 814 | self.url._path = self.base._path[:] 815 | self.url._query = self.base.query 816 | 817 | elif c == "/": 818 | self._state = PARSER_STATE_RELATIVE_SLASH 819 | 820 | elif c == "?": 821 | self.url._username = self.base.username 822 | self.url._password = self.base.password 823 | self.url._hostname = self.base.hostname 824 | self.url._port = self.base.port 825 | self.url._path = 
def _on_relative_slash(self, c, _):
    """Handles the RELATIVE SLASH state"""
    if c in ("/", "\\") and self.url.scheme in SPECIAL_SCHEMES:
        # A second slash (or a backslash, flagged as a validation error)
        # on a special scheme introduces an authority.
        if c == "\\":
            self.validation_error = True
        self._state = PARSER_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES
        return

    if c == "/":
        self._state = PARSER_STATE_AUTHORITY
        return

    # Anything else is a path: inherit the authority parts from the base
    # and hand the current character to the PATH state again.
    base = self.base
    target = self.url
    target._username = base.username
    target._password = base.password
    target._hostname = base.hostname
    target._port = base.port

    self._pointer -= 1
    self._state = PARSER_STATE_PATH
def _on_authority(self, c, _):
    """Handles the AUTHORITY state"""
    if c == "@":
        self.validation_error = True

        # A previous "@" belonged to the credentials, so it is re-added in
        # percent-encoded form ahead of what has been buffered since.
        if self._at_flag:
            self._buffer = "%40" + self._buffer
        self._at_flag = True

        for char in self._buffer:
            # The first ":" separates username from password and is not
            # itself stored.
            if char == ":" and not self._password_token_seen_flag:
                self._password_token_seen_flag = True
                continue

            if self._password_token_seen_flag:
                if self.url.password is None:
                    self.url._password = ""
                self.url._password = self.url._password + _percent_encode(
                    char, USERINFO_PERCENT_ENCODE
                )
            else:
                if self.url.username is None:
                    self.url._username = ""
                self.url._username = self.url._username + _percent_encode(
                    char, USERINFO_PERCENT_ENCODE
                )

        self._buffer = ""
        return

    ends_authority = c in AUTHORITY_DELIMITERS or (
        self.url.scheme in SPECIAL_SCHEMES and c == "\\"
    )
    if not ends_authority:
        self._buffer += c
        return

    # Credentials followed by nothing at all is a failure.
    if self._at_flag and self._buffer == "":
        self.validation_error = True
        raise UrlParserError()

    # Rewind to the start of the buffered text so the HOST state re-reads it.
    self._pointer -= len(self._buffer) + 1
    self._buffer = ""
    self._state = PARSER_STATE_HOST
def _on_port(self, c, _):
    """Handles the PORT state

    Buffers ASCII digits.  The port ends at end of input, ``/``, ``?`` or
    ``#``, at ``\\`` for special schemes only, or at any character when a
    state override is active (port setter).  The buffered number is then
    stored, or ``None`` when it equals the scheme's default port.

    :raises: UrlParserError if the port exceeds 65535 or is followed by
        any other character.
    """
    if c in ASCII_DIGITS:
        self._buffer += c

    elif (
        # AUTHORITY_DELIMITERS, not PATH_DELIMITERS: the latter contains
        # "\\", which wrongly ended the port on non-special schemes and
        # made the explicit special-scheme clause below redundant.
        c in AUTHORITY_DELIMITERS
        or (c == "\\" and self.url.scheme in SPECIAL_SCHEMES)
        or self.state_override is not None
    ):
        if self._buffer != "":
            try:
                port = int(self._buffer)
            except ValueError as e:
                six.raise_from(UrlParserError(), e)

            if port > 2 ** 16 - 1:
                self.validation_error = True
                raise UrlParserError()

            # The default port of the scheme is stored as "no port".
            self.url._port = (
                None if port == SPECIAL_SCHEMES.get(self.url.scheme, None) else port
            )
            self._buffer = ""

        if self.state_override:
            raise _UrlParserReturn()

        # Let the PATH START state look at the delimiter again.
        self._state = PARSER_STATE_PATH_START
        self._pointer -= 1

    else:
        self.validation_error = True
        raise UrlParserError()
state""" 1028 | self.url._scheme = "file" 1029 | 1030 | if c == "/" or c == "\\": 1031 | if c == "\\": 1032 | self.validation_error = True 1033 | self._state = PARSER_STATE_FILE_SLASH 1034 | 1035 | elif self.base is not None and self.base.scheme == "file": 1036 | if c == "": 1037 | self.url._hostname = self.base.hostname 1038 | self.url._path = self.base._path[:] 1039 | self.url._query = self.base.query 1040 | 1041 | elif c == "?": 1042 | self.url._hostname = self.base.hostname 1043 | self.url._path = self.base._path[:] 1044 | self.url._query = "" 1045 | 1046 | self._state = PARSER_STATE_QUERY 1047 | 1048 | elif c == "#": 1049 | self.url._hostname = self.base.hostname 1050 | self.url._path = self.base._path[:] 1051 | self.url._query = self.base.query 1052 | self.url._fragment = "" 1053 | 1054 | self._state = PARSER_STATE_FRAGMENT 1055 | 1056 | else: 1057 | match = WINDOWS_DRIVE_LETTER.search(c + remaining) 1058 | if match is None: 1059 | self.url._hostname = self.base.hostname 1060 | self.url._path = self.base._path[:] 1061 | self.shorten_url_path() 1062 | 1063 | else: 1064 | self.validation_error = True 1065 | 1066 | self._state = PARSER_STATE_PATH 1067 | self._pointer -= 1 1068 | 1069 | else: 1070 | self._state = PARSER_STATE_PATH 1071 | self._pointer -= 1 1072 | 1073 | def _on_file_slash(self, c, remaining): 1074 | """Handles the FILE SLASH state""" 1075 | if c == "/" or c == "\\": 1076 | if c == "\\": 1077 | self.validation_error = True 1078 | self._state = PARSER_STATE_FILE_HOST 1079 | 1080 | else: 1081 | if ( 1082 | self.base is not None 1083 | and self.base.scheme == "file" 1084 | and WINDOWS_DRIVE_LETTER.search(c + remaining) is None 1085 | ): 1086 | if ( 1087 | len(self.base._path) > 0 1088 | and NORMALIZED_WINDOWS_DRIVE_LETTER.match(self.base._path[0]) 1089 | is not None 1090 | ): 1091 | self.url._path.append(self.base._path[0]) 1092 | 1093 | else: 1094 | self.url._hostname = self.base.hostname 1095 | 1096 | self._state = PARSER_STATE_PATH 1097 | 
def _on_path(self, c, remaining):
    """Handles the PATH state

    Code points are accumulated (percent-encoded) in ``self._buffer`` until
    a terminator is seen; the buffer is then committed to ``self.url._path``
    as one segment, with the DOUBLE_DOT/SINGLE_DOT segments and Windows
    drive letters handled specially.
    """
    # A backslash only acts as a separator when the scheme is special.
    cond = c == "\\" and self.url.scheme in SPECIAL_SCHEMES
    # NOTE(review): c == "" presumably marks end of input -- confirm
    # against the caller that feeds this handler.
    if (
        c == ""
        or c == "/"
        or cond
        or (self.state_override is None and (c == "?" or c == "#"))
    ):
        if cond:
            self.validation_error = True

        if self._buffer in DOUBLE_DOT_PATH_SEGMENTS:
            self.shorten_url_path()

            # Unless another segment follows ("/" or special backslash),
            # finish the path with an empty segment.
            if not (c == "/" or cond):
                self.url._path.append("")

        elif self._buffer in SINGLE_DOT_PATH_SEGMENTS and not (c == "/" or cond):
            self.url._path.append("")

        elif self._buffer not in SINGLE_DOT_PATH_SEGMENTS:
            if (
                self.url.scheme == "file"
                and len(self.url._path) == 0
                and WINDOWS_DRIVE_LETTER.match(self._buffer) is not None
            ):
                # A drive letter as first segment of a file URL: a
                # non-empty host is flagged and cleared.
                if self.url.hostname != "" and self.url.hostname is not None:
                    self.validation_error = True
                    self.url._hostname = ""

                # Replace the second code point of the buffer with ":".
                self._buffer = self._buffer[0] + ":" + self._buffer[2:]

            self.url._path.append(self._buffer)

        # The segment (if any) has been committed; start a fresh one.
        self._buffer = ""

        if self.url.scheme == "file" and c in PATH_DELIMITERS:
            # Drop leading empty segments from file paths, flagging each
            # removal as a validation error.
            while len(self.url._path) > 1 and self.url._path[0] == "":
                self.validation_error = True
                self.url._path.pop(0)

        if c == "?":
            self.url._query = ""
            self._state = PARSER_STATE_QUERY

        elif c == "#":
            self.url._fragment = ""
            self._state = PARSER_STATE_FRAGMENT

    else:
        # Ordinary path code point: flag anything that is neither "%" nor
        # a URL code point, and any "%" for which TWO_ASCII_HEX finds no
        # match in the remaining input.
        if c != "%" and not _is_url_codepoint(c):
            self.validation_error = True
        if c == "%" and TWO_ASCII_HEX.search(remaining) is None:
            self.validation_error = True
        self._buffer += _percent_encode(c, PATH_PERCENT_ENCODE)
TWO_ASCII_HEX.search(remaining) is None: 1233 | self.validation_error = True 1234 | 1235 | if c != "": 1236 | self.url._path[0] += _percent_encode(c, C0_PERCENT_ENCODE) 1237 | 1238 | def _on_query(self, c, remaining): 1239 | """Handles the QUERY state""" 1240 | if self.encoding != "utf-8" and ( 1241 | self.url.scheme == "ws" 1242 | or self.url.scheme == "wss" 1243 | or self.url.scheme not in SPECIAL_SCHEMES 1244 | ): 1245 | self.encoding = "utf-8" 1246 | 1247 | if self.state_override is None and c == "#": 1248 | self.url._fragment = "" 1249 | self._state = PARSER_STATE_FRAGMENT 1250 | 1251 | elif c != "": 1252 | if c != "%" and not _is_url_codepoint(c): 1253 | self.validation_error = True 1254 | 1255 | if c == "%" and TWO_ASCII_HEX.search(remaining) is None: 1256 | self.validation_error = True 1257 | 1258 | bytes_ = c.encode(self.encoding) 1259 | 1260 | if bytes_.startswith(b"&#") and bytes_.endswith(b";"): 1261 | self.url._query += (b"%26%23" + bytes_[2:-1] + b"%3B").decode("ascii") 1262 | 1263 | else: 1264 | is_special = self.url.scheme in SPECIAL_SCHEMES 1265 | for byte in _iterbytes(bytes_): 1266 | if ( 1267 | byte < 0x21 1268 | or byte > 0x7e 1269 | or byte == 0x22 1270 | or byte == 0x23 1271 | or byte == 0x3c 1272 | or byte == 0x3e 1273 | or (is_special and byte == 0x27) 1274 | ): 1275 | self.url._query += "%" + _hex(byte) 1276 | else: 1277 | self.url._query += chr(byte) 1278 | 1279 | def _on_fragment(self, c, remaining): 1280 | if c == "": 1281 | pass 1282 | 1283 | elif c == "\x00": 1284 | self.validation_error = True 1285 | 1286 | else: 1287 | if c != "%" and _is_url_codepoint(c): 1288 | self.validation_error = True 1289 | 1290 | if c == "%" and TWO_ASCII_HEX.search(remaining) is None: 1291 | self.validation_error = True 1292 | 1293 | self.url._fragment += _percent_encode(c, FRAGMENT_PERCENT_ENCODE) 1294 | 1295 | 1296 | def _string_percent_decode(data): 1297 | bytes_ = data.encode("utf-8") 1298 | return _percent_decode(bytes_) 1299 | 1300 | 1301 | def 
_percent_encode(c, encode_set): 1302 | if c in encode_set or ord(c) > 0x7e: 1303 | if not isinstance(c, bytes): 1304 | c = c.encode("utf-8") 1305 | return "".join(["%" + _hex(x) for x in _iterbytes(c)]) 1306 | return c 1307 | 1308 | 1309 | def _is_url_codepoint(c): 1310 | if c in URL_CODEPOINTS: 1311 | return True 1312 | c_ord = ord(c) 1313 | return ( 1314 | 0xa0 <= c_ord <= 0x10fffd 1315 | and not 0xd800 <= c_ord <= 0xdfff 1316 | and not 0xfdd0 <= c_ord <= 0xfdef 1317 | and c_ord not in NONCHARACTERS 1318 | ) 1319 | 1320 | 1321 | def _is_c0_control_or_space(c): 1322 | return c == " " or 0 <= ord(c) <= 0x1f 1323 | 1324 | 1325 | def _percent_decode(bytes_): 1326 | output = [] 1327 | skip = 0 1328 | 1329 | def is_hex(x): 1330 | x = _byte2int(x) 1331 | return 0x30 <= x <= 0x39 or 0x41 <= x <= 0x46 or 0x61 <= x <= 0x66 1332 | 1333 | for i, byte in enumerate(_iterbytes(bytes_)): 1334 | if skip: 1335 | skip -= 1 1336 | continue 1337 | if byte != 0x25: 1338 | output.append(_int2byte(byte)) 1339 | elif ( 1340 | i + 2 >= len(bytes_) 1341 | or not is_hex(bytes_[i + 1]) 1342 | or not is_hex(bytes_[i + 2]) 1343 | ): 1344 | output.append(_int2byte(byte)) 1345 | else: 1346 | value = int(bytes_[i + 1 : i + 3].decode("ascii").lower(), 16) 1347 | skip = 2 1348 | output.append(_int2byte(value)) 1349 | 1350 | return b"".join(output) 1351 | 1352 | 1353 | def _domain_to_ascii(domain, strict=False): 1354 | """Attempt to encode with IDNA 2008 first, if that fails 1355 | then attempt to encode with IDNA 2003. 
1356 | """ 1357 | try: 1358 | return idna.encode( 1359 | domain, strict=strict, std3_rules=strict, uts46=True, transitional=False 1360 | ) 1361 | except idna.IDNAError: 1362 | if isinstance(domain, (bytes, bytearray)): 1363 | domain = domain.decode("ascii") 1364 | domain = idna.uts46_remap(domain, std3_rules=strict, transitional=False) 1365 | trailing_dot = False 1366 | result = [] 1367 | if strict: 1368 | labels = domain.split(".") 1369 | else: 1370 | labels = IDNA_DOTS_REGEX.split(domain) 1371 | 1372 | if not labels or labels == [""]: 1373 | raise idna.IDNAError("Empty domain") 1374 | if labels[-1] == "": 1375 | del labels[-1] 1376 | trailing_dot = True 1377 | 1378 | for label in labels: 1379 | try: 1380 | s = idna2003.ToASCII(label) 1381 | except UnicodeError: 1382 | if strict: 1383 | raise 1384 | result.append(label.encode("utf-8")) 1385 | continue 1386 | if s: 1387 | result.append(s) 1388 | else: 1389 | raise idna.IDNAError("Empty label") 1390 | if trailing_dot: 1391 | result.append(b"") 1392 | s = b".".join(result) 1393 | if not idna.valid_string_length(s, trailing_dot): 1394 | raise idna.IDNAError("Domain too long") 1395 | return s 1396 | 1397 | 1398 | def _parse_ipv4_number(input_): 1399 | """Parses a single IPv4 number""" 1400 | 1401 | r = 10 1402 | 1403 | try: 1404 | if len(input_) >= 2: 1405 | if input_[:2].lower() == "0x": 1406 | r = 16 1407 | input_ = input_[2:] 1408 | 1409 | elif input_.startswith("0"): 1410 | r = 8 1411 | input_ = input_[1:] 1412 | 1413 | if input_ == "": 1414 | return 0, False 1415 | 1416 | return int(input_, r), r != 10 1417 | except ValueError: 1418 | return None, False 1419 | 1420 | 1421 | class ParseResultMixin(object): 1422 | def geturl(self): 1423 | return self.url.href 1424 | 1425 | @property 1426 | def username(self): 1427 | if self.url.password: 1428 | return self.url.username or "" 1429 | return self.url.username 1430 | 1431 | @property 1432 | def password(self): 1433 | return self.url.password 1434 | 1435 | @property 1436 
def urlparse(urlstring, scheme="", allow_fragments=True, encoding="utf-8"):
    """Compatible with urllib.parse.urlparse().
    See documentation of urlparse() for more information.
    """
    parsed = UrlParser(Url()).parse(urlstring, encoding=encoding)

    # A non-empty ``scheme`` argument is assigned to the parsed URL.
    if scheme != "":
        parsed.scheme = scheme

    # With fragments disabled the fragment is folded into the path.
    if not allow_fragments:
        _add_url_fragment_to_path(parsed)

    # The "params" component is always reported as empty.
    return ParseResult(
        parsed.scheme,
        parsed.authority,
        parsed.path,
        "",
        parsed.query,
        parsed.fragment,
        parsed,
    )
1484 | """ 1485 | parser = UrlParser(Url()) 1486 | url = parser.parse(url, base=base, encoding=encoding) 1487 | if not allow_fragments: 1488 | _add_url_fragment_to_path(url) 1489 | return url.href 1490 | 1491 | 1492 | def _add_url_fragment_to_path(url): 1493 | if len(url._path): 1494 | url._path[-1] += "#" + url.fragment 1495 | else: 1496 | url._path.append("#" + url.fragment) 1497 | url.fragment = None 1498 | 1499 | 1500 | def _iterbytes(bytes_): 1501 | if six.PY3: 1502 | return bytes_ 1503 | else: 1504 | return [ord(x) for x in bytes_] 1505 | 1506 | 1507 | def _byte2int(byte): 1508 | if not isinstance(byte, int): 1509 | return ord(byte) 1510 | return byte 1511 | 1512 | 1513 | def _int2byte(i): 1514 | if six.PY3: 1515 | return i.to_bytes(length=1, byteorder="little") 1516 | return chr(i) 1517 | 1518 | 1519 | def _hex(x): 1520 | return hex(_byte2int(x))[2:].zfill(2).upper() 1521 | --------------------------------------------------------------------------------