├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── test_special_cases.py
    ├── test_url.py
    ├── test_urllib.py
    ├── test_web_platform_tests.py
    └── testdata.json
├── tox.ini
└── whatwg_url.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: false
 2 | language: python
 3 | 
 4 | branches:
 5 |   only:
 6 |   - master
 7 | 
 8 | cache:
 9 |   - pip
10 | 
11 | install:
12 |   - "python -m pip install -U pip"
13 |   - "python -m pip install -U setuptools"
14 |   - "python -m pip install -U tox"
15 | 
16 | script:
17 |   - "tox"
18 | 
19 | after_success:
20 |   - "python -m pip install -U codecov"
21 |   - "codecov"
22 | 
23 | matrix:
24 |   include:
25 |     - env: TOXENV=py27
26 |       python: 2.7
27 |     - env: TOXENV=py34
28 |       python: 3.4
29 |     - env: TOXENV=py35
30 |       python: 3.5
31 |     - env: TOXENV=py35
32 |       python: 3.5-dev
33 |     - env: TOXENV=py36
34 |       python: 3.6
35 |     - env: TOXENV=py36
36 |       python: 3.6-dev
37 |     - env: TOXENV=py37
38 |       python: 3.7-dev
39 |       dist: xenial
40 | 
41 |     - env: TOXENV=lint
42 |       python: 3.6
43 | 
44 |     - env: TOXENV=dist
45 |       python: 3.6
46 | 
47 | deploy:
48 |   - provider: "pypi"
49 |     user: "SethMichaelLarson"
50 |     password:
51 |       secure: "ZW+MxzTZ1K/CK5MyDR2CHS8EhS3/XGagI5rK+AVUx87U+0dcv5iTeLpU/L5Iu6kOajYuGeOFcyHuwFa8ILbGqDeMcR0bp3i6Z4AicGQVK1xcTC8C3/nkxGU62Er4iWYpl3oiPN75xhNvfamU4ZYro672Mx0ebAuFJXxmPzfj1YggKmTB7kNv1AUV94S85bwsb4Vmgbd0H5ie74uzaSR3GrLUunT0mqtiJzOAwyGBXAFRjvAX8cBY4bQ6axNnFTyj+84m5S2T/4BnMTg/Y/VwNBkP2rJ3FlhqgxVWoQwvpOMrkqXPiRzojJF6frthBJXC4/L4/luaeDOIk6gaWBlzwpAnQnatVZBQqIJHzVjSnefMk8v5Sh5D3QWlacJK/eU7gXqxJbGj+uV6MY+5dDYG4dJ7G1y9A/vbmVoHBv/0IkMo+yh0KrSHIO4/DTmp0irSqeQr71iCECNxmdlxv6o6JmVgcjIqj56nsvHaQEFB/ylZdhWI2PYA+cIE6sqeVaBE2e0mq7WuYLb4tNYTK1hcRp5dqKoKu7f4W9N6/yWhqxm9iN86LQG0Uq4wwscCdkohMZgkUNvuUQH6r9tSbHIdUpQUD46OkkM/aIzfw2dMVdbC2s3e8XyRaIiKctRrtcktb1YMZgfanv0BbeLH6vBcqfxguML84SCfgGgGzTi4KC8="
52 |     on:
53 |       branch: "master"
54 |       tags: true
55 |     distributions: "sdist"
56 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## 2018.8.26
 4 | 
 5 | ### Added
 6 | 
 7 | - Added `UrlParser` and `Url`
 8 | - Added `UrlParser.parse_host()`
 9 | - Added `UrlParser.parse_ipv4_host()`
10 | - Added `Url.origin`
11 | - Added `Url.authority`
12 | - Added `urlparse` and `urljoin` to be compatible with
 13 |   [`urllib.parse.urlparse`](https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlparse)
14 |   and [`urllib.parse.urljoin`](https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin)
15 | - Added support for Python 2.7, 3.4, and 3.5
16 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE README.md CHANGELOG.md tox.ini setup.cfg
2 | recursive-include tests *.json
3 | recursive-include tests *.py
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # whatwg-url
  2 | 
  3 | [![No Maintenance Intended](http://unmaintained.tech/badge.svg)](http://unmaintained.tech/)
  4 | 
  5 | Python implementation of the [WHATWG URL Living Standard](https://url.spec.whatwg.org/).
  6 | 
  7 | The latest revision of the standard that this package implements is from August 7th, 2018 ([`commit 49060c7`](https://github.com/whatwg/url/commit/49060c74d3047602a572f9e88a6a1101f4fd32f3)).
  8 | 
  9 | ## Getting Started
 10 | 
 11 | Install the `whatwg-url` package using `pip`.
 12 | 
 13 | `python -m pip install whatwg-url`
 14 | 
 15 | And use the module like so:
 16 | 
 17 | ```python
 18 | import whatwg_url
 19 | 
 20 | url = whatwg_url.parse_url("https://www.google.com")
 21 | print(url)
 22 | # Url(scheme='https', hostname='www.google.com', port=None, path='', query='', fragment='')
 23 | ```
 24 | 
 25 | ## Features
 26 | 
 27 | ### Compatibility with `urllib.parse.urlparse()`
 28 | 
 29 | ```python
 30 | import whatwg_url
 31 | 
 32 | parseresult = whatwg_url.urlparse("https://seth:larson@www.google.com:1234/maps?query=string#fragment")
 33 | 
 34 | print(parseresult.scheme)  # 'https'
 35 | print(parseresult.netloc)  # 'seth:larson@www.google.com:1234'
 36 | print(parseresult.userinfo)  # 'seth:larson'
 37 | print(parseresult.path)  # '/maps'
 38 | print(parseresult.params)  # ''
 39 | print(parseresult.query)  # 'query=string'
 40 | print(parseresult.fragment)  # 'fragment'
 41 | print(parseresult.username)  # 'seth'
 42 | print(parseresult.password)  # 'larson'
 43 | print(parseresult.hostname)  # 'www.google.com'
 44 | print(parseresult.port)  # 1234
 45 | print(parseresult.geturl())  # 'https://seth:larson@www.google.com:1234/maps?query=string#fragment'
 46 | ```
 47 | 
 48 | ### URL Normalization
 49 | 
 50 | The WHATWG URL specification describes methods of normalizing URL inputs to usable URLs.
 51 | It handles percent-encodings, default ports, paths, IPv4 and IPv6 addresses, IDNA (2008 and 2003), multiple slashes after scheme, etc.
 52 | 
 53 | ```python
 54 | import whatwg_url
 55 | 
 56 | print(whatwg_url.normalize_url("https://////www.google.com"))  # https://www.google.com/
 57 | print(whatwg_url.normalize_url("https://www.google.com/dir1/../dir2"))  # https://www.google.com/dir2
 58 | print(whatwg_url.normalize_url("https://你好你好"))  # https://xn--6qqa088eba/
 59 | print(whatwg_url.normalize_url("https://０Ｘｃ０．０２５０．０１"))  # https://192.168.0.1/
 60 | ```
 61 | 
 62 | ### URL Validation
 63 | 
 64 | ```python
 65 | print(whatwg_url.is_valid_url("https://www.google.com"))  # True
 66 | print(whatwg_url.is_valid_url("https://www .google.com"))  # False
 67 | ```
 68 | 
 69 | ### Relative URLs
 70 | 
 71 | HTTP redirects often contain relative URLs (via the `Location` header) that need to be applied to the current URL location.
 72 | Passing the `base` parameter lets you give a relative URL as input; it is resolved against `base` and the result is returned as a new `URL` object.
 73 | 
 74 | ```python
 75 | import whatwg_url
 76 | 
 77 | url = whatwg_url.parse_url("../dev?a=1#f", base="https://www.google.com/maps")
 78 | print(url.href)  # https://www.google.com/dev?a=1#f
 79 | ```
 80 | 
 81 | ### URL Property Mutators
 82 | 
 83 | Modifying a property on a `URL` object uses the parser and "state overrides" to properly mutate the `URL` object.
 84 | 
 85 | ```python
 86 | url = whatwg_url.parse_url("http://www.google.com:443")
 87 | 
 88 | print(url.scheme)  # 'http'
 89 | print(url.port)  # 443
 90 | 
 91 | url.scheme = 'https'
 92 | 
 93 | print(url.scheme)  # 'https'
 94 | print(url.port)  # None
 95 | ```
 96 | 
 97 | ### "Splatable"
 98 | 
 99 | The module is a single file which allows for easy vendoring into projects.
100 | 
101 | ## License
102 | 
103 | [Apache-2.0](https://github.com/SethMichaelLarson/whatwg-url/blob/master/LICENSE)
104 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | ignore = E203, E266, E501, W503
 3 | max-line-length = 80
 4 | max-complexity = 18
 5 | select = B,C,E,F,W,T4,B9
 6 | 
 7 | [check-manifest]
 8 | ignore =
 9 |     .travis.yml
10 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import os
 3 | import re
 4 | from setuptools import setup
 5 | 
 6 | base_dir = os.path.dirname(os.path.abspath(__file__))
 7 | 
 8 | version = None
 9 | 
10 | with io.open(os.path.join(base_dir, "whatwg_url.py"), encoding="utf-8") as f:
11 |     for line in f:
12 |         match = re.search(r"^__version__\s+=\s+\"([^\"]+)\"$", line)
13 |         if match:
14 |             version = match.group(1)
15 |             break
16 |     else:
17 |         raise ValueError("Could not find __version__ in whatwg_url.py")
18 | 
19 | 
20 | def get_long_description():
21 |     with io.open(os.path.join(base_dir, "README.md"), encoding="utf-8") as f:
22 |         data = f.read()
23 |     data += "\n\n"
24 |     with io.open(os.path.join(base_dir, "CHANGELOG.md"), encoding="utf-8") as f:
25 |         data += f.read()
26 |     return data
27 | 
28 | 
29 | setup(
30 |     name="whatwg-url",
31 |     version=version,
32 |     description="Python implementation of the WHATWG URL Living Standard",
33 |     long_description=get_long_description(),
34 |     long_description_content_type="text/markdown",
35 |     author="Seth Michael Larson",
36 |     author_email="sethmichaellarson@gmail.com",
37 |     url="https://github.com/SethMichaelLarson/whatwg-url",
38 |     license="Apache-2.0",
39 |     py_modules=["whatwg_url"],
40 |     install_requires=["idna", "six", "ipaddress"],
41 |     classifiers=[
42 |         "Development Status :: 4 - Beta",
43 |         "Intended Audience :: Developers",
44 |         "Natural Language :: English",
45 |         "License :: OSI Approved :: Apache Software License",
46 |         "Programming Language :: Python :: 2",
47 |         "Programming Language :: Python :: 2.7",
48 |         "Programming Language :: Python :: 3",
49 |         "Programming Language :: Python :: 3.4",
50 |         "Programming Language :: Python :: 3.5",
51 |         "Programming Language :: Python :: 3.6",
52 |         "Programming Language :: Python :: 3.7",
53 |         "Topic :: Internet",
54 |     ],
55 | )
56 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sethmlarson/whatwg-url/84be3cb71c327944a57422746d862270a09327e7/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_special_cases.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Source for these URLs:
 3 | # https://www.blackhat.com/docs/us-17/thursday/
 4 | # us-17-Tsai-A-New-Era-Of-SSRF-Exploiting-URL-
 5 | # Parser-In-Trending-Programming-Languages.pdf
 6 | 
 7 | import pytest
 8 | import whatwg_url
 9 | 
10 | 
11 | def test_spaces_with_multiple_ipv4_addresses():
12 |     url = whatwg_url.parse_url("http://1.1.1.1 &@2.2.2.2# @3.3.3.3")
13 | 
14 |     assert url.username == "1.1.1.1%20&"
15 |     assert url.password is None
16 |     assert url.hostname == "2.2.2.2"
17 |     assert url.fragment == "%20@3.3.3.3"
18 | 
19 | 
20 | def test_fragment_with_hostname():
21 |     url = whatwg_url.parse_url("http://google.com#@evil.com/")
22 | 
23 |     assert url.hostname == "google.com"
24 |     assert url.fragment == "@evil.com/"
25 | 
26 | 
27 | def test_multiple_ats_within_authority():
28 |     url = whatwg_url.parse_url("http://foo@evil.com:80@google.com/")
29 | 
30 |     assert url.hostname == "google.com"
31 |     assert url.username == "foo%40evil.com"
32 |     assert url.password == "80"
33 | 
34 | 
35 | def test_multiple_ats_and_space_within_authority():
36 |     url = whatwg_url.parse_url("http://foo@evil.com:80 @google.com/")
37 | 
38 |     assert url.hostname == "google.com"
39 |     assert url.username == "foo%40evil.com"
40 |     assert url.password == "80%20"
41 | 
42 | 
43 | def test_unicode_double_dot_if_stripped_bom():
44 |     url = whatwg_url.parse_url("http://orange.tw/sandbox/ＮＮ/passwd")
45 | 
46 |     assert url.hostname == "orange.tw"
47 |     assert url.path == "/sandbox/%EF%BC%AE%EF%BC%AE/passwd"
48 | 
49 | 
50 | def test_host_contains_tab_in_authority():
51 |     url = whatwg_url.parse_url("http://127.0.0.1\tfoo.google.com")
52 | 
53 |     assert url.host == "127.0.0.1foo.google.com"
54 | 
55 | 
56 | def test_host_contains_tab_in_authority_single_or_double_encoded():
57 |     with pytest.raises(whatwg_url.UrlParserError):
58 |         whatwg_url.parse_url("http://127.0.0.1%09foo.google.com")
59 | 
60 |     with pytest.raises(whatwg_url.UrlParserError):
61 |         whatwg_url.parse_url("http://127.0.0.1%2509foo.google.com")
62 | 
63 | 
64 | def test_injection_within_authority():
65 |     with pytest.raises(whatwg_url.UrlParserError):
66 |         whatwg_url.parse_url("https://127.0.0.1\r\nSET foo 0 60 5\r\n:443/")
67 | 
68 | 
69 | def test_backslash_within_authority():
70 |     url = whatwg_url.parse_url("http://localhost\\@google.com:12345")
71 | 
72 |     assert url.hostname == "localhost"
73 |     assert url.port is None
74 |     assert url.path == "/@google.com:12345"
75 | 
76 | 
77 | def test_relative_url_with_url_contained():
78 |     url = whatwg_url.parse_url(
79 |         url="/redirect?target=http://localhost:61020/", base="https://www.google.com"
80 |     )
81 | 
82 |     assert url.scheme == "https"
83 |     assert url.hostname == "www.google.com"
84 |     assert url.path == "/redirect"
85 |     assert url.query == "target=http://localhost:61020/"
86 | 


--------------------------------------------------------------------------------
/tests/test_url.py:
--------------------------------------------------------------------------------
  1 | import whatwg_url
  2 | 
  3 | 
  4 | def test_url_scheme():
  5 |     url = whatwg_url.parse_url("http://www.google.com:443")
  6 |     url.scheme = "https"
  7 | 
  8 |     assert url.scheme == "https"
  9 |     assert url.port is None
 10 |     assert url.href == "https://www.google.com/"
 11 | 
 12 |     url.scheme = "http"
 13 | 
 14 |     assert url.scheme == "http"
 15 |     assert url.port is None
 16 |     assert url.href == "http://www.google.com/"
 17 | 
 18 | 
 19 | def test_url_host():
 20 |     url = whatwg_url.parse_url("https://www.google.com")
 21 |     url.hostname = "example.com"
 22 | 
 23 |     assert url.hostname == "example.com"
 24 |     assert url.href == "https://example.com/"
 25 | 
 26 | 
 27 | def test_url_port():
 28 |     url = whatwg_url.parse_url("https://www.example.com")
 29 |     url.port = 123
 30 | 
 31 |     assert url.port == 123
 32 |     assert url.host == "www.example.com:123"
 33 |     assert url.href == "https://www.example.com:123/"
 34 | 
 35 |     url.port = 443
 36 | 
 37 |     assert url.port is None
 38 |     assert url.host == "www.example.com"
 39 |     assert url.href == "https://www.example.com/"
 40 | 
 41 | 
 42 | def test_url_user_info():
 43 |     url = whatwg_url.parse_url("https://github.com")
 44 | 
 45 |     url.username = "username"
 46 | 
 47 |     assert url.username == "username"
 48 |     assert url.password is None
 49 |     assert url.href == "https://username@github.com/"
 50 | 
 51 |     url.password = "password"
 52 | 
 53 |     assert url.username == "username"
 54 |     assert url.password == "password"
 55 |     assert url.href == "https://username:password@github.com/"
 56 | 
 57 |     url.username = None
 58 | 
 59 |     assert url.username is None
 60 |     assert url.password == "password"
 61 |     assert url.href == "https://:password@github.com/"
 62 | 
 63 |     url.password = None
 64 | 
 65 |     assert url.username is None
 66 |     assert url.password is None
 67 |     assert url.href == "https://github.com/"
 68 | 
 69 | 
 70 | def test_url_query():
 71 |     url = whatwg_url.parse_url("https://www.google.com")
 72 |     url.query = "?a=1"
 73 | 
 74 |     assert url.query == "a=1"
 75 |     assert url.href == "https://www.google.com/?a=1"
 76 | 
 77 |     url.query = ""
 78 | 
 79 |     assert url.query == ""
 80 |     assert url.href == "https://www.google.com/?"
 81 | 
 82 |     url.query = None
 83 | 
 84 |     assert url.query is None
 85 |     assert url.href == "https://www.google.com/"
 86 | 
 87 | 
 88 | def test_url_fragment():
 89 |     url = whatwg_url.parse_url("https://www.google.com")
 90 |     url.fragment = "abc"
 91 | 
 92 |     assert url.fragment == "abc"
 93 |     assert url.href == "https://www.google.com/#abc"
 94 | 
 95 |     url.fragment = ""
 96 | 
 97 |     assert url.fragment == ""
 98 |     assert url.href == "https://www.google.com/#"
 99 | 
100 |     url.fragment = None
101 | 
102 |     assert url.fragment is None
103 |     assert url.href == "https://www.google.com/"
104 | 
105 | 
106 | def test_url_origin():
107 |     url = whatwg_url.parse_url("https://www.google.com")
108 | 
109 |     assert url.origin == ("https", "www.google.com", None, None)
110 | 
111 | 
112 | def test_url_opaque_origin():
113 |     url = whatwg_url.parse_url("file:///var/tmp/file")
114 | 
115 |     assert tuple(url.origin) == (None, None, None, None)
116 |     assert not url.origin == url.origin
117 |     assert url.origin != url.origin
118 | 
119 | 
120 | def test_url_blob_origin():
121 |     url = whatwg_url.parse_url("blob:https://www.google.com")
122 | 
123 |     assert url.origin == whatwg_url.parse_url("https://www.google.com").origin
124 | 


--------------------------------------------------------------------------------
/tests/test_urllib.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from whatwg_url import urlparse as whatwg_urlparse, urljoin as whatwg_urljoin
 3 | 
 4 | try:
 5 |     from urllib.parse import urlparse as urllib_urlparse, urljoin as urllib_urljoin
 6 | except ImportError:
 7 |     from urlparse import urlparse as urllib_urlparse, urljoin as urllib_urljoin
 8 | 
 9 | 
@pytest.mark.parametrize(
    "url",
    [
        "https://www.google.com/",
        "http://user:pass@www.example.com/",
        "http://:pass@www.example.com/",
        "http://user@www.example.com/",
        "http://www.example.com:432/",
        "http://www.example.com/?a=1;B=c",
        "http://www.example.com/#Fragment",
        "http://username:password@www.example.com:1234/?query=string#fragment",
    ],
)
def test_assert_same_urlparse_result(url):
    """The whatwg ``urlparse`` result matches the standard library's, field by field."""
    expected = urllib_urlparse(url)
    actual = whatwg_urlparse(url)

    compared_fields = (
        "netloc",
        "hostname",
        "port",
        "path",
        "query",
        "fragment",
        "username",
        "password",
    )
    for field in compared_fields:
        assert getattr(expected, field) == getattr(actual, field)

    # The results must also agree when unpacked as plain tuples.
    assert tuple(expected) == tuple(actual)
36 | 
37 | 
# (base, url, expected) triples on which both urljoin implementations must agree.
_URLJOIN_CASES = [
    # Relative references resolved against the base.
    ("http://www.google.com/", "", "http://www.google.com/"),
    ("http://www.google.com/", "/", "http://www.google.com/"),
    ("http://www.google.com/", "maps/", "http://www.google.com/maps/"),
    ("http://www.google.com/", "one/two/", "http://www.google.com/one/two/"),
    ("http://www.google.com/mail", "/maps/", "http://www.google.com/maps/"),
    ("http://www.google.com/", "./", "http://www.google.com/"),
    ("http://www.google.com/maps", "..", "http://www.google.com/"),
    # Absolute URLs replace the base.
    ("http://www.google.com/", "https://www.google.com/", "https://www.google.com/"),
    ("http://www.google.com/", "https://maps.google.com/", "https://maps.google.com/"),
    (
        "https://www.google.com/",
        "https://www.google.com:1234/",
        "https://www.google.com:1234/",
    ),
    # Query-only and fragment-only references.
    ("https://www.google.com/", "?query=string", "https://www.google.com/?query=string"),
    ("https://www.google.com/", "#fragment", "https://www.google.com/#fragment"),
    # Absolute URLs carrying credentials.
    (
        "http://www.google.com/",
        "http://user:pass@www.google.com/",
        "http://user:pass@www.google.com/",
    ),
    ("http://www.google.com/", "http://user@www.google.com/", "http://user@www.google.com/"),
    ("http://www.google.com/", "http://:pass@www.google.com/", "http://:pass@www.google.com/"),
]


@pytest.mark.parametrize(["base", "url", "expected"], _URLJOIN_CASES)
def test_assert_same_urljoin_result(base, url, expected):
    """Both the standard library and whatwg ``urljoin`` produce ``expected``."""
    from_urllib = urllib_urljoin(base, url)
    from_whatwg = whatwg_urljoin(base, url)

    assert from_urllib == expected
    assert from_whatwg == expected
92 | 


--------------------------------------------------------------------------------
/tests/test_web_platform_tests.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import json
 3 | import pytest
 4 | import os
 5 | import whatwg_url
 6 | 
 7 | 
 8 | with io.open(
 9 |     os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata.json"), "rb"
10 | ) as f:
11 |     testdata = f.read()
12 |     if not isinstance(testdata, str):
13 |         testdata = testdata.decode("utf-8")
14 |     testdata = json.loads(testdata, encoding="utf-8")
15 |     testdata = [x for x in testdata if isinstance(x, dict)]
16 | 
17 | 
def assert_with_empty(a, b):
    """Assert ``a == b`` after mapping ``None`` on either side to ``""``."""
    left = "" if a is None else a
    right = "" if b is None else b

    assert left == right
23 | 
24 | 
@pytest.mark.parametrize("testdata", testdata)
def test_web_platform_tests(testdata):
    """Run one fixture entry through ``parse_url`` and compare the result.

    Entries whose input starts with ``blob:`` or whose expected href contains
    ``about:blank`` are skipped.  Entries flagged ``failure`` must raise
    ``UrlParserError``; all others must reproduce the expected username,
    password, pathname, port and href.
    """
    if testdata["input"].startswith("blob:"):
        pytest.skip("blob")

    if "href" in testdata and "about:blank" in testdata["href"]:
        pytest.skip("about:blank")

    base = testdata.get("base", None)
    # A missing/null base means "no base URL", same as "about:blank".
    # Passing None on to parse_url() would fail with a TypeError.
    if base is None or base == "about:blank":
        base = None
    else:
        base = whatwg_url.parse_url(base)

    if testdata.get("failure", False):
        with pytest.raises(whatwg_url.UrlParserError):
            whatwg_url.parse_url(testdata["input"], base=base)

    else:
        url = whatwg_url.parse_url(testdata["input"], base=base)

        assert_with_empty(url._username, testdata.get("username", None))
        assert_with_empty(url._password, testdata.get("password", None))
        assert_with_empty(url.path, testdata.get("pathname", None))

        # The fixture stores ports as strings, with "" meaning "no port".
        port = testdata.get("port", None)
        if port is None or port == "":
            port = None
        else:
            port = int(port)

        assert_with_empty(url._port, port)
        assert url.href == testdata.get("href", None)
58 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = dist,lint,py27,py34,py35,py36,py37
 3 | 
 4 | [testenv]
 5 | deps =
 6 |     pytest
 7 |     pytest-cov
 8 | commands =
 9 |     python --version
10 |     python -m pip --version
11 |     pytest -v --cov whatwg_url tests/
12 |     coverage report
13 | 
14 | [testenv:lint]
15 | basepython = python3.6
16 | deps =
17 |     black
18 |     flake8
19 |     flake8-bugbear
20 | commands =
21 |     python -m black --check setup.py whatwg_url.py tests/
22 |     python -m flake8 setup.py whatwg_url.py tests/
23 | 
24 | [testenv:dist]
25 | basepython = python3.6
26 | deps =
27 |     docutils
28 |     check-manifest
29 |     readme
30 | usedevelop = true
31 | commands =
32 |     python setup.py check --strict --metadata
33 |     check-manifest {toxinidir}
34 | 


--------------------------------------------------------------------------------
/whatwg_url.py:
--------------------------------------------------------------------------------
   1 | """Python implementation of the WHATWG URL Living Standard"""
   2 | 
   3 | import string
   4 | import re
   5 | import ipaddress
   6 | import collections
   7 | import encodings.idna as idna2003
   8 | import idna
   9 | import six
  10 | 
  11 | 
# Public names exported by ``from whatwg_url import *``.
__all__ = [
    "parse_url",
    "normalize_url",
    "is_valid_url",
    "UrlParser",
    "Url",
    "UrlParserError",
    "urlparse",
    "urljoin",
    "ParseResult",
]
# Module metadata: release version string and license identifier.
__version__ = "2018.8.26"
__license__ = "Apache-2.0"
  25 | 
  26 | 
  27 | def parse_url(url, base=None, encoding="utf-8"):
  28 |     """
  29 |     Parses a URL from a string input with an optional base URL.
  30 |     If the input URL is a relative URL then it will be parsed as
  31 |     relative to the base URL.
  32 | 
  33 |     :param str url: URL input string
  34 |     :param str base: Optional base URL to use while parsing.
  35 |     :param encoding: Character encoding to use for parsing the URL, defaults to UTF-8.
  36 |     :rtype: Url
  37 |     :raises: UrlParserError
  38 |     :return: The parsed URL.
  39 |     """
  40 |     parser = UrlParser()
  41 |     return parser.parse(url, base=base, encoding=encoding)
  42 | 
  43 | 
  44 | def normalize_url(url, base=None, encoding="utf-8"):
  45 |     """Normalizes a URL input with and optional base URL.
  46 | 
  47 |     :param str url: URL input to normalize.
  48 |     :param str base: Optional base URL to parse relative to.
  49 |     :param str encoding: Character encoding to parse with. Defaults to UTF-8.
  50 |     :rtype
  51 |     :raises: UrlParserError
  52 |     :return: The normalized URL as a string.
  53 |     """
  54 |     return parse_url(url, base=base, encoding=encoding).href
  55 | 
  56 | 
  57 | def is_valid_url(url, base=None, encoding="utf-8"):
  58 |     """Determines if a URL is a valid URL.
  59 | 
  60 |     :param str url: URL input to validate
  61 |     :param str base: Optional base URL to parse relative to.
  62 |     :param str encoding: Character encoding to parse with. Defaults to UTF-8.
  63 |     :rtype: bool
  64 |     :return: True if the given URL is a valid URL, False otherwise.
  65 |     """
  66 |     try:
  67 |         parse_url(url, base=base, encoding=encoding)
  68 |         return True
  69 |     except UrlParserError:
  70 |         return False
  71 | 
  72 | 
  73 | class _OpaqueOrigin(tuple):
  74 |     def __eq__(self, _):
  75 |         return False
  76 | 
  77 |     def __ne__(self, _):
  78 |         return True
  79 | 
  80 | 
  81 | def b(x, encoding="ascii"):
  82 |     if isinstance(x, six.text_type):
  83 |         return x.encode(encoding)
  84 |     return x
  85 | 
  86 | 
  87 | ASCII_ALPHA = set(string.ascii_letters)
  88 | ASCII_DIGITS = set(string.digits)
  89 | ASCII_ALPHANUMERIC = ASCII_ALPHA | ASCII_DIGITS
  90 | TWO_ASCII_HEX = re.compile(r"^[a-fA-F0-9]{2}")
  91 | URL_CODEPOINTS = ASCII_ALPHANUMERIC | set("!$&'()*+,-./:;=?@_~")
  92 | SCHEME_CHARS = ASCII_ALPHANUMERIC | set("+-.")
  93 | NONCHARACTERS = {
  94 |     0xfdd0,
  95 |     0xfdd1,
  96 |     0xfdd2,
  97 |     0xfdd3,
  98 |     0xfdd4,
  99 |     0xfdd5,
 100 |     0xfdd6,
 101 |     0xfdd7,
 102 |     0xfdd8,
 103 |     0xfdd9,
 104 |     0xfdda,
 105 |     0xfddb,
 106 |     0xfddc,
 107 |     0xfddd,
 108 |     0xfdde,
 109 |     0xfddf,
 110 |     0xfde0,
 111 |     0xfde1,
 112 |     0xfde2,
 113 |     0xfde3,
 114 |     0xfde4,
 115 |     0xfde5,
 116 |     0xfde6,
 117 |     0xfde7,
 118 |     0xfde8,
 119 |     0xfde9,
 120 |     0xfdea,
 121 |     0xfdeb,
 122 |     0xfdec,
 123 |     0xfded,
 124 |     0xfdee,
 125 |     0xfdef,
 126 |     0xfffe,
 127 |     0xffff,
 128 |     0x1fffe,
 129 |     0x1ffff,
 130 |     0x2fffe,
 131 |     0x2ffff,
 132 |     0x3fffe,
 133 |     0x3ffff,
 134 |     0x4fffe,
 135 |     0x4ffff,
 136 |     0x5fffe,
 137 |     0x5ffff,
 138 |     0x6fffe,
 139 |     0x6ffff,
 140 |     0x7fffe,
 141 |     0x7ffff,
 142 |     0x8fffe,
 143 |     0x8ffff,
 144 |     0x9fffe,
 145 |     0x9ffff,
 146 |     0xafffe,
 147 |     0xaffff,
 148 |     0xbfffe,
 149 |     0xbffff,
 150 |     0xcfffe,
 151 |     0xcffff,
 152 |     0xdfffe,
 153 |     0xdffff,
 154 |     0xefffe,
 155 |     0xeffff,
 156 |     0xffffe,
 157 |     0xfffff,
 158 |     0x10fffe,
 159 |     0x10ffff,
 160 | }
 161 | 
 162 | SINGLE_DOT_PATH_SEGMENTS = {".", "%2e", "%2E"}
 163 | DOUBLE_DOT_PATH_SEGMENTS = {
 164 |     "..",
 165 |     ".%2e",
 166 |     ".%2E",
 167 |     "%2e.",
 168 |     "%2e%2e",
 169 |     "%2e%2E",
 170 |     "%2E.",
 171 |     "%2E%2e",
 172 |     "%2E%2E",
 173 | }
 174 | 
 175 | C0_PERCENT_ENCODE = set([chr(x) for x in range(0x20)])
 176 | FRAGMENT_PERCENT_ENCODE = set(' "<>`') | C0_PERCENT_ENCODE
 177 | PATH_PERCENT_ENCODE = set("#?{}") | FRAGMENT_PERCENT_ENCODE
 178 | USERINFO_PERCENT_ENCODE = set("/:;=@[\\]^|") | PATH_PERCENT_ENCODE
 179 | 
 180 | FORBIDDEN_HOST_CODE_POINTS = {
 181 |     "\x00",
 182 |     "\t",
 183 |     "\x0a",
 184 |     "\x0d",
 185 |     " ",
 186 |     "#",
 187 |     "%",
 188 |     "/",
 189 |     ":",
 190 |     "?",
 191 |     "@",
 192 |     "[",
 193 |     "\\",
 194 |     "]",
 195 | }
 196 | 
 197 | WINDOWS_DRIVE_LETTER = re.compile(r"^([a-zA-Z][:|])(?:[/\\?#]|$)")
 198 | NORMALIZED_WINDOWS_DRIVE_LETTER = re.compile(r"^[a-zA-Z][:]$")
 199 | 
 200 | AUTHORITY_DELIMITERS = {"", "/", "?", "#"}
 201 | PATH_DELIMITERS = {"", "/", "\\", "?", "#"}
 202 | 
 203 | HEX_CHAR_MAP = dict(
 204 |     [
 205 |         (b(_x + _y), b(chr(int(_x + _y, 16)), "charmap"))
 206 |         for _x in string.hexdigits
 207 |         for _y in string.hexdigits
 208 |     ]
 209 | )
 210 | 
 211 | IDNA_DOTS_REGEX = re.compile(u"[\u002e\u3002\uff0e\uff61]")
 212 | 
 213 | 
 214 | SPECIAL_SCHEMES = {
 215 |     "ftp": 21,
 216 |     "gopher": 70,
 217 |     "http": 80,
 218 |     "https": 443,
 219 |     "ws": 80,
 220 |     "wss": 443,
 221 |     "file": None,
 222 | }
 223 | 
 224 | 
 225 | PARSER_STATE_SCHEME_START = 1
 226 | PARSER_STATE_SCHEME = 2
 227 | PARSER_STATE_NO_SCHEME = 3
 228 | PARSER_STATE_SPECIAL_RELATIVE_OR_AUTHORITY = 4
 229 | PARSER_STATE_PATH_OR_AUTHORITY = 5
 230 | PARSER_STATE_RELATIVE = 6
 231 | PARSER_STATE_RELATIVE_SLASH = 7
 232 | PARSER_STATE_SPECIAL_AUTHORITY_SLASHES = 8
 233 | PARSER_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES = 9
 234 | PARSER_STATE_AUTHORITY = 10
 235 | PARSER_STATE_HOST = 11
 236 | PARSER_STATE_HOSTNAME = 12
 237 | PARSER_STATE_PORT = 13
 238 | PARSER_STATE_FILE = 14
 239 | PARSER_STATE_FILE_SLASH = 15
 240 | PARSER_STATE_FILE_HOST = 16
 241 | PARSER_STATE_PATH_START = 17
 242 | PARSER_STATE_PATH = 18
 243 | PARSER_STATE_CANNOT_BE_BASE_URL = 19
 244 | PARSER_STATE_QUERY = 20
 245 | PARSER_STATE_FRAGMENT = 21
 246 | 
 247 | 
 248 | class UrlParserError(ValueError):
 249 |     pass
 250 | 
 251 | 
 252 | class _UrlParserReturn(Exception):
 253 |     pass
 254 | 
 255 | 
 256 | class Url(object):
 257 |     def __init__(
 258 |         self,
 259 |         scheme=None,
 260 |         hostname=None,
 261 |         port=None,
 262 |         username=None,
 263 |         password=None,
 264 |         query=None,
 265 |         fragment=None,
 266 |         path=None,
 267 |         cannot_be_base_url=False,
 268 |         encoding="utf-8",
 269 |     ):
 270 |         if path is None:
 271 |             path = []
 272 | 
 273 |         self._scheme = scheme
 274 |         self._hostname = hostname
 275 |         self._port = port
 276 |         self._username = username
 277 |         self._password = password
 278 |         self._query = query
 279 |         self._fragment = fragment
 280 |         self._path = path
 281 | 
 282 |         self.encoding = encoding
 283 |         self.cannot_be_base_url = cannot_be_base_url
 284 | 
 285 |     @property
 286 |     def scheme(self):
 287 |         return self._scheme
 288 | 
 289 |     @property
 290 |     def hostname(self):
 291 |         return self._hostname
 292 | 
 293 |     @property
 294 |     def port(self):
 295 |         return self._port
 296 | 
 297 |     @property
 298 |     def username(self):
 299 |         return self._username
 300 | 
 301 |     @property
 302 |     def password(self):
 303 |         return self._password
 304 | 
 305 |     @property
 306 |     def query(self):
 307 |         return self._query
 308 | 
 309 |     @property
 310 |     def fragment(self):
 311 |         return self._fragment
 312 | 
 313 |     @property
 314 |     def host(self):
 315 |         if self._port is None:
 316 |             return self._hostname
 317 |         return "%s:%s" % (self._hostname, self._port)
 318 | 
 319 |     @property
 320 |     def path(self):
 321 |         if self.cannot_be_base_url:
 322 |             return self._path[0]
 323 |         else:
 324 |             return "".join(["/%s" % x for x in self._path])
 325 | 
 326 |     @scheme.setter
 327 |     def scheme(self, scheme):
 328 |         parser = UrlParser(self)
 329 |         parser.parse(
 330 |             scheme + ":",
 331 |             encoding=self.encoding,
 332 |             state_override=PARSER_STATE_SCHEME_START,
 333 |         )
 334 | 
 335 |     @username.setter
 336 |     def username(self, username):
 337 |         self._username = username
 338 | 
 339 |     @password.setter
 340 |     def password(self, password):
 341 |         self._password = password
 342 | 
 343 |     @hostname.setter
 344 |     def hostname(self, hostname):
 345 |         parser = UrlParser(self)
 346 |         parser.parse(
 347 |             hostname, encoding=self.encoding, state_override=PARSER_STATE_HOSTNAME
 348 |         )
 349 | 
 350 |     @port.setter
 351 |     def port(self, port):
 352 |         parser = UrlParser(self)
 353 |         parser.parse(str(port), state_override=PARSER_STATE_PORT)
 354 | 
 355 |     @path.setter
 356 |     def path(self, path):
 357 |         if self.cannot_be_base_url:
 358 |             return
 359 | 
 360 |         self._path = []
 361 |         parser = UrlParser(self)
 362 |         parser.parse(path, state_override=PARSER_STATE_PATH_START)
 363 | 
 364 |     @query.setter
 365 |     def query(self, query):
 366 |         if query is None:
 367 |             self._query = None
 368 |             return
 369 | 
 370 |         if query.startswith("?"):
 371 |             query = query[1:]
 372 | 
 373 |         self._query = ""
 374 |         parser = UrlParser(self)
 375 |         parser.parse(query, encoding=self.encoding, state_override=PARSER_STATE_QUERY)
 376 | 
 377 |     @fragment.setter
 378 |     def fragment(self, fragment):
 379 |         if fragment is None:
 380 |             self._fragment = None
 381 |             return
 382 | 
 383 |         if fragment.startswith("#"):
 384 |             fragment = fragment[1:]
 385 | 
 386 |         self._fragment = ""
 387 |         parser = UrlParser(self)
 388 |         parser.parse(
 389 |             fragment, encoding=self.encoding, state_override=PARSER_STATE_FRAGMENT
 390 |         )
 391 | 
 392 |     @property
 393 |     def includes_credentials(self):
 394 |         """Determines if a URL includes credentials"""
 395 |         return bool(self._username) or bool(self._password)
 396 | 
 397 |     @property
 398 |     def origin(self):
 399 |         if self.scheme == "blob":
 400 |             try:
 401 |                 url = parse_url(self._path[0], encoding=self.encoding)
 402 |             except UrlParserError:
 403 |                 return _OpaqueOrigin((None, None, None, None))
 404 |             return url.origin
 405 | 
 406 |         elif self.scheme in SPECIAL_SCHEMES and self.scheme != "file":
 407 |             return self.scheme, self.hostname, self.port, None
 408 | 
 409 |         else:
 410 |             return _OpaqueOrigin((None, None, None, None))
 411 | 
 412 |     @property
 413 |     def authority(self):
 414 |         output = []
 415 |         if self.includes_credentials:
 416 |             if self._username:
 417 |                 output.append(self._username)
 418 |             if self._password:
 419 |                 output.append(":" + self._password)
 420 |             output.append("@")
 421 | 
 422 |         output.append(self._hostname)
 423 |         if self._port is not None:
 424 |             output.append(":%s" % self._port)
 425 |         return "".join(output)
 426 | 
 427 |     @property
 428 |     def href(self):
 429 |         output = [self._scheme + ":"]
 430 |         if self._hostname is not None:
 431 |             output.append("//")
 432 | 
 433 |             if self.includes_credentials:
 434 |                 if self._username:
 435 |                     output.append(self._username)
 436 |                 if self._password:
 437 |                     output.append(":" + self._password)
 438 |                 output.append("@")
 439 | 
 440 |             output.append(self._hostname)
 441 |             if self._port is not None:
 442 |                 output.append(":%s" % self._port)
 443 | 
 444 |         if self._hostname is None and self._scheme == "file":
 445 |             output.append("//")
 446 | 
 447 |         if self.cannot_be_base_url:
 448 |             output.append(self._path[0])
 449 |         else:
 450 |             output.append(self.path)
 451 | 
 452 |         if self._query is not None:
 453 |             output.append("?" + self._query)
 454 | 
 455 |         if self._fragment is not None:
 456 |             output.append("#" + self._fragment)
 457 | 
 458 |         return "".join(output)
 459 | 
 460 |     def __repr__(self):
 461 |         return ("<%s scheme=%r hostname=%r port=%r path=%r query=%r fragment=%r>") % (
 462 |             self.__class__.__name__,
 463 |             self._scheme,
 464 |             self._hostname,
 465 |             self._port,
 466 |             self.path,
 467 |             self._query,
 468 |             self._fragment,
 469 |         )
 470 | 
 471 |     def __str__(self):
 472 |         return self.href
 473 | 
 474 | 
 475 | class UrlParser(object):
 476 |     def __init__(self, url=None):
 477 |         if url is None:
 478 |             url = Url()
 479 | 
 480 |         self.url = url
 481 |         self.base = None
 482 |         self.state_override = None
 483 |         self.validation_error = False
 484 | 
 485 |         self._state = None
 486 |         self._pointer = 0
 487 |         self._buffer = ""
 488 |         self._at_flag = False
 489 |         self._square_brace_flag = False
 490 |         self._password_token_seen_flag = False
 491 | 
 492 |         self._state_handlers = {
 493 |             PARSER_STATE_SCHEME_START: self._on_scheme_start,
 494 |             PARSER_STATE_SCHEME: self._on_scheme,
 495 |             PARSER_STATE_NO_SCHEME: self._on_no_scheme,
 496 |             PARSER_STATE_SPECIAL_RELATIVE_OR_AUTHORITY: (
 497 |                 self._on_special_relative_or_authority
 498 |             ),
 499 |             PARSER_STATE_PATH_OR_AUTHORITY: self._on_path_or_authority,
 500 |             PARSER_STATE_RELATIVE: self._on_relative,
 501 |             PARSER_STATE_RELATIVE_SLASH: self._on_relative_slash,
 502 |             PARSER_STATE_SPECIAL_AUTHORITY_SLASHES: self._on_special_authority_slashes,
 503 |             PARSER_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES: (
 504 |                 self._on_special_authority_ignore_slashes
 505 |             ),
 506 |             PARSER_STATE_AUTHORITY: self._on_authority,
 507 |             PARSER_STATE_HOST: self._on_host_or_hostname,
 508 |             PARSER_STATE_HOSTNAME: self._on_host_or_hostname,
 509 |             PARSER_STATE_PORT: self._on_port,
 510 |             PARSER_STATE_FILE: self._on_file,
 511 |             PARSER_STATE_FILE_SLASH: self._on_file_slash,
 512 |             PARSER_STATE_FILE_HOST: self._on_file_host,
 513 |             PARSER_STATE_PATH_START: self._on_path_start,
 514 |             PARSER_STATE_PATH: self._on_path,
 515 |             PARSER_STATE_CANNOT_BE_BASE_URL: self._on_cannot_be_base_url,
 516 |             PARSER_STATE_QUERY: self._on_query,
 517 |             PARSER_STATE_FRAGMENT: self._on_fragment,
 518 |         }
 519 | 
 520 |     def parse(self, data, base=None, encoding=None, state_override=None):
 521 |         self.reset()
 522 | 
 523 |         if isinstance(base, str):
 524 |             base_parser = UrlParser()
 525 |             base = base_parser.parse(base, encoding=encoding)
 526 |         self.base = base
 527 | 
 528 |         self.state_override = state_override
 529 |         self._state = state_override or PARSER_STATE_SCHEME_START
 530 | 
 531 |         if encoding is None:
 532 |             self.encoding = self.url.encoding or "utf-8"
 533 |         else:
 534 |             self.encoding = encoding
 535 | 
 536 |         self.url.encoding = self.encoding
 537 | 
 538 |         while data and _is_c0_control_or_space(data[0]):
 539 |             self.validation_error = True
 540 |             data = data[1:]
 541 | 
 542 |         while data and _is_c0_control_or_space(data[-1]):
 543 |             self.validation_error = True
 544 |             data = data[:-1]
 545 | 
 546 |         before_len = len(data)
 547 |         data = data.replace("\t", "").replace("\n", "").replace("\r", "")
 548 | 
 549 |         if len(data) < before_len:
 550 |             self.validation_error = True
 551 | 
 552 |         try:
 553 |             end_pointer = len(data)
 554 | 
 555 |             while self._pointer < end_pointer or (
 556 |                 end_pointer == 0 and self._pointer == 0
 557 |             ):
 558 |                 if end_pointer > 0:
 559 |                     self._call_state_handler(
 560 |                         self._state, data[self._pointer], data[self._pointer + 1 :]
 561 |                     )
 562 | 
 563 |                 while self._pointer == end_pointer:
 564 |                     self._call_state_handler(self._state, "", "")
 565 | 
 566 |         except _UrlParserReturn:
 567 |             pass
 568 | 
 569 |         return self.url
 570 | 
 571 |     def _call_state_handler(self, state, c, remaining):
 572 |         self._state_handlers[state](c, remaining)
 573 |         self._pointer += 1
 574 | 
 575 |     def parse_host(self, host, is_not_special=False):
 576 |         # IPv6 parsing
 577 |         if host.startswith("["):
 578 |             if not host.endswith("]"):
 579 |                 self.validation_error = True
 580 |                 raise UrlParserError()
 581 | 
 582 |             try:
 583 |                 return "[%s]" % ipaddress.IPv6Address(host[1:-1])
 584 |             except ipaddress.AddressValueError:
 585 |                 raise UrlParserError()
 586 | 
 587 |         # Opaque-host parsing
 588 |         if is_not_special:
 589 |             codepoints = set(host)
 590 |             if "%" in codepoints:
 591 |                 codepoints.remove("%")
 592 |             if codepoints.intersection(FORBIDDEN_HOST_CODE_POINTS):
 593 |                 self.validation_error = True
 594 |                 raise UrlParserError()
 595 | 
 596 |             return "".join([_percent_encode(c, C0_PERCENT_ENCODE) for c in host])
 597 | 
 598 |         try:
 599 |             domain = _string_percent_decode(host).decode("utf-8")
 600 |         except UnicodeDecodeError:
 601 |             raise UrlParserError()
 602 | 
 603 |         try:
 604 |             ascii_domain = _domain_to_ascii(domain).decode("utf-8").lower()
 605 |         except (idna.IDNAError, UnicodeError) as e:
 606 |             self.validation_error = True
 607 |             raise UrlParserError()
 608 | 
 609 |         # Contains forbidden host codepoint
 610 |         if set(ascii_domain).intersection(FORBIDDEN_HOST_CODE_POINTS):
 611 |             raise UrlParserError()
 612 | 
 613 |         # IPv4 parsing
 614 |         return self.parse_ipv4_host(ascii_domain)
 615 | 
 616 |     def parse_ipv4_host(self, ascii_domain):
 617 |         """Attempts to parse a domain as an IPv4 address with
 618 |         a lot of parsing rules for decimal, octal, hex, different
 619 |         numbers of separators, etc.
 620 |         """
 621 |         parts = ascii_domain.split(".")
 622 | 
 623 |         if parts[-1] == "":
 624 |             self.validation_error = True
 625 |             if len(parts) > 1:
 626 |                 parts.pop(-1)
 627 | 
 628 |         if len(parts) > 4:
 629 |             return ascii_domain
 630 | 
 631 |         numbers = []
 632 |         for part in parts:
 633 |             if part == "":
 634 |                 return ascii_domain
 635 | 
 636 |             n, flag = _parse_ipv4_number(part)
 637 |             if n is None:
 638 |                 return ascii_domain
 639 | 
 640 |             numbers.append(n)
 641 | 
 642 |         for i, number in enumerate(numbers):
 643 |             if number > 255:
 644 |                 self.validation_error = True
 645 |                 if i < len(numbers) - 1:
 646 |                     raise UrlParserError()
 647 | 
 648 |         if numbers[-1] >= 256 ** (5 - len(numbers)):
 649 |             self.validation_error = True
 650 |             raise UrlParserError()
 651 | 
 652 |         ipv4 = numbers.pop(-1)
 653 |         for i, number in enumerate(numbers):
 654 |             ipv4 += number * (256 ** (3 - i))
 655 | 
 656 |         output = []
 657 |         for _ in range(4):
 658 |             output.insert(0, str(ipv4 % 256))
 659 |             ipv4 //= 256
 660 | 
 661 |         return ".".join(output)
 662 | 
 663 |     def reset(self):
 664 |         self.validation_error = False
 665 |         self._pointer = 0
 666 |         self._buffer = ""
 667 |         self._at_flag = False
 668 |         self._square_brace_flag = False
 669 |         self._password_token_seen_flag = False
 670 | 
 671 |     def shorten_url_path(self):
 672 |         path_len = len(self.url._path)
 673 |         if path_len == 0:
 674 |             return
 675 |         if (
 676 |             self.url.scheme == "file"
 677 |             and path_len == 1
 678 |             and NORMALIZED_WINDOWS_DRIVE_LETTER.match(self.url._path[0]) is not None
 679 |         ):
 680 |             return
 681 |         self.url._path.pop(-1)
 682 | 
 683 |     def _on_scheme_start(self, c, _):
 684 |         """Handles the START SCHEME state."""
 685 |         if c in ASCII_ALPHA:
 686 |             self._buffer += c.lower()
 687 |             self._state = PARSER_STATE_SCHEME
 688 | 
 689 |         elif self.state_override is None:
 690 |             self._state = PARSER_STATE_NO_SCHEME
 691 |             self._pointer -= 1
 692 | 
 693 |         else:
 694 |             self.validation_error = True
 695 |             raise UrlParserError()
 696 | 
    def _on_scheme(self, c, remaining):
        """Handles the SCHEME state.

        Scheme characters are buffered lower-cased.  On ":" the buffer
        becomes the URL's scheme and the next state is chosen from that
        scheme, the base URL and the upcoming characters.  Any other
        character restarts parsing in the NO SCHEME state, or fails when a
        state override is active.

        :param c: Current character ("" at end of input).
        :param remaining: The input after ``c``.
        :raises: UrlParserError on an invalid character under a state override.
        """
        if c in SCHEME_CHARS:
            self._buffer += c.lower()

        elif c == ":":
            if self.state_override is not None:
                # Under a state override, certain scheme changes are
                # ignored: the URL is left untouched and parsing stops.

                # Switching between a special and a non-special scheme.
                if (self._buffer in SPECIAL_SCHEMES) != (
                    self.url.scheme in SPECIAL_SCHEMES
                ):
                    raise _UrlParserReturn()

                # Switching to "file" while credentials or a port are set.
                elif (
                    self.url.includes_credentials or self.url.port is not None
                ) and self._buffer == "file":
                    raise _UrlParserReturn()

                # Switching away from "file" with a missing or empty host.
                elif self.url.scheme == "file" and (
                    self.url.hostname is None or self.url.hostname == ""
                ):
                    raise _UrlParserReturn()

            self.url._scheme = self._buffer

            if self.state_override is not None:
                # Drop a port that equals the new scheme's default port,
                # then stop: only the scheme was being set.
                if (
                    self.url.scheme in SPECIAL_SCHEMES
                    and SPECIAL_SCHEMES[self.url.scheme] == self.url.port
                ):
                    self.url._port = None
                raise _UrlParserReturn()

            self._buffer = ""

            if self.url.scheme == "file":
                # "file:" not followed by "//" is only a validation error.
                if not remaining.startswith("//"):
                    self.validation_error = True
                self._state = PARSER_STATE_FILE

            elif (
                self.url.scheme in SPECIAL_SCHEMES
                and self.base is not None
                and self.base.scheme == self.url.scheme
            ):
                self._state = PARSER_STATE_SPECIAL_RELATIVE_OR_AUTHORITY

            elif self.url.scheme in SPECIAL_SCHEMES:
                self._state = PARSER_STATE_SPECIAL_AUTHORITY_SLASHES

            elif remaining.startswith("/"):
                self._state = PARSER_STATE_PATH_OR_AUTHORITY
                # Skip the "/" that was just inspected in ``remaining``.
                self._pointer += 1

            else:
                # Non-special scheme with no "/" after the colon: the rest
                # is kept as one opaque path string.
                self.url.cannot_be_base_url = True
                self.url._path.append("")
                self._state = PARSER_STATE_CANNOT_BE_BASE_URL

        elif self.state_override is None:
            # Not a scheme after all: start over from the first character.
            # The pointer is set to -1 because the dispatcher adds one
            # after this handler returns.
            self._buffer = ""
            self._state = PARSER_STATE_NO_SCHEME
            self._pointer = -1

        else:
            self.validation_error = True
            raise UrlParserError()
 763 | 
 764 |     def _on_no_scheme(self, c, _):
 765 |         """Handles the NO SCHEME state"""
 766 |         if self.base is None or (self.base.cannot_be_base_url and c != "#"):
 767 |             self.validation_error = True
 768 |             raise UrlParserError()
 769 | 
 770 |         elif self.base.cannot_be_base_url and c == "#":
 771 |             self.url._scheme = self.base.scheme
 772 |             self.url._path = self.base._path[:]
 773 |             self.url._query = self.base.query
 774 |             self.url._fragment = ""
 775 |             self.url.cannot_be_base_url = True
 776 |             self._state = PARSER_STATE_FRAGMENT
 777 | 
 778 |         elif self.base.scheme != "file":
 779 |             self._state = PARSER_STATE_RELATIVE
 780 |             self._pointer -= 1
 781 | 
 782 |         else:
 783 |             self._state = PARSER_STATE_FILE
 784 |             self._pointer -= 1
 785 | 
 786 |     def _on_special_relative_or_authority(self, c, remaining):
 787 |         """Handles the SPECIAL RELATIVE OR AUTHORITY state"""
 788 |         if c == "/" and remaining.startswith("/"):
 789 |             self._state = PARSER_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES
 790 |             self._pointer += 1
 791 | 
 792 |         else:
 793 |             self.validation_error = True
 794 |             self._state = PARSER_STATE_RELATIVE
 795 |             self._pointer -= 1
 796 | 
 797 |     def _on_path_or_authority(self, c, _):
 798 |         """Handles the PATH OR AUTHORITY state"""
 799 |         if c == "/":
 800 |             self._state = PARSER_STATE_AUTHORITY
 801 |         else:
 802 |             self._state = PARSER_STATE_PATH
 803 |             self._pointer -= 1
 804 | 
 805 |     def _on_relative(self, c, _):
 806 |         """Handles the RELATIVE state"""
 807 |         self.url._scheme = self.base.scheme
 808 | 
 809 |         if c == "":
 810 |             self.url._username = self.base.username
 811 |             self.url._password = self.base.password
 812 |             self.url._hostname = self.base.hostname
 813 |             self.url._port = self.base.port
 814 |             self.url._path = self.base._path[:]
 815 |             self.url._query = self.base.query
 816 | 
 817 |         elif c == "/":
 818 |             self._state = PARSER_STATE_RELATIVE_SLASH
 819 | 
 820 |         elif c == "?":
 821 |             self.url._username = self.base.username
 822 |             self.url._password = self.base.password
 823 |             self.url._hostname = self.base.hostname
 824 |             self.url._port = self.base.port
 825 |             self.url._path = self.base._path[:]
 826 |             self.url._query = ""
 827 | 
 828 |             self._state = PARSER_STATE_QUERY
 829 | 
 830 |         elif c == "#":
 831 |             self.url._username = self.base.username
 832 |             self.url._password = self.base.password
 833 |             self.url._hostname = self.base.hostname
 834 |             self.url._port = self.base.port
 835 |             self.url._path = self.base._path[:]
 836 |             self.url._query = self.base.query
 837 |             self.url._fragment = ""
 838 | 
 839 |             self._state = PARSER_STATE_FRAGMENT
 840 | 
 841 |         else:
 842 |             if self.url.scheme in SPECIAL_SCHEMES and c == "\\":
 843 |                 self.validation_error = True
 844 |                 self._state = PARSER_STATE_RELATIVE_SLASH
 845 | 
 846 |             else:
 847 |                 self.url._username = self.base.username
 848 |                 self.url._password = self.base.password
 849 |                 self.url._hostname = self.base.hostname
 850 |                 self.url._port = self.base.port
 851 |                 self.url._path = self.base._path[:]
 852 | 
 853 |                 if len(self.url._path):
 854 |                     self.url._path.pop(-1)
 855 | 
 856 |                 self._state = PARSER_STATE_PATH
 857 |                 self._pointer -= 1
 858 | 
 859 |     def _on_relative_slash(self, c, _):
 860 |         if self.url.scheme in SPECIAL_SCHEMES and (c == "/" or c == "\\"):
 861 |             if c == "\\":
 862 |                 self.validation_error = True
 863 |             self._state = PARSER_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES
 864 | 
 865 |         elif c == "/":
 866 |             self._state = PARSER_STATE_AUTHORITY
 867 | 
 868 |         else:
 869 |             self.url._username = self.base.username
 870 |             self.url._password = self.base.password
 871 |             self.url._hostname = self.base.hostname
 872 |             self.url._port = self.base.port
 873 | 
 874 |             self._pointer -= 1
 875 |             self._state = PARSER_STATE_PATH
 876 | 
 877 |     def _on_special_authority_slashes(self, c, remaining):
 878 |         """Handles the SPECIAL AUTHORITY SLASHES state"""
 879 |         if c == "/" and remaining.startswith("/"):
 880 |             self._state = PARSER_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES
 881 |             self._pointer += 1
 882 | 
 883 |         else:
 884 |             self.validation_error = True
 885 |             self._state = PARSER_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES
 886 |             self._pointer -= 1
 887 | 
 888 |     def _on_special_authority_ignore_slashes(self, c, _):
 889 |         """Handles the SPECIAL AUTHORITY IGNORE SLASHES state"""
 890 |         if c != "/" and c != "\\":
 891 |             self._state = PARSER_STATE_AUTHORITY
 892 |             self._pointer -= 1
 893 | 
 894 |         else:
 895 |             self.validation_error = True
 896 | 
 897 |     def _on_authority(self, c, _):
 898 |         """Handles the AUTHORITY state"""
 899 |         if c == "@":
 900 |             self.validation_error = True
 901 | 
 902 |             if self._at_flag:
 903 |                 self._buffer = "%40" + self._buffer
 904 | 
 905 |             self._at_flag = True
 906 | 
 907 |             for char in self._buffer:
 908 |                 if not self._password_token_seen_flag and char == ":":
 909 |                     self._password_token_seen_flag = True
 910 |                     continue
 911 | 
 912 |                 if self._password_token_seen_flag:
 913 |                     if self.url.password is None:
 914 |                         self.url._password = ""
 915 |                     self.url._password += _percent_encode(char, USERINFO_PERCENT_ENCODE)
 916 |                 else:
 917 |                     if self.url.username is None:
 918 |                         self.url._username = ""
 919 |                     self.url._username += _percent_encode(char, USERINFO_PERCENT_ENCODE)
 920 | 
 921 |             self._buffer = ""
 922 | 
 923 |         elif c in AUTHORITY_DELIMITERS or (
 924 |             self.url.scheme in SPECIAL_SCHEMES and c == "\\"
 925 |         ):
 926 |             if self._at_flag and self._buffer == "":
 927 |                 self.validation_error = True
 928 |                 raise UrlParserError()
 929 | 
 930 |             self._pointer -= len(self._buffer) + 1
 931 |             self._buffer = ""
 932 |             self._state = PARSER_STATE_HOST
 933 | 
 934 |         else:
 935 |             self._buffer += c
 936 | 
    def _on_host_or_hostname(self, c, _):
        """Handles the HOST and HOSTNAME states

        Buffers host characters until a ``:`` outside square brackets (port
        follows) or an authority delimiter (path follows), then parses the
        buffer with ``parse_host`` and stores the result as the hostname.

        Raises ``UrlParserError`` for an empty host where one is required and
        ``_UrlParserReturn`` to stop early under a state override.
        """
        if self.state_override is not None and self.url.scheme == "file":
            # Under a state override, "file" URLs are handled by the
            # FILE HOST state instead.
            self._pointer -= 1
            self._state = PARSER_STATE_FILE_HOST

        elif c == ":" and not self._square_brace_flag:
            # A ":" outside "[...]" separates host and port.
            if self._buffer == "":
                self.validation_error = True
                raise UrlParserError()

            self.url._hostname = self.parse_host(
                self._buffer, self.url.scheme not in SPECIAL_SCHEMES
            )
            self._buffer = ""
            self._state = PARSER_STATE_PORT

            # A HOSTNAME override stops here, before any port is read.
            if self.state_override == PARSER_STATE_HOSTNAME:
                raise _UrlParserReturn()

        elif c in AUTHORITY_DELIMITERS or (
            c == "\\" and self.url.scheme in SPECIAL_SCHEMES
        ):
            self._pointer -= 1

            # Special schemes require a non-empty host.
            if self.url.scheme in SPECIAL_SCHEMES and self._buffer == "":
                self.validation_error = True
                raise UrlParserError()

            # Under a state override, an empty host is refused (URL left
            # unchanged) while credentials or a port are present.
            elif (
                self.state_override is not None
                and self._buffer == ""
                and (self.url.includes_credentials or self.url.port is not None)
            ):
                self.validation_error = True
                raise _UrlParserReturn()

            self.url._hostname = self.parse_host(
                self._buffer, self.url.scheme not in SPECIAL_SCHEMES
            )

            self._buffer = ""
            self._state = PARSER_STATE_PATH_START

            if self.state_override is not None:
                raise _UrlParserReturn()

        else:
            # Track "[...]" so that a ":" inside the brackets is buffered
            # rather than treated as the port separator.
            if c == "[":
                self._square_brace_flag = True
            elif c == "]":
                self._square_brace_flag = False
            self._buffer += c
 990 | 
 991 |     def _on_port(self, c, _):
 992 |         """Handles the PORT state"""
 993 |         if c in ASCII_DIGITS:
 994 |             self._buffer += c
 995 | 
 996 |         elif (
 997 |             c in PATH_DELIMITERS
 998 |             or (c == "\\" and self.url.scheme in SPECIAL_SCHEMES)
 999 |             or self.state_override is not None
1000 |         ):
1001 |             if self._buffer != "":
1002 |                 try:
1003 |                     port = int(self._buffer)
1004 |                 except ValueError as e:
1005 |                     six.raise_from(UrlParserError(), e)
1006 | 
1007 |                 if port > 2 ** 16 - 1:
1008 |                     self.validation_error = True
1009 |                     raise UrlParserError()
1010 | 
1011 |                 self.url._port = (
1012 |                     None if port == SPECIAL_SCHEMES.get(self.url.scheme, None) else port
1013 |                 )
1014 |                 self._buffer = ""
1015 | 
1016 |             if self.state_override:
1017 |                 raise _UrlParserReturn()
1018 | 
1019 |             self._state = PARSER_STATE_PATH_START
1020 |             self._pointer -= 1
1021 | 
1022 |         else:
1023 |             self.validation_error = True
1024 |             raise UrlParserError()
1025 | 
1026 |     def _on_file(self, c, remaining):
1027 |         """Handles the FILE state"""
1028 |         self.url._scheme = "file"
1029 | 
1030 |         if c == "/" or c == "\\":
1031 |             if c == "\\":
1032 |                 self.validation_error = True
1033 |             self._state = PARSER_STATE_FILE_SLASH
1034 | 
1035 |         elif self.base is not None and self.base.scheme == "file":
1036 |             if c == "":
1037 |                 self.url._hostname = self.base.hostname
1038 |                 self.url._path = self.base._path[:]
1039 |                 self.url._query = self.base.query
1040 | 
1041 |             elif c == "?":
1042 |                 self.url._hostname = self.base.hostname
1043 |                 self.url._path = self.base._path[:]
1044 |                 self.url._query = ""
1045 | 
1046 |                 self._state = PARSER_STATE_QUERY
1047 | 
1048 |             elif c == "#":
1049 |                 self.url._hostname = self.base.hostname
1050 |                 self.url._path = self.base._path[:]
1051 |                 self.url._query = self.base.query
1052 |                 self.url._fragment = ""
1053 | 
1054 |                 self._state = PARSER_STATE_FRAGMENT
1055 | 
1056 |             else:
1057 |                 match = WINDOWS_DRIVE_LETTER.search(c + remaining)
1058 |                 if match is None:
1059 |                     self.url._hostname = self.base.hostname
1060 |                     self.url._path = self.base._path[:]
1061 |                     self.shorten_url_path()
1062 | 
1063 |                 else:
1064 |                     self.validation_error = True
1065 | 
1066 |                 self._state = PARSER_STATE_PATH
1067 |                 self._pointer -= 1
1068 | 
1069 |         else:
1070 |             self._state = PARSER_STATE_PATH
1071 |             self._pointer -= 1
1072 | 
1073 |     def _on_file_slash(self, c, remaining):
1074 |         """Handles the FILE SLASH state"""
1075 |         if c == "/" or c == "\\":
1076 |             if c == "\\":
1077 |                 self.validation_error = True
1078 |             self._state = PARSER_STATE_FILE_HOST
1079 | 
1080 |         else:
1081 |             if (
1082 |                 self.base is not None
1083 |                 and self.base.scheme == "file"
1084 |                 and WINDOWS_DRIVE_LETTER.search(c + remaining) is None
1085 |             ):
1086 |                 if (
1087 |                     len(self.base._path) > 0
1088 |                     and NORMALIZED_WINDOWS_DRIVE_LETTER.match(self.base._path[0])
1089 |                     is not None
1090 |                 ):
1091 |                     self.url._path.append(self.base._path[0])
1092 | 
1093 |                 else:
1094 |                     self.url._hostname = self.base.hostname
1095 | 
1096 |             self._state = PARSER_STATE_PATH
1097 |             self._pointer -= 1
1098 | 
1099 |     def _on_file_host(self, c, _):
1100 |         """Handles the FILE HOST state"""
1101 |         if c in PATH_DELIMITERS:
1102 |             self._pointer -= 1
1103 | 
1104 |             if (
1105 |                 self.state_override is None
1106 |                 and WINDOWS_DRIVE_LETTER.match(self._buffer) is not None
1107 |             ):
1108 |                 self.validation_error = True
1109 |                 self._state = PARSER_STATE_PATH
1110 | 
1111 |             elif self._buffer == "":
1112 |                 self.url._hostname = ""
1113 | 
1114 |                 if self.state_override is not None:
1115 |                     raise _UrlParserReturn()
1116 | 
1117 |                 self._state = PARSER_STATE_PATH_START
1118 | 
1119 |             else:
1120 |                 self.url._hostname = self.parse_host(
1121 |                     self._buffer, self.url.scheme not in SPECIAL_SCHEMES
1122 |                 )
1123 | 
1124 |                 if self.url.hostname == "localhost":
1125 |                     self.url._hostname = ""
1126 | 
1127 |                 if self.state_override is not None:
1128 |                     raise _UrlParserReturn()
1129 | 
1130 |                 self._buffer = ""
1131 |                 self._state = PARSER_STATE_PATH_START
1132 | 
1133 |         else:
1134 |             self._buffer += c
1135 | 
1136 |     def _on_path_start(self, c, _):
1137 |         """Handles the PATH START state"""
1138 |         if self.url.scheme in SPECIAL_SCHEMES:
1139 |             if c == "\\":
1140 |                 self.validation_error = True
1141 | 
1142 |             self._state = PARSER_STATE_PATH
1143 | 
1144 |             if c != "/" and c != "\\":
1145 |                 self._pointer -= 1
1146 | 
1147 |         elif self.state_override is None and c == "?":
1148 |             self.url._query = ""
1149 |             self._state = PARSER_STATE_QUERY
1150 | 
1151 |         elif self.state_override is None and c == "#":
1152 |             self.url._fragment = ""
1153 |             self._state = PARSER_STATE_FRAGMENT
1154 | 
1155 |         elif c != "":
1156 |             self._state = PARSER_STATE_PATH
1157 | 
1158 |             if c != "/":
1159 |                 self._pointer -= 1
1160 | 
    def _on_path(self, c, remaining):
        """Handles the PATH state

        Buffers one percent-encoded path segment at a time. When a segment
        ends (end of input, ``/``, a backslash on a special scheme, or
        ``?``/``#`` without a state override) the buffer is resolved:
        double-dot segments shorten the path, single-dot segments are
        dropped, and anything else is appended to the path.
        """
        # A backslash only acts as a separator for special schemes.
        cond = c == "\\" and self.url.scheme in SPECIAL_SCHEMES
        if (
            c == ""
            or c == "/"
            or cond
            or (self.state_override is None and (c == "?" or c == "#"))
        ):
            if cond:
                self.validation_error = True

            if self._buffer in DOUBLE_DOT_PATH_SEGMENTS:
                self.shorten_url_path()

                # Without a following separator, add an empty last segment.
                if not (c == "/" or cond):
                    self.url._path.append("")

            elif self._buffer in SINGLE_DOT_PATH_SEGMENTS and not (c == "/" or cond):
                self.url._path.append("")

            elif self._buffer not in SINGLE_DOT_PATH_SEGMENTS:
                if (
                    self.url.scheme == "file"
                    and len(self.url._path) == 0
                    and WINDOWS_DRIVE_LETTER.match(self._buffer) is not None
                ):
                    # A drive letter as first segment of a file URL clears
                    # any non-empty host.
                    if self.url.hostname != "" and self.url.hostname is not None:
                        self.validation_error = True
                        self.url._hostname = ""

                    # Force the drive letter's second character to ":".
                    self._buffer = self._buffer[0] + ":" + self._buffer[2:]

                self.url._path.append(self._buffer)

            self._buffer = ""

            # For file URLs, strip leading empty segments while more than
            # one segment remains.
            if self.url.scheme == "file" and c in PATH_DELIMITERS:
                while len(self.url._path) > 1 and self.url._path[0] == "":
                    self.validation_error = True
                    self.url._path.pop(0)

            if c == "?":
                self.url._query = ""
                self._state = PARSER_STATE_QUERY

            elif c == "#":
                self.url._fragment = ""
                self._state = PARSER_STATE_FRAGMENT

        else:
            # Invalid code points and malformed "%" escapes are still
            # buffered; they only set the validation error flag.
            if c != "%" and not _is_url_codepoint(c):
                self.validation_error = True
            if c == "%" and TWO_ASCII_HEX.search(remaining) is None:
                self.validation_error = True
            self._buffer += _percent_encode(c, PATH_PERCENT_ENCODE)
1217 | 
1218 |     def _on_cannot_be_base_url(self, c, remaining):
1219 |         """Handles the CANNOT BE BASE URL state"""
1220 |         if c == "?":
1221 |             self.url._query = ""
1222 |             self._state = PARSER_STATE_QUERY
1223 | 
1224 |         elif c == "#":
1225 |             self.url._fragment = ""
1226 |             self._state = PARSER_STATE_FRAGMENT
1227 | 
1228 |         else:
1229 |             if c != "" and c != "%" and not _is_url_codepoint(c):
1230 |                 self.validation_error = True
1231 | 
1232 |             if c == "%" and TWO_ASCII_HEX.search(remaining) is None:
1233 |                 self.validation_error = True
1234 | 
1235 |             if c != "":
1236 |                 self.url._path[0] += _percent_encode(c, C0_PERCENT_ENCODE)
1237 | 
1238 |     def _on_query(self, c, remaining):
1239 |         """Handles the QUERY state"""
1240 |         if self.encoding != "utf-8" and (
1241 |             self.url.scheme == "ws"
1242 |             or self.url.scheme == "wss"
1243 |             or self.url.scheme not in SPECIAL_SCHEMES
1244 |         ):
1245 |             self.encoding = "utf-8"
1246 | 
1247 |         if self.state_override is None and c == "#":
1248 |             self.url._fragment = ""
1249 |             self._state = PARSER_STATE_FRAGMENT
1250 | 
1251 |         elif c != "":
1252 |             if c != "%" and not _is_url_codepoint(c):
1253 |                 self.validation_error = True
1254 | 
1255 |             if c == "%" and TWO_ASCII_HEX.search(remaining) is None:
1256 |                 self.validation_error = True
1257 | 
1258 |             bytes_ = c.encode(self.encoding)
1259 | 
1260 |             if bytes_.startswith(b"&#") and bytes_.endswith(b";"):
1261 |                 self.url._query += (b"%26%23" + bytes_[2:-1] + b"%3B").decode("ascii")
1262 | 
1263 |             else:
1264 |                 is_special = self.url.scheme in SPECIAL_SCHEMES
1265 |                 for byte in _iterbytes(bytes_):
1266 |                     if (
1267 |                         byte < 0x21
1268 |                         or byte > 0x7e
1269 |                         or byte == 0x22
1270 |                         or byte == 0x23
1271 |                         or byte == 0x3c
1272 |                         or byte == 0x3e
1273 |                         or (is_special and byte == 0x27)
1274 |                     ):
1275 |                         self.url._query += "%" + _hex(byte)
1276 |                     else:
1277 |                         self.url._query += chr(byte)
1278 | 
1279 |     def _on_fragment(self, c, remaining):
1280 |         if c == "":
1281 |             pass
1282 | 
1283 |         elif c == "\x00":
1284 |             self.validation_error = True
1285 | 
1286 |         else:
1287 |             if c != "%" and _is_url_codepoint(c):
1288 |                 self.validation_error = True
1289 | 
1290 |             if c == "%" and TWO_ASCII_HEX.search(remaining) is None:
1291 |                 self.validation_error = True
1292 | 
1293 |             self.url._fragment += _percent_encode(c, FRAGMENT_PERCENT_ENCODE)
1294 | 
1295 | 
def _string_percent_decode(data):
    """Encode ``data`` as UTF-8 and percent-decode the resulting bytes."""
    return _percent_decode(data.encode("utf-8"))
1299 | 
1300 | 
def _percent_encode(c, encode_set):
    """Percent-encode ``c`` when it is in ``encode_set`` or above U+007E.

    Encoded characters are returned as one ``%XX`` triplet per UTF-8 byte;
    all other characters are returned unchanged.
    """
    must_encode = c in encode_set or ord(c) > 0x7e
    if not must_encode:
        return c

    raw = c if isinstance(c, bytes) else c.encode("utf-8")
    return "".join("%" + _hex(octet) for octet in _iterbytes(raw))
1307 | 
1308 | 
def _is_url_codepoint(c):
    """Return True when ``c`` is in URL_CODEPOINTS or is an allowed
    code point in the range U+00A0 to U+10FFFD.
    """
    if c in URL_CODEPOINTS:
        return True

    code = ord(c)
    if not 0xa0 <= code <= 0x10fffd:
        return False

    # Excluded ranges: U+D800-U+DFFF and U+FDD0-U+FDEF.
    if 0xd800 <= code <= 0xdfff or 0xfdd0 <= code <= 0xfdef:
        return False

    return code not in NONCHARACTERS
1319 | 
1320 | 
1321 | def _is_c0_control_or_space(c):
1322 |     return c == " " or 0 <= ord(c) <= 0x1f
1323 | 
1324 | 
def _percent_decode(bytes_):
    """Decode ``%XX`` escapes in a byte string.

    A ``%`` that is not followed by two hexadecimal digits is copied to the
    output unchanged.
    """

    def is_hex(x):
        code = _byte2int(x)
        return 0x30 <= code <= 0x39 or 0x41 <= code <= 0x46 or 0x61 <= code <= 0x66

    decoded = []
    total = len(bytes_)
    to_skip = 0

    for index, byte in enumerate(_iterbytes(bytes_)):
        # Skip the two hex digits consumed by the previous escape.
        if to_skip:
            to_skip -= 1
            continue

        is_escape = (
            byte == 0x25
            and index + 2 < total
            and is_hex(bytes_[index + 1])
            and is_hex(bytes_[index + 2])
        )
        if is_escape:
            byte = int(bytes_[index + 1 : index + 3].decode("ascii").lower(), 16)
            to_skip = 2

        decoded.append(_int2byte(byte))

    return b"".join(decoded)
1351 | 
1352 | 
def _domain_to_ascii(domain, strict=False):
    """Encode ``domain`` with IDNA 2008, falling back to IDNA 2003
    label by label when that fails.

    In non-strict mode a label that IDNA 2003 cannot encode is kept as its
    UTF-8 bytes; in strict mode the error propagates.
    """
    try:
        return idna.encode(
            domain, strict=strict, std3_rules=strict, uts46=True, transitional=False
        )
    except idna.IDNAError:
        if isinstance(domain, (bytes, bytearray)):
            domain = domain.decode("ascii")
        remapped = idna.uts46_remap(domain, std3_rules=strict, transitional=False)

        labels = remapped.split(".") if strict else IDNA_DOTS_REGEX.split(remapped)

        if not labels or labels == [""]:
            raise idna.IDNAError("Empty domain")

        # A trailing dot shows up as an empty last label.
        has_trailing_dot = labels[-1] == ""
        if has_trailing_dot:
            labels.pop()

        encoded_labels = []
        for label in labels:
            try:
                encoded = idna2003.ToASCII(label)
            except UnicodeError:
                if strict:
                    raise
                encoded_labels.append(label.encode("utf-8"))
                continue
            if not encoded:
                raise idna.IDNAError("Empty label")
            encoded_labels.append(encoded)

        if has_trailing_dot:
            encoded_labels.append(b"")

        ascii_domain = b".".join(encoded_labels)
        if not idna.valid_string_length(ascii_domain, has_trailing_dot):
            raise idna.IDNAError("Domain too long")
        return ascii_domain
1396 | 
1397 | 
1398 | def _parse_ipv4_number(input_):
1399 |     """Parses a single IPv4 number"""
1400 | 
1401 |     r = 10
1402 | 
1403 |     try:
1404 |         if len(input_) >= 2:
1405 |             if input_[:2].lower() == "0x":
1406 |                 r = 16
1407 |                 input_ = input_[2:]
1408 | 
1409 |             elif input_.startswith("0"):
1410 |                 r = 8
1411 |                 input_ = input_[1:]
1412 | 
1413 |         if input_ == "":
1414 |             return 0, False
1415 | 
1416 |         return int(input_, r), r != 10
1417 |     except ValueError:
1418 |         return None, False
1419 | 
1420 | 
class ParseResultMixin(object):
    """Exposes components of the wrapped ``url`` object as attributes."""

    def geturl(self):
        """Return the ``href`` of the wrapped URL."""
        return self.url.href

    @property
    def username(self):
        """The URL's username; an empty string instead of a falsy username
        when a password is set.
        """
        wrapped = self.url
        if not wrapped.password:
            return wrapped.username
        return wrapped.username or ""

    @property
    def password(self):
        """The URL's password."""
        return self.url.password

    @property
    def hostname(self):
        """The URL's hostname."""
        return self.url.hostname

    @property
    def port(self):
        """The URL's port."""
        return self.url.port
1442 | 
1443 | 
class ParseResult(
    collections.namedtuple(
        "ParseResult", ["scheme", "netloc", "path", "params", "query", "fragment"]
    ),
    ParseResultMixin,
):
    """Six-field parse result that also carries the parsed ``url`` object.

    Falsy components are stored as empty strings.
    """

    slots = ()

    def __new__(cls, scheme, netloc, path, params, query, fragment, url):
        components = (scheme, netloc, path, params, query, fragment)
        instance = super(ParseResult, cls).__new__(
            cls, *[component or "" for component in components]
        )
        # Not a tuple field: kept as a plain attribute for the mixin.
        instance.url = url
        return instance
1464 | 
1465 | 
def urlparse(urlstring, scheme="", allow_fragments=True, encoding="utf-8"):
    """Compatible with urllib.parse.urlparse().
    See documentation of urlparse() for more information.

    A non-empty ``scheme`` is assigned to the parsed URL. With
    ``allow_fragments=False`` the fragment is folded into the path.
    """
    parsed = UrlParser(Url()).parse(urlstring, encoding=encoding)

    if scheme != "":
        parsed.scheme = scheme
    if not allow_fragments:
        _add_url_fragment_to_path(parsed)

    return ParseResult(
        scheme=parsed.scheme,
        netloc=parsed.authority,
        path=parsed.path,
        params="",
        query=parsed.query,
        fragment=parsed.fragment,
        url=parsed,
    )
1479 | 
1480 | 
def urljoin(base, url, allow_fragments=True, encoding="utf-8"):
    """Compatible with urllib.parse.urljoin()
    See documentation of urljoin() for more information.

    Parses ``url`` against ``base`` and returns the resulting ``href``. With
    ``allow_fragments=False`` the fragment is folded into the path first.
    """
    joined = UrlParser(Url()).parse(url, base=base, encoding=encoding)

    if not allow_fragments:
        _add_url_fragment_to_path(joined)

    return joined.href
1490 | 
1491 | 
1492 | def _add_url_fragment_to_path(url):
1493 |     if len(url._path):
1494 |         url._path[-1] += "#" + url.fragment
1495 |     else:
1496 |         url._path.append("#" + url.fragment)
1497 |     url.fragment = None
1498 | 
1499 | 
def _iterbytes(bytes_):
    """Return an iterable of integer byte values for ``bytes_``.

    On Python 3 the input is returned as is; on Python 2 each character
    is converted with ``ord``.
    """
    if not six.PY3:
        return [ord(char) for char in bytes_]
    return bytes_
1505 | 
1506 | 
1507 | def _byte2int(byte):
1508 |     if not isinstance(byte, int):
1509 |         return ord(byte)
1510 |     return byte
1511 | 
1512 | 
def _int2byte(i):
    """Return the single-byte string for the integer ``i``."""
    if not six.PY3:
        return chr(i)
    return i.to_bytes(length=1, byteorder="little")
1517 | 
1518 | 
def _hex(x):
    """Return ``x`` as upper-case hexadecimal, zero-padded to two digits."""
    digits = hex(_byte2int(x))[2:]
    return digits.upper().zfill(2)
1521 | 


--------------------------------------------------------------------------------