├── .github
│   └── workflows
│       └── pythonpublish.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── setup.py
└── url_parser
    ├── __init__.py
    ├── public_suffix_list.dat
    ├── public_suffix_list.py
    └── tests
        └── test_url_parser.py

/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
name: Upload Python Package

on:
  push:
    branches:
      - master

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel twine
      - name: Run tests
        run: |
          python -m unittest url_parser.tests.test_url_parser
      - name: Build and publish
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          python setup.py sdist bdist_wheel
          twine upload dist/*

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
build_and_deploy.sh

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
virt/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

/.idea
/data
!/data/.gitkeep
virtenv/Scripts/tldextract.exe
virtenv/Scripts/pythonw.exe
virtenv/Scripts/python.exe
virtenv/Scripts/pip3.exe
virtenv/Scripts/pip3.7.exe
virtenv/Scripts/pip3.7-script.py
virtenv/Scripts/pip3-script.py
virtenv/Scripts/pip.exe
virtenv/Scripts/pip-script.py
virtenv/Scripts/flask.exe
virtenv/Scripts/easy_install.exe
virtenv/Scripts/easy_install-script.py
virtenv/Scripts/easy_install-3.7.exe
virtenv/Scripts/easy_install-3.7-script.py
virtenv/Scripts/dotenv.exe
virtenv/Scripts/deactivate.bat
virtenv/Scripts/chardetect.exe
virtenv/Scripts/activate.bat
virtenv/Scripts/activate
virtenv/Scripts/Activate.ps1
virtenv/pyvenv.cfg

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
### Changelog:

##### v3.0.3
* Fixed catastrophic backtracking and a small top domain bug.

##### v3.0.2
* Fixed a bug where public_suffix_list.dat was not loaded correctly.

##### v3.0.0
* A lot of cleanup.
* Added the list from https://publicsuffix.org/ to find top domains more accurately.
* Breaking change: the signature of `parse_url(item)` changed to `parse_url(url: str) -> dict`.
* Breaking change: the signature of `get_url(item)` changed to `def get_url(url: str) -> UrlObject:` (see the migration sketch at the end of this file).

##### v2.1.1
* Small fix for the readme and GitHub Actions.

##### v2.1.0
* Added a function to get the URL back as an object, and a function to return the basic parts of a URL.

##### v2.0.0
* Added new regex and support for foreign languages.

##### v1.0.0
* Small bugfixes and optimisation for the stable release.

##### v0.9.9
* Bugfixes in the readme file.

##### v0.9.8
* Added support for args.

##### v0.9.7
* Changed setup.py and the readme for PyPI optimisation.

##### v0.9.6
* Added support for secondary top domains (like co.uk, parliament.uk, gov.au).
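
To illustrate the v3.0.0 breaking change, here is a minimal sketch of the two call styles (the URL is made up):

```python
from url_parser import parse_url, get_url

# Old-style call, kept for backwards compatibility (returns a dict and now
# emits a DeprecationWarning):
parts = parse_url('https://www.example.com/home/index.html')
print(parts['domain'])  # example

# New-style call (returns a UrlObject namedtuple):
url = get_url('https://www.example.com/home/index.html')
print(url.top_domain)  # com
```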

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Adapted AS

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include url_parser/public_suffix_list.dat

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Python URL Parser
![PyPI - Format](https://img.shields.io/pypi/format/url-parser)
![PyPI - Status](https://img.shields.io/pypi/status/url-parser)
![Downloads](https://pepy.tech/badge/url-parser)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/url-parser)

A nice package to help you parse all types of URLs in vanilla Python and return the parsed URL in groups.

To not break the API, `parse_url` (which returns a dict) still works, and we added `get_url` to return the URL parts as an object instead.

In version 2.1 we also added `get_base_url`, a small yet neat function that returns the main URL from a string.

### Installation
```
pip install url-parser
```

### Usage

```python
from url_parser import parse_url, get_url, get_base_url


url = parse_url('https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0')  # Returns the URL sections as a dict
url_object = get_url('https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0')  # Does the same, but returns an object
basic_url = get_base_url('https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0')  # Returns just the main URL

print(url['domain'])  # Outputs -> prospecta
print(url_object.domain)  # Outputs -> prospecta
print(basic_url)  # Outputs -> https://open.prospecta.app
```

### Keywords `get_url` and `parse_url`

When using the `parse_url` and `get_url` functions, you get a dict (`parse_url`) or an object (`get_url`) back with the different parts of the URL.

The different parts can be accessed by keywords:<br>
For `parse_url` use: `result['top_domain']`<br>
For `get_url` use: `result.top_domain`


Here is a list of all the available keywords:

| Keyword | Description | Value when not present in URL |
| ------ | ------ | ------ |
| protocol | The protocol, e.g. **https** or **ftp** | None |
| www | Returns **www** if www is used in the URL | None |
| sub_domain | The sub domain, e.g. **my.subdomain** in **my.subdomain.example.com**. Note that the sub domain also includes www. | None |
| domain | The domain, e.g. **example** in **example.com** | Always present |
| top_domain | The top domain, e.g. **com** in **example.com** | Always present |
| dir | The directory, e.g. **/my/directory/** in **example.com/my/directory/** | None |
| file | The file, e.g. **my_file.js** in **example.com/home/my_file.js** | None |
| path | The full path, e.g. **/home/my_file.js** in **example.com/home/my_file.js** | None |
| fragment | The URL fragment, e.g. **my_link** in **example.com#my_link** | None |
| query | The URL query, e.g. **my_parameter=1&foo=bar** in **example.com?my_parameter=1&foo=bar** | None |
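
As a fuller example, here is a minimal sketch that touches most of the keywords above (the address is made up):

```python
from url_parser import get_url

result = get_url('https://staging.example.co.uk/assets/app.js?v=2&debug')

print(result.protocol)    # https
print(result.sub_domain)  # staging
print(result.domain)      # example
print(result.top_domain)  # co.uk  (multi-part suffixes come from publicsuffix.org)
print(result.dir)         # /assets/
print(result.file)        # app.js
print(result.query)       # {'v': '2', 'debug': None}
```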

### Testing

Use the following command to run the tests.

```bash
python -m unittest url_parser.tests.test_url_parser
```

### Changelog:

See CHANGELOG.md

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

with open("README.md", "r") as fh:
    long_description = fh.read()

setup(name='url_parser',
      version='3.0.3',
      description='Parse url and get all the different parts out of it',
      url='https://github.com/AdaptedAS/url_parser',
      author='Odd Jøren Røland',
      long_description=long_description,
      long_description_content_type="text/markdown",
      author_email='odd@adapted.no',
      license='MIT',
      packages=['url_parser'],
      platforms=['any'],
      include_package_data=True,
      classifiers=[
          'License :: OSI Approved :: MIT License',
          'Natural Language :: English',
          'Intended Audience :: Developers',
          'Topic :: Software Development :: Libraries',
          'Topic :: Software Development :: Libraries :: Python Modules',
          'Development Status :: 5 - Production/Stable',
          'Programming Language :: Python',
          'Programming Language :: Python :: 2',
          'Programming Language :: Python :: 2.7',
          'Programming Language :: Python :: 3',
          'Programming Language :: Python :: 3.4',
          'Programming Language :: Python :: 3.5',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
      ],
      zip_safe=False
      )

--------------------------------------------------------------------------------
/url_parser/__init__.py:
--------------------------------------------------------------------------------
import re
import warnings
from collections import namedtuple

from url_parser.public_suffix_list import PublicSuffixList

UrlObject = namedtuple(
    'UrlObject', [
        'protocol',
        'www',
        'sub_domain',
        'domain',
        'top_domain',
        'path',
        'dir',
        'file',
        'fragment',
        'query'
    ])


def _split_query_group(query_groups: list) -> dict:
    result = dict()

    for query_group in query_groups:
        # Split on the first '=' only, so values that themselves contain '='
        # are kept intact.
        query = query_group.split('=', 1)

        if len(query) == 1:
            result[query[0]] = None
            continue

        result[query[0]] = query[1]

    return result


def _parse_url_with_top_domain(url, top_domain):
    # The named groups mirror the keys of the returned dict.
    regex = r"^(?:(?P<protocol>[\w\d]+)(?:\:\/\/))?" \
            r"(?P<sub_domain>" \
            r"(?P<www>(?:www)?)(?:\.?)" \
            r"(?:(?:[\w\d-]+|\.)*?)?" \
            r")(?:\.?)" \
            r"(?P<domain>[^./]+(?=\.))\." \
            r"(?P<top_domain>" + re.escape(top_domain) + r"(?![^/?#]))" \
            r"(?P<path>" \
            r"(?P<dir>\/(?:[^/\r\n]+(?:/))+)?" \
            r"(?:\/?)(?P<file>[^?#\r\n]+)?" \
            r")?" \
            r"(?:\#(?P<fragment>[^#?\r\n]*))?" \
            r"(?:\?(?P<query>.*(?=$)))*$"

    dict_data = {
        'protocol': None,
        'www': None,
        'sub_domain': None,
        'domain': None,
        'top_domain': None,
        'path': None,
        'dir': None,
        'file': None,
        'fragment': None,
        'query': None,
    }

    match = re.search(regex, url)

    dict_data['protocol'] = match.group('protocol') if match.group('protocol') else None
    dict_data['www'] = match.group('www') if match.group('www') else None
    dict_data['sub_domain'] = match.group('sub_domain') if match.group('sub_domain') else None
    dict_data['domain'] = match.group('domain')
    dict_data['top_domain'] = top_domain
    dict_data['path'] = match.group('path') if match.group('path') else None
    dict_data['dir'] = match.group('dir') if match.group('dir') else None
    dict_data['file'] = match.group('file') if match.group('file') else None
    dict_data['fragment'] = match.group('fragment') if match.group('fragment') else None

    query = match.group('query') if match.group('query') else None

    if query is not None:
        query_groups = query.split('&')
        query = _split_query_group(query_groups)
        dict_data['query'] = query

    return dict_data


def _parse_url_with_public_suffix(url):
    public_suffix = PublicSuffixList.get_list()
    public_suffix.sort()

    domain_regex = r"(?:^|\/)(?P<domain>[^:/#?]+)(?:[/#?]|$)"
    match = re.search(domain_regex, url)
    domain = match.group('domain')
    domain_parts = domain.split('.')

    top_domain = None

    # Walk the host from the left, so the first hit is the longest public
    # suffix (e.g. 'co.uk' is found before 'uk').
    for i in range(len(domain_parts)):
        tail_gram = domain_parts[i:len(domain_parts)]
        tail_gram = '.'.join(tail_gram)

        if tail_gram in public_suffix:
            top_domain = tail_gram
            break

    data = _parse_url_with_top_domain(url, top_domain)

    return data


def get_base_url(url: str) -> str:
    url = get_url(url)
    protocol = str(url.protocol) + '://' if url.protocol is not None else 'http://'
    www = 'www.' if url.www is not None else ''
    # For plain 'www.' hosts the sub_domain group still carries the trailing
    # dot, so skip it here to avoid doubling the 'www.' part.
    sub_domain = str(url.sub_domain) + '.' if url.sub_domain is not None and url.sub_domain != 'www.' else ''
    return protocol + www + sub_domain + url.domain + '.' + url.top_domain
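
# A quick sketch of the fallback behaviour above (made-up address):
#   get_base_url('open.prospecta.app/some/path') -> 'http://open.prospecta.app'
# because the protocol defaults to 'http://' when the URL has none.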

def get_url(url: str) -> UrlObject:
    data = _parse_url_with_public_suffix(url)

    object_data = UrlObject(
        protocol=data['protocol'],
        www=data['www'],
        sub_domain=data['sub_domain'],
        domain=data['domain'],
        top_domain=data['top_domain'],
        path=data['path'],
        dir=data['dir'],
        file=data['file'],
        fragment=data['fragment'],
        query=data['query'],
    )

    return object_data


def parse_url(url: str) -> dict:
    warnings.warn(
        "parse_url is deprecated, use get_url instead",
        DeprecationWarning
    )

    data = get_url(url)
    return data._asdict()

--------------------------------------------------------------------------------
/url_parser/public_suffix_list.py:
--------------------------------------------------------------------------------
import os


class PublicSuffixList:
    _public_suffix_list = None

    @staticmethod
    def get_list():
        if PublicSuffixList._public_suffix_list is not None:
            return PublicSuffixList._public_suffix_list

        public_suffix_list = []

        dir_path = os.path.dirname(os.path.realpath(__file__))
        dat_file = f'{dir_path}/public_suffix_list.dat'

        with open(dat_file, encoding='utf-8') as file:
            data = file.readlines()

        for line in data:
            # Skip comment lines in the upstream .dat file.
            if line[0:2] == '//':
                continue

            line = line.replace('\n', '')
            line = line.replace('\r', '')

            if line == '':
                continue

            public_suffix_list.append(line)

        PublicSuffixList._public_suffix_list = public_suffix_list

        return PublicSuffixList._public_suffix_list
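
# Usage sketch: the parsed list is cached on the class, so only the first call
# pays the file-read cost. Both of these are real entries in the upstream list:
#   suffixes = PublicSuffixList.get_list()
#   'com' in suffixes    # True
#   'co.uk' in suffixes  # True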

--------------------------------------------------------------------------------
/url_parser/tests/test_url_parser.py:
--------------------------------------------------------------------------------
from unittest import TestCase

import url_parser


class TestUrlParser(TestCase):
    def test_parses_url_without_www(self):
        url = 'example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['domain'], 'example')
        self.assertEqual(result['top_domain'], 'com')

    def test_parses_url_with_www(self):
        url = 'www.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['domain'], 'example')
        self.assertEqual(result['top_domain'], 'com')

    def test_returns_null_if_protocol_is_missing(self):
        url = 'www.example.com'
        result = url_parser.parse_url(url)
        self.assertIsNone(result['protocol'])

    def test_returns_null_if_www_is_missing(self):
        url = 'http://example.com'
        result = url_parser.parse_url(url)
        self.assertIsNone(result['www'])

    def test_removes_extra_dot_from_www(self):
        url = 'http://www..example.com'
        result = url_parser.parse_url(url)
        has_dot = '.' in result['www']
        self.assertFalse(has_dot)

    def test_returns_null_if_sub_domain_is_missing(self):
        url = 'http://example.com'
        result = url_parser.parse_url(url)
        self.assertIsNone(result['sub_domain'])

    def test_finds_sub_domain(self):
        url = 'mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['sub_domain'], 'mysubdomain')

    def test_finds_multiple_subdomains(self):
        url = 'my.subdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['sub_domain'], 'my.subdomain')

    def test_finds_protocol(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['protocol'], 'http')

        url = 'https://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['protocol'], 'https')

        url = 'ftp://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['protocol'], 'ftp')

    def test_finds_dir(self):
        url = 'http://mysubdomain.example.com/folder/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/folder/')

        url = 'http://mysubdomain.example.com/multiple/folders/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/multiple/folders/')

        url = 'http://mysubdomain.example.com/multiple/folders/with_a_file.js'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/multiple/folders/')

    def test_does_not_mistake_file_for_dir(self):
        url = 'http://mysubdomain.example.com/folder/test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/folder/')
        self.assertNotEqual(result['dir'], '/folder/test')

    def test_finds_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['domain'], 'example')

    def test_finds_top_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'com')

        url = 'http://mysubdomain.example.co.uk'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'co.uk')

    def test_finds_file(self):
        url = 'http://mysubdomain.example.com/cool.jpg'
        result = url_parser.parse_url(url)
        self.assertEqual(result['file'], 'cool.jpg')

        url = 'http://mysubdomain.example.com/directory/here/sample.mp4'
        result = url_parser.parse_url(url)
        self.assertEqual(result['file'], 'sample.mp4')

    def test_finds_path(self):
        url = 'http://mysubdomain.example.com/path'
        result = url_parser.parse_url(url)
        self.assertEqual(result['path'], '/path')

        url = 'http://mysubdomain.example.com/this/is/the/path'
        result = url_parser.parse_url(url)
        self.assertEqual(result['path'], '/this/is/the/path')

        url = 'http://mysubdomain.example.com/path/with/file.js'
        result = url_parser.parse_url(url)
        self.assertEqual(result['path'], '/path/with/file.js')

    def test_finds_fragment(self):
        url = 'http://mysubdomain.example.com#my_fragment'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com/path/#my_fragment'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com/path/file.js#my_fragment'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com#my_fragment?myargs=test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com/test/path.js#my_fragment?myargs=test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

    def test_finds_query(self):
        url = 'http://mysubdomain.example.com?myquery=test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')

        url = 'http://mysubdomain.example.com?myquery=test&one=two&test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')
        self.assertIsNone(result['query']['test'])

        url = 'http://mysubdomain.example.com/file.js?myquery=test&one=two'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')

        url = 'http://mysubdomain.example.com/path/and/file.js?myquery=test&one=two'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')

        url = 'http://mysubdomain.example.com/path/?myquery=test&one=two'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')

    def test_catastrophic_backtracking(self):
        url = 'http://very_long-and-complixated_subdomaind-for-page.mywebpageishere.com/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'com')
        self.assertEqual(result['domain'], 'mywebpageishere')
        self.assertEqual(result['sub_domain'], 'very_long-and-complixated_subdomaind-for-page')

    def test_domain_that_starts_with_same_letters_as_top_domain(self):
        url = 'http://domains-stars-with-same-top-domain.nogo.no/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'no')
        self.assertEqual(result['domain'], 'nogo')
        self.assertEqual(result['sub_domain'], 'domains-stars-with-same-top-domain')

    def test_domain_that_includes_a_top_domain_in_sub_domain(self):
        url = 'http://test.com.hello.nogo.no/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'no')
        self.assertEqual(result['domain'], 'nogo')
        self.assertEqual(result['sub_domain'], 'test.com.hello')

    def test_domain_that_includes_a_top_domain_in_query(self):
        url = 'http://test.com.hello.nogo.no?my_query_domain=www.test.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'no')
        self.assertEqual(result['domain'], 'nogo')
        self.assertEqual(result['sub_domain'], 'test.com.hello')
        self.assertEqual(result['query']['my_query_domain'], 'www.test.com')


class TestGetUrl(TestCase):
    def test_parses_url_without_www(self):
        url = 'example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.domain, 'example')
        self.assertEqual(result.top_domain, 'com')

    def test_parses_url_with_www(self):
        url = 'www.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.domain, 'example')
        self.assertEqual(result.top_domain, 'com')

    def test_returns_null_if_protocol_is_missing(self):
        url = 'www.example.com'
        result = url_parser.get_url(url)
        self.assertIsNone(result.protocol)

    def test_returns_null_if_www_is_missing(self):
        url = 'http://example.com'
        result = url_parser.get_url(url)
        self.assertIsNone(result.www)

    def test_removes_extra_dot_from_www(self):
        url = 'http://www..example.com'
        result = url_parser.get_url(url)
        has_dot = '.' in result.www
        self.assertFalse(has_dot)

    def test_returns_null_if_sub_domain_is_missing(self):
        url = 'http://example.com'
        result = url_parser.get_url(url)
        self.assertIsNone(result.sub_domain)

    def test_finds_sub_domain(self):
        url = 'mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.sub_domain, 'mysubdomain')

    def test_finds_multiple_subdomains(self):
        url = 'my.subdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.sub_domain, 'my.subdomain')

    def test_finds_protocol(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.protocol, 'http')

        url = 'https://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.protocol, 'https')

        url = 'ftp://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.protocol, 'ftp')

    def test_finds_dir(self):
        url = 'http://mysubdomain.example.com/folder/'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/folder/')

        url = 'http://mysubdomain.example.com/multiple/folders/'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/multiple/folders/')

        url = 'http://mysubdomain.example.com/multiple/folders/with_a_file.js'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/multiple/folders/')

    def test_does_not_mistake_file_for_dir(self):
        url = 'http://mysubdomain.example.com/folder/test'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/folder/')
        self.assertNotEqual(result.dir, '/folder/test')

    def test_finds_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.domain, 'example')

    def test_finds_top_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.top_domain, 'com')

        url = 'http://mysubdomain.example.co.uk'
        result = url_parser.get_url(url)
        self.assertEqual(result.top_domain, 'co.uk')

    def test_finds_file(self):
        url = 'http://mysubdomain.example.com/cool.jpg'
        result = url_parser.get_url(url)
        self.assertEqual(result.file, 'cool.jpg')

        url = 'http://mysubdomain.example.com/directory/here/sample.mp4'
        result = url_parser.get_url(url)
        self.assertEqual(result.file, 'sample.mp4')

    def test_finds_path(self):
        url = 'http://mysubdomain.example.com/path'
        result = url_parser.get_url(url)
        self.assertEqual(result.path, '/path')

        url = 'http://mysubdomain.example.com/this/is/the/path'
        result = url_parser.get_url(url)
        self.assertEqual(result.path, '/this/is/the/path')

        url = 'http://mysubdomain.example.com/path/with/file.js'
        result = url_parser.get_url(url)
        self.assertEqual(result.path, '/path/with/file.js')

    def test_finds_fragment(self):
        url = 'http://mysubdomain.example.com#my_fragment'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com/path/#my_fragment'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com/path/file.js#my_fragment'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com#my_fragment?myargs=test'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com/test/path.js#my_fragment?myargs=test'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

    def test_finds_query(self):
        url = 'http://mysubdomain.example.com?myquery=test'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')

        url = 'http://mysubdomain.example.com?myquery=test&one=two&test'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')
        self.assertIsNone(result.query['test'])

        url = 'http://mysubdomain.example.com/file.js?myquery=test&one=two'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')

        url = 'http://mysubdomain.example.com/path/and/file.js?myquery=test&one=two'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')

        url = 'http://mysubdomain.example.com/path/?myquery=test&one=two'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')


class TestGetBasicUrl(TestCase):
    def test_basic_url(self):
        url = 'http://example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://example.com')

        url = 'https://example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://example.com')

        url = 'https://www.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://www.example.com')

        url = 'example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://example.com')

    def test_sub_domain_basic_url(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://mysubdomain.example.com')

        url = 'https://mysubdomain.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://mysubdomain.example.com')

        url = 'ftp://mysubdomain.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'ftp://mysubdomain.example.com')

    def test_path_url(self):
        url = 'https://mysubdomain.example.com/path/to/wisdom'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://mysubdomain.example.com')

    def test_long_url(self):
        url = 'https://mysubdomain.example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://mysubdomain.example.com')

        url = 'https://www.example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://www.example.com')

        url = 'https://example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://example.com')

        url = 'example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://example.com')

--------------------------------------------------------------------------------