├── .github
│   └── workflows
│       └── pythonpublish.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── setup.py
└── url_parser
    ├── __init__.py
    ├── public_suffix_list.dat
    ├── public_suffix_list.py
    └── tests
        └── test_url_parser.py

/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
name: Upload Python Package

on:
  push:
    branches:
      - master

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel twine
      - name: Run tests
        run: |
          python -m unittest url_parser.tests.test_url_parser
      - name: Build and publish
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          python setup.py sdist bdist_wheel
          twine upload dist/*

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
build_and_deploy.sh

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
virt/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

/.idea
/data
!/data/.gitkeep
virtenv/Scripts/tldextract.exe
virtenv/Scripts/pythonw.exe
virtenv/Scripts/python.exe
virtenv/Scripts/pip3.exe
virtenv/Scripts/pip3.7.exe
virtenv/Scripts/pip3.7-script.py
virtenv/Scripts/pip3-script.py
virtenv/Scripts/pip.exe
virtenv/Scripts/pip-script.py
virtenv/Scripts/flask.exe
virtenv/Scripts/easy_install.exe
virtenv/Scripts/easy_install-script.py
virtenv/Scripts/easy_install-3.7.exe
virtenv/Scripts/easy_install-3.7-script.py
virtenv/Scripts/dotenv.exe
virtenv/Scripts/deactivate.bat
virtenv/Scripts/chardetect.exe
virtenv/Scripts/activate.bat
virtenv/Scripts/activate
virtenv/Scripts/Activate.ps1
virtenv/pyvenv.cfg

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
### Changelog:

##### v3.0.3
* Fixed catastrophic backtracking and a small top domain bug.

##### v3.0.2
* Fixed a bug where public_suffix_list.dat was not loaded correctly.

##### v3.0.0
* A lot of cleanup.
* Added the list from https://publicsuffix.org/ to find top domains more accurately.
* Breaking change: the signature of `parse_url(item)` changed to `parse_url(url: str) -> dict`.
* Breaking change: the signature of `get_url(item)` changed to `def get_url(url: str) -> UrlObject:` (see the migration sketch at the end of this file).

##### v2.1.1
* Small fix for the readme and GitHub Actions.

##### v2.1.0
* Added a function to get the URL back as an object, and a function to return the basic parts of a URL.

##### v2.0.0
* Added new regex and support for foreign languages.

##### v1.0.0
* Small bugfixes and optimisation for the stable release.

##### v0.9.9
* Bugfixes in the readme file.

##### v0.9.8
* Added support for args.

##### v0.9.7
* Changed setup.py and the readme for PyPI optimisation.

##### v0.9.6
* Added support for secondary top domains (like co.uk, parliament.uk, gov.au).
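
To illustrate the v3.0.0 breaking change, here is a minimal sketch of the two call styles (the URL is made up):

```python
from url_parser import parse_url, get_url

# Old-style call, kept for backwards compatibility (returns a dict and now
# emits a DeprecationWarning):
parts = parse_url('https://www.example.com/home/index.html')
print(parts['domain'])  # example

# New-style call (returns a UrlObject namedtuple):
url = get_url('https://www.example.com/home/index.html')
print(url.top_domain)  # com
```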

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Adapted AS

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include url_parser/public_suffix_list.dat

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Python URL Parser
![PyPI - Format](https://img.shields.io/pypi/format/url-parser)
![PyPI - Status](https://img.shields.io/pypi/status/url-parser)
![Downloads](https://pepy.tech/badge/url-parser)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/url-parser)

A nice package to help you parse all types of URLs in vanilla Python and return the parsed URL in groups.

To not break the API, `parse_url` (which returns a dict) still works, and we added `get_url` to return the URL parts as an object instead.

In version 2.1 we also added `get_base_url`, a small yet neat function that returns the main URL from a string.

### Installation
```
pip install url-parser
```

### Usage

```python
from url_parser import parse_url, get_url, get_base_url


url = parse_url('https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0')  # Returns the URL sections as a dict
url_object = get_url('https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0')  # Does the same, but returns an object
basic_url = get_base_url('https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0')  # Returns just the main URL

print(url['domain'])  # Outputs -> prospecta
print(url_object.domain)  # Outputs -> prospecta
print(basic_url)  # Outputs -> https://open.prospecta.app
```

### Keywords `get_url` and `parse_url`

When using the `parse_url` and `get_url` functions, you get a dict (`parse_url`) or an object (`get_url`) back with the different parts of the URL.

The different parts can be accessed by keywords:<br>
For `parse_url` use: `result['top_domain']`<br>
For `get_url` use: `result.top_domain`


Here is a list of all the available keywords:

| Keyword | Description | Value when not present in URL |
| ------ | ------ | ------ |
| protocol | The protocol, e.g. **https** or **ftp** | None |
| www | Returns **www** if www is used in the URL | None |
| sub_domain | The sub domain, e.g. **my.subdomain** in **my.subdomain.example.com**. Note that the sub domain also includes www. | None |
| domain | The domain, e.g. **example** in **example.com** | Always present |
| top_domain | The top domain, e.g. **com** in **example.com** | Always present |
| dir | The directory, e.g. **/my/directory/** in **example.com/my/directory/** | None |
| file | The file, e.g. **my_file.js** in **example.com/home/my_file.js** | None |
| path | The full path, e.g. **/home/my_file.js** in **example.com/home/my_file.js** | None |
| fragment | The URL fragment, e.g. **my_link** in **example.com#my_link** | None |
| query | The URL query, e.g. **my_parameter=1&foo=bar** in **example.com?my_parameter=1&foo=bar** | None |
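
As a fuller example, here is a minimal sketch that touches most of the keywords above (the address is made up):

```python
from url_parser import get_url

result = get_url('https://staging.example.co.uk/assets/app.js?v=2&debug')

print(result.protocol)    # https
print(result.sub_domain)  # staging
print(result.domain)      # example
print(result.top_domain)  # co.uk  (multi-part suffixes come from publicsuffix.org)
print(result.dir)         # /assets/
print(result.file)        # app.js
print(result.query)       # {'v': '2', 'debug': None}
```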

### Testing

Use the following command to run the tests.

```bash
python -m unittest url_parser.tests.test_url_parser
```

### Changelog:

See CHANGELOG.md

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

with open("README.md", "r") as fh:
    long_description = fh.read()

setup(name='url_parser',
      version='3.0.3',
      description='Parse url and get all the different parts out of it',
      url='https://github.com/AdaptedAS/url_parser',
      author='Odd Jøren Røland',
      long_description=long_description,
      long_description_content_type="text/markdown",
      author_email='odd@adapted.no',
      license='MIT',
      packages=['url_parser'],
      platforms=['any'],
      include_package_data=True,
      classifiers=[
          'License :: OSI Approved :: MIT License',
          'Natural Language :: English',
          'Intended Audience :: Developers',
          'Topic :: Software Development :: Libraries',
          'Topic :: Software Development :: Libraries :: Python Modules',
          'Development Status :: 5 - Production/Stable',
          'Programming Language :: Python',
          'Programming Language :: Python :: 2',
          'Programming Language :: Python :: 2.7',
          'Programming Language :: Python :: 3',
          'Programming Language :: Python :: 3.4',
          'Programming Language :: Python :: 3.5',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
      ],
      zip_safe=False
      )

--------------------------------------------------------------------------------
/url_parser/__init__.py:
--------------------------------------------------------------------------------
import re
import warnings
from collections import namedtuple

from url_parser.public_suffix_list import PublicSuffixList

UrlObject = namedtuple(
    'UrlObject', [
        'protocol',
        'www',
        'sub_domain',
        'domain',
        'top_domain',
        'path',
        'dir',
        'file',
        'fragment',
        'query'
    ])


def _split_query_group(query_groups: list) -> dict:
    result = dict()

    for query_group in query_groups:
        # Split on the first '=' only, so values that themselves contain '='
        # are kept intact.
        query = query_group.split('=', 1)

        if len(query) == 1:
            result[query[0]] = None
            continue

        result[query[0]] = query[1]

    return result


def _parse_url_with_top_domain(url, top_domain):
    # The named groups mirror the keys of the returned dict.
    regex = r"^(?:(?P<protocol>[\w\d]+)(?:\:\/\/))?" \
            r"(?P<sub_domain>" \
            r"(?P<www>(?:www)?)(?:\.?)" \
            r"(?:(?:[\w\d-]+|\.)*?)?" \
            r")(?:\.?)" \
            r"(?P<domain>[^./]+(?=\.))\." \
            r"(?P<top_domain>" + re.escape(top_domain) + r"(?![^/?#]))" \
            r"(?P<path>" \
            r"(?P<dir>\/(?:[^/\r\n]+(?:/))+)?" \
            r"(?:\/?)(?P<file>[^?#\r\n]+)?" \
            r")?" \
            r"(?:\#(?P<fragment>[^#?\r\n]*))?" \
            r"(?:\?(?P<query>.*(?=$)))*$"

    dict_data = {
        'protocol': None,
        'www': None,
        'sub_domain': None,
        'domain': None,
        'top_domain': None,
        'path': None,
        'dir': None,
        'file': None,
        'fragment': None,
        'query': None,
    }

    match = re.search(regex, url)

    dict_data['protocol'] = match.group('protocol') if match.group('protocol') else None
    dict_data['www'] = match.group('www') if match.group('www') else None
    dict_data['sub_domain'] = match.group('sub_domain') if match.group('sub_domain') else None
    dict_data['domain'] = match.group('domain')
    dict_data['top_domain'] = top_domain
    dict_data['path'] = match.group('path') if match.group('path') else None
    dict_data['dir'] = match.group('dir') if match.group('dir') else None
    dict_data['file'] = match.group('file') if match.group('file') else None
    dict_data['fragment'] = match.group('fragment') if match.group('fragment') else None

    query = match.group('query') if match.group('query') else None

    if query is not None:
        query_groups = query.split('&')
        query = _split_query_group(query_groups)
        dict_data['query'] = query

    return dict_data


def _parse_url_with_public_suffix(url):
    public_suffix = PublicSuffixList.get_list()
    public_suffix.sort()

    domain_regex = r"(?:^|\/)(?P<domain>[^:/#?]+)(?:[/#?]|$)"
    match = re.search(domain_regex, url)
    domain = match.group('domain')
    domain_parts = domain.split('.')

    top_domain = None

    # Walk the host from the left, so the first hit is the longest public
    # suffix (e.g. 'co.uk' is found before 'uk').
    for i in range(len(domain_parts)):
        tail_gram = domain_parts[i:len(domain_parts)]
        tail_gram = '.'.join(tail_gram)

        if tail_gram in public_suffix:
            top_domain = tail_gram
            break

    data = _parse_url_with_top_domain(url, top_domain)

    return data


def get_base_url(url: str) -> str:
    url = get_url(url)
    protocol = str(url.protocol) + '://' if url.protocol is not None else 'http://'
    www = 'www.' if url.www is not None else ''
    # For plain 'www.' hosts the sub_domain group still carries the trailing
    # dot, so skip it here to avoid doubling the 'www.' part.
    sub_domain = str(url.sub_domain) + '.' if url.sub_domain is not None and url.sub_domain != 'www.' else ''
    return protocol + www + sub_domain + url.domain + '.' + url.top_domain
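
# A quick sketch of the fallback behaviour above (made-up address):
#   get_base_url('open.prospecta.app/some/path') -> 'http://open.prospecta.app'
# because the protocol defaults to 'http://' when the URL has none.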

def get_url(url: str) -> UrlObject:
    data = _parse_url_with_public_suffix(url)

    object_data = UrlObject(
        protocol=data['protocol'],
        www=data['www'],
        sub_domain=data['sub_domain'],
        domain=data['domain'],
        top_domain=data['top_domain'],
        path=data['path'],
        dir=data['dir'],
        file=data['file'],
        fragment=data['fragment'],
        query=data['query'],
    )

    return object_data


def parse_url(url: str) -> dict:
    warnings.warn(
        "parse_url is deprecated, use get_url instead",
        DeprecationWarning
    )

    data = get_url(url)
    return data._asdict()

--------------------------------------------------------------------------------
/url_parser/public_suffix_list.py:
--------------------------------------------------------------------------------
import os


class PublicSuffixList:
    _public_suffix_list = None

    @staticmethod
    def get_list():
        if PublicSuffixList._public_suffix_list is not None:
            return PublicSuffixList._public_suffix_list

        public_suffix_list = []

        dir_path = os.path.dirname(os.path.realpath(__file__))
        dat_file = f'{dir_path}/public_suffix_list.dat'

        with open(dat_file, encoding='utf-8') as file:
            data = file.readlines()

        for line in data:
            # Skip comment lines in the upstream .dat file.
            if line[0:2] == '//':
                continue

            line = line.replace('\n', '')
            line = line.replace('\r', '')

            if line == '':
                continue

            public_suffix_list.append(line)

        PublicSuffixList._public_suffix_list = public_suffix_list

        return PublicSuffixList._public_suffix_list
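
# Usage sketch: the parsed list is cached on the class, so only the first call
# pays the file-read cost. Both of these are real entries in the upstream list:
#   suffixes = PublicSuffixList.get_list()
#   'com' in suffixes    # True
#   'co.uk' in suffixes  # True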

--------------------------------------------------------------------------------
/url_parser/tests/test_url_parser.py:
--------------------------------------------------------------------------------
from unittest import TestCase

import url_parser


class TestUrlParser(TestCase):
    def test_parses_url_without_www(self):
        url = 'example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['domain'], 'example')
        self.assertEqual(result['top_domain'], 'com')

    def test_parses_url_with_www(self):
        url = 'www.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['domain'], 'example')
        self.assertEqual(result['top_domain'], 'com')

    def test_returns_null_if_protocol_is_missing(self):
        url = 'www.example.com'
        result = url_parser.parse_url(url)
        self.assertIsNone(result['protocol'])

    def test_returns_null_if_www_is_missing(self):
        url = 'http://example.com'
        result = url_parser.parse_url(url)
        self.assertIsNone(result['www'])

    def test_removes_extra_dot_from_www(self):
        url = 'http://www..example.com'
        result = url_parser.parse_url(url)
        has_dot = '.' in result['www']
        self.assertFalse(has_dot)

    def test_returns_null_if_sub_domain_is_missing(self):
        url = 'http://example.com'
        result = url_parser.parse_url(url)
        self.assertIsNone(result['sub_domain'])

    def test_finds_sub_domain(self):
        url = 'mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['sub_domain'], 'mysubdomain')

    def test_finds_multiple_subdomains(self):
        url = 'my.subdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['sub_domain'], 'my.subdomain')

    def test_finds_protocol(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['protocol'], 'http')

        url = 'https://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['protocol'], 'https')

        url = 'ftp://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['protocol'], 'ftp')

    def test_finds_dir(self):
        url = 'http://mysubdomain.example.com/folder/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/folder/')

        url = 'http://mysubdomain.example.com/multiple/folders/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/multiple/folders/')

        url = 'http://mysubdomain.example.com/multiple/folders/with_a_file.js'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/multiple/folders/')

    def test_does_not_mistake_file_for_dir(self):
        url = 'http://mysubdomain.example.com/folder/test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/folder/')
        self.assertNotEqual(result['dir'], '/folder/test')

    def test_finds_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['domain'], 'example')

    def test_finds_top_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'com')

        url = 'http://mysubdomain.example.co.uk'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'co.uk')

    def test_finds_file(self):
        url = 'http://mysubdomain.example.com/cool.jpg'
        result = url_parser.parse_url(url)
        self.assertEqual(result['file'], 'cool.jpg')

        url = 'http://mysubdomain.example.com/directory/here/sample.mp4'
        result = url_parser.parse_url(url)
        self.assertEqual(result['file'], 'sample.mp4')

    def test_finds_path(self):
        url = 'http://mysubdomain.example.com/path'
        result = url_parser.parse_url(url)
        self.assertEqual(result['path'], '/path')

        url = 'http://mysubdomain.example.com/this/is/the/path'
        result = url_parser.parse_url(url)
        self.assertEqual(result['path'], '/this/is/the/path')

        url = 'http://mysubdomain.example.com/path/with/file.js'
        result = url_parser.parse_url(url)
        self.assertEqual(result['path'], '/path/with/file.js')

    def test_finds_fragment(self):
        url = 'http://mysubdomain.example.com#my_fragment'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com/path/#my_fragment'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com/path/file.js#my_fragment'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com#my_fragment?myargs=test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com/test/path.js#my_fragment?myargs=test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

    def test_finds_query(self):
        url = 'http://mysubdomain.example.com?myquery=test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')

        url = 'http://mysubdomain.example.com?myquery=test&one=two&test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')
        self.assertIsNone(result['query']['test'])

        url = 'http://mysubdomain.example.com/file.js?myquery=test&one=two'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')

        url = 'http://mysubdomain.example.com/path/and/file.js?myquery=test&one=two'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')

        url = 'http://mysubdomain.example.com/path/?myquery=test&one=two'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')

    def test_catastrophic_backtracking(self):
        url = 'http://very_long-and-complixated_subdomaind-for-page.mywebpageishere.com/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'com')
        self.assertEqual(result['domain'], 'mywebpageishere')
        self.assertEqual(result['sub_domain'], 'very_long-and-complixated_subdomaind-for-page')

    def test_domain_that_starts_with_same_letters_as_top_domain(self):
        url = 'http://domains-stars-with-same-top-domain.nogo.no/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'no')
        self.assertEqual(result['domain'], 'nogo')
        self.assertEqual(result['sub_domain'], 'domains-stars-with-same-top-domain')

    def test_domain_that_includes_a_top_domain_in_sub_domain(self):
        url = 'http://test.com.hello.nogo.no/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'no')
        self.assertEqual(result['domain'], 'nogo')
        self.assertEqual(result['sub_domain'], 'test.com.hello')

    def test_domain_that_includes_a_top_domain_in_query(self):
        url = 'http://test.com.hello.nogo.no?my_query_domain=www.test.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'no')
        self.assertEqual(result['domain'], 'nogo')
        self.assertEqual(result['sub_domain'], 'test.com.hello')
        self.assertEqual(result['query']['my_query_domain'], 'www.test.com')


class TestGetUrl(TestCase):
    def test_parses_url_without_www(self):
        url = 'example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.domain, 'example')
        self.assertEqual(result.top_domain, 'com')

    def test_parses_url_with_www(self):
        url = 'www.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.domain, 'example')
        self.assertEqual(result.top_domain, 'com')

    def test_returns_null_if_protocol_is_missing(self):
        url = 'www.example.com'
        result = url_parser.get_url(url)
        self.assertIsNone(result.protocol)

    def test_returns_null_if_www_is_missing(self):
        url = 'http://example.com'
        result = url_parser.get_url(url)
        self.assertIsNone(result.www)

    def test_removes_extra_dot_from_www(self):
        url = 'http://www..example.com'
        result = url_parser.get_url(url)
        has_dot = '.' in result.www
        self.assertFalse(has_dot)

    def test_returns_null_if_sub_domain_is_missing(self):
        url = 'http://example.com'
        result = url_parser.get_url(url)
        self.assertIsNone(result.sub_domain)

    def test_finds_sub_domain(self):
        url = 'mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.sub_domain, 'mysubdomain')

    def test_finds_multiple_subdomains(self):
        url = 'my.subdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.sub_domain, 'my.subdomain')

    def test_finds_protocol(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.protocol, 'http')

        url = 'https://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.protocol, 'https')

        url = 'ftp://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.protocol, 'ftp')

    def test_finds_dir(self):
        url = 'http://mysubdomain.example.com/folder/'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/folder/')

        url = 'http://mysubdomain.example.com/multiple/folders/'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/multiple/folders/')

        url = 'http://mysubdomain.example.com/multiple/folders/with_a_file.js'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/multiple/folders/')

    def test_does_not_mistake_file_for_dir(self):
        url = 'http://mysubdomain.example.com/folder/test'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/folder/')
        self.assertNotEqual(result.dir, '/folder/test')

    def test_finds_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.domain, 'example')

    def test_finds_top_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.top_domain, 'com')

        url = 'http://mysubdomain.example.co.uk'
        result = url_parser.get_url(url)
        self.assertEqual(result.top_domain, 'co.uk')

    def test_finds_file(self):
        url = 'http://mysubdomain.example.com/cool.jpg'
        result = url_parser.get_url(url)
        self.assertEqual(result.file, 'cool.jpg')

        url = 'http://mysubdomain.example.com/directory/here/sample.mp4'
        result = url_parser.get_url(url)
        self.assertEqual(result.file, 'sample.mp4')

    def test_finds_path(self):
        url = 'http://mysubdomain.example.com/path'
        result = url_parser.get_url(url)
        self.assertEqual(result.path, '/path')

        url = 'http://mysubdomain.example.com/this/is/the/path'
        result = url_parser.get_url(url)
        self.assertEqual(result.path, '/this/is/the/path')

        url = 'http://mysubdomain.example.com/path/with/file.js'
        result = url_parser.get_url(url)
        self.assertEqual(result.path, '/path/with/file.js')

    def test_finds_fragment(self):
        url = 'http://mysubdomain.example.com#my_fragment'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com/path/#my_fragment'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com/path/file.js#my_fragment'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com#my_fragment?myargs=test'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com/test/path.js#my_fragment?myargs=test'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

    def test_finds_query(self):
        url = 'http://mysubdomain.example.com?myquery=test'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')

        url = 'http://mysubdomain.example.com?myquery=test&one=two&test'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')
        self.assertIsNone(result.query['test'])

        url = 'http://mysubdomain.example.com/file.js?myquery=test&one=two'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')

        url = 'http://mysubdomain.example.com/path/and/file.js?myquery=test&one=two'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')

        url = 'http://mysubdomain.example.com/path/?myquery=test&one=two'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')


class TestGetBasicUrl(TestCase):
    def test_basic_url(self):
        url = 'http://example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://example.com')

        url = 'https://example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://example.com')

        url = 'https://www.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://www.example.com')

        url = 'example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://example.com')

    def test_sub_domain_basic_url(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://mysubdomain.example.com')

        url = 'https://mysubdomain.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://mysubdomain.example.com')

        url = 'ftp://mysubdomain.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'ftp://mysubdomain.example.com')

    def test_path_url(self):
        url = 'https://mysubdomain.example.com/path/to/wisdom'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://mysubdomain.example.com')

    def test_long_url(self):
        url = 'https://mysubdomain.example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://mysubdomain.example.com')

        url = 'https://www.example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://www.example.com')

        url = 'https://example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://example.com')

        url = 'example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://example.com')

--------------------------------------------------------------------------------