├── .github
│   └── workflows
│       └── pythonpublish.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── setup.py
└── url_parser
    ├── __init__.py
    ├── public_suffix_list.dat
    ├── public_suffix_list.py
    └── tests
        └── test_url_parser.py
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
name: Upload Python Package

on:
  push:
    branches:
      - master

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel twine
      - name: Run tests
        run: |
          python -m unittest url_parser.tests.test_url_parser
      - name: Build and publish
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          python setup.py sdist bdist_wheel
          twine upload dist/*
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
build_and_deploy.sh

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
virt/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

/.idea
/data
!/data/.gitkeep
virtenv/Scripts/tldextract.exe
virtenv/Scripts/pythonw.exe
virtenv/Scripts/python.exe
virtenv/Scripts/pip3.exe
virtenv/Scripts/pip3.7.exe
virtenv/Scripts/pip3.7-script.py
virtenv/Scripts/pip3-script.py
virtenv/Scripts/pip.exe
virtenv/Scripts/pip-script.py
virtenv/Scripts/flask.exe
virtenv/Scripts/easy_install.exe
virtenv/Scripts/easy_install-script.py
virtenv/Scripts/easy_install-3.7.exe
virtenv/Scripts/easy_install-3.7-script.py
virtenv/Scripts/dotenv.exe
virtenv/Scripts/deactivate.bat
virtenv/Scripts/chardetect.exe
virtenv/Scripts/activate.bat
virtenv/Scripts/activate
virtenv/Scripts/Activate.ps1
virtenv/pyvenv.cfg
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
### Changelog:

##### v3.0.3
* Fixed catastrophic backtracking and a small top domain bug.

##### v3.0.2
* Fixed a bug where public_suffix_list.dat was not loaded correctly.

##### v3.0.0
* A lot of cleanup.
* Added the list from https://publicsuffix.org/ to find top domains more accurately.
* Breaking change: Signature of `parse_url(item)` changed to `parse_url(url: str) -> dict`.
* Breaking change: Signature of `get_url(item)` changed to `get_url(url: str) -> UrlObject`.

##### v2.1.1
* Small fix for the readme and GitHub Actions.

##### v2.1.0
* Added a function that returns the URL as an object, and a function that returns the base URL from a string.

##### v2.0.0
* Added new regex and support for foreign languages.

##### v1.0.0
* Small bugfixes and optimisation for the stable release.

##### v0.9.9
* Bugfixes in the readme file.

##### v0.9.8
* Added support for args.

##### v0.9.7
* Changed setup.py and the readme for PyPI optimisation.

##### v0.9.6
* Added support for secondary top domains (e.g. .co.uk, .parliament.uk, .gov.au).
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Adapted AS

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include url_parser/public_suffix_list.dat
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Python URL Parser




A nice package to help you parse all types of URLs in vanilla Python and return the parsed URL in groups.

To not break the API, `parse_url` (which returns a dict) still works, and we added `get_url` to return the URL parts as an object instead.

In version 2.1 we also included `get_base_url`, a small yet neat function to get the base URL back from a string.

### Installation
```
pip install url-parser
```

### Usage

```python
from url_parser import parse_url, get_url, get_base_url


url = parse_url('https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0')  # Returns the URL sections as a dict
url_object = get_url('https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0')  # Does the same, but returns an object
basic_url = get_base_url('https://open.prospecta.app/my_user_login?user=url-parser&password=H3ll0')  # Returns just the base URL

print(url['domain'])  # Outputs -> prospecta
print(url_object.domain)  # Outputs -> prospecta
print(basic_url)  # Outputs -> https://open.prospecta.app
```

### Keywords for `get_url` and `parse_url`

When using the `parse_url` and `get_url` functions, you get a dict (`parse_url`) or an object (`get_url`) back containing the different parts of the URL.

The different parts can be accessed by keywords:
For `parse_url` use: `result['top_domain']`
For `get_url` use: `result.top_domain`


Here is a list of all the available keywords:

| Keyword | Description | Value when not present in URL |
| ------ | ------ | ------ |
| protocol | The protocol, e.g. **https** or **ftp** | None |
| www | Returns **www** if www is used in the URL | None |
| sub_domain | The sub domain, e.g. **my.subdomain** in **my.subdomain.example.com**. Note that the sub domain also includes www. | None |
| domain | The domain, e.g. **example** in **example.com** | Is always present |
| top_domain | The top domain, e.g. **com** in **example.com** | Is always present |
| dir | The directory, e.g. **/my/directory/** in **example.com/my/directory/** | None |
| file | The file, e.g. **my_file.js** in **example.com/home/my_file.js** | None |
| path | The full path, e.g. **/home/my_file.js** in **example.com/home/my_file.js** | None |
| fragment | The URL fragment, e.g. **my_link** in **example.com#my_link** | None |
| query | The URL query parsed into a dict, e.g. **{'my_parameter': '1', 'foo': 'bar'}** for **example.com?my_parameter=1&foo=bar** | None |
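
For example, parsing a URL that uses most of these parts (an illustrative URL, not one from the test suite) gives:

```python
from url_parser import get_url

result = get_url('https://www.sub.example.co.uk/blog/2020/article.html#comments')

print(result.protocol)    # https
print(result.www)         # www
print(result.sub_domain)  # www.sub
print(result.domain)      # example
print(result.top_domain)  # co.uk  (found via the public suffix list)
print(result.dir)         # /blog/2020/
print(result.file)        # article.html
print(result.path)        # /blog/2020/article.html
print(result.fragment)    # comments
print(result.query)       # None
```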

### Testing

Use the following command to run the tests.

```bash
python -m unittest url_parser.tests.test_url_parser
```

### Changelog:

See CHANGELOG.md
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

with open("README.md", "r") as fh:
    long_description = fh.read()

setup(name='url_parser',
      version='3.0.3',
      description='Parse url and get all the different parts out of it',
      url='https://github.com/AdaptedAS/url_parser',
      author='Odd Jøren Røland',
      long_description=long_description,
      long_description_content_type="text/markdown",
      author_email='odd@adapted.no',
      license='MIT',
      packages=['url_parser'],
      platforms=['any'],
      include_package_data=True,
      # The package uses f-strings, so it requires Python 3.6 or newer.
      python_requires='>=3.6',
      classifiers=[
          'License :: OSI Approved :: MIT License',
          'Natural Language :: English',
          'Intended Audience :: Developers',
          'Topic :: Software Development :: Libraries',
          'Topic :: Software Development :: Libraries :: Python Modules',
          'Development Status :: 5 - Production/Stable',
          'Programming Language :: Python',
          'Programming Language :: Python :: 3',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
      ],
      zip_safe=False
      )
--------------------------------------------------------------------------------
/url_parser/__init__.py:
--------------------------------------------------------------------------------
import re
import warnings
from collections import namedtuple

from url_parser.public_suffix_list import PublicSuffixList

UrlObject = namedtuple(
    'UrlObject', [
        'protocol',
        'www',
        'sub_domain',
        'domain',
        'top_domain',
        'path',
        'dir',
        'file',
        'fragment',
        'query'
    ])


def _split_query_group(query_groups: list) -> dict:
    # Turn query groups like ['a=1', 'b'] into {'a': '1', 'b': None}.
    result = dict()

    for query_group in query_groups:
        query = query_group.split('=')

        if len(query) == 1:
            result[query[0]] = None
            continue

        result[query[0]] = query[1]

    return result


def _parse_url_with_top_domain(url, top_domain):
    # Split the URL into named groups, anchored on the already-known top domain.
    regex = r"^(?:(?P<protocol>[\w\d]+)(?:\:\/\/))?" \
            r"(?P<sub_domain>" \
            r"(?P<www>(?:www)?)(?:\.?)" \
            r"(?:(?:[\w\d-]+|\.)*?)?" \
            r")(?:\.?)" \
            r"(?P<domain>[^./]+(?=\.))\." \
            r"(?P<top_domain>" + re.escape(top_domain) + r"(?![^/?#]))" \
            r"(?P<path>" \
            r"(?P<dir>\/(?:[^/\r\n]+(?:/))+)?" \
            r"(?:\/?)(?P<file>[^?#\r\n]+)?" \
            r")?" \
            r"(?:\#(?P<fragment>[^#?\r\n]*))?" \
            r"(?:\?(?P<query>.*(?=$)))*$"

    dict_data = {
        'protocol': None,
        'www': None,
        'sub_domain': None,
        'domain': None,
        'top_domain': None,
        'path': None,
        'dir': None,
        'file': None,
        'fragment': None,
        'query': None,
    }

    match = re.search(regex, url)

    dict_data['protocol'] = match.group('protocol') if match.group('protocol') else None
    dict_data['www'] = match.group('www') if match.group('www') else None
    dict_data['sub_domain'] = match.group('sub_domain') if match.group('sub_domain') else None
    dict_data['domain'] = match.group('domain')
    dict_data['top_domain'] = top_domain
    dict_data['path'] = match.group('path') if match.group('path') else None
    dict_data['dir'] = match.group('dir') if match.group('dir') else None
    dict_data['file'] = match.group('file') if match.group('file') else None
    dict_data['fragment'] = match.group('fragment') if match.group('fragment') else None

    query = match.group('query') if match.group('query') else None

    if query is not None:
        query_groups = query.split('&')
        query = _split_query_group(query_groups)
        dict_data['query'] = query

    return dict_data


def _parse_url_with_public_suffix(url):
    public_suffix = PublicSuffixList.get_list()
    public_suffix.sort()

    # Isolate the host part of the URL (everything before the first /, #, ? or :).
    domain_regex = r"(?:^|\/)(?P<domain>[^:/#?]+)(?:[/#?]|$)"
    match = re.search(domain_regex, url)
    domain = match.group('domain')
    domain_parts = domain.split('.')

    top_domain = None

    # Walk from the longest tail of the host towards the shortest, so the
    # longest known public suffix wins (e.g. 'co.uk' before 'uk').
    for i in range(len(domain_parts)):
        tail_gram = domain_parts[i:len(domain_parts)]
        tail_gram = '.'.join(tail_gram)

        if tail_gram in public_suffix:
            top_domain = tail_gram
            break

    data = _parse_url_with_top_domain(url, top_domain)

    return data


def get_base_url(url: str) -> str:
    url = get_url(url)
    protocol = str(url.protocol) + '://' if url.protocol is not None else 'http://'
    www = 'www.' if url.www is not None else ''
    sub_domain = str(url.sub_domain) + '.' if url.sub_domain is not None and url.sub_domain != 'www.' else ''
    return protocol + www + sub_domain + url.domain + '.' + url.top_domain


def get_url(url: str) -> UrlObject:
    data = _parse_url_with_public_suffix(url)

    object_data = UrlObject(
        protocol=data['protocol'],
        www=data['www'],
        sub_domain=data['sub_domain'],
        domain=data['domain'],
        top_domain=data['top_domain'],
        path=data['path'],
        dir=data['dir'],
        file=data['file'],
        fragment=data['fragment'],
        query=data['query'],
    )

    return object_data


def parse_url(url: str) -> dict:
    warnings.warn(
        "parse_url is deprecated, use get_url instead",
        DeprecationWarning
    )

    data = get_url(url)
    return data._asdict()
--------------------------------------------------------------------------------
/url_parser/public_suffix_list.py:
--------------------------------------------------------------------------------
import os


class PublicSuffixList:
    _public_suffix_list = None

    @staticmethod
    def get_list():
        # Return the cached list if it has already been loaded.
        if PublicSuffixList._public_suffix_list is not None:
            return PublicSuffixList._public_suffix_list

        public_suffix_list = []

        dir_path = os.path.dirname(os.path.realpath(__file__))
        dat_file = f'{dir_path}/public_suffix_list.dat'

        with open(dat_file, encoding='utf-8') as file:
            data = file.readlines()

        for line in data:
            # Skip comment lines.
            if line[0:2] == '//':
                continue

            line = line.replace('\n', '')
            line = line.replace('\r', '')

            if line == '':
                continue

            public_suffix_list.append(line)

        PublicSuffixList._public_suffix_list = public_suffix_list

        return PublicSuffixList._public_suffix_list
--------------------------------------------------------------------------------
/url_parser/tests/test_url_parser.py:
--------------------------------------------------------------------------------
from unittest import TestCase

import url_parser


class TestUrlParser(TestCase):
    def test_parses_url_without_www(self):
        url = 'example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['domain'], 'example')
        self.assertEqual(result['top_domain'], 'com')

    def test_parses_url_with_www(self):
        url = 'www.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['domain'], 'example')
        self.assertEqual(result['top_domain'], 'com')

    def test_returns_null_if_protocol_is_missing(self):
        url = 'www.example.com'
        result = url_parser.parse_url(url)
        self.assertIsNone(result['protocol'])

    def test_returns_null_if_www_is_missing(self):
        url = 'http://example.com'
        result = url_parser.parse_url(url)
        self.assertIsNone(result['www'])

    def test_removes_extra_dot_from_www(self):
        url = 'http://www..example.com'
        result = url_parser.parse_url(url)
        has_dot = '.' in result['www']
        self.assertFalse(has_dot)

    def test_returns_null_if_sub_domain_is_missing(self):
        url = 'http://example.com'
        result = url_parser.parse_url(url)
        self.assertIsNone(result['sub_domain'])

    def test_finds_sub_domain(self):
        url = 'mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['sub_domain'], 'mysubdomain')

    def test_finds_multiple_subdomains(self):
        url = 'my.subdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['sub_domain'], 'my.subdomain')

    def test_finds_protocol(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['protocol'], 'http')

        url = 'https://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['protocol'], 'https')

        url = 'ftp://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['protocol'], 'ftp')

    def test_finds_dir(self):
        url = 'http://mysubdomain.example.com/folder/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/folder/')

        url = 'http://mysubdomain.example.com/multiple/folders/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/multiple/folders/')

        url = 'http://mysubdomain.example.com/multiple/folders/with_a_file.js'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/multiple/folders/')

    def test_does_not_mistake_file_for_dir(self):
        url = 'http://mysubdomain.example.com/folder/test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['dir'], '/folder/')
        self.assertNotEqual(result['dir'], '/folder/test')

    def test_finds_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['domain'], 'example')

    def test_finds_top_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'com')

        url = 'http://mysubdomain.example.co.uk'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'co.uk')

    def test_finds_file(self):
        url = 'http://mysubdomain.example.com/cool.jpg'
        result = url_parser.parse_url(url)
        self.assertEqual(result['file'], 'cool.jpg')

        url = 'http://mysubdomain.example.com/directory/here/sample.mp4'
        result = url_parser.parse_url(url)
        self.assertEqual(result['file'], 'sample.mp4')

    def test_finds_path(self):
        url = 'http://mysubdomain.example.com/path'
        result = url_parser.parse_url(url)
        self.assertEqual(result['path'], '/path')

        url = 'http://mysubdomain.example.com/this/is/the/path'
        result = url_parser.parse_url(url)
        self.assertEqual(result['path'], '/this/is/the/path')

        url = 'http://mysubdomain.example.com/path/with/file.js'
        result = url_parser.parse_url(url)
        self.assertEqual(result['path'], '/path/with/file.js')

    def test_finds_fragment(self):
        url = 'http://mysubdomain.example.com#my_fragment'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com/path/#my_fragment'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com/path/file.js#my_fragment'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com#my_fragment?myargs=test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

        url = 'http://mysubdomain.example.com/test/path.js#my_fragment?myargs=test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['fragment'], 'my_fragment')

    def test_finds_query(self):
        url = 'http://mysubdomain.example.com?myquery=test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')

        url = 'http://mysubdomain.example.com?myquery=test&one=two&test'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')
        self.assertIsNone(result['query']['test'])

        url = 'http://mysubdomain.example.com/file.js?myquery=test&one=two'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')

        url = 'http://mysubdomain.example.com/path/and/file.js?myquery=test&one=two'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')

        url = 'http://mysubdomain.example.com/path/?myquery=test&one=two'
        result = url_parser.parse_url(url)
        self.assertEqual(result['query']['myquery'], 'test')
        self.assertEqual(result['query']['one'], 'two')

    def test_catastrophic_backtracking(self):
        url = 'http://very_long-and-complicated_subdomain-for-page.mywebpageishere.com/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'com')
        self.assertEqual(result['domain'], 'mywebpageishere')
        self.assertEqual(result['sub_domain'], 'very_long-and-complicated_subdomain-for-page')

    def test_domain_that_starts_with_same_letters_as_top_domain(self):
        url = 'http://domains-starts-with-same-top-domain.nogo.no/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'no')
        self.assertEqual(result['domain'], 'nogo')
        self.assertEqual(result['sub_domain'], 'domains-starts-with-same-top-domain')

    def test_domain_that_includes_a_top_domain_in_sub_domain(self):
        url = 'http://test.com.hello.nogo.no/'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'no')
        self.assertEqual(result['domain'], 'nogo')
        self.assertEqual(result['sub_domain'], 'test.com.hello')

    def test_domain_that_includes_a_top_domain_in_query(self):
        url = 'http://test.com.hello.nogo.no?my_query_domain=www.test.com'
        result = url_parser.parse_url(url)
        self.assertEqual(result['top_domain'], 'no')
        self.assertEqual(result['domain'], 'nogo')
        self.assertEqual(result['sub_domain'], 'test.com.hello')
        self.assertEqual(result['query']['my_query_domain'], 'www.test.com')


class TestGetUrl(TestCase):
    def test_parses_url_without_www(self):
        url = 'example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.domain, 'example')
        self.assertEqual(result.top_domain, 'com')

    def test_parses_url_with_www(self):
        url = 'www.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.domain, 'example')
        self.assertEqual(result.top_domain, 'com')

    def test_returns_null_if_protocol_is_missing(self):
        url = 'www.example.com'
        result = url_parser.get_url(url)
        self.assertIsNone(result.protocol)

    def test_returns_null_if_www_is_missing(self):
        url = 'http://example.com'
        result = url_parser.get_url(url)
        self.assertIsNone(result.www)

    def test_removes_extra_dot_from_www(self):
        url = 'http://www..example.com'
        result = url_parser.get_url(url)
        has_dot = '.' in result.www
        self.assertFalse(has_dot)

    def test_returns_null_if_sub_domain_is_missing(self):
        url = 'http://example.com'
        result = url_parser.get_url(url)
        self.assertIsNone(result.sub_domain)

    def test_finds_sub_domain(self):
        url = 'mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.sub_domain, 'mysubdomain')

    def test_finds_multiple_subdomains(self):
        url = 'my.subdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.sub_domain, 'my.subdomain')

    def test_finds_protocol(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.protocol, 'http')

        url = 'https://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.protocol, 'https')

        url = 'ftp://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.protocol, 'ftp')

    def test_finds_dir(self):
        url = 'http://mysubdomain.example.com/folder/'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/folder/')

        url = 'http://mysubdomain.example.com/multiple/folders/'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/multiple/folders/')

        url = 'http://mysubdomain.example.com/multiple/folders/with_a_file.js'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/multiple/folders/')

    def test_does_not_mistake_file_for_dir(self):
        url = 'http://mysubdomain.example.com/folder/test'
        result = url_parser.get_url(url)
        self.assertEqual(result.dir, '/folder/')
        self.assertNotEqual(result.dir, '/folder/test')

    def test_finds_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.domain, 'example')

    def test_finds_top_domain(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_url(url)
        self.assertEqual(result.top_domain, 'com')

        url = 'http://mysubdomain.example.co.uk'
        result = url_parser.get_url(url)
        self.assertEqual(result.top_domain, 'co.uk')

    def test_finds_file(self):
        url = 'http://mysubdomain.example.com/cool.jpg'
        result = url_parser.get_url(url)
        self.assertEqual(result.file, 'cool.jpg')

        url = 'http://mysubdomain.example.com/directory/here/sample.mp4'
        result = url_parser.get_url(url)
        self.assertEqual(result.file, 'sample.mp4')

    def test_finds_path(self):
        url = 'http://mysubdomain.example.com/path'
        result = url_parser.get_url(url)
        self.assertEqual(result.path, '/path')

        url = 'http://mysubdomain.example.com/this/is/the/path'
        result = url_parser.get_url(url)
        self.assertEqual(result.path, '/this/is/the/path')

        url = 'http://mysubdomain.example.com/path/with/file.js'
        result = url_parser.get_url(url)
        self.assertEqual(result.path, '/path/with/file.js')

    def test_finds_fragment(self):
        url = 'http://mysubdomain.example.com#my_fragment'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com/path/#my_fragment'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com/path/file.js#my_fragment'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com#my_fragment?myargs=test'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

        url = 'http://mysubdomain.example.com/test/path.js#my_fragment?myargs=test'
        result = url_parser.get_url(url)
        self.assertEqual(result.fragment, 'my_fragment')

    def test_finds_query(self):
        url = 'http://mysubdomain.example.com?myquery=test'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')

        url = 'http://mysubdomain.example.com?myquery=test&one=two&test'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')
        self.assertIsNone(result.query['test'])

        url = 'http://mysubdomain.example.com/file.js?myquery=test&one=two'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')

        url = 'http://mysubdomain.example.com/path/and/file.js?myquery=test&one=two'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')

        url = 'http://mysubdomain.example.com/path/?myquery=test&one=two'
        result = url_parser.get_url(url)
        self.assertEqual(result.query['myquery'], 'test')
        self.assertEqual(result.query['one'], 'two')


class TestGetBaseUrl(TestCase):
    def test_basic_url(self):
        url = 'http://example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://example.com')

        url = 'https://example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://example.com')

        url = 'https://www.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://www.example.com')

        url = 'example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://example.com')

    def test_sub_domain_basic_url(self):
        url = 'http://mysubdomain.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://mysubdomain.example.com')

        url = 'https://mysubdomain.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://mysubdomain.example.com')

        url = 'ftp://mysubdomain.example.com'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'ftp://mysubdomain.example.com')

    def test_path_url(self):
        url = 'https://mysubdomain.example.com/path/to/wisdom'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://mysubdomain.example.com')

    def test_long_url(self):
        url = 'https://mysubdomain.example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://mysubdomain.example.com')

        url = 'https://www.example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://www.example.com')

        url = 'https://example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'https://example.com')

        url = 'example.com/path/to/wisdom?query=2&this=3'
        result = url_parser.get_base_url(url)
        self.assertEqual(result, 'http://example.com')
--------------------------------------------------------------------------------