├── .github └── workflows │ ├── build.yml │ └── deploy.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── domain_benchmark.py └── url_benchmark.py ├── cortex.yaml ├── images └── logo.png ├── pydomainextractor ├── __init__.py └── pydomainextractor.pyi ├── pyproject.toml ├── src ├── lib.rs └── public_suffix_list.dat └── tests ├── __init__.py └── test_pydomainextractor.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | lint: 7 | if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags') 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v3 12 | - name: Install latest rust 13 | uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: stable 16 | profile: minimal 17 | override: true 18 | components: clippy 19 | - name: Lint with clippy 20 | uses: actions-rs/cargo@v1 21 | with: 22 | command: clippy 23 | args: --all-targets --all-features 24 | test: 25 | runs-on: ${{ matrix.os }} 26 | needs: lint 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | python-version: 31 | - '3.7' 32 | - '3.8' 33 | - '3.9' 34 | - '3.10' 35 | - '3.11' 36 | os: 37 | - ubuntu-latest 38 | - macos-latest 39 | - windows-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v3 43 | - name: Set up Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v4 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install Poetry 48 | uses: abatilo/actions-poetry@v2 49 | - name: Install Rust 50 | uses: actions-rs/toolchain@v1 51 | with: 52 | profile: minimal 53 | toolchain: stable 54 | override: true 55 | - name: Install dependencies 56 | run: poetry install 57 | - name: Build Python package 58 | run: poetry run maturin develop 59 | - name: Test 60 | run: poetry run pytest -Werror tests 61 | 
-------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | release: 4 | types: 5 | - released 6 | jobs: 7 | deploy: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: 13 | - "3.7" 14 | - "3.8" 15 | - "3.9" 16 | - "3.10" 17 | - "3.11" 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | - windows-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Rust 30 | uses: actions-rs/toolchain@v1 31 | with: 32 | profile: minimal 33 | toolchain: stable 34 | override: true 35 | - name: Install Cross-compilers (macOS) 36 | if: matrix.os == 'macos-latest' 37 | run: | 38 | rustup target add x86_64-apple-darwin 39 | rustup target add aarch64-apple-darwin 40 | - name: Install Cross-compilers (Linux) 41 | if: matrix.os == 'ubuntu-latest' 42 | run: | 43 | rustup target add aarch64-unknown-linux-gnu 44 | - name: Publish Package 45 | uses: PyO3/maturin-action@v1 46 | with: 47 | command: publish 48 | args: --username=__token__ ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.7' && '' || '--no-sdist' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }} 49 | env: 50 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 51 | if: matrix.os != 'macos-latest' 52 | - name: Publish macOS (x86_64) Package 53 | if: matrix.os == 'macos-latest' 54 | uses: PyO3/maturin-action@v1 55 | with: 56 | command: publish 57 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist 58 | env: 59 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 60 | - name: Publish macOS (arm64) Package 61 | if: 
matrix.os == 'macos-latest' 62 | uses: PyO3/maturin-action@v1 63 | with: 64 | command: publish 65 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist 66 | env: 67 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 68 | - name: Publish Linux (arm64) Package 69 | if: matrix.os == 'ubuntu-latest' 70 | uses: PyO3/maturin-action@v1 71 | with: 72 | command: publish 73 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-unknown-linux-gnu --no-sdist 74 | env: 75 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 76 | dist-source: 77 | runs-on: ubuntu-latest 78 | steps: 79 | - name: Distribute Source 80 | uses: PyO3/maturin-action@v1 81 | with: 82 | command: sdist 83 | env: 84 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | .vscode/ 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/#use-with-ide 112 | .pdm.toml 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | ### Rust ### 165 | # Generated by Cargo 166 | # will have compiled files and executables 167 | debug/ 168 | 169 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 170 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 171 | Cargo.lock 172 | 173 | # These are backup files generated by rustfmt 174 | **/*.rs.bk 175 | 176 | # MSVC Windows builds of rustc generate these, which store debugging information 177 | *.pdb 178 | 179 | ### VisualStudioCode ### 180 | .vscode/* 181 | !.vscode/settings.json 182 | !.vscode/tasks.json 183 | !.vscode/launch.json 184 | !.vscode/extensions.json 185 | !.vscode/*.code-snippets 186 | 187 | # Local History for Visual Studio Code 188 | .history/ 189 | 190 | # Built Visual Studio Code Extensions 191 | *.vsix 192 | 193 | ### VisualStudioCode Patch ### 194 | # Ignore all local history of files 195 | .history 196 | .ionide 197 | 198 | # Support for Project snippet scope 199 | .vscode/*.code-snippets 200 | 201 | # Ignore code-workspaces 202 | *.code-workspace 203 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pydomainextractor" 3 | version = "0.13.10" 4 | authors = ["Viktor Vilskyi "] 5 | edition = "2021" 6 | repository = "https://github.com/intsights/pydomainextractor" 7 | homepage = "https://github.com/intsights/pydomainextractor" 8 | license = "MIT" 9 | keywords = [ 10 | "domain", 11 | "extraction", 12 | "tld", 13 | "suffix", 14 | "psl", 15 | "rust", 16 | "pyo3", 17 | ] 18 | 19 | [lib] 20 | name = "pydomainextractor" 21 | crate-type = ["cdylib"] 22 | 23 | [dependencies] 24 | ahash = "0.8" 25 | idna = "0.3" 26 | memchr = "2" 27 | arraystring = "0.3.0" 28 | typenum = "1" 29 | 30 | [dependencies.pyo3] 31 | version = "0.17.3" 32 | features = ["extension-module"] 33 | 34 | [profile.release] 
35 | lto = true 36 | panic = "abort" 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Gal Ben David 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include images/logo.png 3 | graft tests 4 | recursive-include pydomainextractor *.py *.pyi 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Logo 4 | 5 |

6 | A blazingly fast domain extraction library written in Rust 7 |

8 |

9 | 10 | ![license](https://img.shields.io/badge/MIT-License-blue) 11 | ![Python](https://img.shields.io/badge/Python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue) 12 | ![Build](https://github.com/Intsights/PyDomainExtractor/workflows/Build/badge.svg) 13 | [![PyPi](https://img.shields.io/pypi/v/PyDomainExtractor.svg)](https://pypi.org/project/PyDomainExtractor/) 14 | 15 | ## Table of Contents 16 | 17 | - [Table of Contents](#table-of-contents) 18 | - [About The Project](#about-the-project) 19 | - [Built With](#built-with) 20 | - [Performance](#performance) 21 | - [Extract From Domain](#extract-from-domain) 22 | - [Extract From URL](#extract-from-url) 23 | - [Installation](#installation) 24 | - [Usage](#usage) 25 | - [Extraction](#extraction) 26 | - [URL Extraction](#url-extraction) 27 | - [Validation](#validation) 28 | - [TLDs List](#tlds-list) 29 | - [License](#license) 30 | - [Contact](#contact) 31 | 32 | 33 | ## About The Project 34 | 35 | PyDomainExtractor is a Python library designed to parse domain names quickly. 36 | In order to achieve the highest performance possible, the library was written in Rust. 37 | 38 | 39 | ### Built With 40 | 41 | * [AHash](https://github.com/tkaitchuck/aHash) 42 | * [idna](https://github.com/servo/rust-url/) 43 | * [memchr](https://github.com/BurntSushi/memchr) 44 | * [once_cell](https://github.com/matklad/once_cell) 45 | * [Public Suffix List](https://publicsuffix.org/) 46 | 47 | 48 | ### Performance 49 | 50 | 51 | #### Extract From Domain 52 | 53 | Tests were run on a file containing 10 million random domains from various top-level domains (Mar. 
13th 2022) 54 | 55 | | Library | Function | Time | 56 | | ------------- | ------------- | ------------- | 57 | | [PyDomainExtractor](https://github.com/Intsights/PyDomainExtractor) | pydomainextractor.extract | 1.50s | 58 | | [publicsuffix2](https://github.com/nexb/python-publicsuffix2) | publicsuffix2.get_sld | 9.92s | 59 | | [tldextract](https://github.com/john-kurkowski/tldextract) | \_\_call\_\_ | 29.23s | 60 | | [tld](https://github.com/barseghyanartur/tld) | tld.parse_tld | 34.48s | 61 | 62 | 63 | #### Extract From URL 64 | 65 | The test was conducted on a file containing 1 million random urls (Mar. 13th 2022) 66 | 67 | | Library | Function | Time | 68 | | ------------- | ------------- | ------------- | 69 | | [PyDomainExtractor](https://github.com/Intsights/PyDomainExtractor) | pydomainextractor.extract_from_url | 2.24s | 70 | | [publicsuffix2](https://github.com/nexb/python-publicsuffix2) | publicsuffix2.get_sld | 10.84s | 71 | | [tldextract](https://github.com/john-kurkowski/tldextract) | \_\_call\_\_ | 36.04s | 72 | | [tld](https://github.com/barseghyanartur/tld) | tld.parse_tld | 57.87s | 73 | 74 | 75 | ### Installation 76 | 77 | ```sh 78 | pip3 install PyDomainExtractor 79 | ``` 80 | 81 | 82 | ## Usage 83 | 84 | 85 | ### Extraction 86 | 87 | ```python 88 | import pydomainextractor 89 | 90 | 91 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data. 92 | domain_extractor = pydomainextractor.DomainExtractor() 93 | 94 | domain_extractor.extract('google.com') 95 | >>> { 96 | >>> 'subdomain': '', 97 | >>> 'domain': 'google', 98 | >>> 'suffix': 'com' 99 | >>> } 100 | 101 | # Loads a custom SuffixList data. Should follow PublicSuffixList's format. 
102 | domain_extractor = pydomainextractor.DomainExtractor( 103 | 'tld\n' 104 | 'custom.tld\n' 105 | ) 106 | 107 | domain_extractor.extract('google.com') 108 | >>> { 109 | >>> 'subdomain': 'google', 110 | >>> 'domain': 'com', 111 | >>> 'suffix': '' 112 | >>> } 113 | 114 | domain_extractor.extract('google.custom.tld') 115 | >>> { 116 | >>> 'subdomain': '', 117 | >>> 'domain': 'google', 118 | >>> 'suffix': 'custom.tld' 119 | >>> } 120 | ``` 121 | 122 | 123 | ### URL Extraction 124 | 125 | ```python 126 | import pydomainextractor 127 | 128 | 129 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data. 130 | domain_extractor = pydomainextractor.DomainExtractor() 131 | 132 | domain_extractor.extract_from_url('http://google.com/') 133 | >>> { 134 | >>> 'subdomain': '', 135 | >>> 'domain': 'google', 136 | >>> 'suffix': 'com' 137 | >>> } 138 | ``` 139 | 140 | 141 | ### Validation 142 | 143 | ```python 144 | import pydomainextractor 145 | 146 | 147 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data. 148 | domain_extractor = pydomainextractor.DomainExtractor() 149 | 150 | domain_extractor.is_valid_domain('google.com') 151 | >>> True 152 | 153 | domain_extractor.is_valid_domain('domain.اتصالات') 154 | >>> True 155 | 156 | domain_extractor.is_valid_domain('xn--mgbaakc7dvf.xn--mgbaakc7dvf') 157 | >>> True 158 | 159 | domain_extractor.is_valid_domain('domain-.com') 160 | >>> False 161 | 162 | domain_extractor.is_valid_domain('-sub.domain.com') 163 | >>> False 164 | 165 | domain_extractor.is_valid_domain('\xF0\x9F\x98\x81nonalphanum.com') 166 | >>> False 167 | ``` 168 | 169 | 170 | ### TLDs List 171 | 172 | ```python 173 | import pydomainextractor 174 | 175 | 176 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data. 
177 | domain_extractor = pydomainextractor.DomainExtractor() 178 | 179 | domain_extractor.get_tld_list() 180 | >>> [ 181 | >>> 'bostik', 182 | >>> 'backyards.banzaicloud.io', 183 | >>> 'biz.bb', 184 | >>> ... 185 | >>> ] 186 | ``` 187 | 188 | 189 | ## License 190 | 191 | Distributed under the MIT License. See `LICENSE` for more information. 192 | 193 | 194 | ## Contact 195 | 196 | Gal Ben David - gal@intsights.com 197 | 198 | Project Link: [https://github.com/Intsights/PyDomainExtractor](https://github.com/Intsights/PyDomainExtractor) 199 | 200 | 201 | 202 | 203 | [license-shield]: https://img.shields.io/github/license/othneildrew/Best-README-Template.svg?style=flat-square 204 | -------------------------------------------------------------------------------- /benchmarks/domain_benchmark.py: -------------------------------------------------------------------------------- 1 | import tldextract 2 | import publicsuffix2 3 | import tld 4 | import pydomainextractor 5 | import time 6 | 7 | 8 | def benchmark_tldextract( 9 | domains, 10 | ): 11 | extractor = tldextract.TLDExtract( 12 | include_psl_private_domains=True, 13 | ) 14 | 15 | start = time.perf_counter() 16 | 17 | for domain in domains: 18 | extractor(domain) 19 | 20 | end = time.perf_counter() 21 | 22 | print(f'tldextract: {end - start}s') 23 | 24 | 25 | def benchmark_publicsuffix2( 26 | domains, 27 | ): 28 | start = time.perf_counter() 29 | 30 | for domain in domains: 31 | publicsuffix2.get_sld(domain) 32 | 33 | end = time.perf_counter() 34 | 35 | print(f'publicsuffix2: {end - start}s') 36 | 37 | 38 | def benchmark_tld( 39 | domains, 40 | ): 41 | start = time.perf_counter() 42 | 43 | for domain in domains: 44 | tld.parse_tld(domain) 45 | 46 | end = time.perf_counter() 47 | 48 | print(f'tld: {end - start}s') 49 | 50 | 51 | def benchmark_pydomainextractor( 52 | domains, 53 | ): 54 | extractor = pydomainextractor.DomainExtractor() 55 | 56 | start = time.perf_counter() 57 | 58 | for domain in domains: 59 | 
extractor.extract(domain) 60 | 61 | end = time.perf_counter() 62 | 63 | print(f'pydomainextractor: {end - start}s') 64 | 65 | 66 | def main(): 67 | domains = [] 68 | with open('10m_domains') as domains_file: 69 | for line in domains_file: 70 | domains.append(line.rstrip()) 71 | 72 | benchmark_tldextract(domains) 73 | benchmark_publicsuffix2(domains) 74 | benchmark_tld(domains) 75 | benchmark_pydomainextractor(domains) 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /benchmarks/url_benchmark.py: -------------------------------------------------------------------------------- 1 | import tldextract 2 | import publicsuffix2 3 | import tld 4 | import pydomainextractor 5 | import time 6 | 7 | 8 | def benchmark_tldextract( 9 | urls, 10 | ): 11 | extractor = tldextract.TLDExtract( 12 | include_psl_private_domains=True, 13 | ) 14 | 15 | start = time.perf_counter() 16 | 17 | for url in urls: 18 | extractor(url) 19 | 20 | end = time.perf_counter() 21 | 22 | print(f'tldextract: {end - start}s') 23 | 24 | 25 | def benchmark_publicsuffix2( 26 | urls, 27 | ): 28 | start = time.perf_counter() 29 | 30 | for url in urls: 31 | publicsuffix2.get_sld(url) 32 | 33 | end = time.perf_counter() 34 | 35 | print(f'publicsuffix2: {end - start}s') 36 | 37 | 38 | def benchmark_tld( 39 | urls, 40 | ): 41 | start = time.perf_counter() 42 | 43 | for url in urls: 44 | tld.parse_tld(url) 45 | 46 | end = time.perf_counter() 47 | 48 | print(f'tld: {end - start}s') 49 | 50 | 51 | def benchmark_pydomainextractor( 52 | urls, 53 | ): 54 | extractor = pydomainextractor.DomainExtractor() 55 | 56 | start = time.perf_counter() 57 | 58 | for url in urls: 59 | extractor.extract_from_url(url) 60 | 61 | end = time.perf_counter() 62 | 63 | print(f'pydomainextractor: {end - start}s') 64 | 65 | 66 | def main(): 67 | urls = [] 68 | with open('1m_urls') as urls_file: 69 | for line in urls_file: 70 | urls.append(line.rstrip()) 
71 | 72 | urls = urls * 10 73 | 74 | # benchmark_tldextract(urls) 75 | # benchmark_publicsuffix2(urls) 76 | # benchmark_tld(urls) 77 | benchmark_pydomainextractor(urls) 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /cortex.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | info: 3 | title: Pydomainextractor 4 | description: A blazingly fast domain extraction library written in Rust 5 | x-cortex-git: 6 | github: 7 | alias: intsightsorg 8 | repository: Intsights/PyDomainExtractor 9 | x-cortex-tag: pydomainextractor 10 | x-cortex-type: service 11 | x-cortex-domain-parents: 12 | - tag: threatintel-shadow-intel 13 | x-cortex-groups: 14 | - exposure:external-ship 15 | - target:library 16 | openapi: 3.0.1 17 | servers: 18 | - url: "/" 19 | -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyDomainExtractor/a59d365effa56872235d3ffa0e7a1367065fa3e6/images/logo.png -------------------------------------------------------------------------------- /pydomainextractor/__init__.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from . 
import pydomainextractor 4 | 5 | 6 | class DomainExtractor: 7 | ''' 8 | PyDomainExtractor is a highly optimized Domain Name Extraction library written in Rust 9 | ''' 10 | engine: typing.Optional[pydomainextractor.DomainExtractor] = None 11 | 12 | def __new__( 13 | cls, 14 | suffix_list_data: typing.Optional[str] = None, 15 | ): 16 | if suffix_list_data is None: 17 | if DomainExtractor.engine is None: 18 | DomainExtractor.engine = pydomainextractor.DomainExtractor() 19 | 20 | return DomainExtractor.engine 21 | else: 22 | return pydomainextractor.DomainExtractor(suffix_list_data) 23 | -------------------------------------------------------------------------------- /pydomainextractor/pydomainextractor.pyi: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | 4 | class DomainExtractor: 5 | def __init__( 6 | self, 7 | suffix_list_data: typing.Optional[str] = None, 8 | ) -> None: ... 9 | 10 | def extract( 11 | self, 12 | domain: str, 13 | ) -> typing.Dict[str, str]: ... 14 | 15 | def extract_from_url( 16 | self, 17 | url: str, 18 | ) -> typing.Dict[str, str]: ... 19 | 20 | def is_valid_domain( 21 | self, 22 | domain: str, 23 | ) -> bool: ... 24 | 25 | def get_tld_list( 26 | self, 27 | ) -> typing.List[str]: ... 
28 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pydomainextractor" 3 | version = "0.13.10" 4 | authors = [ 5 | {email = "viktor_vilskyi@rapid7.com"}, 6 | {name = "Viktor Vilskyi"} 7 | ] 8 | requires-python = ">=3.7" 9 | license = {file = "LICENSE"} 10 | classifiers = [ 11 | "License :: OSI Approved :: MIT License", 12 | "Operating System :: MacOS", 13 | "Operating System :: Microsoft", 14 | "Operating System :: POSIX :: Linux", 15 | "Programming Language :: Python :: 3.7", 16 | "Programming Language :: Python :: 3.8", 17 | "Programming Language :: Python :: 3.9", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Programming Language :: Rust", 21 | ] 22 | 23 | [project.urls] 24 | repository = "https://github.com/intsights/pydomainextractor" 25 | homepage = "https://github.com/intsights/pydomainextractor" 26 | 27 | [build-system] 28 | requires = ["maturin>=0.14,<0.15"] 29 | build-backend = "maturin" 30 | 31 | [tool.maturin] 32 | sdist-include = [ 33 | "Cargo.toml", 34 | "pydomainextractor/*.py", 35 | "pydomainextractor/*.pyi", 36 | "pyproject.toml", 37 | "src/*", 38 | ] 39 | 40 | [tool.poetry] 41 | name = "pydomainextractor" 42 | version = "0.13.9" 43 | authors = ["Viktor Vilskyi "] 44 | description = "A blazingly fast domain extraction library written in Rust" 45 | readme = "README.md" 46 | repository = "https://github.com/intsights/pydomainextractor" 47 | homepage = "https://github.com/intsights/pydomainextractor" 48 | license = "MIT" 49 | keywords = [ 50 | "domain", 51 | "extraction", 52 | "tld", 53 | "suffix", 54 | "psl", 55 | "rust", 56 | "pyo3", 57 | ] 58 | classifiers = [ 59 | "License :: OSI Approved :: MIT License", 60 | "Operating System :: MacOS", 61 | "Operating System :: Microsoft", 62 | "Operating System :: POSIX :: Linux", 63 | "Programming 
Language :: Python :: 3.7", 64 | "Programming Language :: Python :: 3.8", 65 | "Programming Language :: Python :: 3.9", 66 | "Programming Language :: Python :: 3.10", 67 | "Programming Language :: Python :: 3.11", 68 | "Programming Language :: Rust", 69 | ] 70 | 71 | [tool.poetry.dependencies] 72 | python = "^3.7" 73 | 74 | [tool.poetry.dev-dependencies] 75 | pytest = "*" 76 | wheel = "*" 77 | pytest-runner = "*" 78 | maturin = "*" 79 | 80 | [tool.pytest.ini_options] 81 | minversion = "6.0" 82 | addopts = [ 83 | "--tb=native", 84 | "--pythonwarnings=all", 85 | ] 86 | testpaths = [ 87 | "tests", 88 | ] 89 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use ahash::{AHashMap, AHashSet}; 2 | use pyo3::exceptions::PyValueError; 3 | use pyo3::intern; 4 | use pyo3::prelude::*; 5 | use pyo3::types::PyString; 6 | use std::os::raw::c_char; 7 | 8 | type DomainString = arraystring::ArrayString; 9 | 10 | #[derive(Default)] 11 | struct Suffix { 12 | sub_suffixes: AHashMap, 13 | is_wildcard: bool, 14 | sub_blacklist: AHashSet, 15 | } 16 | 17 | static PUBLIC_SUFFIX_LIST_DATA: &str = include_str!("public_suffix_list.dat"); 18 | 19 | #[pyclass] 20 | struct DomainExtractor { 21 | suffixes: AHashMap, 22 | tld_list: Vec, 23 | } 24 | 25 | #[pymethods] 26 | impl DomainExtractor { 27 | #[new] 28 | fn new( 29 | suffix_list: Option<&str>, 30 | ) -> Self { 31 | let (suffixes, tld_list) = if let Some(suffix_list) = suffix_list { 32 | parse_suffix_list(suffix_list) 33 | } else { 34 | parse_suffix_list(PUBLIC_SUFFIX_LIST_DATA) 35 | }; 36 | 37 | DomainExtractor { suffixes, tld_list } 38 | } 39 | 40 | fn parse_domain_parts<'a>( 41 | &self, 42 | domain: &'a str, 43 | ) -> PyResult<(&'a str, &'a str, &'a str)> { 44 | let mut suffix_part = ""; 45 | let mut current_suffixes = &self.suffixes; 46 | let mut last_dot_index = domain.len(); 47 | let mut in_wildcard_tld = 
false; 48 | let mut last_suffix: Option<&Suffix> = None; 49 | 50 | while let Some(dot_index) = memchr::memrchr(b'.', &domain.as_bytes()[..last_dot_index]) { 51 | let current_fraction = &domain[dot_index + 1..last_dot_index]; 52 | if current_fraction.is_empty() || dot_index == 0 { 53 | return Err(PyValueError::new_err("Invalid domain detected")); 54 | } 55 | 56 | if in_wildcard_tld { 57 | if last_suffix.unwrap().sub_blacklist.contains(current_fraction) { 58 | let leftover_part = &domain[0..dot_index]; 59 | 60 | return Ok((suffix_part, current_fraction, leftover_part)); 61 | } 62 | 63 | if let Some(current_suffix) = current_suffixes.get(current_fraction) { 64 | if !current_suffix.is_wildcard { 65 | current_suffixes = ¤t_suffix.sub_suffixes; 66 | } 67 | last_suffix.replace(current_suffix); 68 | suffix_part = &domain[dot_index + 1..]; 69 | last_dot_index = dot_index; 70 | } else { 71 | suffix_part = &domain[dot_index + 1..]; 72 | let leftover_part = &domain[0..dot_index]; 73 | match leftover_part.rsplit_once('.') { 74 | Some((subdomain_part, domain_part)) => { 75 | if subdomain_part.ends_with('.') { 76 | return Err(PyValueError::new_err("Invalid domain detected")); 77 | } 78 | return Ok((suffix_part, domain_part, subdomain_part)); 79 | } 80 | None => { 81 | return Ok((suffix_part, leftover_part, "")); 82 | } 83 | } 84 | } 85 | } 86 | if let Some(current_suffix) = current_suffixes.get(current_fraction) { 87 | in_wildcard_tld = current_suffix.is_wildcard; 88 | 89 | current_suffixes = ¤t_suffix.sub_suffixes; 90 | last_suffix.replace(current_suffix); 91 | suffix_part = &domain[dot_index + 1..]; 92 | last_dot_index = dot_index; 93 | } else { 94 | let leftover_part = &domain[0..last_dot_index]; 95 | match leftover_part.rsplit_once('.') { 96 | Some((subdomain_part, domain_part)) => { 97 | if subdomain_part.ends_with('.') { 98 | return Err(PyValueError::new_err("Invalid domain detected")); 99 | } 100 | return Ok((suffix_part, domain_part, subdomain_part)); 101 | } 102 | None 
=> { 103 | return Ok((suffix_part, leftover_part, "")); 104 | } 105 | }; 106 | } 107 | } 108 | 109 | let current_fraction = &domain[0..last_dot_index]; 110 | if in_wildcard_tld { 111 | if last_suffix.unwrap().sub_blacklist.contains(current_fraction) { 112 | Ok((suffix_part, current_fraction, "")) 113 | } else { 114 | Ok((domain, "", "")) 115 | } 116 | } else if current_suffixes.len() > 0 && current_suffixes.contains_key(current_fraction) { 117 | Ok((domain, "", "")) 118 | } else { 119 | Ok((suffix_part, current_fraction, "")) 120 | } 121 | } 122 | 123 | fn extract( 124 | &self, 125 | py: Python, 126 | domain: &PyString, 127 | ) -> PyResult { 128 | if domain.len().unwrap() > 255 { 129 | return Err(PyValueError::new_err("Invalid domain detected")); 130 | } 131 | 132 | let mut domain_string = unsafe { 133 | DomainString::from_str_unchecked(domain.to_string_lossy().as_ref()) 134 | }; 135 | domain_string.make_ascii_lowercase(); 136 | 137 | let (suffix_part, domain_part, subdomain_part) = self.parse_domain_parts(domain_string.as_str())?; 138 | 139 | unsafe { 140 | let dict = pyo3::ffi::PyDict_New(); 141 | for (fraction_key, fraction) in [ 142 | (intern!(py, "suffix").into_ptr(), suffix_part), 143 | (intern!(py, "domain").into_ptr(), domain_part), 144 | (intern!(py, "subdomain").into_ptr(), subdomain_part), 145 | ] { 146 | if !fraction.is_empty() { 147 | let substr = pyo3::ffi::PyUnicode_FromStringAndSize( 148 | fraction.as_ptr() as *const c_char, 149 | fraction.len() as isize, 150 | ); 151 | 152 | pyo3::ffi::PyDict_SetItem( 153 | dict, 154 | fraction_key, 155 | substr, 156 | ); 157 | pyo3::ffi::Py_DECREF(substr); 158 | } else { 159 | pyo3::ffi::PyDict_SetItem( 160 | dict, 161 | fraction_key, 162 | intern!(py, "").into_ptr(), 163 | ); 164 | } 165 | } 166 | 167 | Ok(pyo3::PyObject::from_owned_ptr(py, dict)) 168 | } 169 | } 170 | 171 | fn is_valid_domain( 172 | &self, 173 | domain: &PyString, 174 | ) -> bool { 175 | let domain_len = domain.len().unwrap(); 176 | if 
domain_len == 0 || domain_len > 255 { 177 | return false; 178 | } 179 | 180 | let mut domain_string = unsafe { 181 | DomainString::from_str_unchecked(domain.to_string_lossy().as_ref()) 182 | }; 183 | 184 | for fraction in domain_string.split('.') { 185 | if fraction.len() > 63 || fraction.is_empty() { 186 | return false; 187 | } 188 | if fraction.starts_with('-') || fraction.ends_with('-') { 189 | return false; 190 | } 191 | 192 | for ch in fraction.chars() { 193 | if !ch.is_alphanumeric() && ch != '-' { 194 | return false; 195 | } 196 | } 197 | } 198 | 199 | domain_string.make_ascii_lowercase(); 200 | if let Ok((suffix_part, domain_part, _subdomain_part)) = self.parse_domain_parts(domain_string.as_str()) { 201 | if suffix_part.is_empty() || domain_part.is_empty() { 202 | return false; 203 | } 204 | 205 | if idna::domain_to_ascii(domain_string.as_str()).is_err() { 206 | return false; 207 | } 208 | if idna::domain_to_unicode(domain_string.as_str()).1.is_err() { 209 | return false; 210 | } 211 | 212 | true 213 | } else { 214 | false 215 | } 216 | } 217 | 218 | fn get_tld_list( 219 | &self, 220 | ) -> Vec { 221 | self.tld_list.clone() 222 | } 223 | 224 | fn extract_from_url( 225 | &self, 226 | py: Python, 227 | url: &PyString, 228 | ) -> PyResult { 229 | let mut url_str = url.to_str().unwrap(); 230 | 231 | match memchr::memmem::find(url_str.as_bytes(), b"//") { 232 | Some(scheme_separator_position) => { 233 | url_str = &url_str[scheme_separator_position + 2..]; 234 | }, 235 | None => return Err( 236 | PyValueError::new_err("url is invalid: no scheme") 237 | ), 238 | }; 239 | 240 | if let Some(path_separator) = memchr::memchr(b'/', url_str.as_bytes()) { 241 | url_str = &url_str[..path_separator]; 242 | }; 243 | 244 | if let Some(authentication_separator) = memchr::memchr(b'@', url_str.as_bytes()) { 245 | url_str = &url_str[authentication_separator + 1..]; 246 | }; 247 | 248 | if let Some(port_separator) = memchr::memchr(b':', url_str.as_bytes()) { 249 | url_str = 
&url_str[..port_separator]; 250 | }; 251 | 252 | if url_str.is_empty() { 253 | return Err( 254 | PyValueError::new_err("url does not contain a domain") 255 | ); 256 | } 257 | 258 | if url_str.len() > 255 { 259 | return Err(PyValueError::new_err("url is invalid: too long")); 260 | } 261 | let mut domain_string = unsafe { 262 | DomainString::from_str_unchecked(url_str) 263 | }; 264 | domain_string.make_ascii_lowercase(); 265 | 266 | let (suffix_part, domain_part, subdomain_part) = self.parse_domain_parts(domain_string.as_str())?; 267 | 268 | unsafe { 269 | let dict = pyo3::ffi::PyDict_New(); 270 | for (fraction_key, fraction) in [ 271 | (intern!(py, "suffix").into_ptr(), suffix_part), 272 | (intern!(py, "domain").into_ptr(), domain_part), 273 | (intern!(py, "subdomain").into_ptr(), subdomain_part), 274 | ] { 275 | if !fraction.is_empty() { 276 | let substr = pyo3::ffi::PyUnicode_FromStringAndSize( 277 | fraction.as_ptr() as *const c_char, 278 | fraction.len() as isize, 279 | ); 280 | 281 | pyo3::ffi::PyDict_SetItem( 282 | dict, 283 | fraction_key, 284 | substr, 285 | ); 286 | pyo3::ffi::Py_DECREF(substr); 287 | } else { 288 | pyo3::ffi::PyDict_SetItem( 289 | dict, 290 | fraction_key, 291 | intern!(py, "").into_ptr(), 292 | ); 293 | } 294 | } 295 | 296 | Ok(pyo3::PyObject::from_owned_ptr(py, dict)) 297 | } 298 | } 299 | } 300 | 301 | fn parse_suffix_list( 302 | suffixes_list: &str, 303 | ) -> (AHashMap, Vec) { 304 | let mut suffixes = AHashMap::new(); 305 | let mut tld_list = Vec::new(); 306 | 307 | for line in suffixes_list.lines().map( 308 | |line| line.to_ascii_lowercase() 309 | ) { 310 | if line.starts_with("//") || line.is_empty() { 311 | continue; 312 | } 313 | 314 | let mut tlds = vec![line.clone()]; 315 | if !line.is_ascii() { 316 | tlds.push(idna::domain_to_ascii(&line).unwrap()); 317 | } 318 | for tld in tlds { 319 | tld_list.push(tld.clone()); 320 | 321 | let fractions: Vec = tld.rsplit('.').map( 322 | |s| s.to_string() 323 | ).collect(); 324 | let mut 
// NOTE(review): chunk starts mid-statement inside `parse_suffix_list` — the
// `let mut` binding begins on the previous chunk's last line.
            current_suffix = suffixes.entry(fractions.first().unwrap().to_owned()).or_insert(
                Suffix {
                    sub_suffixes: AHashMap::new(),
                    is_wildcard: false,
                    sub_blacklist: AHashSet::new(),
                }
            );

            // Descend/extend the trie for the remaining labels (right-to-left).
            for fraction in fractions[1..].iter() {
                if fraction.starts_with('!') {
                    // "!" rules blacklist a label beneath a wildcard suffix.
                    current_suffix.sub_blacklist.insert(fraction.strip_prefix('!').unwrap().to_string());
                } else if fraction == "*" {
                    current_suffix.is_wildcard = true;
                } else {
                    current_suffix = current_suffix.sub_suffixes.entry(fraction.clone()).or_insert(
                        Suffix {
                            sub_suffixes: AHashMap::new(),
                            is_wildcard: false,
                            sub_blacklist: AHashSet::new(),
                        }
                    );
                }
            }
        }
    }

    (suffixes, tld_list)
}

/// Python module entry point: exposes the DomainExtractor class.
#[pymodule]
fn pydomainextractor(
    _py: Python,
    m: &PyModule,
) -> PyResult<()> {
    m.add_class::<DomainExtractor>()?;  // NOTE(review): turbofish stripped by the dump; the only #[pyclass] in view is DomainExtractor
    Ok(())
}
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyDomainExtractor/a59d365effa56872235d3ffa0e7a1367065fa3e6/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_pydomainextractor.py:
--------------------------------------------------------------------------------
import unittest
import unittest.mock

import pydomainextractor


class DomainExtractorExtractionTestCase(unittest.TestCase):
    """Behavioral tests for DomainExtractor extraction and validation."""

    def setUp(self):
        self.domain_extractor = pydomainextractor.DomainExtractor()

    def test_extract_only_tld(self):
        self.assertEqual(
            self.domain_extractor.extract('com'),
            {'domain': '', 'subdomain': '', 'suffix': 'com'},
        )
        # NOTE(review): method continues in the next chunk.
first=self.domain_extractor.extract('jp.net'), 29 | second={ 30 | 'domain': '', 31 | 'subdomain': '', 32 | 'suffix': 'jp.net', 33 | }, 34 | ) 35 | 36 | self.assertEqual( 37 | first=self.domain_extractor.extract('鹿児島.jp'), 38 | second={ 39 | 'domain': '', 40 | 'subdomain': '', 41 | 'suffix': '鹿児島.jp', 42 | }, 43 | ) 44 | 45 | self.assertEqual( 46 | first=self.domain_extractor.extract('香格里拉'), 47 | second={ 48 | 'domain': '', 49 | 'subdomain': '', 50 | 'suffix': '香格里拉', 51 | }, 52 | ) 53 | 54 | self.assertEqual( 55 | first=self.domain_extractor.extract('xn--32vp30h.jp'), 56 | second={ 57 | 'domain': '', 58 | 'subdomain': '', 59 | 'suffix': 'xn--32vp30h.jp', 60 | }, 61 | ) 62 | 63 | def test_extract_only_domain( 64 | self, 65 | ): 66 | self.assertEqual( 67 | first=self.domain_extractor.extract('nonexistenttld'), 68 | second={ 69 | 'domain': 'nonexistenttld', 70 | 'subdomain': '', 71 | 'suffix': '', 72 | }, 73 | ) 74 | 75 | self.assertEqual( 76 | first=self.domain_extractor.extract('香格里拉香格里拉香格里拉'), 77 | second={ 78 | 'domain': '香格里拉香格里拉香格里拉', 79 | 'subdomain': '', 80 | 'suffix': '', 81 | }, 82 | ) 83 | 84 | def test_extract_only_domain_and_subdomain( 85 | self, 86 | ): 87 | self.assertEqual( 88 | first=self.domain_extractor.extract('nonexistenttld.nonexistenttld'), 89 | second={ 90 | 'domain': 'nonexistenttld', 91 | 'subdomain': 'nonexistenttld', 92 | 'suffix': '', 93 | }, 94 | ) 95 | 96 | self.assertEqual( 97 | first=self.domain_extractor.extract('香格里拉香格里拉香格里拉.nonexistenttld'), 98 | second={ 99 | 'domain': 'nonexistenttld', 100 | 'subdomain': '香格里拉香格里拉香格里拉', 101 | 'suffix': '', 102 | }, 103 | ) 104 | 105 | def test_extract_all_parts( 106 | self, 107 | ): 108 | self.assertEqual( 109 | first=self.domain_extractor.extract('google.com'), 110 | second={ 111 | 'domain': 'google', 112 | 'subdomain': '', 113 | 'suffix': 'com', 114 | }, 115 | ) 116 | 117 | self.assertEqual( 118 | first=self.domain_extractor.extract('subdomain.google.com'), 119 | second={ 120 | 'domain': 
'google', 121 | 'subdomain': 'subdomain', 122 | 'suffix': 'com', 123 | }, 124 | ) 125 | 126 | self.assertEqual( 127 | first=self.domain_extractor.extract('subsubdomain.subdomain.google.com'), 128 | second={ 129 | 'domain': 'google', 130 | 'subdomain': 'subsubdomain.subdomain', 131 | 'suffix': 'com', 132 | }, 133 | ) 134 | 135 | self.assertEqual( 136 | first=self.domain_extractor.extract('subsubdomain.subdomain.google.香格里拉'), 137 | second={ 138 | 'domain': 'google', 139 | 'subdomain': 'subsubdomain.subdomain', 140 | 'suffix': '香格里拉', 141 | }, 142 | ) 143 | 144 | self.assertEqual( 145 | first=self.domain_extractor.extract('subsubdomain.subdomain.google.鹿児島.jp'), 146 | second={ 147 | 'domain': 'google', 148 | 'subdomain': 'subsubdomain.subdomain', 149 | 'suffix': '鹿児島.jp', 150 | }, 151 | ) 152 | 153 | self.assertEqual( 154 | first=self.domain_extractor.extract('subsubdomain.subdomain.google.xn--32vp30h.jp'), 155 | second={ 156 | 'domain': 'google', 157 | 'subdomain': 'subsubdomain.subdomain', 158 | 'suffix': 'xn--32vp30h.jp', 159 | }, 160 | ) 161 | 162 | def test_special_cases( 163 | self, 164 | ): 165 | self.assertEqual( 166 | first=self.domain_extractor.extract('bla.ck'), 167 | second={ 168 | 'domain': '', 169 | 'subdomain': '', 170 | 'suffix': 'bla.ck', 171 | }, 172 | ) 173 | 174 | self.assertEqual( 175 | first=self.domain_extractor.extract('a.bla.ck'), 176 | second={ 177 | 'domain': 'a', 178 | 'subdomain': '', 179 | 'suffix': 'bla.ck', 180 | }, 181 | ) 182 | 183 | self.assertEqual( 184 | first=self.domain_extractor.extract('a.b.bla.ck'), 185 | second={ 186 | 'domain': 'b', 187 | 'subdomain': 'a', 188 | 'suffix': 'bla.ck', 189 | }, 190 | ) 191 | 192 | self.assertEqual( 193 | first=self.domain_extractor.extract('www.ck'), 194 | second={ 195 | 'domain': 'www', 196 | 'subdomain': '', 197 | 'suffix': 'ck', 198 | }, 199 | ) 200 | 201 | self.assertEqual( 202 | first=self.domain_extractor.extract('a.www.ck'), 203 | second={ 204 | 'domain': 'www', 205 | 'subdomain': 'a', 
206 | 'suffix': 'ck', 207 | }, 208 | ) 209 | 210 | self.assertEqual( 211 | first=self.domain_extractor.extract('a.bzz.dapps.earth'), 212 | second={ 213 | 'domain': '', 214 | 'subdomain': '', 215 | 'suffix': 'a.bzz.dapps.earth', 216 | }, 217 | ) 218 | 219 | self.assertEqual( 220 | first=self.domain_extractor.extract('a.b.bzz.dapps.earth'), 221 | second={ 222 | 'domain': 'a', 223 | 'subdomain': '', 224 | 'suffix': 'b.bzz.dapps.earth', 225 | }, 226 | ) 227 | 228 | self.assertEqual( 229 | first=self.domain_extractor.extract('domain.co.za'), 230 | second={ 231 | 'domain': 'domain', 232 | 'subdomain': '', 233 | 'suffix': 'co.za', 234 | }, 235 | ) 236 | 237 | def test_upper_case( 238 | self, 239 | ): 240 | self.assertEqual( 241 | first=self.domain_extractor.extract('domain.Com'), 242 | second={ 243 | 'domain': 'domain', 244 | 'subdomain': '', 245 | 'suffix': 'com', 246 | }, 247 | ) 248 | 249 | self.assertEqual( 250 | first=self.domain_extractor.extract('DOmain.Com'), 251 | second={ 252 | 'domain': 'domain', 253 | 'subdomain': '', 254 | 'suffix': 'com', 255 | }, 256 | ) 257 | 258 | self.assertEqual( 259 | first=self.domain_extractor.extract('DOmain.COM'), 260 | second={ 261 | 'domain': 'domain', 262 | 'subdomain': '', 263 | 'suffix': 'com', 264 | }, 265 | ) 266 | 267 | self.assertEqual( 268 | first=self.domain_extractor.extract('a.b.bla.CK'), 269 | second={ 270 | 'domain': 'b', 271 | 'subdomain': 'a', 272 | 'suffix': 'bla.ck', 273 | }, 274 | ) 275 | 276 | def test_syntactic_invalid_domains( 277 | self, 278 | ): 279 | with self.assertRaises( 280 | expected_exception=ValueError, 281 | ): 282 | self.domain_extractor.extract('.com') 283 | 284 | with self.assertRaises( 285 | expected_exception=ValueError, 286 | ): 287 | self.domain_extractor.extract('domain..com') 288 | 289 | with self.assertRaises( 290 | expected_exception=ValueError, 291 | ): 292 | self.domain_extractor.extract('sub..domain.com') 293 | 294 | with self.assertRaises( 295 | expected_exception=ValueError, 296 | 
): 297 | self.domain_extractor.extract('domain.com.') 298 | 299 | with self.assertRaises( 300 | expected_exception=ValueError, 301 | ): 302 | self.domain_extractor.extract('com.') 303 | 304 | def test_domain_too_long( 305 | self, 306 | ): 307 | with self.assertRaises( 308 | expected_exception=ValueError, 309 | ): 310 | self.domain_extractor.extract(f'{"very-long" * 255}.com') 311 | 312 | def test_extract_from_url( 313 | self, 314 | ): 315 | with self.assertRaises( 316 | ValueError, 317 | ): 318 | self.domain_extractor.extract_from_url('http://www.example.com./') 319 | 320 | with self.assertRaises( 321 | ValueError, 322 | ): 323 | self.domain_extractor.extract_from_url('mail.google.com/mail') 324 | 325 | with self.assertRaises( 326 | ValueError, 327 | ): 328 | self.domain_extractor.extract_from_url('xn--gieen46ers-73a.de') 329 | 330 | with self.assertRaises( 331 | ValueError, 332 | ): 333 | self.domain_extractor.extract_from_url('http://') 334 | 335 | with self.assertRaises( 336 | ValueError, 337 | ): 338 | self.domain_extractor.extract_from_url('xn--tub-1m9d15sfkkhsifsbqygyujjrw602gk4li5qqk98aca0w.google.com') 339 | 340 | with self.assertRaises( 341 | ValueError, 342 | ): 343 | self.domain_extractor.extract_from_url('xn--tub-1m9d15sfkkhsifsbqygyujjrw60.google.com') 344 | 345 | with self.assertRaises( 346 | ValueError, 347 | ): 348 | self.domain_extractor.extract_from_url('1\xe9') 349 | 350 | with self.assertRaises( 351 | ValueError, 352 | ): 353 | self.domain_extractor.extract_from_url('com') 354 | 355 | with self.assertRaises( 356 | ValueError, 357 | ): 358 | self.domain_extractor.extract_from_url('co.uk') 359 | 360 | with self.assertRaises( 361 | ValueError, 362 | ): 363 | self.domain_extractor.extract_from_url(f'http://{"domain" * 255}co.uk:3030/some/path') 364 | 365 | self.assertEqual( 366 | first=self.domain_extractor.extract_from_url('http://www.google.com'), 367 | second={ 368 | 'subdomain': 'www', 369 | 'domain': 'google', 370 | 'suffix': 'com', 371 | }, 
372 | ) 373 | self.assertEqual( 374 | first=self.domain_extractor.extract_from_url('http://www.theregister.co.uk'), 375 | second={ 376 | 'subdomain': 'www', 377 | 'domain': 'theregister', 378 | 'suffix': 'co.uk', 379 | }, 380 | ) 381 | self.assertEqual( 382 | first=self.domain_extractor.extract_from_url('http://gmail.com'), 383 | second={ 384 | 'subdomain': '', 385 | 'domain': 'gmail', 386 | 'suffix': 'com', 387 | }, 388 | ) 389 | self.assertEqual( 390 | first=self.domain_extractor.extract_from_url('http://media.forums.theregister.co.uk'), 391 | second={ 392 | 'subdomain': 'media.forums', 393 | 'domain': 'theregister', 394 | 'suffix': 'co.uk', 395 | }, 396 | ) 397 | self.assertEqual( 398 | first=self.domain_extractor.extract_from_url('http://www.www.com'), 399 | second={ 400 | 'subdomain': 'www', 401 | 'domain': 'www', 402 | 'suffix': 'com', 403 | }, 404 | ) 405 | self.assertEqual( 406 | first=self.domain_extractor.extract_from_url('http://www.com'), 407 | second={ 408 | 'subdomain': '', 409 | 'domain': 'www', 410 | 'suffix': 'com', 411 | }, 412 | ) 413 | self.assertEqual( 414 | first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname/'), 415 | second={ 416 | 'subdomain': '', 417 | 'domain': 'internalunlikelyhostname', 418 | 'suffix': '', 419 | }, 420 | ) 421 | self.assertEqual( 422 | first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname.bizarre'), 423 | second={ 424 | 'subdomain': 'internalunlikelyhostname', 425 | 'domain': 'bizarre', 426 | 'suffix': '', 427 | }, 428 | ) 429 | self.assertEqual( 430 | first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname.info/'), 431 | second={ 432 | 'subdomain': '', 433 | 'domain': 'internalunlikelyhostname', 434 | 'suffix': 'info', 435 | }, 436 | ) 437 | self.assertEqual( 438 | first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname.information/'), 439 | second={ 440 | 'subdomain': 'internalunlikelyhostname', 441 | 'domain': 'information', 
442 | 'suffix': '', 443 | }, 444 | ) 445 | self.assertEqual( 446 | first=self.domain_extractor.extract_from_url('http://216.22.0.192/'), 447 | second={ 448 | 'subdomain': '216.22.0', 449 | 'domain': '192', 450 | 'suffix': '', 451 | }, 452 | ) 453 | self.assertEqual( 454 | first=self.domain_extractor.extract_from_url('http://216.22.project.coop/'), 455 | second={ 456 | 'subdomain': '216.22', 457 | 'domain': 'project', 458 | 'suffix': 'coop', 459 | }, 460 | ) 461 | self.assertEqual( 462 | first=self.domain_extractor.extract_from_url('http://xn--h1alffa9f.xn--p1ai'), 463 | second={ 464 | 'subdomain': '', 465 | 'domain': 'xn--h1alffa9f', 466 | 'suffix': 'xn--p1ai', 467 | }, 468 | ) 469 | self.assertEqual( 470 | first=self.domain_extractor.extract_from_url('http://xN--h1alffa9f.xn--p1ai'), 471 | second={ 472 | 'subdomain': '', 473 | 'domain': 'xn--h1alffa9f', 474 | 'suffix': 'xn--p1ai', 475 | }, 476 | ) 477 | self.assertEqual( 478 | first=self.domain_extractor.extract_from_url('http://XN--h1alffa9f.xn--p1ai'), 479 | second={ 480 | 'subdomain': '', 481 | 'domain': 'xn--h1alffa9f', 482 | 'suffix': 'xn--p1ai', 483 | }, 484 | ) 485 | self.assertEqual( 486 | first=self.domain_extractor.extract_from_url('http://xn--zckzap6140b352by.blog.so-net.xn--wcvs22d.hk'), 487 | second={ 488 | 'subdomain': 'xn--zckzap6140b352by.blog', 489 | 'domain': 'so-net', 490 | 'suffix': 'xn--wcvs22d.hk', 491 | }, 492 | ) 493 | self.assertEqual( 494 | first=self.domain_extractor.extract_from_url('http://xn--zckzap6140b352by.blog.so-net.教育.hk'), 495 | second={ 496 | 'subdomain': 'xn--zckzap6140b352by.blog', 497 | 'domain': 'so-net', 498 | 'suffix': '教育.hk', 499 | }, 500 | ) 501 | self.assertEqual( 502 | first=self.domain_extractor.extract_from_url('https://mail.google.com/mail'), 503 | second={ 504 | 'subdomain': 'mail', 505 | 'domain': 'google', 506 | 'suffix': 'com', 507 | }, 508 | ) 509 | self.assertEqual( 510 | first=self.domain_extractor.extract_from_url('ssh://mail.google.com/mail'), 511 | 
second={ 512 | 'subdomain': 'mail', 513 | 'domain': 'google', 514 | 'suffix': 'com', 515 | }, 516 | ) 517 | self.assertEqual( 518 | first=self.domain_extractor.extract_from_url('git+ssh://www.github.com:8443/'), 519 | second={ 520 | 'subdomain': 'www', 521 | 'domain': 'github', 522 | 'suffix': 'com', 523 | }, 524 | ) 525 | self.assertEqual( 526 | first=self.domain_extractor.extract_from_url('ftp://johndoe:5cr1p7k1dd13@1337.warez.com:2501'), 527 | second={ 528 | 'subdomain': '1337', 529 | 'domain': 'warez', 530 | 'suffix': 'com', 531 | }, 532 | ) 533 | self.assertEqual( 534 | first=self.domain_extractor.extract_from_url('http://google.com/?q=cats'), 535 | second={ 536 | 'subdomain': '', 537 | 'domain': 'google', 538 | 'suffix': 'com', 539 | }, 540 | ) 541 | self.assertEqual( 542 | first=self.domain_extractor.extract_from_url('http://google.com/#Welcome'), 543 | second={ 544 | 'subdomain': '', 545 | 'domain': 'google', 546 | 'suffix': 'com', 547 | }, 548 | ) 549 | self.assertEqual( 550 | first=self.domain_extractor.extract_from_url('http://google.com/#Welcome'), 551 | second={ 552 | 'subdomain': '', 553 | 'domain': 'google', 554 | 'suffix': 'com', 555 | }, 556 | ) 557 | self.assertEqual( 558 | first=self.domain_extractor.extract_from_url('http://google.com/s#Welcome'), 559 | second={ 560 | 'subdomain': '', 561 | 'domain': 'google', 562 | 'suffix': 'com', 563 | }, 564 | ) 565 | self.assertEqual( 566 | first=self.domain_extractor.extract_from_url('http://google.com/s?q=cats#Welcome'), 567 | second={ 568 | 'subdomain': '', 569 | 'domain': 'google', 570 | 'suffix': 'com', 571 | }, 572 | ) 573 | self.assertEqual( 574 | first=self.domain_extractor.extract_from_url('http://www.parliament.uk'), 575 | second={ 576 | 'subdomain': 'www', 577 | 'domain': 'parliament', 578 | 'suffix': 'uk', 579 | }, 580 | ) 581 | self.assertEqual( 582 | first=self.domain_extractor.extract_from_url('http://www.parliament.co.uk'), 583 | second={ 584 | 'subdomain': 'www', 585 | 'domain': 
'parliament', 586 | 'suffix': 'co.uk', 587 | }, 588 | ) 589 | self.assertEqual( 590 | first=self.domain_extractor.extract_from_url('http://www.cgs.act.edu.au/'), 591 | second={ 592 | 'subdomain': 'www', 593 | 'domain': 'cgs', 594 | 'suffix': 'act.edu.au', 595 | }, 596 | ) 597 | self.assertEqual( 598 | first=self.domain_extractor.extract_from_url('http://www.google.com.au/'), 599 | second={ 600 | 'subdomain': 'www', 601 | 'domain': 'google', 602 | 'suffix': 'com.au', 603 | }, 604 | ) 605 | self.assertEqual( 606 | first=self.domain_extractor.extract_from_url('http://www.metp.net.cn'), 607 | second={ 608 | 'subdomain': 'www', 609 | 'domain': 'metp', 610 | 'suffix': 'net.cn', 611 | }, 612 | ) 613 | self.assertEqual( 614 | first=self.domain_extractor.extract_from_url('http://waiterrant.blogspot.com'), 615 | second={ 616 | 'subdomain': '', 617 | 'domain': 'waiterrant', 618 | 'suffix': 'blogspot.com', 619 | }, 620 | ) 621 | self.assertEqual( 622 | first=self.domain_extractor.extract_from_url('http://127.0.0.1/foo/bar'), 623 | second={ 624 | 'subdomain': '127.0.0', 625 | 'domain': '1', 626 | 'suffix': '', 627 | }, 628 | ) 629 | self.assertEqual( 630 | first=self.domain_extractor.extract_from_url('http://256.256.256.256/foo/bar'), 631 | second={ 632 | 'subdomain': '256.256.256', 633 | 'domain': '256', 634 | 'suffix': '', 635 | }, 636 | ) 637 | self.assertEqual( 638 | first=self.domain_extractor.extract_from_url('http://127.0.0.1.9/foo/bar'), 639 | second={ 640 | 'subdomain': '127.0.0.1', 641 | 'domain': '9', 642 | 'suffix': '', 643 | }, 644 | ) 645 | self.assertEqual( 646 | first=self.domain_extractor.extract_from_url('http://admin:password1@www.google.com:666/secret/admin/interface?param1=42'), 647 | second={ 648 | 'subdomain': 'www', 649 | 'domain': 'google', 650 | 'suffix': 'com', 651 | }, 652 | ) 653 | self.assertEqual( 654 | first=self.domain_extractor.extract_from_url('//admin:password1@www.google.com:666/secret/admin/interface?param1=42'), 655 | second={ 656 | 
'subdomain': 'www', 657 | 'domain': 'google', 658 | 'suffix': 'com', 659 | }, 660 | ) 661 | self.assertEqual( 662 | first=self.domain_extractor.extract_from_url('//mail.google.com/mail'), 663 | second={ 664 | 'subdomain': 'mail', 665 | 'domain': 'google', 666 | 'suffix': 'com', 667 | }, 668 | ) 669 | self.assertEqual( 670 | first=self.domain_extractor.extract_from_url('http://test.nu'), 671 | second={ 672 | 'subdomain': '', 673 | 'domain': 'test', 674 | 'suffix': 'nu', 675 | }, 676 | ) 677 | 678 | def test_is_valid_domain( 679 | self, 680 | ): 681 | self.assertTrue( 682 | expr=self.domain_extractor.is_valid_domain('domain.com'), 683 | ) 684 | self.assertTrue( 685 | expr=self.domain_extractor.is_valid_domain('sub.domain.com'), 686 | ) 687 | self.assertTrue( 688 | expr=self.domain_extractor.is_valid_domain('domain.COM'), 689 | ) 690 | self.assertTrue( 691 | expr=self.domain_extractor.is_valid_domain('domain.co.il'), 692 | ) 693 | self.assertTrue( 694 | expr=self.domain_extractor.is_valid_domain('domain.co.za'), 695 | ) 696 | self.assertFalse( 697 | expr=self.domain_extractor.is_valid_domain('domain.invalid'), 698 | ) 699 | self.assertFalse( 700 | expr=self.domain_extractor.is_valid_domain('com'), 701 | ) 702 | self.assertFalse( 703 | expr=self.domain_extractor.is_valid_domain('com'), 704 | ) 705 | self.assertFalse( 706 | expr=self.domain_extractor.is_valid_domain('-domain.com'), 707 | ) 708 | self.assertFalse( 709 | expr=self.domain_extractor.is_valid_domain('domain-.com'), 710 | ) 711 | self.assertFalse( 712 | expr=self.domain_extractor.is_valid_domain('-sub.domain.com'), 713 | ) 714 | self.assertFalse( 715 | expr=self.domain_extractor.is_valid_domain('sub-.domain.com'), 716 | ) 717 | 718 | self.assertTrue( 719 | expr=self.domain_extractor.is_valid_domain('domain.xn--mgbaakc7dvf'), 720 | ) 721 | self.assertTrue( 722 | expr=self.domain_extractor.is_valid_domain('domain.اتصالات'), 723 | ) 724 | self.assertTrue( 725 | 
expr=self.domain_extractor.is_valid_domain('xn--mgbaakc7dvf.com'), 726 | ) 727 | self.assertTrue( 728 | expr=self.domain_extractor.is_valid_domain('اتصالات.com'), 729 | ) 730 | self.assertTrue( 731 | expr=self.domain_extractor.is_valid_domain('اتصالات.اتصالات'), 732 | ) 733 | self.assertTrue( 734 | expr=self.domain_extractor.is_valid_domain('xn--mgbaakc7dvf.xn--mgbaakc7dvf'), 735 | ) 736 | 737 | self.assertFalse( 738 | expr=self.domain_extractor.is_valid_domain('domain.xn--mgbaakc7dvfa'), 739 | ) 740 | self.assertFalse( 741 | expr=self.domain_extractor.is_valid_domain('domain.اsتصالات'), 742 | ) 743 | self.assertFalse( 744 | expr=self.domain_extractor.is_valid_domain('xn--mgbaaskc7777dvf.com'), 745 | ) 746 | self.assertFalse( 747 | expr=self.domain_extractor.is_valid_domain('اتصالsات.com'), 748 | ) 749 | self.assertFalse( 750 | expr=self.domain_extractor.is_valid_domain('اتصالاsت.اتصالات'), 751 | ) 752 | self.assertFalse( 753 | expr=self.domain_extractor.is_valid_domain('xn--mgbsaadddd1212121212kc7dvf.xn--mgbaakc7dvf'), 754 | ) 755 | 756 | self.assertFalse( 757 | expr=self.domain_extractor.is_valid_domain('\xF0\x9F\x98\x81nonalphanum.com'), 758 | ) 759 | 760 | self.assertFalse( 761 | expr=self.domain_extractor.is_valid_domain('.com'), 762 | ) 763 | self.assertFalse( 764 | expr=self.domain_extractor.is_valid_domain('domain..com'), 765 | ) 766 | self.assertFalse( 767 | expr=self.domain_extractor.is_valid_domain('sub..domain.com'), 768 | ) 769 | self.assertFalse( 770 | expr=self.domain_extractor.is_valid_domain('domain.com.'), 771 | ) 772 | self.assertFalse( 773 | expr=self.domain_extractor.is_valid_domain('com.'), 774 | ) 775 | 776 | def test_mutability( 777 | self, 778 | ): 779 | domain_to_test_original = 'Google.COM' 780 | domain_to_test = 'Google.COM' 781 | 782 | self.domain_extractor.is_valid_domain(domain_to_test) 783 | self.assertEqual( 784 | first=domain_to_test_original, 785 | second=domain_to_test, 786 | ) 787 | 788 | 
self.domain_extractor.extract(domain_to_test) 789 | self.assertEqual( 790 | first=domain_to_test_original, 791 | second=domain_to_test, 792 | ) 793 | 794 | url_to_test_original = 'http://Google.COM/A.php?Bla=true' 795 | url_to_test = 'http://Google.COM/A.php?Bla=true' 796 | self.domain_extractor.extract_from_url(url_to_test) 797 | self.assertEqual( 798 | first=url_to_test_original, 799 | second=url_to_test, 800 | ) 801 | 802 | 803 | class DomainExtractorLoadTestCase( 804 | unittest.TestCase, 805 | ): 806 | def test_load_called_without_data( 807 | self, 808 | ): 809 | domain_extractor = pydomainextractor.DomainExtractor() 810 | 811 | self.assertEqual( 812 | first=domain_extractor.extract('com'), 813 | second={ 814 | 'subdomain': '', 815 | 'domain': '', 816 | 'suffix': 'com', 817 | }, 818 | ) 819 | 820 | def test_load_called_with_data( 821 | self, 822 | ): 823 | domain_extractor = pydomainextractor.DomainExtractor( 824 | 'com\n' 825 | ) 826 | 827 | self.assertEqual( 828 | first=domain_extractor.extract('com'), 829 | second={ 830 | 'subdomain': '', 831 | 'domain': '', 832 | 'suffix': 'com', 833 | }, 834 | ) 835 | 836 | domain_extractor = pydomainextractor.DomainExtractor( 837 | 'net\n' 838 | ) 839 | 840 | self.assertEqual( 841 | first=domain_extractor.extract('com'), 842 | second={ 843 | 'subdomain': '', 844 | 'domain': 'com', 845 | 'suffix': '', 846 | }, 847 | ) 848 | 849 | domain_extractor = pydomainextractor.DomainExtractor( 850 | 'customtld\n' 851 | ) 852 | 853 | self.assertEqual( 854 | first=domain_extractor.extract('google.customtld'), 855 | second={ 856 | 'subdomain': '', 857 | 'domain': 'google', 858 | 'suffix': 'customtld', 859 | }, 860 | ) 861 | 862 | domain_extractor = pydomainextractor.DomainExtractor( 863 | 'tld\n' 864 | 'custom.tld\n' 865 | ) 866 | 867 | self.assertEqual( 868 | first=domain_extractor.extract('google.custom.tld'), 869 | second={ 870 | 'subdomain': '', 871 | 'domain': 'google', 872 | 'suffix': 'custom.tld', 873 | }, 874 | ) 875 | 876 | def 
test_get_tld_list( 877 | self, 878 | ): 879 | domain_extractor = pydomainextractor.DomainExtractor( 880 | 'com\n' 881 | ) 882 | 883 | self.assertEqual( 884 | first=domain_extractor.get_tld_list(), 885 | second=[ 886 | 'com', 887 | ], 888 | ) 889 | 890 | domain_extractor = pydomainextractor.DomainExtractor( 891 | 'com\n' 892 | 'net\n' 893 | 'org\n' 894 | 'uk.com\n' 895 | ) 896 | 897 | self.assertCountEqual( 898 | first=domain_extractor.get_tld_list(), 899 | second=[ 900 | 'com', 901 | 'net', 902 | 'org', 903 | 'uk.com', 904 | ], 905 | ) 906 | --------------------------------------------------------------------------------