├── .github
└── workflows
│ ├── build.yml
│ └── deploy.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── MANIFEST.in
├── README.md
├── benchmarks
├── domain_benchmark.py
└── url_benchmark.py
├── cortex.yaml
├── images
└── logo.png
├── pydomainextractor
├── __init__.py
└── pydomainextractor.pyi
├── pyproject.toml
├── src
├── lib.rs
└── public_suffix_list.dat
└── tests
├── __init__.py
└── test_pydomainextractor.py
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 | on:
3 | - push
4 | - pull_request
5 | jobs:
6 | lint:
7 | if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags')
8 | runs-on: ubuntu-latest
9 | steps:
10 | - name: Checkout
11 | uses: actions/checkout@v3
12 | - name: Install latest rust
13 | uses: actions-rs/toolchain@v1
14 | with:
15 | toolchain: stable
16 | profile: minimal
17 | override: true
18 | components: clippy
19 | - name: Lint with clippy
20 | uses: actions-rs/cargo@v1
21 | with:
22 | command: clippy
23 | args: --all-targets --all-features
24 | test:
25 | runs-on: ${{ matrix.os }}
26 | needs: lint
27 | strategy:
28 | fail-fast: false
29 | matrix:
30 | python-version:
31 | - '3.7'
32 | - '3.8'
33 | - '3.9'
34 | - '3.10'
35 | - '3.11'
36 | os:
37 | - ubuntu-latest
38 | - macos-latest
39 | - windows-latest
40 | steps:
41 | - name: Checkout
42 | uses: actions/checkout@v3
43 | - name: Set up Python ${{ matrix.python-version }}
44 | uses: actions/setup-python@v4
45 | with:
46 | python-version: ${{ matrix.python-version }}
47 | - name: Install Poetry
48 | uses: abatilo/actions-poetry@v2
49 | - name: Install Rust
50 | uses: actions-rs/toolchain@v1
51 | with:
52 | profile: minimal
53 | toolchain: stable
54 | override: true
55 | - name: Install dependencies
56 | run: poetry install
57 | - name: Build Python package
58 | run: poetry run maturin develop
59 | - name: Test
60 | run: poetry run pytest -Werror tests
61 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy
2 | on:
3 | release:
4 | types:
5 | - released
6 | jobs:
7 | deploy:
8 | runs-on: ${{ matrix.os }}
9 | strategy:
10 | fail-fast: false
11 | matrix:
12 | python-version:
13 | - "3.7"
14 | - "3.8"
15 | - "3.9"
16 | - "3.10"
17 | - "3.11"
18 | os:
19 | - ubuntu-latest
20 | - macos-latest
21 | - windows-latest
22 | steps:
23 | - name: Checkout
24 | uses: actions/checkout@v3
25 | - name: Set up Python ${{ matrix.python-version }}
26 | uses: actions/setup-python@v4
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 | - name: Install Rust
30 | uses: actions-rs/toolchain@v1
31 | with:
32 | profile: minimal
33 | toolchain: stable
34 | override: true
35 | - name: Install Cross-compilers (macOS)
36 | if: matrix.os == 'macos-latest'
37 | run: |
38 | rustup target add x86_64-apple-darwin
39 | rustup target add aarch64-apple-darwin
40 | - name: Install Cross-compilers (Linux)
41 | if: matrix.os == 'ubuntu-latest'
42 | run: |
43 | rustup target add aarch64-unknown-linux-gnu
44 | - name: Publish Package
45 | uses: PyO3/maturin-action@v1
46 | with:
47 | command: publish
48 | args: --username=__token__ ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.7' && '' || '--no-sdist' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }}
49 | env:
50 | MATURIN_PASSWORD: ${{ secrets.pypi_password }}
51 | if: matrix.os != 'macos-latest'
52 | - name: Publish macOS (x86_64) Package
53 | if: matrix.os == 'macos-latest'
54 | uses: PyO3/maturin-action@v1
55 | with:
56 | command: publish
57 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist
58 | env:
59 | MATURIN_PASSWORD: ${{ secrets.pypi_password }}
60 | - name: Publish macOS (arm64) Package
61 | if: matrix.os == 'macos-latest'
62 | uses: PyO3/maturin-action@v1
63 | with:
64 | command: publish
65 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist
66 | env:
67 | MATURIN_PASSWORD: ${{ secrets.pypi_password }}
68 | - name: Publish Linux (arm64) Package
69 | if: matrix.os == 'ubuntu-latest'
70 | uses: PyO3/maturin-action@v1
71 | with:
72 | command: publish
73 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-unknown-linux-gnu --no-sdist
74 | env:
75 | MATURIN_PASSWORD: ${{ secrets.pypi_password }}
76 | dist-source:
77 | runs-on: ubuntu-latest
78 | steps:
79 | - name: Distribute Source
80 | uses: PyO3/maturin-action@v1
81 | with:
82 | command: sdist
83 | env:
84 | MATURIN_PASSWORD: ${{ secrets.pypi_password }}
85 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Python ###
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 | .vscode/
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | cover/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # poetry
100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | poetry.lock
105 |
106 | # pdm
107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | # in version control.
111 | # https://pdm.fming.dev/#use-with-ide
112 | .pdm.toml
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | ### Rust ###
165 | # Generated by Cargo
166 | # will have compiled files and executables
167 | debug/
168 |
169 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
170 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
171 | Cargo.lock
172 |
173 | # These are backup files generated by rustfmt
174 | **/*.rs.bk
175 |
176 | # MSVC Windows builds of rustc generate these, which store debugging information
177 | *.pdb
178 |
179 | ### VisualStudioCode ###
180 | .vscode/*
181 | !.vscode/settings.json
182 | !.vscode/tasks.json
183 | !.vscode/launch.json
184 | !.vscode/extensions.json
185 | !.vscode/*.code-snippets
186 |
187 | # Local History for Visual Studio Code
188 | .history/
189 |
190 | # Built Visual Studio Code Extensions
191 | *.vsix
192 |
193 | ### VisualStudioCode Patch ###
194 | # Ignore all local history of files
195 | .history
196 | .ionide
197 |
198 | # Support for Project snippet scope
199 | .vscode/*.code-snippets
200 |
201 | # Ignore code-workspaces
202 | *.code-workspace
203 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "pydomainextractor"
3 | version = "0.13.10"
4 | authors = ["Viktor Vilskyi <viktor_vilskyi@rapid7.com>"]
5 | edition = "2021"
6 | repository = "https://github.com/intsights/pydomainextractor"
7 | homepage = "https://github.com/intsights/pydomainextractor"
8 | license = "MIT"
9 | keywords = [
10 | "domain",
11 | "extraction",
12 | "tld",
13 | "suffix",
14 | "psl",
15 | "rust",
16 | "pyo3",
17 | ]
18 |
19 | [lib]
20 | name = "pydomainextractor"
21 | crate-type = ["cdylib"]
22 |
23 | [dependencies]
24 | ahash = "0.8"
25 | idna = "0.3"
26 | memchr = "2"
27 | arraystring = "0.3.0"
28 | typenum = "1"
29 |
30 | [dependencies.pyo3]
31 | version = "0.17.3"
32 | features = ["extension-module"]
33 |
34 | [profile.release]
35 | lto = true
36 | panic = "abort"
37 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Gal Ben David
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include images/logo.png
3 | graft tests
4 | recursive-include pydomainextractor *.py *.pyi
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | A blazingly fast domain extraction library written in Rust
7 |
8 |
9 |
10 | 
11 | 
12 | 
13 | [](https://pypi.org/project/PyDomainExtractor/)
14 |
15 | ## Table of Contents
16 |
17 | - [Table of Contents](#table-of-contents)
18 | - [About The Project](#about-the-project)
19 | - [Built With](#built-with)
20 | - [Performance](#performance)
21 | - [Extract From Domain](#extract-from-domain)
22 | - [Extract From URL](#extract-from-url)
23 | - [Installation](#installation)
24 | - [Usage](#usage)
25 | - [Extraction](#extraction)
26 | - [URL Extraction](#url-extraction)
27 | - [Validation](#validation)
28 | - [TLDs List](#tlds-list)
29 | - [License](#license)
30 | - [Contact](#contact)
31 |
32 |
33 | ## About The Project
34 |
35 | PyDomainExtractor is a Python library designed to parse domain names quickly.
36 | In order to achieve the highest performance possible, the library was written in Rust.
37 |
38 |
39 | ### Built With
40 |
41 | * [AHash](https://github.com/tkaitchuck/aHash)
42 | * [idna](https://github.com/servo/rust-url/)
43 | * [memchr](https://github.com/BurntSushi/memchr)
44 | * [once_cell](https://github.com/matklad/once_cell)
45 | * [Public Suffix List](https://publicsuffix.org/)
46 |
47 |
48 | ### Performance
49 |
50 |
51 | #### Extract From Domain
52 |
53 | Tests were run on a file containing 10 million random domains from various top-level domains (Mar. 13th, 2022)
54 |
55 | | Library | Function | Time |
56 | | ------------- | ------------- | ------------- |
57 | | [PyDomainExtractor](https://github.com/Intsights/PyDomainExtractor) | pydomainextractor.extract | 1.50s |
58 | | [publicsuffix2](https://github.com/nexb/python-publicsuffix2) | publicsuffix2.get_sld | 9.92s |
59 | | [tldextract](https://github.com/john-kurkowski/tldextract) | \_\_call\_\_ | 29.23s |
60 | | [tld](https://github.com/barseghyanartur/tld) | tld.parse_tld | 34.48s |
61 |
62 |
63 | #### Extract From URL
64 |
65 | The test was conducted on a file containing 1 million random URLs (Mar. 13th, 2022)
66 |
67 | | Library | Function | Time |
68 | | ------------- | ------------- | ------------- |
69 | | [PyDomainExtractor](https://github.com/Intsights/PyDomainExtractor) | pydomainextractor.extract_from_url | 2.24s |
70 | | [publicsuffix2](https://github.com/nexb/python-publicsuffix2) | publicsuffix2.get_sld | 10.84s |
71 | | [tldextract](https://github.com/john-kurkowski/tldextract) | \_\_call\_\_ | 36.04s |
72 | | [tld](https://github.com/barseghyanartur/tld) | tld.parse_tld | 57.87s |
73 |
74 |
75 | ### Installation
76 |
77 | ```sh
78 | pip3 install PyDomainExtractor
79 | ```
80 |
81 |
82 | ## Usage
83 |
84 |
85 | ### Extraction
86 |
87 | ```python
88 | import pydomainextractor
89 |
90 |
91 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data.
92 | domain_extractor = pydomainextractor.DomainExtractor()
93 |
94 | domain_extractor.extract('google.com')
95 | >>> {
96 | >>> 'subdomain': '',
97 | >>> 'domain': 'google',
98 | >>> 'suffix': 'com'
99 | >>> }
100 |
101 | # Loads a custom SuffixList data. Should follow PublicSuffixList's format.
102 | domain_extractor = pydomainextractor.DomainExtractor(
103 | 'tld\n'
104 | 'custom.tld\n'
105 | )
106 |
107 | domain_extractor.extract('google.com')
108 | >>> {
109 | >>> 'subdomain': 'google',
110 | >>> 'domain': 'com',
111 | >>> 'suffix': ''
112 | >>> }
113 |
114 | domain_extractor.extract('google.custom.tld')
115 | >>> {
116 | >>> 'subdomain': '',
117 | >>> 'domain': 'google',
118 | >>> 'suffix': 'custom.tld'
119 | >>> }
120 | ```
121 |
122 |
123 | ### URL Extraction
124 |
125 | ```python
126 | import pydomainextractor
127 |
128 |
129 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data.
130 | domain_extractor = pydomainextractor.DomainExtractor()
131 |
132 | domain_extractor.extract_from_url('http://google.com/')
133 | >>> {
134 | >>> 'subdomain': '',
135 | >>> 'domain': 'google',
136 | >>> 'suffix': 'com'
137 | >>> }
138 | ```
139 |
140 |
141 | ### Validation
142 |
143 | ```python
144 | import pydomainextractor
145 |
146 |
147 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data.
148 | domain_extractor = pydomainextractor.DomainExtractor()
149 |
150 | domain_extractor.is_valid_domain('google.com')
151 | >>> True
152 |
153 | domain_extractor.is_valid_domain('domain.اتصالات')
154 | >>> True
155 |
156 | domain_extractor.is_valid_domain('xn--mgbaakc7dvf.xn--mgbaakc7dvf')
157 | >>> True
158 |
159 | domain_extractor.is_valid_domain('domain-.com')
160 | >>> False
161 |
162 | domain_extractor.is_valid_domain('-sub.domain.com')
163 | >>> False
164 |
165 | domain_extractor.is_valid_domain('\xF0\x9F\x98\x81nonalphanum.com')
166 | >>> False
167 | ```
168 |
169 |
170 | ### TLDs List
171 |
172 | ```python
173 | import pydomainextractor
174 |
175 |
176 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data.
177 | domain_extractor = pydomainextractor.DomainExtractor()
178 |
179 | domain_extractor.get_tld_list()
180 | >>> [
181 | >>> 'bostik',
182 | >>> 'backyards.banzaicloud.io',
183 | >>> 'biz.bb',
184 | >>> ...
185 | >>> ]
186 | ```
187 |
188 |
189 | ## License
190 |
191 | Distributed under the MIT License. See `LICENSE` for more information.
192 |
193 |
194 | ## Contact
195 |
196 | Gal Ben David - gal@intsights.com
197 |
198 | Project Link: [https://github.com/Intsights/PyDomainExtractor](https://github.com/Intsights/PyDomainExtractor)
199 |
200 |
201 |
202 |
203 | [license-shield]: https://img.shields.io/github/license/othneildrew/Best-README-Template.svg?style=flat-square
204 |
--------------------------------------------------------------------------------
/benchmarks/domain_benchmark.py:
--------------------------------------------------------------------------------
1 | import tldextract
2 | import publicsuffix2
3 | import tld
4 | import pydomainextractor
5 | import time
6 |
7 |
8 | def benchmark_tldextract(
9 | domains,
10 | ):
11 | extractor = tldextract.TLDExtract(
12 | include_psl_private_domains=True,
13 | )
14 |
15 | start = time.perf_counter()
16 |
17 | for domain in domains:
18 | extractor(domain)
19 |
20 | end = time.perf_counter()
21 |
22 | print(f'tldextract: {end - start}s')
23 |
24 |
25 | def benchmark_publicsuffix2(
26 | domains,
27 | ):
28 | start = time.perf_counter()
29 |
30 | for domain in domains:
31 | publicsuffix2.get_sld(domain)
32 |
33 | end = time.perf_counter()
34 |
35 | print(f'publicsuffix2: {end - start}s')
36 |
37 |
38 | def benchmark_tld(
39 | domains,
40 | ):
41 | start = time.perf_counter()
42 |
43 | for domain in domains:
44 | tld.parse_tld(domain)
45 |
46 | end = time.perf_counter()
47 |
48 | print(f'tld: {end - start}s')
49 |
50 |
51 | def benchmark_pydomainextractor(
52 | domains,
53 | ):
54 | extractor = pydomainextractor.DomainExtractor()
55 |
56 | start = time.perf_counter()
57 |
58 | for domain in domains:
59 | extractor.extract(domain)
60 |
61 | end = time.perf_counter()
62 |
63 | print(f'pydomainextractor: {end - start}s')
64 |
65 |
66 | def main():
67 | domains = []
68 | with open('10m_domains') as domains_file:
69 | for line in domains_file:
70 | domains.append(line.rstrip())
71 |
72 | benchmark_tldextract(domains)
73 | benchmark_publicsuffix2(domains)
74 | benchmark_tld(domains)
75 | benchmark_pydomainextractor(domains)
76 |
77 |
78 | if __name__ == '__main__':
79 | main()
80 |
--------------------------------------------------------------------------------
/benchmarks/url_benchmark.py:
--------------------------------------------------------------------------------
1 | import tldextract
2 | import publicsuffix2
3 | import tld
4 | import pydomainextractor
5 | import time
6 |
7 |
8 | def benchmark_tldextract(
9 | urls,
10 | ):
11 | extractor = tldextract.TLDExtract(
12 | include_psl_private_domains=True,
13 | )
14 |
15 | start = time.perf_counter()
16 |
17 | for url in urls:
18 | extractor(url)
19 |
20 | end = time.perf_counter()
21 |
22 | print(f'tldextract: {end - start}s')
23 |
24 |
25 | def benchmark_publicsuffix2(
26 | urls,
27 | ):
28 | start = time.perf_counter()
29 |
30 | for url in urls:
31 | publicsuffix2.get_sld(url)
32 |
33 | end = time.perf_counter()
34 |
35 | print(f'publicsuffix2: {end - start}s')
36 |
37 |
38 | def benchmark_tld(
39 | urls,
40 | ):
41 | start = time.perf_counter()
42 |
43 | for url in urls:
44 | tld.parse_tld(url)
45 |
46 | end = time.perf_counter()
47 |
48 | print(f'tld: {end - start}s')
49 |
50 |
51 | def benchmark_pydomainextractor(
52 | urls,
53 | ):
54 | extractor = pydomainextractor.DomainExtractor()
55 |
56 | start = time.perf_counter()
57 |
58 | for url in urls:
59 | extractor.extract_from_url(url)
60 |
61 | end = time.perf_counter()
62 |
63 | print(f'pydomainextractor: {end - start}s')
64 |
65 |
66 | def main():
67 | urls = []
68 | with open('1m_urls') as urls_file:
69 | for line in urls_file:
70 | urls.append(line.rstrip())
71 |
72 | urls = urls * 10
73 |
74 | # benchmark_tldextract(urls)
75 | # benchmark_publicsuffix2(urls)
76 | # benchmark_tld(urls)
77 | benchmark_pydomainextractor(urls)
78 |
79 |
80 | if __name__ == '__main__':
81 | main()
82 |
--------------------------------------------------------------------------------
/cortex.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | info:
3 | title: Pydomainextractor
4 | description: A blazingly fast domain extraction library written in Rust
5 | x-cortex-git:
6 | github:
7 | alias: intsightsorg
8 | repository: Intsights/PyDomainExtractor
9 | x-cortex-tag: pydomainextractor
10 | x-cortex-type: service
11 | x-cortex-domain-parents:
12 | - tag: threatintel-shadow-intel
13 | x-cortex-groups:
14 | - exposure:external-ship
15 | - target:library
16 | openapi: 3.0.1
17 | servers:
18 | - url: "/"
19 |
--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyDomainExtractor/a59d365effa56872235d3ffa0e7a1367065fa3e6/images/logo.png
--------------------------------------------------------------------------------
/pydomainextractor/__init__.py:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 | from . import pydomainextractor
4 |
5 |
6 | class DomainExtractor:
7 | '''
8 | PyDomainExtractor is a highly optimized Domain Name Extraction library written in Rust
9 | '''
10 | engine: typing.Optional[pydomainextractor.DomainExtractor] = None
11 |
12 | def __new__(
13 | cls,
14 | suffix_list_data: typing.Optional[str] = None,
15 | ):
16 | if suffix_list_data is None:
17 | if DomainExtractor.engine is None:
18 | DomainExtractor.engine = pydomainextractor.DomainExtractor()
19 |
20 | return DomainExtractor.engine
21 | else:
22 | return pydomainextractor.DomainExtractor(suffix_list_data)
23 |
--------------------------------------------------------------------------------
/pydomainextractor/pydomainextractor.pyi:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 |
4 | class DomainExtractor:
5 | def __init__(
6 | self,
7 | suffix_list_data: typing.Optional[str] = None,
8 | ) -> None: ...
9 |
10 | def extract(
11 | self,
12 | domain: str,
13 | ) -> typing.Dict[str, str]: ...
14 |
15 | def extract_from_url(
16 | self,
17 | url: str,
18 | ) -> typing.Dict[str, str]: ...
19 |
20 | def is_valid_domain(
21 | self,
22 | domain: str,
23 | ) -> bool: ...
24 |
25 | def get_tld_list(
26 | self,
27 | ) -> typing.List[str]: ...
28 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "pydomainextractor"
3 | version = "0.13.10"
4 | authors = [
5 | {email = "viktor_vilskyi@rapid7.com"},
6 | {name = "Viktor Vilskyi"}
7 | ]
8 | requires-python = ">=3.7"
9 | license = {file = "LICENSE"}
10 | classifiers = [
11 | "License :: OSI Approved :: MIT License",
12 | "Operating System :: MacOS",
13 | "Operating System :: Microsoft",
14 | "Operating System :: POSIX :: Linux",
15 | "Programming Language :: Python :: 3.7",
16 | "Programming Language :: Python :: 3.8",
17 | "Programming Language :: Python :: 3.9",
18 | "Programming Language :: Python :: 3.10",
19 | "Programming Language :: Python :: 3.11",
20 | "Programming Language :: Rust",
21 | ]
22 |
23 | [project.urls]
24 | repository = "https://github.com/intsights/pydomainextractor"
25 | homepage = "https://github.com/intsights/pydomainextractor"
26 |
27 | [build-system]
28 | requires = ["maturin>=0.14,<0.15"]
29 | build-backend = "maturin"
30 |
31 | [tool.maturin]
32 | sdist-include = [
33 | "Cargo.toml",
34 | "pydomainextractor/*.py",
35 | "pydomainextractor/*.pyi",
36 | "pyproject.toml",
37 | "src/*",
38 | ]
39 |
40 | [tool.poetry]
41 | name = "pydomainextractor"
42 | version = "0.13.10"
43 | authors = ["Viktor Vilskyi <viktor_vilskyi@rapid7.com>"]
44 | description = "A blazingly fast domain extraction library written in Rust"
45 | readme = "README.md"
46 | repository = "https://github.com/intsights/pydomainextractor"
47 | homepage = "https://github.com/intsights/pydomainextractor"
48 | license = "MIT"
49 | keywords = [
50 | "domain",
51 | "extraction",
52 | "tld",
53 | "suffix",
54 | "psl",
55 | "rust",
56 | "pyo3",
57 | ]
58 | classifiers = [
59 | "License :: OSI Approved :: MIT License",
60 | "Operating System :: MacOS",
61 | "Operating System :: Microsoft",
62 | "Operating System :: POSIX :: Linux",
63 | "Programming Language :: Python :: 3.7",
64 | "Programming Language :: Python :: 3.8",
65 | "Programming Language :: Python :: 3.9",
66 | "Programming Language :: Python :: 3.10",
67 | "Programming Language :: Python :: 3.11",
68 | "Programming Language :: Rust",
69 | ]
70 |
71 | [tool.poetry.dependencies]
72 | python = "^3.7"
73 |
74 | [tool.poetry.dev-dependencies]
75 | pytest = "*"
76 | wheel = "*"
77 | pytest-runner = "*"
78 | maturin = "*"
79 |
80 | [tool.pytest.ini_options]
81 | minversion = "6.0"
82 | addopts = [
83 | "--tb=native",
84 | "--pythonwarnings=all",
85 | ]
86 | testpaths = [
87 | "tests",
88 | ]
89 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | use ahash::{AHashMap, AHashSet};
2 | use pyo3::exceptions::PyValueError;
3 | use pyo3::intern;
4 | use pyo3::prelude::*;
5 | use pyo3::types::PyString;
6 | use std::os::raw::c_char;
7 |
8 | type DomainString = arraystring::ArrayString<typenum::U255>;
9 |
10 | #[derive(Default)]
11 | struct Suffix {
12 | sub_suffixes: AHashMap<DomainString, Suffix>,
13 | is_wildcard: bool,
14 | sub_blacklist: AHashSet<DomainString>,
15 | }
16 |
17 | static PUBLIC_SUFFIX_LIST_DATA: &str = include_str!("public_suffix_list.dat");
18 |
19 | #[pyclass]
20 | struct DomainExtractor {
21 | suffixes: AHashMap<DomainString, Suffix>,
22 | tld_list: Vec<String>,
23 | }
24 |
25 | #[pymethods]
26 | impl DomainExtractor {
27 | #[new]
28 | fn new(
29 | suffix_list: Option<&str>,
30 | ) -> Self {
31 | let (suffixes, tld_list) = if let Some(suffix_list) = suffix_list {
32 | parse_suffix_list(suffix_list)
33 | } else {
34 | parse_suffix_list(PUBLIC_SUFFIX_LIST_DATA)
35 | };
36 |
37 | DomainExtractor { suffixes, tld_list }
38 | }
39 |
40 | fn parse_domain_parts<'a>(
41 | &self,
42 | domain: &'a str,
43 | ) -> PyResult<(&'a str, &'a str, &'a str)> {
44 | let mut suffix_part = "";
45 | let mut current_suffixes = &self.suffixes;
46 | let mut last_dot_index = domain.len();
47 | let mut in_wildcard_tld = false;
48 | let mut last_suffix: Option<&Suffix> = None;
49 |
50 | while let Some(dot_index) = memchr::memrchr(b'.', &domain.as_bytes()[..last_dot_index]) {
51 | let current_fraction = &domain[dot_index + 1..last_dot_index];
52 | if current_fraction.is_empty() || dot_index == 0 {
53 | return Err(PyValueError::new_err("Invalid domain detected"));
54 | }
55 |
56 | if in_wildcard_tld {
57 | if last_suffix.unwrap().sub_blacklist.contains(current_fraction) {
58 | let leftover_part = &domain[0..dot_index];
59 |
60 | return Ok((suffix_part, current_fraction, leftover_part));
61 | }
62 |
63 | if let Some(current_suffix) = current_suffixes.get(current_fraction) {
64 | if !current_suffix.is_wildcard {
65 | current_suffixes = &current_suffix.sub_suffixes;
66 | }
67 | last_suffix.replace(current_suffix);
68 | suffix_part = &domain[dot_index + 1..];
69 | last_dot_index = dot_index;
70 | } else {
71 | suffix_part = &domain[dot_index + 1..];
72 | let leftover_part = &domain[0..dot_index];
73 | match leftover_part.rsplit_once('.') {
74 | Some((subdomain_part, domain_part)) => {
75 | if subdomain_part.ends_with('.') {
76 | return Err(PyValueError::new_err("Invalid domain detected"));
77 | }
78 | return Ok((suffix_part, domain_part, subdomain_part));
79 | }
80 | None => {
81 | return Ok((suffix_part, leftover_part, ""));
82 | }
83 | }
84 | }
85 | }
86 | if let Some(current_suffix) = current_suffixes.get(current_fraction) {
87 | in_wildcard_tld = current_suffix.is_wildcard;
88 |
89 | current_suffixes = &current_suffix.sub_suffixes;
90 | last_suffix.replace(current_suffix);
91 | suffix_part = &domain[dot_index + 1..];
92 | last_dot_index = dot_index;
93 | } else {
94 | let leftover_part = &domain[0..last_dot_index];
95 | match leftover_part.rsplit_once('.') {
96 | Some((subdomain_part, domain_part)) => {
97 | if subdomain_part.ends_with('.') {
98 | return Err(PyValueError::new_err("Invalid domain detected"));
99 | }
100 | return Ok((suffix_part, domain_part, subdomain_part));
101 | }
102 | None => {
103 | return Ok((suffix_part, leftover_part, ""));
104 | }
105 | };
106 | }
107 | }
108 |
109 | let current_fraction = &domain[0..last_dot_index];
110 | if in_wildcard_tld {
111 | if last_suffix.unwrap().sub_blacklist.contains(current_fraction) {
112 | Ok((suffix_part, current_fraction, ""))
113 | } else {
114 | Ok((domain, "", ""))
115 | }
116 | } else if current_suffixes.len() > 0 && current_suffixes.contains_key(current_fraction) {
117 | Ok((domain, "", ""))
118 | } else {
119 | Ok((suffix_part, current_fraction, ""))
120 | }
121 | }
122 |
123 | fn extract(
124 | &self,
125 | py: Python,
126 | domain: &PyString,
127 | ) -> PyResult<pyo3::PyObject> {
128 | if domain.len().unwrap() > 255 {
129 | return Err(PyValueError::new_err("Invalid domain detected"));
130 | }
131 |
132 | let mut domain_string = unsafe {
133 | DomainString::from_str_unchecked(domain.to_string_lossy().as_ref())
134 | };
135 | domain_string.make_ascii_lowercase();
136 |
137 | let (suffix_part, domain_part, subdomain_part) = self.parse_domain_parts(domain_string.as_str())?;
138 |
139 | unsafe {
140 | let dict = pyo3::ffi::PyDict_New();
141 | for (fraction_key, fraction) in [
142 | (intern!(py, "suffix").into_ptr(), suffix_part),
143 | (intern!(py, "domain").into_ptr(), domain_part),
144 | (intern!(py, "subdomain").into_ptr(), subdomain_part),
145 | ] {
146 | if !fraction.is_empty() {
147 | let substr = pyo3::ffi::PyUnicode_FromStringAndSize(
148 | fraction.as_ptr() as *const c_char,
149 | fraction.len() as isize,
150 | );
151 |
152 | pyo3::ffi::PyDict_SetItem(
153 | dict,
154 | fraction_key,
155 | substr,
156 | );
157 | pyo3::ffi::Py_DECREF(substr);
158 | } else {
159 | pyo3::ffi::PyDict_SetItem(
160 | dict,
161 | fraction_key,
162 | intern!(py, "").into_ptr(),
163 | );
164 | }
165 | }
166 |
167 | Ok(pyo3::PyObject::from_owned_ptr(py, dict))
168 | }
169 | }
170 |
171 | fn is_valid_domain(
172 | &self,
173 | domain: &PyString,
174 | ) -> bool {
175 | let domain_len = domain.len().unwrap();
176 | if domain_len == 0 || domain_len > 255 {
177 | return false;
178 | }
179 |
180 | let mut domain_string = unsafe {
181 | DomainString::from_str_unchecked(domain.to_string_lossy().as_ref())
182 | };
183 |
184 | for fraction in domain_string.split('.') {
185 | if fraction.len() > 63 || fraction.is_empty() {
186 | return false;
187 | }
188 | if fraction.starts_with('-') || fraction.ends_with('-') {
189 | return false;
190 | }
191 |
192 | for ch in fraction.chars() {
193 | if !ch.is_alphanumeric() && ch != '-' {
194 | return false;
195 | }
196 | }
197 | }
198 |
199 | domain_string.make_ascii_lowercase();
200 | if let Ok((suffix_part, domain_part, _subdomain_part)) = self.parse_domain_parts(domain_string.as_str()) {
201 | if suffix_part.is_empty() || domain_part.is_empty() {
202 | return false;
203 | }
204 |
205 | if idna::domain_to_ascii(domain_string.as_str()).is_err() {
206 | return false;
207 | }
208 | if idna::domain_to_unicode(domain_string.as_str()).1.is_err() {
209 | return false;
210 | }
211 |
212 | true
213 | } else {
214 | false
215 | }
216 | }
217 |
218 | fn get_tld_list(
219 | &self,
220 | ) -> Vec {
221 | self.tld_list.clone()
222 | }
223 |
/// Extracts the domain parts from a URL and returns them as a Python dict
/// with the keys `"suffix"`, `"domain"` and `"subdomain"`.
///
/// The host is isolated purely textually: everything up to the mandatory
/// `//` scheme separator is dropped, then the path (first `/`, which also
/// removes any query/fragment after it), userinfo (`@`) and port (`:`) are
/// trimmed in that order.  Raises a Python `ValueError` when there is no
/// scheme, the host is empty, or the host is longer than 255 bytes.
fn extract_from_url(
    &self,
    py: Python,
    url: &PyString,
) -> PyResult {
    let mut url_str = url.to_str().unwrap();

    // The scheme separator is mandatory; keep only what follows it.
    match memchr::memmem::find(url_str.as_bytes(), b"//") {
        Some(scheme_separator_position) => {
            url_str = &url_str[scheme_separator_position + 2..];
        },
        None => return Err(
            PyValueError::new_err("url is invalid: no scheme")
        ),
    };

    // Drop everything from the first '/' (path, and with it any
    // query/fragment that follows the path).
    if let Some(path_separator) = memchr::memchr(b'/', url_str.as_bytes()) {
        url_str = &url_str[..path_separator];
    };

    // Drop userinfo ("user:pass@") — must happen before the port trim so
    // the ':' inside the credentials is not mistaken for the port.
    if let Some(authentication_separator) = memchr::memchr(b'@', url_str.as_bytes()) {
        url_str = &url_str[authentication_separator + 1..];
    };

    // Drop the port.  NOTE(review): this would also truncate a bracketed
    // IPv6 literal at its first ':' — confirm IPv6 hosts are out of scope.
    if let Some(port_separator) = memchr::memchr(b':', url_str.as_bytes()) {
        url_str = &url_str[..port_separator];
    };

    if url_str.is_empty() {
        return Err(
            PyValueError::new_err("url does not contain a domain")
        );
    }

    // Same 255 DNS length cap as `extract` (bytes here, via str::len).
    if url_str.len() > 255 {
        return Err(PyValueError::new_err("url is invalid: too long"));
    }
    let mut domain_string = unsafe {
        DomainString::from_str_unchecked(url_str)
    };
    // Suffix lookup tables are stored lower-case.
    domain_string.make_ascii_lowercase();

    let (suffix_part, domain_part, subdomain_part) = self.parse_domain_parts(domain_string.as_str())?;

    // Build the result dict through the raw CPython FFI, mirroring
    // `extract`.
    unsafe {
        let dict = pyo3::ffi::PyDict_New();
        for (fraction_key, fraction) in [
            (intern!(py, "suffix").into_ptr(), suffix_part),
            (intern!(py, "domain").into_ptr(), domain_part),
            (intern!(py, "subdomain").into_ptr(), subdomain_part),
        ] {
            if !fraction.is_empty() {
                let substr = pyo3::ffi::PyUnicode_FromStringAndSize(
                    fraction.as_ptr() as *const c_char,
                    fraction.len() as isize,
                );

                pyo3::ffi::PyDict_SetItem(
                    dict,
                    fraction_key,
                    substr,
                );
                // PyDict_SetItem takes its own reference; drop ours.
                pyo3::ffi::Py_DECREF(substr);
            } else {
                // Reuse the interned empty string for absent parts.
                pyo3::ffi::PyDict_SetItem(
                    dict,
                    fraction_key,
                    intern!(py, "").into_ptr(),
                );
            }
        }

        Ok(pyo3::PyObject::from_owned_ptr(py, dict))
    }
}
299 | }
300 |
301 | fn parse_suffix_list(
302 | suffixes_list: &str,
303 | ) -> (AHashMap, Vec) {
304 | let mut suffixes = AHashMap::new();
305 | let mut tld_list = Vec::new();
306 |
307 | for line in suffixes_list.lines().map(
308 | |line| line.to_ascii_lowercase()
309 | ) {
310 | if line.starts_with("//") || line.is_empty() {
311 | continue;
312 | }
313 |
314 | let mut tlds = vec![line.clone()];
315 | if !line.is_ascii() {
316 | tlds.push(idna::domain_to_ascii(&line).unwrap());
317 | }
318 | for tld in tlds {
319 | tld_list.push(tld.clone());
320 |
321 | let fractions: Vec = tld.rsplit('.').map(
322 | |s| s.to_string()
323 | ).collect();
324 | let mut current_suffix = suffixes.entry(fractions.first().unwrap().to_owned()).or_insert(
325 | Suffix {
326 | sub_suffixes: AHashMap::new(),
327 | is_wildcard: false,
328 | sub_blacklist: AHashSet::new(),
329 | }
330 | );
331 |
332 | for fraction in fractions[1..].iter() {
333 | if fraction.starts_with('!') {
334 | current_suffix.sub_blacklist.insert(fraction.strip_prefix('!').unwrap().to_string());
335 | } else if fraction == "*" {
336 | current_suffix.is_wildcard = true;
337 | } else {
338 | current_suffix = current_suffix.sub_suffixes.entry(fraction.clone()).or_insert(
339 | Suffix {
340 | sub_suffixes: AHashMap::new(),
341 | is_wildcard: false,
342 | sub_blacklist: AHashSet::new(),
343 | }
344 | );
345 | }
346 | }
347 | }
348 | }
349 |
350 | (suffixes, tld_list)
351 | }
352 |
353 | #[pymodule]
354 | fn pydomainextractor(
355 | _py: Python,
356 | m: &PyModule,
357 | ) -> PyResult<()> {
358 | m.add_class::()?;
359 | Ok(())
360 | }
361 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyDomainExtractor/a59d365effa56872235d3ffa0e7a1367065fa3e6/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_pydomainextractor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import unittest.mock
3 |
4 | import pydomainextractor
5 |
6 |
class DomainExtractorExtractionTestCase(
    unittest.TestCase,
):
    """Behavioral tests for DomainExtractor with the bundled suffix list.

    Covers extract(), extract_from_url() and is_valid_domain(): plain
    suffix/domain/subdomain splits, Unicode and punycode TLDs, wildcard and
    blacklist public-suffix rules, case normalization, syntactically invalid
    input, URL parsing, and that inputs are never mutated.
    """

    def setUp(
        self,
    ):
        # Fresh extractor per test, loaded with the embedded suffix list.
        self.domain_extractor = pydomainextractor.DomainExtractor()

    def test_extract_only_tld(
        self,
    ):
        """Inputs that are purely a known suffix yield empty domain/subdomain."""
        self.assertEqual(
            first=self.domain_extractor.extract('com'),
            second={
                'domain': '',
                'subdomain': '',
                'suffix': 'com',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('jp.net'),
            second={
                'domain': '',
                'subdomain': '',
                'suffix': 'jp.net',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('鹿児島.jp'),
            second={
                'domain': '',
                'subdomain': '',
                'suffix': '鹿児島.jp',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('香格里拉'),
            second={
                'domain': '',
                'subdomain': '',
                'suffix': '香格里拉',
            },
        )

        # Punycode form of a Unicode suffix is recognized as a suffix too.
        self.assertEqual(
            first=self.domain_extractor.extract('xn--32vp30h.jp'),
            second={
                'domain': '',
                'subdomain': '',
                'suffix': 'xn--32vp30h.jp',
            },
        )

    def test_extract_only_domain(
        self,
    ):
        """An unknown TLD is treated as a bare domain with no suffix."""
        self.assertEqual(
            first=self.domain_extractor.extract('nonexistenttld'),
            second={
                'domain': 'nonexistenttld',
                'subdomain': '',
                'suffix': '',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('香格里拉香格里拉香格里拉'),
            second={
                'domain': '香格里拉香格里拉香格里拉',
                'subdomain': '',
                'suffix': '',
            },
        )

    def test_extract_only_domain_and_subdomain(
        self,
    ):
        """With no known suffix, the last label is the domain, the rest subdomain."""
        self.assertEqual(
            first=self.domain_extractor.extract('nonexistenttld.nonexistenttld'),
            second={
                'domain': 'nonexistenttld',
                'subdomain': 'nonexistenttld',
                'suffix': '',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('香格里拉香格里拉香格里拉.nonexistenttld'),
            second={
                'domain': 'nonexistenttld',
                'subdomain': '香格里拉香格里拉香格里拉',
                'suffix': '',
            },
        )

    def test_extract_all_parts(
        self,
    ):
        """Suffix, domain and (possibly multi-label) subdomain are all split out."""
        self.assertEqual(
            first=self.domain_extractor.extract('google.com'),
            second={
                'domain': 'google',
                'subdomain': '',
                'suffix': 'com',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('subdomain.google.com'),
            second={
                'domain': 'google',
                'subdomain': 'subdomain',
                'suffix': 'com',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('subsubdomain.subdomain.google.com'),
            second={
                'domain': 'google',
                'subdomain': 'subsubdomain.subdomain',
                'suffix': 'com',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('subsubdomain.subdomain.google.香格里拉'),
            second={
                'domain': 'google',
                'subdomain': 'subsubdomain.subdomain',
                'suffix': '香格里拉',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('subsubdomain.subdomain.google.鹿児島.jp'),
            second={
                'domain': 'google',
                'subdomain': 'subsubdomain.subdomain',
                'suffix': '鹿児島.jp',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('subsubdomain.subdomain.google.xn--32vp30h.jp'),
            second={
                'domain': 'google',
                'subdomain': 'subsubdomain.subdomain',
                'suffix': 'xn--32vp30h.jp',
            },
        )

    def test_special_cases(
        self,
    ):
        """Wildcard (*.ck, *.bzz.dapps.earth) and blacklist (!www.ck) PSL rules."""
        self.assertEqual(
            first=self.domain_extractor.extract('bla.ck'),
            second={
                'domain': '',
                'subdomain': '',
                'suffix': 'bla.ck',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('a.bla.ck'),
            second={
                'domain': 'a',
                'subdomain': '',
                'suffix': 'bla.ck',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('a.b.bla.ck'),
            second={
                'domain': 'b',
                'subdomain': 'a',
                'suffix': 'bla.ck',
            },
        )

        # 'www' is blacklisted from the '*.ck' wildcard rule ('!www.ck').
        self.assertEqual(
            first=self.domain_extractor.extract('www.ck'),
            second={
                'domain': 'www',
                'subdomain': '',
                'suffix': 'ck',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('a.www.ck'),
            second={
                'domain': 'www',
                'subdomain': 'a',
                'suffix': 'ck',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('a.bzz.dapps.earth'),
            second={
                'domain': '',
                'subdomain': '',
                'suffix': 'a.bzz.dapps.earth',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('a.b.bzz.dapps.earth'),
            second={
                'domain': 'a',
                'subdomain': '',
                'suffix': 'b.bzz.dapps.earth',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('domain.co.za'),
            second={
                'domain': 'domain',
                'subdomain': '',
                'suffix': 'co.za',
            },
        )

    def test_upper_case(
        self,
    ):
        """Extraction lower-cases its output regardless of input case."""
        self.assertEqual(
            first=self.domain_extractor.extract('domain.Com'),
            second={
                'domain': 'domain',
                'subdomain': '',
                'suffix': 'com',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('DOmain.Com'),
            second={
                'domain': 'domain',
                'subdomain': '',
                'suffix': 'com',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('DOmain.COM'),
            second={
                'domain': 'domain',
                'subdomain': '',
                'suffix': 'com',
            },
        )

        self.assertEqual(
            first=self.domain_extractor.extract('a.b.bla.CK'),
            second={
                'domain': 'b',
                'subdomain': 'a',
                'suffix': 'bla.ck',
            },
        )

    def test_syntactic_invalid_domains(
        self,
    ):
        """Empty labels (leading/trailing/doubled dots) raise ValueError."""
        with self.assertRaises(
            expected_exception=ValueError,
        ):
            self.domain_extractor.extract('.com')

        with self.assertRaises(
            expected_exception=ValueError,
        ):
            self.domain_extractor.extract('domain..com')

        with self.assertRaises(
            expected_exception=ValueError,
        ):
            self.domain_extractor.extract('sub..domain.com')

        with self.assertRaises(
            expected_exception=ValueError,
        ):
            self.domain_extractor.extract('domain.com.')

        with self.assertRaises(
            expected_exception=ValueError,
        ):
            self.domain_extractor.extract('com.')

    def test_domain_too_long(
        self,
    ):
        """Inputs over the 255-character DNS limit raise ValueError."""
        with self.assertRaises(
            expected_exception=ValueError,
        ):
            self.domain_extractor.extract(f'{"very-long" * 255}.com')

    def test_extract_from_url(
        self,
    ):
        """URL host isolation: scheme required, path/userinfo/port trimmed."""
        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url('http://www.example.com./')

        # No '//' scheme separator -> rejected.
        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url('mail.google.com/mail')

        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url('xn--gieen46ers-73a.de')

        # Scheme but empty host -> rejected.
        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url('http://')

        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url('xn--tub-1m9d15sfkkhsifsbqygyujjrw602gk4li5qqk98aca0w.google.com')

        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url('xn--tub-1m9d15sfkkhsifsbqygyujjrw60.google.com')

        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url('1\xe9')

        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url('com')

        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url('co.uk')

        # Host longer than 255 bytes -> rejected.
        with self.assertRaises(
            ValueError,
        ):
            self.domain_extractor.extract_from_url(f'http://{"domain" * 255}co.uk:3030/some/path')

        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://www.google.com'),
            second={
                'subdomain': 'www',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://www.theregister.co.uk'),
            second={
                'subdomain': 'www',
                'domain': 'theregister',
                'suffix': 'co.uk',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://gmail.com'),
            second={
                'subdomain': '',
                'domain': 'gmail',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://media.forums.theregister.co.uk'),
            second={
                'subdomain': 'media.forums',
                'domain': 'theregister',
                'suffix': 'co.uk',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://www.www.com'),
            second={
                'subdomain': 'www',
                'domain': 'www',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://www.com'),
            second={
                'subdomain': '',
                'domain': 'www',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname/'),
            second={
                'subdomain': '',
                'domain': 'internalunlikelyhostname',
                'suffix': '',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname.bizarre'),
            second={
                'subdomain': 'internalunlikelyhostname',
                'domain': 'bizarre',
                'suffix': '',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname.info/'),
            second={
                'subdomain': '',
                'domain': 'internalunlikelyhostname',
                'suffix': 'info',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname.information/'),
            second={
                'subdomain': 'internalunlikelyhostname',
                'domain': 'information',
                'suffix': '',
            },
        )
        # IP addresses are split like any other dotted name (no suffix).
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://216.22.0.192/'),
            second={
                'subdomain': '216.22.0',
                'domain': '192',
                'suffix': '',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://216.22.project.coop/'),
            second={
                'subdomain': '216.22',
                'domain': 'project',
                'suffix': 'coop',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://xn--h1alffa9f.xn--p1ai'),
            second={
                'subdomain': '',
                'domain': 'xn--h1alffa9f',
                'suffix': 'xn--p1ai',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://xN--h1alffa9f.xn--p1ai'),
            second={
                'subdomain': '',
                'domain': 'xn--h1alffa9f',
                'suffix': 'xn--p1ai',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://XN--h1alffa9f.xn--p1ai'),
            second={
                'subdomain': '',
                'domain': 'xn--h1alffa9f',
                'suffix': 'xn--p1ai',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://xn--zckzap6140b352by.blog.so-net.xn--wcvs22d.hk'),
            second={
                'subdomain': 'xn--zckzap6140b352by.blog',
                'domain': 'so-net',
                'suffix': 'xn--wcvs22d.hk',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://xn--zckzap6140b352by.blog.so-net.教育.hk'),
            second={
                'subdomain': 'xn--zckzap6140b352by.blog',
                'domain': 'so-net',
                'suffix': '教育.hk',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('https://mail.google.com/mail'),
            second={
                'subdomain': 'mail',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('ssh://mail.google.com/mail'),
            second={
                'subdomain': 'mail',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('git+ssh://www.github.com:8443/'),
            second={
                'subdomain': 'www',
                'domain': 'github',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('ftp://johndoe:5cr1p7k1dd13@1337.warez.com:2501'),
            second={
                'subdomain': '1337',
                'domain': 'warez',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://google.com/?q=cats'),
            second={
                'subdomain': '',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://google.com/#Welcome'),
            second={
                'subdomain': '',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        # NOTE(review): duplicate of the previous assertion.
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://google.com/#Welcome'),
            second={
                'subdomain': '',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://google.com/s#Welcome'),
            second={
                'subdomain': '',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://google.com/s?q=cats#Welcome'),
            second={
                'subdomain': '',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://www.parliament.uk'),
            second={
                'subdomain': 'www',
                'domain': 'parliament',
                'suffix': 'uk',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://www.parliament.co.uk'),
            second={
                'subdomain': 'www',
                'domain': 'parliament',
                'suffix': 'co.uk',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://www.cgs.act.edu.au/'),
            second={
                'subdomain': 'www',
                'domain': 'cgs',
                'suffix': 'act.edu.au',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://www.google.com.au/'),
            second={
                'subdomain': 'www',
                'domain': 'google',
                'suffix': 'com.au',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://www.metp.net.cn'),
            second={
                'subdomain': 'www',
                'domain': 'metp',
                'suffix': 'net.cn',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://waiterrant.blogspot.com'),
            second={
                'subdomain': '',
                'domain': 'waiterrant',
                'suffix': 'blogspot.com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://127.0.0.1/foo/bar'),
            second={
                'subdomain': '127.0.0',
                'domain': '1',
                'suffix': '',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://256.256.256.256/foo/bar'),
            second={
                'subdomain': '256.256.256',
                'domain': '256',
                'suffix': '',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://127.0.0.1.9/foo/bar'),
            second={
                'subdomain': '127.0.0.1',
                'domain': '9',
                'suffix': '',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://admin:password1@www.google.com:666/secret/admin/interface?param1=42'),
            second={
                'subdomain': 'www',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        # Protocol-relative URLs (leading '//') are accepted.
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('//admin:password1@www.google.com:666/secret/admin/interface?param1=42'),
            second={
                'subdomain': 'www',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('//mail.google.com/mail'),
            second={
                'subdomain': 'mail',
                'domain': 'google',
                'suffix': 'com',
            },
        )
        self.assertEqual(
            first=self.domain_extractor.extract_from_url('http://test.nu'),
            second={
                'subdomain': '',
                'domain': 'test',
                'suffix': 'nu',
            },
        )

    def test_is_valid_domain(
        self,
    ):
        """Label syntax, known-suffix requirement, and IDNA round-trip checks."""
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('domain.com'),
        )
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('sub.domain.com'),
        )
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('domain.COM'),
        )
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('domain.co.il'),
        )
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('domain.co.za'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('domain.invalid'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('com'),
        )
        # NOTE(review): duplicate of the previous assertion.
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('com'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('-domain.com'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('domain-.com'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('-sub.domain.com'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('sub-.domain.com'),
        )

        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('domain.xn--mgbaakc7dvf'),
        )
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('domain.اتصالات'),
        )
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('xn--mgbaakc7dvf.com'),
        )
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('اتصالات.com'),
        )
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('اتصالات.اتصالات'),
        )
        self.assertTrue(
            expr=self.domain_extractor.is_valid_domain('xn--mgbaakc7dvf.xn--mgbaakc7dvf'),
        )

        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('domain.xn--mgbaakc7dvfa'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('domain.اsتصالات'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('xn--mgbaaskc7777dvf.com'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('اتصالsات.com'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('اتصالاsت.اتصالات'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('xn--mgbsaadddd1212121212kc7dvf.xn--mgbaakc7dvf'),
        )

        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('\xF0\x9F\x98\x81nonalphanum.com'),
        )

        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('.com'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('domain..com'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('sub..domain.com'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('domain.com.'),
        )
        self.assertFalse(
            expr=self.domain_extractor.is_valid_domain('com.'),
        )

    def test_mutability(
        self,
    ):
        """Extractor calls must never mutate the Python string they receive."""
        domain_to_test_original = 'Google.COM'
        domain_to_test = 'Google.COM'

        self.domain_extractor.is_valid_domain(domain_to_test)
        self.assertEqual(
            first=domain_to_test_original,
            second=domain_to_test,
        )

        self.domain_extractor.extract(domain_to_test)
        self.assertEqual(
            first=domain_to_test_original,
            second=domain_to_test,
        )

        url_to_test_original = 'http://Google.COM/A.php?Bla=true'
        url_to_test = 'http://Google.COM/A.php?Bla=true'
        self.domain_extractor.extract_from_url(url_to_test)
        self.assertEqual(
            first=url_to_test_original,
            second=url_to_test,
        )
801 |
802 |
class DomainExtractorLoadTestCase(
    unittest.TestCase,
):
    """Tests for constructing DomainExtractor with a custom suffix list."""

    def test_load_called_without_data(
        self,
    ):
        """No argument -> the bundled public suffix list is used."""
        domain_extractor = pydomainextractor.DomainExtractor()

        self.assertEqual(
            first=domain_extractor.extract('com'),
            second={
                'subdomain': '',
                'domain': '',
                'suffix': 'com',
            },
        )

    def test_load_called_with_data(
        self,
    ):
        """A custom newline-separated list fully replaces the bundled one."""
        domain_extractor = pydomainextractor.DomainExtractor(
            'com\n'
        )

        self.assertEqual(
            first=domain_extractor.extract('com'),
            second={
                'subdomain': '',
                'domain': '',
                'suffix': 'com',
            },
        )

        # 'com' is no longer a suffix when only 'net' is loaded.
        domain_extractor = pydomainextractor.DomainExtractor(
            'net\n'
        )

        self.assertEqual(
            first=domain_extractor.extract('com'),
            second={
                'subdomain': '',
                'domain': 'com',
                'suffix': '',
            },
        )

        domain_extractor = pydomainextractor.DomainExtractor(
            'customtld\n'
        )

        self.assertEqual(
            first=domain_extractor.extract('google.customtld'),
            second={
                'subdomain': '',
                'domain': 'google',
                'suffix': 'customtld',
            },
        )

        # Multi-label custom rules win over their shorter prefix.
        domain_extractor = pydomainextractor.DomainExtractor(
            'tld\n'
            'custom.tld\n'
        )

        self.assertEqual(
            first=domain_extractor.extract('google.custom.tld'),
            second={
                'subdomain': '',
                'domain': 'google',
                'suffix': 'custom.tld',
            },
        )

    def test_get_tld_list(
        self,
    ):
        """get_tld_list() returns exactly the loaded rules."""
        domain_extractor = pydomainextractor.DomainExtractor(
            'com\n'
        )

        self.assertEqual(
            first=domain_extractor.get_tld_list(),
            second=[
                'com',
            ],
        )

        domain_extractor = pydomainextractor.DomainExtractor(
            'com\n'
            'net\n'
            'org\n'
            'uk.com\n'
        )

        # Order-insensitive comparison: the trie does not guarantee order.
        self.assertCountEqual(
            first=domain_extractor.get_tld_list(),
            second=[
                'com',
                'net',
                'org',
                'uk.com',
            ],
        )
906 |
--------------------------------------------------------------------------------