├── .github └── workflows │ ├── build.yml │ └── deploy.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── domain_benchmark.py └── url_benchmark.py ├── cortex.yaml ├── images └── logo.png ├── pydomainextractor ├── __init__.py └── pydomainextractor.pyi ├── pyproject.toml ├── src ├── lib.rs └── public_suffix_list.dat └── tests ├── __init__.py └── test_pydomainextractor.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | lint: 7 | if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags') 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v3 12 | - name: Install latest rust 13 | uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: stable 16 | profile: minimal 17 | override: true 18 | components: clippy 19 | - name: Lint with clippy 20 | uses: actions-rs/cargo@v1 21 | with: 22 | command: clippy 23 | args: --all-targets --all-features 24 | test: 25 | runs-on: ${{ matrix.os }} 26 | needs: lint 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | python-version: 31 | - '3.7' 32 | - '3.8' 33 | - '3.9' 34 | - '3.10' 35 | - '3.11' 36 | os: 37 | - ubuntu-latest 38 | - macos-latest 39 | - windows-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v3 43 | - name: Set up Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v4 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install Poetry 48 | uses: abatilo/actions-poetry@v2 49 | - name: Install Rust 50 | uses: actions-rs/toolchain@v1 51 | with: 52 | profile: minimal 53 | toolchain: stable 54 | override: true 55 | - name: Install dependencies 56 | run: poetry install 57 | - name: Build Python package 58 | run: poetry run maturin develop 59 | - name: Test 60 | run: poetry run pytest -Werror tests 61 | 
-------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | release: 4 | types: 5 | - released 6 | jobs: 7 | deploy: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: 13 | - "3.7" 14 | - "3.8" 15 | - "3.9" 16 | - "3.10" 17 | - "3.11" 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | - windows-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Rust 30 | uses: actions-rs/toolchain@v1 31 | with: 32 | profile: minimal 33 | toolchain: stable 34 | override: true 35 | - name: Install Cross-compilers (macOS) 36 | if: matrix.os == 'macos-latest' 37 | run: | 38 | rustup target add x86_64-apple-darwin 39 | rustup target add aarch64-apple-darwin 40 | - name: Install Cross-compilers (Linux) 41 | if: matrix.os == 'ubuntu-latest' 42 | run: | 43 | rustup target add aarch64-unknown-linux-gnu 44 | - name: Publish Package 45 | uses: PyO3/maturin-action@v1 46 | with: 47 | command: publish 48 | args: --username=__token__ ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.7' && '' || '--no-sdist' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }} 49 | env: 50 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 51 | if: matrix.os != 'macos-latest' 52 | - name: Publish macOS (x86_64) Package 53 | if: matrix.os == 'macos-latest' 54 | uses: PyO3/maturin-action@v1 55 | with: 56 | command: publish 57 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist 58 | env: 59 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 60 | - name: Publish macOS (arm64) Package 61 | if: 
matrix.os == 'macos-latest' 62 | uses: PyO3/maturin-action@v1 63 | with: 64 | command: publish 65 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist 66 | env: 67 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 68 | - name: Publish Linux (arm64) Package 69 | if: matrix.os == 'ubuntu-latest' 70 | uses: PyO3/maturin-action@v1 71 | with: 72 | command: publish 73 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-unknown-linux-gnu --no-sdist 74 | env: 75 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 76 | dist-source: 77 | runs-on: ubuntu-latest 78 | steps: 79 | - name: Distribute Source 80 | uses: PyO3/maturin-action@v1 81 | with: 82 | command: sdist 83 | env: 84 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | .vscode/ 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/#use-with-ide 112 | .pdm.toml 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | 164 | ### Rust ### 165 | # Generated by Cargo 166 | # will have compiled files and executables 167 | debug/ 168 | 169 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 170 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 171 | Cargo.lock 172 | 173 | # These are backup files generated by rustfmt 174 | **/*.rs.bk 175 | 176 | # MSVC Windows builds of rustc generate these, which store debugging information 177 | *.pdb 178 | 179 | ### VisualStudioCode ### 180 | .vscode/* 181 | !.vscode/settings.json 182 | !.vscode/tasks.json 183 | !.vscode/launch.json 184 | !.vscode/extensions.json 185 | !.vscode/*.code-snippets 186 | 187 | # Local History for Visual Studio Code 188 | .history/ 189 | 190 | # Built Visual Studio Code Extensions 191 | *.vsix 192 | 193 | ### VisualStudioCode Patch ### 194 | # Ignore all local history of files 195 | .history 196 | .ionide 197 | 198 | # Support for Project snippet scope 199 | .vscode/*.code-snippets 200 | 201 | # Ignore code-workspaces 202 | *.code-workspace 203 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pydomainextractor" 3 | version = "0.13.10" 4 | authors = ["Viktor Vilskyi "] 5 | edition = "2021" 6 | repository = "https://github.com/intsights/pydomainextractor" 7 | homepage = "https://github.com/intsights/pydomainextractor" 8 | license = "MIT" 9 | keywords = [ 10 | "domain", 11 | "extraction", 12 | "tld", 13 | "suffix", 14 | "psl", 15 | "rust", 16 | "pyo3", 17 | ] 18 | 19 | [lib] 20 | name = "pydomainextractor" 21 | crate-type = ["cdylib"] 22 | 23 | [dependencies] 24 | ahash = "0.8" 25 | idna = "0.3" 26 | memchr = "2" 27 | arraystring = "0.3.0" 28 | typenum = "1" 29 | 30 | [dependencies.pyo3] 31 | version = "0.17.3" 32 | features = ["extension-module"] 33 | 34 | [profile.release] 
35 | lto = true 36 | panic = "abort" 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Gal Ben David 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include images/logo.png 3 | graft tests 4 | recursive-include pydomainextractor *.py *.pyi 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Logo 4 | 5 |

6 | A blazingly fast domain extraction library written in Rust 7 |

8 |

9 | 10 | ![license](https://img.shields.io/badge/MIT-License-blue) 11 | ![Python](https://img.shields.io/badge/Python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue) 12 | ![Build](https://github.com/Intsights/PyDomainExtractor/workflows/Build/badge.svg) 13 | [![PyPi](https://img.shields.io/pypi/v/PyDomainExtractor.svg)](https://pypi.org/project/PyDomainExtractor/) 14 | 15 | ## Table of Contents 16 | 17 | - [Table of Contents](#table-of-contents) 18 | - [About The Project](#about-the-project) 19 | - [Built With](#built-with) 20 | - [Performance](#performance) 21 | - [Extract From Domain](#extract-from-domain) 22 | - [Extract From URL](#extract-from-url) 23 | - [Installation](#installation) 24 | - [Usage](#usage) 25 | - [Extraction](#extraction) 26 | - [URL Extraction](#url-extraction) 27 | - [Validation](#validation) 28 | - [TLDs List](#tlds-list) 29 | - [License](#license) 30 | - [Contact](#contact) 31 | 32 | 33 | ## About The Project 34 | 35 | PyDomainExtractor is a Python library designed to parse domain names quickly. 36 | In order to achieve the highest performance possible, the library was written in Rust. 37 | 38 | 39 | ### Built With 40 | 41 | * [AHash](https://github.com/tkaitchuck/aHash) 42 | * [idna](https://github.com/servo/rust-url/) 43 | * [memchr](https://github.com/BurntSushi/memchr) 44 | * [once_cell](https://github.com/matklad/once_cell) 45 | * [Public Suffix List](https://publicsuffix.org/) 46 | 47 | 48 | ### Performance 49 | 50 | 51 | #### Extract From Domain 52 | 53 | Tests were run on a file containing 10 million random domains from various top-level domains (Mar. 
13th 2022) 54 | 55 | | Library | Function | Time | 56 | | ------------- | ------------- | ------------- | 57 | | [PyDomainExtractor](https://github.com/Intsights/PyDomainExtractor) | pydomainextractor.extract | 1.50s | 58 | | [publicsuffix2](https://github.com/nexb/python-publicsuffix2) | publicsuffix2.get_sld | 9.92s | 59 | | [tldextract](https://github.com/john-kurkowski/tldextract) | \_\_call\_\_ | 29.23s | 60 | | [tld](https://github.com/barseghyanartur/tld) | tld.parse_tld | 34.48s | 61 | 62 | 63 | #### Extract From URL 64 | 65 | The test was conducted on a file containing 1 million random urls (Mar. 13th 2022) 66 | 67 | | Library | Function | Time | 68 | | ------------- | ------------- | ------------- | 69 | | [PyDomainExtractor](https://github.com/Intsights/PyDomainExtractor) | pydomainextractor.extract_from_url | 2.24s | 70 | | [publicsuffix2](https://github.com/nexb/python-publicsuffix2) | publicsuffix2.get_sld | 10.84s | 71 | | [tldextract](https://github.com/john-kurkowski/tldextract) | \_\_call\_\_ | 36.04s | 72 | | [tld](https://github.com/barseghyanartur/tld) | tld.parse_tld | 57.87s | 73 | 74 | 75 | ### Installation 76 | 77 | ```sh 78 | pip3 install PyDomainExtractor 79 | ``` 80 | 81 | 82 | ## Usage 83 | 84 | 85 | ### Extraction 86 | 87 | ```python 88 | import pydomainextractor 89 | 90 | 91 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data. 92 | domain_extractor = pydomainextractor.DomainExtractor() 93 | 94 | domain_extractor.extract('google.com') 95 | >>> { 96 | >>> 'subdomain': '', 97 | >>> 'domain': 'google', 98 | >>> 'suffix': 'com' 99 | >>> } 100 | 101 | # Loads a custom SuffixList data. Should follow PublicSuffixList's format. 
102 | domain_extractor = pydomainextractor.DomainExtractor( 103 | 'tld\n' 104 | 'custom.tld\n' 105 | ) 106 | 107 | domain_extractor.extract('google.com') 108 | >>> { 109 | >>> 'subdomain': 'google', 110 | >>> 'domain': 'com', 111 | >>> 'suffix': '' 112 | >>> } 113 | 114 | domain_extractor.extract('google.custom.tld') 115 | >>> { 116 | >>> 'subdomain': '', 117 | >>> 'domain': 'google', 118 | >>> 'suffix': 'custom.tld' 119 | >>> } 120 | ``` 121 | 122 | 123 | ### URL Extraction 124 | 125 | ```python 126 | import pydomainextractor 127 | 128 | 129 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data. 130 | domain_extractor = pydomainextractor.DomainExtractor() 131 | 132 | domain_extractor.extract_from_url('http://google.com/') 133 | >>> { 134 | >>> 'subdomain': '', 135 | >>> 'domain': 'google', 136 | >>> 'suffix': 'com' 137 | >>> } 138 | ``` 139 | 140 | 141 | ### Validation 142 | 143 | ```python 144 | import pydomainextractor 145 | 146 | 147 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data. 148 | domain_extractor = pydomainextractor.DomainExtractor() 149 | 150 | domain_extractor.is_valid_domain('google.com') 151 | >>> True 152 | 153 | domain_extractor.is_valid_domain('domain.اتصالات') 154 | >>> True 155 | 156 | domain_extractor.is_valid_domain('xn--mgbaakc7dvf.xn--mgbaakc7dvf') 157 | >>> True 158 | 159 | domain_extractor.is_valid_domain('domain-.com') 160 | >>> False 161 | 162 | domain_extractor.is_valid_domain('-sub.domain.com') 163 | >>> False 164 | 165 | domain_extractor.is_valid_domain('\xF0\x9F\x98\x81nonalphanum.com') 166 | >>> False 167 | ``` 168 | 169 | 170 | ### TLDs List 171 | 172 | ```python 173 | import pydomainextractor 174 | 175 | 176 | # Loads the current supplied version of PublicSuffixList from the repository. Does not download any data. 
177 | domain_extractor = pydomainextractor.DomainExtractor() 178 | 179 | domain_extractor.get_tld_list() 180 | >>> [ 181 | >>> 'bostik', 182 | >>> 'backyards.banzaicloud.io', 183 | >>> 'biz.bb', 184 | >>> ... 185 | >>> ] 186 | ``` 187 | 188 | 189 | ## License 190 | 191 | Distributed under the MIT License. See `LICENSE` for more information. 192 | 193 | 194 | ## Contact 195 | 196 | Gal Ben David - gal@intsights.com 197 | 198 | Project Link: [https://github.com/Intsights/PyDomainExtractor](https://github.com/Intsights/PyDomainExtractor) 199 | 200 | 201 | 202 | 203 | [license-shield]: https://img.shields.io/github/license/othneildrew/Best-README-Template.svg?style=flat-square 204 | -------------------------------------------------------------------------------- /benchmarks/domain_benchmark.py: -------------------------------------------------------------------------------- 1 | import tldextract 2 | import publicsuffix2 3 | import tld 4 | import pydomainextractor 5 | import time 6 | 7 | 8 | def benchmark_tldextract( 9 | domains, 10 | ): 11 | extractor = tldextract.TLDExtract( 12 | include_psl_private_domains=True, 13 | ) 14 | 15 | start = time.perf_counter() 16 | 17 | for domain in domains: 18 | extractor(domain) 19 | 20 | end = time.perf_counter() 21 | 22 | print(f'tldextract: {end - start}s') 23 | 24 | 25 | def benchmark_publicsuffix2( 26 | domains, 27 | ): 28 | start = time.perf_counter() 29 | 30 | for domain in domains: 31 | publicsuffix2.get_sld(domain) 32 | 33 | end = time.perf_counter() 34 | 35 | print(f'publicsuffix2: {end - start}s') 36 | 37 | 38 | def benchmark_tld( 39 | domains, 40 | ): 41 | start = time.perf_counter() 42 | 43 | for domain in domains: 44 | tld.parse_tld(domain) 45 | 46 | end = time.perf_counter() 47 | 48 | print(f'tld: {end - start}s') 49 | 50 | 51 | def benchmark_pydomainextractor( 52 | domains, 53 | ): 54 | extractor = pydomainextractor.DomainExtractor() 55 | 56 | start = time.perf_counter() 57 | 58 | for domain in domains: 59 | 
extractor.extract(domain) 60 | 61 | end = time.perf_counter() 62 | 63 | print(f'pydomainextractor: {end - start}s') 64 | 65 | 66 | def main(): 67 | domains = [] 68 | with open('10m_domains') as domains_file: 69 | for line in domains_file: 70 | domains.append(line.rstrip()) 71 | 72 | benchmark_tldextract(domains) 73 | benchmark_publicsuffix2(domains) 74 | benchmark_tld(domains) 75 | benchmark_pydomainextractor(domains) 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /benchmarks/url_benchmark.py: -------------------------------------------------------------------------------- 1 | import tldextract 2 | import publicsuffix2 3 | import tld 4 | import pydomainextractor 5 | import time 6 | 7 | 8 | def benchmark_tldextract( 9 | urls, 10 | ): 11 | extractor = tldextract.TLDExtract( 12 | include_psl_private_domains=True, 13 | ) 14 | 15 | start = time.perf_counter() 16 | 17 | for url in urls: 18 | extractor(url) 19 | 20 | end = time.perf_counter() 21 | 22 | print(f'tldextract: {end - start}s') 23 | 24 | 25 | def benchmark_publicsuffix2( 26 | urls, 27 | ): 28 | start = time.perf_counter() 29 | 30 | for url in urls: 31 | publicsuffix2.get_sld(url) 32 | 33 | end = time.perf_counter() 34 | 35 | print(f'publicsuffix2: {end - start}s') 36 | 37 | 38 | def benchmark_tld( 39 | urls, 40 | ): 41 | start = time.perf_counter() 42 | 43 | for url in urls: 44 | tld.parse_tld(url) 45 | 46 | end = time.perf_counter() 47 | 48 | print(f'tld: {end - start}s') 49 | 50 | 51 | def benchmark_pydomainextractor( 52 | urls, 53 | ): 54 | extractor = pydomainextractor.DomainExtractor() 55 | 56 | start = time.perf_counter() 57 | 58 | for url in urls: 59 | extractor.extract_from_url(url) 60 | 61 | end = time.perf_counter() 62 | 63 | print(f'pydomainextractor: {end - start}s') 64 | 65 | 66 | def main(): 67 | urls = [] 68 | with open('1m_urls') as urls_file: 69 | for line in urls_file: 70 | urls.append(line.rstrip()) 
71 | 72 | urls = urls * 10 73 | 74 | # benchmark_tldextract(urls) 75 | # benchmark_publicsuffix2(urls) 76 | # benchmark_tld(urls) 77 | benchmark_pydomainextractor(urls) 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /cortex.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | info: 3 | title: Pydomainextractor 4 | description: A blazingly fast domain extraction library written in Rust 5 | x-cortex-git: 6 | github: 7 | alias: intsightsorg 8 | repository: Intsights/PyDomainExtractor 9 | x-cortex-tag: pydomainextractor 10 | x-cortex-type: service 11 | x-cortex-domain-parents: 12 | - tag: threatintel-shadow-intel 13 | x-cortex-groups: 14 | - exposure:external-ship 15 | - target:library 16 | openapi: 3.0.1 17 | servers: 18 | - url: "/" 19 | -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyDomainExtractor/a59d365effa56872235d3ffa0e7a1367065fa3e6/images/logo.png -------------------------------------------------------------------------------- /pydomainextractor/__init__.py: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | from . 
import pydomainextractor 4 | 5 | 6 | class DomainExtractor: 7 | ''' 8 | PyDomainExtractor is a highly optimized Domain Name Extraction library written in Rust 9 | ''' 10 | engine: typing.Optional[pydomainextractor.DomainExtractor] = None 11 | 12 | def __new__( 13 | cls, 14 | suffix_list_data: typing.Optional[str] = None, 15 | ): 16 | if suffix_list_data is None: 17 | if DomainExtractor.engine is None: 18 | DomainExtractor.engine = pydomainextractor.DomainExtractor() 19 | 20 | return DomainExtractor.engine 21 | else: 22 | return pydomainextractor.DomainExtractor(suffix_list_data) 23 | -------------------------------------------------------------------------------- /pydomainextractor/pydomainextractor.pyi: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | 4 | class DomainExtractor: 5 | def __init__( 6 | self, 7 | suffix_list_data: typing.Optional[str] = None, 8 | ) -> None: ... 9 | 10 | def extract( 11 | self, 12 | domain: str, 13 | ) -> typing.Dict[str, str]: ... 14 | 15 | def extract_from_url( 16 | self, 17 | url: str, 18 | ) -> typing.Dict[str, str]: ... 19 | 20 | def is_valid_domain( 21 | self, 22 | domain: str, 23 | ) -> bool: ... 24 | 25 | def get_tld_list( 26 | self, 27 | ) -> typing.List[str]: ... 
28 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pydomainextractor" 3 | version = "0.13.10" 4 | authors = [ 5 | {email = "viktor_vilskyi@rapid7.com"}, 6 | {name = "Viktor Vilskyi"} 7 | ] 8 | requires-python = ">=3.7" 9 | license = {file = "LICENSE"} 10 | classifiers = [ 11 | "License :: OSI Approved :: MIT License", 12 | "Operating System :: MacOS", 13 | "Operating System :: Microsoft", 14 | "Operating System :: POSIX :: Linux", 15 | "Programming Language :: Python :: 3.7", 16 | "Programming Language :: Python :: 3.8", 17 | "Programming Language :: Python :: 3.9", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Programming Language :: Rust", 21 | ] 22 | 23 | [project.urls] 24 | repository = "https://github.com/intsights/pydomainextractor" 25 | homepage = "https://github.com/intsights/pydomainextractor" 26 | 27 | [build-system] 28 | requires = ["maturin>=0.14,<0.15"] 29 | build-backend = "maturin" 30 | 31 | [tool.maturin] 32 | sdist-include = [ 33 | "Cargo.toml", 34 | "pydomainextractor/*.py", 35 | "pydomainextractor/*.pyi", 36 | "pyproject.toml", 37 | "src/*", 38 | ] 39 | 40 | [tool.poetry] 41 | name = "pydomainextractor" 42 | version = "0.13.9" 43 | authors = ["Viktor Vilskyi "] 44 | description = "A blazingly fast domain extraction library written in Rust" 45 | readme = "README.md" 46 | repository = "https://github.com/intsights/pydomainextractor" 47 | homepage = "https://github.com/intsights/pydomainextractor" 48 | license = "MIT" 49 | keywords = [ 50 | "domain", 51 | "extraction", 52 | "tld", 53 | "suffix", 54 | "psl", 55 | "rust", 56 | "pyo3", 57 | ] 58 | classifiers = [ 59 | "License :: OSI Approved :: MIT License", 60 | "Operating System :: MacOS", 61 | "Operating System :: Microsoft", 62 | "Operating System :: POSIX :: Linux", 63 | "Programming 
Language :: Python :: 3.7", 64 | "Programming Language :: Python :: 3.8", 65 | "Programming Language :: Python :: 3.9", 66 | "Programming Language :: Python :: 3.10", 67 | "Programming Language :: Python :: 3.11", 68 | "Programming Language :: Rust", 69 | ] 70 | 71 | [tool.poetry.dependencies] 72 | python = "^3.7" 73 | 74 | [tool.poetry.dev-dependencies] 75 | pytest = "*" 76 | wheel = "*" 77 | pytest-runner = "*" 78 | maturin = "*" 79 | 80 | [tool.pytest.ini_options] 81 | minversion = "6.0" 82 | addopts = [ 83 | "--tb=native", 84 | "--pythonwarnings=all", 85 | ] 86 | testpaths = [ 87 | "tests", 88 | ] 89 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use ahash::{AHashMap, AHashSet}; 2 | use pyo3::exceptions::PyValueError; 3 | use pyo3::intern; 4 | use pyo3::prelude::*; 5 | use pyo3::types::PyString; 6 | use std::os::raw::c_char; 7 | 8 | type DomainString = arraystring::ArrayString; 9 | 10 | #[derive(Default)] 11 | struct Suffix { 12 | sub_suffixes: AHashMap, 13 | is_wildcard: bool, 14 | sub_blacklist: AHashSet, 15 | } 16 | 17 | static PUBLIC_SUFFIX_LIST_DATA: &str = include_str!("public_suffix_list.dat"); 18 | 19 | #[pyclass] 20 | struct DomainExtractor { 21 | suffixes: AHashMap, 22 | tld_list: Vec, 23 | } 24 | 25 | #[pymethods] 26 | impl DomainExtractor { 27 | #[new] 28 | fn new( 29 | suffix_list: Option<&str>, 30 | ) -> Self { 31 | let (suffixes, tld_list) = if let Some(suffix_list) = suffix_list { 32 | parse_suffix_list(suffix_list) 33 | } else { 34 | parse_suffix_list(PUBLIC_SUFFIX_LIST_DATA) 35 | }; 36 | 37 | DomainExtractor { suffixes, tld_list } 38 | } 39 | 40 | fn parse_domain_parts<'a>( 41 | &self, 42 | domain: &'a str, 43 | ) -> PyResult<(&'a str, &'a str, &'a str)> { 44 | let mut suffix_part = ""; 45 | let mut current_suffixes = &self.suffixes; 46 | let mut last_dot_index = domain.len(); 47 | let mut in_wildcard_tld = 
false; 48 | let mut last_suffix: Option<&Suffix> = None; 49 | 50 | while let Some(dot_index) = memchr::memrchr(b'.', &domain.as_bytes()[..last_dot_index]) { 51 | let current_fraction = &domain[dot_index + 1..last_dot_index]; 52 | if current_fraction.is_empty() || dot_index == 0 { 53 | return Err(PyValueError::new_err("Invalid domain detected")); 54 | } 55 | 56 | if in_wildcard_tld { 57 | if last_suffix.unwrap().sub_blacklist.contains(current_fraction) { 58 | let leftover_part = &domain[0..dot_index]; 59 | 60 | return Ok((suffix_part, current_fraction, leftover_part)); 61 | } 62 | 63 | if let Some(current_suffix) = current_suffixes.get(current_fraction) { 64 | if !current_suffix.is_wildcard { 65 | current_suffixes = ¤t_suffix.sub_suffixes; 66 | } 67 | last_suffix.replace(current_suffix); 68 | suffix_part = &domain[dot_index + 1..]; 69 | last_dot_index = dot_index; 70 | } else { 71 | suffix_part = &domain[dot_index + 1..]; 72 | let leftover_part = &domain[0..dot_index]; 73 | match leftover_part.rsplit_once('.') { 74 | Some((subdomain_part, domain_part)) => { 75 | if subdomain_part.ends_with('.') { 76 | return Err(PyValueError::new_err("Invalid domain detected")); 77 | } 78 | return Ok((suffix_part, domain_part, subdomain_part)); 79 | } 80 | None => { 81 | return Ok((suffix_part, leftover_part, "")); 82 | } 83 | } 84 | } 85 | } 86 | if let Some(current_suffix) = current_suffixes.get(current_fraction) { 87 | in_wildcard_tld = current_suffix.is_wildcard; 88 | 89 | current_suffixes = ¤t_suffix.sub_suffixes; 90 | last_suffix.replace(current_suffix); 91 | suffix_part = &domain[dot_index + 1..]; 92 | last_dot_index = dot_index; 93 | } else { 94 | let leftover_part = &domain[0..last_dot_index]; 95 | match leftover_part.rsplit_once('.') { 96 | Some((subdomain_part, domain_part)) => { 97 | if subdomain_part.ends_with('.') { 98 | return Err(PyValueError::new_err("Invalid domain detected")); 99 | } 100 | return Ok((suffix_part, domain_part, subdomain_part)); 101 | } 102 | None 
=> { 103 | return Ok((suffix_part, leftover_part, "")); 104 | } 105 | }; 106 | } 107 | } 108 | 109 | let current_fraction = &domain[0..last_dot_index]; 110 | if in_wildcard_tld { 111 | if last_suffix.unwrap().sub_blacklist.contains(current_fraction) { 112 | Ok((suffix_part, current_fraction, "")) 113 | } else { 114 | Ok((domain, "", "")) 115 | } 116 | } else if current_suffixes.len() > 0 && current_suffixes.contains_key(current_fraction) { 117 | Ok((domain, "", "")) 118 | } else { 119 | Ok((suffix_part, current_fraction, "")) 120 | } 121 | } 122 | 123 | fn extract( 124 | &self, 125 | py: Python, 126 | domain: &PyString, 127 | ) -> PyResult { 128 | if domain.len().unwrap() > 255 { 129 | return Err(PyValueError::new_err("Invalid domain detected")); 130 | } 131 | 132 | let mut domain_string = unsafe { 133 | DomainString::from_str_unchecked(domain.to_string_lossy().as_ref()) 134 | }; 135 | domain_string.make_ascii_lowercase(); 136 | 137 | let (suffix_part, domain_part, subdomain_part) = self.parse_domain_parts(domain_string.as_str())?; 138 | 139 | unsafe { 140 | let dict = pyo3::ffi::PyDict_New(); 141 | for (fraction_key, fraction) in [ 142 | (intern!(py, "suffix").into_ptr(), suffix_part), 143 | (intern!(py, "domain").into_ptr(), domain_part), 144 | (intern!(py, "subdomain").into_ptr(), subdomain_part), 145 | ] { 146 | if !fraction.is_empty() { 147 | let substr = pyo3::ffi::PyUnicode_FromStringAndSize( 148 | fraction.as_ptr() as *const c_char, 149 | fraction.len() as isize, 150 | ); 151 | 152 | pyo3::ffi::PyDict_SetItem( 153 | dict, 154 | fraction_key, 155 | substr, 156 | ); 157 | pyo3::ffi::Py_DECREF(substr); 158 | } else { 159 | pyo3::ffi::PyDict_SetItem( 160 | dict, 161 | fraction_key, 162 | intern!(py, "").into_ptr(), 163 | ); 164 | } 165 | } 166 | 167 | Ok(pyo3::PyObject::from_owned_ptr(py, dict)) 168 | } 169 | } 170 | 171 | fn is_valid_domain( 172 | &self, 173 | domain: &PyString, 174 | ) -> bool { 175 | let domain_len = domain.len().unwrap(); 176 | if 
domain_len == 0 || domain_len > 255 { 177 | return false; 178 | } 179 | 180 | let mut domain_string = unsafe { 181 | DomainString::from_str_unchecked(domain.to_string_lossy().as_ref()) 182 | }; 183 | 184 | for fraction in domain_string.split('.') { 185 | if fraction.len() > 63 || fraction.is_empty() { 186 | return false; 187 | } 188 | if fraction.starts_with('-') || fraction.ends_with('-') { 189 | return false; 190 | } 191 | 192 | for ch in fraction.chars() { 193 | if !ch.is_alphanumeric() && ch != '-' { 194 | return false; 195 | } 196 | } 197 | } 198 | 199 | domain_string.make_ascii_lowercase(); 200 | if let Ok((suffix_part, domain_part, _subdomain_part)) = self.parse_domain_parts(domain_string.as_str()) { 201 | if suffix_part.is_empty() || domain_part.is_empty() { 202 | return false; 203 | } 204 | 205 | if idna::domain_to_ascii(domain_string.as_str()).is_err() { 206 | return false; 207 | } 208 | if idna::domain_to_unicode(domain_string.as_str()).1.is_err() { 209 | return false; 210 | } 211 | 212 | true 213 | } else { 214 | false 215 | } 216 | } 217 | 218 | fn get_tld_list( 219 | &self, 220 | ) -> Vec { 221 | self.tld_list.clone() 222 | } 223 | 224 | fn extract_from_url( 225 | &self, 226 | py: Python, 227 | url: &PyString, 228 | ) -> PyResult { 229 | let mut url_str = url.to_str().unwrap(); 230 | 231 | match memchr::memmem::find(url_str.as_bytes(), b"//") { 232 | Some(scheme_separator_position) => { 233 | url_str = &url_str[scheme_separator_position + 2..]; 234 | }, 235 | None => return Err( 236 | PyValueError::new_err("url is invalid: no scheme") 237 | ), 238 | }; 239 | 240 | if let Some(path_separator) = memchr::memchr(b'/', url_str.as_bytes()) { 241 | url_str = &url_str[..path_separator]; 242 | }; 243 | 244 | if let Some(authentication_separator) = memchr::memchr(b'@', url_str.as_bytes()) { 245 | url_str = &url_str[authentication_separator + 1..]; 246 | }; 247 | 248 | if let Some(port_separator) = memchr::memchr(b':', url_str.as_bytes()) { 249 | url_str = 
&url_str[..port_separator]; 250 | }; 251 | 252 | if url_str.is_empty() { 253 | return Err( 254 | PyValueError::new_err("url does not contain a domain") 255 | ); 256 | } 257 | 258 | if url_str.len() > 255 { 259 | return Err(PyValueError::new_err("url is invalid: too long")); 260 | } 261 | let mut domain_string = unsafe { 262 | DomainString::from_str_unchecked(url_str) 263 | }; 264 | domain_string.make_ascii_lowercase(); 265 | 266 | let (suffix_part, domain_part, subdomain_part) = self.parse_domain_parts(domain_string.as_str())?; 267 | 268 | unsafe { 269 | let dict = pyo3::ffi::PyDict_New(); 270 | for (fraction_key, fraction) in [ 271 | (intern!(py, "suffix").into_ptr(), suffix_part), 272 | (intern!(py, "domain").into_ptr(), domain_part), 273 | (intern!(py, "subdomain").into_ptr(), subdomain_part), 274 | ] { 275 | if !fraction.is_empty() { 276 | let substr = pyo3::ffi::PyUnicode_FromStringAndSize( 277 | fraction.as_ptr() as *const c_char, 278 | fraction.len() as isize, 279 | ); 280 | 281 | pyo3::ffi::PyDict_SetItem( 282 | dict, 283 | fraction_key, 284 | substr, 285 | ); 286 | pyo3::ffi::Py_DECREF(substr); 287 | } else { 288 | pyo3::ffi::PyDict_SetItem( 289 | dict, 290 | fraction_key, 291 | intern!(py, "").into_ptr(), 292 | ); 293 | } 294 | } 295 | 296 | Ok(pyo3::PyObject::from_owned_ptr(py, dict)) 297 | } 298 | } 299 | } 300 | 301 | fn parse_suffix_list( 302 | suffixes_list: &str, 303 | ) -> (AHashMap, Vec) { 304 | let mut suffixes = AHashMap::new(); 305 | let mut tld_list = Vec::new(); 306 | 307 | for line in suffixes_list.lines().map( 308 | |line| line.to_ascii_lowercase() 309 | ) { 310 | if line.starts_with("//") || line.is_empty() { 311 | continue; 312 | } 313 | 314 | let mut tlds = vec![line.clone()]; 315 | if !line.is_ascii() { 316 | tlds.push(idna::domain_to_ascii(&line).unwrap()); 317 | } 318 | for tld in tlds { 319 | tld_list.push(tld.clone()); 320 | 321 | let fractions: Vec = tld.rsplit('.').map( 322 | |s| s.to_string() 323 | ).collect(); 324 | let mut 
// NOTE(review): chunk starts mid-statement inside `parse_suffix_list` — the
// `let mut` binding begins on the previous chunk's last line.
            current_suffix = suffixes.entry(fractions.first().unwrap().to_owned()).or_insert(
                Suffix {
                    sub_suffixes: AHashMap::new(),
                    is_wildcard: false,
                    sub_blacklist: AHashSet::new(),
                }
            );

            // Descend/extend the trie for the remaining labels (right-to-left).
            for fraction in fractions[1..].iter() {
                if fraction.starts_with('!') {
                    // "!" rules blacklist a label beneath a wildcard suffix.
                    current_suffix.sub_blacklist.insert(fraction.strip_prefix('!').unwrap().to_string());
                } else if fraction == "*" {
                    current_suffix.is_wildcard = true;
                } else {
                    current_suffix = current_suffix.sub_suffixes.entry(fraction.clone()).or_insert(
                        Suffix {
                            sub_suffixes: AHashMap::new(),
                            is_wildcard: false,
                            sub_blacklist: AHashSet::new(),
                        }
                    );
                }
            }
        }
    }

    (suffixes, tld_list)
}

/// Python module entry point: exposes the DomainExtractor class.
#[pymodule]
fn pydomainextractor(
    _py: Python,
    m: &PyModule,
) -> PyResult<()> {
    m.add_class::<DomainExtractor>()?;  // NOTE(review): turbofish stripped by the dump; the only #[pyclass] in view is DomainExtractor
    Ok(())
}
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyDomainExtractor/a59d365effa56872235d3ffa0e7a1367065fa3e6/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_pydomainextractor.py:
--------------------------------------------------------------------------------
import unittest
import unittest.mock

import pydomainextractor


class DomainExtractorExtractionTestCase(unittest.TestCase):
    """Behavioral tests for DomainExtractor extraction and validation."""

    def setUp(self):
        self.domain_extractor = pydomainextractor.DomainExtractor()

    def test_extract_only_tld(self):
        self.assertEqual(
            self.domain_extractor.extract('com'),
            {'domain': '', 'subdomain': '', 'suffix': 'com'},
        )
        # NOTE(review): method continues in the next chunk.
first=self.domain_extractor.extract('jp.net'), 29 | second={ 30 | 'domain': '', 31 | 'subdomain': '', 32 | 'suffix': 'jp.net', 33 | }, 34 | ) 35 | 36 | self.assertEqual( 37 | first=self.domain_extractor.extract('鹿児島.jp'), 38 | second={ 39 | 'domain': '', 40 | 'subdomain': '', 41 | 'suffix': '鹿児島.jp', 42 | }, 43 | ) 44 | 45 | self.assertEqual( 46 | first=self.domain_extractor.extract('香格里拉'), 47 | second={ 48 | 'domain': '', 49 | 'subdomain': '', 50 | 'suffix': '香格里拉', 51 | }, 52 | ) 53 | 54 | self.assertEqual( 55 | first=self.domain_extractor.extract('xn--32vp30h.jp'), 56 | second={ 57 | 'domain': '', 58 | 'subdomain': '', 59 | 'suffix': 'xn--32vp30h.jp', 60 | }, 61 | ) 62 | 63 | def test_extract_only_domain( 64 | self, 65 | ): 66 | self.assertEqual( 67 | first=self.domain_extractor.extract('nonexistenttld'), 68 | second={ 69 | 'domain': 'nonexistenttld', 70 | 'subdomain': '', 71 | 'suffix': '', 72 | }, 73 | ) 74 | 75 | self.assertEqual( 76 | first=self.domain_extractor.extract('香格里拉香格里拉香格里拉'), 77 | second={ 78 | 'domain': '香格里拉香格里拉香格里拉', 79 | 'subdomain': '', 80 | 'suffix': '', 81 | }, 82 | ) 83 | 84 | def test_extract_only_domain_and_subdomain( 85 | self, 86 | ): 87 | self.assertEqual( 88 | first=self.domain_extractor.extract('nonexistenttld.nonexistenttld'), 89 | second={ 90 | 'domain': 'nonexistenttld', 91 | 'subdomain': 'nonexistenttld', 92 | 'suffix': '', 93 | }, 94 | ) 95 | 96 | self.assertEqual( 97 | first=self.domain_extractor.extract('香格里拉香格里拉香格里拉.nonexistenttld'), 98 | second={ 99 | 'domain': 'nonexistenttld', 100 | 'subdomain': '香格里拉香格里拉香格里拉', 101 | 'suffix': '', 102 | }, 103 | ) 104 | 105 | def test_extract_all_parts( 106 | self, 107 | ): 108 | self.assertEqual( 109 | first=self.domain_extractor.extract('google.com'), 110 | second={ 111 | 'domain': 'google', 112 | 'subdomain': '', 113 | 'suffix': 'com', 114 | }, 115 | ) 116 | 117 | self.assertEqual( 118 | first=self.domain_extractor.extract('subdomain.google.com'), 119 | second={ 120 | 'domain': 
'google', 121 | 'subdomain': 'subdomain', 122 | 'suffix': 'com', 123 | }, 124 | ) 125 | 126 | self.assertEqual( 127 | first=self.domain_extractor.extract('subsubdomain.subdomain.google.com'), 128 | second={ 129 | 'domain': 'google', 130 | 'subdomain': 'subsubdomain.subdomain', 131 | 'suffix': 'com', 132 | }, 133 | ) 134 | 135 | self.assertEqual( 136 | first=self.domain_extractor.extract('subsubdomain.subdomain.google.香格里拉'), 137 | second={ 138 | 'domain': 'google', 139 | 'subdomain': 'subsubdomain.subdomain', 140 | 'suffix': '香格里拉', 141 | }, 142 | ) 143 | 144 | self.assertEqual( 145 | first=self.domain_extractor.extract('subsubdomain.subdomain.google.鹿児島.jp'), 146 | second={ 147 | 'domain': 'google', 148 | 'subdomain': 'subsubdomain.subdomain', 149 | 'suffix': '鹿児島.jp', 150 | }, 151 | ) 152 | 153 | self.assertEqual( 154 | first=self.domain_extractor.extract('subsubdomain.subdomain.google.xn--32vp30h.jp'), 155 | second={ 156 | 'domain': 'google', 157 | 'subdomain': 'subsubdomain.subdomain', 158 | 'suffix': 'xn--32vp30h.jp', 159 | }, 160 | ) 161 | 162 | def test_special_cases( 163 | self, 164 | ): 165 | self.assertEqual( 166 | first=self.domain_extractor.extract('bla.ck'), 167 | second={ 168 | 'domain': '', 169 | 'subdomain': '', 170 | 'suffix': 'bla.ck', 171 | }, 172 | ) 173 | 174 | self.assertEqual( 175 | first=self.domain_extractor.extract('a.bla.ck'), 176 | second={ 177 | 'domain': 'a', 178 | 'subdomain': '', 179 | 'suffix': 'bla.ck', 180 | }, 181 | ) 182 | 183 | self.assertEqual( 184 | first=self.domain_extractor.extract('a.b.bla.ck'), 185 | second={ 186 | 'domain': 'b', 187 | 'subdomain': 'a', 188 | 'suffix': 'bla.ck', 189 | }, 190 | ) 191 | 192 | self.assertEqual( 193 | first=self.domain_extractor.extract('www.ck'), 194 | second={ 195 | 'domain': 'www', 196 | 'subdomain': '', 197 | 'suffix': 'ck', 198 | }, 199 | ) 200 | 201 | self.assertEqual( 202 | first=self.domain_extractor.extract('a.www.ck'), 203 | second={ 204 | 'domain': 'www', 205 | 'subdomain': 'a', 
206 | 'suffix': 'ck', 207 | }, 208 | ) 209 | 210 | self.assertEqual( 211 | first=self.domain_extractor.extract('a.bzz.dapps.earth'), 212 | second={ 213 | 'domain': '', 214 | 'subdomain': '', 215 | 'suffix': 'a.bzz.dapps.earth', 216 | }, 217 | ) 218 | 219 | self.assertEqual( 220 | first=self.domain_extractor.extract('a.b.bzz.dapps.earth'), 221 | second={ 222 | 'domain': 'a', 223 | 'subdomain': '', 224 | 'suffix': 'b.bzz.dapps.earth', 225 | }, 226 | ) 227 | 228 | self.assertEqual( 229 | first=self.domain_extractor.extract('domain.co.za'), 230 | second={ 231 | 'domain': 'domain', 232 | 'subdomain': '', 233 | 'suffix': 'co.za', 234 | }, 235 | ) 236 | 237 | def test_upper_case( 238 | self, 239 | ): 240 | self.assertEqual( 241 | first=self.domain_extractor.extract('domain.Com'), 242 | second={ 243 | 'domain': 'domain', 244 | 'subdomain': '', 245 | 'suffix': 'com', 246 | }, 247 | ) 248 | 249 | self.assertEqual( 250 | first=self.domain_extractor.extract('DOmain.Com'), 251 | second={ 252 | 'domain': 'domain', 253 | 'subdomain': '', 254 | 'suffix': 'com', 255 | }, 256 | ) 257 | 258 | self.assertEqual( 259 | first=self.domain_extractor.extract('DOmain.COM'), 260 | second={ 261 | 'domain': 'domain', 262 | 'subdomain': '', 263 | 'suffix': 'com', 264 | }, 265 | ) 266 | 267 | self.assertEqual( 268 | first=self.domain_extractor.extract('a.b.bla.CK'), 269 | second={ 270 | 'domain': 'b', 271 | 'subdomain': 'a', 272 | 'suffix': 'bla.ck', 273 | }, 274 | ) 275 | 276 | def test_syntactic_invalid_domains( 277 | self, 278 | ): 279 | with self.assertRaises( 280 | expected_exception=ValueError, 281 | ): 282 | self.domain_extractor.extract('.com') 283 | 284 | with self.assertRaises( 285 | expected_exception=ValueError, 286 | ): 287 | self.domain_extractor.extract('domain..com') 288 | 289 | with self.assertRaises( 290 | expected_exception=ValueError, 291 | ): 292 | self.domain_extractor.extract('sub..domain.com') 293 | 294 | with self.assertRaises( 295 | expected_exception=ValueError, 296 | 
): 297 | self.domain_extractor.extract('domain.com.') 298 | 299 | with self.assertRaises( 300 | expected_exception=ValueError, 301 | ): 302 | self.domain_extractor.extract('com.') 303 | 304 | def test_domain_too_long( 305 | self, 306 | ): 307 | with self.assertRaises( 308 | expected_exception=ValueError, 309 | ): 310 | self.domain_extractor.extract(f'{"very-long" * 255}.com') 311 | 312 | def test_extract_from_url( 313 | self, 314 | ): 315 | with self.assertRaises( 316 | ValueError, 317 | ): 318 | self.domain_extractor.extract_from_url('http://www.example.com./') 319 | 320 | with self.assertRaises( 321 | ValueError, 322 | ): 323 | self.domain_extractor.extract_from_url('mail.google.com/mail') 324 | 325 | with self.assertRaises( 326 | ValueError, 327 | ): 328 | self.domain_extractor.extract_from_url('xn--gieen46ers-73a.de') 329 | 330 | with self.assertRaises( 331 | ValueError, 332 | ): 333 | self.domain_extractor.extract_from_url('http://') 334 | 335 | with self.assertRaises( 336 | ValueError, 337 | ): 338 | self.domain_extractor.extract_from_url('xn--tub-1m9d15sfkkhsifsbqygyujjrw602gk4li5qqk98aca0w.google.com') 339 | 340 | with self.assertRaises( 341 | ValueError, 342 | ): 343 | self.domain_extractor.extract_from_url('xn--tub-1m9d15sfkkhsifsbqygyujjrw60.google.com') 344 | 345 | with self.assertRaises( 346 | ValueError, 347 | ): 348 | self.domain_extractor.extract_from_url('1\xe9') 349 | 350 | with self.assertRaises( 351 | ValueError, 352 | ): 353 | self.domain_extractor.extract_from_url('com') 354 | 355 | with self.assertRaises( 356 | ValueError, 357 | ): 358 | self.domain_extractor.extract_from_url('co.uk') 359 | 360 | with self.assertRaises( 361 | ValueError, 362 | ): 363 | self.domain_extractor.extract_from_url(f'http://{"domain" * 255}co.uk:3030/some/path') 364 | 365 | self.assertEqual( 366 | first=self.domain_extractor.extract_from_url('http://www.google.com'), 367 | second={ 368 | 'subdomain': 'www', 369 | 'domain': 'google', 370 | 'suffix': 'com', 371 | }, 
372 | ) 373 | self.assertEqual( 374 | first=self.domain_extractor.extract_from_url('http://www.theregister.co.uk'), 375 | second={ 376 | 'subdomain': 'www', 377 | 'domain': 'theregister', 378 | 'suffix': 'co.uk', 379 | }, 380 | ) 381 | self.assertEqual( 382 | first=self.domain_extractor.extract_from_url('http://gmail.com'), 383 | second={ 384 | 'subdomain': '', 385 | 'domain': 'gmail', 386 | 'suffix': 'com', 387 | }, 388 | ) 389 | self.assertEqual( 390 | first=self.domain_extractor.extract_from_url('http://media.forums.theregister.co.uk'), 391 | second={ 392 | 'subdomain': 'media.forums', 393 | 'domain': 'theregister', 394 | 'suffix': 'co.uk', 395 | }, 396 | ) 397 | self.assertEqual( 398 | first=self.domain_extractor.extract_from_url('http://www.www.com'), 399 | second={ 400 | 'subdomain': 'www', 401 | 'domain': 'www', 402 | 'suffix': 'com', 403 | }, 404 | ) 405 | self.assertEqual( 406 | first=self.domain_extractor.extract_from_url('http://www.com'), 407 | second={ 408 | 'subdomain': '', 409 | 'domain': 'www', 410 | 'suffix': 'com', 411 | }, 412 | ) 413 | self.assertEqual( 414 | first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname/'), 415 | second={ 416 | 'subdomain': '', 417 | 'domain': 'internalunlikelyhostname', 418 | 'suffix': '', 419 | }, 420 | ) 421 | self.assertEqual( 422 | first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname.bizarre'), 423 | second={ 424 | 'subdomain': 'internalunlikelyhostname', 425 | 'domain': 'bizarre', 426 | 'suffix': '', 427 | }, 428 | ) 429 | self.assertEqual( 430 | first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname.info/'), 431 | second={ 432 | 'subdomain': '', 433 | 'domain': 'internalunlikelyhostname', 434 | 'suffix': 'info', 435 | }, 436 | ) 437 | self.assertEqual( 438 | first=self.domain_extractor.extract_from_url('http://internalunlikelyhostname.information/'), 439 | second={ 440 | 'subdomain': 'internalunlikelyhostname', 441 | 'domain': 'information', 
442 | 'suffix': '', 443 | }, 444 | ) 445 | self.assertEqual( 446 | first=self.domain_extractor.extract_from_url('http://216.22.0.192/'), 447 | second={ 448 | 'subdomain': '216.22.0', 449 | 'domain': '192', 450 | 'suffix': '', 451 | }, 452 | ) 453 | self.assertEqual( 454 | first=self.domain_extractor.extract_from_url('http://216.22.project.coop/'), 455 | second={ 456 | 'subdomain': '216.22', 457 | 'domain': 'project', 458 | 'suffix': 'coop', 459 | }, 460 | ) 461 | self.assertEqual( 462 | first=self.domain_extractor.extract_from_url('http://xn--h1alffa9f.xn--p1ai'), 463 | second={ 464 | 'subdomain': '', 465 | 'domain': 'xn--h1alffa9f', 466 | 'suffix': 'xn--p1ai', 467 | }, 468 | ) 469 | self.assertEqual( 470 | first=self.domain_extractor.extract_from_url('http://xN--h1alffa9f.xn--p1ai'), 471 | second={ 472 | 'subdomain': '', 473 | 'domain': 'xn--h1alffa9f', 474 | 'suffix': 'xn--p1ai', 475 | }, 476 | ) 477 | self.assertEqual( 478 | first=self.domain_extractor.extract_from_url('http://XN--h1alffa9f.xn--p1ai'), 479 | second={ 480 | 'subdomain': '', 481 | 'domain': 'xn--h1alffa9f', 482 | 'suffix': 'xn--p1ai', 483 | }, 484 | ) 485 | self.assertEqual( 486 | first=self.domain_extractor.extract_from_url('http://xn--zckzap6140b352by.blog.so-net.xn--wcvs22d.hk'), 487 | second={ 488 | 'subdomain': 'xn--zckzap6140b352by.blog', 489 | 'domain': 'so-net', 490 | 'suffix': 'xn--wcvs22d.hk', 491 | }, 492 | ) 493 | self.assertEqual( 494 | first=self.domain_extractor.extract_from_url('http://xn--zckzap6140b352by.blog.so-net.教育.hk'), 495 | second={ 496 | 'subdomain': 'xn--zckzap6140b352by.blog', 497 | 'domain': 'so-net', 498 | 'suffix': '教育.hk', 499 | }, 500 | ) 501 | self.assertEqual( 502 | first=self.domain_extractor.extract_from_url('https://mail.google.com/mail'), 503 | second={ 504 | 'subdomain': 'mail', 505 | 'domain': 'google', 506 | 'suffix': 'com', 507 | }, 508 | ) 509 | self.assertEqual( 510 | first=self.domain_extractor.extract_from_url('ssh://mail.google.com/mail'), 511 | 
second={ 512 | 'subdomain': 'mail', 513 | 'domain': 'google', 514 | 'suffix': 'com', 515 | }, 516 | ) 517 | self.assertEqual( 518 | first=self.domain_extractor.extract_from_url('git+ssh://www.github.com:8443/'), 519 | second={ 520 | 'subdomain': 'www', 521 | 'domain': 'github', 522 | 'suffix': 'com', 523 | }, 524 | ) 525 | self.assertEqual( 526 | first=self.domain_extractor.extract_from_url('ftp://johndoe:5cr1p7k1dd13@1337.warez.com:2501'), 527 | second={ 528 | 'subdomain': '1337', 529 | 'domain': 'warez', 530 | 'suffix': 'com', 531 | }, 532 | ) 533 | self.assertEqual( 534 | first=self.domain_extractor.extract_from_url('http://google.com/?q=cats'), 535 | second={ 536 | 'subdomain': '', 537 | 'domain': 'google', 538 | 'suffix': 'com', 539 | }, 540 | ) 541 | self.assertEqual( 542 | first=self.domain_extractor.extract_from_url('http://google.com/#Welcome'), 543 | second={ 544 | 'subdomain': '', 545 | 'domain': 'google', 546 | 'suffix': 'com', 547 | }, 548 | ) 549 | self.assertEqual( 550 | first=self.domain_extractor.extract_from_url('http://google.com/#Welcome'), 551 | second={ 552 | 'subdomain': '', 553 | 'domain': 'google', 554 | 'suffix': 'com', 555 | }, 556 | ) 557 | self.assertEqual( 558 | first=self.domain_extractor.extract_from_url('http://google.com/s#Welcome'), 559 | second={ 560 | 'subdomain': '', 561 | 'domain': 'google', 562 | 'suffix': 'com', 563 | }, 564 | ) 565 | self.assertEqual( 566 | first=self.domain_extractor.extract_from_url('http://google.com/s?q=cats#Welcome'), 567 | second={ 568 | 'subdomain': '', 569 | 'domain': 'google', 570 | 'suffix': 'com', 571 | }, 572 | ) 573 | self.assertEqual( 574 | first=self.domain_extractor.extract_from_url('http://www.parliament.uk'), 575 | second={ 576 | 'subdomain': 'www', 577 | 'domain': 'parliament', 578 | 'suffix': 'uk', 579 | }, 580 | ) 581 | self.assertEqual( 582 | first=self.domain_extractor.extract_from_url('http://www.parliament.co.uk'), 583 | second={ 584 | 'subdomain': 'www', 585 | 'domain': 
'parliament', 586 | 'suffix': 'co.uk', 587 | }, 588 | ) 589 | self.assertEqual( 590 | first=self.domain_extractor.extract_from_url('http://www.cgs.act.edu.au/'), 591 | second={ 592 | 'subdomain': 'www', 593 | 'domain': 'cgs', 594 | 'suffix': 'act.edu.au', 595 | }, 596 | ) 597 | self.assertEqual( 598 | first=self.domain_extractor.extract_from_url('http://www.google.com.au/'), 599 | second={ 600 | 'subdomain': 'www', 601 | 'domain': 'google', 602 | 'suffix': 'com.au', 603 | }, 604 | ) 605 | self.assertEqual( 606 | first=self.domain_extractor.extract_from_url('http://www.metp.net.cn'), 607 | second={ 608 | 'subdomain': 'www', 609 | 'domain': 'metp', 610 | 'suffix': 'net.cn', 611 | }, 612 | ) 613 | self.assertEqual( 614 | first=self.domain_extractor.extract_from_url('http://waiterrant.blogspot.com'), 615 | second={ 616 | 'subdomain': '', 617 | 'domain': 'waiterrant', 618 | 'suffix': 'blogspot.com', 619 | }, 620 | ) 621 | self.assertEqual( 622 | first=self.domain_extractor.extract_from_url('http://127.0.0.1/foo/bar'), 623 | second={ 624 | 'subdomain': '127.0.0', 625 | 'domain': '1', 626 | 'suffix': '', 627 | }, 628 | ) 629 | self.assertEqual( 630 | first=self.domain_extractor.extract_from_url('http://256.256.256.256/foo/bar'), 631 | second={ 632 | 'subdomain': '256.256.256', 633 | 'domain': '256', 634 | 'suffix': '', 635 | }, 636 | ) 637 | self.assertEqual( 638 | first=self.domain_extractor.extract_from_url('http://127.0.0.1.9/foo/bar'), 639 | second={ 640 | 'subdomain': '127.0.0.1', 641 | 'domain': '9', 642 | 'suffix': '', 643 | }, 644 | ) 645 | self.assertEqual( 646 | first=self.domain_extractor.extract_from_url('http://admin:password1@www.google.com:666/secret/admin/interface?param1=42'), 647 | second={ 648 | 'subdomain': 'www', 649 | 'domain': 'google', 650 | 'suffix': 'com', 651 | }, 652 | ) 653 | self.assertEqual( 654 | first=self.domain_extractor.extract_from_url('//admin:password1@www.google.com:666/secret/admin/interface?param1=42'), 655 | second={ 656 | 
'subdomain': 'www', 657 | 'domain': 'google', 658 | 'suffix': 'com', 659 | }, 660 | ) 661 | self.assertEqual( 662 | first=self.domain_extractor.extract_from_url('//mail.google.com/mail'), 663 | second={ 664 | 'subdomain': 'mail', 665 | 'domain': 'google', 666 | 'suffix': 'com', 667 | }, 668 | ) 669 | self.assertEqual( 670 | first=self.domain_extractor.extract_from_url('http://test.nu'), 671 | second={ 672 | 'subdomain': '', 673 | 'domain': 'test', 674 | 'suffix': 'nu', 675 | }, 676 | ) 677 | 678 | def test_is_valid_domain( 679 | self, 680 | ): 681 | self.assertTrue( 682 | expr=self.domain_extractor.is_valid_domain('domain.com'), 683 | ) 684 | self.assertTrue( 685 | expr=self.domain_extractor.is_valid_domain('sub.domain.com'), 686 | ) 687 | self.assertTrue( 688 | expr=self.domain_extractor.is_valid_domain('domain.COM'), 689 | ) 690 | self.assertTrue( 691 | expr=self.domain_extractor.is_valid_domain('domain.co.il'), 692 | ) 693 | self.assertTrue( 694 | expr=self.domain_extractor.is_valid_domain('domain.co.za'), 695 | ) 696 | self.assertFalse( 697 | expr=self.domain_extractor.is_valid_domain('domain.invalid'), 698 | ) 699 | self.assertFalse( 700 | expr=self.domain_extractor.is_valid_domain('com'), 701 | ) 702 | self.assertFalse( 703 | expr=self.domain_extractor.is_valid_domain('com'), 704 | ) 705 | self.assertFalse( 706 | expr=self.domain_extractor.is_valid_domain('-domain.com'), 707 | ) 708 | self.assertFalse( 709 | expr=self.domain_extractor.is_valid_domain('domain-.com'), 710 | ) 711 | self.assertFalse( 712 | expr=self.domain_extractor.is_valid_domain('-sub.domain.com'), 713 | ) 714 | self.assertFalse( 715 | expr=self.domain_extractor.is_valid_domain('sub-.domain.com'), 716 | ) 717 | 718 | self.assertTrue( 719 | expr=self.domain_extractor.is_valid_domain('domain.xn--mgbaakc7dvf'), 720 | ) 721 | self.assertTrue( 722 | expr=self.domain_extractor.is_valid_domain('domain.اتصالات'), 723 | ) 724 | self.assertTrue( 725 | 
expr=self.domain_extractor.is_valid_domain('xn--mgbaakc7dvf.com'), 726 | ) 727 | self.assertTrue( 728 | expr=self.domain_extractor.is_valid_domain('اتصالات.com'), 729 | ) 730 | self.assertTrue( 731 | expr=self.domain_extractor.is_valid_domain('اتصالات.اتصالات'), 732 | ) 733 | self.assertTrue( 734 | expr=self.domain_extractor.is_valid_domain('xn--mgbaakc7dvf.xn--mgbaakc7dvf'), 735 | ) 736 | 737 | self.assertFalse( 738 | expr=self.domain_extractor.is_valid_domain('domain.xn--mgbaakc7dvfa'), 739 | ) 740 | self.assertFalse( 741 | expr=self.domain_extractor.is_valid_domain('domain.اsتصالات'), 742 | ) 743 | self.assertFalse( 744 | expr=self.domain_extractor.is_valid_domain('xn--mgbaaskc7777dvf.com'), 745 | ) 746 | self.assertFalse( 747 | expr=self.domain_extractor.is_valid_domain('اتصالsات.com'), 748 | ) 749 | self.assertFalse( 750 | expr=self.domain_extractor.is_valid_domain('اتصالاsت.اتصالات'), 751 | ) 752 | self.assertFalse( 753 | expr=self.domain_extractor.is_valid_domain('xn--mgbsaadddd1212121212kc7dvf.xn--mgbaakc7dvf'), 754 | ) 755 | 756 | self.assertFalse( 757 | expr=self.domain_extractor.is_valid_domain('\xF0\x9F\x98\x81nonalphanum.com'), 758 | ) 759 | 760 | self.assertFalse( 761 | expr=self.domain_extractor.is_valid_domain('.com'), 762 | ) 763 | self.assertFalse( 764 | expr=self.domain_extractor.is_valid_domain('domain..com'), 765 | ) 766 | self.assertFalse( 767 | expr=self.domain_extractor.is_valid_domain('sub..domain.com'), 768 | ) 769 | self.assertFalse( 770 | expr=self.domain_extractor.is_valid_domain('domain.com.'), 771 | ) 772 | self.assertFalse( 773 | expr=self.domain_extractor.is_valid_domain('com.'), 774 | ) 775 | 776 | def test_mutability( 777 | self, 778 | ): 779 | domain_to_test_original = 'Google.COM' 780 | domain_to_test = 'Google.COM' 781 | 782 | self.domain_extractor.is_valid_domain(domain_to_test) 783 | self.assertEqual( 784 | first=domain_to_test_original, 785 | second=domain_to_test, 786 | ) 787 | 788 | 
self.domain_extractor.extract(domain_to_test) 789 | self.assertEqual( 790 | first=domain_to_test_original, 791 | second=domain_to_test, 792 | ) 793 | 794 | url_to_test_original = 'http://Google.COM/A.php?Bla=true' 795 | url_to_test = 'http://Google.COM/A.php?Bla=true' 796 | self.domain_extractor.extract_from_url(url_to_test) 797 | self.assertEqual( 798 | first=url_to_test_original, 799 | second=url_to_test, 800 | ) 801 | 802 | 803 | class DomainExtractorLoadTestCase( 804 | unittest.TestCase, 805 | ): 806 | def test_load_called_without_data( 807 | self, 808 | ): 809 | domain_extractor = pydomainextractor.DomainExtractor() 810 | 811 | self.assertEqual( 812 | first=domain_extractor.extract('com'), 813 | second={ 814 | 'subdomain': '', 815 | 'domain': '', 816 | 'suffix': 'com', 817 | }, 818 | ) 819 | 820 | def test_load_called_with_data( 821 | self, 822 | ): 823 | domain_extractor = pydomainextractor.DomainExtractor( 824 | 'com\n' 825 | ) 826 | 827 | self.assertEqual( 828 | first=domain_extractor.extract('com'), 829 | second={ 830 | 'subdomain': '', 831 | 'domain': '', 832 | 'suffix': 'com', 833 | }, 834 | ) 835 | 836 | domain_extractor = pydomainextractor.DomainExtractor( 837 | 'net\n' 838 | ) 839 | 840 | self.assertEqual( 841 | first=domain_extractor.extract('com'), 842 | second={ 843 | 'subdomain': '', 844 | 'domain': 'com', 845 | 'suffix': '', 846 | }, 847 | ) 848 | 849 | domain_extractor = pydomainextractor.DomainExtractor( 850 | 'customtld\n' 851 | ) 852 | 853 | self.assertEqual( 854 | first=domain_extractor.extract('google.customtld'), 855 | second={ 856 | 'subdomain': '', 857 | 'domain': 'google', 858 | 'suffix': 'customtld', 859 | }, 860 | ) 861 | 862 | domain_extractor = pydomainextractor.DomainExtractor( 863 | 'tld\n' 864 | 'custom.tld\n' 865 | ) 866 | 867 | self.assertEqual( 868 | first=domain_extractor.extract('google.custom.tld'), 869 | second={ 870 | 'subdomain': '', 871 | 'domain': 'google', 872 | 'suffix': 'custom.tld', 873 | }, 874 | ) 875 | 876 | def 
test_get_tld_list( 877 | self, 878 | ): 879 | domain_extractor = pydomainextractor.DomainExtractor( 880 | 'com\n' 881 | ) 882 | 883 | self.assertEqual( 884 | first=domain_extractor.get_tld_list(), 885 | second=[ 886 | 'com', 887 | ], 888 | ) 889 | 890 | domain_extractor = pydomainextractor.DomainExtractor( 891 | 'com\n' 892 | 'net\n' 893 | 'org\n' 894 | 'uk.com\n' 895 | ) 896 | 897 | self.assertCountEqual( 898 | first=domain_extractor.get_tld_list(), 899 | second=[ 900 | 'com', 901 | 'net', 902 | 'org', 903 | 'uk.com', 904 | ], 905 | ) 906 | --------------------------------------------------------------------------------