├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── pyproject.toml ├── tests ├── test_cli.py ├── test_deconstruct_url.py ├── test_generic_url_cleanup.py ├── test_normalize_fragment.py ├── test_normalize_host.py ├── test_normalize_path.py ├── test_normalize_port.py ├── test_normalize_query.py ├── test_normalize_query_filters.py ├── test_normalize_scheme.py ├── test_normalize_userinfo.py ├── test_provide_url_domain.py ├── test_provide_url_scheme.py ├── test_reconstruct_url.py ├── test_tools.py └── test_url_normalize.py └── url_normalize ├── __init__.py ├── cli.py ├── generic_url_cleanup.py ├── normalize_fragment.py ├── normalize_host.py ├── normalize_path.py ├── normalize_port.py ├── normalize_query.py ├── normalize_scheme.py ├── normalize_userinfo.py ├── param_allowlist.py ├── provide_url_domain.py ├── provide_url_scheme.py ├── py.typed ├── tools.py └── url_normalize.py /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | # Avoid using all the resources/limits available by checking only 6 | # relevant branches and tags. Other branches can be checked via PRs. 
7 | branches: [master] 8 | tags: ['v[0-9]*', '[0-9]+.[0-9]+*'] # Match tags that resemble a version 9 | pull_request: 10 | workflow_dispatch: # Allow manually triggering the workflow 11 | schedule: 12 | # Run roughly every 15 days at 00:00 UTC 13 | # (useful to check if updates on dependencies break the package) 14 | - cron: '0 0 1,16 * *' 15 | 16 | concurrency: 17 | group: >- 18 | ${{ github.workflow }}-${{ github.ref_type }}- 19 | ${{ github.event.pull_request.number || github.sha }} 20 | cancel-in-progress: true 21 | 22 | jobs: 23 | prepare: 24 | runs-on: ubuntu-latest 25 | outputs: 26 | wheel-distribution: ${{ steps.wheel-distribution.outputs.path }} 27 | steps: 28 | - uses: actions/checkout@v4 29 | with: {fetch-depth: 0} # deep clone for setuptools-scm 30 | - uses: actions/setup-python@v5 31 | with: {python-version-file: "pyproject.toml"} 32 | - uses: astral-sh/setup-uv@v5 33 | - name: Run static analysis and format checkers 34 | run: uv run --with '.[dev]' pre-commit run --all-files 35 | - name: Build package distribution files 36 | run: uv build 37 | - name: Record the path of wheel distribution 38 | id: wheel-distribution 39 | run: echo "path=$(ls dist/*.whl)" >> $GITHUB_OUTPUT 40 | - name: Store the distribution files for use in other stages 41 | uses: actions/upload-artifact@v4 42 | with: 43 | name: python-distribution-files 44 | path: dist/ 45 | retention-days: 1 46 | 47 | test: 48 | needs: prepare 49 | strategy: 50 | matrix: 51 | python: 52 | - "3.8" # oldest Python supported by url-normalize (requires-python in pyproject.toml) 53 | - "3.x" # newest Python that is stable 54 | platform: 55 | - ubuntu-latest 56 | - macos-13 57 | - windows-latest 58 | runs-on: ${{ matrix.platform }} 59 | steps: 60 | - uses: actions/checkout@v4 61 | - uses: actions/setup-python@v5 62 | with: 63 | python-version: ${{ matrix.python }} 64 | - uses: astral-sh/setup-uv@v5 65 | - name: Retrieve pre-built distribution files 66 | uses: actions/download-artifact@v4 67 | with: {name: python-distribution-files, 
path: dist/} 68 | - name: Run tests 69 | run: uv run --with '.[dev]' pytest --cov-report=lcov:coverage.lcov 70 | - name: Upload partial coverage report 71 | uses: coverallsapp/github-action@v2 72 | with: 73 | path-to-lcov: coverage.lcov 74 | github-token: ${{ secrets.GITHUB_TOKEN }} 75 | flag-name: ${{ matrix.platform }} - py${{ matrix.python }} 76 | parallel: true 77 | 78 | finalize: 79 | needs: test 80 | runs-on: ubuntu-latest 81 | steps: 82 | - name: Finalize coverage report 83 | uses: coverallsapp/github-action@v2 84 | with: 85 | github-token: ${{ secrets.GITHUB_TOKEN }} 86 | parallel-finished: true 87 | 88 | publish: 89 | needs: finalize 90 | if: ${{ github.event_name == 'push' && contains(github.ref, 'refs/tags/') }} 91 | runs-on: ubuntu-latest 92 | permissions: 93 | id-token: write 94 | steps: 95 | - uses: actions/checkout@v4 96 | - uses: actions/setup-python@v5 97 | with: {python-version-file: "pyproject.toml"} 98 | - uses: astral-sh/setup-uv@v5 99 | - name: Retrieve pre-built distribution files 100 | uses: actions/download-artifact@v4 101 | with: {name: python-distribution-files, path: dist/} 102 | - name: Publish Package to PyPI 103 | run: uv publish dist/* 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .* 3 | !.coveragerc 4 | !.github 5 | !.pre-commit-config.yaml 6 | *.egg-info 7 | *.pyc 8 | build 9 | dist 10 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: check-added-large-files 6 | - id: check-ast 7 | - id: check-json 8 | - id: check-merge-conflict 9 | - id: check-symlinks 10 | - id: check-toml 11 | - id: check-xml 12 | - id: check-yaml 13 | - id: 
debug-statements 14 | - id: end-of-file-fixer 15 | - id: requirements-txt-fixer 16 | - id: trailing-whitespace 17 | - id: mixed-line-ending 18 | args: ["--fix=auto"] # replace 'auto' with 'lf' to enforce Linux/Mac line endings or 'crlf' for Windows 19 | - repo: https://github.com/abravalheri/validate-pyproject 20 | rev: v0.24.1 21 | hooks: 22 | - id: validate-pyproject 23 | - repo: https://github.com/pre-commit/mirrors-mypy 24 | rev: v1.15.0 25 | hooks: 26 | - id: mypy 27 | exclude: tests 28 | - repo: https://github.com/igorshubovych/markdownlint-cli 29 | rev: v0.44.0 30 | hooks: 31 | - id: markdownlint 32 | args: ["--fix", "--disable", "MD024"] 33 | - repo: https://github.com/codespell-project/codespell 34 | rev: v2.4.1 35 | hooks: 36 | - id: codespell 37 | - repo: https://github.com/astral-sh/ruff-pre-commit 38 | rev: v0.11.4 39 | hooks: 40 | - id: ruff-format 41 | - id: ruff 42 | args: [--fix, --exit-non-zero-on-fix] 43 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [2.2.1] - 2025-04-26 9 | 10 | ### Added 11 | 12 | - Include `py.typed` marker file for PEP 561 compatibility. 
13 | 14 | ## [2.2.0] - 2025-03-30 15 | 16 | ### Added 17 | 18 | - New `default_domain` parameter to support absolute paths with domain names (#22) 19 | 20 | ### Fixed 21 | 22 | - Handle URLs with missing slashes correctly (#19) 23 | - Fix decoding of reserved characters in URL paths (#25) 24 | - Fix Twitter hashtag encoding in query parameters (#31) 25 | 26 | ### Internal 27 | 28 | - Update CI configuration to use uv from PATH 29 | 30 | ## [2.1.0] - 2025-03-30 31 | 32 | ### Added 33 | 34 | - New command-line interface (`url-normalize`) with support for: 35 | - Version information (`--version`, `-v`) 36 | - Charset selection (`--charset`, `-c`) 37 | - Default scheme override (`--default-scheme`, `-s`) 38 | - Query parameter filtering (`--filter-params`, `-f`) 39 | - Custom allowlist for query parameters (`--param-allowlist`, `-p`) 40 | 41 | ### Fixed 42 | 43 | - Do not encode equals sign in fragment (Fixes #36) 44 | 45 | ### Internal 46 | 47 | - Add GitHub Action to publish package to PyPI using uv 48 | 49 | ## [2.0.1] - 2025-03-29 50 | 51 | ### Fixed 52 | 53 | - Reverted license format in pyproject.toml to maintain Python 3.8 compatibility with older setuptools versions 54 | 55 | ## [2.0.0] - 2025-03-29 56 | 57 | ### Added 58 | 59 | - Query parameter filtering functionality 60 | - Parameter allowlist feature for controlling accepted query parameters 61 | - IDNA 2008 support via `idna` package 62 | 63 | ### Changed 64 | 65 | - **BREAKING:** Switch default scheme from 'http' to 'https' 66 | - **BREAKING:** Migrated IDNA handling to use IDNA 2008 with UTS46 processing 67 | - **BREAKING:** Updated minimum Python version to 3.8 (removed Python 2.7 support) 68 | - **BREAKING:** Removed sort_query_params option as it was incorrect - query parameter order is semantically meaningful and cannot be changed 69 | - Enhanced query normalization with parameter filtering support 70 | - Updated URL cleanup to support new filtering features 71 | - Changed host normalization to 
handle each domain label separately 72 | 73 | ### Internal 74 | 75 | - Refactored code organization for improved maintainability: 76 | - Split url_normalize.py into separate function modules 77 | - Moved each normalization function to its own file 78 | - Reorganized constants to their relevant modules 79 | - Maintained backward compatibility and test coverage 80 | - Added pre-commit hooks for code quality and linting 81 | - Dedicated CHANGELOG.md file 82 | - Increased test coverage requirement to 100% 83 | - Migrated from Travis CI to GitHub Actions for testing across multiple Python versions 84 | - Moved pytest configuration from tox.ini to pyproject.toml 85 | - Removed Travis CI configuration in favor of GitHub Actions 86 | - Semantic versioning compliance 87 | - Upgraded project structure to modern Python packaging standards using pyproject.toml 88 | 89 | ## [1.4.3] - 2024-02-15 90 | 91 | ### Added 92 | 93 | - LICENSE file 94 | 95 | ## [1.4.2] 96 | 97 | ### Added 98 | 99 | - Optional param `sort_query_params` (True by default) 100 | 101 | ## [1.4.1] 102 | 103 | ### Added 104 | 105 | - Param `default_scheme` to url_normalize ('https' by default) 106 | 107 | ## [1.4.0] 108 | 109 | ### Changed 110 | 111 | - Code refactoring and cleanup 112 | 113 | ## [1.3.3] 114 | 115 | ### Added 116 | 117 | - Support for empty string and double slash urls (//domain.tld) 118 | 119 | ## [1.3.2] 120 | 121 | ### Added 122 | 123 | - Cross-version compatibility: same code supports both Python 3 and Python 2 124 | 125 | ## [1.3.1] 126 | 127 | ### Added 128 | 129 | - Python 3 compatibility 130 | 131 | ## [1.2.1] 132 | 133 | ### Changed 134 | 135 | - PEP8 compliance improvements 136 | - Setup.py improvements 137 | 138 | ## [1.1.2] 139 | 140 | ### Added 141 | 142 | - Support for shebang (#!) 
urls 143 | 144 | ## [1.1.1] 145 | 146 | ### Changed 147 | 148 | - Using 'http' schema by default when appropriate 149 | 150 | ## [1.1.0] 151 | 152 | ### Added 153 | 154 | - Handling of IDN domains 155 | 156 | ## [1.0.0] 157 | 158 | ### Changed 159 | 160 | - Code PEP8 compliance 161 | 162 | ## [0.1.0] 163 | 164 | ### Added 165 | 166 | - Initial release 167 | - Forked from Sam Ruby's urlnorm.py 168 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Nikolay Panov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | @uv pip install -e ".[dev]" 3 | 4 | # tox target removed 5 | 6 | update: install 7 | @uv run -- pre-commit autoupdate 8 | 9 | lint: install 10 | @uv run -- pre-commit run -a 11 | 12 | test: install 13 | @uv run -- pytest 14 | 15 | build: 16 | @rm -rf dist 17 | @uv build 18 | 19 | publish: build 20 | @uv publish 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # url-normalize 2 | 3 | [![tests](https://github.com/niksite/url-normalize/actions/workflows/ci.yml/badge.svg)](https://github.com/niksite/url-normalize/actions/workflows/ci.yml) 4 | [![Coveralls](https://img.shields.io/coveralls/github/niksite/url-normalize/master.svg)](https://coveralls.io/r/niksite/url-normalize) 5 | [![PyPI](https://img.shields.io/pypi/v/url-normalize.svg)](https://pypi.org/project/url-normalize/) 6 | 7 | A Python library for standardizing and normalizing URLs with support for internationalized domain names (IDN). 8 | 9 | ## Table of Contents 10 | 11 | - [Introduction](#introduction) 12 | - [Features](#features) 13 | - [Installation](#installation) 14 | - [Usage](#usage) 15 | - [Python API](#python-api) 16 | - [Command Line](#command-line-usage) 17 | - [Documentation](#documentation) 18 | - [Contributing](#contributing) 19 | - [License](#license) 20 | 21 | ## Introduction 22 | 23 | url-normalize provides a robust URI normalization function that: 24 | 25 | - Takes care of IDN domains. 26 | - Always provides the URI scheme in lowercase characters. 27 | - Always provides the host, if any, in lowercase characters. 28 | - Only performs percent-encoding where it is essential. 29 | - Always uses uppercase A-through-F characters when percent-encoding. 
30 | - Prevents dot-segments appearing in non-relative URI paths. 31 | - For schemes that define a default authority, uses an empty authority if the 32 | default is desired. 33 | - For schemes that define an empty path to be equivalent to a path of "/", 34 | uses "/". 35 | - For schemes that define a port, uses an empty port if the default is desired. 36 | - Ensures all portions of the URI are UTF-8 encoded NFC from Unicode strings. 37 | 38 | Inspired by Sam Ruby's [urlnorm.py](http://intertwingly.net/blog/2004/08/04/Urlnorm) 39 | 40 | ## Features 41 | 42 | - **IDN Support**: Full internationalized domain name handling 43 | - **Configurable Defaults**: 44 | - Customizable default scheme (https by default) 45 | - Configurable default domain for absolute paths 46 | - **Query Parameter Control**: 47 | - Parameter filtering with allowlists 48 | - Support for domain-specific parameter rules 49 | - **Versatile URL Handling**: 50 | - Empty string URLs 51 | - Double slash URLs (//domain.tld) 52 | - Shebang (#!) 
URLs 53 | - **Developer Friendly**: 54 | - Cross-version Python compatibility (3.8+) 55 | - 100% test coverage 56 | - Modern type hints and string handling 57 | 58 | ## Installation 59 | 60 | ```sh 61 | pip install url-normalize 62 | ``` 63 | 64 | ## Usage 65 | 66 | ### Python API 67 | 68 | ```python 69 | from url_normalize import url_normalize 70 | 71 | # Basic normalization (uses https by default) 72 | print(url_normalize("www.foo.com:80/foo")) 73 | # Output: https://www.foo.com:80/foo 74 | 75 | # With custom default scheme 76 | print(url_normalize("www.foo.com/foo", default_scheme="http")) 77 | # Output: http://www.foo.com/foo 78 | 79 | # With query parameter filtering enabled 80 | print(url_normalize("www.google.com/search?q=test&utm_source=test", filter_params=True)) 81 | # Output: https://www.google.com/search?q=test 82 | 83 | # With custom parameter allowlist as a dict 84 | print(url_normalize( 85 | "example.com?page=1&id=123&ref=test", 86 | filter_params=True, 87 | param_allowlist={"example.com": ["page", "id"]} 88 | )) 89 | # Output: https://example.com/?page=1&id=123 90 | 91 | # With custom parameter allowlist as a list 92 | print(url_normalize( 93 | "example.com?page=1&id=123&ref=test", 94 | filter_params=True, 95 | param_allowlist=["page", "id"] 96 | )) 97 | # Output: https://example.com/?page=1&id=123 98 | 99 | # With default domain for absolute paths 100 | print(url_normalize("/images/logo.png", default_domain="example.com")) 101 | # Output: https://example.com/images/logo.png 102 | 103 | # With default domain and custom scheme 104 | print(url_normalize("/images/logo.png", default_scheme="http", default_domain="example.com")) 105 | # Output: http://example.com/images/logo.png 106 | ``` 107 | 108 | ### Command-line Usage 109 | 110 | You can also use `url-normalize` from the command line: 111 | 112 | ```bash 113 | $ url-normalize "www.foo.com:80/foo" 114 | # Output: https://www.foo.com:80/foo 115 | 116 | # With custom default scheme 117 | $ url-normalize -s 
http "www.foo.com/foo" 118 | # Output: http://www.foo.com/foo 119 | 120 | # With query parameter filtering 121 | $ url-normalize -f "www.google.com/search?q=test&utm_source=test" 122 | # Output: https://www.google.com/search?q=test 123 | 124 | # With custom allowlist 125 | $ url-normalize -f -p page,id "example.com?page=1&id=123&ref=test" 126 | # Output: https://example.com/?page=1&id=123 127 | 128 | # With default domain for absolute paths 129 | $ url-normalize -d example.com "/images/logo.png" 130 | # Output: https://example.com/images/logo.png 131 | 132 | # With default domain and custom scheme 133 | $ url-normalize -d example.com -s http "/images/logo.png" 134 | # Output: http://example.com/images/logo.png 135 | 136 | # Via uv tool/uvx 137 | $ uvx url-normalize www.foo.com:80/foo 138 | # Output: https://www.foo.com:80/foo 139 | ``` 140 | 141 | ## Documentation 142 | 143 | For a complete history of changes, see [CHANGELOG.md](CHANGELOG.md). 144 | 145 | ## Contributing 146 | 147 | Contributions are welcome! Please feel free to submit a Pull Request. 
148 | 149 | ## License 150 | 151 | MIT License 152 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "url-normalize" 3 | version = "2.2.1" 4 | description = "URL normalization for Python" 5 | authors = [{ name = "Nikolay Panov", email = "github@npanov.com" }] 6 | license = { text = "MIT" } 7 | readme = "README.md" 8 | requires-python = ">=3.8" 9 | keywords = ["url", "normalization", "normalize", "normalizer"] 10 | dependencies = ["idna>=3.3"] 11 | 12 | [project.urls] 13 | Homepage = "https://github.com/niksite/url-normalize" 14 | Repository = "https://github.com/niksite/url-normalize" 15 | Issues = "https://github.com/niksite/url-normalize/issues" 16 | Changelog = "https://github.com/niksite/url-normalize/blob/master/CHANGELOG.md" 17 | 18 | [project.scripts] 19 | url-normalize = "url_normalize.cli:main" 20 | 21 | [project.optional-dependencies] 22 | dev = ["mypy", "pre-commit", "pytest-cov", "pytest-socket", "pytest", "ruff"] 23 | 24 | [tool.ruff] 25 | target-version = "py38" 26 | line-length = 88 27 | unsafe-fixes = true 28 | 29 | [tool.ruff.lint] 30 | select = ["ALL"] 31 | extend-select = [ 32 | "D400", # First line should end with a period 33 | "D401", # First line should be in imperative mood 34 | "D413", # Missing blank line after the last section of a multiline docstring 35 | ] 36 | fixable = ["ALL"] 37 | ignore = [ 38 | "COM812", # missing-trailing-comma 39 | "D203", # One blank line before class - we prefer D211 instead 40 | "D213", # multi-line-summary-second-line - we prefer D212 instead 41 | ] 42 | 43 | [tool.ruff.lint.pydocstyle] 44 | convention = "google" 45 | 46 | [tool.ruff.lint.per-file-ignores] 47 | "tests/**" = ["INP001", "ANN001", "ANN201", "S101", "CPY001"] 48 | 49 | [tool.ruff.format] 50 | quote-style = "double" 51 | indent-style = "space" 52 | 53 | [tool.mypy] 54 | ignore_missing_imports = 
true 55 | exclude = ["tests"] 56 | python_version = "3.8" 57 | show_error_codes = true 58 | 59 | [build-system] 60 | requires = ["setuptools>=42", "wheel"] 61 | build-backend = "setuptools.build_meta" 62 | 63 | [tool.pytest.ini_options] 64 | addopts = [ 65 | "--cov-report=term-missing:skip-covered", 66 | "--cov=url_normalize", 67 | "--disable-socket", 68 | "-v", 69 | ] 70 | python_files = ["tests.py", "test_*.py", "*_tests.py"] 71 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """Tests for the command line interface.""" 2 | 3 | import subprocess 4 | import sys 5 | from unittest.mock import patch 6 | 7 | import pytest 8 | 9 | from url_normalize import __version__ 10 | from url_normalize.cli import main 11 | 12 | 13 | def run_cli(*args: str) -> subprocess.CompletedProcess: 14 | """Run the CLI command with given arguments. 15 | 16 | Params: 17 | *args: Command line arguments to pass to the CLI. 18 | 19 | Returns: 20 | A completed process with stdout, stderr, and return code. 
21 | 22 | """ 23 | command = [sys.executable, "-m", "url_normalize.cli", *list(args)] 24 | return subprocess.run( # noqa: S603 25 | command, capture_output=True, text=True, check=False 26 | ) 27 | 28 | 29 | def test_cli_error_handling(capsys, monkeypatch): 30 | """Test CLI error handling when URL normalization fails.""" 31 | with patch("url_normalize.cli.url_normalize") as mock_normalize: 32 | mock_normalize.side_effect = Exception("Simulated error") 33 | monkeypatch.setattr("sys.argv", ["url-normalize", "http://example.com"]) 34 | 35 | with pytest.raises(SystemExit) as excinfo: 36 | main() 37 | 38 | assert excinfo.value.code == 1 39 | captured = capsys.readouterr() 40 | assert "Error normalizing URL: Simulated error" in captured.err 41 | assert not captured.out 42 | 43 | 44 | def test_cli_basic_normalization() -> None: 45 | """Test basic URL normalization via CLI.""" 46 | url = "http://EXAMPLE.com/./path/../other/" 47 | expected = "http://example.com/other/" 48 | 49 | result = run_cli(url) 50 | 51 | assert result.returncode == 0 52 | assert result.stdout.strip() == expected 53 | assert not result.stderr 54 | 55 | 56 | def test_cli_basic_normalization_short_args() -> None: 57 | """Test basic URL normalization via CLI using short arguments.""" 58 | url = "http://EXAMPLE.com/./path/../other/" 59 | expected = "http://example.com/other/" 60 | # Using short args where applicable (none for the URL itself) 61 | 62 | result = run_cli(url) # No short args needed for basic case 63 | 64 | assert result.returncode == 0 65 | assert result.stdout.strip() == expected 66 | assert not result.stderr 67 | 68 | 69 | def test_cli_default_scheme() -> None: 70 | """Test default scheme addition via CLI.""" 71 | url = "//example.com" 72 | expected = "https://example.com/" 73 | 74 | result = run_cli(url) 75 | 76 | assert result.returncode == 0 77 | assert result.stdout.strip() == expected 78 | assert not result.stderr 79 | 80 | 81 | def test_cli_default_scheme_short_arg() -> None: 82 | 
"""Test default scheme addition via CLI using short argument.""" 83 | url = "//example.com" 84 | expected = "https://example.com/" 85 | 86 | result = run_cli(url) # Default scheme is implicit, no arg needed 87 | 88 | assert result.returncode == 0 89 | assert result.stdout.strip() == expected 90 | assert not result.stderr 91 | 92 | 93 | def test_cli_custom_default_scheme() -> None: 94 | """Test custom default scheme via CLI.""" 95 | url = "//example.com" 96 | expected = "ftp://example.com/" 97 | 98 | result = run_cli("--default-scheme", "ftp", url) 99 | 100 | assert result.returncode == 0 101 | assert result.stdout.strip() == expected 102 | assert not result.stderr 103 | 104 | 105 | def test_cli_custom_default_scheme_short_arg() -> None: 106 | """Test custom default scheme via CLI using short argument.""" 107 | url = "//example.com" 108 | expected = "ftp://example.com/" 109 | 110 | result = run_cli("-s", "ftp", url) 111 | 112 | assert result.returncode == 0 113 | assert result.stdout.strip() == expected 114 | assert not result.stderr 115 | 116 | 117 | def test_cli_filter_params() -> None: 118 | """Test parameter filtering via CLI.""" 119 | url = "http://google.com?utm_source=test&q=1" 120 | expected = "http://google.com/?q=1" 121 | 122 | result = run_cli("--filter-params", url) 123 | 124 | assert result.returncode == 0 125 | assert result.stdout.strip() == expected 126 | assert not result.stderr 127 | 128 | 129 | def test_cli_filter_params_short_arg() -> None: 130 | """Test parameter filtering via CLI using short argument.""" 131 | url = "http://google.com?utm_source=test&q=1" 132 | expected = "http://google.com/?q=1" 133 | 134 | result = run_cli("-f", url) 135 | 136 | assert result.returncode == 0 137 | assert result.stdout.strip() == expected 138 | assert not result.stderr 139 | 140 | 141 | def test_cli_param_allowlist() -> None: 142 | """Test parameter allowlist via CLI.""" 143 | url = "http://example.com?remove=me&keep=this&remove_too=true" 144 | expected = 
"http://example.com/?keep=this" 145 | # Use filter_params to enable filtering, then allowlist to keep specific ones 146 | 147 | result = run_cli("-f", "-p", "keep", url) 148 | 149 | assert result.returncode == 0 150 | assert result.stdout.strip() == expected 151 | assert not result.stderr 152 | 153 | 154 | def test_cli_param_allowlist_multiple() -> None: 155 | """Test parameter allowlist with multiple params via CLI.""" 156 | url = "http://example.com?remove=me&keep=this&keep_too=yes&remove_too=true" 157 | expected = "http://example.com/?keep=this&keep_too=yes" 158 | 159 | result = run_cli("-f", "-p", "keep,keep_too", url) 160 | 161 | assert result.returncode == 0 162 | assert result.stdout.strip() == expected 163 | assert not result.stderr 164 | 165 | 166 | def test_cli_param_allowlist_without_filtering() -> None: 167 | """Test allowlist has no effect if filtering is not enabled.""" 168 | url = "http://example.com?remove=me&keep=this&remove_too=true" 169 | expected = "http://example.com/?remove=me&keep=this&remove_too=true" 170 | # Not using -f, so allowlist should be ignored 171 | 172 | result = run_cli("-p", "keep", url) 173 | 174 | assert result.returncode == 0 175 | assert result.stdout.strip() == expected 176 | assert not result.stderr 177 | 178 | 179 | def test_cli_no_url() -> None: 180 | """Test CLI error when no URL is provided.""" 181 | result = run_cli() 182 | 183 | assert result.returncode != 0 184 | assert "the following arguments are required: url" in result.stderr 185 | 186 | 187 | def test_cli_version_long() -> None: 188 | """Test version output with --version flag.""" 189 | result = run_cli("--version") 190 | 191 | assert result.returncode == 0 192 | assert __version__ in result.stdout 193 | assert not result.stderr 194 | 195 | 196 | def test_cli_version_short() -> None: 197 | """Test version output with -v flag.""" 198 | result = run_cli("-v") 199 | 200 | assert result.returncode == 0 201 | assert __version__ in result.stdout 202 | assert not 
result.stderr 203 | 204 | 205 | @pytest.mark.skipif( 206 | sys.platform == "win32", reason="Charset handling differs on Windows CLI" 207 | ) 208 | def test_cli_charset() -> None: 209 | """Test charset handling via CLI (might be platform-dependent).""" 210 | # Example using Cyrillic characters which need correct encoding 211 | url = "http://пример.рф/path" 212 | expected_idn = "http://xn--e1afmkfd.xn--p1ai/path" 213 | 214 | # Test with default UTF-8 215 | result_utf8 = run_cli(url) 216 | 217 | assert result_utf8.returncode == 0 218 | assert result_utf8.stdout.strip() == expected_idn 219 | assert not result_utf8.stderr 220 | 221 | # Test specifying UTF-8 explicitly 222 | result_charset = run_cli("--charset", "utf-8", url) 223 | 224 | assert result_charset.returncode == 0 225 | assert result_charset.stdout.strip() == expected_idn 226 | assert not result_charset.stderr 227 | 228 | # Test specifying UTF-8 explicitly using short arg 229 | result_charset_short = run_cli("-c", "utf-8", url) 230 | 231 | assert result_charset_short.returncode == 0 232 | assert result_charset_short.stdout.strip() == expected_idn 233 | assert not result_charset_short.stderr 234 | 235 | 236 | def test_cli_default_domain() -> None: 237 | """Test adding default domain to absolute path via CLI.""" 238 | url = "/path/to/image.png" 239 | expected = "https://example.com/path/to/image.png" 240 | 241 | result = run_cli("--default-domain", "example.com", url) 242 | 243 | assert result.returncode == 0 244 | assert result.stdout.strip() == expected 245 | assert not result.stderr 246 | 247 | 248 | def test_cli_default_domain_short_arg() -> None: 249 | """Test adding default domain using short argument.""" 250 | url = "/path/to/image.png" 251 | expected = "https://example.com/path/to/image.png" 252 | 253 | result = run_cli("-d", "example.com", url) 254 | 255 | assert result.returncode == 0 256 | assert result.stdout.strip() == expected 257 | assert not result.stderr 258 | 259 | 260 | def 
test_cli_default_domain_with_scheme() -> None: 261 | """Test adding default domain with custom scheme.""" 262 | url = "/path/to/image.png" 263 | expected = "http://example.com/path/to/image.png" 264 | 265 | result = run_cli("-d", "example.com", "-s", "http", url) 266 | 267 | assert result.returncode == 0 268 | assert result.stdout.strip() == expected 269 | assert not result.stderr 270 | 271 | 272 | def test_cli_default_domain_no_effect_on_absolute_urls() -> None: 273 | """Test default domain has no effect on absolute URLs.""" 274 | url = "http://original-domain.com/path" 275 | expected = "http://original-domain.com/path" 276 | 277 | result = run_cli("-d", "example.com", url) 278 | 279 | assert result.returncode == 0 280 | assert result.stdout.strip() == expected 281 | assert not result.stderr 282 | 283 | 284 | def test_cli_default_domain_no_effect_on_relative_paths() -> None: 285 | """Test default domain has no effect on relative paths.""" 286 | url = "path/to/file.html" 287 | # This becomes a regular URL with the default scheme 288 | expected = "https://path/to/file.html" 289 | 290 | result = run_cli("-d", "example.com", url) 291 | 292 | assert result.returncode == 0 293 | assert result.stdout.strip() == expected 294 | assert not result.stderr 295 | -------------------------------------------------------------------------------- /tests/test_deconstruct_url.py: -------------------------------------------------------------------------------- 1 | """Deconstruct url tests.""" 2 | 3 | import pytest 4 | 5 | from url_normalize.tools import URL, deconstruct_url 6 | 7 | 8 | @pytest.mark.parametrize( 9 | ("url", "expected"), 10 | [ 11 | ( 12 | "http://site.com", 13 | URL( 14 | fragment="", 15 | host="site.com", 16 | path="", 17 | port="", 18 | query="", 19 | scheme="http", 20 | userinfo="", 21 | ), 22 | ), 23 | ( 24 | "http://user@www.example.com:8080/path/index.html?param=val#fragment", 25 | URL( 26 | fragment="fragment", 27 | host="www.example.com", 28 | 
path="/path/index.html", 29 | port="8080", 30 | query="param=val", 31 | scheme="http", 32 | userinfo="user@", 33 | ), 34 | ), 35 | ], 36 | ) 37 | def test_deconstruct_url_result_is_expected(url: str, expected: URL) -> None: 38 | """Assert we got expected results from the deconstruct_url function.""" 39 | result = deconstruct_url(url) 40 | assert result == expected, url 41 | -------------------------------------------------------------------------------- /tests/test_generic_url_cleanup.py: -------------------------------------------------------------------------------- 1 | """Tests for generic_url_cleanup function.""" 2 | 3 | from __future__ import annotations 4 | 5 | import pytest 6 | 7 | from url_normalize.url_normalize import generic_url_cleanup 8 | 9 | 10 | @pytest.mark.parametrize( 11 | ("url", "expected"), 12 | [ 13 | ("//site/#!fragment", "//site/?_escaped_fragment_=fragment"), 14 | ("//site/page", "//site/page"), 15 | ("//site/?& ", "//site/"), 16 | ], 17 | ) 18 | def test_generic_url_cleanup_result_is_expected(url: str, expected: str) -> None: 19 | """Assert we got expected results from the generic_url_cleanup function.""" 20 | result = generic_url_cleanup(url) 21 | assert result == expected 22 | -------------------------------------------------------------------------------- /tests/test_normalize_fragment.py: -------------------------------------------------------------------------------- 1 | """Tests for normalize_fragment function.""" 2 | 3 | import pytest 4 | 5 | from url_normalize.url_normalize import normalize_fragment 6 | 7 | 8 | @pytest.mark.parametrize( 9 | ("fragment", "expected"), 10 | [ 11 | ("", ""), 12 | ("fragment", "fragment"), 13 | ("пример", "%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80"), 14 | ("!fragment", "%21fragment"), 15 | ("~fragment", "~fragment"), 16 | # Issue #36: Equal sign should not be encoded 17 | ("gid=1234", "gid=1234"), 18 | ], 19 | ) 20 | def test_normalize_fragment_result_is_expected(fragment: str, expected: str) -> None: 21 | 
"""Assert we got expected results from the normalize_fragment function.""" 22 | result = normalize_fragment(fragment) 23 | assert result == expected, fragment 24 | -------------------------------------------------------------------------------- /tests/test_normalize_host.py: -------------------------------------------------------------------------------- 1 | """Tests for normalize_host function.""" 2 | 3 | import pytest 4 | 5 | from url_normalize.url_normalize import normalize_host 6 | 7 | 8 | @pytest.mark.parametrize( 9 | ("host", "expected"), 10 | [ 11 | # Basic cases 12 | ("site.com", "site.com"), 13 | ("SITE.COM", "site.com"), 14 | ("site.com.", "site.com"), 15 | # Cyrillic domains 16 | ("пример.испытание", "xn--e1afmkfd.xn--80akhbyknj4f"), 17 | # Mixed case with Cyrillic 18 | ("ExAmPle.РФ", "example.xn--p1ai"), 19 | # IDNA2008 with UTS46 20 | ("faß.de", "fass.de"), # Normalize using transitional rules 21 | # Edge cases 22 | ("ドメイン.テスト", "xn--eckwd4c7c.xn--zckzah"), # Japanese 23 | ("domain.café", "domain.xn--caf-dma"), # Latin with diacritic 24 | # Normalization tests 25 | ("über.example", "xn--ber-goa.example"), # IDNA 2008 for umlaut 26 | ("example。com", "example.com"), # Normalize full-width punctuation 27 | ], 28 | ) 29 | def test_normalize_host_result_is_expected(host: str, expected: str) -> None: 30 | """Assert we got expected results from the normalize_host function.""" 31 | result = normalize_host(host) 32 | assert result == expected, host 33 | -------------------------------------------------------------------------------- /tests/test_normalize_path.py: -------------------------------------------------------------------------------- 1 | """Tests for normalize_path function.""" 2 | 3 | import pytest 4 | 5 | from url_normalize.url_normalize import normalize_path 6 | 7 | 8 | @pytest.mark.parametrize( 9 | ("path", "expected"), 10 | [ 11 | ("..", "/"), 12 | ("", "/"), 13 | ("/../foo", "/foo"), 14 | ("/..foo", "/..foo"), 15 | ("/./../foo", "/foo"), 16 | 
("/./foo", "/foo"), 17 | ("/./foo/.", "/foo/"), 18 | ("/.foo", "/.foo"), 19 | ("/", "/"), 20 | ("/foo..", "/foo.."), 21 | ("/foo.", "/foo."), 22 | ("/FOO", "/FOO"), 23 | ("/foo/../bar", "/bar"), 24 | ("/foo/./bar", "/foo/bar"), 25 | ("/foo//", "/foo/"), 26 | ("/foo///bar//", "/foo/bar/"), 27 | ("/foo/bar/..", "/foo/"), 28 | ("/foo/bar/../..", "/"), 29 | ("/foo/bar/../../../../baz", "/baz"), 30 | ("/foo/bar/../../../baz", "/baz"), 31 | ("/foo/bar/../../", "/"), 32 | ("/foo/bar/../../baz", "/baz"), 33 | ("/foo/bar/../", "/foo/"), 34 | ("/foo/bar/../baz", "/foo/baz"), 35 | ("/foo/bar/.", "/foo/bar/"), 36 | ("/foo/bar/./", "/foo/bar/"), 37 | # Issue #25: we should preserve ? in the path 38 | ("/More+Tea+Vicar%3F/discussion", "/More+Tea+Vicar%3F/discussion"), 39 | ], 40 | ) 41 | def test_normalize_path_result_is_expected(path: str, expected: str) -> None: 42 | """Assert we got expected results from the normalize_path function.""" 43 | result = normalize_path(path, "http") 44 | assert result == expected, path 45 | -------------------------------------------------------------------------------- /tests/test_normalize_port.py: -------------------------------------------------------------------------------- 1 | """Tests for normalize_port function.""" 2 | 3 | import pytest 4 | 5 | from url_normalize.url_normalize import normalize_port 6 | 7 | 8 | @pytest.mark.parametrize( 9 | ("port", "expected"), 10 | [ 11 | ("8080", "8080"), # Non-default port 12 | ("", ""), # Empty port 13 | ("80", ""), # Default HTTP port 14 | ("string", "string"), # Non-numeric port (should pass through) 15 | # Add more cases as needed, e.g., for HTTPS 16 | pytest.param("443", "", id="https_default_port"), 17 | ], 18 | ) 19 | def test_normalize_port_result_is_expected(port: str, expected: str): 20 | """Assert we got expected results from the normalize_port function.""" 21 | # Test with 'http' scheme for most cases 22 | scheme = "https" if port == "443" else "http" 23 | 24 | result = normalize_port(port, 
@pytest.mark.parametrize(
    ("query", "expected"),
    [
        # Empty input and separator-only input both collapse to nothing.
        ("", ""),
        ("&&&", ""),
        # Two ordinary parameters are kept as-is and in their original order.
        ("param1=val1&param2=val2", "param1=val1&param2=val2"),
        # Non-ASCII characters are percent-encoded as UTF-8.
        ("Ç=Ç", "%C3%87=%C3%87"),
        ("%C3%87=%C3%87", "%C3%87=%C3%87"),
        # "C" + combining cedilla (U+0327) is composed into U+00C7.
        ("q=C%CC%A7", "q=%C3%87"),
        ("q=%23test", "q=%23test"),  # Preserve encoded # in value, #31
        ("where=code%3D123", "where=code%3D123"),  # Preserve encoded = in value, #25
    ],
)
def test_normalize_query_result_is_expected(query: str, expected: str) -> None:
    """Assert we got expected results from the normalize_query function."""
    result = normalize_query(query)
    assert result == expected, query
def test_custom_list_allowlist():
    """Check that a plain list allowlist keeps only the listed parameters."""
    source_url = "https://google.com/search?qq=test&ie=utf8&utm_source=test"
    allowed_names = ["ie", "qq"]

    normalized = url_normalize(
        source_url,
        filter_params=True,
        param_allowlist=allowed_names,
    )

    # Surviving parameters keep the order they had in the input query.
    assert normalized == "https://google.com/search?qq=test&ie=utf8"
"https://example.org/page"), 94 | ], 95 | ) 96 | def test_parameter_filtering(url: str, expected: str): 97 | """Test URL parameter filtering functionality with various scenarios.""" 98 | assert url_normalize(url, filter_params=True) == expected 99 | -------------------------------------------------------------------------------- /tests/test_normalize_scheme.py: -------------------------------------------------------------------------------- 1 | """Tests for normalize_scheme function.""" 2 | 3 | import pytest 4 | 5 | from url_normalize.url_normalize import normalize_scheme 6 | 7 | 8 | @pytest.mark.parametrize( 9 | ("scheme", "expected"), 10 | [ 11 | ("http", "http"), 12 | ("HTTP", "http"), 13 | ], 14 | ) 15 | def test_normalize_scheme_result_is_expected(scheme: str, expected: str) -> None: 16 | """Assert we got expected results from the normalize_scheme function.""" 17 | result = normalize_scheme(scheme) 18 | assert result == expected, scheme 19 | -------------------------------------------------------------------------------- /tests/test_normalize_userinfo.py: -------------------------------------------------------------------------------- 1 | """Tests for normalize_userinfo function.""" 2 | 3 | import pytest 4 | 5 | from url_normalize.url_normalize import normalize_userinfo 6 | 7 | 8 | @pytest.mark.parametrize( 9 | ("userinfo", "expected"), 10 | [ 11 | (":@", ""), 12 | ("", ""), 13 | ("@", ""), 14 | ("user:password@", "user:password@"), 15 | ("user@", "user@"), 16 | ], 17 | ) 18 | def test_normalize_userinfo_result_is_expected(userinfo: str, expected: str) -> None: 19 | """Assert we got expected results from the normalize_userinfo function.""" 20 | result = normalize_userinfo(userinfo) 21 | assert result == expected, userinfo 22 | -------------------------------------------------------------------------------- /tests/test_provide_url_domain.py: -------------------------------------------------------------------------------- 1 | """Tests for provide_url_domain 
function.""" 2 | 3 | import pytest 4 | 5 | from url_normalize.provide_url_domain import provide_url_domain 6 | 7 | 8 | @pytest.mark.parametrize( 9 | ("url", "expected"), 10 | [ 11 | ("", ""), 12 | ("-", "-"), 13 | ("http://example.com/", "http://example.com/"), 14 | ("/file/path", "//example.com/file/path"), 15 | ("site/page", "site/page"), # No change for relative paths 16 | ], 17 | ) 18 | def test_provide_url_domain_result_is_expected(url: str, expected: str) -> None: 19 | """Assert we get expected results from provide_url_domain function.""" 20 | result = provide_url_domain(url, default_domain="example.com") 21 | assert result == expected 22 | 23 | 24 | def test_provide_url_domain_accept_different_domains(): 25 | """Assert we could provide different default_domain values.""" 26 | url = "/file/path" 27 | expected = "//custom-domain.org/file/path" 28 | 29 | actual = provide_url_domain(url, default_domain="custom-domain.org") 30 | 31 | assert actual == expected 32 | -------------------------------------------------------------------------------- /tests/test_provide_url_scheme.py: -------------------------------------------------------------------------------- 1 | """Tests for provide_url_scheme function.""" 2 | 3 | import pytest 4 | 5 | from url_normalize.url_normalize import provide_url_scheme 6 | 7 | 8 | @pytest.mark.parametrize( 9 | ("url", "expected"), 10 | [ 11 | ("", ""), 12 | ("-", "-"), 13 | ("/file/path", "/file/path"), 14 | ("//site/path", "https://site/path"), 15 | ("ftp://site/", "ftp://site/"), 16 | ("site/page", "https://site/page"), 17 | ], 18 | ) 19 | def test_provide_url_scheme_result_is_expected(url: str, expected: str) -> None: 20 | """Assert we got expected results from the provide_url_scheme function.""" 21 | result = provide_url_scheme(url) 22 | assert result == expected, url 23 | 24 | 25 | def test_provide_url_scheme_accept_default_scheme_param() -> None: 26 | """Assert we could provide default_scheme param other than https.""" 27 | url = 
"//site/path" 28 | expected = "http://site/path" 29 | 30 | actual = provide_url_scheme(url, default_scheme="http") 31 | 32 | assert actual == expected 33 | -------------------------------------------------------------------------------- /tests/test_reconstruct_url.py: -------------------------------------------------------------------------------- 1 | """Reconstruct url tests.""" 2 | 3 | from __future__ import annotations 4 | 5 | import pytest 6 | 7 | from url_normalize.tools import URL, reconstruct_url 8 | 9 | 10 | @pytest.mark.parametrize( 11 | ("url_obj", "expected"), 12 | [ 13 | ( 14 | URL( 15 | fragment="", 16 | host="site.com", 17 | path="", 18 | port="", 19 | query="", 20 | scheme="http", 21 | userinfo="", 22 | ), 23 | "http://site.com", 24 | ), 25 | ( 26 | URL( 27 | fragment="fragment", 28 | host="www.example.com", 29 | path="/path/index.html", 30 | port="8080", 31 | query="param=val", 32 | scheme="http", 33 | userinfo="user@", 34 | ), 35 | "http://user@www.example.com:8080/path/index.html?param=val#fragment", 36 | ), 37 | ], 38 | ) 39 | def test_reconstruct_url_result_is_expected(url_obj: URL, expected: str) -> None: 40 | """Assert we got expected results from the reconstruct_url function.""" 41 | result = reconstruct_url(url_obj) 42 | assert result == expected, url_obj 43 | -------------------------------------------------------------------------------- /tests/test_tools.py: -------------------------------------------------------------------------------- 1 | """Tools module tests.""" 2 | 3 | from __future__ import annotations 4 | 5 | from url_normalize.tools import force_unicode 6 | 7 | 8 | def test_force_unicode_with_bytes() -> None: 9 | """Test force_unicode handles bytes input correctly.""" 10 | test_bytes = b"hello world" 11 | result = force_unicode(test_bytes) 12 | assert result == "hello world" 13 | -------------------------------------------------------------------------------- /tests/test_url_normalize.py: 
-------------------------------------------------------------------------------- 1 | """Integrations tests.""" 2 | 3 | from __future__ import annotations 4 | 5 | import pytest 6 | 7 | from url_normalize import url_normalize 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "value", 12 | [ 13 | "-", 14 | "", 15 | "/..foo", 16 | "/.foo", 17 | "/foo..", 18 | "/foo.", 19 | "ftp://user:pass@ftp.foo.net/foo/bar", 20 | "http://127.0.0.1/", 21 | "http://example.com:8080/", 22 | "http://example.com/?a&b", 23 | "http://example.com/?q=%5C", 24 | "http://example.com/?q=%C3%87", 25 | "http://example.com/?q=%E2%85%A0", 26 | "http://example.com/", 27 | "http://example.com/~jane", 28 | "http://example.com/a/b", 29 | "http://example.com/FOO", 30 | "http://user:password@example.com/", 31 | "http://www.foo.com:8000/foo", 32 | # from rfc2396bis 33 | "ftp://ftp.is.co.za/rfc/rfc1808.txt", 34 | "http://www.ietf.org/rfc/rfc2396.txt", 35 | "ldap://[2001:db8::7]/c=GB?objectClass?one", 36 | "mailto:John.Doe@example.com", 37 | "news:comp.infosystems.www.servers.unix", 38 | "tel:+1-816-555-1212", 39 | "telnet://192.0.2.16:80/", 40 | "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", 41 | # Issue #36: Fragment with '=' should not be encoded 42 | "https://docs.google.com/spreadsheets/d/abcd/edit#gid=1234", 43 | ], 44 | ) 45 | def test_url_normalize_no_changes_expected(value: str) -> None: 46 | """Assert url_normalize do not change URI if not required. 
47 | 48 | http://www.intertwingly.net/wiki/pie/PaceCanonicalIds 49 | """ 50 | assert url_normalize(value) == value 51 | 52 | 53 | @pytest.mark.parametrize( 54 | ("value", "expected"), 55 | [ 56 | ("/../foo", "/foo"), 57 | ("/./../foo", "/foo"), 58 | ("/./foo", "/foo"), 59 | ("/./foo/.", "/foo/"), 60 | ("//www.foo.com/", "https://www.foo.com/"), 61 | ("/foo/../bar", "/bar"), 62 | ("/foo/./bar", "/foo/bar"), 63 | ("/foo//", "/foo/"), 64 | ("/foo///bar//", "/foo/bar/"), 65 | ("/foo/bar/..", "/foo/"), 66 | ("/foo/bar/../..", "/"), 67 | ("/foo/bar/../../../../baz", "/baz"), 68 | ("/foo/bar/../../../baz", "/baz"), 69 | ("/foo/bar/../../", "/"), 70 | ("/foo/bar/../../baz", "/baz"), 71 | ("/foo/bar/../", "/foo/"), 72 | ("/foo/bar/../baz", "/foo/baz"), 73 | ("/foo/bar/.", "/foo/bar/"), 74 | ("/foo/bar/./", "/foo/bar/"), 75 | ("http://:@example.com/", "http://example.com/"), 76 | ("http://@example.com/", "http://example.com/"), 77 | ("http://127.0.0.1:80/", "http://127.0.0.1/"), 78 | ("http://example.com:081/", "http://example.com:81/"), 79 | ("http://example.com:80/", "http://example.com/"), 80 | ("http://example.com", "http://example.com/"), 81 | ("http://example.com/?b&a", "http://example.com/?b&a"), 82 | ("http://example.com/?q=%5c", "http://example.com/?q=%5C"), 83 | ("http://example.com/?q=%C7", "http://example.com/?q=%EF%BF%BD"), 84 | ("http://example.com/?q=C%CC%A7", "http://example.com/?q=%C3%87"), 85 | ("http://EXAMPLE.COM/", "http://example.com/"), 86 | ("http://example.com/%7Ejane", "http://example.com/~jane"), 87 | ("http://example.com/a/../a/b", "http://example.com/a/b"), 88 | ("http://example.com/a/./b", "http://example.com/a/b"), 89 | ( 90 | "http://example.com/#!5753509/hello-world", 91 | "http://example.com/?_escaped_fragment_=5753509/hello-world", 92 | ), 93 | ( 94 | "http://USER:pass@www.Example.COM/foo/bar", 95 | "http://USER:pass@www.example.com/foo/bar", 96 | ), 97 | ("http://www.example.com./", "http://www.example.com/"), 98 | 
("http://www.foo.com:80/foo", "http://www.foo.com/foo"), 99 | ("http://www.foo.com.:81/foo", "http://www.foo.com:81/foo"), 100 | ("http://www.foo.com./foo/bar.html", "http://www.foo.com/foo/bar.html"), 101 | ("http://www.foo.com/%7Ebar", "http://www.foo.com/~bar"), 102 | ("http://www.foo.com/%7ebar", "http://www.foo.com/~bar"), 103 | ( 104 | "пример.испытание/Служебная:Search/Test", 105 | "https://xn--e1afmkfd.xn--80akhbyknj4f/%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%D0%BD%D0%B0%D1%8F:Search/Test", 106 | ), 107 | # Issue #19: http:example.com 108 | ("http:example.com", "http://example.com/"), 109 | ("http:example.com/path", "http://example.com/path"), 110 | ("ftp:test.com/files", "ftp://test.com/files"), 111 | ("https:www.example.com", "https://www.example.com/"), 112 | ], 113 | ) 114 | def test_url_normalize_expected_changes(value: str, expected: str) -> None: 115 | """Assert url_normalize return expected results.""" 116 | assert url_normalize(value) == expected 117 | 118 | 119 | def test_url_normalize_filtered() -> None: 120 | """Assert url_normalize return expected results.""" 121 | url = "/?a&b" 122 | expected = "/" 123 | 124 | actual = url_normalize(url, filter_params=True) 125 | 126 | assert actual == expected 127 | 128 | 129 | def test_url_normalize_with_http_scheme() -> None: 130 | """Assert we could use http scheme as default.""" 131 | url = "//www.foo.com/" 132 | expected = "http://www.foo.com/" 133 | 134 | actual = url_normalize(url, default_scheme="http") 135 | 136 | assert actual == expected 137 | 138 | 139 | @pytest.mark.parametrize( 140 | ("url", "expected"), 141 | [ 142 | ("/foo.png", "https://example.com/foo.png"), 143 | ("//google.com", "https://google.com/"), 144 | ("//example.com:80/foo.png", "https://example.com:80/foo.png"), 145 | ("//example.com/foo.png?foo=bar", "https://example.com/foo.png?foo=bar"), 146 | ("http://google.com", "http://google.com/"), 147 | ], 148 | ) 149 | def test_url_normalize_with_default_domain(url: str, expected: str) -> 
def test_url_normalize_with_default_domain_and_scheme() -> None:
    """Check that default_domain and default_scheme can be combined."""
    defaults = {"default_scheme": "http", "default_domain": "example.com"}

    normalized = url_normalize("/foo.png", **defaults)

    assert normalized == "http://example.com/foo.png"
def generic_url_cleanup(url: str) -> str:
    """Cleanup the URL from unnecessary data and convert to final form.

    Converts a hashbang ("#!") URL to its ``_escaped_fragment_`` form and
    strips trailing "&", "?" and space characters.

    Only the first "#!" is converted. The escaped fragment is appended to an
    existing query with "&"; a new query is started with "?" only when the
    part before the hashbang has no query yet.

    Params:
        url : string : the URL

    Returns:
        string : updated url

    """
    head, marker, tail = url.partition("#!")
    if marker:
        if "?" not in head:
            joiner = "?"
        elif head.endswith(("?", "&")):
            # The query is already open; do not emit an empty parameter.
            joiner = ""
        else:
            joiner = "&"
        url = f"{head}{joiner}_escaped_fragment_={tail}"
    return url.rstrip("&? ")
24 | 25 | """ 26 | host = force_unicode(host, charset) 27 | host = host.lower() 28 | host = host.strip(".") 29 | 30 | # Split domain into parts to handle each label separately 31 | parts = host.split(".") 32 | try: 33 | # Process each label separately to handle mixed unicode/ascii domains 34 | parts = [ 35 | idna.encode(p, uts46=True, transitional=True).decode(charset) 36 | for p in parts 37 | if p 38 | ] 39 | return ".".join(parts) 40 | except idna.IDNAError: 41 | # Fallback to direct encoding if IDNA2008 processing fails 42 | return host.encode("idna").decode(charset) 43 | -------------------------------------------------------------------------------- /url_normalize/normalize_path.py: -------------------------------------------------------------------------------- 1 | """URL path normalization.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .tools import quote, unquote 6 | 7 | 8 | def normalize_path(path: str, scheme: str) -> str: 9 | """Normalize path part of the url. 10 | 11 | Remove mention of default path number 12 | 13 | Params: 14 | path : string : url path, e.g., '/section/page.html' 15 | scheme : string : url scheme, e.g., 'http' 16 | 17 | Returns: 18 | string : normalized path data. 19 | 20 | """ 21 | # Only perform percent-encoding where it is essential. 22 | # Always use uppercase A-through-F characters when percent-encoding. 23 | # All portions of the URI must be utf-8 encoded NFC from Unicode strings 24 | path = quote(unquote(path), "~:/#[]@!$&'()*+,;=") 25 | # Prevent dot-segments appearing in non-relative URI paths. 
# Well-known default port for each scheme, as a decimal string.
DEFAULT_PORT = {
    "ftp": "21",
    "gopher": "70",
    "http": "80",
    "https": "443",
    "news": "119",
    "nntp": "119",
    "snews": "563",
    "snntp": "563",
    "telnet": "23",
    "ws": "80",
    "wss": "443",
}


def normalize_port(port: str, scheme: str) -> str:
    """Normalize port part of the url.

    Strips leading zeros from a numeric port and removes the port entirely
    when it is the default one for the given scheme.

    Params:
        port : string : url port, e.g., '8080'
        scheme : string : url scheme, e.g., 'http'

    Returns:
        string : normalized port data; an empty string when the port is the
        scheme's default; the input unchanged when it is not made of ASCII
        digits only.

    """
    # str.isdigit() alone also accepts characters such as "²" that int()
    # rejects with ValueError, so require ASCII before converting.
    if not (port.isascii() and port.isdigit()):
        return port
    port = str(int(port))
    if DEFAULT_PORT.get(scheme) == port:
        return ""
    return port
DEFAULT_SCHEME = "https"


def normalize_scheme(scheme: str) -> str:
    """Return the scheme part of the url folded to lowercase.

    Params:
        scheme : string : url scheme, e.g., 'HTTPS'

    Returns:
        string : the same scheme in lowercase, e.g., 'https'

    """
    lowered = scheme.lower()
    return lowered
# Query parameters kept for each known domain when no override is supplied.
DEFAULT_ALLOWLIST = {
    "google.com": ["q", "ie"],
    "baidu.com": ["wd", "ie"],
    "bing.com": ["q"],
    "youtube.com": ["v", "search_query"],
}


def get_allowed_params(
    host: str | None = None,
    allowlist: dict | list | None = None,
) -> set[str]:
    """Get allowed parameters for a given domain.

    Params:
        host: Domain name to check (e.g. 'google.com'). A leading 'www.' and
            a ':port' suffix are ignored, and the comparison is
            case-insensitive.
        allowlist: Optional override for default allowlist
            If provided as a list (or tuple / set / frozenset), it is used
            as is for every host.
            If provided as a dictionary, it should map domain names to
            lists of allowed parameters.
            If None, the default allowlist will be used.

    Returns:
        Set of allowed parameter names for the domain

    """
    # A flat collection of names is host-independent, so the host is not
    # consulted at all in this case.
    if isinstance(allowlist, (list, tuple, set, frozenset)):
        return set(allowlist)

    if not host:
        return set()

    # Normalize host by removing www and port
    domain = host.lower()
    if domain.startswith("www."):
        domain = domain[4:]
    domain = domain.split(":")[0]

    # Use default allowlist if none provided
    if allowlist is None:
        allowlist = DEFAULT_ALLOWLIST

    # Return allowed parameters for the domain, or an empty set if not found
    return set(allowlist.get(domain, []))
'example.com' 14 | 15 | Returns: 16 | str : URL with domain added if applicable 17 | 18 | """ 19 | # Skip processing if no default domain provided or URL is empty or stdout 20 | if not default_domain or not url or url == "-": 21 | return url 22 | 23 | # Only apply to absolute paths (starting with '/') 24 | # but not scheme-relative URLs ('//') 25 | if url.startswith("/") and not url.startswith("//"): 26 | return "//" + default_domain + url 27 | 28 | return url 29 | -------------------------------------------------------------------------------- /url_normalize/provide_url_scheme.py: -------------------------------------------------------------------------------- 1 | """URL scheme validation and attachment.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .normalize_scheme import DEFAULT_SCHEME 6 | 7 | # Schemes that require authority component reconstruction with // 8 | AUTHORITY_SCHEMES = frozenset(["http", "https", "ftp", "ftps"]) 9 | 10 | 11 | def provide_url_scheme(url: str, default_scheme: str = DEFAULT_SCHEME) -> str: 12 | """Make sure we have valid url scheme. 13 | 14 | Params: 15 | url : string : the URL 16 | default_scheme : string : default scheme to use, e.g. 'https' 17 | 18 | Returns: 19 | string : updated url with validated/attached scheme 20 | 21 | """ 22 | has_scheme = ":" in url[:7] 23 | is_universal_scheme = url.startswith("//") 24 | is_file_path = url == "-" or (url.startswith("/") and not is_universal_scheme) 25 | if not url or is_file_path: 26 | return url 27 | if not has_scheme: 28 | return f"{default_scheme}://{url.lstrip('/')}" 29 | scheme_part, rest = url.split(":", 1) 30 | if scheme_part.lower() not in AUTHORITY_SCHEMES: 31 | # handle cases like tel:, mailto:, etc. 
class URL(NamedTuple):
    """URL components tuple.

    A named tuple containing the parsed components of a URL:
    scheme, userinfo, host, port, path, query, and fragment.
    """

    scheme: str
    userinfo: str
    host: str
    port: str
    path: str
    query: str
    fragment: str


def deconstruct_url(url: str) -> URL:
    """Transform the url into URL structure.

    Surrounding whitespace is stripped, the string is split with
    ``urlsplit`` and the authority is further divided into userinfo, host
    and port.

    Params:
        url : string : the URL

    Returns:
        URL : userinfo keeps its trailing '@' (empty string when absent),
        a bracketed IP literal such as '[::1]' is kept whole as the host,
        and port is the text after the host/port colon (empty string when
        absent).

    """
    scheme, auth, path, query, fragment = urlsplit(url.strip())

    # The userinfo ends at the LAST "@": a password may itself contain "@",
    # while a host never does.
    userinfo, at_sign, hostport = auth.rpartition("@")
    userinfo += at_sign

    closing = hostport.find("]") if hostport.startswith("[") else -1
    if closing != -1:
        # Bracketed IP literal: the colons inside the brackets belong to the
        # address, only a colon after "]" introduces the port.
        host = hostport[: closing + 1]
        remainder = hostport[closing + 1 :]
        port = remainder[1:] if remainder.startswith(":") else remainder
    else:
        host, _, port = hostport.partition(":")

    return URL(
        fragment=fragment,
        host=host,
        path=path,
        port=port,
        query=query,
        scheme=scheme,
        userinfo=userinfo,
    )


def reconstruct_url(url: URL) -> str:
    """Reconstruct string url from URL.

    Params:
        url : URL object instance

    Returns:
        string : reconstructed url string

    """
    # Rebuild the authority as userinfo + host [+ ":" + port]; userinfo is
    # expected to carry its own trailing "@".
    auth = (url.userinfo or "") + url.host
    if url.port:
        auth += ":" + url.port
    return urlunsplit((url.scheme, auth, url.path, url.query, url.fragment))


def force_unicode(string: str | bytes, charset: str = "utf-8") -> str:
    """Ensure string is properly encoded (Python 3 only).

    Params:
        string : str | bytes : an input string
        charset : str : optional : encoding used to decode bytes input

    Returns:
        str : bytes are decoded with ``charset`` (undecodable bytes are
        replaced); str input is returned unchanged.

    """
    if isinstance(string, bytes):
        return string.decode(charset, "replace")
    return string


def unquote(string: str, charset: str = "utf-8") -> str:
    """Unquote and normalize unicode string.

    Params:
        string : string to be unquoted
        charset : string : optional : encoding used for the final
            encode/decode round trip

    Returns:
        string : an unquoted and NFC-normalized string

    """
    # Percent-escapes are decoded with urllib's defaults; ``charset`` only
    # takes part in the steps below.
    string = unquote_orig(string)
    string = force_unicode(string, charset)
    encoded_str = unicodedata.normalize("NFC", string).encode(charset)
    return encoded_str.decode(charset)


def quote(string: str, safe: str = "/") -> str:
    """Quote string.

    Params:
        string : string to be quoted
        safe : string of characters that must not be percent-encoded

    Returns:
        string : quoted string

    """
    return quote_orig(string, safe)
def url_normalize(  # noqa: PLR0913
    url: str | None,
    *,  # Force keyword-only arguments
    charset: str = DEFAULT_CHARSET,
    default_scheme: str = DEFAULT_SCHEME,
    default_domain: str | None = None,
    filter_params: bool = False,
    param_allowlist: dict | list | None = None,
) -> str | None:
    """URI normalization routine.

    User-supplied URLs are often not real URLs because they contain
    unsafe characters such as ' '. This function repairs some of those
    problems in a way similar to how browsers treat typed-in addresses:

    >>> url_normalize('http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
    'http://de.wikipedia.org/wiki/Elf%20%28Begriffskl%C3%A4rung%29'

    Params:
        url : str | None : URL to normalize
        charset : str : optional
            The target charset for the URL if the url was given as unicode string
        default_scheme : str : default scheme to use if none present
        default_domain : str | None : optional
            Default domain to use for absolute paths (starting with '/')
        filter_params : bool : optional
            Whether to filter non-allowlisted parameters (False by default)
        param_allowlist : dict | list | None : optional
            Override for the parameter allowlist

    Returns:
        str | None : a normalized url; a falsy input (None or '') is
        returned unchanged

    """
    if not url:
        return url

    # Stage 1: string-level preparation before the URL is split.
    prepared = provide_url_domain(url, default_domain)
    prepared = provide_url_scheme(prepared, default_scheme)
    prepared = generic_url_cleanup(prepared)
    parts = deconstruct_url(prepared)

    # Stage 2: components that can be normalized independently. The query
    # is filtered against the host exactly as parsed, not the normalized one.
    scheme = normalize_scheme(parts.scheme)
    userinfo = normalize_userinfo(parts.userinfo)
    host = normalize_host(parts.host, charset)
    query = normalize_query(
        parts.query,
        host=parts.host,
        filter_params=filter_params,
        param_allowlist=param_allowlist,
    )
    fragment = normalize_fragment(parts.fragment)

    # Stage 3: port and path depend on the already-normalized scheme.
    port = normalize_port(parts.port, scheme)
    path = normalize_path(parts.path, scheme)

    normalized = parts._replace(
        scheme=scheme,
        userinfo=userinfo,
        host=host,
        port=port,
        path=path,
        query=query,
        fragment=fragment,
    )
    return reconstruct_url(normalized)