├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── LICENSE
├── Makefile
├── README.md
├── pyproject.toml
├── tests
    ├── test_cli.py
    ├── test_deconstruct_url.py
    ├── test_generic_url_cleanup.py
    ├── test_normalize_fragment.py
    ├── test_normalize_host.py
    ├── test_normalize_path.py
    ├── test_normalize_port.py
    ├── test_normalize_query.py
    ├── test_normalize_query_filters.py
    ├── test_normalize_scheme.py
    ├── test_normalize_userinfo.py
    ├── test_provide_url_domain.py
    ├── test_provide_url_scheme.py
    ├── test_reconstruct_url.py
    ├── test_tools.py
    └── test_url_normalize.py
└── url_normalize
    ├── __init__.py
    ├── cli.py
    ├── generic_url_cleanup.py
    ├── normalize_fragment.py
    ├── normalize_host.py
    ├── normalize_path.py
    ├── normalize_port.py
    ├── normalize_query.py
    ├── normalize_scheme.py
    ├── normalize_userinfo.py
    ├── param_allowlist.py
    ├── provide_url_domain.py
    ├── provide_url_scheme.py
    ├── py.typed
    ├── tools.py
    └── url_normalize.py


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
  1 | name: tests
  2 | 
  3 | on:
  4 |   push:
  5 |     # Avoid using all the resources/limits available by checking only
  6 |     # relevant branches and tags. Other branches can be checked via PRs.
  7 |     branches: [master]
  8 |     tags: ['v[0-9]*', '[0-9]+.[0-9]+*']  # Match tags that resemble a version
  9 |   pull_request:  # No branch filter: every pull request is checked
 10 |   workflow_dispatch:  # Allow manually triggering the workflow
 11 |   schedule:
 12 |     # Run on the 1st and 16th of each month at 00:00 UTC (roughly every 15 days)
 13 |     # (useful to check if updates on dependencies break the package)
 14 |     - cron: '0 0 1,16 * *'
 15 | 
 16 | concurrency:  # Runs are grouped by workflow, ref type and PR number (or SHA)
 17 |   group: >-
 18 |     ${{ github.workflow }}-${{ github.ref_type }}-
 19 |     ${{ github.event.pull_request.number || github.sha }}
 20 |   cancel-in-progress: true  # A newer run in a group cancels the older one
 21 | 
 22 | jobs:
 23 |   prepare:  # Lint, build the distributions once and share them with later jobs
 24 |     runs-on: ubuntu-latest
 25 |     outputs:
 26 |       wheel-distribution: ${{ steps.wheel-distribution.outputs.path }}
 27 |     steps:
 28 |       - uses: actions/checkout@v4
 29 |         with: {fetch-depth: 0}  # full clone; NOTE(review): version is static in pyproject.toml - confirm still needed
 30 |       - uses: actions/setup-python@v5
 31 |         with: {python-version-file: "pyproject.toml"}
 32 |       - uses: astral-sh/setup-uv@v5
 33 |       - name: Run static analysis and format checkers
 34 |         run: uv run --with '.[dev]' pre-commit run --all-files
 35 |       - name: Build package distribution files
 36 |         run: uv build
 37 |       - name: Record the path of wheel distribution
 38 |         id: wheel-distribution
 39 |         run: echo "path=$(ls dist/*.whl)" >> "$GITHUB_OUTPUT"  # quoted so the redirect target is never word-split
 40 |       - name: Store the distribution files for use in other stages
 41 |         uses: actions/upload-artifact@v4
 42 |         with:
 43 |           name: python-distribution-files
 44 |           path: dist/
 45 |           retention-days: 1  # only needed by the later jobs of the same run
 46 | 
 47 |   test:  # Run the suite on the oldest and newest Python across three platforms
 48 |     needs: prepare
 49 |     strategy:
 50 |       matrix:
 51 |         python:
 52 |           - "3.8"   # oldest Python supported (requires-python in pyproject.toml)
 53 |           - "3.x"   # newest Python that is stable
 54 |         platform:
 55 |           - ubuntu-latest
 56 |           - macos-13
 57 |           - windows-latest
 58 |     runs-on: ${{ matrix.platform }}
 59 |     steps:
 60 |       - uses: actions/checkout@v4
 61 |       - uses: actions/setup-python@v5
 62 |         with:
 63 |           python-version: ${{ matrix.python }}
 64 |       - uses: astral-sh/setup-uv@v5
 65 |       - name: Retrieve pre-built distribution files
 66 |         uses: actions/download-artifact@v4  # NOTE(review): dist/ looks unused by pytest below - confirm
 67 |         with: {name: python-distribution-files, path: dist/}
 68 |       - name: Run tests
 69 |         run: uv run --with '.[dev]' pytest --cov-report=lcov:coverage.lcov
 70 |       - name: Upload partial coverage report
 71 |         uses: coverallsapp/github-action@v2
 72 |         with:
 73 |           path-to-lcov: coverage.lcov
 74 |           github-token: ${{ secrets.GITHUB_TOKEN }}
 75 |           flag-name: ${{ matrix.platform }} - py${{ matrix.python }}
 76 |           parallel: true  # one partial report per matrix entry; closed by `finalize`
 77 | 
 78 |   finalize:  # Waits for the whole `test` matrix (needs: test)
 79 |     needs: test
 80 |     runs-on: ubuntu-latest
 81 |     steps:
 82 |       - name: Finalize coverage report
 83 |         uses: coverallsapp/github-action@v2
 84 |         with:
 85 |           github-token: ${{ secrets.GITHUB_TOKEN }}
 86 |           parallel-finished: true  # counterpart of `parallel: true` in the test job
 87 | 
 88 |   publish:  # Upload the pre-built distributions when a tag is pushed
 89 |     needs: finalize
 90 |     if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') }}
 91 |     runs-on: ubuntu-latest
 92 |     permissions:
 93 |       id-token: write  # presumably for PyPI trusted publishing (OIDC) - confirm
 94 |     steps:
 95 |       - uses: actions/checkout@v4
 96 |       - uses: actions/setup-python@v5
 97 |         with: {python-version-file: "pyproject.toml"}
 98 |       - uses: astral-sh/setup-uv@v5
 99 |       - name: Retrieve pre-built distribution files
100 |         uses: actions/download-artifact@v4
101 |         with: {name: python-distribution-files, path: dist/}
102 |       - name: Publish Package to PyPI
103 |         run: uv publish dist/*  # publishes the artifacts built by `prepare`, not a rebuild
104 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | .*
 3 | !.coveragerc
 4 | !.github
 5 | !.pre-commit-config.yaml
 6 | *.egg-info
 7 | *.pyc
 8 | build
 9 | dist
10 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v5.0.0
 4 |     hooks:
 5 |       - id: check-added-large-files
 6 |       - id: check-ast
 7 |       - id: check-json
 8 |       - id: check-merge-conflict
 9 |       - id: check-symlinks
10 |       - id: check-toml
11 |       - id: check-xml
12 |       - id: check-yaml
13 |       - id: debug-statements
14 |       - id: end-of-file-fixer
15 |       - id: requirements-txt-fixer
16 |       - id: trailing-whitespace
17 |       - id: mixed-line-ending
18 |         args: ["--fix=auto"] # replace 'auto' with 'lf' to enforce Linux/Mac line endings or 'crlf' for Windows
19 |   - repo: https://github.com/abravalheri/validate-pyproject
20 |     rev: v0.24.1
21 |     hooks:
22 |       - id: validate-pyproject
23 |   - repo: https://github.com/pre-commit/mirrors-mypy
24 |     rev: v1.15.0
25 |     hooks:
26 |       - id: mypy
27 |         exclude: tests  # mirrors `exclude = ["tests"]` under [tool.mypy]
28 |   - repo: https://github.com/igorshubovych/markdownlint-cli
29 |     rev: v0.44.0
30 |     hooks:
31 |       - id: markdownlint
32 |         args: ["--fix", "--disable", "MD024"]
33 |   - repo: https://github.com/codespell-project/codespell
34 |     rev: v2.4.1
35 |     hooks:
36 |       - id: codespell
37 |   - repo: https://github.com/astral-sh/ruff-pre-commit
38 |     rev: v0.11.4
39 |     hooks:
40 |       - id: ruff  # with --fix the linter must run before the formatter
41 |         args: ["--fix", "--exit-non-zero-on-fix"]
42 |       - id: ruff-format  # reformats whatever the lint fixes rewrote
43 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | 
  3 | All notable changes to this project will be documented in this file.
  4 | 
  5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
  6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
  7 | 
  8 | ## [2.2.1] - 2025-04-26
  9 | 
 10 | ### Added
 11 | 
 12 | - Include `py.typed` marker file for PEP 561 compatibility.
 13 | 
 14 | ## [2.2.0] - 2025-03-30
 15 | 
 16 | ### Added
 17 | 
 18 | - New `default_domain` parameter to support absolute paths with domain names (#22)
 19 | 
 20 | ### Fixed
 21 | 
 22 | - Handle URLs with missing slashes correctly (#19)
 23 | - Fix decoding of reserved characters in URL paths (#25)
 24 | - Fix Twitter hashtag encoding in query parameters (#31)
 25 | 
 26 | ### Internal
 27 | 
 28 | - Update CI configuration to use uv from PATH
 29 | 
 30 | ## [2.1.0] - 2025-03-30
 31 | 
 32 | ### Added
 33 | 
 34 | - New command-line interface (`url-normalize`) with support for:
 35 |   - Version information (`--version`, `-v`)
 36 |   - Charset selection (`--charset`, `-c`)
 37 |   - Default scheme override (`--default-scheme`, `-s`)
 38 |   - Query parameter filtering (`--filter-params`, `-f`)
 39 |   - Custom allowlist for query parameters (`--param-allowlist`, `-p`)
 40 | 
 41 | ### Fixed
 42 | 
 43 | - Do not encode equals sign in fragment (Fixes #36)
 44 | 
 45 | ### Internal
 46 | 
 47 | - Add GitHub Action to publish package to PyPI using uv
 48 | 
 49 | ## [2.0.1] - 2025-03-29
 50 | 
 51 | ### Fixed
 52 | 
 53 | - Reverted license format in pyproject.toml to maintain Python 3.8 compatibility with older setuptools versions
 54 | 
 55 | ## [2.0.0] - 2025-03-29
 56 | 
 57 | ### Added
 58 | 
 59 | - Query parameter filtering functionality
 60 | - Parameter allowlist feature for controlling accepted query parameters
 61 | - IDNA 2008 support via `idna` package
 62 | 
 63 | ### Changed
 64 | 
 65 | - **BREAKING:** Switch default scheme from 'http' to 'https'
 66 | - **BREAKING:** Migrated IDNA handling to use IDNA 2008 with UTS46 processing
 67 | - **BREAKING:** Updated minimum Python version to 3.8 (removed Python 2.7 support)
 68 | - **BREAKING:** Removed sort_query_params option as it was incorrect - query parameter order is semantically meaningful and cannot be changed
 69 | - Enhanced query normalization with parameter filtering support
 70 | - Updated URL cleanup to support new filtering features
 71 | - Changed host normalization to handle each domain label separately
 72 | 
 73 | ### Internal
 74 | 
 75 | - Refactored code organization for improved maintainability:
 76 |   - Split url_normalize.py into separate function modules
 77 |   - Moved each normalization function to its own file
 78 |   - Reorganized constants to their relevant modules
 79 |   - Maintained backward compatibility and test coverage
 80 | - Added pre-commit hooks for code quality and linting
 81 | - Dedicated CHANGELOG.md file
 82 | - Increased test coverage requirement to 100%
 83 | - Migrated from Travis CI to GitHub Actions for testing across multiple Python versions
 84 | - Moved pytest configuration from tox.ini to pyproject.toml
 85 | - Removed Travis CI configuration in favor of GitHub Actions
 86 | - Semantic versioning compliance
 87 | - Upgraded project structure to modern Python packaging standards using pyproject.toml
 88 | 
 89 | ## [1.4.3] - 2024-02-15
 90 | 
 91 | ### Added
 92 | 
 93 | - LICENSE file
 94 | 
 95 | ## [1.4.2]
 96 | 
 97 | ### Added
 98 | 
 99 | - Optional param `sort_query_params` (True by default)
100 | 
101 | ## [1.4.1]
102 | 
103 | ### Added
104 | 
105 | - Param `default_scheme` to url_normalize ('https' by default)
106 | 
107 | ## [1.4.0]
108 | 
109 | ### Changed
110 | 
111 | - Code refactoring and cleanup
112 | 
113 | ## [1.3.3]
114 | 
115 | ### Added
116 | 
117 | - Support for empty string and double slash urls (//domain.tld)
118 | 
119 | ## [1.3.2]
120 | 
121 | ### Added
122 | 
123 | - Cross-version compatibility: same code supports both Python 3 and Python 2
124 | 
125 | ## [1.3.1]
126 | 
127 | ### Added
128 | 
129 | - Python 3 compatibility
130 | 
131 | ## [1.2.1]
132 | 
133 | ### Changed
134 | 
135 | - PEP8 compliance improvements
136 | - Setup.py improvements
137 | 
138 | ## [1.1.2]
139 | 
140 | ### Added
141 | 
142 | - Support for shebang (#!) urls
143 | 
144 | ## [1.1.1]
145 | 
146 | ### Changed
147 | 
148 | - Using 'http' schema by default when appropriate
149 | 
150 | ## [1.1.0]
151 | 
152 | ### Added
153 | 
154 | - Handling of IDN domains
155 | 
156 | ## [1.0.0]
157 | 
158 | ### Changed
159 | 
160 | - Code PEP8 compliance
161 | 
162 | ## [0.1.0]
163 | 
164 | ### Added
165 | 
166 | - Initial release
167 | - Forked from Sam Ruby's urlnorm.py
168 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Nikolay Panov
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | install:  # Editable install of the package with its dev extras
 2 | 	@uv pip install -e ".[dev]"
 3 | 
 4 | .PHONY: install update lint test build publish  # none of these produce a file; run even if a same-named path (e.g. build/) exists
 5 | 
 6 | update: install  # Bump the pinned pre-commit hook revisions
 7 | 	@uv run -- pre-commit autoupdate
 8 | 
 9 | lint: install  # Run every pre-commit hook against all files
10 | 	@uv run -- pre-commit run -a
11 | 
12 | test: install  # Run the test suite
13 | 	@uv run -- pytest
14 | 
15 | build:  # Rebuild the distributions from a clean dist/
16 | 	@rm -rf dist
17 | 	@uv build
18 | 
19 | publish: build  # Build, then upload
20 | 	@uv publish
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # url-normalize
  2 | 
  3 | [![tests](https://github.com/niksite/url-normalize/actions/workflows/ci.yml/badge.svg)](https://github.com/niksite/url-normalize/actions/workflows/ci.yml)
  4 | [![Coveralls](https://img.shields.io/coveralls/github/niksite/url-normalize/master.svg)](https://coveralls.io/r/niksite/url-normalize)
  5 | [![PyPI](https://img.shields.io/pypi/v/url-normalize.svg)](https://pypi.org/project/url-normalize/)
  6 | 
  7 | A Python library for standardizing and normalizing URLs with support for internationalized domain names (IDN).
  8 | 
  9 | ## Table of Contents
 10 | 
 11 | - [Introduction](#introduction)
 12 | - [Features](#features)
 13 | - [Installation](#installation)
 14 | - [Usage](#usage)
 15 |   - [Python API](#python-api)
 16 |   - [Command Line](#command-line-usage)
 17 | - [Documentation](#documentation)
 18 | - [Contributing](#contributing)
 19 | - [License](#license)
 20 | 
 21 | ## Introduction
 22 | 
 23 | url-normalize provides a robust URI normalization function that:
 24 | 
 25 | - Takes care of IDN domains.
 26 | - Always provides the URI scheme in lowercase characters.
 27 | - Always provides the host, if any, in lowercase characters.
 28 | - Only performs percent-encoding where it is essential.
 29 | - Always uses uppercase A-through-F characters when percent-encoding.
 30 | - Prevents dot-segments appearing in non-relative URI paths.
 31 | - For schemes that define a default authority, uses an empty authority if the
 32 |   default is desired.
 33 | - For schemes that define an empty path to be equivalent to a path of "/",
 34 |   uses "/".
 35 | - For schemes that define a port, uses an empty port if the default is desired.
 36 | - Ensures all portions of the URI are UTF-8 encoded NFC from Unicode strings.
 37 | 
 38 | Inspired by Sam Ruby's [urlnorm.py](http://intertwingly.net/blog/2004/08/04/Urlnorm)
 39 | 
 40 | ## Features
 41 | 
 42 | - **IDN Support**: Full internationalized domain name handling
 43 | - **Configurable Defaults**:
 44 |   - Customizable default scheme (https by default)
 45 |   - Configurable default domain for absolute paths
 46 | - **Query Parameter Control**:
 47 |   - Parameter filtering with allowlists
 48 |   - Support for domain-specific parameter rules
 49 | - **Versatile URL Handling**:
 50 |   - Empty string URLs
 51 |   - Double slash URLs (//domain.tld)
 52 |   - Shebang (#!) URLs
 53 | - **Developer Friendly**:
 54 |   - Cross-version Python compatibility (3.8+)
 55 |   - 100% test coverage
 56 |   - Modern type hints and string handling
 57 | 
 58 | ## Installation
 59 | 
 60 | ```sh
 61 | pip install url-normalize
 62 | ```
 63 | 
 64 | ## Usage
 65 | 
 66 | ### Python API
 67 | 
 68 | ```python
 69 | from url_normalize import url_normalize
 70 | 
 71 | # Basic normalization (uses https by default)
 72 | print(url_normalize("www.foo.com:80/foo"))
 73 | # Output: https://www.foo.com:80/foo
 74 | 
 75 | # With custom default scheme
 76 | print(url_normalize("www.foo.com/foo", default_scheme="http"))
 77 | # Output: http://www.foo.com/foo
 78 | 
 79 | # With query parameter filtering enabled
 80 | print(url_normalize("www.google.com/search?q=test&utm_source=test", filter_params=True))
 81 | # Output: https://www.google.com/search?q=test
 82 | 
 83 | # With custom parameter allowlist as a dict
 84 | print(url_normalize(
 85 |     "example.com?page=1&id=123&ref=test",
 86 |     filter_params=True,
 87 |     param_allowlist={"example.com": ["page", "id"]}
 88 | ))
 89 | # Output: https://example.com/?page=1&id=123
 90 | 
 91 | # With custom parameter allowlist as a list
 92 | print(url_normalize(
 93 |     "example.com?page=1&id=123&ref=test",
 94 |     filter_params=True,
 95 |     param_allowlist=["page", "id"]
 96 | ))
 97 | # Output: https://example.com/?page=1&id=123
 98 | 
 99 | # With default domain for absolute paths
100 | print(url_normalize("/images/logo.png", default_domain="example.com"))
101 | # Output: https://example.com/images/logo.png
102 | 
103 | # With default domain and custom scheme
104 | print(url_normalize("/images/logo.png", default_scheme="http", default_domain="example.com"))
105 | # Output: http://example.com/images/logo.png
106 | ```
107 | 
108 | ### Command-line Usage
109 | 
110 | You can also use `url-normalize` from the command line:
111 | 
112 | ```bash
113 | $ url-normalize "www.foo.com:80/foo"
114 | # Output: https://www.foo.com:80/foo
115 | 
116 | # With custom default scheme
117 | $ url-normalize -s http "www.foo.com/foo"
118 | # Output: http://www.foo.com/foo
119 | 
120 | # With query parameter filtering
121 | $ url-normalize -f "www.google.com/search?q=test&utm_source=test"
122 | # Output: https://www.google.com/search?q=test
123 | 
124 | # With custom allowlist
125 | $ url-normalize -f -p page,id "example.com?page=1&id=123&ref=test"
126 | # Output: https://example.com/?page=1&id=123
127 | 
128 | # With default domain for absolute paths
129 | $ url-normalize -d example.com "/images/logo.png"
130 | # Output: https://example.com/images/logo.png
131 | 
132 | # With default domain and custom scheme
133 | $ url-normalize -d example.com -s http "/images/logo.png"
134 | # Output: http://example.com/images/logo.png
135 | 
136 | # Via uv tool/uvx
137 | $ uvx url-normalize www.foo.com:80/foo
138 | # Output: https://www.foo.com:80/foo
139 | ```
140 | 
141 | ## Documentation
142 | 
143 | For a complete history of changes, see [CHANGELOG.md](CHANGELOG.md).
144 | 
145 | ## Contributing
146 | 
147 | Contributions are welcome! Please feel free to submit a Pull Request.
148 | 
149 | ## License
150 | 
151 | MIT License
152 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "url-normalize"
 3 | version = "2.2.1"
 4 | description = "URL normalization for Python"
 5 | authors = [{ name = "Nikolay Panov", email = "github@npanov.com" }]
 6 | license = { text = "MIT" }
 7 | readme = "README.md"
 8 | requires-python = ">=3.8"
 9 | keywords = ["url", "normalization", "normalize", "normalizer"]
10 | dependencies = ["idna>=3.3"]
11 | 
12 | [project.urls]
13 | Homepage = "https://github.com/niksite/url-normalize"
14 | Repository = "https://github.com/niksite/url-normalize"
15 | Issues = "https://github.com/niksite/url-normalize/issues"
16 | Changelog = "https://github.com/niksite/url-normalize/blob/master/CHANGELOG.md"
17 | 
18 | [project.scripts]
19 | url-normalize = "url_normalize.cli:main"
20 | 
21 | [project.optional-dependencies]
22 | dev = ["mypy", "pre-commit", "pytest-cov", "pytest-socket", "pytest", "ruff"]
23 | 
24 | [tool.ruff]
25 | target-version = "py38"
26 | line-length = 88
27 | unsafe-fixes = true
28 | 
29 | [tool.ruff.lint]
30 | select = ["ALL"]
31 | extend-select = [
32 |   "D400", # First line should end with a period
33 |   "D401", # First line should be in imperative mood
34 |   "D413", # Missing blank line after the last section of a multiline docstring
35 | ]
36 | fixable = ["ALL"]
37 | ignore = [
38 |   "COM812", # missing-trailing-comma
39 |   "D203",   # One blank line before class - we prefer D211 instead
40 |   "D213",   # multi-line-summary-second-line - we prefer D212 instead
41 | ]
42 | 
43 | [tool.ruff.lint.pydocstyle]
44 | convention = "google"
45 | 
46 | [tool.ruff.lint.per-file-ignores]
47 | "tests/**" = ["INP001", "ANN001", "ANN201", "S101", "CPY001"]
48 | 
49 | [tool.ruff.format]
50 | quote-style = "double"
51 | indent-style = "space"
52 | 
53 | [tool.mypy]
54 | ignore_missing_imports = true
55 | exclude = ["tests"]
56 | python_version = "3.8"
57 | show_error_codes = true
58 | 
59 | [build-system]
60 | requires = ["setuptools>=42", "wheel"]
61 | build-backend = "setuptools.build_meta"
62 | 
63 | [tool.pytest.ini_options]
64 | addopts = [
65 |   "--cov-report=term-missing:skip-covered",
66 |   "--cov=url_normalize",
67 |   "--disable-socket",
68 |   "-v",
69 | ]
70 | python_files = ["tests.py", "test_*.py", "*_tests.py"]
71 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
  1 | """Tests for the command line interface."""
  2 | 
  3 | import subprocess
  4 | import sys
  5 | from unittest.mock import patch
  6 | 
  7 | import pytest
  8 | 
  9 | from url_normalize import __version__
 10 | from url_normalize.cli import main
 11 | 
 12 | 
 13 | def run_cli(*args: str) -> subprocess.CompletedProcess:
 14 |     """Run the CLI command with given arguments.
 15 | 
 16 |     Params:
 17 |         *args: Command line arguments to pass to the CLI.
 18 | 
 19 |     Returns:
 20 |         A completed process with stdout, stderr, and return code.
 21 | 
 22 |     """
 23 |     command = [sys.executable, "-m", "url_normalize.cli", *list(args)]
 24 |     return subprocess.run(  # noqa: S603
 25 |         command, capture_output=True, text=True, check=False
 26 |     )
 27 | 
 28 | 
 29 | def test_cli_error_handling(capsys, monkeypatch):
 30 |     """Test CLI error handling when URL normalization fails."""
 31 |     with patch("url_normalize.cli.url_normalize") as mock_normalize:
 32 |         mock_normalize.side_effect = Exception("Simulated error")
 33 |         monkeypatch.setattr("sys.argv", ["url-normalize", "http://example.com"])
 34 | 
 35 |         with pytest.raises(SystemExit) as excinfo:
 36 |             main()
 37 | 
 38 |         assert excinfo.value.code == 1
 39 |         captured = capsys.readouterr()
 40 |         assert "Error normalizing URL: Simulated error" in captured.err
 41 |         assert not captured.out
 42 | 
 43 | 
 44 | def test_cli_basic_normalization() -> None:
 45 |     """Test basic URL normalization via CLI."""
 46 |     url = "http://EXAMPLE.com/./path/../other/"
 47 |     expected = "http://example.com/other/"
 48 | 
 49 |     result = run_cli(url)
 50 | 
 51 |     assert result.returncode == 0
 52 |     assert result.stdout.strip() == expected
 53 |     assert not result.stderr
 54 | 
 55 | 
 56 | def test_cli_basic_normalization_short_args() -> None:
 57 |     """Test basic URL normalization via CLI using short arguments."""
 58 |     url = "http://EXAMPLE.com/./path/../other/"
 59 |     expected = "http://example.com/other/"
 60 |     # Using short args where applicable (none for the URL itself)
 61 | 
 62 |     result = run_cli(url)  # No short args needed for basic case
 63 | 
 64 |     assert result.returncode == 0
 65 |     assert result.stdout.strip() == expected
 66 |     assert not result.stderr
 67 | 
 68 | 
 69 | def test_cli_default_scheme() -> None:
 70 |     """Test default scheme addition via CLI."""
 71 |     url = "//example.com"
 72 |     expected = "https://example.com/"
 73 | 
 74 |     result = run_cli(url)
 75 | 
 76 |     assert result.returncode == 0
 77 |     assert result.stdout.strip() == expected
 78 |     assert not result.stderr
 79 | 
 80 | 
 81 | def test_cli_default_scheme_short_arg() -> None:
 82 |     """Test default scheme addition via CLI using short argument."""
 83 |     url = "//example.com"
 84 |     expected = "https://example.com/"
 85 | 
 86 |     result = run_cli(url)  # Default scheme is implicit, no arg needed
 87 | 
 88 |     assert result.returncode == 0
 89 |     assert result.stdout.strip() == expected
 90 |     assert not result.stderr
 91 | 
 92 | 
 93 | def test_cli_custom_default_scheme() -> None:
 94 |     """Test custom default scheme via CLI."""
 95 |     url = "//example.com"
 96 |     expected = "ftp://example.com/"
 97 | 
 98 |     result = run_cli("--default-scheme", "ftp", url)
 99 | 
100 |     assert result.returncode == 0
101 |     assert result.stdout.strip() == expected
102 |     assert not result.stderr
103 | 
104 | 
105 | def test_cli_custom_default_scheme_short_arg() -> None:
106 |     """Test custom default scheme via CLI using short argument."""
107 |     url = "//example.com"
108 |     expected = "ftp://example.com/"
109 | 
110 |     result = run_cli("-s", "ftp", url)
111 | 
112 |     assert result.returncode == 0
113 |     assert result.stdout.strip() == expected
114 |     assert not result.stderr
115 | 
116 | 
117 | def test_cli_filter_params() -> None:
118 |     """Test parameter filtering via CLI."""
119 |     url = "http://google.com?utm_source=test&q=1"
120 |     expected = "http://google.com/?q=1"
121 | 
122 |     result = run_cli("--filter-params", url)
123 | 
124 |     assert result.returncode == 0
125 |     assert result.stdout.strip() == expected
126 |     assert not result.stderr
127 | 
128 | 
129 | def test_cli_filter_params_short_arg() -> None:
130 |     """Test parameter filtering via CLI using short argument."""
131 |     url = "http://google.com?utm_source=test&q=1"
132 |     expected = "http://google.com/?q=1"
133 | 
134 |     result = run_cli("-f", url)
135 | 
136 |     assert result.returncode == 0
137 |     assert result.stdout.strip() == expected
138 |     assert not result.stderr
139 | 
140 | 
141 | def test_cli_param_allowlist() -> None:
142 |     """Test parameter allowlist via CLI."""
143 |     url = "http://example.com?remove=me&keep=this&remove_too=true"
144 |     expected = "http://example.com/?keep=this"
145 |     # Use filter_params to enable filtering, then allowlist to keep specific ones
146 | 
147 |     result = run_cli("-f", "-p", "keep", url)
148 | 
149 |     assert result.returncode == 0
150 |     assert result.stdout.strip() == expected
151 |     assert not result.stderr
152 | 
153 | 
154 | def test_cli_param_allowlist_multiple() -> None:
155 |     """Test parameter allowlist with multiple params via CLI."""
156 |     url = "http://example.com?remove=me&keep=this&keep_too=yes&remove_too=true"
157 |     expected = "http://example.com/?keep=this&keep_too=yes"
158 | 
159 |     result = run_cli("-f", "-p", "keep,keep_too", url)
160 | 
161 |     assert result.returncode == 0
162 |     assert result.stdout.strip() == expected
163 |     assert not result.stderr
164 | 
165 | 
166 | def test_cli_param_allowlist_without_filtering() -> None:
167 |     """Test allowlist has no effect if filtering is not enabled."""
168 |     url = "http://example.com?remove=me&keep=this&remove_too=true"
169 |     expected = "http://example.com/?remove=me&keep=this&remove_too=true"
170 |     # Not using -f, so allowlist should be ignored
171 | 
172 |     result = run_cli("-p", "keep", url)
173 | 
174 |     assert result.returncode == 0
175 |     assert result.stdout.strip() == expected
176 |     assert not result.stderr
177 | 
178 | 
179 | def test_cli_no_url() -> None:
180 |     """Test CLI error when no URL is provided."""
181 |     result = run_cli()
182 | 
183 |     assert result.returncode != 0
184 |     assert "the following arguments are required: url" in result.stderr
185 | 
186 | 
187 | def test_cli_version_long() -> None:
188 |     """Test version output with --version flag."""
189 |     result = run_cli("--version")
190 | 
191 |     assert result.returncode == 0
192 |     assert __version__ in result.stdout
193 |     assert not result.stderr
194 | 
195 | 
196 | def test_cli_version_short() -> None:
197 |     """Test version output with -v flag."""
198 |     result = run_cli("-v")
199 | 
200 |     assert result.returncode == 0
201 |     assert __version__ in result.stdout
202 |     assert not result.stderr
203 | 
204 | 
205 | @pytest.mark.skipif(
206 |     sys.platform == "win32", reason="Charset handling differs on Windows CLI"
207 | )
208 | def test_cli_charset() -> None:
209 |     """Test charset handling via CLI (might be platform-dependent)."""
210 |     # Example using Cyrillic characters which need correct encoding
211 |     url = "http://пример.рф/path"
212 |     expected_idn = "http://xn--e1afmkfd.xn--p1ai/path"
213 | 
214 |     # Test with default UTF-8
215 |     result_utf8 = run_cli(url)
216 | 
217 |     assert result_utf8.returncode == 0
218 |     assert result_utf8.stdout.strip() == expected_idn
219 |     assert not result_utf8.stderr
220 | 
221 |     # Test specifying UTF-8 explicitly
222 |     result_charset = run_cli("--charset", "utf-8", url)
223 | 
224 |     assert result_charset.returncode == 0
225 |     assert result_charset.stdout.strip() == expected_idn
226 |     assert not result_charset.stderr
227 | 
228 |     # Test specifying UTF-8 explicitly using short arg
229 |     result_charset_short = run_cli("-c", "utf-8", url)
230 | 
231 |     assert result_charset_short.returncode == 0
232 |     assert result_charset_short.stdout.strip() == expected_idn
233 |     assert not result_charset_short.stderr
234 | 
235 | 
236 | def test_cli_default_domain() -> None:
237 |     """Test adding default domain to absolute path via CLI."""
238 |     url = "/path/to/image.png"
239 |     expected = "https://example.com/path/to/image.png"
240 | 
241 |     result = run_cli("--default-domain", "example.com", url)
242 | 
243 |     assert result.returncode == 0
244 |     assert result.stdout.strip() == expected
245 |     assert not result.stderr
246 | 
247 | 
248 | def test_cli_default_domain_short_arg() -> None:
249 |     """Test adding default domain using short argument."""
250 |     url = "/path/to/image.png"
251 |     expected = "https://example.com/path/to/image.png"
252 | 
253 |     result = run_cli("-d", "example.com", url)
254 | 
255 |     assert result.returncode == 0
256 |     assert result.stdout.strip() == expected
257 |     assert not result.stderr
258 | 
259 | 
260 | def test_cli_default_domain_with_scheme() -> None:
261 |     """Test adding default domain with custom scheme."""
262 |     url = "/path/to/image.png"
263 |     expected = "http://example.com/path/to/image.png"
264 | 
265 |     result = run_cli("-d", "example.com", "-s", "http", url)
266 | 
267 |     assert result.returncode == 0
268 |     assert result.stdout.strip() == expected
269 |     assert not result.stderr
270 | 
271 | 
272 | def test_cli_default_domain_no_effect_on_absolute_urls() -> None:
273 |     """Test default domain has no effect on absolute URLs."""
274 |     url = "http://original-domain.com/path"
275 |     expected = "http://original-domain.com/path"
276 | 
277 |     result = run_cli("-d", "example.com", url)
278 | 
279 |     assert result.returncode == 0
280 |     assert result.stdout.strip() == expected
281 |     assert not result.stderr
282 | 
283 | 
284 | def test_cli_default_domain_no_effect_on_relative_paths() -> None:
285 |     """Test default domain has no effect on relative paths."""
286 |     url = "path/to/file.html"
287 |     # The first segment is treated as the host and the default scheme is added
288 |     expected = "https://path/to/file.html"
289 | 
290 |     result = run_cli("-d", "example.com", url)
291 | 
292 |     assert result.returncode == 0
293 |     assert result.stdout.strip() == expected
294 |     assert not result.stderr
295 | 


--------------------------------------------------------------------------------
/tests/test_deconstruct_url.py:
--------------------------------------------------------------------------------
 1 | """Deconstruct url tests."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.tools import URL, deconstruct_url
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("url", "expected"),
10 |     [
11 |         (
12 |             "http://site.com",
13 |             URL(
14 |                 fragment="",
15 |                 host="site.com",
16 |                 path="",
17 |                 port="",
18 |                 query="",
19 |                 scheme="http",
20 |                 userinfo="",
21 |             ),
22 |         ),
23 |         (
24 |             "http://user@www.example.com:8080/path/index.html?param=val#fragment",
25 |             URL(
26 |                 fragment="fragment",
27 |                 host="www.example.com",
28 |                 path="/path/index.html",
29 |                 port="8080",
30 |                 query="param=val",
31 |                 scheme="http",
32 |                 userinfo="user@",
33 |             ),
34 |         ),
35 |     ],
36 | )
37 | def test_deconstruct_url_result_is_expected(url: str, expected: URL) -> None:
38 |     """Assert we got expected results from the deconstruct_url function."""
39 |     result = deconstruct_url(url)
40 |     assert result == expected, url
41 | 


--------------------------------------------------------------------------------
/tests/test_generic_url_cleanup.py:
--------------------------------------------------------------------------------
 1 | """Tests for generic_url_cleanup function."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | import pytest
 6 | 
 7 | from url_normalize.url_normalize import generic_url_cleanup
 8 | 
 9 | 
10 | @pytest.mark.parametrize(
11 |     ("url", "expected"),
12 |     [
13 |         ("//site/#!fragment", "//site/?_escaped_fragment_=fragment"),
14 |         ("//site/page", "//site/page"),
15 |         ("//site/?& ", "//site/"),
16 |     ],
17 | )
18 | def test_generic_url_cleanup_result_is_expected(url: str, expected: str) -> None:
19 |     """Assert we got expected results from the generic_url_cleanup function."""
20 |     result = generic_url_cleanup(url)
21 |     assert result == expected
22 | 


--------------------------------------------------------------------------------
/tests/test_normalize_fragment.py:
--------------------------------------------------------------------------------
 1 | """Tests for normalize_fragment function."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.url_normalize import normalize_fragment
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("fragment", "expected"),
10 |     [
11 |         ("", ""),
12 |         ("fragment", "fragment"),
13 |         ("пример", "%D0%BF%D1%80%D0%B8%D0%BC%D0%B5%D1%80"),
14 |         ("!fragment", "%21fragment"),
15 |         ("~fragment", "~fragment"),
16 |         # Issue #36: Equal sign should not be encoded
17 |         ("gid=1234", "gid=1234"),
18 |     ],
19 | )
20 | def test_normalize_fragment_result_is_expected(fragment: str, expected: str) -> None:
21 |     """Assert we got expected results from the normalize_fragment function."""
22 |     result = normalize_fragment(fragment)
23 |     assert result == expected, fragment
24 | 


--------------------------------------------------------------------------------
/tests/test_normalize_host.py:
--------------------------------------------------------------------------------
 1 | """Tests for normalize_host function."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.url_normalize import normalize_host
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("host", "expected"),
10 |     [
11 |         # Basic cases
12 |         ("site.com", "site.com"),
13 |         ("SITE.COM", "site.com"),
14 |         ("site.com.", "site.com"),
15 |         # Cyrillic domains
16 |         ("пример.испытание", "xn--e1afmkfd.xn--80akhbyknj4f"),
17 |         # Mixed case with Cyrillic
18 |         ("ExAmPle.РФ", "example.xn--p1ai"),
19 |         # IDNA2008 with UTS46
20 |         ("faß.de", "fass.de"),  # Normalize using transitional rules
21 |         # Edge cases
22 |         ("ドメイン.テスト", "xn--eckwd4c7c.xn--zckzah"),  # Japanese
23 |         ("domain.café", "domain.xn--caf-dma"),  # Latin with diacritic
24 |         # Normalization tests
25 |         ("über.example", "xn--ber-goa.example"),  # IDNA 2008 for umlaut
26 |         ("example。com", "example.com"),  # Normalize full-width punctuation
27 |     ],
28 | )
29 | def test_normalize_host_result_is_expected(host: str, expected: str) -> None:
30 |     """Assert we got expected results from the normalize_host function."""
31 |     result = normalize_host(host)
32 |     assert result == expected, host
33 | 


--------------------------------------------------------------------------------
/tests/test_normalize_path.py:
--------------------------------------------------------------------------------
 1 | """Tests for normalize_path function."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.url_normalize import normalize_path
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("path", "expected"),
10 |     [
11 |         ("..", "/"),
12 |         ("", "/"),
13 |         ("/../foo", "/foo"),
14 |         ("/..foo", "/..foo"),
15 |         ("/./../foo", "/foo"),
16 |         ("/./foo", "/foo"),
17 |         ("/./foo/.", "/foo/"),
18 |         ("/.foo", "/.foo"),
19 |         ("/", "/"),
20 |         ("/foo..", "/foo.."),
21 |         ("/foo.", "/foo."),
22 |         ("/FOO", "/FOO"),
23 |         ("/foo/../bar", "/bar"),
24 |         ("/foo/./bar", "/foo/bar"),
25 |         ("/foo//", "/foo/"),
26 |         ("/foo///bar//", "/foo/bar/"),
27 |         ("/foo/bar/..", "/foo/"),
28 |         ("/foo/bar/../..", "/"),
29 |         ("/foo/bar/../../../../baz", "/baz"),
30 |         ("/foo/bar/../../../baz", "/baz"),
31 |         ("/foo/bar/../../", "/"),
32 |         ("/foo/bar/../../baz", "/baz"),
33 |         ("/foo/bar/../", "/foo/"),
34 |         ("/foo/bar/../baz", "/foo/baz"),
35 |         ("/foo/bar/.", "/foo/bar/"),
36 |         ("/foo/bar/./", "/foo/bar/"),
37 |         # Issue #25: we should preserve an encoded ? (%3F) in the path
38 |         ("/More+Tea+Vicar%3F/discussion", "/More+Tea+Vicar%3F/discussion"),
39 |     ],
40 | )
41 | def test_normalize_path_result_is_expected(path: str, expected: str) -> None:
42 |     """Assert we got expected results from the normalize_path function."""
43 |     result = normalize_path(path, "http")
44 |     assert result == expected, path
45 | 


--------------------------------------------------------------------------------
/tests/test_normalize_port.py:
--------------------------------------------------------------------------------
 1 | """Tests for normalize_port function."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.url_normalize import normalize_port
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("port", "expected"),
10 |     [
11 |         ("8080", "8080"),  # Non-default port
12 |         ("", ""),  # Empty port
13 |         ("80", ""),  # Default HTTP port
14 |         ("string", "string"),  # Non-numeric port (should pass through)
15 |         # Default HTTPS port (checked against the 'https' scheme)
16 |         pytest.param("443", "", id="https_default_port"),
17 |     ],
18 | )
19 | def test_normalize_port_result_is_expected(port: str, expected: str):
20 |     """Assert we got expected results from the normalize_port function."""
21 |     # Use the 'https' scheme for port 443 and 'http' for all other cases
22 |     scheme = "https" if port == "443" else "http"
23 | 
24 |     result = normalize_port(port, scheme)
25 | 
26 |     assert result == expected
27 | 


--------------------------------------------------------------------------------
/tests/test_normalize_query.py:
--------------------------------------------------------------------------------
 1 | """Tests for normalize_query function."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.url_normalize import normalize_query
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("query", "expected"),
10 |     [
11 |         ("", ""),
12 |         ("&&&", ""),
13 |         ("param1=val1&param2=val2", "param1=val1&param2=val2"),
14 |         ("Ç=Ç", "%C3%87=%C3%87"),
15 |         ("%C3%87=%C3%87", "%C3%87=%C3%87"),
16 |         ("q=C%CC%A7", "q=%C3%87"),
17 |         ("q=%23test", "q=%23test"),  # Preserve encoded # in value, #31
18 |         ("where=code%3D123", "where=code%3D123"),  # Preserve encoded = in value, #25
19 |     ],
20 | )
21 | def test_normalize_query_result_is_expected(query, expected):
22 |     """Assert we got expected results from the normalize_query function."""
23 |     result = normalize_query(query)
24 |     assert result == expected, query
25 | 


--------------------------------------------------------------------------------
/tests/test_normalize_query_filters.py:
--------------------------------------------------------------------------------
 1 | """URL parameter filtering test module."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | import pytest
 6 | 
 7 | from url_normalize import url_normalize
 8 | 
 9 | 
10 | def test_param_filtering_disabled_by_default():
11 |     """Test that parameter filtering is disabled by default."""
12 |     url = "https://www.google.com/search?q=test&utm_source=test"
13 |     assert url_normalize(url) == url
14 | 
15 | 
16 | def test_empty_query():
17 |     """Test handling empty query strings."""
18 |     assert url_normalize("https://example.com/page?") == "https://example.com/page"
19 | 
20 | 
21 | def test_custom_allowlist():
22 |     """Test custom allowlist functionality with preserved order."""
23 |     custom_allowlist = {"example.com": ["page", "id"], "google.com": ["q", "lang"]}
24 | 
25 |     # Order should match input query string order
26 |     assert (
27 |         url_normalize(
28 |             "https://example.com/search?page=1&id=123&utm_source=test",
29 |             filter_params=True,
30 |             param_allowlist=custom_allowlist,
31 |         )
32 |         == "https://example.com/search?page=1&id=123"
33 |     )
34 | 
35 |     assert (
36 |         url_normalize(
37 |             "https://google.com/search?q=test&ie=utf8&lang=en",
38 |             filter_params=True,
39 |             param_allowlist=custom_allowlist,
40 |         )
41 |         == "https://google.com/search?q=test&lang=en"
42 |     )
43 | 
44 | 
45 | def test_custom_list_allowlist():
46 |     """Test custom list allowlist functionality."""
47 |     assert (
48 |         url_normalize(
49 |             "https://google.com/search?qq=test&ie=utf8&utm_source=test",
50 |             filter_params=True,
51 |             param_allowlist=["ie", "qq"],
52 |         )
53 |         == "https://google.com/search?qq=test&ie=utf8"
54 |     )
55 | 
56 | 
57 | @pytest.mark.parametrize(
58 |     ("url", "expected"),
59 |     [
60 |         # Basic parameter filtering
61 |         (
62 |             "https://www.google.com/search?q=test&utm_source=test",
63 |             "https://www.google.com/search?q=test",
64 |         ),
65 |         (
66 |             "https://www.youtube.com/watch?v=12345&utm_source=share",
67 |             "https://www.youtube.com/watch?v=12345",
68 |         ),
69 |         # With www subdomain
70 |         (
71 |             "https://www.google.com/search?q=test&ref=test",
72 |             "https://www.google.com/search?q=test",
73 |         ),
74 |         # With port number
75 |         (
76 |             "https://google.com:8080/search?q=test&ref=test",
77 |             "https://google.com:8080/search?q=test",
78 |         ),
79 |         # Default allowlist cases
80 |         (
81 |             "https://www.google.com/search?q=test&utm_source=test&ie=utf8",
82 |             "https://www.google.com/search?q=test&ie=utf8",
83 |         ),
84 |         (
85 |             "https://www.baidu.com/s?wd=test&utm_source=test&ie=utf8",
86 |             "https://www.baidu.com/s?wd=test&ie=utf8",
87 |         ),
88 |         (
89 |             "https://youtube.com/watch?v=12345&utm_source=test&search_query=test",
90 |             "https://youtube.com/watch?v=12345&search_query=test",
91 |         ),
92 |         # Non-allowlisted domain
93 |         ("https://example.org/page?a=1&b=2", "https://example.org/page"),
94 |     ],
95 | )
96 | def test_parameter_filtering(url: str, expected: str):
97 |     """Test URL parameter filtering functionality with various scenarios."""
98 |     assert url_normalize(url, filter_params=True) == expected
99 | 


--------------------------------------------------------------------------------
/tests/test_normalize_scheme.py:
--------------------------------------------------------------------------------
 1 | """Tests for normalize_scheme function."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.url_normalize import normalize_scheme
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("scheme", "expected"),
10 |     [
11 |         ("http", "http"),
12 |         ("HTTP", "http"),
13 |     ],
14 | )
15 | def test_normalize_scheme_result_is_expected(scheme: str, expected: str) -> None:
16 |     """Assert we got expected results from the normalize_scheme function."""
17 |     result = normalize_scheme(scheme)
18 |     assert result == expected, scheme
19 | 


--------------------------------------------------------------------------------
/tests/test_normalize_userinfo.py:
--------------------------------------------------------------------------------
 1 | """Tests for normalize_userinfo function."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.url_normalize import normalize_userinfo
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("userinfo", "expected"),
10 |     [
11 |         (":@", ""),
12 |         ("", ""),
13 |         ("@", ""),
14 |         ("user:password@", "user:password@"),
15 |         ("user@", "user@"),
16 |     ],
17 | )
18 | def test_normalize_userinfo_result_is_expected(userinfo: str, expected: str) -> None:
19 |     """Assert we got expected results from the normalize_userinfo function."""
20 |     result = normalize_userinfo(userinfo)
21 |     assert result == expected, userinfo
22 | 


--------------------------------------------------------------------------------
/tests/test_provide_url_domain.py:
--------------------------------------------------------------------------------
 1 | """Tests for provide_url_domain function."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.provide_url_domain import provide_url_domain
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("url", "expected"),
10 |     [
11 |         ("", ""),
12 |         ("-", "-"),
13 |         ("http://example.com/", "http://example.com/"),
14 |         ("/file/path", "//example.com/file/path"),
15 |         ("site/page", "site/page"),  # No change for relative paths
16 |     ],
17 | )
18 | def test_provide_url_domain_result_is_expected(url: str, expected: str) -> None:
19 |     """Assert we get expected results from provide_url_domain function."""
20 |     result = provide_url_domain(url, default_domain="example.com")
21 |     assert result == expected
22 | 
23 | 
24 | def test_provide_url_domain_accept_different_domains():
25 |     """Assert we could provide different default_domain values."""
26 |     url = "/file/path"
27 |     expected = "//custom-domain.org/file/path"
28 | 
29 |     actual = provide_url_domain(url, default_domain="custom-domain.org")
30 | 
31 |     assert actual == expected
32 | 


--------------------------------------------------------------------------------
/tests/test_provide_url_scheme.py:
--------------------------------------------------------------------------------
 1 | """Tests for provide_url_scheme function."""
 2 | 
 3 | import pytest
 4 | 
 5 | from url_normalize.url_normalize import provide_url_scheme
 6 | 
 7 | 
 8 | @pytest.mark.parametrize(
 9 |     ("url", "expected"),
10 |     [
11 |         ("", ""),
12 |         ("-", "-"),
13 |         ("/file/path", "/file/path"),
14 |         ("//site/path", "https://site/path"),
15 |         ("ftp://site/", "ftp://site/"),
16 |         ("site/page", "https://site/page"),
17 |     ],
18 | )
19 | def test_provide_url_scheme_result_is_expected(url: str, expected: str) -> None:
20 |     """Assert we got expected results from the provide_url_scheme function."""
21 |     result = provide_url_scheme(url)
22 |     assert result == expected, url
23 | 
24 | 
25 | def test_provide_url_scheme_accept_default_scheme_param() -> None:
26 |     """Assert we could provide default_scheme param other than https."""
27 |     url = "//site/path"
28 |     expected = "http://site/path"
29 | 
30 |     actual = provide_url_scheme(url, default_scheme="http")
31 | 
32 |     assert actual == expected
33 | 


--------------------------------------------------------------------------------
/tests/test_reconstruct_url.py:
--------------------------------------------------------------------------------
 1 | """Reconstruct url tests."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | import pytest
 6 | 
 7 | from url_normalize.tools import URL, reconstruct_url
 8 | 
 9 | 
10 | @pytest.mark.parametrize(
11 |     ("url_obj", "expected"),
12 |     [
13 |         (
14 |             URL(
15 |                 fragment="",
16 |                 host="site.com",
17 |                 path="",
18 |                 port="",
19 |                 query="",
20 |                 scheme="http",
21 |                 userinfo="",
22 |             ),
23 |             "http://site.com",
24 |         ),
25 |         (
26 |             URL(
27 |                 fragment="fragment",
28 |                 host="www.example.com",
29 |                 path="/path/index.html",
30 |                 port="8080",
31 |                 query="param=val",
32 |                 scheme="http",
33 |                 userinfo="user@",
34 |             ),
35 |             "http://user@www.example.com:8080/path/index.html?param=val#fragment",
36 |         ),
37 |     ],
38 | )
39 | def test_reconstruct_url_result_is_expected(url_obj: URL, expected: str) -> None:
40 |     """Assert we got expected results from the reconstruct_url function."""
41 |     result = reconstruct_url(url_obj)
42 |     assert result == expected, url_obj
43 | 


--------------------------------------------------------------------------------
/tests/test_tools.py:
--------------------------------------------------------------------------------
 1 | """Tools module tests."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | from url_normalize.tools import force_unicode
 6 | 
 7 | 
 8 | def test_force_unicode_with_bytes() -> None:
 9 |     """Test force_unicode handles bytes input correctly."""
10 |     test_bytes = b"hello world"
11 |     result = force_unicode(test_bytes)
12 |     assert result == "hello world"
13 | 


--------------------------------------------------------------------------------
/tests/test_url_normalize.py:
--------------------------------------------------------------------------------
  1 | """Integration tests."""
  2 | 
  3 | from __future__ import annotations
  4 | 
  5 | import pytest
  6 | 
  7 | from url_normalize import url_normalize
  8 | 
  9 | 
 10 | @pytest.mark.parametrize(
 11 |     "value",
 12 |     [
 13 |         "-",
 14 |         "",
 15 |         "/..foo",
 16 |         "/.foo",
 17 |         "/foo..",
 18 |         "/foo.",
 19 |         "ftp://user:pass@ftp.foo.net/foo/bar",
 20 |         "http://127.0.0.1/",
 21 |         "http://example.com:8080/",
 22 |         "http://example.com/?a&b",
 23 |         "http://example.com/?q=%5C",
 24 |         "http://example.com/?q=%C3%87",
 25 |         "http://example.com/?q=%E2%85%A0",
 26 |         "http://example.com/",
 27 |         "http://example.com/~jane",
 28 |         "http://example.com/a/b",
 29 |         "http://example.com/FOO",
 30 |         "http://user:password@example.com/",
 31 |         "http://www.foo.com:8000/foo",
 32 |         # from rfc2396bis
 33 |         "ftp://ftp.is.co.za/rfc/rfc1808.txt",
 34 |         "http://www.ietf.org/rfc/rfc2396.txt",
 35 |         "ldap://[2001:db8::7]/c=GB?objectClass?one",
 36 |         "mailto:John.Doe@example.com",
 37 |         "news:comp.infosystems.www.servers.unix",
 38 |         "tel:+1-816-555-1212",
 39 |         "telnet://192.0.2.16:80/",
 40 |         "urn:oasis:names:specification:docbook:dtd:xml:4.1.2",
 41 |         # Issue #36: Fragment with '=' should not be encoded
 42 |         "https://docs.google.com/spreadsheets/d/abcd/edit#gid=1234",
 43 |     ],
 44 | )
 45 | def test_url_normalize_no_changes_expected(value: str) -> None:
 46 |     """Assert url_normalize does not change the URI if not required.
 47 | 
 48 |     http://www.intertwingly.net/wiki/pie/PaceCanonicalIds
 49 |     """
 50 |     assert url_normalize(value) == value
 51 | 
 52 | 
 53 | @pytest.mark.parametrize(
 54 |     ("value", "expected"),
 55 |     [
 56 |         ("/../foo", "/foo"),
 57 |         ("/./../foo", "/foo"),
 58 |         ("/./foo", "/foo"),
 59 |         ("/./foo/.", "/foo/"),
 60 |         ("//www.foo.com/", "https://www.foo.com/"),
 61 |         ("/foo/../bar", "/bar"),
 62 |         ("/foo/./bar", "/foo/bar"),
 63 |         ("/foo//", "/foo/"),
 64 |         ("/foo///bar//", "/foo/bar/"),
 65 |         ("/foo/bar/..", "/foo/"),
 66 |         ("/foo/bar/../..", "/"),
 67 |         ("/foo/bar/../../../../baz", "/baz"),
 68 |         ("/foo/bar/../../../baz", "/baz"),
 69 |         ("/foo/bar/../../", "/"),
 70 |         ("/foo/bar/../../baz", "/baz"),
 71 |         ("/foo/bar/../", "/foo/"),
 72 |         ("/foo/bar/../baz", "/foo/baz"),
 73 |         ("/foo/bar/.", "/foo/bar/"),
 74 |         ("/foo/bar/./", "/foo/bar/"),
 75 |         ("http://:@example.com/", "http://example.com/"),
 76 |         ("http://@example.com/", "http://example.com/"),
 77 |         ("http://127.0.0.1:80/", "http://127.0.0.1/"),
 78 |         ("http://example.com:081/", "http://example.com:81/"),
 79 |         ("http://example.com:80/", "http://example.com/"),
 80 |         ("http://example.com", "http://example.com/"),
 81 |         ("http://example.com/?b&a", "http://example.com/?b&a"),
 82 |         ("http://example.com/?q=%5c", "http://example.com/?q=%5C"),
 83 |         ("http://example.com/?q=%C7", "http://example.com/?q=%EF%BF%BD"),
 84 |         ("http://example.com/?q=C%CC%A7", "http://example.com/?q=%C3%87"),
 85 |         ("http://EXAMPLE.COM/", "http://example.com/"),
 86 |         ("http://example.com/%7Ejane", "http://example.com/~jane"),
 87 |         ("http://example.com/a/../a/b", "http://example.com/a/b"),
 88 |         ("http://example.com/a/./b", "http://example.com/a/b"),
 89 |         (
 90 |             "http://example.com/#!5753509/hello-world",
 91 |             "http://example.com/?_escaped_fragment_=5753509/hello-world",
 92 |         ),
 93 |         (
 94 |             "http://USER:pass@www.Example.COM/foo/bar",
 95 |             "http://USER:pass@www.example.com/foo/bar",
 96 |         ),
 97 |         ("http://www.example.com./", "http://www.example.com/"),
 98 |         ("http://www.foo.com:80/foo", "http://www.foo.com/foo"),
 99 |         ("http://www.foo.com.:81/foo", "http://www.foo.com:81/foo"),
100 |         ("http://www.foo.com./foo/bar.html", "http://www.foo.com/foo/bar.html"),
101 |         ("http://www.foo.com/%7Ebar", "http://www.foo.com/~bar"),
102 |         ("http://www.foo.com/%7ebar", "http://www.foo.com/~bar"),
103 |         (
104 |             "пример.испытание/Служебная:Search/Test",
105 |             "https://xn--e1afmkfd.xn--80akhbyknj4f/%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%D0%BD%D0%B0%D1%8F:Search/Test",
106 |         ),
107 |         # Issue #19: http:example.com
108 |         ("http:example.com", "http://example.com/"),
109 |         ("http:example.com/path", "http://example.com/path"),
110 |         ("ftp:test.com/files", "ftp://test.com/files"),
111 |         ("https:www.example.com", "https://www.example.com/"),
112 |     ],
113 | )
114 | def test_url_normalize_expected_changes(value: str, expected: str) -> None:
115 |     """Assert url_normalize returns the expected results."""
116 |     assert url_normalize(value) == expected
117 | 
118 | 
119 | def test_url_normalize_filtered() -> None:
120 |     """Assert filter_params=True drops query parameters that are not allowlisted."""
121 |     url = "/?a&b"
122 |     expected = "/"
123 | 
124 |     actual = url_normalize(url, filter_params=True)
125 | 
126 |     assert actual == expected
127 | 
128 | 
129 | def test_url_normalize_with_http_scheme() -> None:
130 |     """Assert we could use http scheme as default."""
131 |     url = "//www.foo.com/"
132 |     expected = "http://www.foo.com/"
133 | 
134 |     actual = url_normalize(url, default_scheme="http")
135 | 
136 |     assert actual == expected
137 | 
138 | 
139 | @pytest.mark.parametrize(
140 |     ("url", "expected"),
141 |     [
142 |         ("/foo.png", "https://example.com/foo.png"),
143 |         ("//google.com", "https://google.com/"),
144 |         ("//example.com:80/foo.png", "https://example.com:80/foo.png"),
145 |         ("//example.com/foo.png?foo=bar", "https://example.com/foo.png?foo=bar"),
146 |         ("http://google.com", "http://google.com/"),
147 |     ],
148 | )
149 | def test_url_normalize_with_default_domain(url: str, expected: str) -> None:
150 |     """Assert we could use default_domain parameter."""
151 |     actual = url_normalize(url, default_domain="example.com")
152 | 
153 |     assert actual == expected
154 | 
155 | 
156 | def test_url_normalize_with_default_domain_and_scheme() -> None:
157 |     """Assert we can use both default_domain and default_scheme together."""
158 |     url = "/foo.png"
159 |     expected = "http://example.com/foo.png"
160 | 
161 |     actual = url_normalize(url, default_scheme="http", default_domain="example.com")
162 | 
163 |     assert actual == expected
164 | 


--------------------------------------------------------------------------------
/url_normalize/__init__.py:
--------------------------------------------------------------------------------
 1 | """URI normalize.
 2 | 
 3 | Copyright (c) 2020 Nikolay Panov
 4 | SPDX-License-Identifier: MIT
 5 | 
 6 | """
 7 | 
 8 | from .url_normalize import url_normalize
 9 | 
10 | __license__ = "MIT"
11 | __version__ = "2.2.1"
12 | 
13 | __all__ = ["url_normalize"]
14 | 


--------------------------------------------------------------------------------
/url_normalize/cli.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """Command line interface for url-normalize."""
 3 | 
 4 | import argparse
 5 | import sys
 6 | from importlib.metadata import version
 7 | 
 8 | from .url_normalize import url_normalize
 9 | 
10 | 
11 | def main() -> None:
12 |     """Parse arguments and run url_normalize."""
13 |     parser = argparse.ArgumentParser(description="Normalize a URL.")
14 |     parser.add_argument(
15 |         "-v",
16 |         "--version",
17 |         action="version",
18 |         version=f"%(prog)s {version('url-normalize')}",
19 |     )
20 |     parser.add_argument("url", help="The URL to normalize.")
21 |     parser.add_argument(
22 |         "-c",
23 |         "--charset",
24 |         default="utf-8",
25 |         help="The charset of the URL. Default: utf-8",
26 |     )
27 |     parser.add_argument(
28 |         "-s",
29 |         "--default-scheme",
30 |         default="https",
31 |         help="The default scheme to use if missing. Default: https",
32 |     )
33 |     parser.add_argument(
34 |         "-f",
35 |         "--filter-params",
36 |         action="store_true",
37 |         help="Filter common tracking parameters.",
38 |     )
39 |     parser.add_argument(
40 |         "-d",
41 |         "--default-domain",
42 |         type=str,
43 |         help="Default domain to use for absolute paths (starting with '/').",
44 |     )
45 |     parser.add_argument(
46 |         "-p",
47 |         "--param-allowlist",
48 |         type=str,
49 |         help="Comma-separated list of query parameters to allow (e.g., 'q,id').",
50 |     )
51 | 
52 |     args = parser.parse_args()
53 | 
54 |     allowlist = args.param_allowlist.split(",") if args.param_allowlist else None
55 | 
56 |     try:
57 |         normalized_url = url_normalize(
58 |             args.url,
59 |             charset=args.charset,
60 |             default_scheme=args.default_scheme,
61 |             default_domain=args.default_domain,
62 |             filter_params=args.filter_params,
63 |             param_allowlist=allowlist,
64 |         )
65 |     except Exception as e:  # noqa: BLE001
66 |         print(f"Error normalizing URL: {e}", file=sys.stderr)  # noqa: T201
67 |         sys.exit(1)
68 |     else:
69 |         print(normalized_url)  # noqa: T201
70 | 
71 | 
72 | if __name__ == "__main__":
73 |     main()  # pragma: no cover
74 | 


--------------------------------------------------------------------------------
/url_normalize/generic_url_cleanup.py:
--------------------------------------------------------------------------------
 1 | """URL generic cleanup operations."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | 
 6 | def generic_url_cleanup(url: str) -> str:
 7 |     """Cleanup the URL from unnecessary data and convert to final form.
 8 | 
 9 |     Converts shebang urls to final form and removes unnecessary data from the url.
10 | 
11 |     Params:
12 |         url : string : the URL
13 | 
14 |     Returns:
15 |         string : updated url
16 | 
17 |     """
18 |     url = url.replace("#!", "?_escaped_fragment_=")
19 |     return url.rstrip("&? ")
20 | 


--------------------------------------------------------------------------------
/url_normalize/normalize_fragment.py:
--------------------------------------------------------------------------------
 1 | """URL fragment normalization."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | from .tools import quote, unquote
 6 | 
 7 | 
 8 | def normalize_fragment(fragment: str) -> str:
 9 |     """Normalize fragment part of the url.
10 | 
11 |     Params:
12 |         fragment : string : url fragment, e.g., 'fragment'
13 | 
14 |     Returns:
15 |         string : normalized fragment data.
16 | 
17 |     Notes:
18 |         According to RFC 3986, the following characters are allowed in a fragment:
19 |         fragment    = *( pchar / "/" / "?" )
20 |         pchar       = unreserved / pct-encoded / sub-delims / ":" / "@"
21 |         unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
22 |         sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
23 |         We specifically allow "~" and "=" as safe characters during normalization.
24 |         Other sub-delimiters could potentially be added to the `safe` list if needed.
25 | 
26 |     """
27 |     return quote(unquote(fragment), safe="~=")
28 | 


--------------------------------------------------------------------------------
/url_normalize/normalize_host.py:
--------------------------------------------------------------------------------
 1 | """URL host normalization."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | import idna
 6 | 
 7 | from .tools import force_unicode
 8 | 
 9 | DEFAULT_CHARSET = "utf-8"
10 | 
11 | 
12 | def normalize_host(host: str, charset: str = DEFAULT_CHARSET) -> str:
13 |     """Normalize host part of the url.
14 | 
15 |     Lowercase and strip of final dot.
16 |     Also, handle IDN domains using IDNA2008 with UTS46 transitional processing.
17 | 
18 |     Params:
19 |         host : string : url host, e.g., 'site.com'
20 |         charset : string : encoding charset
21 | 
22 |     Returns:
23 |         string : normalized host data.
24 | 
25 |     """
26 |     host = force_unicode(host, charset)
27 |     host = host.lower()
28 |     host = host.strip(".")
29 | 
30 |     # Split domain into parts to handle each label separately
31 |     parts = host.split(".")
32 |     try:
33 |         # Process each label separately to handle mixed unicode/ascii domains
34 |         parts = [
35 |             idna.encode(p, uts46=True, transitional=True).decode(charset)
36 |             for p in parts
37 |             if p
38 |         ]
39 |         return ".".join(parts)
40 |     except idna.IDNAError:
41 |         # Fallback to direct encoding if IDNA2008 processing fails
42 |         return host.encode("idna").decode(charset)
43 | 


--------------------------------------------------------------------------------
/url_normalize/normalize_path.py:
--------------------------------------------------------------------------------
 1 | """URL path normalization."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | from .tools import quote, unquote
 6 | 
 7 | 
 8 | def normalize_path(path: str, scheme: str) -> str:
 9 |     """Normalize path part of the url.
10 | 
11 |     Remove mention of default path number
12 | 
13 |     Params:
14 |         path : string : url path, e.g., '/section/page.html'
15 |         scheme : string : url scheme, e.g., 'http'
16 | 
17 |     Returns:
18 |         string : normalized path data.
19 | 
20 |     """
21 |     # Only perform percent-encoding where it is essential.
22 |     # Always use uppercase A-through-F characters when percent-encoding.
23 |     # All portions of the URI must be utf-8 encoded NFC from Unicode strings
24 |     path = quote(unquote(path), "~:/#[]@!$&'()*+,;=")
25 |     # Prevent dot-segments appearing in non-relative URI paths.
26 |     if scheme in {"", "http", "https", "ftp", "file"}:
27 |         output: list[str] = []
28 |         for part in path.split("/"):
29 |             if part == "":
30 |                 if not output:
31 |                     output.append(part)
32 |             elif part == ".":
33 |                 pass
34 |             elif part == "..":
35 |                 if len(output) > 1:
36 |                     output.pop()
37 |             else:
38 |                 output.append(part)
39 |         # The part variable is used in the final check
40 |         last_part = part
41 |         if last_part in {"", ".", ".."}:
42 |             output.append("")
43 |         path = "/".join(output)
44 |     # For schemes that define an empty path to be equivalent to a path of "/",
45 |     # use "/".
46 |     if not path and scheme in {"http", "https", "ftp", "file"}:
47 |         path = "/"
48 |     return path
49 | 


--------------------------------------------------------------------------------
/url_normalize/normalize_port.py:
--------------------------------------------------------------------------------
 1 | """URL port normalization."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | DEFAULT_PORT = {
 6 |     "ftp": "21",
 7 |     "gopher": "70",
 8 |     "http": "80",
 9 |     "https": "443",
10 |     "news": "119",
11 |     "nntp": "119",
12 |     "snews": "563",
13 |     "snntp": "563",
14 |     "telnet": "23",
15 |     "ws": "80",
16 |     "wss": "443",
17 | }
18 | 
19 | 
20 | def normalize_port(port: str, scheme: str) -> str:
21 |     """Normalize port part of the url.
22 | 
23 |     Remove mention of default port number
24 | 
25 |     Params:
26 |         port : string : url port, e.g., '8080'
27 |         scheme : string : url scheme, e.g., 'http'
28 | 
29 |     Returns:
30 |         string : normalized port data.
31 | 
32 |     """
33 |     if not port.isdigit():
34 |         return port
35 |     port = str(int(port))
36 |     if DEFAULT_PORT.get(scheme) == port:
37 |         return ""
38 |     return port
39 | 


--------------------------------------------------------------------------------
/url_normalize/normalize_query.py:
--------------------------------------------------------------------------------
 1 | """URL query normalization."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | from .param_allowlist import get_allowed_params
 6 | from .tools import quote, unquote
 7 | 
 8 | QUERY_PARAM_SAFE_CHARS = "~:/?[]@!$'()*+,;"
 9 | 
10 | 
11 | def process_query_param(param: str) -> str:
12 |     """Process a single query parameter.
13 | 
14 |     This function normalizes the parameter by quoting reserved characters
15 |     and ensuring the parameter is in the correct format.
16 | 
17 |     Params:
18 |         param: The query parameter to process.
19 | 
20 |     Returns:
21 |         str: The normalized query parameter.
22 | 
23 |     """
24 |     if not param:
25 |         return ""
26 |     return quote(unquote(param), QUERY_PARAM_SAFE_CHARS)
27 | 
28 | 
def normalize_query(
    query: str,
    *,  # Force keyword-only arguments
    host: str | None = None,
    filter_params: bool = False,
    param_allowlist: list | dict | None = None,
) -> str:
    """Normalize query while preserving parameter order.

    Each ``key=value`` pair is split on the first "=", both sides are
    normalized with process_query_param, and empty parameters (e.g. from
    "&&") are dropped. A parameter whose value is empty is emitted as the
    bare key.

    Params:
        query: URL query string (e.g. 'param1=val1&param2')
        host: Domain for allowlist checks
        filter_params: If True, removes non-allowlisted parameters
        param_allowlist: Optional override for default allowlist

    Returns:
        Normalized query string with original parameter order

    """
    if not query:
        return ""

    # The allowlist depends only on host/param_allowlist, so resolve it once
    # instead of once per parameter. None means "no filtering".
    allowed_params = (
        get_allowed_params(host, param_allowlist) if filter_params else None
    )

    processed = []
    for param in query.split("&"):
        if not param:
            continue
        key, _, value = param.partition("=")
        key = process_query_param(key)
        # Filtering compares the normalized key against the allowlist.
        if allowed_params is not None and key not in allowed_params:
            continue
        value = process_query_param(value)
        processed.append(f"{key}={value}" if value else key)

    return "&".join(processed)
65 | 


--------------------------------------------------------------------------------
/url_normalize/normalize_scheme.py:
--------------------------------------------------------------------------------
 1 | """URL scheme normalization."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | DEFAULT_SCHEME = "https"
 6 | 
 7 | 
 8 | def normalize_scheme(scheme: str) -> str:
 9 |     """Normalize scheme part of the url.
10 | 
11 |     Params:
12 |         scheme : string : url scheme, e.g., 'https'
13 | 
14 |     Returns:
15 |         string : normalized scheme data.
16 | 
17 |     """
18 |     return scheme.lower()
19 | 


--------------------------------------------------------------------------------
/url_normalize/normalize_userinfo.py:
--------------------------------------------------------------------------------
 1 | """URL userinfo normalization."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | 
 6 | def normalize_userinfo(userinfo: str) -> str:
 7 |     """Normalize userinfo part of the url.
 8 | 
 9 |     Params:
10 |         userinfo : string : url userinfo, e.g., 'user@'
11 | 
12 |     Returns:
13 |         string : normalized userinfo data.
14 | 
15 |     """
16 |     if userinfo in ["@", ":@"]:
17 |         return ""
18 |     return userinfo
19 | 


--------------------------------------------------------------------------------
/url_normalize/param_allowlist.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2023. All rights reserved.
 2 | """URL query parameter allowlist module."""
 3 | 
 4 | from __future__ import annotations
 5 | 
 6 | DEFAULT_ALLOWLIST = {
 7 |     "google.com": ["q", "ie"],
 8 |     "baidu.com": ["wd", "ie"],
 9 |     "bing.com": ["q"],
10 |     "youtube.com": ["v", "search_query"],
11 | }
12 | 
13 | 
14 | def get_allowed_params(
15 |     host: str | None = None,
16 |     allowlist: dict | list | None = None,
17 | ) -> set[str]:
18 |     """Get allowed parameters for a given domain.
19 | 
20 |     Params:
21 |         host: Domain name to check (e.g. 'google.com')
22 |         allowlist: Optional override for default allowlist
23 |             If provided as a list, it will be used as is.
24 |             If provided as a dictionary, it should map domain names to
25 |             lists of allowed parameters.
26 |             If None, the default allowlist will be used.
27 | 
28 |     Returns:
29 |         Set of allowed parameter names for the domain
30 | 
31 |     """
32 |     if isinstance(allowlist, list):
33 |         return set(allowlist)
34 | 
35 |     if not host:
36 |         return set()
37 | 
38 |     # Normalize host by removing www and port
39 |     domain = host.lower()
40 |     if domain.startswith("www."):
41 |         domain = domain[4:]
42 |     domain = domain.split(":")[0]
43 | 
44 |     # Use default allowlist if none provided
45 |     if allowlist is None:
46 |         allowlist = DEFAULT_ALLOWLIST
47 | 
48 |     # Return allowed parameters for the domain, or an empty set if not found
49 |     return set(allowlist.get(domain, []))
50 | 


--------------------------------------------------------------------------------
/url_normalize/provide_url_domain.py:
--------------------------------------------------------------------------------
 1 | """URL domain validation and attachment."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | 
 6 | def provide_url_domain(url: str, default_domain: str | None = None) -> str:
 7 |     """Add default domain to URL if needed.
 8 | 
 9 |     For absolute paths (starting with '/'), adds the specified default domain.
10 | 
11 |     Params:
12 |         url : str : the URL
13 |         default_domain : str | None : default domain to use, e.g. 'example.com'
14 | 
15 |     Returns:
16 |         str : URL with domain added if applicable
17 | 
18 |     """
19 |     # Skip processing if no default domain provided or URL is empty or stdout
20 |     if not default_domain or not url or url == "-":
21 |         return url
22 | 
23 |     # Only apply to absolute paths (starting with '/')
24 |     # but not scheme-relative URLs ('//')
25 |     if url.startswith("/") and not url.startswith("//"):
26 |         return "//" + default_domain + url
27 | 
28 |     return url
29 | 


--------------------------------------------------------------------------------
/url_normalize/provide_url_scheme.py:
--------------------------------------------------------------------------------
 1 | """URL scheme validation and attachment."""
 2 | 
 3 | from __future__ import annotations
 4 | 
 5 | from .normalize_scheme import DEFAULT_SCHEME
 6 | 
 7 | # Schemes that require authority component reconstruction with //
 8 | AUTHORITY_SCHEMES = frozenset(["http", "https", "ftp", "ftps"])
 9 | 
10 | 
11 | def provide_url_scheme(url: str, default_scheme: str = DEFAULT_SCHEME) -> str:
12 |     """Make sure we have valid url scheme.
13 | 
14 |     Params:
15 |         url : string : the URL
16 |         default_scheme : string : default scheme to use, e.g. 'https'
17 | 
18 |     Returns:
19 |         string : updated url with validated/attached scheme
20 | 
21 |     """
22 |     has_scheme = ":" in url[:7]
23 |     is_universal_scheme = url.startswith("//")
24 |     is_file_path = url == "-" or (url.startswith("/") and not is_universal_scheme)
25 |     if not url or is_file_path:
26 |         return url
27 |     if not has_scheme:
28 |         return f"{default_scheme}://{url.lstrip('/')}"
29 |     scheme_part, rest = url.split(":", 1)
30 |     if scheme_part.lower() not in AUTHORITY_SCHEMES:
31 |         # handle cases like tel:, mailto:, etc.
32 |         return url
33 |     return f"{scheme_part}://{rest.lstrip('/')}"
34 | 


--------------------------------------------------------------------------------
/url_normalize/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niksite/url-normalize/918b135085d73f2a0441048ee01643c60fc7d89a/url_normalize/py.typed


--------------------------------------------------------------------------------
/url_normalize/tools.py:
--------------------------------------------------------------------------------
  1 | """URL normalization tools."""
  2 | 
  3 | from __future__ import annotations
  4 | 
  5 | import re
  6 | import unicodedata
  7 | from typing import NamedTuple
  8 | from urllib.parse import quote as quote_orig
  9 | from urllib.parse import unquote as unquote_orig
 10 | from urllib.parse import urlsplit, urlunsplit
 11 | 
 12 | 
 13 | class URL(NamedTuple):
 14 |     """URL components tuple.
 15 | 
 16 |     A named tuple containing the parsed components of a URL:
 17 |     scheme, userinfo, host, port, path, query, and fragment.
 18 |     """
 19 | 
 20 |     scheme: str
 21 |     userinfo: str
 22 |     host: str
 23 |     port: str
 24 |     path: str
 25 |     query: str
 26 |     fragment: str
 27 | 
 28 | 
 29 | def deconstruct_url(url: str) -> URL:
 30 |     """Transform the url into URL structure.
 31 | 
 32 |     Params:
 33 |         url : string : the URL
 34 | 
 35 |     Returns:
 36 |         URL
 37 | 
 38 |     """
 39 |     scheme, auth, path, query, fragment = urlsplit(url.strip())
 40 |     match = re.search(r"([^@]*@)?([^:]*):?(.*)", auth)
 41 |     (userinfo, host, port) = match.groups()  # type: ignore  # noqa: PGH003
 42 |     return URL(
 43 |         fragment=fragment,
 44 |         host=host,
 45 |         path=path,
 46 |         port=port or "",
 47 |         query=query,
 48 |         scheme=scheme,
 49 |         userinfo=userinfo or "",
 50 |     )
 51 | 
 52 | 
 53 | def reconstruct_url(url: URL) -> str:
 54 |     """Reconstruct string url from URL.
 55 | 
 56 |     Params:
 57 |         url : URL object instance
 58 | 
 59 |     Returns:
 60 |         string : reconstructed url string
 61 | 
 62 |     """
 63 |     auth = (url.userinfo or "") + url.host
 64 |     if url.port:
 65 |         auth += ":" + url.port
 66 |     return urlunsplit((url.scheme, auth, url.path, url.query, url.fragment))
 67 | 
 68 | 
 69 | def force_unicode(string: str | bytes, charset: str = "utf-8") -> str:
 70 |     """Ensure string is properly encoded (Python 3 only).
 71 | 
 72 |     Params:
 73 |         string : str : an input string
 74 |         charset : str : optional : output encoding
 75 | 
 76 |     Returns:
 77 |         str
 78 | 
 79 |     """
 80 |     if isinstance(string, bytes):
 81 |         return string.decode(charset, "replace")
 82 |     return string
 83 | 
 84 | 
 85 | def unquote(string: str, charset: str = "utf-8") -> str:
 86 |     """Unquote and normalize unicode string.
 87 | 
 88 |     Params:
 89 |         string : string to be unquoted
 90 |         charset : string : optional : output encoding
 91 | 
 92 |     Returns:
 93 |         string : an unquoted and normalized string
 94 | 
 95 |     """
 96 |     string = unquote_orig(string)
 97 |     string = force_unicode(string, charset)
 98 |     encoded_str = unicodedata.normalize("NFC", string).encode(charset)
 99 |     return encoded_str.decode(charset)
100 | 
101 | 
def quote(string: str, safe: str = "/") -> str:
    """Percent-encode a string via urllib.parse.quote.

    Params:
        string : string to be quoted
        safe : string of characters that must not be encoded

    Returns:
        string : quoted string

    """
    return quote_orig(string, safe=safe)
114 | 


--------------------------------------------------------------------------------
/url_normalize/url_normalize.py:
--------------------------------------------------------------------------------
 1 | """URL normalize main module.
 2 | 
 3 | Copyright (c) 2020 Nikolay Panov
 4 | This module is part of url-normalize package and is released under the MIT License:
 5 | https://opensource.org/licenses/MIT
 6 | 
 7 | """
 8 | 
 9 | from __future__ import annotations
10 | 
11 | from .generic_url_cleanup import generic_url_cleanup
12 | from .normalize_fragment import normalize_fragment
13 | from .normalize_host import DEFAULT_CHARSET, normalize_host
14 | from .normalize_path import normalize_path
15 | from .normalize_port import normalize_port
16 | from .normalize_query import normalize_query
17 | from .normalize_scheme import DEFAULT_SCHEME, normalize_scheme
18 | from .normalize_userinfo import normalize_userinfo
19 | from .provide_url_domain import provide_url_domain
20 | from .provide_url_scheme import provide_url_scheme
21 | from .tools import deconstruct_url, reconstruct_url
22 | 
23 | 
def url_normalize(  # noqa: PLR0913
    url: str | None,
    *,  # Force keyword-only arguments
    charset: str = DEFAULT_CHARSET,
    default_scheme: str = DEFAULT_SCHEME,
    default_domain: str | None = None,
    filter_params: bool = False,
    param_allowlist: dict | list | None = None,
) -> str | None:
    """URI normalization routine.

    User-supplied URLs are often not strictly valid, e.g. because they
    contain unsafe characters such as spaces. This function repairs a number
    of such problems, much like a browser does with typed-in addresses:

    >>> url_normalize('http://de.wikipedia.org/wiki/Elf (Begriffsklärung)')
    'http://de.wikipedia.org/wiki/Elf%20(Begriffskl%C3%A4rung)'

    Params:
        url : str | None : URL to normalize
        charset : str : optional
            The target charset for the URL if the url was given as unicode string
        default_scheme : str : default scheme to use if none present
        default_domain : str | None : optional
            Default domain to use for absolute paths (starting with '/')
        filter_params : bool : optional
            Whether to filter non-allowlisted parameters (False by default)
        param_allowlist : dict | list | None : optional
            Override for the parameter allowlist

    Returns:
        str | None : a normalized url (empty or None input is returned as is)

    """
    if not url:
        return url

    # Stage 1: complete and tidy the raw string before parsing it.
    prepared = provide_url_domain(url, default_domain)
    prepared = provide_url_scheme(prepared, default_scheme)
    prepared = generic_url_cleanup(prepared)
    parts = deconstruct_url(prepared)

    # Stage 2: normalize each component on its own.
    scheme = normalize_scheme(parts.scheme)
    userinfo = normalize_userinfo(parts.userinfo)
    host = normalize_host(parts.host, charset)
    query = normalize_query(
        parts.query,
        host=parts.host,  # the allowlist lookup receives the parsed host
        filter_params=filter_params,
        param_allowlist=param_allowlist,
    )
    fragment = normalize_fragment(parts.fragment)
    # Port and path rules depend on the already-normalized scheme.
    port = normalize_port(parts.port, scheme)
    path = normalize_path(parts.path, scheme)

    normalized = parts._replace(
        scheme=scheme,
        userinfo=userinfo,
        host=host,
        port=port,
        path=path,
        query=query,
        fragment=fragment,
    )
    return reconstruct_url(normalized)
82 | 


--------------------------------------------------------------------------------