├── .bumpversion.cfg ├── .github └── workflows │ ├── checks.yml │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── pyproject.toml ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── files │ ├── NonID3.mp3 │ ├── foo.exe │ ├── foo.gif │ ├── foo.html │ ├── foo.mp3 │ ├── foo.mp4 │ ├── foo.pdf │ ├── foo.ps │ ├── foo.ttf │ ├── foo.txt │ ├── foo.webm │ ├── foo.xml │ └── foo.zip ├── requirements.txt ├── test_main.py ├── test_mimegroups.py └── test_utils.py ├── tox.ini └── xtractmime ├── __init__.py ├── _patterns.py ├── _utils.py └── mimegroups.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.2.1 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:xtractmime/__init__.py] 7 | 8 | [bumpversion:file:setup.py] 9 | -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | name: Checks 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | checks: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | include: 12 | - python-version: 3 13 | env: 14 | TOXENV: black 15 | - python-version: 3 16 | env: 17 | TOXENV: bandit 18 | - python-version: 3 19 | env: 20 | TOXENV: flake8 21 | - python-version: 3.8 22 | env: 23 | TOXENV: typing 24 | 25 | steps: 26 | - uses: actions/checkout@v2 27 | 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v2 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | 33 | - name: Run check 34 | env: ${{ matrix.env }} 35 | run: | 36 | pip install -U pip 37 | pip install -U tox 38 | tox 39 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | on: 
3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | publish: 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v2 12 | 13 | - name: Set up Python 3 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3 17 | 18 | - name: Publish to PyPI 19 | run: | 20 | pip install --upgrade pip 21 | pip install --upgrade setuptools wheel twine 22 | python setup.py sdist bdist_wheel 23 | export TWINE_USERNAME=__token__ 24 | export TWINE_PASSWORD=${{ secrets.PYPI_TOKEN }} 25 | twine upload dist/* 26 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | tests-ubuntu: 7 | name: "Test: py${{ matrix.python-version }}, Ubuntu" 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: [3.7, 3.8, 3.9, "3.10", "3.11", "pypy3.7"] 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | 22 | - name: Install tox 23 | run: pip install tox 24 | 25 | - name: Run tests 26 | run: tox -e py 27 | 28 | - name: Upload coverage report 29 | run: bash <(curl -s https://codecov.io/bash) 30 | 31 | tests-other-os: 32 | name: "Test: py3.8, ${{ matrix.os }}" 33 | runs-on: "${{ matrix.os }}" 34 | strategy: 35 | fail-fast: false 36 | matrix: 37 | os: [macos-latest, windows-latest] 38 | 39 | steps: 40 | - uses: actions/checkout@v3 41 | 42 | - name: Set up Python 3.8 43 | uses: actions/setup-python@v4 44 | with: 45 | python-version: 3.8 46 | 47 | - name: Install tox 48 | run: pip install tox 49 | 50 | - name: Run tests 51 | run: tox -e py 52 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.pyc 2 | .~lock* 3 | .DS_Store 4 | .mypy_cache/ 5 | *.egg-info/ 6 | .tox/ 7 | .coverage 8 | htmlcov/ 9 | coverage.xml 10 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.2.1 (2024-01-16) 4 | 5 | A specified content type is now ignored if it is not a valid MIME type. 6 | 7 | ## 0.2.0 (2023-08-31) 8 | 9 | Dropped Python 3.6 support, added official Python 3.10, 3.11 and PyPy support. 10 | 11 | A specified content type is no longer ignored for being a variant of 12 | `text/plain`, unless it is one of the 4 specific variants affected by the old 13 | Apache bug [13986](https://bz.apache.org/bugzilla/show_bug.cgi?id=13986). 14 | 15 | ## 0.1.0 (2022-06-21) 16 | 17 | Initial release. 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 Akshay Sharma 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xtractmime 2 | 3 | `xtractmime` is a [BSD-licensed](https://opensource.org/licenses/BSD-3-Clause) 4 | Python 3.7+ implementation of the [MIME Sniffing 5 | Standard](https://mimesniff.spec.whatwg.org/). 
6 | 7 | Install from [`PyPI`](https://pypi.python.org/pypi/xtractmime): 8 | 9 | ``` 10 | pip install xtractmime 11 | ``` 12 | 13 | --- 14 | 15 | ## Basic usage 16 | 17 | Below are some simple examples of using `xtractmime.extract_mime`: 18 | 19 | ```python 20 | >>> from xtractmime import extract_mime 21 | >>> extract_mime(b'Sample text content') 22 | b'text/plain' 23 | >>> extract_mime(b'', content_types=(b'text/html',)) 24 | b'text/html' 25 | ``` 26 | 27 | You can also check whether a MIME type belongs to a specific MIME type group using 28 | the functions included in `xtractmime.mimegroups`: 29 | 30 | ```python 31 | >>> from xtractmime.mimegroups import is_html_mime_type, is_image_mime_type 32 | >>> mime_type = b'text/html' 33 | >>> is_html_mime_type(mime_type) 34 | True 35 | >>> is_image_mime_type(mime_type) 36 | False 37 | ``` 38 | 39 | --- 40 | 41 | ## API Reference 42 | 43 | ### function `xtractmime.extract_mime(*args, **kwargs) -> Optional[bytes]` 44 | **Parameters:** 45 | 46 | * `body: bytes` 47 | * `content_types: Optional[Tuple[bytes]] = None` 48 | * `http_origin: bool = True` 49 | * `no_sniff: bool = False` 50 | * `extra_types: Optional[Tuple[Tuple[bytes, bytes, Optional[Set[bytes]], bytes], ...]] = None` 51 | * `supported_types: Optional[Set[bytes]] = None` 52 | 53 | Return the [MIME type essence](https://mimesniff.spec.whatwg.org/#mime-type-essence) (e.g. `text/html`) matching the input data, or 54 | `None` if no match can be found. 55 | 56 | The `body` parameter is the byte sequence whose MIME type is to be determined. `xtractmime` only considers the first few 57 | bytes of the `body` and the specific number of bytes read is defined in the `xtractmime.RESOURCE_HEADER_BUFFER_LENGTH` constant. 58 | 59 | `content_types` is a tuple of MIME types given in the resource metadata. For example, for resources retrieved via HTTP, users should pass the tuple of MIME types mentioned in the `Content-Type` header.
60 | 61 | `http_origin` indicates if the resource has been retrieved via HTTP (`True`, default) or not (`False`). 62 | 63 | `no_sniff` is a flag which is *`True`* if the user agent does not wish to 64 | perform sniffing on the resource and *`False`* (by default) otherwise. Users may want to set 65 | this parameter to *`True`* if the [`X-Content-Type-Options`](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Content-Type-Options) response header is set to `nosniff`. For more info, see [here](https://mimesniff.spec.whatwg.org/#no-sniff-flag). 66 | 67 | `extra_types` is a tuple of patterns to support detecting additional MIME types. Each entry in the tuple should follow the format 68 | **(Byte Pattern, Pattern Mask, Leading Bytes, MIME type)**: 69 | 70 | * **Byte Pattern** is a byte sequence to compare with the first few bytes (``xtractmime.RESOURCE_HEADER_BUFFER_LENGTH``) of the `body`. 71 | * **Pattern Mask** is a byte sequence that indicates the significance of **Byte Pattern** bytes: `b"\xff"` indicates the matching byte is strictly significant, `b"\xdf"` indicates that the byte is significant in an ASCII case-insensitive way, and `b"\x00"` indicates that the byte is not significant. 72 | * **Leading Bytes** is a set of bytes to be ignored while matching the leading bytes in the content. 73 | * **MIME type** is the MIME type to return if the pattern matches. 74 | 75 | **Sample `extra_types`:** 76 | ```python 77 | extra_types = ((b'test', b'\xff\xff\xff\xff', None, b'text/test'), ...) 78 | ``` 79 | 80 | --- 81 | **NOTE** 82 | 83 | *Be careful while using the `extra_types` argument, as it may introduce some privilege escalation vulnerabilities for `xtractmime`. For more info, see [here](https://mimesniff.spec.whatwg.org/#ref-for-mime-type%E2%91%A1%E2%91%A8).* 84 | 85 | --- 86 | 87 | Optional `supported_types` is a set of all [MIME types supported by the user agent](https://mimesniff.spec.whatwg.org/#supported-by-the-user-agent).
If `supported_types` is not 88 | specified, all MIME types are assumed to be supported. Using this parameter can improve the performance of `xtractmime`. 89 | 90 | ### function `xtractmime.is_binary_data(input_bytes: bytes) -> bool` 91 | 92 | Return *`True`* if the provided byte sequence contains any binary data bytes, else *`False`* 93 | 94 | ### MIME type group functions 95 | 96 | The following functions return `True` if a given MIME type belongs to a certain 97 | [MIME type group](https://mimesniff.spec.whatwg.org/#mime-type-groups), or 98 | `False` otherwise: 99 | ``` 100 | xtractmime.mimegroups.is_archive_mime_type(mime_type: bytes) -> bool 101 | xtractmime.mimegroups.is_audio_video_mime_type(mime_type: bytes) -> bool 102 | xtractmime.mimegroups.is_font_mime_type(mime_type: bytes) -> bool 103 | xtractmime.mimegroups.is_html_mime_type(mime_type: bytes) -> bool 104 | xtractmime.mimegroups.is_image_mime_type(mime_type: bytes) -> bool 105 | xtractmime.mimegroups.is_javascript_mime_type(mime_type: bytes) -> bool 106 | xtractmime.mimegroups.is_json_mime_type(mime_type: bytes) -> bool 107 | xtractmime.mimegroups.is_scriptable_mime_type(mime_type: bytes) -> bool 108 | xtractmime.mimegroups.is_xml_mime_type(mime_type: bytes) -> bool 109 | xtractmime.mimegroups.is_zip_mime_type(mime_type: bytes) -> bool 110 | ``` 111 | **Example** 112 | ```python 113 | >>> from xtractmime.mimegroups import is_html_mime_type, is_image_mime_type, is_zip_mime_type 114 | >>> mime_type = b'text/html' 115 | >>> is_html_mime_type(mime_type) 116 | True 117 | >>> is_image_mime_type(mime_type) 118 | False 119 | >>> is_zip_mime_type(mime_type) 120 | False 121 | ``` 122 | 123 | 124 | ## Changelog 125 | 126 | See the [changelog](CHANGELOG.md) 127 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 99 3 | 
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, W503 3 | max-line-length = 99 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | with open("README.md", "r", encoding="utf-8") as desc: 5 | long_description = desc.read() 6 | 7 | setuptools.setup( 8 | name="xtractmime", 9 | version="0.2.1", 10 | license="BSD", 11 | description=( 12 | "Implementation of the MIME Sniffing standard (https://mimesniff.spec.whatwg.org/)" 13 | ), 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | author="Akshay Sharma", 17 | author_email="akshaysharmajs@gmail.com", 18 | url="https://github.com/scrapy/xtractmime", 19 | packages=["xtractmime"], 20 | python_requires=">=3.7", 21 | classifiers=[ 22 | "Development Status :: 1 - Planning", 23 | "License :: OSI Approved :: BSD License", 24 | "Programming Language :: Python", 25 | "Programming Language :: Python :: 3.7", 26 | "Programming Language :: Python :: 3.8", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: Implementation :: CPython", 31 | "Programming Language :: Python :: Implementation :: PyPy", 32 | "Framework :: Scrapy", 33 | "Intended Audience :: Developers", 34 | "Topic :: Internet :: WWW/HTTP", 35 | "Topic :: Software Development :: Libraries :: Application Frameworks", 36 | "Topic :: Software Development :: Libraries :: Python Modules", 37 | ], 38 | ) 39 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/scrapy/xtractmime/26757354487c1c8ce8c8810fd0404f6c76e8519e/tests/__init__.py -------------------------------------------------------------------------------- /tests/files/NonID3.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/xtractmime/26757354487c1c8ce8c8810fd0404f6c76e8519e/tests/files/NonID3.mp3 -------------------------------------------------------------------------------- /tests/files/foo.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/xtractmime/26757354487c1c8ce8c8810fd0404f6c76e8519e/tests/files/foo.exe -------------------------------------------------------------------------------- /tests/files/foo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy/xtractmime/26757354487c1c8ce8c8810fd0404f6c76e8519e/tests/files/foo.gif -------------------------------------------------------------------------------- /tests/files/foo.html: -------------------------------------------------------------------------------- 1 | 2 |
3 |A blank HTML document for testing purposes.
14 | 15 | 16 | 17 | 18 |