├── .coveragerc ├── .flake8 ├── .github └── workflows │ ├── pythonpublish.yml │ └── tests.yml ├── .gitignore ├── .pep8speaks.yml ├── .pre-commit-config.yaml ├── AUTHORS.rst ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── puremagic ├── __init__.py ├── __main__.py ├── magic_data.json ├── main.py ├── py.typed └── scanners │ ├── __init__.py │ ├── helpers.py │ ├── json_scanner.py │ ├── pdf_scanner.py │ ├── python_scanner.py │ ├── text_scanner.py │ └── zip_scanner.py ├── pyproject.toml ├── requirements-test.txt ├── scripts └── parse_ftk_kessler_sigs.py ├── setup.cfg ├── setup.py └── test ├── __init__.py ├── common.py ├── resources ├── archive │ ├── test.7z │ ├── test.big_endian.pcapng │ ├── test.bz2 │ ├── test.cb7 │ ├── test.cbt │ ├── test.cbz │ ├── test.fb2.zip │ ├── test.fbz │ ├── test.gz │ ├── test.little_endian.pcapng │ ├── test.pcapng │ ├── test.rar │ ├── test.tar │ ├── test.txt.lz4 │ ├── test.txt.zst │ ├── test.xz │ └── test.zip ├── audio │ ├── test.aac │ ├── test.aif │ ├── test.mp3 │ └── test.wav ├── fake_file ├── images │ ├── test.avif │ ├── test.bmp │ ├── test.gif │ ├── test.ico │ ├── test.jpg │ ├── test.png │ ├── test.psd │ ├── test.svg │ ├── test.tga │ ├── test.tif │ ├── test.webp │ ├── test.xcf │ ├── test_varriant.svg │ └── test_varriant_2.svg ├── media │ ├── test (single).vmdk │ ├── test (split).vmdk │ ├── test.iso │ ├── test.mdf │ └── test.swf ├── office │ ├── test.doc │ ├── test.docm │ ├── test.docx │ ├── test.dotm │ ├── test.dotx │ ├── test.fb2 │ ├── test.odp │ ├── test.ods │ ├── test.odt │ ├── test.pdf │ ├── test.potm │ ├── test.potx │ ├── test.ppt │ ├── test.pptm │ ├── test.pptx │ ├── test.txt │ ├── test.xls │ ├── test.xlsb │ ├── test.xlsm │ ├── test.xlsx │ ├── test.xltm │ └── test.xltx ├── system │ ├── test.exe │ ├── test.json │ ├── test.puremagic_multi_footer │ └── test.py └── video │ ├── test.3g2 │ ├── test.avi │ ├── test.flv │ ├── test.jxsv │ ├── test.mkv │ ├── test.mp4 │ ├── test.mpg │ └── test.wmv ├── test_common_extensions.py ├── 
test_main.py └── test_scanners.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | omit = 3 | */python?.?/* 4 | */site-packages/* 5 | */test/* 6 | */pypy/* 7 | */venv/* 8 | */.*/* 9 | */*.egg-info/* 10 | */.mypy_cache/* 11 | */.pytest_cache/* 12 | exclude_lines = 13 | command_line_entry 14 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | exclude = .git,.idea,__pycache__,.gitignore,venv,.github,build,dist,test 4 | ignore = 5 | # E203 whitespace before ':' 6 | # black will insert some non-E203-compliant whitespace 7 | E203, 8 | # W503 line break before binary operator 9 | # black will inserts non-W503-compliant line breaks 10 | W503, 11 | # F401 imported but unused 12 | # for __version__, __author__ 13 | F401, 14 | # F403 used; unable to detect undefined names 15 | # When importing start but some names undefined 16 | F403, 17 | T001 18 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.12' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | 
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Tests 5 | 6 | on: 7 | push: 8 | branches: [ master, develop ] 9 | pull_request: 10 | branches: [ master, develop ] 11 | 12 | jobs: 13 | build: 14 | 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | os: [ubuntu-latest] 19 | python-version: ["3.12", "3.13"] 20 | include: 21 | - os: macos-latest 22 | python-version: '3.13' 23 | - os: windows-latest 24 | python-version: '3.13' 25 | runs-on: ${{ matrix.os }} 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | allow-prereleases: true 34 | cache: 'pip' 35 | 36 | - name: Install dependencies 37 | run: | 38 | python -m pip install --upgrade pip 39 | pip install coveralls flake8 setuptools wheel twine 40 | pip install -r requirements-test.txt --upgrade 41 | pip install black==24.10.0 42 | 43 | - name: Verify Code with Black 44 | run: | 45 | black --check puremagic test 46 | 47 | - name: Lint with flake8 48 | run: | 49 | # stop the tests if there are linting errors 50 | flake8 puremagic --count --show-source --statistics 51 | 52 | - name: Test with pytest 53 | run: | 54 | python -m pytest --cov=puremagic test/ 55 | 56 | - name: Check distribution log description 57 | shell: bash 58 | run: | 59 | python setup.py sdist bdist_wheel 60 | twine check dist/* 61 | ls -lah 
"dist/" 62 | WHL=$(find dist -name *.whl -print -quit) 63 | echo ${WHL} 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version 
control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # static files generated from Django application using `collectstatic` 142 | media 143 | static 144 | 145 | .idea/ 146 | -------------------------------------------------------------------------------- /.pep8speaks.yml: -------------------------------------------------------------------------------- 1 | scanner: 2 | diff_only: True 3 | linter: flake8 4 | 5 | flake8: 6 | max-line-length: 120 # Default is 79 in PEP 8 7 | ignore: # Errors and warnings to ignore 8 | - F401 9 | - F403 10 | - F405 11 | - W503 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | # Identify invalid files 6 | - id: check-ast 7 | - id: check-yaml 8 | - id: check-json 9 | - id: check-toml 10 | # git checks 11 | - id: check-merge-conflict 12 | - id: 
check-added-large-files 13 | exclude: ^test/resources/.+ 14 | - id: detect-private-key 15 | - id: check-case-conflict 16 | # Python checks 17 | - id: check-docstring-first 18 | - id: debug-statements 19 | - id: requirements-txt-fixer 20 | - id: fix-byte-order-marker 21 | # General quality checks 22 | - id: mixed-line-ending 23 | args: [--fix=lf] 24 | exclude: ^test/resources/ 25 | - id: trailing-whitespace 26 | args: [--markdown-linebreak-ext=md] 27 | exclude: | 28 | (?x)^( 29 | ^test/resources/.+| 30 | ^puremagic/magic_data.json 31 | )$ 32 | - id: check-executables-have-shebangs 33 | - id: end-of-file-fixer 34 | exclude: ^test/resources/.+ 35 | 36 | 37 | - repo: https://github.com/astral-sh/ruff-pre-commit 38 | rev: v0.7.2 39 | hooks: 40 | - id: ruff 41 | 42 | - repo: https://github.com/ambv/black 43 | rev: 24.10.0 44 | hooks: 45 | - id: black 46 | 47 | - repo: https://github.com/pre-commit/mirrors-mypy 48 | rev: 'v1.13.0' 49 | hooks: 50 | - id: mypy 51 | 52 | - repo: https://github.com/tox-dev/pyproject-fmt 53 | rev: v2.5.0 54 | hooks: 55 | - id: pyproject-fmt 56 | 57 | - repo: https://github.com/abravalheri/validate-pyproject 58 | rev: v0.22 59 | hooks: 60 | - id: validate-pyproject 61 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | puremagic is written and maintained by Chris Griffith . 2 | 3 | A big thank you to everyone that has helped! 4 | 5 | - Jonathan Barratt (reduxionist) 6 | - jiel 7 | - Victor Domingos (victordomingos) 8 | - David Shunfenthal (dshunfen) 9 | - Andrey Zakharevich (andreyz4k) 10 | - Sergey Ponomarev (stokito) 11 | - andrewpmk 12 | - bannsec 13 | - Don Tsang (DonaldTsang) 14 | - Oleksandr (msdinit) 15 | - Robbert Korving (robkorv) 16 | - Sean Stallbaum (CSBaum) 17 | - phithon 18 | - Gerhard Schmidt 19 | - R. 
Singh (Gr3atWh173) 20 | - Andy (NebularNerd) 21 | - Raphaël Vinot (Rafiot) 22 | - Sebastian Kreft (sk-) 23 | - William Bonnaventure (Aztorius) 24 | - Nicholas Bollweg (bollwyvl) 25 | - Péter (peterekepeter) 26 | - mara004 27 | - Markus (tvads-markus) 28 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | Changelog 2 | ========= 3 | 4 | Version 2.0.0 5 | ------------- 6 | 7 | - Adding deep scan for improved accuracy #102 #94 #70 #69 #12 #3 8 | - Changing to full semantic versioning to be able to denote bugfixes vs minor features 9 | - Removing support for python 3.7, 3.8, 3.9, 3.10 and 3.11; please stick to the 1.x release chain to support older versions 10 | 11 | Version 1.29 12 | ------------ 13 | 14 | - Changing to publishing to pypi with Python 3.9 15 | - Fixing #105 fsspec/gcsfs throws a ValueError instead of an OSError (thanks to Markus) 16 | - Fixing github actions due to updates 17 | 18 | Version 1.28 19 | ------------ 20 | 21 | - Adding #99 New file support (thanks to Andy - NebularNerd) 22 | - Fixing #100 FITS files no longer had mime type (thanks to ejeschke) 23 | 24 | Version 1.27 25 | ------------ 26 | 27 | - Adding new verbose output to command line with `-v` or `--verbose` 28 | - Adding #92 include py.typed in sdist (thanks to Nicholas Bollweg - bollwyvl) 29 | - Adding #93 Improve PDF file detection, fix json description (thanks to Péter - peterekepeter) 30 | - Fixing #96 #86 stream does not work properly on opened small files (thanks to Felipe Lema and Andy - NebularNerd) 31 | - Removing expected invalid WinZip signature 32 | 33 | Version 1.26 34 | ------------ 35 | 36 | - Adding #87 sndhdr update and HD/CD/DVD Image files (thanks to Andy - NebularNerd) 37 | - Adding #88 Add .caf mime type (thanks to William Bonnaventure) 38 | - Fixing #89 add py.typed to package_data (thanks to Sebastian Kreft) 39 | 40 | Version 1.25 41 | 
------------ 42 | 43 | - Changing to support Python 3.7 again 44 | 45 | Version 1.24 46 | ------------ 47 | 48 | - Adding #72 #75 #76 #81 `.what()` to be a drop in replacement for `imghdr.what()` (thanks to Christian Clauss and Andy - NebularNerd) 49 | - Adding #67 Test on Python 3.13 beta (thanks to Christian Clauss) 50 | - Adding #77 from __future__ import annotations (thanks to Christian Clauss) 51 | - Changing all HTML extensions to full `.html` 52 | - Fixing #66 Confidence sorting (thanks to Andy - NebularNerd) 53 | 54 | Version 1.23 55 | ------------ 56 | 57 | - Fixing #32 MP3 Detection improvements (thanks to Andy - NebularNerd and Sander) 58 | 59 | Version 1.22 60 | ------------ 61 | 62 | - Adding #52 magic data for JPEG XS (thanks to Andy - NebularNerd) 63 | - Adding #57 Multi-part checks with negative offsets (thanks to Andy - NebularNerd) 64 | - Fixing #60 encoding warning (thanks to Andy - NebularNerd and Jason R. Coombs) 65 | 66 | Version 1.21 67 | ------------ 68 | 69 | - Adding #50 details for ZSoft .pcx files (thanks to Andy - NebularNerd) 70 | - Adding #51 details for JXL files (thanks to Andy - NebularNerd) 71 | - Adding #54 missing py.typed file (thanks to Raphaël Vinot) 72 | - Fixing #53 magic data for GIF images (thanks to Andy - NebularNerd) 73 | 74 | Version 1.20 75 | ------------ 76 | 77 | - Adding support for multi-part header checks (thanks to Andy) 78 | - Fixing matches for webp (thanks to Nicolas Wicht) 79 | - Fixing matches for epub (thanks to Alexander Walters) 80 | 81 | Version 1.15 82 | ------------ 83 | 84 | - Adding fix for resetting the stream after reading part of it (thanks to R. 
Singh) 85 | 86 | Version 1.14 87 | ------------ 88 | 89 | - Adding generic extension mapping for common file types 90 | - Adding #36 details to readme about magic_stream and magic_string (thanks to Martin) 91 | - Fixing multiple bad extensions and mimetypes 92 | - Removing bad entry for 3gp5 selecting multiple things 93 | 94 | Version 1.13 95 | ------------ 96 | 97 | - Adding support for Path for filename 98 | - Adding details for mp4 99 | - Adding details for avif and heif images 100 | 101 | Version 1.12 102 | ------------ 103 | 104 | - Adding #38 webp mimetype (thanks to phith0n) 105 | - Adding #37 SVG images (thanks to Gerhard Schmidt) 106 | - Adding missing mimetypes for aac, vmdk, wmv and xcf 107 | 108 | Version 1.11 109 | ------------ 110 | 111 | - Adding #34 test files to build (thanks to James French) 112 | - Adding #33 install from pypi details (thanks to Sander) 113 | - Removing #31 unsupported Python items in setup.py (thanks to Safihre) 114 | 115 | Version 1.10 116 | ------------ 117 | 118 | - Fixing how confidence works (thanks to Sean Stallbaum) 119 | 120 | Version 1.9 121 | ----------- 122 | 123 | - Adding new methods for stream handling (from_stream, magic_stream) (thanks to Robbert Korving) 124 | 125 | Version 1.8 126 | ----------- 127 | 128 | - Adding support for various other files (thanks to Don Tsang) 129 | - Adding missing mime types (thanks to Oleksandr) 130 | 131 | Version 1.7 132 | ----------- 133 | 134 | - Adding support for PCAPNG files (thanks to bannsec) 135 | - Adding support for numerous other files updated by Gary C. 
Kessler 136 | - Adding script for parsing FTK GCK sigs 137 | - Changing test suites to github workflows instead of TravisCI 138 | - Removing official support, new packages and test for python 2 139 | 140 | Version 1.6 141 | ----------- 142 | 143 | - Adding support for LZ4 and ZSTD archives (Thanks to Sergey Ponomarev) 144 | - Adding support for more office formats (Thanks to andrewpmk) 145 | 146 | Version 1.5 147 | ----------- 148 | 149 | - Adding full magic info in results (Thanks to David Shunfenthal) 150 | - Fixing magic_data.json not being added to sdist dist (Thanks to Andrey Zakharevich) 151 | 152 | Version 1.4 153 | ----------- 154 | 155 | - Fixing how `__main__` was implemented (Thanks to Victor Domingos) 156 | 157 | Version 1.3 158 | ----------- 159 | 160 | - Adding filename extension hinting for string (Thanks to jiel) 161 | - Adding open office MIME types (Thanks to jiel) 162 | 163 | Version 1.2 164 | ----------- 165 | 166 | - Adding setup file 167 | - Adding changelog 168 | - Adding CI tests support for 3.4, 3.5, 3.6 and pypy 169 | - Adding more basic documentation 170 | - Adding magic detection from https://www.freedesktop.org/wiki/Specifications/shared-mime-info-spec/ 171 | - Removing testing on 3.2 due to Travis CI and coverage not getting along 172 | - Changing to argparse instead of optparse 173 | - Changing magic_file to not raise error on empty, simply provide an empty list 174 | - Changing magic_data py file to a json file so it's easier to understand and modify 175 | - Updating data to be a python file, so there is no dangerous eval 176 | 177 | 178 | Version 1.1 179 | ----------- 180 | 181 | - Adding tests 182 | - Changing to MIT License 183 | 184 | Version 1.0 185 | ----------- 186 | 187 | - Initial release 188 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013-2025 
Chris Griffith 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include puremagic/*.json 2 | include puremagic/py.typed 3 | include puremagic/scanners/*.py 4 | include LICENSE 5 | include AUTHORS.rst 6 | include CHANGELOG.md 7 | graft test 8 | global-exclude *.py[cod] 9 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | puremagic 3 | ========= 4 | 5 | puremagic is a pure python module that will identify a file based off 6 | its magic numbers. 7 | 8 | It is designed to be minimalistic and inherently cross platform 9 | compatible. 
It is also designed to be a stand in for python-magic, it 10 | incorporates the functions from\_file(filename[, mime]) and 11 | from\_string(string[, mime]) however the magic\_file() and 12 | magic\_string() are more powerful and will also display confidence and 13 | duplicate matches. 14 | 15 | It does NOT try to match files off non-magic string. In other words it 16 | will not search for a string within a certain window of bytes like 17 | others might. 18 | 19 | Advantages over using a wrapper for 'file' or 'libmagic': 20 | 21 | - Faster 22 | - Lightweight 23 | - Cross platform compatible 24 | - No dependencies 25 | 26 | Disadvantages: 27 | 28 | - Does not have as many file types 29 | - No multilingual comments 30 | - Duplications due to small or reused magic numbers 31 | 32 | (Help fix the first two disadvantages by contributing!) 33 | 34 | Compatibility 35 | ~~~~~~~~~~~~~ 36 | 37 | - Python 3.12+ 38 | 39 | For use with 3.7 use the 1.x branch. 40 | 41 | Using github ci to run continuous integration tests on listed platforms. 42 | 43 | Install from PyPI 44 | ----------------- 45 | 46 | .. code:: bash 47 | 48 | $ pip install puremagic 49 | 50 | On linux environments, you may want to be clear you are using python3 51 | 52 | .. code:: bash 53 | 54 | $ python3 -m pip install puremagic 55 | 56 | 57 | Install from source 58 | ------------------- 59 | 60 | In either a virtualenv or globally, simply run: 61 | 62 | .. code:: bash 63 | 64 | $ python setup.py install 65 | 66 | Usage 67 | ----- 68 | 69 | "from_file" will return the most likely file extension. "magic_file" 70 | will give you every possible result it finds, as well as the confidence. 71 | 72 | .. 
code:: python 73 | 74 | import puremagic 75 | 76 | filename = "test/resources/images/test.gif" 77 | 78 | ext = puremagic.from_file(filename) 79 | # '.gif' 80 | 81 | puremagic.magic_file(filename) 82 | # [['.gif', 'image/gif', 'Graphics interchange format file (GIF87a)', 0.7], 83 | # ['.gif', '', 'GIF file', 0.5]] 84 | 85 | With "magic_file" it gives each match, highest confidence first: 86 | 87 | - possible extension(s) 88 | - mime type 89 | - description 90 | - confidence (All headers have to perfectly match to make the list, 91 | however this orders it by longest header, therefore most precise, 92 | first) 93 | 94 | If you already have a file open, or raw byte string, you could also use: 95 | 96 | * from_string 97 | * from_stream 98 | * magic_string 99 | * magic_stream 100 | 101 | .. code:: python 102 | 103 | with open(r"test\resources\video\test.mp4", "rb") as file: 104 | print(puremagic.magic_stream(file)) 105 | 106 | # [PureMagicWithConfidence(byte_match=b'ftypisom', offset=4, extension='.mp4', mime_type='video/mp4', name='MPEG-4 video', confidence=0.8), 107 | # PureMagicWithConfidence(byte_match=b'iso2avc1mp4', offset=20, extension='.mp4', mime_type='video/mp4', name='MP4 Video', confidence=0.8)] 108 | 109 | Script 110 | ------ 111 | 112 | *Usage* 113 | 114 | .. code:: bash 115 | 116 | $ python -m puremagic [options] filename ... 117 | 118 | *Examples* 119 | 120 | .. code:: bash 121 | 122 | $ python -m puremagic test/resources/images/test.gif 123 | 'test/resources/images/test.gif' : .gif 124 | 125 | $ python -m puremagic -m test/resources/images/test.gif test/resources/audio/test.mp3 126 | 'test/resources/images/test.gif' : image/gif 127 | 'test/resources/audio/test.mp3' : audio/mpeg 128 | 129 | imghdr replacement 130 | ------------------ 131 | 132 | If you are looking for a replacement for the standard library's deprecated imghdr, you can use `puremagic.what()` 133 | 134 | .. 
code:: python 135 | 136 | import puremagic 137 | 138 | filename = "test/resources/images/test.gif" 139 | 140 | ext = puremagic.what(filename) 141 | # 'gif' 142 | 143 | FAQ 144 | --- 145 | 146 | *The file type is actually X but it's showing up as Y with higher 147 | confidence?* 148 | 149 | This can happen when the file's signature happens to match a subset of a 150 | file standard. The subset signature will be longer, therefore report 151 | with greater confidence, because it will have both the base file type 152 | signature plus the additional subset one. 153 | 154 | 155 | Acknowledgements 156 | ---------------- 157 | 158 | Gary C. Kessler 159 | 160 | For use of his File Signature Tables, available at: 161 | https://filesig.search.org/ 162 | 163 | Freedesktop.org 164 | 165 | For use of their shared-mime-info file, available at: 166 | https://cgit.freedesktop.org/xdg/shared-mime-info/ 167 | 168 | License 169 | ------- 170 | 171 | MIT Licenced, see LICENSE, Copyright (c) 2013-2025 Chris Griffith 172 | -------------------------------------------------------------------------------- /puremagic/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from puremagic.main import * 3 | from puremagic.main import __author__, __version__ 4 | -------------------------------------------------------------------------------- /puremagic/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from puremagic.main import command_line_entry 3 | 4 | command_line_entry() 5 | -------------------------------------------------------------------------------- /puremagic/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | puremagic is a pure python module that will identify a file based off it's 4 | magic numbers. 
class PureError(LookupError):
    """Raised when the supplied data cannot be matched to a known file type."""
def _confidence(matches, ext=None) -> list[PureMagicWithConfidence]:
    """Attach a rough confidence score to every match and rank the results.

    The score is one tenth per matched byte, capped at 0.8. A non-empty match
    whose extension equals the ``ext`` hint is promoted to 0.9. When nothing
    matched but a hint was supplied, extension-only entries with that
    extension are returned at confidence 0.1 instead.

    Args:
        matches: Iterable of ``PureMagic`` rows that matched the data.
        ext: Optional file extension hint (compared against ``row.extension``).

    Returns:
        ``PureMagicWithConfidence`` rows, highest confidence first; ties are
        ordered by longer ``byte_match`` first.
    """

    def _score(row) -> float:
        # 0.0-0.8 depending on how many bytes the signature covered.
        base = min(len(row.byte_match), 8) / 10
        if base >= 0.1 and ext and ext == row.extension:
            return 0.9
        return base

    ranked = [PureMagicWithConfidence(confidence=_score(row), **row._asdict()) for row in matches]

    if ext and not ranked:
        # Nothing matched by signature: fall back to extension-only knowledge.
        ranked = [
            PureMagicWithConfidence(confidence=0.1, **fallback._asdict())
            for fallback in extension_only_array
            if fallback.extension == ext
        ]

    ranked.sort(key=lambda row: (row.confidence, len(row.byte_match)), reverse=True)
    return ranked
def _magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=None) -> str:
    """Return the best-guess extension (or MIME type) for the given bytes.

    :param header: leading bytes of the content; must not be empty
    :param footer: trailing bytes of the content
    :param mime: when True return the MIME type instead of the extension
    :param ext: optional extension hint forwarded to ``_identify_all``
    :param filename: optional path of the file on disk; when given, and the
        ``PUREMAGIC_DEEPSCAN`` environment variable is not ``"0"``, the
        deep scanners are run on top of the signature matches
    :return: extension or MIME type of the highest-ranked match
    :raises ValueError: if ``header`` is empty
    :raises PureError: if nothing could be identified
    """
    if not header:
        raise ValueError("Input was empty")
    infos = _identify_all(header, footer, ext)
    if filename and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
        results = _run_deep_scan(infos, filename, header, footer, raise_on_none=True)
        if results:
            best = results[0]
            # A deep-scan result with an empty extension means "could not
            # be classified", which callers must see as a failure.
            if best.extension == "":
                raise PureError("Could not identify file")
            return best.mime_type if mime else best.extension
    if not infos:
        raise PureError("Could not identify file")
    info = infos[0]
    if mime:
        return info.mime_type
    # Fixed: the list fallback used ``info[0].extension``, which indexes the
    # first field of the match record instead of the extension list itself.
    if isinstance(info.extension, list):
        return info.extension[0]
    return info.extension
| stream.seek(0) 244 | foot = stream.read() 245 | stream.seek(0) 246 | return head, foot 247 | 248 | 249 | def ext_from_filename(filename: os.PathLike | str) -> str: 250 | """Scan a filename for its extension. 251 | 252 | :param filename: string of the filename 253 | :return: the extension off the end (empty string if it can't find one) 254 | """ 255 | try: 256 | base, ext = str(filename).lower().rsplit(".", 1) 257 | except ValueError: 258 | return "" 259 | ext = f".{ext}" 260 | all_exts = [x.extension for x in chain(magic_header_array, magic_footer_array)] 261 | 262 | if base[-4:].startswith("."): 263 | # For double extensions like .tar.gz 264 | long_ext = base[-4:] + ext 265 | if long_ext in all_exts: 266 | return long_ext 267 | return ext 268 | 269 | 270 | def from_file(filename: os.PathLike | str, mime: bool = False) -> str: 271 | """Opens file, attempts to identify content based 272 | off magic number and will return the file extension. 273 | If mime is True it will return the mime type instead. 274 | 275 | :param filename: path to file 276 | :param mime: Return mime, not extension 277 | :return: guessed extension or mime 278 | """ 279 | 280 | head, foot = _file_details(filename) 281 | return _magic(head, foot, mime, ext_from_filename(filename), filename=filename) 282 | 283 | 284 | def from_string(string: str | bytes, mime: bool = False, filename: os.PathLike | str | None = None) -> str: 285 | """Reads in string, attempts to identify content based 286 | off magic number and will return the file extension. 287 | If mime is True it will return the mime type instead. 288 | If filename is provided it will be used in the computation. 
def from_stream(stream, mime: bool = False, filename: os.PathLike | str | None = None) -> str:
    """Identify the content of a readable, seekable binary stream.

    The head and tail of the stream are sampled by ``_stream_details`` and
    matched against the known magic numbers. The filename, when supplied,
    is used only to derive an extension hint.

    :param stream: stream representation to check
    :param mime: Return mime, not extension
    :param filename: original filename
    :return: guessed extension or mime
    """
    header, footer = _stream_details(stream)
    extension = None
    if filename:
        extension = ext_from_filename(filename)
    return _magic(header, footer, mime, extension)
def magic_string(string, filename: os.PathLike | str | None = None) -> list[PureMagicWithConfidence]:
    """Return every signature match for in-memory content, best match first.

    If filename is provided its extension is used as a hint when ranking.

    :param string: content to check; ``str`` input is encoded as UTF-8,
        the same way ``from_string`` treats it
    :param filename: original filename
    :return: list of possible matches, highest confidence first
    :raises ValueError: if ``string`` is empty
    """
    if not string:
        raise ValueError("Input was empty")
    if isinstance(string, str):
        # Signatures are compared as bytes, so text has to be encoded first;
        # without this a ``str`` argument could never match any magic number.
        string = string.encode("utf-8")
    head, foot = _string_details(string)
    ext = ext_from_filename(filename) if filename else None
    info = _identify_all(head, foot, ext)
    info.sort(key=lambda x: x.confidence, reverse=True)
    return info
def _single_deep_scan(
    bytes_match: bytes | bytearray | None,
    filename: os.PathLike | str,
    head=None,
    foot=None,
):
    """Run the deep scanner that fits one signature match.

    :param bytes_match: magic bytes of the match being refined, or None when
        there was no usable signature match
    :param filename: path of the file on disk (coerced to ``Path`` if it is
        not already path-like)
    :param head: leading bytes of the file, passed through to the scanner
    :param foot: trailing bytes of the file, passed through to the scanner
    :return: the scanner's result, or None when deep scanning is disabled
        via ``PUREMAGIC_DEEPSCAN=0`` or no scanner recognised the file
    """
    if os.getenv("PUREMAGIC_DEEPSCAN") == "0":
        return None
    if not isinstance(filename, os.PathLike):
        filename = Path(filename)

    # A signature owned by a dedicated scanner is answered by that scanner
    # alone, even when its answer is None.
    if bytes_match == zip_scanner.match_bytes:
        return zip_scanner.main(filename, head, foot)
    if bytes_match == pdf_scanner.match_bytes:
        return pdf_scanner.main(filename, head, foot)

    # First match wins. The catch-all text scanner is deliberately not in
    # this list; it is run separately by _catch_all_deep_scan.
    fallback_scanners = (pdf_scanner, python_scanner, json_scanner)
    hits = (scanner.main(filename, head, foot) for scanner in fallback_scanners)
    return next((hit for hit in hits if hit), None)
def command_line_entry(*args):
    """Command line interface: print the detected type of each given file.

    :param args: argument strings to parse; when none are given
        ``sys.argv[1:]`` is used instead
    """
    import sys
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description=(
            # Fixed: the two sentences used to be joined without a space.
            "puremagic is a pure python file identification module. "
            "It looks for matching magic numbers in the file to locate the file type."
        )
    )
    parser.add_argument(
        "-m",
        "--mime",
        action="store_true",
        dest="mime",
        help="Return the mime type instead of file type",
    )
    # "--v" is kept so existing invocations keep working; "--verbose" is the
    # conventional long spelling that was missing.
    parser.add_argument(
        "-v",
        "--v",
        "--verbose",
        action="store_true",
        dest="verbose",
        help="Print verbose output",
    )
    parser.add_argument("files", nargs="+")
    args = parser.parse_args(args if args else sys.argv[1:])

    for fn in args.files:
        if not os.path.exists(fn):
            print(f"File '{fn}' does not exist!")
            continue
        try:
            print(f"'{fn}' : {from_file(fn, args.mime)}")
        except (PureError, ValueError):
            # ValueError is what the identification code raises for an empty
            # file; report it like any other unidentified file instead of
            # aborting the remaining files with a traceback.
            print(f"'{fn}' : could not be Identified")
            continue
        if args.verbose:
            matches = magic_file(fn)
            print(f"Total Possible Matches: {len(matches)}")
            for i, result in enumerate(matches):
                if i == 0:
                    print("\n\tBest Match")
                else:
                    print(f"\tAlternative Match #{i}")
                print(f"\tName: {result.name}")
                print(f"\tConfidence: {int(result.confidence * 100)}%")
                print(f"\tExtension: {result.extension}")
                print(f"\tMime Type: {result.mime_type}")
                print(f"\tByte Match: {result.byte_match}")
                print(f"\tOffset: {result.offset}\n")
540 | import puremagic 541 | ext = puremagic.what(...) 542 | ``` 543 | imghdr documentation: https://docs.python.org/3.12/library/imghdr.html 544 | imghdr source code: https://github.com/python/cpython/blob/3.12/Lib/imghdr.py 545 | 546 | imghdr_strict enables bug-for-bug compatibility between imghdr.what() and puremagic.what() when the imghdr returns 547 | a match but puremagic returns None. We believe that imghdr is delivering a "false positive" in each of these 548 | scenarios, but we want puremagic.what()'s default behavior to match imghdr.what()'s false positives so we do not 549 | break existing applications. 550 | 551 | If imghdr_strict is True (the default) then a lookup will be done to deliver a matching result on all known false 552 | positives. If imghdr_strict is False then puremagic's algorithms will determine the image type. True is more 553 | compatible while False is more correct. 554 | 555 | NOTE: This compatibility effort only deals false positives, and we are not interested to track the opposite 556 | situation where puremagic's deliver a match while imghdr would have returned None. Also, puremagic.what() can 557 | recognize many more file types than the twelve image file types that imghdr focused on. 558 | """ 559 | if isinstance(h, str): 560 | raise TypeError("h must be bytes, not str. Consider using bytes.fromhex(h)") 561 | if h and imghdr_strict: 562 | ext = imghdr_bug_for_bug.get(h) 563 | if ext: 564 | return ext 565 | try: 566 | ext = (from_string(h) if h else from_file(file or "")).lstrip(".") 567 | except PureError: 568 | return None # imghdr.what() returns None if it cannot find a match. 
def main(file_path: os.PathLike | str, head: bytes, foot: bytes) -> Match | None:
    """Report a JSON match when the whole file parses as a JSON object.

    :param file_path: path of the file to validate
    :param head: leading bytes of the file; must start with ``{`` once
        surrounding whitespace is stripped
    :param foot: trailing bytes of the file; must end with ``}`` once
        surrounding whitespace is stripped
    :return: a JSON ``Match``, or None when the file is not valid JSON or
        cannot be read
    """
    # Cheap pre-check so the full parse is only attempted on likely objects.
    if not (head.strip().startswith(b"{") and foot.strip().endswith(b"}")):
        return None
    try:
        with open(file_path, "rb") as file:
            json.load(file)
    except (ValueError, RecursionError, OSError):
        # ValueError covers json.JSONDecodeError (its subclass) and also the
        # UnicodeDecodeError raised for bytes that are not valid UTF-8/16/32.
        # RecursionError covers pathologically nested documents. Both used
        # to escape instead of meaning "not JSON".
        return None
    return Match(
        extension=".json",
        name="JSON File",
        mime_type="application/json",
        confidence=1.0,
    )
def main(file_path: os.PathLike | str, *_, **__) -> Match | None:
    """Report a Python match when the file parses as Python source.

    Files larger than 1,000,000 bytes are never parsed, and files under 100
    bytes are only considered when the path ends with ``.py``.

    :param file_path: path of the file to check
    :return: a Python ``Match``, or None when the file is skipped or does
        not parse
    """
    file_size = os.path.getsize(file_path)
    if file_size > 1_000_000:
        return None
    if not str(file_path).endswith(".py") and file_size < 100:
        return None

    try:
        # Read raw bytes and let the parser do the decoding: it honours a
        # BOM / PEP 263 coding declaration and defaults to UTF-8, whereas a
        # text-mode open() decoded with the platform's locale encoding.
        with open(file_path, "rb") as file:
            source = file.read()
        ast.parse(source)
    except Exception:
        # Best effort by design: anything that stops the parse means
        # "not recognisable as Python".
        return None
    return Match(
        extension=".py",
        name="Python Script",
        mime_type="text/x-python",
        confidence=1.0,
    )
Match | None: 12 | with open(file_path, "rb") as file: 13 | head = file.read(1_000_000) 14 | if len(head) < 8: 15 | return Match("", "very short file", "application/octet-stream", confidence=0.5) 16 | try: 17 | head.decode("ascii") 18 | except UnicodeDecodeError: 19 | return Match("", "data", "application/octet-stream", confidence=0.5) 20 | crlf = len(crlf_pattern.findall(head)) 21 | lf = len(lf_pattern.findall(head)) 22 | cr = len(cr_pattern.findall(head)) 23 | if crlf + lf + cr == 0: 24 | return Match(".txt", "ASCII text", "text/plain", confidence=0.9) 25 | 26 | if crlf > lf and crlf > cr: 27 | return Match(".txt", "ASCII text, with CRLF line terminators", "text/plain", confidence=0.9) 28 | if cr > lf and cr > crlf: 29 | return Match(".txt", "ASCII text, with CR line terminators", "text/plain", confidence=0.9) 30 | if lf > cr and lf > crlf: 31 | return Match(".txt", "ASCII text, with LF line terminators", "text/plain", confidence=0.9) 32 | return Match(".txt", "ASCII text", "text/plain", confidence=0.9) 33 | -------------------------------------------------------------------------------- /puremagic/scanners/zip_scanner.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from zipfile import ZipFile 4 | 5 | from puremagic.scanners.helpers import Match 6 | 7 | match_bytes = b"PK\x03\x04" 8 | office_macro_enable_match = b"macroEnabled" 9 | 10 | application_re = re.compile(b"(.*)") 11 | 12 | 13 | def open_office_check(internal_files: list[str], zip_file: ZipFile, extension: str | None = None) -> Match | None: 14 | if "content.xml" not in internal_files: 15 | return None 16 | if "mimetype" not in internal_files: 17 | return None 18 | 19 | known_extensions = ["odt", "ods", "odp", "sxd", "sxi", "sxw"] 20 | 21 | mime_type = zip_file.read("mimetype").decode("utf-8").strip() 22 | 23 | if "application/vnd.oasis.opendocument.text" in mime_type: 24 | return Match(".odt", "OpenDocument Text Document", 
# One row per Office application family, tried in this order:
#   (text looked for in the application name,
#    {extension hint: (extension, mime type)},
#    (extension, mime type) when [Content_Types].xml mentions macroEnabled,
#    (extension, mime type) otherwise)
# Every extension carries its leading dot; previously "potx", "pptx", "xltx",
# "dotx" and "docx" were returned without one, unlike all other results.
_OFFICE_FAMILIES = (
    (
        "PowerPoint",
        {
            "ppsm": (".ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"),
            "potm": (".potm", "application/vnd.ms-powerpoint.template.macroEnabled.12"),
            "potx": (".potx", "application/vnd.openxmlformats-officedocument.presentationml.template"),
            # Fixed: the registered type ends in ".12" like its siblings.
            "ppam": (".ppam", "application/vnd.ms-powerpoint.addin.macroEnabled.12"),
        },
        # NOTE(review): a macro-enabled deck without a recognised hint is
        # reported as .ppsm (slideshow), as before; .pptm may be the better
        # default -- confirm before changing.
        (".ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"),
        (".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
    ),
    (
        "Excel",
        {
            "xlsm": (".xlsm", "application/vnd.ms-excel.sheet.macroEnabled.12"),
            "xlsb": (".xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12"),
            "xlam": (".xlam", "application/vnd.ms-excel.addin.macroEnabled.12"),
            "xltm": (".xltm", "application/vnd.ms-excel.template.macroEnabled.12"),
            "xltx": (".xltx", "application/vnd.openxmlformats-officedocument.spreadsheetml.template"),
        },
        (".xlsm", "application/vnd.ms-excel.sheet.macroEnabled.12"),
        (".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
    ),
    (
        "Word",
        {
            "docm": (".docm", "application/vnd.ms-word.document.macroEnabled.12"),
            "dotm": (".dotm", "application/vnd.ms-word.template.macroEnabled.12"),
            "dotx": (".dotx", "application/vnd.openxmlformats-officedocument.wordprocessingml.template"),
        },
        (".docm", "application/vnd.ms-word.document.macroEnabled.12"),
        (".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
    ),
)


def office_check(internal_files: list[str], zip_file: ZipFile, extension: str | None = None) -> Match | None:
    """Classify a ZIP container as a PowerPoint, Excel or Word document.

    :param internal_files: names of the archive members
    :param zip_file: the opened archive, used to read ``docProps/app.xml``
        and ``[Content_Types].xml``
    :param extension: lower-case file extension without the dot, used to
        pick between formats that share a container layout
    :return: a ``Match`` whose name is the application string found in
        ``docProps/app.xml``, or None when the archive is not recognised
    """
    if "[Content_Types].xml" not in internal_files or "docProps/app.xml" not in internal_files:
        return None
    found = application_re.search(zip_file.read("docProps/app.xml"))
    if not found:
        return None
    application_type = found.group(1).decode("utf-8")

    for marker, by_hint, macro_default, plain_default in _OFFICE_FAMILIES:
        if marker not in application_type:
            continue
        if extension and extension in by_hint:
            # The caller's extension decides between template/add-in/etc.
            ext, mime = by_hint[extension]
        elif office_macro_enable_match in zip_file.read("[Content_Types].xml"):
            ext, mime = macro_default
        else:
            ext, mime = plain_default
        return Match(ext, application_type, mime)

    return None
def main(file_path: os.PathLike, _, __) -> Match | None:
    """Refine a ZIP signature match by looking inside the archive.

    A path whose last extension is ``zip`` is reported as a plain ZIP
    archive straight away, unless it ends with ``.fb2.zip``. Otherwise the
    format checks run in a fixed order and the first hit wins.

    :param file_path: path of the archive on disk
    :return: the first ``Match`` produced by a format check, or None
    """
    path_text = str(file_path)
    extension = path_text.split(".")[-1].lower()
    if extension == "zip" and not path_text.endswith(".fb2.zip"):
        return Match(".zip", "ZIP archive", "application/zip")

    with ZipFile(file_path) as archive:
        members = archive.namelist()
        # Order matters: earlier checks take precedence over later ones.
        # NOTE(review): open_office_check is called without the extension,
        # so its extension-based fallback branch cannot trigger from here.
        checks = (
            lambda: office_check(members, archive, extension),
            lambda: open_office_check(members, archive),
            lambda: jar_check(members, archive),
            lambda: apk_check(members),
            lambda: xpi_check(members, archive),
            lambda: fb2_check(members, archive, file_path),
            lambda: cbz_check(members, extension),
        )
        for check in checks:
            found = check()
            if found:
                return found

    return None
-------------------------------------------------------------------------------- 1 | coverage>=7.8.0 2 | pytest>=8.3.5 3 | pytest-cov>=6.1.1 4 | -------------------------------------------------------------------------------- /scripts/parse_ftk_kessler_sigs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | This is a very ugly helper script to keep up to date with file types in 4 | Gary C. Kessler's FTK_sigs_GCK archive. 5 | 6 | """ 7 | 8 | import binascii 9 | import json 10 | import os 11 | import xml.etree.ElementTree as ET 12 | 13 | import puremagic 14 | 15 | folder = "FTK_sigs_GCK" 16 | 17 | sigs = [] 18 | 19 | for file in os.listdir(folder): 20 | if file.endswith(".xml"): 21 | tree = ET.parse(os.path.join(folder, file)) 22 | root = tree.getroot() 23 | sig = {} 24 | for child in root[0]: 25 | if child.text: 26 | sig[child.tag] = child.text 27 | else: 28 | for grandchild in child: 29 | if grandchild.tag == "EXT_NAME": 30 | sig[grandchild.tag] = grandchild.text.lower().split("|") # type: ignore 31 | else: 32 | sig[grandchild.tag] = grandchild.text # type: ignore 33 | sigs.append(sig) 34 | 35 | known_sigs = {binascii.hexlify(x[0]).decode("ascii") for x in puremagic.magic_header_array} 36 | 37 | for sig in sigs: 38 | sig["SIG"] = sig["SIG"].lower().strip() 39 | try: 40 | offset = int(sig.get("OFFSET", 0)) 41 | except Exception: 42 | continue 43 | 44 | if sig["SIG"] not in known_sigs and len(sig["EXT_NAME"]) == 1 and len(sig["EXT_NAME"][0]) < 5: 45 | print( 46 | "\t\t{},".format( 47 | json.dumps( 48 | [ 49 | sig["SIG"], 50 | int(sig.get("OFFSET", 0)), 51 | ".{}".format(sig.get("EXT_NAME", "")[0]), 52 | "", 53 | sig["DESCRIPTION"], 54 | ] 55 | ) 56 | ) 57 | ) 58 | elif sig["SIG"] not in known_sigs: 59 | for ext in sig["EXT_NAME"]: 60 | if ext != "(none)": 61 | print("\t\t{},".format(json.dumps([sig["SIG"], offset, f".{ext}", "", sig["DESCRIPTION"]]))) 62 | else: 63 | 
#!/usr/bin/env python
"""Packaging script for puremagic."""

import os
import re

from setuptools import setup

root = os.path.abspath(os.path.dirname(__file__))

# Version and author are scraped from the ``__name__ = "value"`` style
# assignments in main.py so they are defined in one place only.
with open(os.path.join(root, "puremagic", "main.py"), encoding="utf-8") as reuse_file:
    reuse_content = reuse_file.read()

attrs = dict(re.findall(r"__([a-z]+)__ *= *['\"](.+)['\"]", reuse_content))

# Resolved against this file's directory (not the current working directory)
# so the build also works when started from somewhere else.
with open(os.path.join(root, "README.rst"), encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="puremagic",
    version=attrs["version"],
    url="https://github.com/cdgriffith/puremagic",
    license="MIT",
    author=attrs["author"],
    author_email="chris@cdgriffith.com",
    description="Pure python implementation of magic file detection",
    long_description=long_description,
    package_data={"puremagic": ["*.json", "py.typed"]},
    # The scanners sub-package has to be listed explicitly; with only
    # "puremagic" here it is left out of built distributions even though
    # the deep-scan code depends on it.
    packages=["puremagic", "puremagic.scanners"],
    include_package_data=True,
    platforms="any",
    python_requires=">=3.12",
    classifiers=[
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: Implementation :: PyPy",
        "Development Status :: 5 - Production/Stable",
        "Natural Language :: English",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Utilities",
    ],
)
import os
from pathlib import Path

# Directory that contains this module, with symlinks resolved.
LOCAL_DIR = Path(os.path.realpath(os.path.dirname(__file__)))

# Root of the sample files, and one constant per category sub-folder.
RESOURCE_DIR = LOCAL_DIR / "resources"
IMAGE_DIR = RESOURCE_DIR / "images"
VIDEO_DIR = RESOURCE_DIR / "video"
AUDIO_DIR = RESOURCE_DIR / "audio"
OFFICE_DIR = RESOURCE_DIR / "office"
ARCHIVE_DIR = RESOURCE_DIR / "archive"
MEDIA_DIR = RESOURCE_DIR / "media"
SYSTEM_DIR = RESOURCE_DIR / "system"
https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.bz2 -------------------------------------------------------------------------------- /test/resources/archive/test.cb7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.cb7 -------------------------------------------------------------------------------- /test/resources/archive/test.cbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.cbt -------------------------------------------------------------------------------- /test/resources/archive/test.cbz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.cbz -------------------------------------------------------------------------------- /test/resources/archive/test.fb2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.fb2.zip -------------------------------------------------------------------------------- /test/resources/archive/test.fbz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.fbz -------------------------------------------------------------------------------- /test/resources/archive/test.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.gz -------------------------------------------------------------------------------- /test/resources/archive/test.little_endian.pcapng: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.little_endian.pcapng -------------------------------------------------------------------------------- /test/resources/archive/test.pcapng: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.pcapng -------------------------------------------------------------------------------- /test/resources/archive/test.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.rar -------------------------------------------------------------------------------- /test/resources/archive/test.tar: -------------------------------------------------------------------------------- 1 | test.txt0000664000175000017500000000000512072363426011602 0ustar chrischristest 2 | -------------------------------------------------------------------------------- /test/resources/archive/test.txt.lz4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.txt.lz4 -------------------------------------------------------------------------------- /test/resources/archive/test.txt.zst: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.txt.zst -------------------------------------------------------------------------------- /test/resources/archive/test.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.xz -------------------------------------------------------------------------------- /test/resources/archive/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/archive/test.zip -------------------------------------------------------------------------------- /test/resources/audio/test.aac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/audio/test.aac -------------------------------------------------------------------------------- /test/resources/audio/test.aif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/audio/test.aif -------------------------------------------------------------------------------- /test/resources/audio/test.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/audio/test.mp3 -------------------------------------------------------------------------------- /test/resources/audio/test.wav: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/audio/test.wav -------------------------------------------------------------------------------- /test/resources/fake_file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/fake_file -------------------------------------------------------------------------------- /test/resources/images/test.avif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.avif -------------------------------------------------------------------------------- /test/resources/images/test.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.bmp -------------------------------------------------------------------------------- /test/resources/images/test.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.gif -------------------------------------------------------------------------------- /test/resources/images/test.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.ico -------------------------------------------------------------------------------- /test/resources/images/test.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.jpg -------------------------------------------------------------------------------- /test/resources/images/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.png -------------------------------------------------------------------------------- /test/resources/images/test.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.psd -------------------------------------------------------------------------------- /test/resources/images/test.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /test/resources/images/test.tga: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.tga -------------------------------------------------------------------------------- /test/resources/images/test.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.tif -------------------------------------------------------------------------------- /test/resources/images/test.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.webp 
-------------------------------------------------------------------------------- /test/resources/images/test.xcf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/images/test.xcf -------------------------------------------------------------------------------- /test/resources/images/test_varriant.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /test/resources/images/test_varriant_2.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /test/resources/media/test (single).vmdk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/media/test (single).vmdk -------------------------------------------------------------------------------- /test/resources/media/test (split).vmdk: -------------------------------------------------------------------------------- 1 | # Disk DescriptorFile 2 | version=1 3 | encoding="windows-1252" 4 | CID=fffffffe 5 | parentCID=ffffffff 6 | isNativeSnapshot="no" 7 | createType="twoGbMaxExtentSparse" 8 | 9 | # Extent description 10 | RW 104448 SPARSE "test-s001" 11 | 12 | # The Disk Data Base 13 | #DDB 14 | 15 | ddb.virtualHWVersion = "8" 16 | ddb.longContentID = "44285d2a935c800a4ebb1c8efffffffe" 17 | ddb.uuid = "60 00 C2 98 02 71 24 99-3d d7 a8 42 4d 32 fd 91" 18 | ddb.geometry.cylinders = "103" 19 | ddb.geometry.heads = "16" 20 | ddb.geometry.sectors = "63" 21 | ddb.adapterType = "ide" 22 | -------------------------------------------------------------------------------- /test/resources/media/test.iso: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/media/test.iso -------------------------------------------------------------------------------- /test/resources/media/test.mdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/media/test.mdf -------------------------------------------------------------------------------- /test/resources/media/test.swf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/media/test.swf -------------------------------------------------------------------------------- /test/resources/office/test.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.doc -------------------------------------------------------------------------------- /test/resources/office/test.docm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.docm -------------------------------------------------------------------------------- /test/resources/office/test.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.docx -------------------------------------------------------------------------------- /test/resources/office/test.dotm: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.dotm -------------------------------------------------------------------------------- /test/resources/office/test.dotx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.dotx -------------------------------------------------------------------------------- /test/resources/office/test.fb2: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /test/resources/office/test.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.odp -------------------------------------------------------------------------------- /test/resources/office/test.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.ods -------------------------------------------------------------------------------- /test/resources/office/test.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.odt -------------------------------------------------------------------------------- /test/resources/office/test.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.pdf -------------------------------------------------------------------------------- /test/resources/office/test.potm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.potm -------------------------------------------------------------------------------- /test/resources/office/test.potx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.potx -------------------------------------------------------------------------------- /test/resources/office/test.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.ppt -------------------------------------------------------------------------------- /test/resources/office/test.pptm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.pptm -------------------------------------------------------------------------------- /test/resources/office/test.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.pptx -------------------------------------------------------------------------------- /test/resources/office/test.txt: -------------------------------------------------------------------------------- 1 | Generic text file 2 | 
-------------------------------------------------------------------------------- /test/resources/office/test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.xls -------------------------------------------------------------------------------- /test/resources/office/test.xlsb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.xlsb -------------------------------------------------------------------------------- /test/resources/office/test.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.xlsm -------------------------------------------------------------------------------- /test/resources/office/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.xlsx -------------------------------------------------------------------------------- /test/resources/office/test.xltm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.xltm -------------------------------------------------------------------------------- /test/resources/office/test.xltx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/office/test.xltx 
-------------------------------------------------------------------------------- /test/resources/system/test.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/system/test.exe -------------------------------------------------------------------------------- /test/resources/system/test.json: -------------------------------------------------------------------------------- 1 | {"Test": "Script"} 2 | -------------------------------------------------------------------------------- /test/resources/system/test.puremagic_multi_footer: -------------------------------------------------------------------------------- 1 | 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ -------------------------------------------------------------------------------- /test/resources/system/test.py: -------------------------------------------------------------------------------- 1 | # Very simple test python file 2 | 3 | 4 | def main(): 5 | print("Hello World") 6 | 7 | 8 | if __name__ == "__main__": 9 | main() 10 | -------------------------------------------------------------------------------- /test/resources/video/test.3g2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/video/test.3g2 -------------------------------------------------------------------------------- /test/resources/video/test.avi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/video/test.avi -------------------------------------------------------------------------------- /test/resources/video/test.flv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/video/test.flv -------------------------------------------------------------------------------- /test/resources/video/test.jxsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/video/test.jxsv -------------------------------------------------------------------------------- /test/resources/video/test.mkv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/video/test.mkv -------------------------------------------------------------------------------- /test/resources/video/test.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/video/test.mp4 -------------------------------------------------------------------------------- /test/resources/video/test.mpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/video/test.mpg -------------------------------------------------------------------------------- /test/resources/video/test.wmv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cdgriffith/puremagic/55db71ae8a4940ec1a04d563e0911c6aa662d8b4/test/resources/video/test.wmv -------------------------------------------------------------------------------- /test/test_common_extensions.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import BytesIO 3 | from pathlib import Path 4 | from tempfile import NamedTemporaryFile 5 | 6 | import pytest 7 | 8 | import puremagic 9 | from test.common import ( 10 | RESOURCE_DIR, 11 | IMAGE_DIR, 12 | VIDEO_DIR, 13 | AUDIO_DIR, 14 | OFFICE_DIR, 15 | ARCHIVE_DIR, 16 | MEDIA_DIR, 17 | SYSTEM_DIR, 18 | LOCAL_DIR, 19 | ) 20 | 21 | 22 | TGA_FILE = os.path.join(IMAGE_DIR, "test.tga") 23 | 24 | 25 | class MockBytesIO(BytesIO): 26 | 27 | def seek(self, offset, whence=0): 28 | if offset < 0: 29 | raise OSError("Invalid seek position") 30 | return super().seek(offset, whence) 31 | 32 | 33 | mp4magic = b"\x00\x00\x00\x1c\x66\x74\x79\x70\x4d\x53\x4e\ 34 | \x56\x01\x29\x00\x46\x4d\x53\x4e\x56\x6d\x70\x34\x32" 35 | expect_ext = ".mp4" 36 | expect_mime = "video/mp4" 37 | 38 | 39 | def group_run(directory): 40 | failures = [] 41 | ext_failures = [] 42 | mime_failures = [] 43 | for item in os.listdir(directory): 44 | try: 45 | ext = puremagic.from_file(os.path.join(directory, item)) 46 | except puremagic.PureError: 47 | failures.append(item) 48 | else: 49 | if not item.endswith(ext): 50 | ext_failures.append((item, ext)) 51 | 52 | try: 53 | mime = puremagic.from_file(os.path.join(directory, item), mime=True) 54 | except puremagic.PureError: 55 | failures.append(item) 56 | else: 57 | if not mime: 58 | 
mime_failures.append(item) 59 | if failures: 60 | raise AssertionError( 61 | "The following items could not be identified from the {} folder: {}".format(directory, ", ".join(failures)) 62 | ) 63 | if ext_failures: 64 | raise AssertionError( 65 | "The following files did not have the expected extensions: {}".format( 66 | ", ".join([f'"{item}" expected "{ext}"' for item, ext in ext_failures]) 67 | ) 68 | ) 69 | if mime_failures: 70 | raise AssertionError("The following files did not have a mime type: {}".format(", ".join(mime_failures))) 71 | 72 | 73 | def test_file(): 74 | """File identification""" 75 | with NamedTemporaryFile(delete=False) as mp4file: 76 | mp4file.write(mp4magic) 77 | 78 | ext = puremagic.from_file(mp4file.name) 79 | os.unlink(mp4file.name) 80 | assert expect_ext == ext 81 | 82 | 83 | def test_hex_string(): 84 | """Hex string identification""" 85 | ext = puremagic.from_string(mp4magic) 86 | assert expect_ext == ext 87 | 88 | 89 | def test_string(): 90 | """String identification""" 91 | ext = puremagic.from_string(bytes(mp4magic)) 92 | assert expect_ext == ext 93 | 94 | 95 | def test_string_with_confidence(): 96 | """String identification: magic_string""" 97 | ext = puremagic.magic_string(bytes(mp4magic)) 98 | assert expect_ext == ext[0].extension 99 | with pytest.raises(ValueError): 100 | puremagic.magic_string("") 101 | 102 | 103 | def test_magic_string_with_filename_hint(): 104 | """String identification: magic_string with hint""" 105 | filename = os.path.join(OFFICE_DIR, "test.xlsx") 106 | with open(filename, "rb") as f: 107 | data = f.read() 108 | ext = puremagic.magic_string(data, filename=filename) 109 | assert ext[0].extension == ".xlsx" 110 | 111 | 112 | def test_not_found(): 113 | """Bad file type via string""" 114 | try: 115 | with pytest.raises(puremagic.PureError): 116 | puremagic.from_string("not applicable string") 117 | except TypeError: 118 | # Python 2.6 doesn't support using 119 | # assertRaises as a context manager 120 | pass 121 
| 122 | 123 | def test_magic_file(): 124 | """File identification with magic_file""" 125 | assert puremagic.magic_file(TGA_FILE)[0].extension == ".tga" 126 | open("test_empty_file", "w").close() 127 | try: 128 | with pytest.raises(ValueError): 129 | puremagic.magic_file("test_empty_file") 130 | finally: 131 | os.unlink("test_empty_file") 132 | 133 | 134 | def test_stream(): 135 | """Stream identification""" 136 | ext = puremagic.from_stream(BytesIO(mp4magic)) 137 | assert expect_ext == ext 138 | with pytest.raises(ValueError): 139 | puremagic.from_stream(BytesIO(b"")) 140 | 141 | 142 | def test_magic_stream(): 143 | """File identification with magic_stream""" 144 | with open(TGA_FILE, "rb") as f: 145 | stream = BytesIO(f.read()) 146 | result = puremagic.magic_stream(stream, TGA_FILE) 147 | assert result[0].extension == ".tga" 148 | with pytest.raises(ValueError): 149 | puremagic.magic_stream(BytesIO(b"")) 150 | 151 | 152 | def test_small_stream_error(): 153 | ext = puremagic.from_stream(MockBytesIO(b"#!/usr/bin/env python")) 154 | assert ext == ".py" 155 | 156 | 157 | def test_mime(): 158 | """Identify mime type""" 159 | assert puremagic.from_file(TGA_FILE, True) == "image/tga" 160 | 161 | 162 | def test_images(): 163 | """Test common image formats""" 164 | group_run(IMAGE_DIR) 165 | 166 | 167 | def test_video(): 168 | """Test common video formats""" 169 | group_run(VIDEO_DIR) 170 | 171 | 172 | def test_audio(): 173 | """Test common audio formats""" 174 | group_run(AUDIO_DIR) 175 | 176 | 177 | def test_office(): 178 | """Test common office document formats""" 179 | # Office files have very similar magic numbers, and may overlap 180 | for item in os.listdir(OFFICE_DIR): 181 | puremagic.from_file(os.path.join(OFFICE_DIR, item)) 182 | 183 | 184 | def test_archive(): 185 | """Test common compressed archive formats""" 186 | # pcapng files from https://wiki.wireshark.org/Development/PcapNg 187 | group_run(ARCHIVE_DIR) 188 | 189 | 190 | def test_media(): 191 | """Test 
common media formats""" 192 | group_run(MEDIA_DIR) 193 | 194 | 195 | def test_system(): 196 | """Test common system formats""" 197 | group_run(SYSTEM_DIR) 198 | 199 | 200 | def test_ext(): 201 | """Test ext from filename""" 202 | ext = puremagic.ext_from_filename("test.tar.bz2") 203 | assert ext == ".tar.bz2", ext 204 | 205 | 206 | def test_cmd_options(): 207 | """Test CLI options""" 208 | from puremagic.main import command_line_entry 209 | 210 | command_line_entry(__file__, os.path.join(AUDIO_DIR, "test.mp3"), "-v") 211 | command_line_entry(__file__, "DOES NOT EXIST FILE") 212 | command_line_entry(__file__, os.path.join(RESOURCE_DIR, "fake_file"), "-v") 213 | 214 | 215 | def test_bad_magic_input(): 216 | """Test bad magic input""" 217 | with pytest.raises(ValueError): 218 | puremagic.main._magic(None, None, None) 219 | 220 | 221 | def test_fake_file(): 222 | assert puremagic.magic_file(filename=Path(LOCAL_DIR, "resources", "fake_file"))[0].confidence == 0.5 223 | -------------------------------------------------------------------------------- /test/test_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pathlib import Path 3 | from sys import version_info 4 | from warnings import filterwarnings 5 | 6 | import pytest 7 | 8 | from puremagic.main import what 9 | 10 | filterwarnings("ignore", message="'imghdr' is deprecated") 11 | try: # imghdr was removed from the standard library in Python 3.13 12 | from imghdr import what as imghdr_what 13 | except ModuleNotFoundError: 14 | imghdr_what = None # type: ignore[assignment] 15 | 16 | file_tests = ["bmp", "gif", "jpg", "png", "tif", "webp"] 17 | 18 | here = Path(__file__).resolve().parent 19 | 20 | 21 | @pytest.mark.skipif(imghdr_what is None, reason="imghdr was removed from the standard library in Python 3.13") 22 | @pytest.mark.parametrize("file", file_tests) 23 | def test_what_from_file(file, h=None): 24 | """Run each test with a path string 
and a pathlib.Path.""" 25 | file = str(here / f"resources/images/test.{file}") 26 | assert what(file, h) == imghdr_what(file, h) 27 | file = Path(file).resolve() 28 | assert what(file, h) == imghdr_what(file, h) 29 | 30 | 31 | @pytest.mark.skipif(imghdr_what is None, reason="imghdr was removed from the standard library in Python 3.13") 32 | def test_what_from_file_none(): 33 | file = str(here / "resources/fake_file") 34 | assert what(file) == imghdr_what(file) is None 35 | file = Path(file).resolve() 36 | assert what(file, None) == imghdr_what(file, None) is None 37 | 38 | 39 | @pytest.mark.skipif(imghdr_what is None, reason="imghdr was removed from the standard library in Python 3.13") 40 | def test_what_from_string_no_str(h="string"): 41 | """what() should raise a TypeError if h is a string.""" 42 | with pytest.raises(TypeError): 43 | imghdr_what(None, h) 44 | with pytest.raises(TypeError) as excinfo: 45 | what(None, h) 46 | assert str(excinfo.value) == "h must be bytes, not str. Consider using bytes.fromhex(h)" 47 | 48 | 49 | string_tests = [ 50 | ("bmp", "424d"), 51 | ("bmp", "424d787878785c3030305c303030"), 52 | ("bmp", b"BM"), 53 | ("exr", "762f3101"), 54 | ("exr", b"\x76\x2f\x31\x01"), 55 | ("exr", b"v/1\x01"), 56 | ("gif", "474946383761"), 57 | ("gif", "474946383961"), 58 | ("gif", b"GIF87a"), 59 | ("gif", b"GIF89a"), 60 | ("pbm", b"P1 "), 61 | ("pbm", b"P1\n"), 62 | ("pbm", b"P1\r"), 63 | ("pbm", b"P1\t"), 64 | ("pbm", b"P4 "), 65 | ("pbm", b"P4\n"), 66 | ("pbm", b"P4\r"), 67 | ("pbm", b"P4\t"), 68 | ("pgm", b"P2 "), 69 | ("pgm", b"P2\n"), 70 | ("pgm", b"P2\r"), 71 | ("pgm", b"P2\t"), 72 | ("pgm", b"P5 "), 73 | ("pgm", b"P5\n"), 74 | ("pgm", b"P5\r"), 75 | ("pgm", b"P5\t"), 76 | ("png", "89504e470d0a1a0a"), 77 | ("png", b"\211PNG\r\n\032\n"), 78 | ("png", b"\x89PNG\r\n\x1a\n"), 79 | ("ppm", b"P3 "), 80 | ("ppm", b"P3\n"), 81 | ("ppm", b"P3\r"), 82 | ("ppm", b"P3\t"), 83 | ("ppm", b"P6 "), 84 | ("ppm", b"P6\n"), 85 | ("ppm", b"P6\r"), 86 | ("ppm", b"P6\t"), 
@pytest.mark.skipif(imghdr_what is None, reason="imghdr was removed from the standard library in Python 3.13")
@pytest.mark.parametrize("expected, h", string_tests)
def test_what_from_string(expected, h):
    """imghdr.what() and what() must both report ``expected`` for header ``h``."""
    # imghdr.what() only accepts bytes, so hex strings are decoded first,
    # e.g. "474946383761" --> b"GIF87a".
    header = bytes.fromhex(h) if isinstance(h, str) else h
    stdlib_result = imghdr_what(None, header)
    puremagic_result = what(None, header)
    assert stdlib_result == puremagic_result
    assert puremagic_result == expected


@pytest.mark.skipif(imghdr_what is None, reason="imghdr was removed from the standard library in Python 3.13")
@pytest.mark.parametrize(
    "expected, h",
    [
        ("jpeg", "ffd8ffdb"),
        ("jpeg", b"\xff\xd8\xff\xdb"),
    ],
)
def test_what_from_string_py311(expected, h):
    """
    These tests fail with imghdr on Python < 3.11.
    """
    # imghdr.what() only accepts bytes, so hex strings are decoded first.
    header = bytes.fromhex(h) if isinstance(h, str) else h
    assert what(None, header) == expected

    # TODO: Document these imghdr fails
    imghdr_expected = expected if version_info >= (3, 11) else None
    assert imghdr_what(None, header) == imghdr_expected
def test_text_scanner():
    """Check that plain text is identified together with its line terminator.

    ``sample_text`` is rendered with LF, CRLF and CR endings in turn. Each
    variant is written into the office resources directory, identified with
    ``puremagic.magic_file`` and then deleted again, so the generated files
    do not linger among the tracked fixtures.
    """
    # Strip the literal's own newlines once; only the {ending} markers
    # decide which terminator ends up in the file.
    template = sample_text.replace(b"\n", b"")
    variants = (
        ("text_lf.txt", b"\n", "ASCII text, with LF line terminators"),
        ("text_crlf.txt", b"\r\n", "ASCII text, with CRLF line terminators"),
        ("text_cr.txt", b"\r", "ASCII text, with CR line terminators"),
    )
    for filename, ending, expected_name in variants:
        text_file = OFFICE_DIR / filename
        try:
            text_file.write_bytes(template.replace(b"{ending}", ending))
            results = puremagic.magic_file(text_file)
        finally:
            # Always remove the generated file, even when identification fails.
            text_file.unlink(missing_ok=True)

        best_match = results[0]
        assert best_match.extension == ".txt"
        assert best_match.name == expected_name
        assert best_match.mime_type == "text/plain"
        assert best_match.confidence == 0.9


def test_python_scanner():
    """The Python scanner and magic_file() must agree on the test.py fixture."""
    py_file = SYSTEM_DIR / "test.py"
    result = python_scanner.main(py_file)
    magic_result = puremagic.magic_file(py_file)
    # The top magic_file() match carries the same confidence as the scanner result.
    assert result.confidence == magic_result[0].confidence
    assert result.extension == ".py"
    assert result.name == "Python Script"
    assert result.mime_type == "text/x-python"
    assert result.confidence == 1.0


def test_json_scanner():
    """The JSON scanner and magic_file() must agree on the test.json fixture."""
    json_file = SYSTEM_DIR / "test.json"
    # b"{" and b"}" are passed as the scanner's second and third arguments.
    result = json_scanner.main(json_file, b"{", b"}")
    magic_result = puremagic.magic_file(json_file)
    # The top magic_file() match carries the same confidence as the scanner result.
    assert result.confidence == magic_result[0].confidence
    assert result.extension == ".json"
    assert result.name == "JSON File"
    assert result.mime_type == "application/json"
    assert result.confidence == 1.0