├── .activate.sh ├── .deactivate.sh ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── bin └── vendor-licenses ├── identify ├── __init__.py ├── cli.py ├── extensions.py ├── identify.py ├── interpreters.py ├── py.typed └── vendor │ ├── __init__.py │ └── licenses.py ├── requirements-dev.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── cli_test.py ├── extensions_test.py └── identify_test.py └── tox.ini /.activate.sh: -------------------------------------------------------------------------------- 1 | venv/bin/activate -------------------------------------------------------------------------------- /.deactivate.sh: -------------------------------------------------------------------------------- 1 | deactivate 2 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | 3 | on: 4 | push: 5 | branches: [main, test-me-*] 6 | tags: '*' 7 | pull_request: 8 | 9 | jobs: 10 | main: 11 | uses: asottile/workflows/.github/workflows/tox.yml@v1.8.1 12 | with: 13 | env: '["py39", "py310", "py311", "py312"]' 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.py[co] 3 | /.coverage 4 | /.tox 5 | /dist 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: debug-statements 9 | - id: double-quote-string-fixer 10 | - id: name-tests-test 11 | - id: requirements-txt-fixer 12 | - repo: 
https://github.com/asottile/setup-cfg-fmt 13 | rev: v2.8.0 14 | hooks: 15 | - id: setup-cfg-fmt 16 | - repo: https://github.com/asottile/reorder-python-imports 17 | rev: v3.15.0 18 | hooks: 19 | - id: reorder-python-imports 20 | args: [--py39-plus, --add-import, 'from __future__ import annotations'] 21 | - repo: https://github.com/asottile/add-trailing-comma 22 | rev: v3.2.0 23 | hooks: 24 | - id: add-trailing-comma 25 | - repo: https://github.com/asottile/pyupgrade 26 | rev: v3.20.0 27 | hooks: 28 | - id: pyupgrade 29 | args: [--py39-plus] 30 | - repo: https://github.com/hhatto/autopep8 31 | rev: v2.3.2 32 | hooks: 33 | - id: autopep8 34 | - repo: https://github.com/PyCQA/flake8 35 | rev: 7.2.0 36 | hooks: 37 | - id: flake8 38 | exclude: ^identify/vendor/licenses\.py$ 39 | - repo: https://github.com/pre-commit/mirrors-mypy 40 | rev: v1.16.0 41 | hooks: 42 | - id: mypy 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Chris Kuehl, Anthony Sottile 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![build status](https://github.com/pre-commit/identify/actions/workflows/main.yml/badge.svg)](https://github.com/pre-commit/identify/actions/workflows/main.yml) 2 | [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/pre-commit/identify/main.svg)](https://results.pre-commit.ci/latest/github/pre-commit/identify/main) 3 | 4 | identify 5 | ======== 6 | 7 | File identification library for Python. 8 | 9 | Given a file (or some information about a file), return a set of standardized 10 | tags identifying what the file is. 11 | 12 | ## Installation 13 | 14 | ```bash 15 | pip install identify 16 | ``` 17 | 18 | ## Usage 19 | ### With a file on disk 20 | 21 | If you have an actual file on disk, you can get the most information possible 22 | (a superset of all other methods): 23 | 24 | ```python 25 | >>> from identify import identify 26 | >>> identify.tags_from_path('/path/to/file.py') 27 | {'file', 'text', 'python', 'non-executable'} 28 | >>> identify.tags_from_path('/path/to/file-with-shebang') 29 | {'file', 'text', 'shell', 'bash', 'executable'} 30 | >>> identify.tags_from_path('/bin/bash') 31 | {'file', 'binary', 'executable'} 32 | >>> identify.tags_from_path('/path/to/directory') 33 | {'directory'} 34 | >>> identify.tags_from_path('/path/to/symlink') 35 | {'symlink'} 36 | ``` 37 | 38 | When using a file on disk, the checks performed are: 39 | 40 | * File type (file, symlink, directory, socket) 41 | * Mode (is it executable?) 
42 | * File name (mostly based on extension) 43 | * If executable, the shebang is read and the interpreter interpreted 44 | 45 | 46 | ### If you only have the filename 47 | 48 | ```python 49 | >>> identify.tags_from_filename('file.py') 50 | {'text', 'python'} 51 | ``` 52 | 53 | 54 | ### If you only have the interpreter 55 | 56 | ```python 57 | >>> identify.tags_from_interpreter('python3.5') 58 | {'python', 'python3'} 59 | >>> identify.tags_from_interpreter('bash') 60 | {'shell', 'bash'} 61 | >>> identify.tags_from_interpreter('some-unrecognized-thing') 62 | set() 63 | ``` 64 | 65 | ### As a cli 66 | 67 | ``` 68 | $ identify-cli --help 69 | usage: identify-cli [-h] [--filename-only] path 70 | 71 | positional arguments: 72 | path 73 | 74 | optional arguments: 75 | -h, --help show this help message and exit 76 | --filename-only 77 | ``` 78 | 79 | ```console 80 | $ identify-cli setup.py; echo $? 81 | ["file", "non-executable", "python", "text"] 82 | 0 83 | $ identify-cli setup.py --filename-only; echo $? 84 | ["python", "text"] 85 | 0 86 | $ identify-cli wat.wat; echo $? 87 | wat.wat does not exist. 88 | 1 89 | $ identify-cli wat.wat --filename-only; echo $? 90 | 1 91 | ``` 92 | 93 | ### Identifying LICENSE files 94 | 95 | `identify` also has an api for determining what type of license is contained 96 | in a file. This routine is roughly based on the approaches used by 97 | [licensee] (the ruby gem that github uses to figure out the license for a 98 | repo). 99 | 100 | The approach that `identify` uses is as follows: 101 | 102 | 1. Strip the copyright line 103 | 2. Normalize all whitespace 104 | 3. Return any exact matches 105 | 4. Return the closest by edit distance (where edit distance < 5%) 106 | 107 | To use the api, install via `pip install identify[license]` 108 | 109 | ```pycon 110 | >>> from identify import identify 111 | >>> identify.license_id('LICENSE') 112 | 'MIT' 113 | ``` 114 | 115 | The return value of the `license_id` function is an [SPDX] id. 
# --- bin/vendor-licenses ----------------------------------------------------
# Usage: ./bin/vendor-licenses > identify/vendor/licenses.py


def main() -> int:
    """Print a Python module defining ``LICENSES`` to stdout.

    Clones https://github.com/github/choosealicense.com into a temporary
    directory, checks out only its ``_licenses`` directory at ``--revision``
    (default ``HEAD``), extracts the ``spdx-id`` and the license body from
    every file in it, and prints the pairs, sorted, as the source text of a
    ``LICENSES = (...)`` tuple.

    Returns:
        ``0`` on success; the caller passes it to ``SystemExit``.

    Raises:
        subprocess.CalledProcessError: if either ``git`` command fails.
        ValueError: if a license file does not contain two ``---`` delimiter
            lines, or does not contain exactly one ``spdx-id:`` line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--revision', default='HEAD')
    args = parser.parse_args()

    licenses = []

    with tempfile.TemporaryDirectory() as tmpdir:
        # clone without a working tree, then materialize only `_licenses`
        subprocess.check_call((
            'git', 'clone', '--no-checkout', '--quiet',
            'https://github.com/github/choosealicense.com', tmpdir,
        ))
        subprocess.check_call((
            'git', '-C', tmpdir, 'checkout', args.revision, '--', '_licenses',
        ))

        licenses_dir = os.path.join(tmpdir, '_licenses')
        for name in os.listdir(licenses_dir):
            # decode explicitly as UTF-8 (the same encoding `license_id`
            # uses) rather than depending on the locale's default encoding
            with open(
                    os.path.join(licenses_dir, name), encoding='UTF-8',
            ) as f:
                contents = f.read()

            # layout: <before> '---' <front matter> '---' <license text>
            _, data, license_text = contents.split('---\n', 2)

            # the trailing comma asserts there is exactly one `spdx-id:` line
            spdx, = (
                line[len('spdx-id:'):].strip()
                for line in data.splitlines()
                if line.startswith('spdx-id:')
            )

            licenses.append((spdx, license_text))

    # NOTE(review): the whitespace inside the string literals below is
    # reproduced exactly as it appears in the (whitespace-collapsed) source;
    # confirm the intended indentation widths against the real file.
    print('LICENSES = (')
    for spdx, text in sorted(licenses):
        print(' (')
        print(f' {spdx!r},')
        print(" '''\\")
        print(text.replace('\t', ' ').replace(' \n', '').strip())
        print("''',")
        print(' ),')
    print(')')
    return 0


# --- identify/cli.py --------------------------------------------------------


def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point: print the tags of ``path`` as a JSON list.

    Args:
        argv: argument list to parse; ``None`` lets ``argparse`` fall back to
            ``sys.argv[1:]``.

    Returns:
        ``0`` after printing a sorted JSON array of tags.  ``1`` if the
        lookup raised ``ValueError`` (its message is printed) or if no tags
        were found (nothing is printed).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename-only', action='store_true')
    parser.add_argument('path')
    args = parser.parse_args(argv)

    # --filename-only never touches the filesystem
    if args.filename_only:
        func = identify.tags_from_filename
    else:
        func = identify.tags_from_path

    try:
        tags = sorted(func(args.path))
    except ValueError as e:
        print(e)
        return 1

    if not tags:
        return 1

    print(json.dumps(tags))
    return 0
-------------------------------------------------------------------------------- /identify/extensions.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | EXTENSIONS = { 3 | 'adoc': {'text', 'asciidoc'}, 4 | 'ai': {'binary', 'adobe-illustrator'}, 5 | 'aj': {'text', 'aspectj'}, 6 | 'asciidoc': {'text', 'asciidoc'}, 7 | 'apinotes': {'text', 'apinotes'}, 8 | 'asar': {'binary', 'asar'}, 9 | 'asm': {'text', 'asm'}, 10 | 'astro': {'text', 'astro'}, 11 | 'avif': {'binary', 'image', 'avif'}, 12 | 'avsc': {'text', 'avro-schema'}, 13 | 'bash': {'text', 'shell', 'bash'}, 14 | 'bat': {'text', 'batch'}, 15 | 'bats': {'text', 'shell', 'bash', 'bats'}, 16 | 'bazel': {'text', 'bazel'}, 17 | 'bb': {'text', 'bitbake'}, 18 | 'bbappend': {'text', 'bitbake'}, 19 | 'bbclass': {'text', 'bitbake'}, 20 | 'beancount': {'text', 'beancount'}, 21 | 'bib': {'text', 'bib'}, 22 | 'bmp': {'binary', 'image', 'bitmap'}, 23 | 'bz2': {'binary', 'bzip2'}, 24 | 'bz3': {'binary', 'bzip3'}, 25 | 'bzl': {'text', 'bazel'}, 26 | 'c': {'text', 'c'}, 27 | 'c++': {'text', 'c++'}, 28 | 'c++m': {'text', 'c++'}, 29 | 'cc': {'text', 'c++'}, 30 | 'ccm': {'text', 'c++'}, 31 | 'cfg': {'text'}, 32 | 'chs': {'text', 'c2hs'}, 33 | 'cjs': {'text', 'javascript'}, 34 | 'clj': {'text', 'clojure'}, 35 | 'cljc': {'text', 'clojure'}, 36 | 'cljs': {'text', 'clojure', 'clojurescript'}, 37 | 'cmake': {'text', 'cmake'}, 38 | 'cnf': {'text'}, 39 | 'coffee': {'text', 'coffee'}, 40 | 'conf': {'text'}, 41 | 'cpp': {'text', 'c++'}, 42 | 'cppm': {'text', 'c++'}, 43 | 'cr': {'text', 'crystal'}, 44 | 'crt': {'text', 'pem'}, 45 | 'cs': {'text', 'c#'}, 46 | 'csproj': {'text', 'xml', 'csproj', 'msbuild'}, 47 | 'csh': {'text', 'shell', 'csh'}, 48 | 'cson': {'text', 'cson'}, 49 | 'css': {'text', 'css'}, 50 | 'csv': {'text', 'csv'}, 51 | 'csx': {'text', 'c#', 'c#script'}, 52 | 'cu': {'text', 'cuda'}, 53 | 'cue': {'text', 'cue'}, 54 | 'cuh': {'text', 'cuda'}, 55 | 'cxx': 
{'text', 'c++'}, 56 | 'cxxm': {'text', 'c++'}, 57 | 'cylc': {'text', 'cylc'}, 58 | 'dart': {'text', 'dart'}, 59 | 'dbc': {'text', 'dbc'}, 60 | 'def': {'text', 'def'}, 61 | 'dll': {'binary'}, 62 | 'dtd': {'text', 'dtd'}, 63 | 'ear': {'binary', 'zip', 'jar'}, 64 | 'edn': {'text', 'clojure', 'edn'}, 65 | 'ejs': {'text', 'ejs'}, 66 | 'ejson': {'text', 'json', 'ejson'}, 67 | 'elm': {'text', 'elm'}, 68 | 'env': {'text', 'dotenv'}, 69 | 'eot': {'binary', 'eot'}, 70 | 'eps': {'binary', 'eps'}, 71 | 'erb': {'text', 'erb'}, 72 | 'erl': {'text', 'erlang'}, 73 | 'ex': {'text', 'elixir'}, 74 | 'exe': {'binary'}, 75 | 'exs': {'text', 'elixir'}, 76 | 'eyaml': {'text', 'yaml'}, 77 | 'f03': {'text', 'fortran'}, 78 | 'f08': {'text', 'fortran'}, 79 | 'f90': {'text', 'fortran'}, 80 | 'f95': {'text', 'fortran'}, 81 | 'feature': {'text', 'gherkin'}, 82 | 'fish': {'text', 'fish'}, 83 | 'fits': {'binary', 'fits'}, 84 | 'fs': {'text', 'f#'}, 85 | 'fsproj': {'text', 'xml', 'fsproj', 'msbuild'}, 86 | 'fsx': {'text', 'f#', 'f#script'}, 87 | 'gd': {'text', 'gdscript'}, 88 | 'gemspec': {'text', 'ruby'}, 89 | 'geojson': {'text', 'geojson', 'json'}, 90 | 'ggb': {'binary', 'zip', 'ggb'}, 91 | 'gif': {'binary', 'image', 'gif'}, 92 | 'gleam': {'text', 'gleam'}, 93 | 'go': {'text', 'go'}, 94 | 'gotmpl': {'text', 'gotmpl'}, 95 | 'gpx': {'text', 'gpx', 'xml'}, 96 | 'graphql': {'text', 'graphql'}, 97 | 'gradle': {'text', 'groovy'}, 98 | 'groovy': {'text', 'groovy'}, 99 | 'gyb': {'text', 'gyb'}, 100 | 'gyp': {'text', 'gyp', 'python'}, 101 | 'gypi': {'text', 'gyp', 'python'}, 102 | 'gz': {'binary', 'gzip'}, 103 | 'h': {'text', 'header', 'c', 'c++'}, 104 | 'hbs': {'text', 'handlebars'}, 105 | 'hcl': {'text', 'hcl'}, 106 | 'hh': {'text', 'header', 'c++'}, 107 | 'hpp': {'text', 'header', 'c++'}, 108 | 'hrl': {'text', 'erlang'}, 109 | 'hs': {'text', 'haskell'}, 110 | 'htm': {'text', 'html'}, 111 | 'html': {'text', 'html'}, 112 | 'hxx': {'text', 'header', 'c++'}, 113 | 'icns': {'binary', 'icns'}, 114 | 'ico': 
{'binary', 'icon'}, 115 | 'ics': {'text', 'icalendar'}, 116 | 'idl': {'text', 'idl'}, 117 | 'idr': {'text', 'idris'}, 118 | 'inc': {'text', 'inc'}, 119 | 'ini': {'text', 'ini'}, 120 | 'inl': {'text', 'inl', 'c++'}, 121 | 'ino': {'text', 'ino', 'c++'}, 122 | 'inx': {'text', 'xml', 'inx'}, 123 | 'ipynb': {'text', 'jupyter', 'json'}, 124 | 'ixx': {'text', 'c++'}, 125 | 'j2': {'text', 'jinja'}, 126 | 'jade': {'text', 'jade'}, 127 | 'jar': {'binary', 'zip', 'jar'}, 128 | 'java': {'text', 'java'}, 129 | 'jenkins': {'text', 'groovy', 'jenkins'}, 130 | 'jenkinsfile': {'text', 'groovy', 'jenkins'}, 131 | 'jinja': {'text', 'jinja'}, 132 | 'jinja2': {'text', 'jinja'}, 133 | 'jl': {'text', 'julia'}, 134 | 'jpeg': {'binary', 'image', 'jpeg'}, 135 | 'jpg': {'binary', 'image', 'jpeg'}, 136 | 'js': {'text', 'javascript'}, 137 | 'json': {'text', 'json'}, 138 | 'jsonld': {'text', 'json', 'jsonld'}, 139 | 'jsonnet': {'text', 'jsonnet'}, 140 | 'json5': {'text', 'json5'}, 141 | 'jsx': {'text', 'jsx'}, 142 | 'key': {'text', 'pem'}, 143 | 'kml': {'text', 'kml', 'xml'}, 144 | 'kt': {'text', 'kotlin'}, 145 | 'kts': {'text', 'kotlin'}, 146 | 'lean': {'text', 'lean'}, 147 | 'lektorproject': {'text', 'ini', 'lektorproject'}, 148 | 'less': {'text', 'less'}, 149 | 'lfm': {'text', 'lazarus', 'lazarus-form'}, 150 | 'lhs': {'text', 'literate-haskell'}, 151 | 'libsonnet': {'text', 'jsonnet'}, 152 | 'lidr': {'text', 'idris'}, 153 | 'liquid': {'text', 'liquid'}, 154 | 'lpi': {'text', 'lazarus', 'xml'}, 155 | 'lpr': {'text', 'lazarus', 'pascal'}, 156 | 'lr': {'text', 'lektor'}, 157 | 'lua': {'text', 'lua'}, 158 | 'm': {'text', 'objective-c'}, 159 | 'm4': {'text', 'm4'}, 160 | 'magik': {'text', 'magik'}, 161 | 'make': {'text', 'makefile'}, 162 | 'manifest': {'text', 'manifest'}, 163 | 'map': {'text', 'map'}, 164 | 'markdown': {'text', 'markdown'}, 165 | 'md': {'text', 'markdown'}, 166 | 'mdx': {'text', 'mdx'}, 167 | 'meson': {'text', 'meson'}, 168 | 'metal': {'text', 'metal'}, 169 | 'mib': {'text', 
'mib'}, 170 | 'mjs': {'text', 'javascript'}, 171 | 'mk': {'text', 'makefile'}, 172 | 'ml': {'text', 'ocaml'}, 173 | 'mli': {'text', 'ocaml'}, 174 | 'mm': {'text', 'c++', 'objective-c++'}, 175 | 'modulemap': {'text', 'modulemap'}, 176 | 'mscx': {'text', 'xml', 'musescore'}, 177 | 'mscz': {'binary', 'zip', 'musescore'}, 178 | 'mustache': {'text', 'mustache'}, 179 | 'myst': {'text', 'myst'}, 180 | 'ngdoc': {'text', 'ngdoc'}, 181 | 'nim': {'text', 'nim'}, 182 | 'nims': {'text', 'nim'}, 183 | 'nimble': {'text', 'nimble'}, 184 | 'nix': {'text', 'nix'}, 185 | 'njk': {'text', 'nunjucks'}, 186 | 'otf': {'binary', 'otf'}, 187 | 'p12': {'binary', 'p12'}, 188 | 'pas': {'text', 'pascal'}, 189 | 'patch': {'text', 'diff'}, 190 | 'pdf': {'binary', 'pdf'}, 191 | 'pem': {'text', 'pem'}, 192 | 'php': {'text', 'php'}, 193 | 'php4': {'text', 'php'}, 194 | 'php5': {'text', 'php'}, 195 | 'phtml': {'text', 'php'}, 196 | 'pl': {'text', 'perl'}, 197 | 'plantuml': {'text', 'plantuml'}, 198 | 'pm': {'text', 'perl'}, 199 | 'png': {'binary', 'image', 'png'}, 200 | 'po': {'text', 'pofile'}, 201 | 'pom': {'pom', 'text', 'xml'}, 202 | 'pp': {'text', 'puppet'}, 203 | 'prisma': {'text', 'prisma'}, 204 | 'properties': {'text', 'java-properties'}, 205 | 'props': {'text', 'xml', 'msbuild'}, 206 | 'proto': {'text', 'proto'}, 207 | 'ps1': {'text', 'powershell'}, 208 | 'psd1': {'text', 'powershell'}, 209 | 'psm1': {'text', 'powershell'}, 210 | 'pug': {'text', 'pug'}, 211 | 'puml': {'text', 'plantuml'}, 212 | 'purs': {'text', 'purescript'}, 213 | 'pxd': {'text', 'cython'}, 214 | 'pxi': {'text', 'cython'}, 215 | 'py': {'text', 'python'}, 216 | 'pyi': {'text', 'pyi'}, 217 | 'pyproj': {'text', 'xml', 'pyproj', 'msbuild'}, 218 | 'pyt': {'text', 'python'}, 219 | 'pyx': {'text', 'cython'}, 220 | 'pyz': {'binary', 'pyz'}, 221 | 'pyzw': {'binary', 'pyz'}, 222 | 'qml': {'text', 'qml'}, 223 | 'r': {'text', 'r'}, 224 | 'rake': {'text', 'ruby'}, 225 | 'rb': {'text', 'ruby'}, 226 | 'resx': {'text', 'resx', 'xml'}, 227 
| 'rng': {'text', 'xml', 'relax-ng'}, 228 | 'rs': {'text', 'rust'}, 229 | 'rst': {'text', 'rst'}, 230 | 's': {'text', 'asm'}, 231 | 'sas': {'text', 'sas'}, 232 | 'sass': {'text', 'sass'}, 233 | 'sbt': {'text', 'sbt', 'scala'}, 234 | 'sc': {'text', 'scala'}, 235 | 'scala': {'text', 'scala'}, 236 | 'scm': {'text', 'scheme'}, 237 | 'scss': {'text', 'scss'}, 238 | 'sh': {'text', 'shell'}, 239 | 'sln': {'text', 'sln'}, 240 | 'sls': {'text', 'salt'}, 241 | 'so': {'binary'}, 242 | 'sol': {'text', 'solidity'}, 243 | 'spec': {'text', 'spec'}, 244 | 'sql': {'text', 'sql'}, 245 | 'ss': {'text', 'scheme'}, 246 | 'sty': {'text', 'tex'}, 247 | 'styl': {'text', 'stylus'}, 248 | 'sv': {'text', 'system-verilog'}, 249 | 'svelte': {'text', 'svelte'}, 250 | 'svg': {'text', 'image', 'svg', 'xml'}, 251 | 'svh': {'text', 'system-verilog'}, 252 | 'swf': {'binary', 'swf'}, 253 | 'swift': {'text', 'swift'}, 254 | 'swiftdeps': {'text', 'swiftdeps'}, 255 | 'tac': {'text', 'twisted', 'python'}, 256 | 'tar': {'binary', 'tar'}, 257 | 'targets': {'text', 'xml', 'msbuild'}, 258 | 'templ': {'text', 'templ'}, 259 | 'tex': {'text', 'tex'}, 260 | 'textproto': {'text', 'textproto'}, 261 | 'tf': {'text', 'terraform'}, 262 | 'tfvars': {'text', 'terraform'}, 263 | 'tgz': {'binary', 'gzip'}, 264 | 'thrift': {'text', 'thrift'}, 265 | 'tiff': {'binary', 'image', 'tiff'}, 266 | 'toml': {'text', 'toml'}, 267 | 'ts': {'text', 'ts'}, 268 | 'tsv': {'text', 'tsv'}, 269 | 'tsx': {'text', 'tsx'}, 270 | 'ttf': {'binary', 'ttf'}, 271 | 'twig': {'text', 'twig'}, 272 | 'txsprofile': {'text', 'ini', 'txsprofile'}, 273 | 'txt': {'text', 'plain-text'}, 274 | 'txtpb': {'text', 'textproto'}, 275 | 'urdf': {'text', 'xml', 'urdf'}, 276 | 'v': {'text', 'verilog'}, 277 | 'vb': {'text', 'vb'}, 278 | 'vbproj': {'text', 'xml', 'vbproj', 'msbuild'}, 279 | 'vcxproj': {'text', 'xml', 'vcxproj', 'msbuild'}, 280 | 'vdx': {'text', 'vdx'}, 281 | 'vh': {'text', 'verilog'}, 282 | 'vhd': {'text', 'vhdl'}, 283 | 'vim': {'text', 'vim'}, 284 | 
'vtl': {'text', 'vtl'}, 285 | 'vue': {'text', 'vue'}, 286 | 'war': {'binary', 'zip', 'jar'}, 287 | 'wav': {'binary', 'audio', 'wav'}, 288 | 'webp': {'binary', 'image', 'webp'}, 289 | 'whl': {'binary', 'wheel', 'zip'}, 290 | 'wkt': {'text', 'wkt'}, 291 | 'woff': {'binary', 'woff'}, 292 | 'woff2': {'binary', 'woff2'}, 293 | 'wsdl': {'text', 'xml', 'wsdl'}, 294 | 'wsgi': {'text', 'wsgi', 'python'}, 295 | 'xhtml': {'text', 'xml', 'html', 'xhtml'}, 296 | 'xacro': {'text', 'xml', 'urdf', 'xacro'}, 297 | 'xctestplan': {'text', 'json'}, 298 | 'xml': {'text', 'xml'}, 299 | 'xq': {'text', 'xquery'}, 300 | 'xql': {'text', 'xquery'}, 301 | 'xqm': {'text', 'xquery'}, 302 | 'xqu': {'text', 'xquery'}, 303 | 'xquery': {'text', 'xquery'}, 304 | 'xqy': {'text', 'xquery'}, 305 | 'xsd': {'text', 'xml', 'xsd'}, 306 | 'xsl': {'text', 'xml', 'xsl'}, 307 | 'xslt': {'text', 'xml', 'xsl'}, 308 | 'yaml': {'text', 'yaml'}, 309 | 'yamlld': {'text', 'yaml', 'yamlld'}, 310 | 'yang': {'text', 'yang'}, 311 | 'yin': {'text', 'xml', 'yin'}, 312 | 'yml': {'text', 'yaml'}, 313 | 'zcml': {'text', 'xml', 'zcml'}, 314 | 'zig': {'text', 'zig'}, 315 | 'zip': {'binary', 'zip'}, 316 | 'zpt': {'text', 'zpt'}, 317 | 'zsh': {'text', 'shell', 'zsh'}, 318 | } 319 | EXTENSIONS_NEED_BINARY_CHECK = { 320 | 'plist': {'plist'}, 321 | 'ppm': {'image', 'ppm'}, 322 | } 323 | 324 | NAMES = { 325 | '.ansible-lint': EXTENSIONS['yaml'], 326 | '.babelrc': EXTENSIONS['json'] | {'babelrc'}, 327 | '.bash_aliases': EXTENSIONS['bash'], 328 | '.bash_profile': EXTENSIONS['bash'], 329 | '.bashrc': EXTENSIONS['bash'], 330 | '.bazelrc': {'text', 'bazelrc'}, 331 | '.bowerrc': EXTENSIONS['json'] | {'bowerrc'}, 332 | '.browserslistrc': {'text', 'browserslistrc'}, 333 | '.clang-format': EXTENSIONS['yaml'], 334 | '.clang-tidy': EXTENSIONS['yaml'], 335 | '.codespellrc': EXTENSIONS['ini'] | {'codespellrc'}, 336 | '.coveragerc': EXTENSIONS['ini'] | {'coveragerc'}, 337 | '.cshrc': EXTENSIONS['csh'], 338 | '.csslintrc': EXTENSIONS['json'] | 
{'csslintrc'}, 339 | '.dockerignore': {'text', 'dockerignore'}, 340 | '.editorconfig': {'text', 'editorconfig'}, 341 | '.envrc': EXTENSIONS['bash'], 342 | '.flake8': EXTENSIONS['ini'] | {'flake8'}, 343 | '.gitattributes': {'text', 'gitattributes'}, 344 | '.gitconfig': EXTENSIONS['ini'] | {'gitconfig'}, 345 | '.gitignore': {'text', 'gitignore'}, 346 | '.gitlint': EXTENSIONS['ini'] | {'gitlint'}, 347 | '.gitmodules': {'text', 'gitmodules'}, 348 | '.hgrc': EXTENSIONS['ini'] | {'hgrc'}, 349 | '.isort.cfg': EXTENSIONS['ini'] | {'isort'}, 350 | '.jshintrc': EXTENSIONS['json'] | {'jshintrc'}, 351 | '.mailmap': {'text', 'mailmap'}, 352 | '.mention-bot': EXTENSIONS['json'] | {'mention-bot'}, 353 | '.npmignore': {'text', 'npmignore'}, 354 | '.pdbrc': EXTENSIONS['py'] | {'pdbrc'}, 355 | '.prettierignore': {'text', 'gitignore', 'prettierignore'}, 356 | '.pypirc': EXTENSIONS['ini'] | {'pypirc'}, 357 | '.rstcheck.cfg': EXTENSIONS['ini'], 358 | '.salt-lint': EXTENSIONS['yaml'] | {'salt-lint'}, 359 | '.sqlfluff': EXTENSIONS['ini'], 360 | '.yamllint': EXTENSIONS['yaml'] | {'yamllint'}, 361 | '.zlogin': EXTENSIONS['zsh'], 362 | '.zlogout': EXTENSIONS['zsh'], 363 | '.zprofile': EXTENSIONS['zsh'], 364 | '.zshrc': EXTENSIONS['zsh'], 365 | '.zshenv': EXTENSIONS['zsh'], 366 | 'AUTHORS': EXTENSIONS['txt'], 367 | 'bblayers.conf': EXTENSIONS['bb'], 368 | 'bitbake.conf': EXTENSIONS['bb'], 369 | 'BUILD': EXTENSIONS['bzl'], 370 | 'Cargo.toml': EXTENSIONS['toml'] | {'cargo'}, 371 | 'Cargo.lock': EXTENSIONS['toml'] | {'cargo-lock'}, 372 | 'CMakeLists.txt': EXTENSIONS['cmake'], 373 | 'CHANGELOG': EXTENSIONS['txt'], 374 | 'config.ru': EXTENSIONS['rb'], 375 | 'Containerfile': {'text', 'dockerfile'}, 376 | 'CONTRIBUTING': EXTENSIONS['txt'], 377 | 'copy.bara.sky': EXTENSIONS['bzl'], 378 | 'COPYING': EXTENSIONS['txt'], 379 | 'Dockerfile': {'text', 'dockerfile'}, 380 | 'direnvrc': EXTENSIONS['bash'], 381 | 'Gemfile': EXTENSIONS['rb'], 382 | 'Gemfile.lock': {'text'}, 383 | 'GNUmakefile': 
EXTENSIONS['mk'], 384 | 'go.mod': {'text', 'go-mod'}, 385 | 'go.sum': {'text', 'go-sum'}, 386 | 'Jenkinsfile': EXTENSIONS['jenkins'], 387 | 'LICENSE': EXTENSIONS['txt'], 388 | 'MAINTAINERS': EXTENSIONS['txt'], 389 | 'Makefile': EXTENSIONS['mk'], 390 | 'meson.build': EXTENSIONS['meson'], 391 | 'meson_options.txt': EXTENSIONS['meson'], 392 | 'makefile': EXTENSIONS['mk'], 393 | 'NEWS': EXTENSIONS['txt'], 394 | 'NOTICE': EXTENSIONS['txt'], 395 | 'PATENTS': EXTENSIONS['txt'], 396 | 'Pipfile': EXTENSIONS['toml'], 397 | 'Pipfile.lock': EXTENSIONS['json'], 398 | 'PKGBUILD': {'text', 'bash', 'pkgbuild', 'alpm'}, 399 | 'poetry.lock': EXTENSIONS['toml'], 400 | 'pom.xml': EXTENSIONS['pom'], 401 | 'pylintrc': EXTENSIONS['ini'] | {'pylintrc'}, 402 | 'README': EXTENSIONS['txt'], 403 | 'Rakefile': EXTENSIONS['rb'], 404 | 'rebar.config': EXTENSIONS['erl'], 405 | 'setup.cfg': EXTENSIONS['ini'], 406 | 'sys.config': EXTENSIONS['erl'], 407 | 'sys.config.src': EXTENSIONS['erl'], 408 | 'Tiltfile': {'text', 'tiltfile'}, 409 | 'Vagrantfile': EXTENSIONS['rb'], 410 | 'WORKSPACE': EXTENSIONS['bzl'], 411 | 'wscript': EXTENSIONS['py'], 412 | } 413 | -------------------------------------------------------------------------------- /identify/identify.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import errno 4 | import math 5 | import os.path 6 | import re 7 | import shlex 8 | import stat 9 | import string 10 | import sys 11 | from typing import IO 12 | 13 | from identify import extensions 14 | from identify import interpreters 15 | from identify.vendor import licenses 16 | 17 | 18 | printable = frozenset(string.printable) 19 | 20 | DIRECTORY = 'directory' 21 | SYMLINK = 'symlink' 22 | SOCKET = 'socket' 23 | FILE = 'file' 24 | EXECUTABLE = 'executable' 25 | NON_EXECUTABLE = 'non-executable' 26 | TEXT = 'text' 27 | BINARY = 'binary' 28 | 29 | TYPE_TAGS = frozenset((DIRECTORY, FILE, SYMLINK, SOCKET)) 30 | MODE_TAGS 
= frozenset((EXECUTABLE, NON_EXECUTABLE)) 31 | ENCODING_TAGS = frozenset((BINARY, TEXT)) 32 | _ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS} 33 | _ALL_TAGS.update(*extensions.EXTENSIONS.values()) 34 | _ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values()) 35 | _ALL_TAGS.update(*extensions.NAMES.values()) 36 | _ALL_TAGS.update(*interpreters.INTERPRETERS.values()) 37 | ALL_TAGS = frozenset(_ALL_TAGS) 38 | 39 | 40 | def tags_from_path(path: str) -> set[str]: 41 | try: 42 | sr = os.lstat(path) 43 | except (OSError, ValueError): # same error-handling as `os.lexists()` 44 | raise ValueError(f'{path} does not exist.') 45 | 46 | mode = sr.st_mode 47 | if stat.S_ISDIR(mode): 48 | return {DIRECTORY} 49 | if stat.S_ISLNK(mode): 50 | return {SYMLINK} 51 | if stat.S_ISSOCK(mode): 52 | return {SOCKET} 53 | 54 | tags = {FILE} 55 | 56 | executable = os.access(path, os.X_OK) 57 | if executable: 58 | tags.add(EXECUTABLE) 59 | else: 60 | tags.add(NON_EXECUTABLE) 61 | 62 | # As an optimization, if we're able to read tags from the filename, then we 63 | # don't peek at the file contents. 64 | t = tags_from_filename(os.path.basename(path)) 65 | if len(t) > 0: 66 | tags.update(t) 67 | else: 68 | if executable: 69 | shebang = parse_shebang_from_file(path) 70 | if len(shebang) > 0: 71 | tags.update(tags_from_interpreter(shebang[0])) 72 | 73 | # some extensions can be both binary and text 74 | # see EXTENSIONS_NEED_BINARY_CHECK 75 | if not ENCODING_TAGS & tags: 76 | if file_is_text(path): 77 | tags.add(TEXT) 78 | else: 79 | tags.add(BINARY) 80 | 81 | assert ENCODING_TAGS & tags, tags 82 | assert MODE_TAGS & tags, tags 83 | return tags 84 | 85 | 86 | def tags_from_filename(path: str) -> set[str]: 87 | _, filename = os.path.split(path) 88 | _, ext = os.path.splitext(filename) 89 | 90 | ret = set() 91 | 92 | # Allow e.g. 
"Dockerfile.xenial" to match "Dockerfile" 93 | for part in [filename] + filename.split('.'): 94 | if part in extensions.NAMES: 95 | ret.update(extensions.NAMES[part]) 96 | break 97 | 98 | if len(ext) > 0: 99 | ext = ext[1:].lower() 100 | if ext in extensions.EXTENSIONS: 101 | ret.update(extensions.EXTENSIONS[ext]) 102 | elif ext in extensions.EXTENSIONS_NEED_BINARY_CHECK: 103 | ret.update(extensions.EXTENSIONS_NEED_BINARY_CHECK[ext]) 104 | 105 | return ret 106 | 107 | 108 | def tags_from_interpreter(interpreter: str) -> set[str]: 109 | _, _, interpreter = interpreter.rpartition('/') 110 | 111 | # Try "python3.5.2" => "python3.5" => "python3" until one matches. 112 | while interpreter: 113 | if interpreter in interpreters.INTERPRETERS: 114 | return interpreters.INTERPRETERS[interpreter] 115 | else: 116 | interpreter, _, _ = interpreter.rpartition('.') 117 | 118 | return set() 119 | 120 | 121 | def is_text(bytesio: IO[bytes]) -> bool: 122 | """Return whether the first KB of contents seems to be binary. 
123 | 124 | This is roughly based on libmagic's binary/text detection: 125 | https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228 126 | """ 127 | text_chars = ( 128 | bytearray([7, 8, 9, 10, 11, 12, 13, 27]) + 129 | bytearray(range(0x20, 0x7F)) + 130 | bytearray(range(0x80, 0X100)) 131 | ) 132 | return not bool(bytesio.read(1024).translate(None, text_chars)) 133 | 134 | 135 | def file_is_text(path: str) -> bool: 136 | if not os.path.lexists(path): 137 | raise ValueError(f'{path} does not exist.') 138 | with open(path, 'rb') as f: 139 | return is_text(f) 140 | 141 | 142 | def _shebang_split(line: str) -> list[str]: 143 | try: 144 | # shebangs aren't supposed to be quoted, though some tools such as 145 | # setuptools will write them with quotes so we'll best-guess parse 146 | # with shlex first 147 | return shlex.split(line) 148 | except ValueError: 149 | # failing that, we'll do a more "traditional" shebang parsing which 150 | # just involves splitting by whitespace 151 | return line.split() 152 | 153 | 154 | def _parse_nix_shebang( 155 | bytesio: IO[bytes], 156 | cmd: tuple[str, ...], 157 | ) -> tuple[str, ...]: 158 | while bytesio.read(2) == b'#!': 159 | next_line_b = bytesio.readline() 160 | try: 161 | next_line = next_line_b.decode('UTF-8') 162 | except UnicodeDecodeError: 163 | return cmd 164 | 165 | for c in next_line: 166 | if c not in printable: 167 | return cmd 168 | 169 | line_tokens = tuple(_shebang_split(next_line.strip())) 170 | for i, token in enumerate(line_tokens[:-1]): 171 | if token != '-i': 172 | continue 173 | # the argument to -i flag 174 | cmd = (line_tokens[i + 1],) 175 | return cmd 176 | 177 | 178 | def parse_shebang(bytesio: IO[bytes]) -> tuple[str, ...]: 179 | """Parse the shebang from a file opened for reading binary.""" 180 | if bytesio.read(2) != b'#!': 181 | return () 182 | first_line_b = bytesio.readline() 183 | try: 184 | first_line = first_line_b.decode('UTF-8') 185 | except 
UnicodeDecodeError: 186 | return () 187 | 188 | # Require only printable ascii 189 | for c in first_line: 190 | if c not in printable: 191 | return () 192 | 193 | cmd = tuple(_shebang_split(first_line.strip())) 194 | if cmd[:2] == ('/usr/bin/env', '-S'): 195 | cmd = cmd[2:] 196 | elif cmd[:1] == ('/usr/bin/env',): 197 | cmd = cmd[1:] 198 | 199 | if cmd == ('nix-shell',): 200 | return _parse_nix_shebang(bytesio, cmd) 201 | 202 | return cmd 203 | 204 | 205 | def parse_shebang_from_file(path: str) -> tuple[str, ...]: 206 | """Parse the shebang given a file path.""" 207 | if not os.path.lexists(path): 208 | raise ValueError(f'{path} does not exist.') 209 | if not os.access(path, os.X_OK): 210 | return () 211 | 212 | try: 213 | with open(path, 'rb') as f: 214 | return parse_shebang(f) 215 | except OSError as e: 216 | if e.errno == errno.EINVAL: 217 | return () 218 | else: 219 | raise 220 | 221 | 222 | COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE) 223 | WS_RE = re.compile(r'\s+') 224 | 225 | 226 | def _norm_license(s: str) -> str: 227 | s = COPYRIGHT_RE.sub('', s) 228 | s = WS_RE.sub(' ', s) 229 | return s.strip() 230 | 231 | 232 | def license_id(filename: str) -> str | None: 233 | """Return the spdx id for the license contained in `filename`. If no 234 | license is detected, returns `None`. 235 | 236 | spdx: https://spdx.org/licenses/ 237 | licenses from choosealicense.com: https://github.com/github/choosealicense.com 238 | 239 | Approximate algorithm: 240 | 241 | 1. strip copyright line 242 | 2. normalize whitespace (replace all whitespace with a single space) 243 | 3. check exact text match with existing licenses 244 | 4.
failing that use edit distance 245 | """ 246 | import ukkonen # `pip install identify[license]` 247 | 248 | with open(filename, encoding='UTF-8') as f: 249 | contents = f.read() 250 | 251 | norm = _norm_license(contents) 252 | 253 | min_edit_dist = sys.maxsize 254 | min_edit_dist_spdx = '' 255 | 256 | cutoff = math.ceil(.05 * len(norm)) 257 | 258 | # try exact matches 259 | for spdx, text in licenses.LICENSES: 260 | norm_license = _norm_license(text) 261 | if norm == norm_license: 262 | return spdx 263 | 264 | # skip the slow calculation if the lengths are very different 265 | if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05: 266 | continue 267 | 268 | edit_dist = ukkonen.distance(norm, norm_license, cutoff) 269 | if edit_dist < cutoff and edit_dist < min_edit_dist: 270 | min_edit_dist = edit_dist 271 | min_edit_dist_spdx = spdx 272 | 273 | # if there's less than 5% edited from the license, we found our match 274 | if norm and min_edit_dist < cutoff: 275 | return min_edit_dist_spdx 276 | else: 277 | # no matches :'( 278 | return None 279 | -------------------------------------------------------------------------------- /identify/interpreters.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | INTERPRETERS = { 3 | 'ash': {'shell', 'ash'}, 4 | 'awk': {'awk'}, 5 | 'bash': {'shell', 'bash'}, 6 | 'bats': {'shell', 'bash', 'bats'}, 7 | 'cbsd': {'shell', 'cbsd'}, 8 | 'csh': {'shell', 'csh'}, 9 | 'dash': {'shell', 'dash'}, 10 | 'expect': {'expect'}, 11 | 'ksh': {'shell', 'ksh'}, 12 | 'node': {'javascript'}, 13 | 'nodejs': {'javascript'}, 14 | 'perl': {'perl'}, 15 | 'php': {'php'}, 16 | 'php7': {'php', 'php7'}, 17 | 'php8': {'php', 'php8'}, 18 | 'python': {'python'}, 19 | 'python2': {'python', 'python2'}, 20 | 'python3': {'python', 'python3'}, 21 | 'ruby': {'ruby'}, 22 | 'sh': {'shell', 'sh'}, 23 | 'tcsh': {'shell', 'tcsh'}, 24 | 'zsh': {'shell', 'zsh'}, 25 | } 26 | 
-------------------------------------------------------------------------------- /identify/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/identify/py.typed -------------------------------------------------------------------------------- /identify/vendor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/identify/vendor/__init__.py -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | covdefaults 2 | coverage 3 | pytest 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = identify 3 | version = 2.6.12 4 | description = File identification library for Python 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | url = https://github.com/pre-commit/identify 8 | author = Chris Kuehl 9 | author_email = ckuehl@ocf.berkeley.edu 10 | license = MIT 11 | license_files = LICENSE 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | Programming Language :: Python :: 3 :: Only 15 | Programming Language :: Python :: Implementation :: CPython 16 | Programming Language :: Python :: Implementation :: PyPy 17 | 18 | [options] 19 | packages = find: 20 | python_requires = >=3.9 21 | 22 | [options.packages.find] 23 | exclude = 24 | tests* 25 | testing* 26 | 27 | [options.entry_points] 28 | console_scripts = 29 | identify-cli=identify.cli:main 30 | 31 | [options.extras_require] 32 | license = 33 | ukkonen 34 | 35 | [options.package_data] 36 | identify = 37 | py.typed 38 | 39 | 
[bdist_wheel] 40 | universal = True 41 | 42 | [coverage:run] 43 | plugins = covdefaults 44 | 45 | [mypy] 46 | check_untyped_defs = true 47 | disallow_any_generics = true 48 | disallow_incomplete_defs = true 49 | disallow_untyped_defs = true 50 | warn_redundant_casts = true 51 | warn_unused_ignores = true 52 | 53 | [mypy-testing.*] 54 | disallow_untyped_defs = false 55 | 56 | [mypy-tests.*] 57 | disallow_untyped_defs = false 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from setuptools import setup 4 | setup() 5 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/tests/__init__.py -------------------------------------------------------------------------------- /tests/cli_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from identify import cli 4 | 5 | 6 | def test_identify_cli(capsys): 7 | ret = cli.main(('setup.py',)) 8 | out, _ = capsys.readouterr() 9 | assert ret == 0 10 | assert out == '["file", "non-executable", "python", "text"]\n' 11 | 12 | 13 | def test_identify_cli_filename_only(capsys): 14 | ret = cli.main(('setup.py', '--filename-only')) 15 | out, _ = capsys.readouterr() 16 | assert ret == 0 17 | assert out == '["python", "text"]\n' 18 | 19 | 20 | def test_identify_cli_filename_only_unidentified(capsys): 21 | ret = cli.main(('x.unknown', '--filename-only')) 22 | out, _ = capsys.readouterr() 23 | assert ret == 1 24 | assert out == '' 25 | 26 | 27 | def test_file_not_found(capsys): 28 | ret = cli.main(('x.unknown',)) 29 | out, _ = capsys.readouterr() 30 | assert ret == 1 
31 | assert out == 'x.unknown does not exist.\n' 32 | -------------------------------------------------------------------------------- /tests/extensions_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from identify import extensions 6 | 7 | 8 | @pytest.mark.parametrize('extension', extensions.EXTENSIONS) 9 | def test_extensions_have_binary_or_text(extension): 10 | tags = extensions.EXTENSIONS[extension] 11 | assert len({'text', 'binary'} & tags) == 1, tags 12 | 13 | 14 | @pytest.mark.parametrize('name', extensions.NAMES) 15 | def test_names_have_binary_or_text(name): 16 | tags = extensions.NAMES[name] 17 | assert len({'text', 'binary'} & tags) == 1, tags 18 | 19 | 20 | @pytest.mark.parametrize('extension', extensions.EXTENSIONS_NEED_BINARY_CHECK) 21 | def test_need_binary_check_do_not_specify_text_binary(extension): 22 | tags = extensions.EXTENSIONS_NEED_BINARY_CHECK[extension] 23 | assert len({'text', 'binary'} & tags) == 0, tags 24 | 25 | 26 | def test_mutually_exclusive_check_types(): 27 | assert not ( 28 | set(extensions.EXTENSIONS) & 29 | set(extensions.EXTENSIONS_NEED_BINARY_CHECK) 30 | ) 31 | -------------------------------------------------------------------------------- /tests/identify_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import builtins 4 | import errno 5 | import io 6 | import os 7 | import socket 8 | import stat 9 | from tempfile import TemporaryDirectory 10 | from unittest import mock 11 | 12 | import pytest 13 | 14 | from identify import identify 15 | 16 | 17 | def test_all_tags_includes_basic_ones(): 18 | assert 'file' in identify.ALL_TAGS 19 | assert 'directory' in identify.ALL_TAGS 20 | assert 'executable' in identify.ALL_TAGS 21 | assert 'text' in identify.ALL_TAGS 22 | assert 'socket' in identify.ALL_TAGS 23 | 24 | 25 | 
@pytest.mark.parametrize( 26 | 'tag_group', 27 | ( 28 | identify.TYPE_TAGS, 29 | identify.MODE_TAGS, 30 | identify.ENCODING_TAGS, 31 | ), 32 | ) 33 | def test_all_tags_contains_all_groups(tag_group): 34 | assert tag_group < identify.ALL_TAGS 35 | 36 | 37 | def test_all_tags_contains_each_type(): 38 | assert 'xml' in identify.ALL_TAGS # extension 39 | assert 'plist' in identify.ALL_TAGS # extension, needs binary check 40 | assert 'dockerfile' in identify.ALL_TAGS # by file convention 41 | assert 'python3' in identify.ALL_TAGS # by shebang 42 | assert 'php8' in identify.ALL_TAGS # by shebang 43 | 44 | 45 | def test_tags_from_path_does_not_exist(tmpdir): 46 | x = tmpdir.join('foo') 47 | with pytest.raises(ValueError): 48 | identify.tags_from_path(x.strpath) 49 | 50 | 51 | def test_tags_from_path_directory(tmpdir): 52 | x = tmpdir.join('foo') 53 | x.mkdir() 54 | assert identify.tags_from_path(x.strpath) == {'directory'} 55 | 56 | 57 | def test_tags_from_path_symlink(tmpdir): 58 | x = tmpdir.join('foo') 59 | x.mksymlinkto(tmpdir.join('lol').ensure()) 60 | assert identify.tags_from_path(x.strpath) == {'symlink'} 61 | 62 | 63 | def test_tags_from_path_socket(): 64 | tmproot = '/tmp' # short path avoids `OSError: AF_UNIX path too long` 65 | with TemporaryDirectory(dir=tmproot) as tmpdir: 66 | socket_path = os.path.join(tmpdir, 'socket') 67 | with socket.socket(socket.AF_UNIX) as sock: 68 | sock.bind(socket_path) 69 | tags = identify.tags_from_path(socket_path) 70 | 71 | assert tags == {'socket'} 72 | 73 | 74 | def test_tags_from_path_broken_symlink(tmpdir): 75 | x = tmpdir.join('foo') 76 | x.mksymlinkto(tmpdir.join('lol')) 77 | assert identify.tags_from_path(x.strpath) == {'symlink'} 78 | 79 | 80 | def test_tags_from_path_simple_file(tmpdir): 81 | x = tmpdir.join('test.py').ensure() 82 | assert identify.tags_from_path(x.strpath) == { 83 | 'file', 'text', 'non-executable', 'python', 84 | } 85 | 86 | 87 | def test_tags_from_path_file_with_incomplete_shebang(tmpdir): 88 | x = 
tmpdir.join('test') 89 | x.write_text('#! \n', encoding='UTF-8') 90 | make_executable(x.strpath) 91 | assert identify.tags_from_path(x.strpath) == { 92 | 'file', 'text', 'executable', 93 | } 94 | 95 | 96 | def test_tags_from_path_file_with_shebang_non_executable(tmpdir): 97 | x = tmpdir.join('test') 98 | x.write_text('#!/usr/bin/env python\nimport sys\n', encoding='UTF-8') 99 | assert identify.tags_from_path(x.strpath) == { 100 | 'file', 'text', 'non-executable', 101 | } 102 | 103 | 104 | def test_tags_from_path_file_with_shebang_executable(tmpdir): 105 | x = tmpdir.join('test') 106 | x.write_text('#!/usr/bin/env python\nimport sys\n', encoding='UTF-8') 107 | make_executable(x.strpath) 108 | assert identify.tags_from_path(x.strpath) == { 109 | 'file', 'text', 'executable', 'python', 110 | } 111 | 112 | 113 | def test_tags_from_path_binary(tmpdir): 114 | x = tmpdir.join('test') 115 | x.write(b'\x7f\x45\x4c\x46\x02\x01\x01') 116 | make_executable(x.strpath) 117 | assert identify.tags_from_path(x.strpath) == { 118 | 'file', 'binary', 'executable', 119 | } 120 | 121 | 122 | def test_tags_from_path_plist_binary(tmpdir): 123 | x = tmpdir.join('t.plist') 124 | x.write_binary( 125 | b'bplist00\xd1\x01\x02_\x10\x0fLast Login NameWDefault\x08\x0b\x1d\x00' 126 | b'\x00\x00\x00\x00\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00' 127 | b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00%', 128 | ) 129 | assert identify.tags_from_path(x.strpath) == { 130 | 'file', 'plist', 'binary', 'non-executable', 131 | } 132 | 133 | 134 | def test_tags_from_path_plist_text(tmpdir): 135 | x = tmpdir.join('t.plist') 136 | x.write( 137 | '\n' 138 | '\n' # noqa: E501 139 | '\n' 140 | '\n' 141 | '\tLast Login Name\n' 142 | '\tDefault\n' 143 | '\n' 144 | '\n', 145 | ) 146 | assert identify.tags_from_path(x.strpath) == { 147 | 'file', 'plist', 'text', 'non-executable', 148 | } 149 | 150 | 151 | @pytest.mark.parametrize( 152 | ('filename', 'expected'), 153 | ( 154 | ('.salt-lint', {'text', 
'salt-lint', 'yaml'}), 155 | ('test.py', {'text', 'python'}), 156 | ('test.mk', {'text', 'makefile'}), 157 | ('Makefile', {'text', 'makefile'}), 158 | ('Containerfile', {'text', 'dockerfile'}), 159 | ('Dockerfile', {'text', 'dockerfile'}), 160 | ('Dockerfile.xenial', {'text', 'dockerfile'}), 161 | ('xenial.Dockerfile', {'text', 'dockerfile'}), 162 | ('Pipfile', {'text', 'toml'}), 163 | ('Pipfile.lock', {'text', 'json'}), 164 | ('mod/test.py', {'text', 'python'}), 165 | ('mod/Dockerfile', {'text', 'dockerfile'}), 166 | ('config.ru', {'text', 'ruby'}), 167 | ('Gemfile', {'text', 'ruby'}), 168 | ('Gemfile.lock', {'text'}), 169 | ('Jenkinsfile', {'text', 'groovy', 'jenkins'}), 170 | ('build.jenkins', {'text', 'groovy', 'jenkins'}), 171 | ('build.jenkinsfile', {'text', 'groovy', 'jenkins'}), 172 | ('meson.build', {'text', 'meson'}), 173 | ('meson_options.txt', {'text', 'plain-text', 'meson'}), 174 | ('Vagrantfile', {'text', 'ruby'}), 175 | ('Tiltfile', {'text', 'tiltfile'}), 176 | ('Tiltfile.abc', {'text', 'tiltfile'}), 177 | ('test.Tiltfile', {'text', 'tiltfile'}), 178 | 179 | # does not set binary / text 180 | ('f.plist', {'plist'}), 181 | 182 | # case of extension should be ignored 183 | ('f.JPG', {'binary', 'image', 'jpeg'}), 184 | # but case of name checks should still be honored 185 | ('dockerfile.py', {'text', 'python'}), 186 | 187 | # full filename tests should take precedence over extension tests 188 | ('test.cfg', {'text'}), 189 | ('setup.cfg', {'text', 'ini'}), 190 | 191 | # Filename matches should still include extensions if applicable 192 | ('README.md', {'text', 'markdown', 'plain-text'}), 193 | 194 | ('test.weird-unrecognized-extension', set()), 195 | ('test', set()), 196 | ('', set()), 197 | ), 198 | ) 199 | def test_tags_from_filename(filename, expected): 200 | assert identify.tags_from_filename(filename) == expected 201 | 202 | 203 | @pytest.mark.parametrize( 204 | ('interpreter', 'expected'), 205 | ( 206 | ('python', {'python'}), 207 | ('python3', 
{'python3', 'python'}), 208 | ('python3.5.2', {'python3', 'python'}), 209 | ('/usr/bin/python3.5.2', {'python3', 'python'}), 210 | ('/usr/bin/herpderpderpderpderp', set()), 211 | ('something-random', set()), 212 | ('', set()), 213 | ), 214 | ) 215 | def test_tags_from_interpreter(interpreter, expected): 216 | assert identify.tags_from_interpreter(interpreter) == expected 217 | 218 | 219 | @pytest.mark.parametrize( 220 | ('data', 'expected'), 221 | ( 222 | (b'hello world', True), 223 | (b'', True), 224 | ('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode(), True), 225 | (r'¯\_(ツ)_/¯'.encode(), True), 226 | ('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode(), True), 227 | ('éóñå'.encode('latin1'), True), 228 | 229 | (b'hello world\x00', False), 230 | # first few bytes of /bin/bash 231 | (b'\x7f\x45\x4c\x46\x02\x01\x01', False), 232 | # some /dev/urandom output 233 | (b'\x43\x92\xd9\x0f\xaf\x32\x2c', False), 234 | ), 235 | ) 236 | def test_is_text(data, expected): 237 | assert identify.is_text(io.BytesIO(data)) is expected 238 | 239 | 240 | def test_file_is_text_simple(tmpdir): 241 | x = tmpdir.join('f') 242 | x.write_text('hello there\n', encoding='UTF-8') 243 | assert identify.file_is_text(x.strpath) is True 244 | 245 | 246 | def test_file_is_text_does_not_exist(tmpdir): 247 | x = tmpdir.join('f') 248 | with pytest.raises(ValueError): 249 | identify.file_is_text(x.strpath) 250 | 251 | 252 | @pytest.mark.parametrize( 253 | ('s', 'expected'), 254 | ( 255 | (b'', ()), 256 | (b'#!/usr/bin/python', ('/usr/bin/python',)), 257 | (b'#!/usr/bin/env python', ('python',)), 258 | (b'#! 
/usr/bin/python', ('/usr/bin/python',)), 259 | (b'#!/usr/bin/foo python', ('/usr/bin/foo', 'python')), 260 | # despite this being invalid, setuptools will write shebangs like this 261 | (b'#!"/path/with spaces/x" y', ('/path/with spaces/x', 'y')), 262 | # this is apparently completely ok to embed quotes 263 | (b"#!/path'with/quotes y", ("/path'with/quotes", 'y')), 264 | # Don't regress on leading/trailing ws 265 | (b"#! /path'with/quotes y ", ("/path'with/quotes", 'y')), 266 | # Test nix-shell specialties with shebang on second line 267 | ( 268 | b'#! /usr/bin/env nix-shell\n' 269 | b'#! nix-shell -i bash -p python', 270 | ('bash',), 271 | ), 272 | ( 273 | b'#! /usr/bin/env nix-shell\n' 274 | b'#! nix-shell -i python -p coreutils', 275 | ('python',), 276 | ), 277 | ( 278 | b'#! /usr/bin/env nix-shell\n' 279 | b'#! nix-shell -p coreutils -i python', 280 | ('python',), 281 | ), 282 | # multi-line and no whitespace variation 283 | ( 284 | b'#! /usr/bin/env nix-shell\n' 285 | b'#! nix-shell -p coreutils\n' 286 | b'#! nix-shell -i python', 287 | ('python',), 288 | ), 289 | ( 290 | b'#! /usr/bin/env nix-shell\n' 291 | b'#!nix-shell -p coreutils\n' 292 | b'#!nix-shell -i python', 293 | ('python',), 294 | ), 295 | ( 296 | b'#! /usr/bin/env nix-shell\n' 297 | b'#!\xf9\x93\x01\x42\xcd', 298 | ('nix-shell',), 299 | ), 300 | ( 301 | b'#! /usr/bin/env nix-shell\n' 302 | b'#!\x00\x00\x00\x00', 303 | ('nix-shell',), 304 | ), 305 | # non-proper nix-shell 306 | (b'#! /usr/bin/nix-shell', ('/usr/bin/nix-shell',)), 307 | (b'#! /usr/bin/env nix-shell', ('nix-shell',)), 308 | ( 309 | b'#! /usr/bin/env nix-shell non-portable-argument', 310 | ('nix-shell', 'non-portable-argument'), 311 | ), 312 | ( 313 | b'#! /usr/bin/env nix-shell\n' 314 | b'#!
nix-shell -i', 315 | ('nix-shell',), # guard against index error 316 | ), 317 | # interpret quotes correctly 318 | ( 319 | b'#!/usr/bin/env nix-shell\n' 320 | b'#!nix-shell --argstr x "a -i python3 p"\n' 321 | b'#!nix-shell -p hello\n' 322 | b'#!nix-shell -i bash\n' 323 | b'#!nix-shell --argstr y "b -i runhaskell q"', 324 | ('bash',), 325 | ), 326 | (b'\xf9\x93\x01\x42\xcd', ()), 327 | (b'#!\xf9\x93\x01\x42\xcd', ()), 328 | (b'#!\x00\x00\x00\x00', ()), 329 | # shebang lines with multiple arguments 330 | (b'#!/usr/bin/env -S python -u', ('python', '-u')), 331 | (b'#!/usr/bin/env', ()), 332 | (b'#!/usr/bin/env -S', ()), 333 | ), 334 | ) 335 | def test_parse_shebang(s, expected): 336 | assert identify.parse_shebang(io.BytesIO(s)) == expected 337 | 338 | 339 | def test_parse_shebang_from_file_does_not_exist(): 340 | with pytest.raises(ValueError): 341 | identify.parse_shebang_from_file('herp derp derp') 342 | 343 | 344 | def test_parse_shebang_from_file_nonexecutable(tmpdir): 345 | x = tmpdir.join('f') 346 | x.write_text('#!/usr/bin/env python', encoding='UTF-8') 347 | assert identify.parse_shebang_from_file(x.strpath) == () 348 | 349 | 350 | def test_parse_shebang_from_file_simple(tmpdir): 351 | x = tmpdir.join('f') 352 | x.write_text('#!/usr/bin/env python', encoding='UTF-8') 353 | make_executable(x.strpath) 354 | assert identify.parse_shebang_from_file(x.strpath) == ('python',) 355 | 356 | 357 | def test_parse_shebang_open_raises_einval(tmpdir): 358 | x = tmpdir.join('f') 359 | x.write('#!/usr/bin/env not-expected\n') 360 | make_executable(x) 361 | error = OSError(errno.EINVAL, f'Invalid argument {x}') 362 | with mock.patch.object(builtins, 'open', side_effect=error): 363 | assert identify.parse_shebang_from_file(x.strpath) == () 364 | 365 | 366 | def make_executable(filename): 367 | original_mode = os.stat(filename).st_mode 368 | os.chmod( 369 | filename, 370 | original_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH, 371 | ) 372 | 373 | 374 | def 
test_license_identification(): 375 | assert identify.license_id('LICENSE') == 'MIT' 376 | 377 | 378 | def test_license_exact_identification(tmpdir): 379 | wtfpl = '''\ 380 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 381 | Version 2, December 2004 382 | 383 | Copyright (C) 2004 Sam Hocevar 384 | 385 | Everyone is permitted to copy and distribute verbatim or modified 386 | copies of this license document, and changing it is allowed as long 387 | as the name is changed. 388 | 389 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 390 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 391 | 392 | 0. You just DO WHAT THE FUCK YOU WANT TO. 393 | ''' 394 | f = tmpdir.join('LICENSE') 395 | f.write(wtfpl) 396 | assert identify.license_id(f.strpath) == 'WTFPL' 397 | 398 | 399 | def test_license_not_identified(): 400 | assert identify.license_id(os.devnull) is None 401 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py,pre-commit 3 | 4 | [testenv] 5 | deps = -rrequirements-dev.txt 6 | extras = license 7 | commands = 8 | coverage erase 9 | coverage run -m pytest {posargs:tests} 10 | coverage report 11 | 12 | [testenv:pre-commit] 13 | skip_install = true 14 | deps = pre-commit 15 | commands = pre-commit run --all-files --show-diff-on-failure 16 | 17 | [pep8] 18 | ignore = E265,E501,W504 19 | --------------------------------------------------------------------------------