├── .activate.sh
├── .deactivate.sh
├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── bin
│   └── vendor-licenses
├── identify
│   ├── __init__.py
│   ├── cli.py
│   ├── extensions.py
│   ├── identify.py
│   ├── interpreters.py
│   ├── py.typed
│   └── vendor
│       ├── __init__.py
│       └── licenses.py
├── requirements-dev.txt
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   ├── cli_test.py
│   ├── extensions_test.py
│   └── identify_test.py
└── tox.ini
/.activate.sh:
--------------------------------------------------------------------------------
1 | venv/bin/activate
--------------------------------------------------------------------------------
/.deactivate.sh:
--------------------------------------------------------------------------------
1 | deactivate
2 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: main
2 |
3 | on:
4 | push:
5 | branches: [main, test-me-*]
6 | tags: '*'
7 | pull_request:
8 |
9 | jobs:
10 | main:
11 | uses: asottile/workflows/.github/workflows/tox.yml@v1.8.1
12 | with:
13 | env: '["py39", "py310", "py311", "py312"]'
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info
2 | *.py[co]
3 | /.coverage
4 | /.tox
5 | /dist
6 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v5.0.0
4 | hooks:
5 | - id: trailing-whitespace
6 | - id: end-of-file-fixer
7 | - id: check-yaml
8 | - id: debug-statements
9 | - id: double-quote-string-fixer
10 | - id: name-tests-test
11 | - id: requirements-txt-fixer
12 | - repo: https://github.com/asottile/setup-cfg-fmt
13 | rev: v2.8.0
14 | hooks:
15 | - id: setup-cfg-fmt
16 | - repo: https://github.com/asottile/reorder-python-imports
17 | rev: v3.15.0
18 | hooks:
19 | - id: reorder-python-imports
20 | args: [--py39-plus, --add-import, 'from __future__ import annotations']
21 | - repo: https://github.com/asottile/add-trailing-comma
22 | rev: v3.2.0
23 | hooks:
24 | - id: add-trailing-comma
25 | - repo: https://github.com/asottile/pyupgrade
26 | rev: v3.20.0
27 | hooks:
28 | - id: pyupgrade
29 | args: [--py39-plus]
30 | - repo: https://github.com/hhatto/autopep8
31 | rev: v2.3.2
32 | hooks:
33 | - id: autopep8
34 | - repo: https://github.com/PyCQA/flake8
35 | rev: 7.2.0
36 | hooks:
37 | - id: flake8
38 | exclude: ^identify/vendor/licenses\.py$
39 | - repo: https://github.com/pre-commit/mirrors-mypy
40 | rev: v1.16.0
41 | hooks:
42 | - id: mypy
43 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Chris Kuehl, Anthony Sottile
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
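1 | [![build status](https://github.com/pre-commit/identify/actions/workflows/main.yml/badge.svg)](https://github.com/pre-commit/identify/actions/workflows/main.yml)
2 | [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/pre-commit/identify/main.svg)](https://results.pre-commit.ci/latest/github/pre-commit/identify/main)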
3 |
4 | identify
5 | ========
6 |
7 | File identification library for Python.
8 |
9 | Given a file (or some information about a file), return a set of standardized
10 | tags identifying what the file is.
11 |
12 | ## Installation
13 |
14 | ```bash
15 | pip install identify
16 | ```
17 |
18 | ## Usage
19 | ### With a file on disk
20 |
21 | If you have an actual file on disk, you can get the most information possible
22 | (a superset of all other methods):
23 |
24 | ```python
25 | >>> from identify import identify
26 | >>> identify.tags_from_path('/path/to/file.py')
27 | {'file', 'text', 'python', 'non-executable'}
28 | >>> identify.tags_from_path('/path/to/file-with-shebang')
29 | {'file', 'text', 'shell', 'bash', 'executable'}
30 | >>> identify.tags_from_path('/bin/bash')
31 | {'file', 'binary', 'executable'}
32 | >>> identify.tags_from_path('/path/to/directory')
33 | {'directory'}
34 | >>> identify.tags_from_path('/path/to/symlink')
35 | {'symlink'}
36 | ```
37 |
38 | When using a file on disk, the checks performed are:
39 |
40 | * File type (file, symlink, directory, socket)
41 | * Mode (is it executable?)
42 | * File name (mostly based on extension)
43 | * If executable, the shebang is read and the interpreter interpreted
44 |
45 |
46 | ### If you only have the filename
47 |
48 | ```python
49 | >>> identify.tags_from_filename('file.py')
50 | {'text', 'python'}
51 | ```
52 |
53 |
54 | ### If you only have the interpreter
55 |
56 | ```python
57 | >>> identify.tags_from_interpreter('python3.5')
58 | {'python', 'python3'}
59 | >>> identify.tags_from_interpreter('bash')
60 | {'shell', 'bash'}
61 | >>> identify.tags_from_interpreter('some-unrecognized-thing')
62 | set()
63 | ```
64 |
65 | ### As a cli
66 |
67 | ```
68 | $ identify-cli --help
69 | usage: identify-cli [-h] [--filename-only] path
70 |
71 | positional arguments:
72 | path
73 |
74 | optional arguments:
75 | -h, --help show this help message and exit
76 | --filename-only
77 | ```
78 |
79 | ```console
80 | $ identify-cli setup.py; echo $?
81 | ["file", "non-executable", "python", "text"]
82 | 0
83 | $ identify-cli setup.py --filename-only; echo $?
84 | ["python", "text"]
85 | 0
86 | $ identify-cli wat.wat; echo $?
87 | wat.wat does not exist.
88 | 1
89 | $ identify-cli wat.wat --filename-only; echo $?
90 | 1
91 | ```
92 |
93 | ### Identifying LICENSE files
94 |
95 | `identify` also has an api for determining what type of license is contained
96 | in a file. This routine is roughly based on the approaches used by
97 | [licensee] (the ruby gem that github uses to figure out the license for a
98 | repo).
99 |
100 | The approach that `identify` uses is as follows:
101 |
102 | 1. Strip the copyright line
103 | 2. Normalize all whitespace
104 | 3. Return any exact matches
105 | 4. Return the closest by edit distance (where edit distance < 5%)
106 |
107 | To use the api, install via `pip install identify[license]`
108 |
109 | ```pycon
110 | >>> from identify import identify
111 | >>> identify.license_id('LICENSE')
112 | 'MIT'
113 | ```
114 |
115 | The return value of the `license_id` function is an [SPDX] id. Currently
116 | licenses are sourced from [choosealicense.com].
117 |
118 | [licensee]: https://github.com/benbalter/licensee
119 | [SPDX]: https://spdx.org/licenses/
120 | [choosealicense.com]: https://github.com/github/choosealicense.com
121 |
122 | ## How it works
123 |
124 | A call to `tags_from_path` does this:
125 |
126 | 1. What is the type: file, symlink, directory? If it's not file, stop here.
127 | 2. Is it executable? Add the appropriate tag.
128 | 3. Do we recognize the file extension? If so, add the appropriate tags, stop
129 | here. These tags would include binary/text.
130 | 4. Peek at the first X bytes of the file. Use these to determine whether it is
131 | binary or text, add the appropriate tag.
132 | 5. If identified as text above, try to read and interpret the shebang, and add
133 | appropriate tags.
134 |
135 | By design, this means we don't need to partially read files where we recognize
136 | the file extension.
137 |
--------------------------------------------------------------------------------
/bin/vendor-licenses:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Usage:
3 |
4 | ./bin/vendor-licenses > identify/vendor/licenses.py
5 | """
6 | from __future__ import annotations
7 |
8 | import argparse
9 | import os.path
10 | import subprocess
11 | import tempfile
12 |
13 |
def main() -> int:
    """Print a freshly generated ``LICENSES`` table to stdout.

    Clones choosealicense.com into a temporary directory, checks out its
    ``_licenses`` directory at ``--revision`` (default ``HEAD``), pulls the
    ``spdx-id`` and the license text out of every file found there, and
    prints the result as Python source, sorted by SPDX id.

    Returns:
        ``0`` on success.

    Raises:
        subprocess.CalledProcessError: if one of the ``git`` commands fails.
        ValueError: if a license file does not contain the two ``---``
            delimiters, or does not have exactly one ``spdx-id:`` line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--revision', default='HEAD')
    args = parser.parse_args()

    licenses = []

    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.check_call((
            'git', 'clone', '--no-checkout', '--quiet',
            'https://github.com/github/choosealicense.com', tmpdir,
        ))
        subprocess.check_call((
            'git', '-C', tmpdir, 'checkout', args.revision, '--', '_licenses',
        ))

        licenses_dir = os.path.join(tmpdir, '_licenses')
        for name in os.listdir(licenses_dir):
            # Read explicitly as UTF-8 so the generated table does not
            # depend on the locale of the machine running this script.
            with open(os.path.join(licenses_dir, name), encoding='UTF-8') as f:
                contents = f.read()

            # Layout: '---\n' <metadata> '---\n' <license text>
            _, data, license_text = contents.split('---\n', 2)

            # Exactly one `spdx-id:` line is expected; the one-element
            # unpacking raises ValueError otherwise.
            spdx, = (
                line[len('spdx-id:'):].strip()
                for line in data.splitlines()
                if line.startswith('spdx-id:')
            )

            licenses.append((spdx, license_text))

    print('LICENSES = (')
    for spdx, text in sorted(licenses):
        print(' (')
        print(f' {spdx!r},')
        print(" '''\\")
        print(text.replace('\t', ' ').replace(' \n', '').strip())
        print("''',")
        print(' ),')
    print(')')
    return 0
56 |
57 |
58 | if __name__ == '__main__':
59 | raise SystemExit(main())
60 |
--------------------------------------------------------------------------------
/identify/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/identify/__init__.py
--------------------------------------------------------------------------------
/identify/cli.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import argparse
4 | import json
5 | from collections.abc import Sequence
6 |
7 | from identify import identify
8 |
9 |
def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point: print the tags for ``path`` as JSON.

    With ``--filename-only`` the tags come from the file name alone;
    otherwise the path is inspected on disk.

    Returns:
        ``0`` when at least one tag was printed; ``1`` when no tag was
        found or when the lookup raised ``ValueError`` (its message is
        printed instead).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename-only', action='store_true')
    parser.add_argument('path')
    options = parser.parse_args(argv)

    tagger = (
        identify.tags_from_filename
        if options.filename_only
        else identify.tags_from_path
    )

    try:
        found = sorted(tagger(options.path))
    except ValueError as exc:
        print(exc)
        return 1

    if found:
        print(json.dumps(found))
        return 0
    return 1
32 |
33 |
34 | if __name__ == '__main__':
35 | raise SystemExit(main())
36 |
--------------------------------------------------------------------------------
/identify/extensions.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | EXTENSIONS = {
3 | 'adoc': {'text', 'asciidoc'},
4 | 'ai': {'binary', 'adobe-illustrator'},
5 | 'aj': {'text', 'aspectj'},
6 | 'asciidoc': {'text', 'asciidoc'},
7 | 'apinotes': {'text', 'apinotes'},
8 | 'asar': {'binary', 'asar'},
9 | 'asm': {'text', 'asm'},
10 | 'astro': {'text', 'astro'},
11 | 'avif': {'binary', 'image', 'avif'},
12 | 'avsc': {'text', 'avro-schema'},
13 | 'bash': {'text', 'shell', 'bash'},
14 | 'bat': {'text', 'batch'},
15 | 'bats': {'text', 'shell', 'bash', 'bats'},
16 | 'bazel': {'text', 'bazel'},
17 | 'bb': {'text', 'bitbake'},
18 | 'bbappend': {'text', 'bitbake'},
19 | 'bbclass': {'text', 'bitbake'},
20 | 'beancount': {'text', 'beancount'},
21 | 'bib': {'text', 'bib'},
22 | 'bmp': {'binary', 'image', 'bitmap'},
23 | 'bz2': {'binary', 'bzip2'},
24 | 'bz3': {'binary', 'bzip3'},
25 | 'bzl': {'text', 'bazel'},
26 | 'c': {'text', 'c'},
27 | 'c++': {'text', 'c++'},
28 | 'c++m': {'text', 'c++'},
29 | 'cc': {'text', 'c++'},
30 | 'ccm': {'text', 'c++'},
31 | 'cfg': {'text'},
32 | 'chs': {'text', 'c2hs'},
33 | 'cjs': {'text', 'javascript'},
34 | 'clj': {'text', 'clojure'},
35 | 'cljc': {'text', 'clojure'},
36 | 'cljs': {'text', 'clojure', 'clojurescript'},
37 | 'cmake': {'text', 'cmake'},
38 | 'cnf': {'text'},
39 | 'coffee': {'text', 'coffee'},
40 | 'conf': {'text'},
41 | 'cpp': {'text', 'c++'},
42 | 'cppm': {'text', 'c++'},
43 | 'cr': {'text', 'crystal'},
44 | 'crt': {'text', 'pem'},
45 | 'cs': {'text', 'c#'},
46 | 'csproj': {'text', 'xml', 'csproj', 'msbuild'},
47 | 'csh': {'text', 'shell', 'csh'},
48 | 'cson': {'text', 'cson'},
49 | 'css': {'text', 'css'},
50 | 'csv': {'text', 'csv'},
51 | 'csx': {'text', 'c#', 'c#script'},
52 | 'cu': {'text', 'cuda'},
53 | 'cue': {'text', 'cue'},
54 | 'cuh': {'text', 'cuda'},
55 | 'cxx': {'text', 'c++'},
56 | 'cxxm': {'text', 'c++'},
57 | 'cylc': {'text', 'cylc'},
58 | 'dart': {'text', 'dart'},
59 | 'dbc': {'text', 'dbc'},
60 | 'def': {'text', 'def'},
61 | 'dll': {'binary'},
62 | 'dtd': {'text', 'dtd'},
63 | 'ear': {'binary', 'zip', 'jar'},
64 | 'edn': {'text', 'clojure', 'edn'},
65 | 'ejs': {'text', 'ejs'},
66 | 'ejson': {'text', 'json', 'ejson'},
67 | 'elm': {'text', 'elm'},
68 | 'env': {'text', 'dotenv'},
69 | 'eot': {'binary', 'eot'},
70 | 'eps': {'binary', 'eps'},
71 | 'erb': {'text', 'erb'},
72 | 'erl': {'text', 'erlang'},
73 | 'ex': {'text', 'elixir'},
74 | 'exe': {'binary'},
75 | 'exs': {'text', 'elixir'},
76 | 'eyaml': {'text', 'yaml'},
77 | 'f03': {'text', 'fortran'},
78 | 'f08': {'text', 'fortran'},
79 | 'f90': {'text', 'fortran'},
80 | 'f95': {'text', 'fortran'},
81 | 'feature': {'text', 'gherkin'},
82 | 'fish': {'text', 'fish'},
83 | 'fits': {'binary', 'fits'},
84 | 'fs': {'text', 'f#'},
85 | 'fsproj': {'text', 'xml', 'fsproj', 'msbuild'},
86 | 'fsx': {'text', 'f#', 'f#script'},
87 | 'gd': {'text', 'gdscript'},
88 | 'gemspec': {'text', 'ruby'},
89 | 'geojson': {'text', 'geojson', 'json'},
90 | 'ggb': {'binary', 'zip', 'ggb'},
91 | 'gif': {'binary', 'image', 'gif'},
92 | 'gleam': {'text', 'gleam'},
93 | 'go': {'text', 'go'},
94 | 'gotmpl': {'text', 'gotmpl'},
95 | 'gpx': {'text', 'gpx', 'xml'},
96 | 'graphql': {'text', 'graphql'},
97 | 'gradle': {'text', 'groovy'},
98 | 'groovy': {'text', 'groovy'},
99 | 'gyb': {'text', 'gyb'},
100 | 'gyp': {'text', 'gyp', 'python'},
101 | 'gypi': {'text', 'gyp', 'python'},
102 | 'gz': {'binary', 'gzip'},
103 | 'h': {'text', 'header', 'c', 'c++'},
104 | 'hbs': {'text', 'handlebars'},
105 | 'hcl': {'text', 'hcl'},
106 | 'hh': {'text', 'header', 'c++'},
107 | 'hpp': {'text', 'header', 'c++'},
108 | 'hrl': {'text', 'erlang'},
109 | 'hs': {'text', 'haskell'},
110 | 'htm': {'text', 'html'},
111 | 'html': {'text', 'html'},
112 | 'hxx': {'text', 'header', 'c++'},
113 | 'icns': {'binary', 'icns'},
114 | 'ico': {'binary', 'icon'},
115 | 'ics': {'text', 'icalendar'},
116 | 'idl': {'text', 'idl'},
117 | 'idr': {'text', 'idris'},
118 | 'inc': {'text', 'inc'},
119 | 'ini': {'text', 'ini'},
120 | 'inl': {'text', 'inl', 'c++'},
121 | 'ino': {'text', 'ino', 'c++'},
122 | 'inx': {'text', 'xml', 'inx'},
123 | 'ipynb': {'text', 'jupyter', 'json'},
124 | 'ixx': {'text', 'c++'},
125 | 'j2': {'text', 'jinja'},
126 | 'jade': {'text', 'jade'},
127 | 'jar': {'binary', 'zip', 'jar'},
128 | 'java': {'text', 'java'},
129 | 'jenkins': {'text', 'groovy', 'jenkins'},
130 | 'jenkinsfile': {'text', 'groovy', 'jenkins'},
131 | 'jinja': {'text', 'jinja'},
132 | 'jinja2': {'text', 'jinja'},
133 | 'jl': {'text', 'julia'},
134 | 'jpeg': {'binary', 'image', 'jpeg'},
135 | 'jpg': {'binary', 'image', 'jpeg'},
136 | 'js': {'text', 'javascript'},
137 | 'json': {'text', 'json'},
138 | 'jsonld': {'text', 'json', 'jsonld'},
139 | 'jsonnet': {'text', 'jsonnet'},
140 | 'json5': {'text', 'json5'},
141 | 'jsx': {'text', 'jsx'},
142 | 'key': {'text', 'pem'},
143 | 'kml': {'text', 'kml', 'xml'},
144 | 'kt': {'text', 'kotlin'},
145 | 'kts': {'text', 'kotlin'},
146 | 'lean': {'text', 'lean'},
147 | 'lektorproject': {'text', 'ini', 'lektorproject'},
148 | 'less': {'text', 'less'},
149 | 'lfm': {'text', 'lazarus', 'lazarus-form'},
150 | 'lhs': {'text', 'literate-haskell'},
151 | 'libsonnet': {'text', 'jsonnet'},
152 | 'lidr': {'text', 'idris'},
153 | 'liquid': {'text', 'liquid'},
154 | 'lpi': {'text', 'lazarus', 'xml'},
155 | 'lpr': {'text', 'lazarus', 'pascal'},
156 | 'lr': {'text', 'lektor'},
157 | 'lua': {'text', 'lua'},
158 | 'm': {'text', 'objective-c'},
159 | 'm4': {'text', 'm4'},
160 | 'magik': {'text', 'magik'},
161 | 'make': {'text', 'makefile'},
162 | 'manifest': {'text', 'manifest'},
163 | 'map': {'text', 'map'},
164 | 'markdown': {'text', 'markdown'},
165 | 'md': {'text', 'markdown'},
166 | 'mdx': {'text', 'mdx'},
167 | 'meson': {'text', 'meson'},
168 | 'metal': {'text', 'metal'},
169 | 'mib': {'text', 'mib'},
170 | 'mjs': {'text', 'javascript'},
171 | 'mk': {'text', 'makefile'},
172 | 'ml': {'text', 'ocaml'},
173 | 'mli': {'text', 'ocaml'},
174 | 'mm': {'text', 'c++', 'objective-c++'},
175 | 'modulemap': {'text', 'modulemap'},
176 | 'mscx': {'text', 'xml', 'musescore'},
177 | 'mscz': {'binary', 'zip', 'musescore'},
178 | 'mustache': {'text', 'mustache'},
179 | 'myst': {'text', 'myst'},
180 | 'ngdoc': {'text', 'ngdoc'},
181 | 'nim': {'text', 'nim'},
182 | 'nims': {'text', 'nim'},
183 | 'nimble': {'text', 'nimble'},
184 | 'nix': {'text', 'nix'},
185 | 'njk': {'text', 'nunjucks'},
186 | 'otf': {'binary', 'otf'},
187 | 'p12': {'binary', 'p12'},
188 | 'pas': {'text', 'pascal'},
189 | 'patch': {'text', 'diff'},
190 | 'pdf': {'binary', 'pdf'},
191 | 'pem': {'text', 'pem'},
192 | 'php': {'text', 'php'},
193 | 'php4': {'text', 'php'},
194 | 'php5': {'text', 'php'},
195 | 'phtml': {'text', 'php'},
196 | 'pl': {'text', 'perl'},
197 | 'plantuml': {'text', 'plantuml'},
198 | 'pm': {'text', 'perl'},
199 | 'png': {'binary', 'image', 'png'},
200 | 'po': {'text', 'pofile'},
201 | 'pom': {'pom', 'text', 'xml'},
202 | 'pp': {'text', 'puppet'},
203 | 'prisma': {'text', 'prisma'},
204 | 'properties': {'text', 'java-properties'},
205 | 'props': {'text', 'xml', 'msbuild'},
206 | 'proto': {'text', 'proto'},
207 | 'ps1': {'text', 'powershell'},
208 | 'psd1': {'text', 'powershell'},
209 | 'psm1': {'text', 'powershell'},
210 | 'pug': {'text', 'pug'},
211 | 'puml': {'text', 'plantuml'},
212 | 'purs': {'text', 'purescript'},
213 | 'pxd': {'text', 'cython'},
214 | 'pxi': {'text', 'cython'},
215 | 'py': {'text', 'python'},
216 | 'pyi': {'text', 'pyi'},
217 | 'pyproj': {'text', 'xml', 'pyproj', 'msbuild'},
218 | 'pyt': {'text', 'python'},
219 | 'pyx': {'text', 'cython'},
220 | 'pyz': {'binary', 'pyz'},
221 | 'pyzw': {'binary', 'pyz'},
222 | 'qml': {'text', 'qml'},
223 | 'r': {'text', 'r'},
224 | 'rake': {'text', 'ruby'},
225 | 'rb': {'text', 'ruby'},
226 | 'resx': {'text', 'resx', 'xml'},
227 | 'rng': {'text', 'xml', 'relax-ng'},
228 | 'rs': {'text', 'rust'},
229 | 'rst': {'text', 'rst'},
230 | 's': {'text', 'asm'},
231 | 'sas': {'text', 'sas'},
232 | 'sass': {'text', 'sass'},
233 | 'sbt': {'text', 'sbt', 'scala'},
234 | 'sc': {'text', 'scala'},
235 | 'scala': {'text', 'scala'},
236 | 'scm': {'text', 'scheme'},
237 | 'scss': {'text', 'scss'},
238 | 'sh': {'text', 'shell'},
239 | 'sln': {'text', 'sln'},
240 | 'sls': {'text', 'salt'},
241 | 'so': {'binary'},
242 | 'sol': {'text', 'solidity'},
243 | 'spec': {'text', 'spec'},
244 | 'sql': {'text', 'sql'},
245 | 'ss': {'text', 'scheme'},
246 | 'sty': {'text', 'tex'},
247 | 'styl': {'text', 'stylus'},
248 | 'sv': {'text', 'system-verilog'},
249 | 'svelte': {'text', 'svelte'},
250 | 'svg': {'text', 'image', 'svg', 'xml'},
251 | 'svh': {'text', 'system-verilog'},
252 | 'swf': {'binary', 'swf'},
253 | 'swift': {'text', 'swift'},
254 | 'swiftdeps': {'text', 'swiftdeps'},
255 | 'tac': {'text', 'twisted', 'python'},
256 | 'tar': {'binary', 'tar'},
257 | 'targets': {'text', 'xml', 'msbuild'},
258 | 'templ': {'text', 'templ'},
259 | 'tex': {'text', 'tex'},
260 | 'textproto': {'text', 'textproto'},
261 | 'tf': {'text', 'terraform'},
262 | 'tfvars': {'text', 'terraform'},
263 | 'tgz': {'binary', 'gzip'},
264 | 'thrift': {'text', 'thrift'},
265 | 'tiff': {'binary', 'image', 'tiff'},
266 | 'toml': {'text', 'toml'},
267 | 'ts': {'text', 'ts'},
268 | 'tsv': {'text', 'tsv'},
269 | 'tsx': {'text', 'tsx'},
270 | 'ttf': {'binary', 'ttf'},
271 | 'twig': {'text', 'twig'},
272 | 'txsprofile': {'text', 'ini', 'txsprofile'},
273 | 'txt': {'text', 'plain-text'},
274 | 'txtpb': {'text', 'textproto'},
275 | 'urdf': {'text', 'xml', 'urdf'},
276 | 'v': {'text', 'verilog'},
277 | 'vb': {'text', 'vb'},
278 | 'vbproj': {'text', 'xml', 'vbproj', 'msbuild'},
279 | 'vcxproj': {'text', 'xml', 'vcxproj', 'msbuild'},
280 | 'vdx': {'text', 'vdx'},
281 | 'vh': {'text', 'verilog'},
282 | 'vhd': {'text', 'vhdl'},
283 | 'vim': {'text', 'vim'},
284 | 'vtl': {'text', 'vtl'},
285 | 'vue': {'text', 'vue'},
286 | 'war': {'binary', 'zip', 'jar'},
287 | 'wav': {'binary', 'audio', 'wav'},
288 | 'webp': {'binary', 'image', 'webp'},
289 | 'whl': {'binary', 'wheel', 'zip'},
290 | 'wkt': {'text', 'wkt'},
291 | 'woff': {'binary', 'woff'},
292 | 'woff2': {'binary', 'woff2'},
293 | 'wsdl': {'text', 'xml', 'wsdl'},
294 | 'wsgi': {'text', 'wsgi', 'python'},
295 | 'xhtml': {'text', 'xml', 'html', 'xhtml'},
296 | 'xacro': {'text', 'xml', 'urdf', 'xacro'},
297 | 'xctestplan': {'text', 'json'},
298 | 'xml': {'text', 'xml'},
299 | 'xq': {'text', 'xquery'},
300 | 'xql': {'text', 'xquery'},
301 | 'xqm': {'text', 'xquery'},
302 | 'xqu': {'text', 'xquery'},
303 | 'xquery': {'text', 'xquery'},
304 | 'xqy': {'text', 'xquery'},
305 | 'xsd': {'text', 'xml', 'xsd'},
306 | 'xsl': {'text', 'xml', 'xsl'},
307 | 'xslt': {'text', 'xml', 'xsl'},
308 | 'yaml': {'text', 'yaml'},
309 | 'yamlld': {'text', 'yaml', 'yamlld'},
310 | 'yang': {'text', 'yang'},
311 | 'yin': {'text', 'xml', 'yin'},
312 | 'yml': {'text', 'yaml'},
313 | 'zcml': {'text', 'xml', 'zcml'},
314 | 'zig': {'text', 'zig'},
315 | 'zip': {'binary', 'zip'},
316 | 'zpt': {'text', 'zpt'},
317 | 'zsh': {'text', 'shell', 'zsh'},
318 | }
# Extensions whose files can be either binary or text. Unlike EXTENSIONS,
# these entries carry no 'text' / 'binary' tag of their own.
EXTENSIONS_NEED_BINARY_CHECK = {
    'plist': {'plist'},
    'ppm': {'image', 'ppm'},
}
323 |
324 | NAMES = {
325 | '.ansible-lint': EXTENSIONS['yaml'],
326 | '.babelrc': EXTENSIONS['json'] | {'babelrc'},
327 | '.bash_aliases': EXTENSIONS['bash'],
328 | '.bash_profile': EXTENSIONS['bash'],
329 | '.bashrc': EXTENSIONS['bash'],
330 | '.bazelrc': {'text', 'bazelrc'},
331 | '.bowerrc': EXTENSIONS['json'] | {'bowerrc'},
332 | '.browserslistrc': {'text', 'browserslistrc'},
333 | '.clang-format': EXTENSIONS['yaml'],
334 | '.clang-tidy': EXTENSIONS['yaml'],
335 | '.codespellrc': EXTENSIONS['ini'] | {'codespellrc'},
336 | '.coveragerc': EXTENSIONS['ini'] | {'coveragerc'},
337 | '.cshrc': EXTENSIONS['csh'],
338 | '.csslintrc': EXTENSIONS['json'] | {'csslintrc'},
339 | '.dockerignore': {'text', 'dockerignore'},
340 | '.editorconfig': {'text', 'editorconfig'},
341 | '.envrc': EXTENSIONS['bash'],
342 | '.flake8': EXTENSIONS['ini'] | {'flake8'},
343 | '.gitattributes': {'text', 'gitattributes'},
344 | '.gitconfig': EXTENSIONS['ini'] | {'gitconfig'},
345 | '.gitignore': {'text', 'gitignore'},
346 | '.gitlint': EXTENSIONS['ini'] | {'gitlint'},
347 | '.gitmodules': {'text', 'gitmodules'},
348 | '.hgrc': EXTENSIONS['ini'] | {'hgrc'},
349 | '.isort.cfg': EXTENSIONS['ini'] | {'isort'},
350 | '.jshintrc': EXTENSIONS['json'] | {'jshintrc'},
351 | '.mailmap': {'text', 'mailmap'},
352 | '.mention-bot': EXTENSIONS['json'] | {'mention-bot'},
353 | '.npmignore': {'text', 'npmignore'},
354 | '.pdbrc': EXTENSIONS['py'] | {'pdbrc'},
355 | '.prettierignore': {'text', 'gitignore', 'prettierignore'},
356 | '.pypirc': EXTENSIONS['ini'] | {'pypirc'},
357 | '.rstcheck.cfg': EXTENSIONS['ini'],
358 | '.salt-lint': EXTENSIONS['yaml'] | {'salt-lint'},
359 | '.sqlfluff': EXTENSIONS['ini'],
360 | '.yamllint': EXTENSIONS['yaml'] | {'yamllint'},
361 | '.zlogin': EXTENSIONS['zsh'],
362 | '.zlogout': EXTENSIONS['zsh'],
363 | '.zprofile': EXTENSIONS['zsh'],
364 | '.zshrc': EXTENSIONS['zsh'],
365 | '.zshenv': EXTENSIONS['zsh'],
366 | 'AUTHORS': EXTENSIONS['txt'],
367 | 'bblayers.conf': EXTENSIONS['bb'],
368 | 'bitbake.conf': EXTENSIONS['bb'],
369 | 'BUILD': EXTENSIONS['bzl'],
370 | 'Cargo.toml': EXTENSIONS['toml'] | {'cargo'},
371 | 'Cargo.lock': EXTENSIONS['toml'] | {'cargo-lock'},
372 | 'CMakeLists.txt': EXTENSIONS['cmake'],
373 | 'CHANGELOG': EXTENSIONS['txt'],
374 | 'config.ru': EXTENSIONS['rb'],
375 | 'Containerfile': {'text', 'dockerfile'},
376 | 'CONTRIBUTING': EXTENSIONS['txt'],
377 | 'copy.bara.sky': EXTENSIONS['bzl'],
378 | 'COPYING': EXTENSIONS['txt'],
379 | 'Dockerfile': {'text', 'dockerfile'},
380 | 'direnvrc': EXTENSIONS['bash'],
381 | 'Gemfile': EXTENSIONS['rb'],
382 | 'Gemfile.lock': {'text'},
383 | 'GNUmakefile': EXTENSIONS['mk'],
384 | 'go.mod': {'text', 'go-mod'},
385 | 'go.sum': {'text', 'go-sum'},
386 | 'Jenkinsfile': EXTENSIONS['jenkins'],
387 | 'LICENSE': EXTENSIONS['txt'],
388 | 'MAINTAINERS': EXTENSIONS['txt'],
389 | 'Makefile': EXTENSIONS['mk'],
390 | 'meson.build': EXTENSIONS['meson'],
391 | 'meson_options.txt': EXTENSIONS['meson'],
392 | 'makefile': EXTENSIONS['mk'],
393 | 'NEWS': EXTENSIONS['txt'],
394 | 'NOTICE': EXTENSIONS['txt'],
395 | 'PATENTS': EXTENSIONS['txt'],
396 | 'Pipfile': EXTENSIONS['toml'],
397 | 'Pipfile.lock': EXTENSIONS['json'],
398 | 'PKGBUILD': {'text', 'bash', 'pkgbuild', 'alpm'},
399 | 'poetry.lock': EXTENSIONS['toml'],
400 | 'pom.xml': EXTENSIONS['pom'],
401 | 'pylintrc': EXTENSIONS['ini'] | {'pylintrc'},
402 | 'README': EXTENSIONS['txt'],
403 | 'Rakefile': EXTENSIONS['rb'],
404 | 'rebar.config': EXTENSIONS['erl'],
405 | 'setup.cfg': EXTENSIONS['ini'],
406 | 'sys.config': EXTENSIONS['erl'],
407 | 'sys.config.src': EXTENSIONS['erl'],
408 | 'Tiltfile': {'text', 'tiltfile'},
409 | 'Vagrantfile': EXTENSIONS['rb'],
410 | 'WORKSPACE': EXTENSIONS['bzl'],
411 | 'wscript': EXTENSIONS['py'],
412 | }
413 |
--------------------------------------------------------------------------------
/identify/identify.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import errno
4 | import math
5 | import os.path
6 | import re
7 | import shlex
8 | import stat
9 | import string
10 | import sys
11 | from typing import IO
12 |
13 | from identify import extensions
14 | from identify import interpreters
15 | from identify.vendor import licenses
16 |
17 |
# Characters allowed on a shebang line; used by the shebang parsers below.
printable = frozenset(string.printable)

# Tags describing what kind of filesystem entry a path is.
DIRECTORY = 'directory'
SYMLINK = 'symlink'
SOCKET = 'socket'
FILE = 'file'
# Tags describing whether a regular file is executable.
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
# Tags describing whether a regular file's contents are text or binary.
TEXT = 'text'
BINARY = 'binary'

TYPE_TAGS = frozenset((DIRECTORY, FILE, SYMLINK, SOCKET))
MODE_TAGS = frozenset((EXECUTABLE, NON_EXECUTABLE))
ENCODING_TAGS = frozenset((BINARY, TEXT))
# ALL_TAGS is the union of the fixed tags above and every tag that appears
# as a value in the extension, file-name and interpreter tables.
_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
_ALL_TAGS.update(*extensions.EXTENSIONS.values())
_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
_ALL_TAGS.update(*extensions.NAMES.values())
_ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(_ALL_TAGS)
38 |
39 |
def tags_from_path(path: str) -> set[str]:
    """Return the set of tags describing the filesystem entry at ``path``.

    Directories, symlinks and sockets yield only their type tag. A regular
    file gets ``file``, an executable / non-executable tag, the tags
    derived from its name (or, failing that, from its shebang when it is
    executable), and a ``text`` / ``binary`` tag.

    Raises:
        ValueError: if ``path`` cannot be ``lstat``-ed.
    """
    try:
        mode = os.lstat(path).st_mode
    except (OSError, ValueError):  # same error-handling as `os.lexists()`
        raise ValueError(f'{path} does not exist.')

    # Non-regular entries are reported by type alone.
    for is_type, type_tag in (
            (stat.S_ISDIR, DIRECTORY),
            (stat.S_ISLNK, SYMLINK),
            (stat.S_ISSOCK, SOCKET),
    ):
        if is_type(mode):
            return {type_tag}

    executable = os.access(path, os.X_OK)
    tags = {FILE, EXECUTABLE if executable else NON_EXECUTABLE}

    # As an optimization, if we're able to read tags from the filename, then
    # we don't peek at the file contents.
    name_tags = tags_from_filename(os.path.basename(path))
    if name_tags:
        tags |= name_tags
    elif executable:
        shebang = parse_shebang_from_file(path)
        if shebang:
            tags |= tags_from_interpreter(shebang[0])

    # some extensions can be both binary and text
    # see EXTENSIONS_NEED_BINARY_CHECK
    if tags.isdisjoint(ENCODING_TAGS):
        tags.add(TEXT if file_is_text(path) else BINARY)

    assert ENCODING_TAGS & tags, tags
    assert MODE_TAGS & tags, tags
    return tags
84 |
85 |
def tags_from_filename(path: str) -> set[str]:
    """Return the tags that can be derived from the file name alone.

    Only the final path component is considered. Tags come from a match in
    ``extensions.NAMES`` (on the whole name or any dot-separated part of
    it) plus a match of the lower-cased extension in
    ``extensions.EXTENSIONS`` or ``extensions.EXTENSIONS_NEED_BINARY_CHECK``.
    An empty set is returned when nothing matches.
    """
    filename = os.path.basename(path)
    found: set[str] = set()

    # Allow e.g. "Dockerfile.xenial" to match "Dockerfile"
    candidates = (filename, *filename.split('.'))
    known_name = next(
        (part for part in candidates if part in extensions.NAMES),
        None,
    )
    if known_name is not None:
        found.update(extensions.NAMES[known_name])

    suffix = os.path.splitext(filename)[1]
    if suffix:
        ext = suffix[1:].lower()
        # the first table that knows the extension wins
        for table in (
                extensions.EXTENSIONS,
                extensions.EXTENSIONS_NEED_BINARY_CHECK,
        ):
            if ext in table:
                found.update(table[ext])
                break

    return found
106 |
107 |
def tags_from_interpreter(interpreter: str) -> set[str]:
    """Return the tags for an interpreter name or path.

    Any directory prefix is dropped, then progressively shorter versions
    of the name are looked up in ``interpreters.INTERPRETERS``.

    Returns:
        A new set of tags for the first name that matches, or an empty set
        when none does. The set is a copy, so callers may mutate it without
        affecting the lookup table.
    """
    _, _, interpreter = interpreter.rpartition('/')

    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
    while interpreter:
        if interpreter in interpreters.INTERPRETERS:
            # Copy: handing out the table's own set would let a caller
            # corrupt the shared lookup data.
            return set(interpreters.INTERPRETERS[interpreter])
        interpreter, _, _ = interpreter.rpartition('.')

    return set()
119 |
120 |
def is_text(bytesio: IO[bytes]) -> bool:
    """Return whether the first KB of contents seems to be text.

    Reads up to 1024 bytes from ``bytesio`` and returns ``True`` only if
    every byte read is one of the accepted "text" byte values (an empty
    read therefore counts as text).

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
    """
    allowed = (
        bytes((7, 8, 9, 10, 11, 12, 13, 27)) +
        bytes(range(0x20, 0x7F)) +
        bytes(range(0x80, 0x100))
    )
    head = bytesio.read(1024)
    # whatever survives deleting the allowed bytes is non-text
    leftover = head.translate(None, allowed)
    return len(leftover) == 0
133 |
134 |
def file_is_text(path: str) -> bool:
    """Return whether the file at ``path`` looks like text (see `is_text`).

    Raises:
        ValueError: if ``path`` does not exist.
    """
    if os.path.lexists(path):
        with open(path, 'rb') as f:
            return is_text(f)
    raise ValueError(f'{path} does not exist.')
140 |
141 |
def _shebang_split(line: str) -> list[str]:
    """Split a shebang line into tokens.

    shebangs aren't supposed to be quoted, though some tools such as
    setuptools will write them with quotes, so `shlex` is tried first. If
    it rejects the line, fall back to the more "traditional" parsing of
    splitting on whitespace.
    """
    try:
        tokens = shlex.split(line)
    except ValueError:
        tokens = line.split()
    return tokens
152 |
153 |
def _parse_nix_shebang(
        bytesio: IO[bytes],
        cmd: tuple[str, ...],
) -> tuple[str, ...]:
    """Refine ``cmd`` from the ``#!`` lines that follow in ``bytesio``.

    Consecutive lines starting with ``#!`` are read; whenever such a line
    contains ``-i`` followed by another token, that token becomes the new
    one-element command. Reading stops at the first line that does not
    start with ``#!``. If a line is not valid UTF-8 or contains a
    character outside ``printable``, the command found so far is returned.
    """
    while True:
        if bytesio.read(2) != b'#!':
            break

        raw_line = bytesio.readline()
        try:
            text = raw_line.decode('UTF-8')
        except UnicodeDecodeError:
            return cmd

        if not printable.issuperset(text):
            return cmd

        tokens = _shebang_split(text.strip())
        # pair each token with its successor: the argument to the -i flag
        for flag, value in zip(tokens, tokens[1:]):
            if flag == '-i':
                cmd = (value,)
    return cmd
176 |
177 |
def parse_shebang(bytesio: IO[bytes]) -> tuple[str, ...]:
    """Parse the shebang from a file opened for reading binary.

    Returns the shebang command as a tuple of tokens, with a leading
    `/usr/bin/env` (and its `-S` flag, if present) removed.  Returns `()`
    when the stream does not start with `#!`, or when the first line is not
    valid UTF-8 or contains characters outside `string.printable`.  A bare
    `nix-shell` command is resolved further from the following `#!` lines.
    """
    if bytesio.read(2) != b'#!':
        return ()

    raw_line = bytesio.readline()
    try:
        line = raw_line.decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    if not all(ch in printable for ch in line):
        return ()

    cmd = tuple(_shebang_split(line.strip()))
    if cmd[:1] == ('/usr/bin/env',):
        # drop `env`, and the `-S` flag too when it directly follows
        skip = 2 if cmd[1:2] == ('-S',) else 1
        cmd = cmd[skip:]

    if cmd == ('nix-shell',):
        return _parse_nix_shebang(bytesio, cmd)
    return cmd
203 |
204 |
def parse_shebang_from_file(path: str) -> tuple[str, ...]:
    """Parse the shebang given a file path.

    Raises ValueError when `path` does not exist.  Returns `()` for files
    that are not executable and when opening the file fails with EINVAL;
    any other OSError is propagated.
    """
    if not os.path.lexists(path):
        raise ValueError(f'{path} does not exist.')

    if os.access(path, os.X_OK):
        try:
            with open(path, 'rb') as handle:
                return parse_shebang(handle)
        except OSError as exc:
            # only EINVAL is tolerated; everything else is a real failure
            if exc.errno != errno.EINVAL:
                raise

    return ()
220 |
221 |
# A whole line that starts (after optional whitespace) with "Copyright" or
# "(C)" followed by a space; matched case-insensitively, line by line.
COPYRIGHT_RE = re.compile(
    r'^\s*(Copyright|\(C\)) .*$',
    flags=re.IGNORECASE | re.MULTILINE,
)
# Any run of whitespace.
WS_RE = re.compile(r'\s+')


def _norm_license(s: str) -> str:
    """Normalize license text so that two copies can be compared.

    Copyright lines are removed, every run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is stripped.
    """
    without_copyright = COPYRIGHT_RE.sub('', s)
    collapsed = WS_RE.sub(' ', without_copyright)
    return collapsed.strip()
230 |
231 |
def license_id(filename: str) -> str | None:
    """Return the spdx id for the license contained in `filename`.

    Returns `None` when no license is detected.

    spdx: https://spdx.org/licenses/
    licenses from choosealicense.com: https://github.com/choosealicense.com

    Approximate algorithm:

    1. strip copyright line
    2. normalize whitespace (replace all whitespace with a single space)
    3. check exact text match with existing licenses
    4. failing that use edit distance
    """
    import ukkonen  # `pip install identify[license]`

    with open(filename, encoding='UTF-8') as f:
        norm = _norm_license(f.read())

    # a candidate must differ by fewer than ~5% of the text to count
    cutoff = math.ceil(.05 * len(norm))

    best_dist = sys.maxsize
    best_spdx = ''

    for spdx, text in licenses.LICENSES:
        candidate = _norm_license(text)

        # an exact match wins immediately
        if candidate == norm:
            return spdx

        # skip the slow calculation if the lengths are very different
        if norm:
            length_ratio = abs(len(norm) - len(candidate)) / len(norm)
            if length_ratio > .05:
                continue

        dist = ukkonen.distance(norm, candidate, cutoff)
        # keep the closest candidate that is also within the cutoff
        if dist < min(cutoff, best_dist):
            best_dist, best_spdx = dist, spdx

    # if there's less than 5% edited from the license, we found our match
    if norm and best_dist < cutoff:
        return best_spdx

    # no matches :'(
    return None
279 |
--------------------------------------------------------------------------------
/identify/interpreters.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
# Maps an interpreter name to the set of tags reported for it.  Keys are
# bare command names; several entries carry a broader tag alongside the
# specific one (e.g. every shell also gets 'shell', versioned pythons also
# get 'python').
INTERPRETERS = {
    'ash': {'shell', 'ash'},
    'awk': {'awk'},
    'bash': {'shell', 'bash'},
    'bats': {'shell', 'bash', 'bats'},
    'cbsd': {'shell', 'cbsd'},
    'csh': {'shell', 'csh'},
    'dash': {'shell', 'dash'},
    'expect': {'expect'},
    'ksh': {'shell', 'ksh'},
    'node': {'javascript'},
    'nodejs': {'javascript'},
    'perl': {'perl'},
    'php': {'php'},
    'php7': {'php', 'php7'},
    'php8': {'php', 'php8'},
    'python': {'python'},
    'python2': {'python', 'python2'},
    'python3': {'python', 'python3'},
    'ruby': {'ruby'},
    'sh': {'shell', 'sh'},
    'tcsh': {'shell', 'tcsh'},
    'zsh': {'shell', 'zsh'},
}
26 |
--------------------------------------------------------------------------------
/identify/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/identify/py.typed
--------------------------------------------------------------------------------
/identify/vendor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/identify/vendor/__init__.py
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | covdefaults
2 | coverage
3 | pytest
4 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = identify
3 | version = 2.6.12
4 | description = File identification library for Python
5 | long_description = file: README.md
6 | long_description_content_type = text/markdown
7 | url = https://github.com/pre-commit/identify
8 | author = Chris Kuehl
9 | author_email = ckuehl@ocf.berkeley.edu
10 | license = MIT
11 | license_files = LICENSE
12 | classifiers =
13 | Programming Language :: Python :: 3
14 | Programming Language :: Python :: 3 :: Only
15 | Programming Language :: Python :: Implementation :: CPython
16 | Programming Language :: Python :: Implementation :: PyPy
17 |
18 | [options]
19 | packages = find:
20 | python_requires = >=3.9
21 |
22 | [options.packages.find]
23 | exclude =
24 | tests*
25 | testing*
26 |
27 | [options.entry_points]
28 | console_scripts =
29 | identify-cli=identify.cli:main
30 |
31 | [options.extras_require]
32 | license =
33 | ukkonen
34 |
35 | [options.package_data]
36 | identify =
37 | py.typed
38 |
39 | [bdist_wheel]
40 | universal = True
41 |
42 | [coverage:run]
43 | plugins = covdefaults
44 |
45 | [mypy]
46 | check_untyped_defs = true
47 | disallow_any_generics = true
48 | disallow_incomplete_defs = true
49 | disallow_untyped_defs = true
50 | warn_redundant_casts = true
51 | warn_unused_ignores = true
52 |
53 | [mypy-testing.*]
54 | disallow_untyped_defs = false
55 |
56 | [mypy-tests.*]
57 | disallow_untyped_defs = false
58 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from setuptools import setup
4 | setup()
5 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/tests/__init__.py
--------------------------------------------------------------------------------
/tests/cli_test.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from identify import cli
4 |
5 |
def test_identify_cli(capsys):
    """Identifying an existing file prints its tags as JSON and returns 0."""
    exit_code = cli.main(('setup.py',))
    stdout, _stderr = capsys.readouterr()
    assert exit_code == 0
    assert stdout == '["file", "non-executable", "python", "text"]\n'
11 |
12 |
def test_identify_cli_filename_only(capsys):
    """With --filename-only just the name-derived tags are printed."""
    argv = ('setup.py', '--filename-only')
    exit_code = cli.main(argv)
    stdout, _stderr = capsys.readouterr()
    assert exit_code == 0
    assert stdout == '["python", "text"]\n'
18 |
19 |
def test_identify_cli_filename_only_unidentified(capsys):
    """An unrecognized name with --filename-only prints nothing, returns 1."""
    argv = ('x.unknown', '--filename-only')
    exit_code = cli.main(argv)
    stdout, _stderr = capsys.readouterr()
    assert exit_code == 1
    assert stdout == ''
25 |
26 |
def test_file_not_found(capsys):
    """A missing path is reported on stdout and the exit code is 1."""
    exit_code = cli.main(('x.unknown',))
    stdout, _stderr = capsys.readouterr()
    assert exit_code == 1
    assert stdout == 'x.unknown does not exist.\n'
32 |
--------------------------------------------------------------------------------
/tests/extensions_test.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | from identify import extensions
6 |
7 |
@pytest.mark.parametrize('extension', extensions.EXTENSIONS)
def test_extensions_have_binary_or_text(extension):
    """Every known extension carries exactly one of 'text' / 'binary'."""
    tags = extensions.EXTENSIONS[extension]
    encoding_tags = tags & {'text', 'binary'}
    assert len(encoding_tags) == 1, tags
12 |
13 |
@pytest.mark.parametrize('name', extensions.NAMES)
def test_names_have_binary_or_text(name):
    """Every known file name carries exactly one of 'text' / 'binary'."""
    tags = extensions.NAMES[name]
    encoding_tags = tags & {'text', 'binary'}
    assert len(encoding_tags) == 1, tags
18 |
19 |
@pytest.mark.parametrize('extension', extensions.EXTENSIONS_NEED_BINARY_CHECK)
def test_need_binary_check_do_not_specify_text_binary(extension):
    """Extensions needing a content check declare neither 'text' nor 'binary'."""
    tags = extensions.EXTENSIONS_NEED_BINARY_CHECK[extension]
    encoding_tags = tags & {'text', 'binary'}
    assert len(encoding_tags) == 0, tags
24 |
25 |
def test_mutually_exclusive_check_types():
    """No extension appears in both extension tables."""
    plain = set(extensions.EXTENSIONS)
    needs_check = set(extensions.EXTENSIONS_NEED_BINARY_CHECK)
    assert plain.isdisjoint(needs_check)
31 |
--------------------------------------------------------------------------------
/tests/identify_test.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import builtins
4 | import errno
5 | import io
6 | import os
7 | import socket
8 | import stat
9 | from tempfile import TemporaryDirectory
10 | from unittest import mock
11 |
12 | import pytest
13 |
14 | from identify import identify
15 |
16 |
def test_all_tags_includes_basic_ones():
    """A handful of fundamental tags are present in ALL_TAGS."""
    for tag in ('file', 'directory', 'executable', 'text', 'socket'):
        assert tag in identify.ALL_TAGS
23 |
24 |
@pytest.mark.parametrize(
    'tag_group',
    (identify.TYPE_TAGS, identify.MODE_TAGS, identify.ENCODING_TAGS),
)
def test_all_tags_contains_all_groups(tag_group):
    """Each tag group is a proper subset of ALL_TAGS."""
    all_tags = identify.ALL_TAGS
    assert tag_group < all_tags
35 |
36 |
def test_all_tags_contains_each_type():
    """ALL_TAGS includes a sample tag from each kind of detection."""
    samples = (
        'xml',  # extension
        'plist',  # extension, needs binary check
        'dockerfile',  # by file convention
        'python3',  # by shebang
        'php8',  # by shebang
    )
    for tag in samples:
        assert tag in identify.ALL_TAGS
43 |
44 |
def test_tags_from_path_does_not_exist(tmpdir):
    """A path that was never created is rejected with ValueError."""
    missing = tmpdir.join('foo').strpath
    with pytest.raises(ValueError):
        identify.tags_from_path(missing)
49 |
50 |
def test_tags_from_path_directory(tmpdir):
    """A directory is tagged only as 'directory'."""
    directory = tmpdir.join('foo')
    directory.mkdir()
    tags = identify.tags_from_path(directory.strpath)
    assert tags == {'directory'}
55 |
56 |
def test_tags_from_path_symlink(tmpdir):
    """A symlink to an existing file is tagged only as 'symlink'."""
    link = tmpdir.join('foo')
    target = tmpdir.join('lol').ensure()
    link.mksymlinkto(target)
    tags = identify.tags_from_path(link.strpath)
    assert tags == {'symlink'}
61 |
62 |
def test_tags_from_path_socket():
    """A bound unix socket is tagged only as 'socket'."""
    # short path avoids `OSError: AF_UNIX path too long`
    with TemporaryDirectory(dir='/tmp') as workdir:
        socket_path = os.path.join(workdir, 'socket')
        with socket.socket(socket.AF_UNIX) as listener:
            listener.bind(socket_path)
            found = identify.tags_from_path(socket_path)

        assert found == {'socket'}
72 |
73 |
def test_tags_from_path_broken_symlink(tmpdir):
    """A symlink whose target was never created is still a 'symlink'."""
    link = tmpdir.join('foo')
    dangling_target = tmpdir.join('lol')
    link.mksymlinkto(dangling_target)
    tags = identify.tags_from_path(link.strpath)
    assert tags == {'symlink'}
78 |
79 |
def test_tags_from_path_simple_file(tmpdir):
    """An empty, non-executable .py file gets file/text/python tags."""
    path = tmpdir.join('test.py').ensure().strpath
    expected = {'file', 'text', 'non-executable', 'python'}
    assert identify.tags_from_path(path) == expected
85 |
86 |
def test_tags_from_path_file_with_incomplete_shebang(tmpdir):
    """A `#!` with no interpreter after it adds no interpreter tags."""
    script = tmpdir.join('test')
    script.write_text('#! \n', encoding='UTF-8')
    make_executable(script.strpath)
    expected = {'file', 'text', 'executable'}
    assert identify.tags_from_path(script.strpath) == expected
94 |
95 |
def test_tags_from_path_file_with_shebang_non_executable(tmpdir):
    """The shebang is not used for tagging when the file is not executable."""
    script = tmpdir.join('test')
    script.write_text('#!/usr/bin/env python\nimport sys\n', encoding='UTF-8')
    expected = {'file', 'text', 'non-executable'}
    assert identify.tags_from_path(script.strpath) == expected
102 |
103 |
def test_tags_from_path_file_with_shebang_executable(tmpdir):
    """An executable file with a python shebang is tagged 'python'."""
    script = tmpdir.join('test')
    script.write_text('#!/usr/bin/env python\nimport sys\n', encoding='UTF-8')
    make_executable(script.strpath)
    expected = {'file', 'text', 'executable', 'python'}
    assert identify.tags_from_path(script.strpath) == expected
111 |
112 |
def test_tags_from_path_binary(tmpdir):
    """An executable file with ELF-like leading bytes is tagged 'binary'."""
    blob = tmpdir.join('test')
    blob.write(b'\x7f\x45\x4c\x46\x02\x01\x01')
    make_executable(blob.strpath)
    expected = {'file', 'binary', 'executable'}
    assert identify.tags_from_path(blob.strpath) == expected
120 |
121 |
def test_tags_from_path_plist_binary(tmpdir):
    """A .plist file with binary content is tagged 'plist' and 'binary'."""
    payload = (
        b'bplist00\xd1\x01\x02_\x10\x0fLast Login NameWDefault\x08\x0b\x1d\x00'
        b'\x00\x00\x00\x00\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00'
        b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00%'
    )
    plist = tmpdir.join('t.plist')
    plist.write_binary(payload)
    expected = {'file', 'plist', 'binary', 'non-executable'}
    assert identify.tags_from_path(plist.strpath) == expected
132 |
133 |
def test_tags_from_path_plist_text(tmpdir):
    """A .plist file with textual content is tagged 'plist' and 'text'."""
    x = tmpdir.join('t.plist')
    # NOTE(review): the payload below contains no XML markup, only newlines
    # and two tab-indented values -- confirm this matches the intended
    # fixture.
    x.write(
        '\n'
        '\n'  # noqa: E501
        '\n'
        '\n'
        '\tLast Login Name\n'
        '\tDefault\n'
        '\n'
        '\n',
    )
    assert identify.tags_from_path(x.strpath) == {
        'file', 'plist', 'text', 'non-executable',
    }
149 |
150 |
@pytest.mark.parametrize(
    ('filename', 'expected'),
    (
        ('.salt-lint', {'text', 'salt-lint', 'yaml'}),
        ('test.py', {'text', 'python'}),
        ('test.mk', {'text', 'makefile'}),
        ('Makefile', {'text', 'makefile'}),
        ('Containerfile', {'text', 'dockerfile'}),
        ('Dockerfile', {'text', 'dockerfile'}),
        ('Dockerfile.xenial', {'text', 'dockerfile'}),
        ('xenial.Dockerfile', {'text', 'dockerfile'}),
        ('Pipfile', {'text', 'toml'}),
        ('Pipfile.lock', {'text', 'json'}),
        ('mod/test.py', {'text', 'python'}),
        ('mod/Dockerfile', {'text', 'dockerfile'}),
        ('config.ru', {'text', 'ruby'}),
        ('Gemfile', {'text', 'ruby'}),
        ('Gemfile.lock', {'text'}),
        ('Jenkinsfile', {'text', 'groovy', 'jenkins'}),
        ('build.jenkins', {'text', 'groovy', 'jenkins'}),
        ('build.jenkinsfile', {'text', 'groovy', 'jenkins'}),
        ('meson.build', {'text', 'meson'}),
        ('meson_options.txt', {'text', 'plain-text', 'meson'}),
        ('Vagrantfile', {'text', 'ruby'}),
        ('Tiltfile', {'text', 'tiltfile'}),
        ('Tiltfile.abc', {'text', 'tiltfile'}),
        ('test.Tiltfile', {'text', 'tiltfile'}),

        # does not set binary / text
        ('f.plist', {'plist'}),

        # case of extension should be ignored
        ('f.JPG', {'binary', 'image', 'jpeg'}),
        # but case of name checks should still be honored
        ('dockerfile.py', {'text', 'python'}),

        # full filename tests should take precedence over extension tests
        ('test.cfg', {'text'}),
        ('setup.cfg', {'text', 'ini'}),

        # Filename matches should still include extensions if applicable
        ('README.md', {'text', 'markdown', 'plain-text'}),

        # unrecognized extension, no extension, and empty name: no tags
        ('test.weird-unrecognized-extension', set()),
        ('test', set()),
        ('', set()),
    ),
)
def test_tags_from_filename(filename, expected):
    """tags_from_filename returns exactly the expected tags for each name."""
    assert identify.tags_from_filename(filename) == expected
201 |
202 |
@pytest.mark.parametrize(
    ('interpreter', 'expected'),
    (
        ('python', {'python'}),
        ('python3', {'python3', 'python'}),
        # version suffixes and a leading directory do not change the result
        ('python3.5.2', {'python3', 'python'}),
        ('/usr/bin/python3.5.2', {'python3', 'python'}),
        # unknown or empty interpreters produce no tags
        ('/usr/bin/herpderpderpderpderp', set()),
        ('something-random', set()),
        ('', set()),
    ),
)
def test_tags_from_interpreter(interpreter, expected):
    """tags_from_interpreter returns the expected tags for each interpreter."""
    assert identify.tags_from_interpreter(interpreter) == expected
217 |
218 |
@pytest.mark.parametrize(
    ('data', 'expected'),
    (
        # plain ascii, empty input, and non-ascii text all count as text
        (b'hello world', True),
        (b'', True),
        ('éóñəå  ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode(), True),
        (r'¯\_(ツ)_/¯'.encode(), True),
        ('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode(), True),
        ('éóñå'.encode('latin1'), True),

        # a NUL byte makes the content binary
        (b'hello world\x00', False),
        # first few bytes of /bin/bash
        (b'\x7f\x45\x4c\x46\x02\x01\x01', False),
        # some /dev/urandom output
        (b'\x43\x92\xd9\x0f\xaf\x32\x2c', False),
    ),
)
def test_is_text(data, expected):
    """is_text classifies each byte string as text (True) or not (False)."""
    assert identify.is_text(io.BytesIO(data)) is expected
238 |
239 |
def test_file_is_text_simple(tmpdir):
    """A file holding a short ascii line is reported as text."""
    text_file = tmpdir.join('f')
    text_file.write_text('hello there\n', encoding='UTF-8')
    result = identify.file_is_text(text_file.strpath)
    assert result is True
244 |
245 |
def test_file_is_text_does_not_exist(tmpdir):
    """file_is_text raises ValueError for a path that was never created."""
    missing = tmpdir.join('f').strpath
    with pytest.raises(ValueError):
        identify.file_is_text(missing)
250 |
251 |
@pytest.mark.parametrize(
    ('s', 'expected'),
    (
        (b'', ()),
        (b'#!/usr/bin/python', ('/usr/bin/python',)),
        (b'#!/usr/bin/env python', ('python',)),
        (b'#! /usr/bin/python', ('/usr/bin/python',)),
        (b'#!/usr/bin/foo python', ('/usr/bin/foo', 'python')),
        # despite this being invalid, setuptools will write shebangs like this
        (b'#!"/path/with spaces/x" y', ('/path/with spaces/x', 'y')),
        # this is apparently completely ok to embed quotes
        (b"#!/path'with/quotes y", ("/path'with/quotes", 'y')),
        # Don't regress on leading/trailing ws
        (b"#! /path'with/quotes y ", ("/path'with/quotes", 'y')),
        # Test nix-shell specialities with shebang on second line
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -i bash -p python',
            ('bash',),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -i python -p coreutils',
            ('python',),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -p coreutils -i python',
            ('python',),
        ),
        # multi-line and no whitespace variation
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -p coreutils\n'
            b'#! nix-shell -i python',
            ('python',),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#!nix-shell -p coreutils\n'
            b'#!nix-shell -i python',
            ('python',),
        ),
        # undecodable or non-printable follow-up lines leave `nix-shell` as is
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#!\xf9\x93\x01\x42\xcd',
            ('nix-shell',),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#!\x00\x00\x00\x00',
            ('nix-shell',),
        ),
        # non-proper nix-shell
        (b'#! /usr/bin/nix-shell', ('/usr/bin/nix-shell',)),
        (b'#! /usr/bin/env nix-shell', ('nix-shell',)),
        (
            b'#! /usr/bin/env nix-shell non-portable-argument',
            ('nix-shell', 'non-portable-argument'),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -i',
            ('nix-shell',),  # guard against index error
        ),
        # interpret quotes correctly
        (
            b'#!/usr/bin/env nix-shell\n'
            b'#!nix-shell --argstr x "a -i python3 p"\n'
            b'#!nix-shell -p hello\n'
            b'#!nix-shell -i bash\n'
            b'#!nix-shell --argstr y "b -i runhaskell q"',
            ('bash',),
        ),
        # missing, undecodable or non-printable shebangs parse to nothing
        (b'\xf9\x93\x01\x42\xcd', ()),
        (b'#!\xf9\x93\x01\x42\xcd', ()),
        (b'#!\x00\x00\x00\x00', ()),
        # shebang lines with multiple arguments
        (b'#!/usr/bin/env -S python -u', ('python', '-u')),
        (b'#!/usr/bin/env', ()),
        (b'#!/usr/bin/env -S', ()),
    ),
)
def test_parse_shebang(s, expected):
    """parse_shebang returns the expected command tuple for each input."""
    assert identify.parse_shebang(io.BytesIO(s)) == expected
337 |
338 |
def test_parse_shebang_from_file_does_not_exist():
    """parse_shebang_from_file raises ValueError for a missing path."""
    missing = 'herp derp derp'
    with pytest.raises(ValueError):
        identify.parse_shebang_from_file(missing)
342 |
343 |
def test_parse_shebang_from_file_nonexecutable(tmpdir):
    """The shebang of a file that is not executable is reported as empty."""
    script = tmpdir.join('f')
    script.write_text('#!/usr/bin/env python', encoding='UTF-8')
    parsed = identify.parse_shebang_from_file(script.strpath)
    assert parsed == ()
348 |
349 |
def test_parse_shebang_from_file_simple(tmpdir):
    """An executable file with an env shebang yields the interpreter name."""
    script = tmpdir.join('f')
    script.write_text('#!/usr/bin/env python', encoding='UTF-8')
    make_executable(script.strpath)
    parsed = identify.parse_shebang_from_file(script.strpath)
    assert parsed == ('python',)
355 |
356 |
def test_parse_shebang_open_raises_einval(tmpdir):
    """An EINVAL failure from open() is treated as "no shebang".

    `builtins.open` is patched to raise OSError(EINVAL), so the shebang
    written to the file is never read and the result must be `()`.
    """
    x = tmpdir.join('f')
    x.write('#!/usr/bin/env not-expected\n')
    # pass the string path, as every other caller of make_executable does
    make_executable(x.strpath)
    error = OSError(errno.EINVAL, f'Invalid argument {x}')
    with mock.patch.object(builtins, 'open', side_effect=error):
        assert identify.parse_shebang_from_file(x.strpath) == ()
364 |
365 |
def make_executable(filename):
    """Add the execute bit for user, group and other to `filename`."""
    exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    current_mode = os.stat(filename).st_mode
    os.chmod(filename, current_mode | exec_bits)
372 |
373 |
def test_license_identification():
    """The repository's own LICENSE file is identified as MIT."""
    detected = identify.license_id('LICENSE')
    assert detected == 'MIT'
376 |
377 |
def test_license_exact_identification(tmpdir):
    """The WTFPL text written to a temporary file is identified as 'WTFPL'."""
    wtfpl = '''\
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2004 Sam Hocevar

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
'''
    f = tmpdir.join('LICENSE')
    f.write(wtfpl)
    assert identify.license_id(f.strpath) == 'WTFPL'
397 |
398 |
def test_license_not_identified():
    """No license is reported for the null device."""
    detected = identify.license_id(os.devnull)
    assert detected is None
401 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py,pre-commit
3 |
4 | [testenv]
5 | deps = -rrequirements-dev.txt
6 | extras = license
7 | commands =
8 | coverage erase
9 | coverage run -m pytest {posargs:tests}
10 | coverage report
11 |
12 | [testenv:pre-commit]
13 | skip_install = true
14 | deps = pre-commit
15 | commands = pre-commit run --all-files --show-diff-on-failure
16 |
17 | [pep8]
18 | ignore = E265,E501,W504
19 |
--------------------------------------------------------------------------------