├── .activate.sh
├── .deactivate.sh
├── .github
    └── workflows
    │   └── main.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── bin
    └── vendor-licenses
├── identify
    ├── __init__.py
    ├── cli.py
    ├── extensions.py
    ├── identify.py
    ├── interpreters.py
    ├── py.typed
    └── vendor
    │   ├── __init__.py
    │   └── licenses.py
├── requirements-dev.txt
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── cli_test.py
    ├── extensions_test.py
    └── identify_test.py
└── tox.ini


/.activate.sh:
--------------------------------------------------------------------------------
1 | venv/bin/activate


--------------------------------------------------------------------------------
/.deactivate.sh:
--------------------------------------------------------------------------------
1 | deactivate
2 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | name: main
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [main, test-me-*]
 6 |     tags: '*'
 7 |   pull_request:
 8 | 
 9 | jobs:
10 |   main:
11 |     uses: asottile/workflows/.github/workflows/tox.yml@v1.8.1
12 |     with:
13 |       env: '["py39", "py310", "py311", "py312"]'
14 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info
2 | *.py[co]
3 | /.coverage
4 | /.tox
5 | /dist
6 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v5.0.0
 4 |     hooks:
 5 |     -   id: trailing-whitespace
 6 |     -   id: end-of-file-fixer
 7 |     -   id: check-yaml
 8 |     -   id: debug-statements
 9 |     -   id: double-quote-string-fixer
10 |     -   id: name-tests-test
11 |     -   id: requirements-txt-fixer
12 | -   repo: https://github.com/asottile/setup-cfg-fmt
13 |     rev: v2.8.0
14 |     hooks:
15 |     -   id: setup-cfg-fmt
16 | -   repo: https://github.com/asottile/reorder-python-imports
17 |     rev: v3.15.0
18 |     hooks:
19 |     -   id: reorder-python-imports
20 |         args: [--py39-plus, --add-import, 'from __future__ import annotations']
21 | -   repo: https://github.com/asottile/add-trailing-comma
22 |     rev: v3.2.0
23 |     hooks:
24 |     -   id: add-trailing-comma
25 | -   repo: https://github.com/asottile/pyupgrade
26 |     rev: v3.20.0
27 |     hooks:
28 |     -   id: pyupgrade
29 |         args: [--py39-plus]
30 | -   repo: https://github.com/hhatto/autopep8
31 |     rev: v2.3.2
32 |     hooks:
33 |     -   id: autopep8
34 | -   repo: https://github.com/PyCQA/flake8
35 |     rev: 7.2.0
36 |     hooks:
37 |     -   id: flake8
38 |         exclude: ^identify/vendor/licenses\.py$
39 | -   repo: https://github.com/pre-commit/mirrors-mypy
40 |     rev: v1.16.0
41 |     hooks:
42 |     -   id: mypy
43 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017 Chris Kuehl, Anthony Sottile
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![build status](https://github.com/pre-commit/identify/actions/workflows/main.yml/badge.svg)](https://github.com/pre-commit/identify/actions/workflows/main.yml)
  2 | [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/pre-commit/identify/main.svg)](https://results.pre-commit.ci/latest/github/pre-commit/identify/main)
  3 | 
  4 | identify
  5 | ========
  6 | 
  7 | File identification library for Python.
  8 | 
  9 | Given a file (or some information about a file), return a set of standardized
 10 | tags identifying what the file is.
 11 | 
 12 | ## Installation
 13 | 
 14 | ```bash
 15 | pip install identify
 16 | ```
 17 | 
 18 | ## Usage
 19 | ### With a file on disk
 20 | 
 21 | If you have an actual file on disk, you can get the most information possible
 22 | (a superset of all other methods):
 23 | 
 24 | ```python
 25 | >>> from identify import identify
 26 | >>> identify.tags_from_path('/path/to/file.py')
 27 | {'file', 'text', 'python', 'non-executable'}
 28 | >>> identify.tags_from_path('/path/to/file-with-shebang')
 29 | {'file', 'text', 'shell', 'bash', 'executable'}
 30 | >>> identify.tags_from_path('/bin/bash')
 31 | {'file', 'binary', 'executable'}
 32 | >>> identify.tags_from_path('/path/to/directory')
 33 | {'directory'}
 34 | >>> identify.tags_from_path('/path/to/symlink')
 35 | {'symlink'}
 36 | ```
 37 | 
 38 | When using a file on disk, the checks performed are:
 39 | 
 40 | * File type (file, symlink, directory, socket)
 41 | * Mode (is it executable?)
 42 | * File name (mostly based on extension)
 43 | * If executable, the shebang is read and the interpreter interpreted
 44 | 
 45 | 
 46 | ### If you only have the filename
 47 | 
 48 | ```python
 49 | >>> identify.tags_from_filename('file.py')
 50 | {'text', 'python'}
 51 | ```
 52 | 
 53 | 
 54 | ### If you only have the interpreter
 55 | 
 56 | ```python
 57 | >>> identify.tags_from_interpreter('python3.5')
 58 | {'python', 'python3'}
 59 | >>> identify.tags_from_interpreter('bash')
 60 | {'shell', 'bash'}
 61 | >>> identify.tags_from_interpreter('some-unrecognized-thing')
 62 | set()
 63 | ```
 64 | 
 65 | ### As a cli
 66 | 
 67 | ```
 68 | $ identify-cli --help
 69 | usage: identify-cli [-h] [--filename-only] path
 70 | 
 71 | positional arguments:
 72 |   path
 73 | 
 74 | optional arguments:
 75 |   -h, --help       show this help message and exit
 76 |   --filename-only
 77 | ```
 78 | 
 79 | ```console
 80 | $ identify-cli setup.py; echo $?
 81 | ["file", "non-executable", "python", "text"]
 82 | 0
 83 | $ identify-cli setup.py --filename-only; echo $?
 84 | ["python", "text"]
 85 | 0
 86 | $ identify-cli wat.wat; echo $?
 87 | wat.wat does not exist.
 88 | 1
 89 | $ identify-cli wat.wat --filename-only; echo $?
 90 | 1
 91 | ```
 92 | 
 93 | ### Identifying LICENSE files
 94 | 
 95 | `identify` also has an api for determining what type of license is contained
 96 | in a file.  This routine is roughly based on the approaches used by
 97 | [licensee] (the ruby gem that github uses to figure out the license for a
 98 | repo).
 99 | 
100 | The approach that `identify` uses is as follows:
101 | 
102 | 1. Strip the copyright line
103 | 2. Normalize all whitespace
104 | 3. Return any exact matches
105 | 4. Return the closest by edit distance (where edit distance < 5%)
106 | 
107 | To use the api, install via `pip install identify[license]`
108 | 
109 | ```pycon
110 | >>> from identify import identify
111 | >>> identify.license_id('LICENSE')
112 | 'MIT'
113 | ```
114 | 
115 | The return value of the `license_id` function is an [SPDX] id.  Currently
116 | licenses are sourced from [choosealicense.com].
117 | 
118 | [licensee]: https://github.com/benbalter/licensee
119 | [SPDX]: https://spdx.org/licenses/
120 | [choosealicense.com]: https://github.com/github/choosealicense.com
121 | 
122 | ## How it works
123 | 
124 | A call to `tags_from_path` does this:
125 | 
126 | 1. What is the type: file, symlink, directory? If it's not a file, stop here.
127 | 2. Is it executable? Add the appropriate tag.
128 | 3. Do we recognize the file extension? If so, add the appropriate tags, stop
129 |    here. These tags would include binary/text.
130 | 4. Peek at the first X bytes of the file. Use these to determine whether it is
131 |    binary or text, add the appropriate tag.
132 | 5. If the file is executable, try to read and interpret the shebang, and add
133 |    appropriate tags.
134 | 
135 | By design, this means we don't need to partially read files where we recognize
136 | the file extension.
137 | 


--------------------------------------------------------------------------------
/bin/vendor-licenses:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Usage:
 3 | 
 4 |     ./bin/vendor-licenses > identify/vendor/licenses.py
 5 | """
 6 | from __future__ import annotations
 7 | 
 8 | import argparse
 9 | import os.path
10 | import subprocess
11 | import tempfile
12 | 
13 | 
def main() -> int:
    """Regenerate the vendored license table and print it to stdout.

    Clones choosealicense.com into a temporary directory, checks out only
    the ``_licenses`` directory at ``--revision`` (default ``HEAD``) and
    prints python source defining ``LICENSES``: a tuple of
    ``(spdx_id, license_text)`` pairs sorted by SPDX id.

    Returns:
        0 on success.

    Raises:
        subprocess.CalledProcessError: if either ``git`` command fails.
        ValueError: if a license file does not contain exactly one
            ``spdx-id:`` line or lacks the two ``---`` separators.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--revision', default='HEAD')
    args = parser.parse_args()

    licenses = []

    with tempfile.TemporaryDirectory() as tmpdir:
        # --no-checkout keeps the clone cheap; only `_licenses` is
        # materialized by the checkout below.
        subprocess.check_call((
            'git', 'clone', '--no-checkout', '--quiet',
            'https://github.com/github/choosealicense.com', tmpdir,
        ))
        subprocess.check_call((
            'git', '-C', tmpdir, 'checkout', args.revision, '--', '_licenses',
        ))

        licenses_dir = os.path.join(tmpdir, '_licenses')
        for filename in os.listdir(licenses_dir):
            filename = os.path.join(licenses_dir, filename)

            # Decode explicitly as UTF-8 so the generated module does not
            # depend on the locale's preferred encoding.
            with open(filename, encoding='UTF-8') as f:
                contents = f.read()

            # The text before the first `---` line is discarded, the part
            # between the first and second holds the metadata lines, and the
            # remainder is the license text.
            _, data, license_text = contents.split('---\n', 2)

            # Unpacking into a 1-tuple asserts exactly one `spdx-id:` line.
            spdx, = (
                line[len('spdx-id:'):].strip()
                for line in data.splitlines()
                if line.startswith('spdx-id:')
            )

            licenses.append((spdx, license_text))

        print('LICENSES = (')
        for spdx, text in sorted(licenses):
            print('    (')
            print(f'        {spdx!r},')
            print("        '''\\")
            print(text.replace('\t', '    ').replace(' \n', '').strip())
            print("''',")
            print('    ),')
        print(')')
    return 0
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     raise SystemExit(main())
60 | 


--------------------------------------------------------------------------------
/identify/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/identify/__init__.py


--------------------------------------------------------------------------------
/identify/cli.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import argparse
 4 | import json
 5 | from collections.abc import Sequence
 6 | 
 7 | from identify import identify
 8 | 
 9 | 
def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point: print the tags for ``path`` as JSON.

    With ``--filename-only`` the tags come from
    ``identify.tags_from_filename``; otherwise they come from
    ``identify.tags_from_path``.

    Returns:
        0 after printing a sorted JSON list of tags; 1 when the lookup
        raises ``ValueError`` (the message is printed) or yields no tags.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename-only', action='store_true')
    parser.add_argument('path')
    opts = parser.parse_args(argv)

    lookup = (
        identify.tags_from_filename
        if opts.filename_only
        else identify.tags_from_path
    )

    try:
        found = lookup(opts.path)
    except ValueError as exc:
        print(exc)
        return 1

    if found:
        print(json.dumps(sorted(found)))
        return 0
    return 1
32 | 
33 | 
34 | if __name__ == '__main__':
35 |     raise SystemExit(main())
36 | 


--------------------------------------------------------------------------------
/identify/extensions.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | EXTENSIONS = {
  3 |     'adoc': {'text', 'asciidoc'},
  4 |     'ai': {'binary', 'adobe-illustrator'},
  5 |     'aj': {'text', 'aspectj'},
  6 |     'asciidoc': {'text', 'asciidoc'},
  7 |     'apinotes': {'text', 'apinotes'},
  8 |     'asar': {'binary', 'asar'},
  9 |     'asm': {'text', 'asm'},
 10 |     'astro': {'text', 'astro'},
 11 |     'avif': {'binary', 'image', 'avif'},
 12 |     'avsc': {'text', 'avro-schema'},
 13 |     'bash': {'text', 'shell', 'bash'},
 14 |     'bat': {'text', 'batch'},
 15 |     'bats': {'text', 'shell', 'bash', 'bats'},
 16 |     'bazel': {'text', 'bazel'},
 17 |     'bb': {'text', 'bitbake'},
 18 |     'bbappend': {'text', 'bitbake'},
 19 |     'bbclass': {'text', 'bitbake'},
 20 |     'beancount': {'text', 'beancount'},
 21 |     'bib': {'text', 'bib'},
 22 |     'bmp': {'binary', 'image', 'bitmap'},
 23 |     'bz2': {'binary', 'bzip2'},
 24 |     'bz3': {'binary', 'bzip3'},
 25 |     'bzl': {'text', 'bazel'},
 26 |     'c': {'text', 'c'},
 27 |     'c++': {'text', 'c++'},
 28 |     'c++m': {'text', 'c++'},
 29 |     'cc': {'text', 'c++'},
 30 |     'ccm': {'text', 'c++'},
 31 |     'cfg': {'text'},
 32 |     'chs': {'text', 'c2hs'},
 33 |     'cjs': {'text', 'javascript'},
 34 |     'clj': {'text', 'clojure'},
 35 |     'cljc': {'text', 'clojure'},
 36 |     'cljs': {'text', 'clojure', 'clojurescript'},
 37 |     'cmake': {'text', 'cmake'},
 38 |     'cnf': {'text'},
 39 |     'coffee': {'text', 'coffee'},
 40 |     'conf': {'text'},
 41 |     'cpp': {'text', 'c++'},
 42 |     'cppm': {'text', 'c++'},
 43 |     'cr': {'text', 'crystal'},
 44 |     'crt': {'text', 'pem'},
 45 |     'cs': {'text', 'c#'},
 46 |     'csproj': {'text', 'xml', 'csproj', 'msbuild'},
 47 |     'csh': {'text', 'shell', 'csh'},
 48 |     'cson': {'text', 'cson'},
 49 |     'css': {'text', 'css'},
 50 |     'csv': {'text', 'csv'},
 51 |     'csx': {'text', 'c#', 'c#script'},
 52 |     'cu': {'text', 'cuda'},
 53 |     'cue': {'text', 'cue'},
 54 |     'cuh': {'text', 'cuda'},
 55 |     'cxx': {'text', 'c++'},
 56 |     'cxxm': {'text', 'c++'},
 57 |     'cylc': {'text', 'cylc'},
 58 |     'dart': {'text', 'dart'},
 59 |     'dbc': {'text', 'dbc'},
 60 |     'def': {'text', 'def'},
 61 |     'dll': {'binary'},
 62 |     'dtd': {'text', 'dtd'},
 63 |     'ear': {'binary', 'zip', 'jar'},
 64 |     'edn': {'text', 'clojure', 'edn'},
 65 |     'ejs': {'text', 'ejs'},
 66 |     'ejson': {'text', 'json', 'ejson'},
 67 |     'elm': {'text', 'elm'},
 68 |     'env': {'text', 'dotenv'},
 69 |     'eot': {'binary', 'eot'},
 70 |     'eps': {'binary', 'eps'},
 71 |     'erb': {'text', 'erb'},
 72 |     'erl': {'text', 'erlang'},
 73 |     'ex': {'text', 'elixir'},
 74 |     'exe': {'binary'},
 75 |     'exs': {'text', 'elixir'},
 76 |     'eyaml': {'text', 'yaml'},
 77 |     'f03': {'text', 'fortran'},
 78 |     'f08': {'text', 'fortran'},
 79 |     'f90': {'text', 'fortran'},
 80 |     'f95': {'text', 'fortran'},
 81 |     'feature': {'text', 'gherkin'},
 82 |     'fish': {'text', 'fish'},
 83 |     'fits': {'binary', 'fits'},
 84 |     'fs': {'text', 'f#'},
 85 |     'fsproj': {'text', 'xml', 'fsproj', 'msbuild'},
 86 |     'fsx': {'text', 'f#', 'f#script'},
 87 |     'gd': {'text', 'gdscript'},
 88 |     'gemspec': {'text', 'ruby'},
 89 |     'geojson': {'text', 'geojson', 'json'},
 90 |     'ggb': {'binary', 'zip', 'ggb'},
 91 |     'gif': {'binary', 'image', 'gif'},
 92 |     'gleam': {'text', 'gleam'},
 93 |     'go': {'text', 'go'},
 94 |     'gotmpl': {'text', 'gotmpl'},
 95 |     'gpx': {'text', 'gpx', 'xml'},
 96 |     'graphql': {'text', 'graphql'},
 97 |     'gradle': {'text', 'groovy'},
 98 |     'groovy': {'text', 'groovy'},
 99 |     'gyb': {'text', 'gyb'},
100 |     'gyp': {'text', 'gyp', 'python'},
101 |     'gypi': {'text', 'gyp', 'python'},
102 |     'gz': {'binary', 'gzip'},
103 |     'h': {'text', 'header', 'c', 'c++'},
104 |     'hbs': {'text', 'handlebars'},
105 |     'hcl': {'text', 'hcl'},
106 |     'hh': {'text', 'header', 'c++'},
107 |     'hpp': {'text', 'header', 'c++'},
108 |     'hrl': {'text', 'erlang'},
109 |     'hs': {'text', 'haskell'},
110 |     'htm': {'text', 'html'},
111 |     'html': {'text', 'html'},
112 |     'hxx': {'text', 'header', 'c++'},
113 |     'icns': {'binary', 'icns'},
114 |     'ico': {'binary', 'icon'},
115 |     'ics': {'text', 'icalendar'},
116 |     'idl': {'text', 'idl'},
117 |     'idr': {'text', 'idris'},
118 |     'inc': {'text', 'inc'},
119 |     'ini': {'text', 'ini'},
120 |     'inl': {'text', 'inl', 'c++'},
121 |     'ino': {'text', 'ino', 'c++'},
122 |     'inx': {'text', 'xml', 'inx'},
123 |     'ipynb': {'text', 'jupyter', 'json'},
124 |     'ixx': {'text', 'c++'},
125 |     'j2': {'text', 'jinja'},
126 |     'jade': {'text', 'jade'},
127 |     'jar': {'binary', 'zip', 'jar'},
128 |     'java': {'text', 'java'},
129 |     'jenkins': {'text', 'groovy', 'jenkins'},
130 |     'jenkinsfile': {'text', 'groovy', 'jenkins'},
131 |     'jinja': {'text', 'jinja'},
132 |     'jinja2': {'text', 'jinja'},
133 |     'jl': {'text', 'julia'},
134 |     'jpeg': {'binary', 'image', 'jpeg'},
135 |     'jpg': {'binary', 'image', 'jpeg'},
136 |     'js': {'text', 'javascript'},
137 |     'json': {'text', 'json'},
138 |     'jsonld': {'text', 'json', 'jsonld'},
139 |     'jsonnet': {'text', 'jsonnet'},
140 |     'json5': {'text', 'json5'},
141 |     'jsx': {'text', 'jsx'},
142 |     'key': {'text', 'pem'},
143 |     'kml': {'text', 'kml', 'xml'},
144 |     'kt': {'text', 'kotlin'},
145 |     'kts': {'text', 'kotlin'},
146 |     'lean': {'text', 'lean'},
147 |     'lektorproject': {'text', 'ini', 'lektorproject'},
148 |     'less': {'text', 'less'},
149 |     'lfm': {'text', 'lazarus', 'lazarus-form'},
150 |     'lhs': {'text', 'literate-haskell'},
151 |     'libsonnet': {'text', 'jsonnet'},
152 |     'lidr': {'text', 'idris'},
153 |     'liquid': {'text', 'liquid'},
154 |     'lpi': {'text', 'lazarus', 'xml'},
155 |     'lpr': {'text', 'lazarus', 'pascal'},
156 |     'lr': {'text', 'lektor'},
157 |     'lua': {'text', 'lua'},
158 |     'm': {'text', 'objective-c'},
159 |     'm4': {'text', 'm4'},
160 |     'magik': {'text', 'magik'},
161 |     'make': {'text', 'makefile'},
162 |     'manifest': {'text', 'manifest'},
163 |     'map': {'text', 'map'},
164 |     'markdown': {'text', 'markdown'},
165 |     'md': {'text', 'markdown'},
166 |     'mdx': {'text', 'mdx'},
167 |     'meson': {'text', 'meson'},
168 |     'metal': {'text', 'metal'},
169 |     'mib': {'text', 'mib'},
170 |     'mjs': {'text', 'javascript'},
171 |     'mk': {'text', 'makefile'},
172 |     'ml': {'text', 'ocaml'},
173 |     'mli': {'text', 'ocaml'},
174 |     'mm': {'text', 'c++', 'objective-c++'},
175 |     'modulemap': {'text', 'modulemap'},
176 |     'mscx': {'text', 'xml', 'musescore'},
177 |     'mscz': {'binary', 'zip', 'musescore'},
178 |     'mustache': {'text', 'mustache'},
179 |     'myst': {'text', 'myst'},
180 |     'ngdoc': {'text', 'ngdoc'},
181 |     'nim': {'text', 'nim'},
182 |     'nims': {'text', 'nim'},
183 |     'nimble': {'text', 'nimble'},
184 |     'nix': {'text', 'nix'},
185 |     'njk': {'text', 'nunjucks'},
186 |     'otf': {'binary', 'otf'},
187 |     'p12': {'binary', 'p12'},
188 |     'pas': {'text', 'pascal'},
189 |     'patch': {'text', 'diff'},
190 |     'pdf': {'binary', 'pdf'},
191 |     'pem': {'text', 'pem'},
192 |     'php': {'text', 'php'},
193 |     'php4': {'text', 'php'},
194 |     'php5': {'text', 'php'},
195 |     'phtml': {'text', 'php'},
196 |     'pl': {'text', 'perl'},
197 |     'plantuml': {'text', 'plantuml'},
198 |     'pm': {'text', 'perl'},
199 |     'png': {'binary', 'image', 'png'},
200 |     'po': {'text', 'pofile'},
201 |     'pom': {'pom', 'text', 'xml'},
202 |     'pp': {'text', 'puppet'},
203 |     'prisma': {'text', 'prisma'},
204 |     'properties': {'text', 'java-properties'},
205 |     'props': {'text', 'xml', 'msbuild'},
206 |     'proto': {'text', 'proto'},
207 |     'ps1': {'text', 'powershell'},
208 |     'psd1': {'text', 'powershell'},
209 |     'psm1': {'text', 'powershell'},
210 |     'pug': {'text', 'pug'},
211 |     'puml': {'text', 'plantuml'},
212 |     'purs': {'text', 'purescript'},
213 |     'pxd': {'text', 'cython'},
214 |     'pxi': {'text', 'cython'},
215 |     'py': {'text', 'python'},
216 |     'pyi': {'text', 'pyi'},
217 |     'pyproj': {'text', 'xml', 'pyproj', 'msbuild'},
218 |     'pyt': {'text', 'python'},
219 |     'pyx': {'text', 'cython'},
220 |     'pyz': {'binary', 'pyz'},
221 |     'pyzw': {'binary', 'pyz'},
222 |     'qml': {'text', 'qml'},
223 |     'r': {'text', 'r'},
224 |     'rake': {'text', 'ruby'},
225 |     'rb': {'text', 'ruby'},
226 |     'resx': {'text', 'resx', 'xml'},
227 |     'rng': {'text', 'xml', 'relax-ng'},
228 |     'rs': {'text', 'rust'},
229 |     'rst': {'text', 'rst'},
230 |     's': {'text', 'asm'},
231 |     'sas': {'text', 'sas'},
232 |     'sass': {'text', 'sass'},
233 |     'sbt': {'text', 'sbt', 'scala'},
234 |     'sc': {'text', 'scala'},
235 |     'scala': {'text', 'scala'},
236 |     'scm': {'text', 'scheme'},
237 |     'scss': {'text', 'scss'},
238 |     'sh': {'text', 'shell'},
239 |     'sln': {'text', 'sln'},
240 |     'sls': {'text', 'salt'},
241 |     'so': {'binary'},
242 |     'sol': {'text', 'solidity'},
243 |     'spec': {'text', 'spec'},
244 |     'sql': {'text', 'sql'},
245 |     'ss': {'text', 'scheme'},
246 |     'sty': {'text', 'tex'},
247 |     'styl': {'text', 'stylus'},
248 |     'sv': {'text', 'system-verilog'},
249 |     'svelte': {'text', 'svelte'},
250 |     'svg': {'text', 'image', 'svg', 'xml'},
251 |     'svh': {'text', 'system-verilog'},
252 |     'swf': {'binary', 'swf'},
253 |     'swift': {'text', 'swift'},
254 |     'swiftdeps': {'text', 'swiftdeps'},
255 |     'tac': {'text', 'twisted', 'python'},
256 |     'tar': {'binary', 'tar'},
257 |     'targets': {'text', 'xml', 'msbuild'},
258 |     'templ': {'text', 'templ'},
259 |     'tex': {'text', 'tex'},
260 |     'textproto': {'text', 'textproto'},
261 |     'tf': {'text', 'terraform'},
262 |     'tfvars': {'text', 'terraform'},
263 |     'tgz': {'binary', 'gzip'},
264 |     'thrift': {'text', 'thrift'},
265 |     'tiff': {'binary', 'image', 'tiff'},
266 |     'toml': {'text', 'toml'},
267 |     'ts': {'text', 'ts'},
268 |     'tsv': {'text', 'tsv'},
269 |     'tsx': {'text', 'tsx'},
270 |     'ttf': {'binary', 'ttf'},
271 |     'twig': {'text', 'twig'},
272 |     'txsprofile': {'text', 'ini', 'txsprofile'},
273 |     'txt': {'text', 'plain-text'},
274 |     'txtpb': {'text', 'textproto'},
275 |     'urdf': {'text', 'xml', 'urdf'},
276 |     'v': {'text', 'verilog'},
277 |     'vb': {'text', 'vb'},
278 |     'vbproj': {'text', 'xml', 'vbproj', 'msbuild'},
279 |     'vcxproj': {'text', 'xml', 'vcxproj', 'msbuild'},
280 |     'vdx': {'text', 'vdx'},
281 |     'vh': {'text', 'verilog'},
282 |     'vhd': {'text', 'vhdl'},
283 |     'vim': {'text', 'vim'},
284 |     'vtl': {'text', 'vtl'},
285 |     'vue': {'text', 'vue'},
286 |     'war': {'binary', 'zip', 'jar'},
287 |     'wav': {'binary', 'audio', 'wav'},
288 |     'webp': {'binary', 'image', 'webp'},
289 |     'whl': {'binary', 'wheel', 'zip'},
290 |     'wkt': {'text', 'wkt'},
291 |     'woff': {'binary', 'woff'},
292 |     'woff2': {'binary', 'woff2'},
293 |     'wsdl': {'text', 'xml', 'wsdl'},
294 |     'wsgi': {'text', 'wsgi', 'python'},
295 |     'xhtml': {'text', 'xml', 'html', 'xhtml'},
296 |     'xacro': {'text', 'xml', 'urdf', 'xacro'},
297 |     'xctestplan': {'text', 'json'},
298 |     'xml': {'text', 'xml'},
299 |     'xq': {'text', 'xquery'},
300 |     'xql': {'text', 'xquery'},
301 |     'xqm': {'text', 'xquery'},
302 |     'xqu': {'text', 'xquery'},
303 |     'xquery': {'text', 'xquery'},
304 |     'xqy': {'text', 'xquery'},
305 |     'xsd': {'text', 'xml', 'xsd'},
306 |     'xsl': {'text', 'xml', 'xsl'},
307 |     'xslt': {'text', 'xml', 'xsl'},
308 |     'yaml': {'text', 'yaml'},
309 |     'yamlld': {'text', 'yaml', 'yamlld'},
310 |     'yang': {'text', 'yang'},
311 |     'yin': {'text', 'xml', 'yin'},
312 |     'yml': {'text', 'yaml'},
313 |     'zcml': {'text', 'xml', 'zcml'},
314 |     'zig': {'text', 'zig'},
315 |     'zip': {'binary', 'zip'},
316 |     'zpt': {'text', 'zpt'},
317 |     'zsh': {'text', 'shell', 'zsh'},
318 | }
319 | EXTENSIONS_NEED_BINARY_CHECK = {
320 |     'plist': {'plist'},
321 |     'ppm': {'image', 'ppm'},
322 | }
323 | 
324 | NAMES = {
325 |     '.ansible-lint': EXTENSIONS['yaml'],
326 |     '.babelrc': EXTENSIONS['json'] | {'babelrc'},
327 |     '.bash_aliases': EXTENSIONS['bash'],
328 |     '.bash_profile': EXTENSIONS['bash'],
329 |     '.bashrc': EXTENSIONS['bash'],
330 |     '.bazelrc': {'text', 'bazelrc'},
331 |     '.bowerrc': EXTENSIONS['json'] | {'bowerrc'},
332 |     '.browserslistrc': {'text', 'browserslistrc'},
333 |     '.clang-format': EXTENSIONS['yaml'],
334 |     '.clang-tidy': EXTENSIONS['yaml'],
335 |     '.codespellrc': EXTENSIONS['ini'] | {'codespellrc'},
336 |     '.coveragerc': EXTENSIONS['ini'] | {'coveragerc'},
337 |     '.cshrc': EXTENSIONS['csh'],
338 |     '.csslintrc': EXTENSIONS['json'] | {'csslintrc'},
339 |     '.dockerignore': {'text', 'dockerignore'},
340 |     '.editorconfig': {'text', 'editorconfig'},
341 |     '.envrc': EXTENSIONS['bash'],
342 |     '.flake8': EXTENSIONS['ini'] | {'flake8'},
343 |     '.gitattributes': {'text', 'gitattributes'},
344 |     '.gitconfig': EXTENSIONS['ini'] | {'gitconfig'},
345 |     '.gitignore': {'text', 'gitignore'},
346 |     '.gitlint': EXTENSIONS['ini'] | {'gitlint'},
347 |     '.gitmodules': {'text', 'gitmodules'},
348 |     '.hgrc': EXTENSIONS['ini'] | {'hgrc'},
349 |     '.isort.cfg': EXTENSIONS['ini'] | {'isort'},
350 |     '.jshintrc': EXTENSIONS['json'] | {'jshintrc'},
351 |     '.mailmap': {'text', 'mailmap'},
352 |     '.mention-bot': EXTENSIONS['json'] | {'mention-bot'},
353 |     '.npmignore': {'text', 'npmignore'},
354 |     '.pdbrc': EXTENSIONS['py'] | {'pdbrc'},
355 |     '.prettierignore': {'text', 'gitignore', 'prettierignore'},
356 |     '.pypirc': EXTENSIONS['ini'] | {'pypirc'},
357 |     '.rstcheck.cfg': EXTENSIONS['ini'],
358 |     '.salt-lint': EXTENSIONS['yaml'] | {'salt-lint'},
359 |     '.sqlfluff': EXTENSIONS['ini'],
360 |     '.yamllint': EXTENSIONS['yaml'] | {'yamllint'},
361 |     '.zlogin': EXTENSIONS['zsh'],
362 |     '.zlogout': EXTENSIONS['zsh'],
363 |     '.zprofile': EXTENSIONS['zsh'],
364 |     '.zshrc': EXTENSIONS['zsh'],
365 |     '.zshenv': EXTENSIONS['zsh'],
366 |     'AUTHORS': EXTENSIONS['txt'],
367 |     'bblayers.conf': EXTENSIONS['bb'],
368 |     'bitbake.conf': EXTENSIONS['bb'],
369 |     'BUILD': EXTENSIONS['bzl'],
370 |     'Cargo.toml': EXTENSIONS['toml'] | {'cargo'},
371 |     'Cargo.lock': EXTENSIONS['toml'] | {'cargo-lock'},
372 |     'CMakeLists.txt': EXTENSIONS['cmake'],
373 |     'CHANGELOG': EXTENSIONS['txt'],
374 |     'config.ru': EXTENSIONS['rb'],
375 |     'Containerfile': {'text', 'dockerfile'},
376 |     'CONTRIBUTING': EXTENSIONS['txt'],
377 |     'copy.bara.sky': EXTENSIONS['bzl'],
378 |     'COPYING': EXTENSIONS['txt'],
379 |     'Dockerfile': {'text', 'dockerfile'},
380 |     'direnvrc': EXTENSIONS['bash'],
381 |     'Gemfile': EXTENSIONS['rb'],
382 |     'Gemfile.lock': {'text'},
383 |     'GNUmakefile': EXTENSIONS['mk'],
384 |     'go.mod': {'text', 'go-mod'},
385 |     'go.sum': {'text', 'go-sum'},
386 |     'Jenkinsfile': EXTENSIONS['jenkins'],
387 |     'LICENSE': EXTENSIONS['txt'],
388 |     'MAINTAINERS': EXTENSIONS['txt'],
389 |     'Makefile': EXTENSIONS['mk'],
390 |     'meson.build': EXTENSIONS['meson'],
391 |     'meson_options.txt': EXTENSIONS['meson'],
392 |     'makefile': EXTENSIONS['mk'],
393 |     'NEWS': EXTENSIONS['txt'],
394 |     'NOTICE': EXTENSIONS['txt'],
395 |     'PATENTS': EXTENSIONS['txt'],
396 |     'Pipfile': EXTENSIONS['toml'],
397 |     'Pipfile.lock': EXTENSIONS['json'],
398 |     'PKGBUILD': {'text', 'bash', 'pkgbuild', 'alpm'},
399 |     'poetry.lock': EXTENSIONS['toml'],
400 |     'pom.xml': EXTENSIONS['pom'],
401 |     'pylintrc': EXTENSIONS['ini'] | {'pylintrc'},
402 |     'README': EXTENSIONS['txt'],
403 |     'Rakefile': EXTENSIONS['rb'],
404 |     'rebar.config': EXTENSIONS['erl'],
405 |     'setup.cfg': EXTENSIONS['ini'],
406 |     'sys.config': EXTENSIONS['erl'],
407 |     'sys.config.src': EXTENSIONS['erl'],
408 |     'Tiltfile': {'text', 'tiltfile'},
409 |     'Vagrantfile': EXTENSIONS['rb'],
410 |     'WORKSPACE': EXTENSIONS['bzl'],
411 |     'wscript': EXTENSIONS['py'],
412 | }
413 | 


--------------------------------------------------------------------------------
/identify/identify.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import errno
  4 | import math
  5 | import os.path
  6 | import re
  7 | import shlex
  8 | import stat
  9 | import string
 10 | import sys
 11 | from typing import IO
 12 | 
 13 | from identify import extensions
 14 | from identify import interpreters
 15 | from identify.vendor import licenses
 16 | 
 17 | 
 18 | printable = frozenset(string.printable)
 19 | 
 20 | DIRECTORY = 'directory'
 21 | SYMLINK = 'symlink'
 22 | SOCKET = 'socket'
 23 | FILE = 'file'
 24 | EXECUTABLE = 'executable'
 25 | NON_EXECUTABLE = 'non-executable'
 26 | TEXT = 'text'
 27 | BINARY = 'binary'
 28 | 
 29 | TYPE_TAGS = frozenset((DIRECTORY, FILE, SYMLINK, SOCKET))
 30 | MODE_TAGS = frozenset((EXECUTABLE, NON_EXECUTABLE))
 31 | ENCODING_TAGS = frozenset((BINARY, TEXT))
 32 | _ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
 33 | _ALL_TAGS.update(*extensions.EXTENSIONS.values())
 34 | _ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
 35 | _ALL_TAGS.update(*extensions.NAMES.values())
 36 | _ALL_TAGS.update(*interpreters.INTERPRETERS.values())
 37 | ALL_TAGS = frozenset(_ALL_TAGS)
 38 | 
 39 | 
 40 | def tags_from_path(path: str) -> set[str]:
 41 |     try:
 42 |         sr = os.lstat(path)
 43 |     except (OSError, ValueError):  # same error-handling as `os.lexists()`
 44 |         raise ValueError(f'{path} does not exist.')
 45 | 
 46 |     mode = sr.st_mode
 47 |     if stat.S_ISDIR(mode):
 48 |         return {DIRECTORY}
 49 |     if stat.S_ISLNK(mode):
 50 |         return {SYMLINK}
 51 |     if stat.S_ISSOCK(mode):
 52 |         return {SOCKET}
 53 | 
 54 |     tags = {FILE}
 55 | 
 56 |     executable = os.access(path, os.X_OK)
 57 |     if executable:
 58 |         tags.add(EXECUTABLE)
 59 |     else:
 60 |         tags.add(NON_EXECUTABLE)
 61 | 
 62 |     # As an optimization, if we're able to read tags from the filename, then we
 63 |     # don't peek at the file contents.
 64 |     t = tags_from_filename(os.path.basename(path))
 65 |     if len(t) > 0:
 66 |         tags.update(t)
 67 |     else:
 68 |         if executable:
 69 |             shebang = parse_shebang_from_file(path)
 70 |             if len(shebang) > 0:
 71 |                 tags.update(tags_from_interpreter(shebang[0]))
 72 | 
 73 |     # some extensions can be both binary and text
 74 |     # see EXTENSIONS_NEED_BINARY_CHECK
 75 |     if not ENCODING_TAGS & tags:
 76 |         if file_is_text(path):
 77 |             tags.add(TEXT)
 78 |         else:
 79 |             tags.add(BINARY)
 80 | 
 81 |     assert ENCODING_TAGS & tags, tags
 82 |     assert MODE_TAGS & tags, tags
 83 |     return tags
 84 | 
 85 | 
 86 | def tags_from_filename(path: str) -> set[str]:
 87 |     _, filename = os.path.split(path)
 88 |     _, ext = os.path.splitext(filename)
 89 | 
 90 |     ret = set()
 91 | 
 92 |     # Allow e.g. "Dockerfile.xenial" to match "Dockerfile"
 93 |     for part in [filename] + filename.split('.'):
 94 |         if part in extensions.NAMES:
 95 |             ret.update(extensions.NAMES[part])
 96 |             break
 97 | 
 98 |     if len(ext) > 0:
 99 |         ext = ext[1:].lower()
100 |         if ext in extensions.EXTENSIONS:
101 |             ret.update(extensions.EXTENSIONS[ext])
102 |         elif ext in extensions.EXTENSIONS_NEED_BINARY_CHECK:
103 |             ret.update(extensions.EXTENSIONS_NEED_BINARY_CHECK[ext])
104 | 
105 |     return ret
106 | 
107 | 
def tags_from_interpreter(interpreter: str) -> set[str]:
    """Return the tags associated with a shebang interpreter.

    Any directory prefix is dropped, then dot-separated version suffixes
    are stripped one at a time until a known interpreter name is found.
    Returns an empty set when nothing matches.

    The result is always a fresh set, so callers may mutate it without
    affecting the shared `interpreters.INTERPRETERS` table.
    """
    _, _, name = interpreter.rpartition('/')

    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
    while name:
        known = interpreters.INTERPRETERS.get(name)
        if known is not None:
            # copy: never hand out the module-level set itself
            return set(known)
        name, _, _ = name.rpartition('.')

    return set()
119 | 
120 | 
def is_text(bytesio: IO[bytes]) -> bool:
    """Return whether the first KB of contents seems to be text.

    Reads up to 1024 bytes from `bytesio` and returns `True` when every byte
    read belongs to the allowed "text" set (a handful of control characters,
    printable ASCII, and all bytes >= 0x80).  An empty read counts as text.

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
    """
    control_chars = bytes([7, 8, 9, 10, 11, 12, 13, 27])
    ascii_printable = bytes(range(0x20, 0x7F))
    high_bytes = bytes(range(0x80, 0x100))
    allowed = control_chars + ascii_printable + high_bytes

    head = bytesio.read(1024)
    # deleting every allowed byte leaves only the "binary" ones behind
    leftover = head.translate(None, allowed)
    return len(leftover) == 0
133 | 
134 | 
def file_is_text(path: str) -> bool:
    """Return whether the file at `path` looks like text (see `is_text`).

    Raises `ValueError` if `path` does not exist.
    """
    if os.path.lexists(path):
        with open(path, 'rb') as fobj:
            return is_text(fobj)
    raise ValueError(f'{path} does not exist.')
140 | 
141 | 
def _shebang_split(line: str) -> list[str]:
    """Split a shebang line (without the leading `#!`) into tokens.

    shebangs aren't supposed to be quoted, though some tools such as
    setuptools will write them with quotes, so shell-style splitting is
    attempted first.  When that fails (e.g. an unbalanced quote) the line
    is split on whitespace, which is the "traditional" shebang parsing.
    """
    try:
        tokens = shlex.split(line)
    except ValueError:
        tokens = line.split()
    return tokens
152 | 
153 | 
def _parse_nix_shebang(
        bytesio: IO[bytes],
        cmd: tuple[str, ...],
) -> tuple[str, ...]:
    """Resolve the real interpreter from the `#!` lines that follow a
    nix-shell shebang.

    Consumes consecutive lines starting with `#!` from `bytesio`.  Each time
    a line contains `-i <interpreter>`, `cmd` is replaced by
    `(<interpreter>,)`; the last such occurrence wins.  If a line is not
    valid UTF-8 or contains a character outside `printable`, the command
    found so far is returned immediately.
    """
    while bytesio.read(2) == b'#!':
        raw_line = bytesio.readline()
        try:
            line = raw_line.decode('UTF-8')
        except UnicodeDecodeError:
            return cmd

        if any(ch not in printable for ch in line):
            return cmd

        tokens = tuple(_shebang_split(line.strip()))
        # pair each token with its successor so a trailing `-i` that has no
        # argument is never looked at
        for flag, value in zip(tokens, tokens[1:]):
            if flag == '-i':
                cmd = (value,)
    return cmd
176 | 
177 | 
def parse_shebang(bytesio: IO[bytes]) -> tuple[str, ...]:
    """Parse the shebang from a file opened for reading binary.

    Returns the command as a tuple of tokens, with a leading `/usr/bin/env`
    (optionally followed by `-S`) removed.  Returns `()` when the stream
    does not start with `#!`, when the first line is not valid UTF-8, or
    when it contains a character outside `printable`.  A bare `nix-shell`
    command is resolved further from the following `#!` lines.
    """
    if bytesio.read(2) != b'#!':
        return ()

    try:
        first_line = bytesio.readline().decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    if not printable.issuperset(first_line):
        return ()

    cmd = tuple(_shebang_split(first_line.strip()))

    # strip the env launcher; the longer prefix must be tried first
    env_prefixes = (('/usr/bin/env', '-S'), ('/usr/bin/env',))
    for prefix in env_prefixes:
        if cmd[:len(prefix)] == prefix:
            cmd = cmd[len(prefix):]
            break

    if cmd == ('nix-shell',):
        return _parse_nix_shebang(bytesio, cmd)
    return cmd
203 | 
204 | 
def parse_shebang_from_file(path: str) -> tuple[str, ...]:
    """Parse the shebang given a file path.

    Returns `()` for files that are not executable, and also when opening
    or reading the file fails with `EINVAL`; any other `OSError` propagates.
    Raises `ValueError` if `path` does not exist.
    """
    if not os.path.lexists(path):
        raise ValueError(f'{path} does not exist.')
    if not os.access(path, os.X_OK):
        return ()

    try:
        with open(path, 'rb') as fobj:
            return parse_shebang(fobj)
    except OSError as exc:
        if exc.errno != errno.EINVAL:
            raise
        return ()
220 | 
221 | 
# A whole line that starts (after optional whitespace) with "Copyright" or
# "(C)" followed by a space; matched case-insensitively, line by line.
COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
# Any run of whitespace.
WS_RE = re.compile(r'\s+')


def _norm_license(s: str) -> str:
    """Normalize license text for comparison.

    Copyright lines are removed, every run of whitespace is collapsed to a
    single space, and leading / trailing whitespace is stripped.
    """
    without_copyright = COPYRIGHT_RE.sub('', s)
    return WS_RE.sub(' ', without_copyright).strip()
230 | 
231 | 
232 | def license_id(filename: str) -> str | None:
233 |     """Return the spdx id for the license contained in `filename`.  If no
234 |     license is detected, returns `None`.
235 | 
236 |     spdx: https://spdx.org/licenses/
237 |     licenses from choosealicense.com: https://github.com/choosealicense.com
238 | 
239 |     Approximate algorithm:
240 | 
241 |     1. strip copyright line
242 |     2. normalize whitespace (replace all whitespace with a single space)
243 |     3. check exact text match with existing licenses
244 |     4. failing that use edit distance
245 |     """
246 |     import ukkonen  # `pip install identify[license]`
247 | 
248 |     with open(filename, encoding='UTF-8') as f:
249 |         contents = f.read()
250 | 
251 |     norm = _norm_license(contents)
252 | 
253 |     min_edit_dist = sys.maxsize
254 |     min_edit_dist_spdx = ''
255 | 
256 |     cutoff = math.ceil(.05 * len(norm))
257 | 
258 |     # try exact matches
259 |     for spdx, text in licenses.LICENSES:
260 |         norm_license = _norm_license(text)
261 |         if norm == norm_license:
262 |             return spdx
263 | 
264 |         # skip the slow calculation if the lengths are very different
265 |         if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
266 |             continue
267 | 
268 |         edit_dist = ukkonen.distance(norm, norm_license, cutoff)
269 |         if edit_dist < cutoff and edit_dist < min_edit_dist:
270 |             min_edit_dist = edit_dist
271 |             min_edit_dist_spdx = spdx
272 | 
273 |     # if there's less than 5% edited from the license, we found our match
274 |     if norm and min_edit_dist < cutoff:
275 |         return min_edit_dist_spdx
276 |     else:
277 |         # no matches :'(
278 |         return None
279 | 


--------------------------------------------------------------------------------
/identify/interpreters.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | INTERPRETERS = {
 3 |     'ash': {'shell', 'ash'},
 4 |     'awk': {'awk'},
 5 |     'bash': {'shell', 'bash'},
 6 |     'bats': {'shell', 'bash', 'bats'},
 7 |     'cbsd': {'shell', 'cbsd'},
 8 |     'csh': {'shell', 'csh'},
 9 |     'dash': {'shell', 'dash'},
10 |     'expect': {'expect'},
11 |     'ksh': {'shell', 'ksh'},
12 |     'node': {'javascript'},
13 |     'nodejs': {'javascript'},
14 |     'perl': {'perl'},
15 |     'php': {'php'},
16 |     'php7': {'php', 'php7'},
17 |     'php8': {'php', 'php8'},
18 |     'python': {'python'},
19 |     'python2': {'python', 'python2'},
20 |     'python3': {'python', 'python3'},
21 |     'ruby': {'ruby'},
22 |     'sh': {'shell', 'sh'},
23 |     'tcsh': {'shell', 'tcsh'},
24 |     'zsh': {'shell', 'zsh'},
25 | }
26 | 


--------------------------------------------------------------------------------
/identify/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/identify/py.typed


--------------------------------------------------------------------------------
/identify/vendor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/identify/vendor/__init__.py


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | covdefaults
2 | coverage
3 | pytest
4 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | name = identify
 3 | version = 2.6.12
 4 | description = File identification library for Python
 5 | long_description = file: README.md
 6 | long_description_content_type = text/markdown
 7 | url = https://github.com/pre-commit/identify
 8 | author = Chris Kuehl
 9 | author_email = ckuehl@ocf.berkeley.edu
10 | license = MIT
11 | license_files = LICENSE
12 | classifiers =
13 |     Programming Language :: Python :: 3
14 |     Programming Language :: Python :: 3 :: Only
15 |     Programming Language :: Python :: Implementation :: CPython
16 |     Programming Language :: Python :: Implementation :: PyPy
17 | 
18 | [options]
19 | packages = find:
20 | python_requires = >=3.9
21 | 
22 | [options.packages.find]
23 | exclude =
24 |     tests*
25 |     testing*
26 | 
27 | [options.entry_points]
28 | console_scripts =
29 |     identify-cli=identify.cli:main
30 | 
31 | [options.extras_require]
32 | license =
33 |     ukkonen
34 | 
35 | [options.package_data]
36 | identify =
37 |     py.typed
38 | 
39 | [bdist_wheel]
40 | universal = True
41 | 
42 | [coverage:run]
43 | plugins = covdefaults
44 | 
45 | [mypy]
46 | check_untyped_defs = true
47 | disallow_any_generics = true
48 | disallow_incomplete_defs = true
49 | disallow_untyped_defs = true
50 | warn_redundant_casts = true
51 | warn_unused_ignores = true
52 | 
53 | [mypy-testing.*]
54 | disallow_untyped_defs = false
55 | 
56 | [mypy-tests.*]
57 | disallow_untyped_defs = false
58 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | from setuptools import setup
4 | setup()
5 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pre-commit/identify/40af39f8124a4e8029ff3716c2b0bbf2e1e5fb1e/tests/__init__.py


--------------------------------------------------------------------------------
/tests/cli_test.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from identify import cli
 4 | 
 5 | 
 6 | def test_identify_cli(capsys):
 7 |     ret = cli.main(('setup.py',))
 8 |     out, _ = capsys.readouterr()
 9 |     assert ret == 0
10 |     assert out == '["file", "non-executable", "python", "text"]\n'
11 | 
12 | 
def test_identify_cli_filename_only(capsys):
    """With `--filename-only` only name-derived tags are printed."""
    exit_code = cli.main(('setup.py', '--filename-only'))
    stdout, _stderr = capsys.readouterr()
    assert exit_code == 0
    assert stdout == '["python", "text"]\n'
18 | 
19 | 
def test_identify_cli_filename_only_unidentified(capsys):
    """An unrecognized name with `--filename-only` exits 1 and prints nothing."""
    exit_code = cli.main(('x.unknown', '--filename-only'))
    stdout, _stderr = capsys.readouterr()
    assert exit_code == 1
    assert stdout == ''
25 | 
26 | 
def test_file_not_found(capsys):
    """A missing path exits 1 and reports that the file does not exist."""
    exit_code = cli.main(('x.unknown',))
    stdout, _stderr = capsys.readouterr()
    assert exit_code == 1
    assert stdout == 'x.unknown does not exist.\n'
32 | 


--------------------------------------------------------------------------------
/tests/extensions_test.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import pytest
 4 | 
 5 | from identify import extensions
 6 | 
 7 | 
 8 | @pytest.mark.parametrize('extension', extensions.EXTENSIONS)
 9 | def test_extensions_have_binary_or_text(extension):
10 |     tags = extensions.EXTENSIONS[extension]
11 |     assert len({'text', 'binary'} & tags) == 1, tags
12 | 
13 | 
@pytest.mark.parametrize('name', extensions.NAMES)
def test_names_have_binary_or_text(name):
    """Every well-known filename entry carries exactly one of text / binary."""
    tags = extensions.NAMES[name]
    encoding_tags = tags & {'text', 'binary'}
    assert len(encoding_tags) == 1, tags
18 | 
19 | 
@pytest.mark.parametrize('extension', extensions.EXTENSIONS_NEED_BINARY_CHECK)
def test_need_binary_check_do_not_specify_text_binary(extension):
    """Entries that need a content check must not pre-declare text / binary."""
    tags = extensions.EXTENSIONS_NEED_BINARY_CHECK[extension]
    encoding_tags = tags & {'text', 'binary'}
    assert len(encoding_tags) == 0, tags
24 | 
25 | 
def test_mutually_exclusive_check_types():
    """No extension appears in both extension tables."""
    plain = set(extensions.EXTENSIONS)
    needs_check = set(extensions.EXTENSIONS_NEED_BINARY_CHECK)
    assert plain.isdisjoint(needs_check)
31 | 


--------------------------------------------------------------------------------
/tests/identify_test.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import builtins
  4 | import errno
  5 | import io
  6 | import os
  7 | import socket
  8 | import stat
  9 | from tempfile import TemporaryDirectory
 10 | from unittest import mock
 11 | 
 12 | import pytest
 13 | 
 14 | from identify import identify
 15 | 
 16 | 
 17 | def test_all_tags_includes_basic_ones():
 18 |     assert 'file' in identify.ALL_TAGS
 19 |     assert 'directory' in identify.ALL_TAGS
 20 |     assert 'executable' in identify.ALL_TAGS
 21 |     assert 'text' in identify.ALL_TAGS
 22 |     assert 'socket' in identify.ALL_TAGS
 23 | 
 24 | 
 25 | @pytest.mark.parametrize(
 26 |     'tag_group',
 27 |     (
 28 |         identify.TYPE_TAGS,
 29 |         identify.MODE_TAGS,
 30 |         identify.ENCODING_TAGS,
 31 |     ),
 32 | )
 33 | def test_all_tags_contains_all_groups(tag_group):
 34 |     assert tag_group < identify.ALL_TAGS
 35 | 
 36 | 
 37 | def test_all_tags_contains_each_type():
 38 |     assert 'xml' in identify.ALL_TAGS  # extension
 39 |     assert 'plist' in identify.ALL_TAGS  # extension, needs binary check
 40 |     assert 'dockerfile' in identify.ALL_TAGS  # by file convention
 41 |     assert 'python3' in identify.ALL_TAGS  # by shebang
 42 |     assert 'php8' in identify.ALL_TAGS  # by shebang
 43 | 
 44 | 
 45 | def test_tags_from_path_does_not_exist(tmpdir):
 46 |     x = tmpdir.join('foo')
 47 |     with pytest.raises(ValueError):
 48 |         identify.tags_from_path(x.strpath)
 49 | 
 50 | 
 51 | def test_tags_from_path_directory(tmpdir):
 52 |     x = tmpdir.join('foo')
 53 |     x.mkdir()
 54 |     assert identify.tags_from_path(x.strpath) == {'directory'}
 55 | 
 56 | 
 57 | def test_tags_from_path_symlink(tmpdir):
 58 |     x = tmpdir.join('foo')
 59 |     x.mksymlinkto(tmpdir.join('lol').ensure())
 60 |     assert identify.tags_from_path(x.strpath) == {'symlink'}
 61 | 
 62 | 
 63 | def test_tags_from_path_socket():
 64 |     tmproot = '/tmp'  # short path avoids `OSError: AF_UNIX path too long`
 65 |     with TemporaryDirectory(dir=tmproot) as tmpdir:
 66 |         socket_path = os.path.join(tmpdir, 'socket')
 67 |         with socket.socket(socket.AF_UNIX) as sock:
 68 |             sock.bind(socket_path)
 69 |             tags = identify.tags_from_path(socket_path)
 70 | 
 71 |     assert tags == {'socket'}
 72 | 
 73 | 
 74 | def test_tags_from_path_broken_symlink(tmpdir):
 75 |     x = tmpdir.join('foo')
 76 |     x.mksymlinkto(tmpdir.join('lol'))
 77 |     assert identify.tags_from_path(x.strpath) == {'symlink'}
 78 | 
 79 | 
 80 | def test_tags_from_path_simple_file(tmpdir):
 81 |     x = tmpdir.join('test.py').ensure()
 82 |     assert identify.tags_from_path(x.strpath) == {
 83 |         'file', 'text', 'non-executable', 'python',
 84 |     }
 85 | 
 86 | 
 87 | def test_tags_from_path_file_with_incomplete_shebang(tmpdir):
 88 |     x = tmpdir.join('test')
 89 |     x.write_text('#!   \n', encoding='UTF-8')
 90 |     make_executable(x.strpath)
 91 |     assert identify.tags_from_path(x.strpath) == {
 92 |         'file', 'text', 'executable',
 93 |     }
 94 | 
 95 | 
 96 | def test_tags_from_path_file_with_shebang_non_executable(tmpdir):
 97 |     x = tmpdir.join('test')
 98 |     x.write_text('#!/usr/bin/env python\nimport sys\n', encoding='UTF-8')
 99 |     assert identify.tags_from_path(x.strpath) == {
100 |         'file', 'text', 'non-executable',
101 |     }
102 | 
103 | 
def test_tags_from_path_file_with_shebang_executable(tmpdir):
    """An executable file gets the tags of its shebang interpreter."""
    script = tmpdir.join('test')
    script.write_text('#!/usr/bin/env python\nimport sys\n', encoding='UTF-8')
    make_executable(script.strpath)
    tags = identify.tags_from_path(script.strpath)
    assert tags == {'file', 'text', 'executable', 'python'}
111 | 
112 | 
def test_tags_from_path_binary(tmpdir):
    """An executable starting with ELF magic bytes is tagged binary."""
    binary = tmpdir.join('test')
    binary.write(b'\x7f\x45\x4c\x46\x02\x01\x01')
    make_executable(binary.strpath)
    tags = identify.tags_from_path(binary.strpath)
    assert tags == {'file', 'binary', 'executable'}
120 | 
121 | 
def test_tags_from_path_plist_binary(tmpdir):
    """A `.plist` with binary contents is tagged plist + binary."""
    bplist = (
        b'bplist00\xd1\x01\x02_\x10\x0fLast Login NameWDefault\x08\x0b\x1d\x00'
        b'\x00\x00\x00\x00\x00\x01\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00'
        b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00%'
    )
    plist = tmpdir.join('t.plist')
    plist.write_binary(bplist)
    tags = identify.tags_from_path(plist.strpath)
    assert tags == {'file', 'plist', 'binary', 'non-executable'}
132 | 
133 | 
def test_tags_from_path_plist_text(tmpdir):
    """A `.plist` with XML contents is tagged plist + text."""
    xml_plist = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n'  # noqa: E501
        '<plist version="1.0">\n'
        '<dict>\n'
        '\t<key>Last Login Name</key>\n'
        '\t<string>Default</string>\n'
        '</dict>\n'
        '</plist>\n'
    )
    plist = tmpdir.join('t.plist')
    plist.write(xml_plist)
    tags = identify.tags_from_path(plist.strpath)
    assert tags == {'file', 'plist', 'text', 'non-executable'}
149 | 
150 | 
@pytest.mark.parametrize(
    ('filename', 'expected'),
    (
        ('.salt-lint', {'text', 'salt-lint', 'yaml'}),
        ('test.py', {'text', 'python'}),
        ('test.mk', {'text', 'makefile'}),
        ('Makefile', {'text', 'makefile'}),
        ('Containerfile', {'text', 'dockerfile'}),
        ('Dockerfile', {'text', 'dockerfile'}),
        ('Dockerfile.xenial', {'text', 'dockerfile'}),
        ('xenial.Dockerfile', {'text', 'dockerfile'}),
        ('Pipfile', {'text', 'toml'}),
        ('Pipfile.lock', {'text', 'json'}),
        ('mod/test.py', {'text', 'python'}),
        ('mod/Dockerfile', {'text', 'dockerfile'}),
        ('config.ru', {'text', 'ruby'}),
        ('Gemfile', {'text', 'ruby'}),
        ('Gemfile.lock', {'text'}),
        ('Jenkinsfile', {'text', 'groovy', 'jenkins'}),
        ('build.jenkins', {'text', 'groovy', 'jenkins'}),
        ('build.jenkinsfile', {'text', 'groovy', 'jenkins'}),
        ('meson.build', {'text', 'meson'}),
        ('meson_options.txt', {'text', 'plain-text', 'meson'}),
        ('Vagrantfile', {'text', 'ruby'}),
        ('Tiltfile', {'text', 'tiltfile'}),
        ('Tiltfile.abc', {'text', 'tiltfile'}),
        ('test.Tiltfile', {'text', 'tiltfile'}),

        # does not set binary / text
        ('f.plist', {'plist'}),

        # case of extension should be ignored
        ('f.JPG', {'binary', 'image', 'jpeg'}),
        # but case of name checks should still be honored
        ('dockerfile.py', {'text', 'python'}),

        # full filename tests should take precedence over extension tests
        ('test.cfg', {'text'}),
        ('setup.cfg', {'text', 'ini'}),

        # Filename matches should still include extensions if applicable
        ('README.md', {'text', 'markdown', 'plain-text'}),

        # nothing recognizable: unknown extension, no extension, empty name
        ('test.weird-unrecognized-extension', set()),
        ('test', set()),
        ('', set()),
    ),
)
def test_tags_from_filename(filename, expected):
    """`tags_from_filename` returns exactly `expected` for each `filename`."""
    assert identify.tags_from_filename(filename) == expected
201 | 
202 | 
@pytest.mark.parametrize(
    ('interpreter', 'expected'),
    (
        ('python', {'python'}),
        ('python3', {'python3', 'python'}),
        # version suffixes are stripped until a known name is found
        ('python3.5.2', {'python3', 'python'}),
        # a directory prefix is ignored
        ('/usr/bin/python3.5.2', {'python3', 'python'}),
        # unknown interpreters and the empty string yield no tags
        ('/usr/bin/herpderpderpderpderp', set()),
        ('something-random', set()),
        ('', set()),
    ),
)
def test_tags_from_interpreter(interpreter, expected):
    """`tags_from_interpreter` returns exactly `expected` for each input."""
    assert identify.tags_from_interpreter(interpreter) == expected
217 | 
218 | 
@pytest.mark.parametrize(
    ('data', 'expected'),
    (
        # plain ascii, empty input and non-ascii encoded text count as text
        (b'hello world', True),
        (b'', True),
        ('éóñəå  ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode(), True),
        (r'¯\_(ツ)_/¯'.encode(), True),
        ('♪┏(・o･)┛♪┗ ( ･o･) ┓♪┏ ( ) ┛♪┗ (･o･ ) ┓♪'.encode(), True),
        ('éóñå'.encode('latin1'), True),

        # a NUL byte makes the data binary
        (b'hello world\x00', False),
        # first few bytes of /bin/bash
        (b'\x7f\x45\x4c\x46\x02\x01\x01', False),
        # some /dev/urandom output
        (b'\x43\x92\xd9\x0f\xaf\x32\x2c', False),
    ),
)
def test_is_text(data, expected):
    """`is_text` classifies each byte string as text (True) or not (False)."""
    assert identify.is_text(io.BytesIO(data)) is expected
238 | 
239 | 
def test_file_is_text_simple(tmpdir):
    """A file containing plain ascii text is reported as text."""
    text_file = tmpdir.join('f')
    text_file.write_text('hello there\n', encoding='UTF-8')
    result = identify.file_is_text(text_file.strpath)
    assert result is True
244 | 
245 | 
def test_file_is_text_does_not_exist(tmpdir):
    """`file_is_text` raises ValueError for a path that was never created."""
    missing = tmpdir.join('f')
    with pytest.raises(ValueError):
        identify.file_is_text(missing.strpath)
250 | 
251 | 
@pytest.mark.parametrize(
    ('s', 'expected'),
    (
        (b'', ()),
        (b'#!/usr/bin/python', ('/usr/bin/python',)),
        # a leading `/usr/bin/env` is dropped from the result
        (b'#!/usr/bin/env python', ('python',)),
        (b'#! /usr/bin/python', ('/usr/bin/python',)),
        (b'#!/usr/bin/foo  python', ('/usr/bin/foo', 'python')),
        # despite this being invalid, setuptools will write shebangs like this
        (b'#!"/path/with spaces/x" y', ('/path/with spaces/x', 'y')),
        # this is apparently completely ok to embed quotes
        (b"#!/path'with/quotes    y", ("/path'with/quotes", 'y')),
        # Don't regress on leading/trailing ws
        (b"#! /path'with/quotes y ", ("/path'with/quotes", 'y')),
        # Test nix-shell specialites with shebang on second line
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -i bash -p python',
            ('bash',),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -i python -p coreutils',
            ('python',),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -p coreutils -i python',
            ('python',),
        ),
        # multi-line and no whitespace variation
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -p coreutils\n'
            b'#! nix-shell -i python',
            ('python',),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#!nix-shell -p coreutils\n'
            b'#!nix-shell -i python',
            ('python',),
        ),
        # undecodable / unprintable follow-up lines leave `nix-shell` as is
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#!\xf9\x93\x01\x42\xcd',
            ('nix-shell',),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#!\x00\x00\x00\x00',
            ('nix-shell',),
        ),
        # non-proper nix-shell
        (b'#! /usr/bin/nix-shell', ('/usr/bin/nix-shell',)),
        (b'#! /usr/bin/env nix-shell', ('nix-shell',)),
        (
            b'#! /usr/bin/env nix-shell non-portable-argument',
            ('nix-shell', 'non-portable-argument'),
        ),
        (
            b'#! /usr/bin/env nix-shell\n'
            b'#! nix-shell -i',
            ('nix-shell',),   # guard against index error
        ),
        # interpret quotes correctly
        (
            b'#!/usr/bin/env nix-shell\n'
            b'#!nix-shell --argstr x "a -i python3 p"\n'
            b'#!nix-shell -p hello\n'
            b'#!nix-shell -i bash\n'
            b'#!nix-shell --argstr y "b -i runhaskell q"',
            ('bash',),
        ),
        # no shebang marker, or a first line that is not printable text
        (b'\xf9\x93\x01\x42\xcd', ()),
        (b'#!\xf9\x93\x01\x42\xcd', ()),
        (b'#!\x00\x00\x00\x00', ()),
        # shebang lines with multiple arguments
        (b'#!/usr/bin/env -S python -u', ('python', '-u')),
        (b'#!/usr/bin/env', ()),
        (b'#!/usr/bin/env -S', ()),
    ),
)
def test_parse_shebang(s, expected):
    """`parse_shebang` returns exactly `expected` for each byte string `s`."""
    assert identify.parse_shebang(io.BytesIO(s)) == expected
337 | 
338 | 
def test_parse_shebang_from_file_does_not_exist():
    """`parse_shebang_from_file` raises ValueError for a missing path."""
    missing_path = 'herp derp derp'
    with pytest.raises(ValueError):
        identify.parse_shebang_from_file(missing_path)
342 | 
343 | 
def test_parse_shebang_from_file_nonexecutable(tmpdir):
    """A non-executable file yields no shebang even when it has one."""
    script = tmpdir.join('f')
    script.write_text('#!/usr/bin/env python', encoding='UTF-8')
    shebang = identify.parse_shebang_from_file(script.strpath)
    assert shebang == ()
348 | 
349 | 
def test_parse_shebang_from_file_simple(tmpdir):
    """An executable file's `env` shebang is parsed to the interpreter name."""
    script = tmpdir.join('f')
    script.write_text('#!/usr/bin/env python', encoding='UTF-8')
    make_executable(script.strpath)
    shebang = identify.parse_shebang_from_file(script.strpath)
    assert shebang == ('python',)
355 | 
356 | 
def test_parse_shebang_open_raises_einval(tmpdir):
    """An `EINVAL` from `open` is swallowed and reported as "no shebang".

    The file holds a valid shebang, so a `()` result can only come from the
    error-handling path rather than from parsing.
    """
    x = tmpdir.join('f')
    x.write('#!/usr/bin/env not-expected\n')
    # pass the string path, like every other `make_executable` call here
    make_executable(x.strpath)
    error = OSError(errno.EINVAL, f'Invalid argument {x}')
    with mock.patch.object(builtins, 'open', side_effect=error):
        assert identify.parse_shebang_from_file(x.strpath) == ()
364 | 
365 | 
def make_executable(filename):
    """Add the user, group and other execute bits to `filename`'s mode."""
    exec_bits = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
    current_mode = os.stat(filename).st_mode
    os.chmod(filename, current_mode | exec_bits)
372 | 
373 | 
def test_license_identification():
    """The repository's own LICENSE file is identified as MIT."""
    detected = identify.license_id('LICENSE')
    assert detected == 'MIT'
376 | 
377 | 
def test_license_exact_identification(tmpdir):
    """A license file containing the WTFPL text is identified as WTFPL."""
    license_text = '''\
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004

 Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. You just DO WHAT THE FUCK YOU WANT TO.
'''
    license_file = tmpdir.join('LICENSE')
    license_file.write(license_text)
    detected = identify.license_id(license_file.strpath)
    assert detected == 'WTFPL'
397 | 
398 | 
def test_license_not_identified():
    """An empty input (the null device) matches no license."""
    detected = identify.license_id(os.devnull)
    assert detected is None
401 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py,pre-commit
 3 | 
 4 | [testenv]
 5 | deps = -rrequirements-dev.txt
 6 | extras = license
 7 | commands =
 8 |     coverage erase
 9 |     coverage run -m pytest {posargs:tests}
10 |     coverage report
11 | 
12 | [testenv:pre-commit]
13 | skip_install = true
14 | deps = pre-commit
15 | commands = pre-commit run --all-files --show-diff-on-failure
16 | 
17 | [pep8]
18 | ignore = E265,E501,W504
19 | 


--------------------------------------------------------------------------------