├── .coveragerc ├── .gitattributes ├── .github └── workflows │ ├── ci.yml │ └── publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmark ├── README.md ├── results.csv ├── results.json ├── results_v0.2.0.csv ├── results_v0.2.0.json └── run.py ├── codecov.yml ├── pyproject.toml ├── setup.cfg ├── setup.py ├── src └── dirhash │ ├── __init__.py │ ├── _version.py │ └── cli.py ├── tests ├── test_cli.py └── test_dirhash.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = dirhash 4 | omit = _version.py 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | src/dirhash/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - "master" 7 | pull_request: 8 | branches: 9 | - "**" 10 | workflow_dispatch: 11 | release: 12 | types: [published, edited] 13 | 14 | jobs: 15 | pre-commit: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.8" 22 | - uses: pre-commit/action@v3.0.1 23 | 24 | tests: 25 | runs-on: ${{ matrix.os }} 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 30 | os: [ubuntu-latest, windows-latest] 31 | 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Set up Python ${{ matrix.python-version }} 35 | uses: actions/setup-python@v5 36 | with: 37 | python-version: ${{ matrix.python-version }} 38 | - name: Install dependencies 39 | run: | 40 | python -m pip install --upgrade pip 41 | python -m pip install tox tox-gh-actions 42 | - name: Cache tox environments 43 | id: cache-tox 44 | uses: actions/cache@v4 45 | with: 46 | path: .tox 47 | # setup.py and setup.cfg have versioning info that would impact the 48 | # tox environment. hashFiles only takes a single file path or pattern 49 | # at the moment. 50 | key: ${{ runner.os }}-${{ matrix.python-version }}-tox-${{ hashFiles('setup.py') }}-${{ hashFiles('setup.cfg') }} }} 51 | - name: Test with tox 52 | run: tox 53 | - uses: codecov/codecov-action@v4 54 | if: matrix.python-version == '3.8' && matrix.os == 'ubuntu-latest' 55 | with: 56 | token: ${{ secrets.CODECOV_TOKEN }} 57 | verbose: true 58 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # Based on https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/# 2 | name: Publish Python Package 3 | 4 | on: 5 | push: 6 | tags: 7 | - "v[0-9]+.[0-9]+.[0-9]*" 8 | 9 | jobs: 10 | build: 11 | name: Build distribution 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | # NOTE: tags are not present unless triggered by tag push 17 | # - name: Get tags 18 | # run: git fetch --tags origin 19 | # - name: List tags 20 | # run: git tag --list 21 | # TODO: somehow versioneer does not pickup the tag when workflow is not triggered by a 22 | # tag push, getting e.g. 
(for sister repo scantree) scantree-0+untagged.1.gd74b1d5, 23 | # see: https://github.com/andhus/scantree/actions/runs/7485873305/job/20375116541#step:7:42) 24 | - name: Set up Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.x" 28 | - name: Install pypa/build 29 | run: >- 30 | python3 -m 31 | pip install 32 | build 33 | --user 34 | - name: Build a binary wheel and a source tarball 35 | run: python3 -m build 36 | - name: Store the distribution packages 37 | uses: actions/upload-artifact@v4 38 | with: 39 | name: python-package-distributions 40 | path: dist/ 41 | 42 | publish-to-pypi: 43 | name: Publish to PyPI 44 | # TODO we need to make sure the tag matches the version! 45 | if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes 46 | needs: 47 | - build 48 | runs-on: ubuntu-latest 49 | environment: 50 | name: pypi 51 | url: https://pypi.org/p/dirhash 52 | permissions: 53 | id-token: write # IMPORTANT: mandatory for trusted publishing 54 | 55 | steps: 56 | - name: Download all the dists 57 | uses: actions/download-artifact@v4 58 | with: 59 | name: python-package-distributions 60 | path: dist/ 61 | - name: Publish distribution 📦 to PyPI 62 | uses: pypa/gh-action-pypi-publish@release/v1 63 | 64 | github-release: 65 | name: Sign and upload to GitHub Release 66 | needs: 67 | - publish-to-pypi 68 | runs-on: ubuntu-latest 69 | 70 | permissions: 71 | contents: write # IMPORTANT: mandatory for making GitHub Releases 72 | id-token: write # IMPORTANT: mandatory for sigstore 73 | 74 | steps: 75 | - name: Download all the dists 76 | uses: actions/download-artifact@v4 77 | with: 78 | name: python-package-distributions 79 | path: dist/ 80 | - name: Sign the dists with Sigstore 81 | uses: sigstore/gh-action-sigstore-python@v2.1.1 82 | with: 83 | inputs: >- 84 | ./dist/*.tar.gz 85 | ./dist/*.whl 86 | - name: Create GitHub Release 87 | env: 88 | GITHUB_TOKEN: ${{ github.token }} 89 | run: >- 90 | gh release create 91 | '${{ github.ref_name }}' 92 | --repo '${{ github.repository }}' 93 | --notes "" 94 | - name: Upload artifact signatures to GitHub Release 95 | env: 96 | GITHUB_TOKEN: ${{ github.token }} 97 | # Upload to GitHub Release using the `gh` CLI. 98 | # `dist/` contains the built packages, and the 99 | # sigstore-produced signatures and certificates. 
100 | run: >- 101 | gh release upload 102 | '${{ github.ref_name }}' dist/** 103 | --repo '${{ github.repository }}' 104 | 105 | publish-to-testpypi: 106 | name: Publish to TestPyPI 107 | if: startsWith(github.ref, 'refs/tags/') # only publish on tag pushes 108 | needs: 109 | - build 110 | runs-on: ubuntu-latest 111 | 112 | environment: 113 | name: testpypi 114 | url: https://test.pypi.org/p/dirhash 115 | 116 | permissions: 117 | id-token: write # IMPORTANT: mandatory for trusted publishing 118 | 119 | steps: 120 | - name: Download all the dists 121 | uses: actions/download-artifact@v4 122 | with: 123 | name: python-package-distributions 124 | path: dist/ 125 | - name: Publish distribution 📦 to TestPyPI 126 | uses: pypa/gh-action-pypi-publish@release/v1 127 | with: 128 | repository-url: https://test.pypi.org/legacy/ 129 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Pycharm 107 | .idea/ 108 | 109 | # VSC 110 | .vscode/ 111 | 112 | # Project specific 113 | benchmark/test_cases/* 114 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/mirrors-prettier 3 | rev: v3.1.0 4 | hooks: 5 | - id: prettier 6 | args: [--prose-wrap=preserve, --print-width=88] 7 | - repo: https://github.com/astral-sh/ruff-pre-commit 8 | rev: v0.3.7 9 | hooks: 10 | - id: ruff 11 | args: 12 | - --fix 13 | - id: ruff-format 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All 
notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [Unreleased] 9 | 10 | NIL 11 | 12 | ## [0.2.0] - 2019-04-20 13 | 14 | Complies with [Dirhash Standard](https://github.com/andhus/dirhash) Version [0.1.0](https://github.com/andhus/dirhash/releases/v0.1.0) 15 | 16 | ### Added 17 | 18 | - A first implementation based on the formalized [Dirhash Standard](https://github.com/andhus/dirhash). 19 | - This changelog. 20 | - Results from a new benchmark run after the changes. The `benchmark/run.py` script now outputs results files whose names include the `dirhash.__version__`. 21 | 22 | ### Changed 23 | 24 | - **Significant breaking changes** from version 0.1.1 - both regarding the API and the 25 | underlying method/protocol for computing the hash. This means that **hashes 26 | computed with this version will differ from hashes computed with version < 0.2.0 for 27 | the same directory**. 28 | - This dirhash python implementation has moved here, to 29 | [github.com/andhus/dirhash-python](https://github.com/andhus/dirhash-python), from 30 | the previous repository 31 | [github.com/andhus/dirhash](https://github.com/andhus/dirhash), 32 | which now contains the formal description of the Dirhash Standard. 33 | 34 | ### Removed 35 | 36 | - All support for the `.dirhashignore` file. This seemed superfluous; please file an 37 | issue if you need this feature. 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Anders Huss 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![codecov](https://codecov.io/gh/andhus/dirhash-python/branch/master/graph/badge.svg)](https://codecov.io/gh/andhus/dirhash-python) 2 | 3 | # dirhash 4 | 5 | A lightweight python module and CLI for computing the hash of any 6 | directory based on its files' structure and content. 7 | 8 | - Supports all hashing algorithms of Python's built-in `hashlib` module. 9 | - Glob/wildcard (".gitignore style") path matching for expressive filtering of files to include/exclude. 10 | - Multiprocessing for up to [6x speed-up](#performance). 11 | 12 | The hash is computed according to the [Dirhash Standard](https://github.com/andhus/dirhash), which is designed to allow for consistent and collision resistant generation/verification of directory hashes across implementations. 13 | 14 | ## Installation 15 | 16 | From PyPI: 17 | 18 | ```commandline 19 | pip install dirhash 20 | ``` 21 | 22 | Or directly from source: 23 | 24 | ```commandline 25 | git clone git@github.com:andhus/dirhash-python.git 26 | pip install dirhash/ 27 | ``` 28 | 29 | ## Usage 30 | 31 | Python module: 32 | 33 | ```python 34 | from dirhash import dirhash 35 | 36 | dirpath = "path/to/directory" 37 | dir_md5 = dirhash(dirpath, "md5") 38 | pyfiles_md5 = dirhash(dirpath, "md5", match=["*.py"]) 39 | no_hidden_sha1 = dirhash(dirpath, "sha1", ignore=[".*", ".*/"]) 40 | ``` 41 | 42 | CLI: 43 | 44 | ```commandline 45 | dirhash path/to/directory -a md5 46 | dirhash path/to/directory -a md5 --match "*.py" 47 | dirhash path/to/directory -a sha1 --ignore ".*" ".*/" 48 | ``` 49 | 50 | ## Why? 51 | 52 | If you (or your application) need to verify the integrity of a set of files as well 53 | as their names and locations, you might find this useful. Use-cases range from 54 | verification of your image classification dataset (before spending GPU-$$$ on 55 | training your fancy Deep Learning model) to validation of generated files in 56 | regression-testing. 57 | 58 | There isn't really a standard way of doing this. There are plenty of recipes out 59 | there (see e.g. these SO-questions for [linux](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents) 60 | and [python](https://stackoverflow.com/questions/24937495/how-can-i-calculate-a-hash-for-a-filesystem-directory-using-python)) 61 | but I couldn't find one that is properly tested (there are some gotchas to cover!) 62 | and documented with a compelling user interface. `dirhash` was created with this as 63 | the goal. 64 | 65 | [checksumdir](https://github.com/cakepietoast/checksumdir) is another python 66 | module/tool with similar intent (that inspired this project), but it lacks much of the 67 | functionality offered here (most notably including file names/structure in the hash) 68 | and lacks tests. 69 | 70 | ## Performance 71 | 72 | The python `hashlib` implementations of common hashing algorithms are highly 73 | optimised. `dirhash` mainly parses the file tree, pipes data to `hashlib` and 74 | combines the output.
Reasonable measures have been taken to minimize the overhead 75 | and, for common use-cases, the majority of time is spent reading data from disk 76 | and executing `hashlib` code. 77 | 78 | The main effort to boost performance is support for multiprocessing, where the 79 | reading and hashing are parallelized over individual files. 80 | 81 | As a reference, let's compare the performance of the `dirhash` [CLI](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/cli.py) 82 | with the shell command: 83 | 84 | `find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` 85 | 86 | which is the top answer for the SO-question: 87 | [Linux: compute a single hash for a given folder & contents?](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents) 88 | Results for two test cases are shown below. Both have 1 GiB of random data: in 89 | "flat_1k_1MB", split into 1k files (1 MiB each) in a flat structure, and in 90 | "nested_32k_32kB", into 32k files (32 KiB each) spread over the 256 leaf directories 91 | in a binary tree of depth 8. 92 | 93 | | Implementation | Test Case | Time (s) | Speed up | 94 | | -------------------- | --------------- | -------: | -------: | 95 | | shell reference | flat_1k_1MB | 2.29 | -> 1.0 | 96 | | `dirhash` | flat_1k_1MB | 1.67 | 1.36 | 97 | | `dirhash`(8 workers) | flat_1k_1MB | 0.48 | **4.73** | 98 | | shell reference | nested_32k_32kB | 6.82 | -> 1.0 | 99 | | `dirhash` | nested_32k_32kB | 3.43 | 2.00 | 100 | | `dirhash`(8 workers) | nested_32k_32kB | 1.14 | **6.00** | 101 | 102 | The benchmark was run on a MacBook Pro (2018); further details and source code can be found [here](https://github.com/andhus/dirhash-python/tree/master/benchmark). 103 | 104 | ## Documentation 105 | 106 | Please refer to `dirhash -h`, the python [source code](https://github.com/andhus/dirhash-python/blob/master/src/dirhash/__init__.py) and the [Dirhash Standard](https://github.com/andhus/dirhash). 107 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | As a reference, the performance of `dirhash` is benchmarked against the shell command: 4 | 5 | `find path/to/folder -type f -print0 | sort -z | xargs -0 md5 | md5` 6 | 7 | (top answer for the SO-question: 8 | [Linux: compute a single hash for a given folder & contents?](https://stackoverflow.com/questions/545387/linux-compute-a-single-hash-for-a-given-folder-contents)) 9 | 10 | Each test case contains 1 GiB of random data, split equally into 8, 1k or 32k files, 11 | in a flat or nested (binary tree of depth 8) structure. 12 | 13 | For a fair comparison, _the CLI version_ of `dirhash` was used (including startup 14 | time for loading of python modules etc.). 15 | 16 | For full details and reproducibility, see/run the `run.py` script, whose output is 17 | found in `results.csv`.
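The published numbers are obtained by timing the *CLI* through the shell's `time` builtin (see `time_shell` and `get_dirhash_shell_cmd` in `run.py`). As a rough, self-contained complement, a comparable measurement can be sketched directly against the Python API; the snippet below is only an illustration and is not part of `run.py`, and `path/to/test_case` is a placeholder for one of the generated test case directories:

```python
# Illustrative only: time the dirhash Python API with a varying number of worker
# processes. run.py instead times the CLI via the shell, so numbers will differ.
import time

from dirhash import dirhash

dirpath = "path/to/test_case"  # placeholder for a generated test case directory
for jobs in (1, 2, 4, 8):
    start = time.perf_counter()
    value = dirhash(dirpath, "md5", jobs=jobs)
    elapsed = time.perf_counter() - start
    print(f"jobs={jobs}: {elapsed:.2f} s ({value})")
```

The results referred to below (`results.csv` and the sample table) were, in contrast, measured through the CLI as described above.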
These results were generated on a MacBook Pro (2018): 18 | 19 | - 2,2 GHz Intel Core i7 (`sysctl -n hw.physicalcpu hw.logicalcpu`-> 6, 12) 20 | - 16 GB 2400 MHz DDR4 21 | - APPLE SSD AP0512M 22 | 23 | ## Sample results: 24 | 25 | | Implementation | Test Case | Time (s) | Speed up | 26 | | -------------------- | --------------- | -------: | -------: | 27 | | shell reference | flat_1k_1MB | 2.29 | -> 1.0 | 28 | | `dirhash` | flat_1k_1MB | 1.67 | 1.36 | 29 | | `dirhash`(8 workers) | flat_1k_1MB | 0.48 | **4.73** | 30 | | shell reference | nested_32k_32kB | 6.82 | -> 1.0 | 31 | | `dirhash` | nested_32k_32kB | 3.43 | 2.00 | 32 | | `dirhash`(8 workers) | nested_32k_32kB | 1.14 | **6.00** | 33 | -------------------------------------------------------------------------------- /benchmark/results.csv: -------------------------------------------------------------------------------- 1 | ,test_case,implementation,algorithm,workers,t_best,t_median,speed-up (median) 2 | 0,flat_8_128MB,shell reference,md5,1,2.014,2.02,1.0 3 | 1,flat_8_128MB,dirhash,md5,1,1.602,1.604,1.2593516209476308 4 | 2,flat_8_128MB,dirhash,md5,2,0.977,0.98,2.061224489795918 5 | 3,flat_8_128MB,dirhash,md5,4,0.562,0.569,3.5500878734622145 6 | 4,flat_8_128MB,dirhash,md5,8,0.464,0.473,4.2706131078224105 7 | 5,flat_1k_1MB,shell reference,md5,1,2.263,2.268,1.0 8 | 6,flat_1k_1MB,dirhash,md5,1,1.662,1.667,1.3605278944211157 9 | 7,flat_1k_1MB,dirhash,md5,2,0.978,0.983,2.3072227873855544 10 | 8,flat_1k_1MB,dirhash,md5,4,0.57,0.58,3.910344827586207 11 | 9,flat_1k_1MB,dirhash,md5,8,0.476,0.48,4.725 12 | 10,flat_32k_32kB,shell reference,md5,1,6.711,6.721,1.0 13 | 11,flat_32k_32kB,dirhash,md5,1,3.329,3.354,2.003875968992248 14 | 12,flat_32k_32kB,dirhash,md5,2,2.067,2.074,3.240597878495661 15 | 13,flat_32k_32kB,dirhash,md5,4,1.345,1.362,4.934654919236417 16 | 14,flat_32k_32kB,dirhash,md5,8,1.09,1.094,6.143510054844606 17 | 15,nested_1k_1MB,shell reference,md5,1,2.296,2.306,1.0 18 | 16,nested_1k_1MB,dirhash,md5,1,1.713,1.714,1.3453908984830805 19 | 17,nested_1k_1MB,dirhash,md5,2,0.996,1.009,2.285431119920714 20 | 18,nested_1k_1MB,dirhash,md5,4,0.601,0.602,3.8305647840531565 21 | 19,nested_1k_1MB,dirhash,md5,8,0.499,0.505,4.566336633663366 22 | 20,nested_32k_32kB,shell reference,md5,1,6.814,6.818,1.0 23 | 21,nested_32k_32kB,dirhash,md5,1,3.376,3.426,1.9900758902510214 24 | 22,nested_32k_32kB,dirhash,md5,2,2.147,2.153,3.166744078030655 25 | 23,nested_32k_32kB,dirhash,md5,4,1.414,1.416,4.814971751412429 26 | 24,nested_32k_32kB,dirhash,md5,8,1.137,1.138,5.991212653778559 27 | 25,flat_8_128MB,shell reference,sha1,1,2.181,2.196,1.0 28 | 26,flat_8_128MB,dirhash,sha1,1,1.214,1.225,1.7926530612244898 29 | 27,flat_8_128MB,dirhash,sha1,2,0.768,0.774,2.8372093023255816 30 | 28,flat_8_128MB,dirhash,sha1,4,0.467,0.474,4.632911392405064 31 | 29,flat_8_128MB,dirhash,sha1,8,0.47,0.477,4.603773584905661 32 | 30,flat_1k_1MB,shell reference,sha1,1,2.221,2.229,1.0 33 | 31,flat_1k_1MB,dirhash,sha1,1,1.252,1.263,1.7648456057007127 34 | 32,flat_1k_1MB,dirhash,sha1,2,0.774,0.777,2.8687258687258685 35 | 33,flat_1k_1MB,dirhash,sha1,4,0.471,0.477,4.672955974842767 36 | 34,flat_1k_1MB,dirhash,sha1,8,0.378,0.478,4.663179916317992 37 | 35,flat_32k_32kB,shell reference,sha1,1,4.178,4.224,1.0 38 | 36,flat_32k_32kB,dirhash,sha1,1,2.921,3.008,1.4042553191489362 39 | 37,flat_32k_32kB,dirhash,sha1,2,1.888,1.892,2.232558139534884 40 | 38,flat_32k_32kB,dirhash,sha1,4,1.266,1.275,3.3129411764705887 41 | 39,flat_32k_32kB,dirhash,sha1,8,1.072,1.079,3.914735866543096 42 | 
40,nested_1k_1MB,shell reference,sha1,1,2.236,2.26,1.0 43 | 41,nested_1k_1MB,dirhash,sha1,1,1.308,1.314,1.719939117199391 44 | 42,nested_1k_1MB,dirhash,sha1,2,0.797,0.8,2.8249999999999997 45 | 43,nested_1k_1MB,dirhash,sha1,4,0.501,0.509,4.4400785854616895 46 | 44,nested_1k_1MB,dirhash,sha1,8,0.499,0.503,4.493041749502981 47 | 45,nested_32k_32kB,shell reference,sha1,1,4.383,4.406,1.0 48 | 46,nested_32k_32kB,dirhash,sha1,1,3.041,3.05,1.4445901639344263 49 | 47,nested_32k_32kB,dirhash,sha1,2,1.943,1.965,2.242239185750636 50 | 48,nested_32k_32kB,dirhash,sha1,4,1.329,1.334,3.3028485757121433 51 | 49,nested_32k_32kB,dirhash,sha1,8,1.14,1.149,3.8346388163620535 52 | -------------------------------------------------------------------------------- /benchmark/results.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_case": "flat_8_128MB", 4 | "implementation": "shell reference", 5 | "algorithm": "md5", 6 | "workers": 1, 7 | "t_best": 2.014, 8 | "t_median": 2.02 9 | }, 10 | { 11 | "test_case": "flat_8_128MB", 12 | "implementation": "dirhash", 13 | "algorithm": "md5", 14 | "workers": 1, 15 | "t_best": 1.602, 16 | "t_median": 1.604 17 | }, 18 | { 19 | "test_case": "flat_8_128MB", 20 | "implementation": "dirhash", 21 | "algorithm": "md5", 22 | "workers": 2, 23 | "t_best": 0.977, 24 | "t_median": 0.98 25 | }, 26 | { 27 | "test_case": "flat_8_128MB", 28 | "implementation": "dirhash", 29 | "algorithm": "md5", 30 | "workers": 4, 31 | "t_best": 0.562, 32 | "t_median": 0.569 33 | }, 34 | { 35 | "test_case": "flat_8_128MB", 36 | "implementation": "dirhash", 37 | "algorithm": "md5", 38 | "workers": 8, 39 | "t_best": 0.464, 40 | "t_median": 0.473 41 | }, 42 | { 43 | "test_case": "flat_1k_1MB", 44 | "implementation": "shell reference", 45 | "algorithm": "md5", 46 | "workers": 1, 47 | "t_best": 2.263, 48 | "t_median": 2.268 49 | }, 50 | { 51 | "test_case": "flat_1k_1MB", 52 | "implementation": "dirhash", 53 | "algorithm": "md5", 54 | "workers": 1, 55 | "t_best": 1.662, 56 | "t_median": 1.667 57 | }, 58 | { 59 | "test_case": "flat_1k_1MB", 60 | "implementation": "dirhash", 61 | "algorithm": "md5", 62 | "workers": 2, 63 | "t_best": 0.978, 64 | "t_median": 0.983 65 | }, 66 | { 67 | "test_case": "flat_1k_1MB", 68 | "implementation": "dirhash", 69 | "algorithm": "md5", 70 | "workers": 4, 71 | "t_best": 0.57, 72 | "t_median": 0.58 73 | }, 74 | { 75 | "test_case": "flat_1k_1MB", 76 | "implementation": "dirhash", 77 | "algorithm": "md5", 78 | "workers": 8, 79 | "t_best": 0.476, 80 | "t_median": 0.48 81 | }, 82 | { 83 | "test_case": "flat_32k_32kB", 84 | "implementation": "shell reference", 85 | "algorithm": "md5", 86 | "workers": 1, 87 | "t_best": 6.711, 88 | "t_median": 6.721 89 | }, 90 | { 91 | "test_case": "flat_32k_32kB", 92 | "implementation": "dirhash", 93 | "algorithm": "md5", 94 | "workers": 1, 95 | "t_best": 3.329, 96 | "t_median": 3.354 97 | }, 98 | { 99 | "test_case": "flat_32k_32kB", 100 | "implementation": "dirhash", 101 | "algorithm": "md5", 102 | "workers": 2, 103 | "t_best": 2.067, 104 | "t_median": 2.074 105 | }, 106 | { 107 | "test_case": "flat_32k_32kB", 108 | "implementation": "dirhash", 109 | "algorithm": "md5", 110 | "workers": 4, 111 | "t_best": 1.345, 112 | "t_median": 1.362 113 | }, 114 | { 115 | "test_case": "flat_32k_32kB", 116 | "implementation": "dirhash", 117 | "algorithm": "md5", 118 | "workers": 8, 119 | "t_best": 1.09, 120 | "t_median": 1.094 121 | }, 122 | { 123 | "test_case": "nested_1k_1MB", 124 | "implementation": "shell 
reference", 125 | "algorithm": "md5", 126 | "workers": 1, 127 | "t_best": 2.296, 128 | "t_median": 2.306 129 | }, 130 | { 131 | "test_case": "nested_1k_1MB", 132 | "implementation": "dirhash", 133 | "algorithm": "md5", 134 | "workers": 1, 135 | "t_best": 1.713, 136 | "t_median": 1.714 137 | }, 138 | { 139 | "test_case": "nested_1k_1MB", 140 | "implementation": "dirhash", 141 | "algorithm": "md5", 142 | "workers": 2, 143 | "t_best": 0.996, 144 | "t_median": 1.009 145 | }, 146 | { 147 | "test_case": "nested_1k_1MB", 148 | "implementation": "dirhash", 149 | "algorithm": "md5", 150 | "workers": 4, 151 | "t_best": 0.601, 152 | "t_median": 0.602 153 | }, 154 | { 155 | "test_case": "nested_1k_1MB", 156 | "implementation": "dirhash", 157 | "algorithm": "md5", 158 | "workers": 8, 159 | "t_best": 0.499, 160 | "t_median": 0.505 161 | }, 162 | { 163 | "test_case": "nested_32k_32kB", 164 | "implementation": "shell reference", 165 | "algorithm": "md5", 166 | "workers": 1, 167 | "t_best": 6.814, 168 | "t_median": 6.818 169 | }, 170 | { 171 | "test_case": "nested_32k_32kB", 172 | "implementation": "dirhash", 173 | "algorithm": "md5", 174 | "workers": 1, 175 | "t_best": 3.376, 176 | "t_median": 3.426 177 | }, 178 | { 179 | "test_case": "nested_32k_32kB", 180 | "implementation": "dirhash", 181 | "algorithm": "md5", 182 | "workers": 2, 183 | "t_best": 2.147, 184 | "t_median": 2.153 185 | }, 186 | { 187 | "test_case": "nested_32k_32kB", 188 | "implementation": "dirhash", 189 | "algorithm": "md5", 190 | "workers": 4, 191 | "t_best": 1.414, 192 | "t_median": 1.416 193 | }, 194 | { 195 | "test_case": "nested_32k_32kB", 196 | "implementation": "dirhash", 197 | "algorithm": "md5", 198 | "workers": 8, 199 | "t_best": 1.137, 200 | "t_median": 1.138 201 | }, 202 | { 203 | "test_case": "flat_8_128MB", 204 | "implementation": "shell reference", 205 | "algorithm": "sha1", 206 | "workers": 1, 207 | "t_best": 2.181, 208 | "t_median": 2.196 209 | }, 210 | { 211 | "test_case": "flat_8_128MB", 212 | "implementation": "dirhash", 213 | "algorithm": "sha1", 214 | "workers": 1, 215 | "t_best": 1.214, 216 | "t_median": 1.225 217 | }, 218 | { 219 | "test_case": "flat_8_128MB", 220 | "implementation": "dirhash", 221 | "algorithm": "sha1", 222 | "workers": 2, 223 | "t_best": 0.768, 224 | "t_median": 0.774 225 | }, 226 | { 227 | "test_case": "flat_8_128MB", 228 | "implementation": "dirhash", 229 | "algorithm": "sha1", 230 | "workers": 4, 231 | "t_best": 0.467, 232 | "t_median": 0.474 233 | }, 234 | { 235 | "test_case": "flat_8_128MB", 236 | "implementation": "dirhash", 237 | "algorithm": "sha1", 238 | "workers": 8, 239 | "t_best": 0.47, 240 | "t_median": 0.477 241 | }, 242 | { 243 | "test_case": "flat_1k_1MB", 244 | "implementation": "shell reference", 245 | "algorithm": "sha1", 246 | "workers": 1, 247 | "t_best": 2.221, 248 | "t_median": 2.229 249 | }, 250 | { 251 | "test_case": "flat_1k_1MB", 252 | "implementation": "dirhash", 253 | "algorithm": "sha1", 254 | "workers": 1, 255 | "t_best": 1.252, 256 | "t_median": 1.263 257 | }, 258 | { 259 | "test_case": "flat_1k_1MB", 260 | "implementation": "dirhash", 261 | "algorithm": "sha1", 262 | "workers": 2, 263 | "t_best": 0.774, 264 | "t_median": 0.777 265 | }, 266 | { 267 | "test_case": "flat_1k_1MB", 268 | "implementation": "dirhash", 269 | "algorithm": "sha1", 270 | "workers": 4, 271 | "t_best": 0.471, 272 | "t_median": 0.477 273 | }, 274 | { 275 | "test_case": "flat_1k_1MB", 276 | "implementation": "dirhash", 277 | "algorithm": "sha1", 278 | "workers": 8, 279 | "t_best": 0.378, 280 | 
"t_median": 0.478 281 | }, 282 | { 283 | "test_case": "flat_32k_32kB", 284 | "implementation": "shell reference", 285 | "algorithm": "sha1", 286 | "workers": 1, 287 | "t_best": 4.178, 288 | "t_median": 4.224 289 | }, 290 | { 291 | "test_case": "flat_32k_32kB", 292 | "implementation": "dirhash", 293 | "algorithm": "sha1", 294 | "workers": 1, 295 | "t_best": 2.921, 296 | "t_median": 3.008 297 | }, 298 | { 299 | "test_case": "flat_32k_32kB", 300 | "implementation": "dirhash", 301 | "algorithm": "sha1", 302 | "workers": 2, 303 | "t_best": 1.888, 304 | "t_median": 1.892 305 | }, 306 | { 307 | "test_case": "flat_32k_32kB", 308 | "implementation": "dirhash", 309 | "algorithm": "sha1", 310 | "workers": 4, 311 | "t_best": 1.266, 312 | "t_median": 1.275 313 | }, 314 | { 315 | "test_case": "flat_32k_32kB", 316 | "implementation": "dirhash", 317 | "algorithm": "sha1", 318 | "workers": 8, 319 | "t_best": 1.072, 320 | "t_median": 1.079 321 | }, 322 | { 323 | "test_case": "nested_1k_1MB", 324 | "implementation": "shell reference", 325 | "algorithm": "sha1", 326 | "workers": 1, 327 | "t_best": 2.236, 328 | "t_median": 2.26 329 | }, 330 | { 331 | "test_case": "nested_1k_1MB", 332 | "implementation": "dirhash", 333 | "algorithm": "sha1", 334 | "workers": 1, 335 | "t_best": 1.308, 336 | "t_median": 1.314 337 | }, 338 | { 339 | "test_case": "nested_1k_1MB", 340 | "implementation": "dirhash", 341 | "algorithm": "sha1", 342 | "workers": 2, 343 | "t_best": 0.797, 344 | "t_median": 0.8 345 | }, 346 | { 347 | "test_case": "nested_1k_1MB", 348 | "implementation": "dirhash", 349 | "algorithm": "sha1", 350 | "workers": 4, 351 | "t_best": 0.501, 352 | "t_median": 0.509 353 | }, 354 | { 355 | "test_case": "nested_1k_1MB", 356 | "implementation": "dirhash", 357 | "algorithm": "sha1", 358 | "workers": 8, 359 | "t_best": 0.499, 360 | "t_median": 0.503 361 | }, 362 | { 363 | "test_case": "nested_32k_32kB", 364 | "implementation": "shell reference", 365 | "algorithm": "sha1", 366 | "workers": 1, 367 | "t_best": 4.383, 368 | "t_median": 4.406 369 | }, 370 | { 371 | "test_case": "nested_32k_32kB", 372 | "implementation": "dirhash", 373 | "algorithm": "sha1", 374 | "workers": 1, 375 | "t_best": 3.041, 376 | "t_median": 3.05 377 | }, 378 | { 379 | "test_case": "nested_32k_32kB", 380 | "implementation": "dirhash", 381 | "algorithm": "sha1", 382 | "workers": 2, 383 | "t_best": 1.943, 384 | "t_median": 1.965 385 | }, 386 | { 387 | "test_case": "nested_32k_32kB", 388 | "implementation": "dirhash", 389 | "algorithm": "sha1", 390 | "workers": 4, 391 | "t_best": 1.329, 392 | "t_median": 1.334 393 | }, 394 | { 395 | "test_case": "nested_32k_32kB", 396 | "implementation": "dirhash", 397 | "algorithm": "sha1", 398 | "workers": 8, 399 | "t_best": 1.14, 400 | "t_median": 1.149 401 | } 402 | ] 403 | -------------------------------------------------------------------------------- /benchmark/results_v0.2.0.csv: -------------------------------------------------------------------------------- 1 | ,test_case,implementation,algorithm,workers,t_best,t_median,speed-up (median) 2 | 0,flat_8_128MB,shell reference,md5,1,2.079,2.083,1.0 3 | 1,flat_8_128MB,dirhash_impl,md5,1,1.734,1.945,1.0709511568123393 4 | 2,flat_8_128MB,dirhash_impl,md5,2,0.999,1.183,1.760777683854607 5 | 3,flat_8_128MB,dirhash_impl,md5,4,0.711,0.728,2.8612637362637368 6 | 4,flat_8_128MB,dirhash_impl,md5,8,0.504,0.518,4.021235521235521 7 | 5,flat_1k_1MB,shell reference,md5,1,3.383,3.679,1.0 8 | 6,flat_1k_1MB,dirhash_impl,md5,1,1.846,1.921,1.9151483602290473 9 | 
7,flat_1k_1MB,dirhash_impl,md5,2,1.137,1.158,3.1770293609671847 10 | 8,flat_1k_1MB,dirhash_impl,md5,4,0.74,0.749,4.911882510013351 11 | 9,flat_1k_1MB,dirhash_impl,md5,8,0.53,0.534,6.889513108614231 12 | 10,flat_32k_32kB,shell reference,md5,1,13.827,18.213,1.0 13 | 11,flat_32k_32kB,dirhash_impl,md5,1,13.655,13.808,1.3190179606025494 14 | 12,flat_32k_32kB,dirhash_impl,md5,2,3.276,3.33,5.469369369369369 15 | 13,flat_32k_32kB,dirhash_impl,md5,4,2.409,2.421,7.522924411400249 16 | 14,flat_32k_32kB,dirhash_impl,md5,8,2.045,2.086,8.731064237775648 17 | 15,nested_1k_1MB,shell reference,md5,1,3.284,3.332,1.0 18 | 16,nested_1k_1MB,dirhash_impl,md5,1,1.717,1.725,1.9315942028985504 19 | 17,nested_1k_1MB,dirhash_impl,md5,2,1.026,1.034,3.222437137330754 20 | 18,nested_1k_1MB,dirhash_impl,md5,4,0.622,0.633,5.263823064770932 21 | 19,nested_1k_1MB,dirhash_impl,md5,8,0.522,0.529,6.29867674858223 22 | 20,nested_32k_32kB,shell reference,md5,1,11.898,12.125,1.0 23 | 21,nested_32k_32kB,dirhash_impl,md5,1,13.858,14.146,0.8571327583769263 24 | 22,nested_32k_32kB,dirhash_impl,md5,2,2.781,2.987,4.059256779377302 25 | 23,nested_32k_32kB,dirhash_impl,md5,4,1.894,1.92,6.315104166666667 26 | 24,nested_32k_32kB,dirhash_impl,md5,8,1.55,1.568,7.732780612244897 27 | 25,flat_8_128MB,shell reference,sha1,1,2.042,2.05,1.0 28 | 26,flat_8_128MB,dirhash_impl,sha1,1,1.338,1.354,1.5140324963072376 29 | 27,flat_8_128MB,dirhash_impl,sha1,2,0.79,0.794,2.5818639798488663 30 | 28,flat_8_128MB,dirhash_impl,sha1,4,0.583,0.593,3.456998313659359 31 | 29,flat_8_128MB,dirhash_impl,sha1,8,0.483,0.487,4.209445585215605 32 | 30,flat_1k_1MB,shell reference,sha1,1,2.118,2.129,1.0 33 | 31,flat_1k_1MB,dirhash_impl,sha1,1,1.39,1.531,1.3905943827563685 34 | 32,flat_1k_1MB,dirhash_impl,sha1,2,0.925,0.932,2.2843347639484977 35 | 33,flat_1k_1MB,dirhash_impl,sha1,4,0.614,0.629,3.384737678855326 36 | 34,flat_1k_1MB,dirhash_impl,sha1,8,0.511,0.52,4.094230769230769 37 | 35,flat_32k_32kB,shell reference,sha1,1,10.551,10.97,1.0 38 | 36,flat_32k_32kB,dirhash_impl,sha1,1,4.663,4.76,2.304621848739496 39 | 37,flat_32k_32kB,dirhash_impl,sha1,2,3.108,3.235,3.3910355486862445 40 | 38,flat_32k_32kB,dirhash_impl,sha1,4,2.342,2.361,4.6463362981787375 41 | 39,flat_32k_32kB,dirhash_impl,sha1,8,2.071,2.094,5.2387774594078325 42 | 40,nested_1k_1MB,shell reference,sha1,1,2.11,2.159,1.0 43 | 41,nested_1k_1MB,dirhash_impl,sha1,1,1.436,1.47,1.4687074829931972 44 | 42,nested_1k_1MB,dirhash_impl,sha1,2,0.925,0.937,2.3041622198505864 45 | 43,nested_1k_1MB,dirhash_impl,sha1,4,0.627,0.643,3.357698289269051 46 | 44,nested_1k_1MB,dirhash_impl,sha1,8,0.516,0.527,4.096774193548386 47 | 45,nested_32k_32kB,shell reference,sha1,1,3.982,7.147,1.0 48 | 46,nested_32k_32kB,dirhash_impl,sha1,1,4.114,4.156,1.7196823869104911 49 | 47,nested_32k_32kB,dirhash_impl,sha1,2,2.598,2.616,2.7320336391437308 50 | 48,nested_32k_32kB,dirhash_impl,sha1,4,1.809,1.831,3.9033315128345167 51 | 49,nested_32k_32kB,dirhash_impl,sha1,8,1.552,1.58,4.523417721518987 52 | -------------------------------------------------------------------------------- /benchmark/results_v0.2.0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "test_case": "flat_8_128MB", 4 | "implementation": "shell reference", 5 | "algorithm": "md5", 6 | "workers": 1, 7 | "t_best": 2.079, 8 | "t_median": 2.083 9 | }, 10 | { 11 | "test_case": "flat_8_128MB", 12 | "implementation": "dirhash", 13 | "algorithm": "md5", 14 | "workers": 1, 15 | "t_best": 1.734, 16 | "t_median": 1.945 17 | }, 18 | { 19 | 
"test_case": "flat_8_128MB", 20 | "implementation": "dirhash", 21 | "algorithm": "md5", 22 | "workers": 2, 23 | "t_best": 0.999, 24 | "t_median": 1.183 25 | }, 26 | { 27 | "test_case": "flat_8_128MB", 28 | "implementation": "dirhash", 29 | "algorithm": "md5", 30 | "workers": 4, 31 | "t_best": 0.711, 32 | "t_median": 0.728 33 | }, 34 | { 35 | "test_case": "flat_8_128MB", 36 | "implementation": "dirhash", 37 | "algorithm": "md5", 38 | "workers": 8, 39 | "t_best": 0.504, 40 | "t_median": 0.518 41 | }, 42 | { 43 | "test_case": "flat_1k_1MB", 44 | "implementation": "shell reference", 45 | "algorithm": "md5", 46 | "workers": 1, 47 | "t_best": 3.383, 48 | "t_median": 3.679 49 | }, 50 | { 51 | "test_case": "flat_1k_1MB", 52 | "implementation": "dirhash", 53 | "algorithm": "md5", 54 | "workers": 1, 55 | "t_best": 1.846, 56 | "t_median": 1.921 57 | }, 58 | { 59 | "test_case": "flat_1k_1MB", 60 | "implementation": "dirhash", 61 | "algorithm": "md5", 62 | "workers": 2, 63 | "t_best": 1.137, 64 | "t_median": 1.158 65 | }, 66 | { 67 | "test_case": "flat_1k_1MB", 68 | "implementation": "dirhash", 69 | "algorithm": "md5", 70 | "workers": 4, 71 | "t_best": 0.74, 72 | "t_median": 0.749 73 | }, 74 | { 75 | "test_case": "flat_1k_1MB", 76 | "implementation": "dirhash", 77 | "algorithm": "md5", 78 | "workers": 8, 79 | "t_best": 0.53, 80 | "t_median": 0.534 81 | }, 82 | { 83 | "test_case": "flat_32k_32kB", 84 | "implementation": "shell reference", 85 | "algorithm": "md5", 86 | "workers": 1, 87 | "t_best": 13.827, 88 | "t_median": 18.213 89 | }, 90 | { 91 | "test_case": "flat_32k_32kB", 92 | "implementation": "dirhash", 93 | "algorithm": "md5", 94 | "workers": 1, 95 | "t_best": 13.655, 96 | "t_median": 13.808 97 | }, 98 | { 99 | "test_case": "flat_32k_32kB", 100 | "implementation": "dirhash", 101 | "algorithm": "md5", 102 | "workers": 2, 103 | "t_best": 3.276, 104 | "t_median": 3.33 105 | }, 106 | { 107 | "test_case": "flat_32k_32kB", 108 | "implementation": "dirhash", 109 | "algorithm": "md5", 110 | "workers": 4, 111 | "t_best": 2.409, 112 | "t_median": 2.421 113 | }, 114 | { 115 | "test_case": "flat_32k_32kB", 116 | "implementation": "dirhash", 117 | "algorithm": "md5", 118 | "workers": 8, 119 | "t_best": 2.045, 120 | "t_median": 2.086 121 | }, 122 | { 123 | "test_case": "nested_1k_1MB", 124 | "implementation": "shell reference", 125 | "algorithm": "md5", 126 | "workers": 1, 127 | "t_best": 3.284, 128 | "t_median": 3.332 129 | }, 130 | { 131 | "test_case": "nested_1k_1MB", 132 | "implementation": "dirhash", 133 | "algorithm": "md5", 134 | "workers": 1, 135 | "t_best": 1.717, 136 | "t_median": 1.725 137 | }, 138 | { 139 | "test_case": "nested_1k_1MB", 140 | "implementation": "dirhash", 141 | "algorithm": "md5", 142 | "workers": 2, 143 | "t_best": 1.026, 144 | "t_median": 1.034 145 | }, 146 | { 147 | "test_case": "nested_1k_1MB", 148 | "implementation": "dirhash", 149 | "algorithm": "md5", 150 | "workers": 4, 151 | "t_best": 0.622, 152 | "t_median": 0.633 153 | }, 154 | { 155 | "test_case": "nested_1k_1MB", 156 | "implementation": "dirhash", 157 | "algorithm": "md5", 158 | "workers": 8, 159 | "t_best": 0.522, 160 | "t_median": 0.529 161 | }, 162 | { 163 | "test_case": "nested_32k_32kB", 164 | "implementation": "shell reference", 165 | "algorithm": "md5", 166 | "workers": 1, 167 | "t_best": 11.898, 168 | "t_median": 12.125 169 | }, 170 | { 171 | "test_case": "nested_32k_32kB", 172 | "implementation": "dirhash", 173 | "algorithm": "md5", 174 | "workers": 1, 175 | "t_best": 13.858, 176 | "t_median": 14.146 177 | }, 
178 | { 179 | "test_case": "nested_32k_32kB", 180 | "implementation": "dirhash", 181 | "algorithm": "md5", 182 | "workers": 2, 183 | "t_best": 2.781, 184 | "t_median": 2.987 185 | }, 186 | { 187 | "test_case": "nested_32k_32kB", 188 | "implementation": "dirhash", 189 | "algorithm": "md5", 190 | "workers": 4, 191 | "t_best": 1.894, 192 | "t_median": 1.92 193 | }, 194 | { 195 | "test_case": "nested_32k_32kB", 196 | "implementation": "dirhash", 197 | "algorithm": "md5", 198 | "workers": 8, 199 | "t_best": 1.55, 200 | "t_median": 1.568 201 | }, 202 | { 203 | "test_case": "flat_8_128MB", 204 | "implementation": "shell reference", 205 | "algorithm": "sha1", 206 | "workers": 1, 207 | "t_best": 2.042, 208 | "t_median": 2.05 209 | }, 210 | { 211 | "test_case": "flat_8_128MB", 212 | "implementation": "dirhash", 213 | "algorithm": "sha1", 214 | "workers": 1, 215 | "t_best": 1.338, 216 | "t_median": 1.354 217 | }, 218 | { 219 | "test_case": "flat_8_128MB", 220 | "implementation": "dirhash", 221 | "algorithm": "sha1", 222 | "workers": 2, 223 | "t_best": 0.79, 224 | "t_median": 0.794 225 | }, 226 | { 227 | "test_case": "flat_8_128MB", 228 | "implementation": "dirhash", 229 | "algorithm": "sha1", 230 | "workers": 4, 231 | "t_best": 0.583, 232 | "t_median": 0.593 233 | }, 234 | { 235 | "test_case": "flat_8_128MB", 236 | "implementation": "dirhash", 237 | "algorithm": "sha1", 238 | "workers": 8, 239 | "t_best": 0.483, 240 | "t_median": 0.487 241 | }, 242 | { 243 | "test_case": "flat_1k_1MB", 244 | "implementation": "shell reference", 245 | "algorithm": "sha1", 246 | "workers": 1, 247 | "t_best": 2.118, 248 | "t_median": 2.129 249 | }, 250 | { 251 | "test_case": "flat_1k_1MB", 252 | "implementation": "dirhash", 253 | "algorithm": "sha1", 254 | "workers": 1, 255 | "t_best": 1.39, 256 | "t_median": 1.531 257 | }, 258 | { 259 | "test_case": "flat_1k_1MB", 260 | "implementation": "dirhash", 261 | "algorithm": "sha1", 262 | "workers": 2, 263 | "t_best": 0.925, 264 | "t_median": 0.932 265 | }, 266 | { 267 | "test_case": "flat_1k_1MB", 268 | "implementation": "dirhash", 269 | "algorithm": "sha1", 270 | "workers": 4, 271 | "t_best": 0.614, 272 | "t_median": 0.629 273 | }, 274 | { 275 | "test_case": "flat_1k_1MB", 276 | "implementation": "dirhash", 277 | "algorithm": "sha1", 278 | "workers": 8, 279 | "t_best": 0.511, 280 | "t_median": 0.52 281 | }, 282 | { 283 | "test_case": "flat_32k_32kB", 284 | "implementation": "shell reference", 285 | "algorithm": "sha1", 286 | "workers": 1, 287 | "t_best": 10.551, 288 | "t_median": 10.97 289 | }, 290 | { 291 | "test_case": "flat_32k_32kB", 292 | "implementation": "dirhash", 293 | "algorithm": "sha1", 294 | "workers": 1, 295 | "t_best": 4.663, 296 | "t_median": 4.76 297 | }, 298 | { 299 | "test_case": "flat_32k_32kB", 300 | "implementation": "dirhash", 301 | "algorithm": "sha1", 302 | "workers": 2, 303 | "t_best": 3.108, 304 | "t_median": 3.235 305 | }, 306 | { 307 | "test_case": "flat_32k_32kB", 308 | "implementation": "dirhash", 309 | "algorithm": "sha1", 310 | "workers": 4, 311 | "t_best": 2.342, 312 | "t_median": 2.361 313 | }, 314 | { 315 | "test_case": "flat_32k_32kB", 316 | "implementation": "dirhash", 317 | "algorithm": "sha1", 318 | "workers": 8, 319 | "t_best": 2.071, 320 | "t_median": 2.094 321 | }, 322 | { 323 | "test_case": "nested_1k_1MB", 324 | "implementation": "shell reference", 325 | "algorithm": "sha1", 326 | "workers": 1, 327 | "t_best": 2.11, 328 | "t_median": 2.159 329 | }, 330 | { 331 | "test_case": "nested_1k_1MB", 332 | "implementation": "dirhash", 333 
| "algorithm": "sha1", 334 | "workers": 1, 335 | "t_best": 1.436, 336 | "t_median": 1.47 337 | }, 338 | { 339 | "test_case": "nested_1k_1MB", 340 | "implementation": "dirhash", 341 | "algorithm": "sha1", 342 | "workers": 2, 343 | "t_best": 0.925, 344 | "t_median": 0.937 345 | }, 346 | { 347 | "test_case": "nested_1k_1MB", 348 | "implementation": "dirhash", 349 | "algorithm": "sha1", 350 | "workers": 4, 351 | "t_best": 0.627, 352 | "t_median": 0.643 353 | }, 354 | { 355 | "test_case": "nested_1k_1MB", 356 | "implementation": "dirhash", 357 | "algorithm": "sha1", 358 | "workers": 8, 359 | "t_best": 0.516, 360 | "t_median": 0.527 361 | }, 362 | { 363 | "test_case": "nested_32k_32kB", 364 | "implementation": "shell reference", 365 | "algorithm": "sha1", 366 | "workers": 1, 367 | "t_best": 3.982, 368 | "t_median": 7.147 369 | }, 370 | { 371 | "test_case": "nested_32k_32kB", 372 | "implementation": "dirhash", 373 | "algorithm": "sha1", 374 | "workers": 1, 375 | "t_best": 4.114, 376 | "t_median": 4.156 377 | }, 378 | { 379 | "test_case": "nested_32k_32kB", 380 | "implementation": "dirhash", 381 | "algorithm": "sha1", 382 | "workers": 2, 383 | "t_best": 2.598, 384 | "t_median": 2.616 385 | }, 386 | { 387 | "test_case": "nested_32k_32kB", 388 | "implementation": "dirhash", 389 | "algorithm": "sha1", 390 | "workers": 4, 391 | "t_best": 1.809, 392 | "t_median": 1.831 393 | }, 394 | { 395 | "test_case": "nested_32k_32kB", 396 | "implementation": "dirhash", 397 | "algorithm": "sha1", 398 | "workers": 8, 399 | "t_best": 1.552, 400 | "t_median": 1.58 401 | } 402 | ] 403 | -------------------------------------------------------------------------------- /benchmark/run.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess 4 | from statistics import median 5 | 6 | from dirhash import __version__ 7 | 8 | BENCHMARK_ROOT = os.path.abspath(os.path.join(__file__, os.pardir)) 9 | 10 | TEST_CASES = { 11 | "flat_8_128MB": {"depth": 0, "num_files": 2**3, "file_size": 2**27}, 12 | "flat_1k_1MB": {"depth": 0, "num_files": 2**10, "file_size": 2**20}, 13 | "flat_32k_32kB": {"depth": 0, "num_files": 2**15, "file_size": 2**15}, 14 | "nested_1k_1MB": {"depth": 8, "num_files": 2**10, "file_size": 2**20}, 15 | "nested_32k_32kB": {"depth": 8, "num_files": 2**15, "file_size": 2**15}, 16 | } 17 | 18 | 19 | def int_chunks(x, n): 20 | base = x // n 21 | remain = x % n 22 | chunks = [base] * n 23 | for i in range(remain): 24 | chunks[i] += 1 25 | 26 | return chunks 27 | 28 | 29 | def write_file_tree(dirpath, depth, num_files, file_size, branch_factor=2): 30 | assert num_files >= branch_factor**depth 31 | os.mkdir(dirpath) 32 | if depth == 0: 33 | fill = len(str(num_files)) 34 | for i in range(num_files): 35 | filepath = os.path.join(dirpath, "f_" + str(i).rjust(fill, "0")) 36 | with open(filepath, "wb") as f: 37 | f.write(os.urandom(file_size)) 38 | else: 39 | fill = len(str(branch_factor)) 40 | for i, num_files_branch in enumerate(int_chunks(num_files, branch_factor)): 41 | dirpath_branch = os.path.join(dirpath, "d_" + str(i).rjust(fill, "0")) 42 | write_file_tree( 43 | dirpath_branch, depth - 1, num_files_branch, file_size, branch_factor 44 | ) 45 | 46 | 47 | def require_test_cases(): 48 | test_cases_root = os.path.join(BENCHMARK_ROOT, "test_cases") 49 | if not os.path.exists(test_cases_root): 50 | os.mkdir(test_cases_root) 51 | test_case_paths = [] 52 | for name, kwargs in TEST_CASES.items(): 53 | test_case_path = os.path.join(test_cases_root, 
name) 54 | if not os.path.exists(test_case_path): 55 | print(f"creating test case: {name}: {kwargs}") 56 | write_file_tree(test_case_path, **kwargs) 57 | test_case_paths.append(test_case_path) 58 | 59 | return test_case_paths 60 | 61 | 62 | def time_shell(cmd, runs=1, repetitions=1, setup=None): 63 | time_cmd = f"time for i in {{1..{repetitions}}}; do {cmd}; done" 64 | if setup is not None: 65 | time_cmd = f"{setup}; {time_cmd}" 66 | 67 | realtimes = [] 68 | for _run in range(runs): 69 | process = subprocess.run( 70 | time_cmd, capture_output=True, text=True, shell=True, check=True 71 | ) 72 | output_lines = process.stderr.split("\n") 73 | try: 74 | t_real, t_user, t_sys = output_lines[-4:-1] 75 | assert t_real.startswith("real") 76 | t_str = t_real.split("\t")[1] 77 | min_str, sec_str = t_str.split("m") 78 | sec = 60 * int(min_str) + float(sec_str[:-1]) 79 | sec_per_rep = sec / repetitions 80 | except Exception as exc: 81 | raise RuntimeError( 82 | f"Failed to parse `time` stderr output: {process.stderr}" 83 | ) from exc 84 | realtimes.append(sec_per_rep) 85 | 86 | return realtimes 87 | 88 | 89 | def get_reference_shell_cmd(dirpath, algorithm): 90 | if algorithm == "md5": 91 | pass 92 | elif algorithm.startswith("sha"): 93 | version = int(algorithm[3:]) 94 | algorithm = f"shasum -a {version}" 95 | else: 96 | raise ValueError("only md5 and sha supported") 97 | 98 | return ( 99 | f"find {dirpath} -type f -print0 | sort -z | xargs -0 {algorithm} | {algorithm}" 100 | ) 101 | 102 | 103 | def get_dirhash_shell_cmd(dirpath, algorithm, workers=1): 104 | return f"dirhash {dirpath} -a {algorithm} -j {workers}" 105 | 106 | 107 | def benchmark(dirpath, algorithm, **kwargs): 108 | test_case = os.path.basename(dirpath) 109 | result = [] 110 | 111 | cmd = get_reference_shell_cmd(dirpath, algorithm) 112 | realtimes = time_shell(cmd=cmd, **kwargs) 113 | res = { 114 | "test_case": test_case, 115 | "implementation": "shell reference", 116 | "algorithm": algorithm, 117 | "workers": 1, 118 | "t_best": min(realtimes), 119 | "t_median": median(realtimes), 120 | } 121 | print(res) 122 | print(realtimes) 123 | result.append(res) 124 | 125 | for workers in [1, 2, 4, 8]: 126 | cmd = get_dirhash_shell_cmd(dirpath, algorithm, workers) 127 | realtimes = time_shell(cmd=cmd, **kwargs) 128 | res = { 129 | "test_case": test_case, 130 | "implementation": "dirhash", 131 | "algorithm": algorithm, 132 | "workers": workers, 133 | "t_best": min(realtimes), 134 | "t_median": median(realtimes), 135 | } 136 | print(res) 137 | print(realtimes) 138 | result.append(res) 139 | 140 | return result 141 | 142 | 143 | if __name__ == "__main__": 144 | test_cases = require_test_cases() 145 | results = [] 146 | for alg in ["md5", "sha1"]: 147 | for test_case in test_cases: 148 | result = benchmark(test_case, algorithm=alg, runs=5, repetitions=1) 149 | results.extend(result) 150 | 151 | result_fname = f"results_v{__version__}" 152 | 153 | with open(os.path.join(BENCHMARK_ROOT, result_fname + ".json"), "w") as f: 154 | json.dump(results, f, indent=2) 155 | 156 | try: 157 | import pandas as pd 158 | 159 | df = pd.DataFrame(results) 160 | df = df[ 161 | [ 162 | "test_case", 163 | "implementation", 164 | "algorithm", 165 | "workers", 166 | "t_best", 167 | "t_median", 168 | ] 169 | ] 170 | for (_tc, _alg), subdf in df.groupby(["test_case", "algorithm"]): 171 | t_ref = subdf[subdf["implementation"] == "shell reference"][ 172 | "t_median" 173 | ].values[0] 174 | speed_up = t_ref / subdf["t_median"] 175 | df.loc[speed_up.index, "speed-up (median)"] = 
speed_up 176 | print(df) 177 | df_hd = df[df["implementation"] == "dirhash"] 178 | df_hd_1w = df_hd[df_hd["workers"] == 1] 179 | df_hd_8w = df_hd[df_hd["workers"] == 8] 180 | mean_speedup_1w = df_hd_1w.mean()["speed-up (median)"] 181 | mean_speedup_8w = df_hd_8w.mean()["speed-up (median)"] 182 | print(f"\nAverage speedup (single process): {mean_speedup_1w}") 183 | print(df_hd_1w) 184 | print(f"\nAverage speedup multiprocess (8 workers): {mean_speedup_8w}") 185 | print(df_hd_8w) 186 | df.to_csv(os.path.join(BENCHMARK_ROOT, result_fname + ".csv")) 187 | except ImportError: 188 | pass 189 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | target: 100% # the required coverage value 6 | threshold: 5% # the leniency in hitting the target 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "versioneer==0.29"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.ruff] 6 | target-version = "py38" 7 | 8 | [tool.ruff.lint] 9 | select = [ 10 | "E", # pycodestyle errors 11 | "W", # pycodestyle warnings 12 | "F", # pyflakes 13 | "I", # isort 14 | "B", # flake8-bugbear 15 | "C4", # flake8-comprehensions 16 | "UP", # pyupgrade 17 | ] 18 | 19 | [tool.ruff.lint.isort] 20 | known-local-folder = ["dirhash"] 21 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [versioneer] 2 | VCS = git 3 | style = pep440 4 | versionfile_source = src/dirhash/_version.py 5 | versionfile_build = dirhash/_version.py 6 | tag_prefix = v 7 | parentdir_prefix = dirhash- 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import versioneer 4 | from setuptools import find_packages, setup 5 | 6 | PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) 7 | 8 | DESCRIPTION = "Python module and CLI for hashing of file system directories." 
9 | 10 | try: 11 | with open(os.path.join(PROJECT_ROOT, "README.md"), encoding="utf-8") as f: 12 | long_description = "\n" + f.read() 13 | except OSError: 14 | long_description = DESCRIPTION 15 | 16 | setup( 17 | name="dirhash", 18 | version=versioneer.get_version(), 19 | cmdclass=versioneer.get_cmdclass(), 20 | description=DESCRIPTION, 21 | long_description=long_description, 22 | long_description_content_type="text/markdown", 23 | url="https://github.com/andhus/dirhash-python", 24 | author="Anders Huss", 25 | author_email="andhus@kth.se", 26 | license="MIT", 27 | python_requires=">=3.8", 28 | install_requires=["scantree>=0.0.4"], 29 | packages=find_packages("src"), 30 | package_dir={"": "src"}, 31 | include_package_data=True, 32 | entry_points={ 33 | "console_scripts": ["dirhash=dirhash.cli:main"], 34 | }, 35 | tests_require=["pre-commit", "pytest", "pytest-cov"], 36 | ) 37 | -------------------------------------------------------------------------------- /src/dirhash/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """dirhash - a python library (and CLI) for hashing of file system directories.""" 3 | 4 | import hashlib 5 | import os 6 | from functools import partial 7 | from multiprocessing import Pool 8 | 9 | from scantree import CyclicLinkedDir, RecursionFilter, scantree 10 | 11 | from . import _version 12 | 13 | __version__ = _version.get_versions()["version"] 14 | 15 | __all__ = [ 16 | "__version__", 17 | "algorithms_guaranteed", 18 | "algorithms_available", 19 | "dirhash", 20 | "dirhash_impl", 21 | "included_paths", 22 | "Filter", 23 | "get_match_patterns", 24 | "Protocol", 25 | ] 26 | 27 | 28 | algorithms_guaranteed = {"md5", "sha1", "sha224", "sha256", "sha384", "sha512"} 29 | algorithms_available = hashlib.algorithms_available 30 | 31 | 32 | def dirhash( 33 | directory, 34 | algorithm, 35 | match=("*",), 36 | ignore=None, 37 | linked_dirs=True, 38 | linked_files=True, 39 | empty_dirs=False, 40 | entry_properties=("name", "data"), 41 | allow_cyclic_links=False, 42 | chunk_size=2**20, 43 | jobs=1, 44 | ): 45 | """Computes the hash of a directory based on its structure and content. 46 | 47 | # Arguments 48 | directory: Union[str, pathlib.Path] - Path to the directory to hash. 49 | algorithm: str - The name of the hashing algorithm to use. See 50 | `dirhash.algorithms_available` for the available options. 51 | match: Iterable[str] - An iterable of glob/wildcard match-patterns for paths 52 | to include when computing the hash. Default is ["*"] which means that all 53 | files and directories are matched. To e.g. only include python source 54 | files, use: `match=["*.py"]`. See "Path Selection and Filtering" section 55 | below for further details. 56 | ignore: Optional[Iterable[str]] - An iterable of glob/wildcard match-patterns 57 | for paths to ignore when computing the hash. Default `None` (no ignore 58 | patterns). To e.g. exclude hidden files and directories use: 59 | `ignore=[".*/", ".*"]`. See "Path Selection and Filtering" section below 60 | for further details. 61 | linked_dirs: bool - If `True` (default), follow symbolic links to other 62 | *directories* and include these and their content in the hash 63 | computation. 64 | linked_files: bool - If `True` (default), include symbolic linked files in 65 | the hash computation. 66 | empty_dirs: bool - If `True`, include empty directories when computing the 67 | hash. 
A directory is considered empty if it does not contain any files 68 | that *match the provided matching criteria*. Default `False`, i.e. empty 69 | directories are ignored (as is done in git version control). 70 | entry_properties: Iterable[str] - A set (i.e. order does not matter) of the 71 | file/directory properties to consider when computing the hash. Supported 72 | properties are {"name", "data", "is_link"} where at least one of 73 | "name" and "data" must be included. Default is ["name", "data"] which 74 | means that the content (actual data) as well as the path relative to the 75 | root `directory` of files will affect the hash value. See "Entry 76 | Properties Interpretation" section below for further details. 77 | allow_cyclic_links: bool - If `False` (default) a `SymlinkRecursionError` is 78 | raised on presence of cyclic symbolic links. If set to `True` the 79 | dirhash value for the directory causing the cyclic link is replaced with the 80 | hash function hexdigest of the relative path from the link to the target. 81 | chunk_size: int - The number of bytes to read in one go from files while 82 | being hashed. A too small size will slow down the processing and a larger 83 | size consumes more working memory. Default 2**20 bytes = 1 MiB. 84 | jobs: int - The number of processes to use when computing the hash. 85 | Default `1`, which means that a single (the main) process is used. NOTE 86 | that using multiprocessing can significantly speed up execution, see 87 | `https://github.com/andhus/dirhash-python/benchmark` for further 88 | details. 89 | 90 | # Returns 91 | str - The hash/checksum as a string of the hexadecimal digits (the result of 92 | `hexdigest` method of the hashlib._hashlib.HASH object corresponding to the 93 | provided `algorithm`). 94 | 95 | # Raises 96 | TypeError/ValueError: For incorrectly provided arguments. 97 | SymlinkRecursionError: In case the `directory` contains symbolic links that 98 | lead to (infinite) recursion and `allow_cyclic_links=False` (default). 99 | 100 | # Path Selection and Filtering 101 | Provided glob/wildcard (".gitignore style") match-patterns determine what 102 | paths within the `directory` to include when computing the hash value. Paths 103 | *relative to the root `directory`* (i.e. excluding the name of the root 104 | directory itself) are matched against the patterns. 105 | The `match` argument represents what should be *included* - as opposed 106 | to the `ignore` argument for which matches are *excluded*. Using `ignore` is 107 | just short for adding the same patterns to the `match` argument with the 108 | prefix "!", i.e. the calls below are equivalent: 109 | `dirhash(..., match=["*", "!<pattern>"])` 110 | `dirhash(..., ignore=["<pattern>"])` 111 | To validate which paths are included, call `dirhash.included_paths` with 112 | the same values for the arguments: `match`, `ignore`, `linked_dirs`, 113 | `linked_files` and `empty_dirs` to get a list of all paths that will be 114 | included when computing the hash by this function. 115 | 116 | # Entry Properties Interpretation 117 | - ["name", "data"] (Default) - The name as well as data is included. Due to 118 | the recursive nature of the dirhash computation, "name" implies that the 119 | path relative to the root `directory` of each file/directory affects the 120 | computed hash value. 121 | - ["data"] - Compute the hash only based on the data of files - 122 | *not* their names or the names of their parent directories.
NOTE that 123 | the tree structure in which files are organized under the `directory` 124 | root still influences the computed hash. As long as all files have 125 | the same content and are organised the same way in relation to all 126 | other files in the Directed Acyclic Graph representing the file-tree, 127 | the hash will remain the same (but the "name of nodes" does not 128 | matter). This option can e.g. be used to verify that data is 129 | unchanged after renaming files (change extensions etc.). 130 | - ["name"] - Compute the hash only based on the name and location of 131 | files in the file tree under the `directory` root. This option can 132 | e.g. be used to check if any files have been added/moved/removed, 133 | ignoring the content of each file. 134 | - "is_link" - if this option is added to any of the cases above the 135 | hash value is also affected by whether a file or directory is a 136 | symbolic link or not. NOTE: with this property added, the hash 137 | will be different than without it even if there are no symbolic links 138 | in the directory. 139 | 140 | # References 141 | See https://github.com/andhus/dirhash/README.md for a formal 142 | description of how the returned hash value is computed. 143 | """ 144 | filter_ = Filter( 145 | match_patterns=get_match_patterns(match=match, ignore=ignore), 146 | linked_dirs=linked_dirs, 147 | linked_files=linked_files, 148 | empty_dirs=empty_dirs, 149 | ) 150 | protocol = Protocol( 151 | entry_properties=entry_properties, allow_cyclic_links=allow_cyclic_links 152 | ) 153 | return dirhash_impl( 154 | directory=directory, 155 | algorithm=algorithm, 156 | filter_=filter_, 157 | protocol=protocol, 158 | chunk_size=chunk_size, 159 | jobs=jobs, 160 | ) 161 | 162 | 163 | def dirhash_impl( 164 | directory, algorithm, filter_=None, protocol=None, chunk_size=2**20, jobs=1 165 | ): 166 | """Computes the hash of a directory based on its structure and content. 167 | 168 | In contrast to `dirhash.dirhash`, this function accepts custom implementations of 169 | the `dirhash.Filter` and `dirhash.Protocol` classes. 170 | 171 | # Arguments 172 | directory: Union[str, pathlib.Path] - Path to the directory to hash. 173 | algorithm: str - The name of the hashing algorithm to use. See 174 | `dirhash.algorithms_available` for the available options. 175 | It is also possible to provide a callable object that returns an instance 176 | implementing the `hashlib._hashlib.HASH` interface. 177 | filter_: dirhash.Filter - Determines what files and directories to include 178 | when computing the hash. See docs of `dirhash.Filter` for further 179 | details. 180 | protocol: dirhash.Protocol - Determines (mainly) what properties of files and 181 | directories to consider when computing the hash value. 182 | chunk_size: int - The number of bytes to read in one go from files while 183 | being hashed. A too small size will slow down the processing and a larger 184 | size consumes more working memory. Default 2**20 bytes = 1 MiB. 185 | jobs: int - The number of processes to use when computing the hash. 186 | Default `1`, which means that a single (the main) process is used. NOTE 187 | that using multiprocessing can significantly speed up execution, see 188 | `https://github.com/andhus/dirhash/tree/master/benchmark` for further 189 | details. 190 | 191 | # Returns 192 | str - The hash/checksum as a string of the hexadecimal digits (the result of 193 | `hexdigest` method of the hashlib._hashlib.HASH object corresponding to the 194 | provided `algorithm`). 
195 | 196 | # Raises 197 | TypeError/ValueError: For incorrectly provided arguments. 198 | SymlinkRecursionError: In case the `directory` contains symbolic links that 199 | lead to (infinite) recursion and the protocol option `allow_cyclic_links` 200 | is `False`. 201 | 202 | # References 203 | See https://github.com/andhus/dirhash/README.md for a formal 204 | description of how the returned hash value is computed. 205 | """ 206 | 207 | def get_instance(value, cls_, argname): 208 | if isinstance(value, cls_): 209 | return value 210 | if value is None: 211 | return cls_() 212 | raise TypeError(f"{argname} must be an instance of {cls_} or None") 213 | 214 | filter_ = get_instance(filter_, Filter, "filter_") 215 | protocol = get_instance(protocol, Protocol, "protocol") 216 | hasher_factory = _get_hasher_factory(algorithm) 217 | 218 | def dir_apply(dir_node): 219 | if not filter_.empty_dirs: 220 | if dir_node.path.relative == "" and dir_node.empty: 221 | # only check if root node is empty (other empty dirs are filter 222 | # before `dir_apply` with `filter_.empty_dirs=False`) 223 | raise ValueError(f"{directory}: Nothing to hash") 224 | descriptor = protocol.get_descriptor(dir_node) 225 | _dirhash = hasher_factory(descriptor.encode("utf-8")).hexdigest() 226 | 227 | return dir_node.path, _dirhash 228 | 229 | if jobs == 1: 230 | cache = {} 231 | 232 | def file_apply(path): 233 | return path, _get_filehash( 234 | path.real, hasher_factory, chunk_size=chunk_size, cache=cache 235 | ) 236 | 237 | _, dirhash_ = scantree( 238 | directory, 239 | recursion_filter=filter_, 240 | file_apply=file_apply, 241 | dir_apply=dir_apply, 242 | follow_links=True, 243 | allow_cyclic_links=protocol.allow_cyclic_links, 244 | cache_file_apply=False, 245 | include_empty=filter_.empty_dirs, 246 | jobs=1, 247 | ) 248 | else: # multiprocessing 249 | real_paths = set() 250 | 251 | def extract_real_paths(path): 252 | real_paths.add(path.real) 253 | return path 254 | 255 | root_node = scantree( 256 | directory, 257 | recursion_filter=filter_, 258 | file_apply=extract_real_paths, 259 | follow_links=True, 260 | allow_cyclic_links=protocol.allow_cyclic_links, 261 | cache_file_apply=False, 262 | include_empty=filter_.empty_dirs, 263 | jobs=1, 264 | ) 265 | real_paths = list(real_paths) 266 | # hash files in parallel 267 | file_hashes = _parmap( 268 | partial( 269 | _get_filehash, hasher_factory=hasher_factory, chunk_size=chunk_size 270 | ), 271 | real_paths, 272 | jobs=jobs, 273 | ) 274 | # prepare the mapping with precomputed file hashes 275 | real_path_to_hash = dict(zip(real_paths, file_hashes)) 276 | 277 | def file_apply(path): 278 | return path, real_path_to_hash[path.real] 279 | 280 | _, dirhash_ = root_node.apply(file_apply=file_apply, dir_apply=dir_apply) 281 | 282 | return dirhash_ 283 | 284 | 285 | def included_paths( 286 | directory, 287 | match=("*",), 288 | ignore=None, 289 | linked_dirs=True, 290 | linked_files=True, 291 | empty_dirs=False, 292 | allow_cyclic_links=False, 293 | ): 294 | """Inspect what paths are included for the corresponding arguments to the 295 | `dirhash.dirhash` function. 296 | 297 | # Arguments: 298 | This function accepts the following subset of the function `dirhash.dirhash` 299 | arguments: `directory`, `match`, `ignore`, `linked_dirs`, `linked_files`, 300 | `empty_dirs` and `allow_cyclic_links`, *with the same interpretation*. See 301 | docs of `dirhash.dirhash` for further details. 
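# Example
        An illustrative call (the file tree is hypothetical, not taken from this
        repository): for a `directory` containing only the files `d1/f1` and `f2`,
        calling `included_paths(directory)` with default arguments returns the
        sorted relative paths `['d1/f1', 'f2']` (with OS-native path separators);
        with `empty_dirs=True`, an included empty directory is listed with a
        trailing `/.` (e.g. `'empty/.'`).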
302 | 303 | # Returns 304 | List[str] - A sorted list of the paths that would be included when computing 305 | the hash of the `directory` using `dirhash.dirhash` and the same arguments. 306 | """ 307 | filter_ = Filter( 308 | match_patterns=get_match_patterns(match=match, ignore=ignore), 309 | linked_dirs=linked_dirs, 310 | linked_files=linked_files, 311 | empty_dirs=empty_dirs, 312 | ) 313 | protocol = Protocol(allow_cyclic_links=allow_cyclic_links) 314 | 315 | leafpaths = scantree( 316 | directory, 317 | recursion_filter=filter_, 318 | follow_links=True, 319 | allow_cyclic_links=protocol.allow_cyclic_links, 320 | include_empty=filter_.empty_dirs, 321 | ).leafpaths() 322 | 323 | return [ 324 | path.relative if path.is_file() else os.path.join(path.relative, ".") 325 | for path in leafpaths 326 | ] 327 | 328 | 329 | class Filter(RecursionFilter): 330 | """Specification of what files and directories to include for the `dirhash` 331 | computation. 332 | 333 | # Arguments 334 | match: Iterable[str] - An iterable of glob/wildcard (".gitignore style") 335 | match patterns for selection of which files and directories to include. 336 | Paths *relative to the root `directory`* (i.e. excluding the name of the 337 | root directory itself) are matched against the provided patterns. For 338 | example, to include all files except for hidden ones, use: 339 | `match=['*', '!.*']`. Default `None` which is equivalent to `['*']`, 340 | i.e. everything is included. 341 | linked_dirs: bool - If `True` (default), follow symbolic links to other 342 | *directories* and include these and their content in the hash 343 | computation. 344 | linked_files: bool - If `True` (default), include symbolic linked files in 345 | the hash computation. 346 | empty_dirs: bool - If `True`, include empty directories when computing the 347 | hash. A directory is considered empty if it does not contain any files 348 | that *match the provided matching criteria*. Default `False`, i.e. empty 349 | directories are ignored (as is done in git version control). 350 | """ 351 | 352 | def __init__( 353 | self, match_patterns=None, linked_dirs=True, linked_files=True, empty_dirs=False 354 | ): 355 | super().__init__( 356 | linked_dirs=linked_dirs, linked_files=linked_files, match=match_patterns 357 | ) 358 | self.empty_dirs = empty_dirs 359 | 360 | 361 | def get_match_patterns( 362 | match=None, 363 | ignore=None, 364 | ignore_extensions=None, 365 | ignore_hidden=False, 366 | ): 367 | """Helper to compose a list of glob/wildcard (".gitignore style") match 368 | patterns based on options dedicated to a few standard use cases. 369 | 370 | # Arguments 371 | match: Optional[List[str]] - A list of match-patterns for files to *include*. 372 | Default `None` which is equivalent to `['*']`, i.e. everything is 373 | included (unless excluded by arguments below). 374 | ignore: Optional[List[str]] - A list of match-patterns for files to 375 | *ignore*. Default `None` (no ignore patterns). 376 | ignore_extensions: Optional[List[str]] - A list of file extensions to 377 | ignore. Short for `ignore=['*.<ext>', ...]`. Default `None` (no 378 | extensions ignored). 379 | ignore_hidden: bool - If `True` ignore hidden files and directories. Short 380 | for `ignore=['.*', '.*/']`. Default `False`. 
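# Example
        An illustrative composition (the argument values are made up):
        `get_match_patterns(match=['*.py'], ignore_hidden=True)` returns
        `['*.py', '!.*', '!.*/']` - the ignore-style options are translated into
        negated ('!'-prefixed) match patterns and duplicates are removed.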
381 | """ 382 | match = ["*"] if match is None else list(match) 383 | ignore = [] if ignore is None else list(ignore) 384 | ignore_extensions = [] if ignore_extensions is None else list(ignore_extensions) 385 | 386 | if ignore_hidden: 387 | ignore.extend([".*", ".*/"]) 388 | 389 | for ext in ignore_extensions: 390 | if not ext.startswith("."): 391 | ext = "." + ext 392 | ext = "*" + ext 393 | ignore.append(ext) 394 | 395 | match_spec = match + ["!" + ign for ign in ignore] 396 | 397 | def deduplicate(items): 398 | items_set = set() 399 | dd_items = [] 400 | for item in items: 401 | if item not in items_set: 402 | dd_items.append(item) 403 | items_set.add(item) 404 | 405 | return dd_items 406 | 407 | return deduplicate(match_spec) 408 | 409 | 410 | class Protocol: 411 | """Specification of which file and directory properties to consider when 412 | computing the `dirhash` value. 413 | 414 | # Arguments 415 | entry_properties: Iterable[str] - A combination of the supported properties 416 | {"name", "data", "is_link"} where at least one of "name" and "data" is 417 | included. Interpretation: 418 | - ["name", "data"] (Default) - The name as well as data is included. Due 419 | to the recursive nature of the dirhash computation, "name" implies 420 | that the path relative to the root `directory` of each file/directory 421 | affects the computed hash value. 422 | - ["data"] - Compute the hash only based on the data of files - 423 | *not* their names or the names of their parent directories. NOTE that 424 | the tree structure in which files are organized under the `directory` 425 | root still influences the computed hash. As long as all files have 426 | the same content and are organised the same way in relation to all 427 | other files in the Directed Acyclic Graph representing the file-tree, 428 | the hash will remain the same (but the "name of nodes" does not 429 | matter). This option can e.g. be used to verify that data is 430 | unchanged after renaming files (change extensions etc.). 431 | - ["name"] - Compute the hash only based on the name and location of 432 | files in the file tree under the `directory` root. This option can 433 | e.g. be used to check if any files have been added/moved/removed, 434 | ignoring the content of each file. 435 | - "is_link" - if this option is added to any of the cases above the 436 | hash value is also affected by whether a file or directory is a 437 | symbolic link or not. NOTE: with this property added, the hash 438 | will be different than without it even if there are no symbolic links 439 | in the directory. 440 | allow_cyclic_links: bool - If `False` (default) a `SymlinkRecursionError` is 441 | raised on presence of cyclic symbolic links. If set to `True` the 442 | dirhash value for the directory causing the cyclic link is replaced with the 443 | hash function hexdigest of the relative path from the link to the target. 
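# Descriptor format (illustrative)
        As a sketch of how these properties enter the hash (derived from the
        implementation below; the file name and hash value are made up): with the
        default properties ["name", "data"], a directory containing a single file
        `a.txt` whose data hash is `abc` gets the descriptor `data:abc\000name:a.txt` -
        each entry's sorted `property:value` strings are joined by a null character,
        the entries by a double null character, and the resulting descriptor string
        is what gets hashed for the directory.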
444 | """ 445 | 446 | class EntryProperties: 447 | NAME = "name" 448 | DATA = "data" 449 | IS_LINK = "is_link" 450 | options = {NAME, DATA, IS_LINK} 451 | _DIRHASH = "dirhash" 452 | 453 | _entry_property_separator = "\000" 454 | _entry_descriptor_separator = "\000\000" 455 | 456 | def __init__(self, entry_properties=("name", "data"), allow_cyclic_links=False): 457 | entry_properties = set(entry_properties) 458 | if not entry_properties.issubset(self.EntryProperties.options): 459 | raise ValueError( 460 | f"entry properties {entry_properties - self.EntryProperties.options} " 461 | "not supported" 462 | ) 463 | if not ( 464 | self.EntryProperties.NAME in entry_properties 465 | or self.EntryProperties.DATA in entry_properties 466 | ): 467 | raise ValueError( 468 | "at least one of entry properties `name` and `data` must be used" 469 | ) 470 | self.entry_properties = entry_properties 471 | self._include_name = self.EntryProperties.NAME in entry_properties 472 | self._include_data = self.EntryProperties.DATA in entry_properties 473 | self._include_is_link = self.EntryProperties.IS_LINK in entry_properties 474 | 475 | if not isinstance(allow_cyclic_links, bool): 476 | raise ValueError( 477 | f"allow_cyclic_link must be a boolean, got {allow_cyclic_links}" 478 | ) 479 | self.allow_cyclic_links = allow_cyclic_links 480 | 481 | def get_descriptor(self, dir_node): 482 | if isinstance(dir_node, CyclicLinkedDir): 483 | return self._get_cyclic_linked_dir_descriptor(dir_node) 484 | 485 | entries = dir_node.directories + dir_node.files 486 | entry_descriptors = [ 487 | self._get_entry_descriptor(self._get_entry_properties(path, entry_hash)) 488 | for path, entry_hash in entries 489 | ] 490 | return self._entry_descriptor_separator.join(sorted(entry_descriptors)) 491 | 492 | @classmethod 493 | def _get_entry_descriptor(cls, entry_properties): 494 | entry_strings = [f"{name}:{value}" for name, value in entry_properties] 495 | return cls._entry_property_separator.join(sorted(entry_strings)) 496 | 497 | def _get_entry_properties(self, path, entry_hash): 498 | properties = [] 499 | if path.is_dir(): 500 | properties.append((self.EntryProperties._DIRHASH, entry_hash)) 501 | elif self._include_data: # path is file 502 | properties.append((self.EntryProperties.DATA, entry_hash)) 503 | 504 | if self._include_name: 505 | properties.append((self.EntryProperties.NAME, path.name)) 506 | if self._include_is_link: 507 | properties.append((self.EntryProperties.IS_LINK, path.is_symlink)) 508 | 509 | return properties 510 | 511 | def _get_cyclic_linked_dir_descriptor(self, dir_node): 512 | relpath = dir_node.path.relative 513 | target_relpath = dir_node.target_path.relative 514 | path_to_target = os.path.relpath( 515 | # the extra '.' is needed if link back to root, because 516 | # an empty path ('') is not supported by os.path.relpath 517 | os.path.join(".", target_relpath), 518 | os.path.join(".", relpath), 519 | ) 520 | # TODO normalize posix! 521 | return path_to_target 522 | 523 | 524 | def _get_hasher_factory(algorithm): 525 | """Returns a "factory" of hasher instances corresponding to the given algorithm 526 | name. Bypasses input argument `algorithm` if it is already a hasher factory 527 | (verified by attempting calls to required methods). 
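For illustration: besides algorithm names like "md5" or "sha256", passing
    `hashlib.sha256` itself is accepted, as is any custom callable whose returned
    object provides `update()` and `hexdigest()` methods (this is what the bypass
    check below attempts to verify).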
528 | """ 529 | if algorithm in algorithms_guaranteed: 530 | return getattr(hashlib, algorithm) 531 | 532 | if algorithm in algorithms_available: 533 | return partial(hashlib.new, algorithm) 534 | 535 | try: # bypass algorithm if already a hasher factory 536 | hasher = algorithm(b"") 537 | hasher.update(b"") 538 | hasher.hexdigest() 539 | return algorithm 540 | except: # noqa: E722 541 | pass 542 | 543 | raise ValueError(f"`algorithm` must be one of: {algorithms_available}`") 544 | 545 | 546 | def _parmap(func, iterable, jobs=1): 547 | """Map with multiprocessing.Pool""" 548 | if jobs == 1: 549 | return [func(element) for element in iterable] 550 | 551 | pool = Pool(jobs) 552 | try: 553 | results = pool.map(func, iterable) 554 | finally: 555 | pool.close() 556 | 557 | return results 558 | 559 | 560 | def _get_filehash(filepath, hasher_factory, chunk_size, cache=None): 561 | """Compute the hash of the given filepath. 562 | 563 | # Arguments 564 | filepath: str - Path to the file to hash. 565 | hasher_factory: (f: f() -> hashlib._hashlib.HASH): Callable that returns an 566 | instance of the `hashlib._hashlib.HASH` interface. 567 | chunk_size (int): The number of bytes to read in one go from files while 568 | being hashed. 569 | cache ({str: str} | None): A mapping from `filepath` to hash (return value 570 | of this function). If not None, a lookup will be attempted before hashing 571 | the file and the result will be added after completion. 572 | 573 | # Returns 574 | The hash/checksum as a string the of hexadecimal digits. 575 | 576 | # Side-effects 577 | The `cache` is updated if not None. 578 | """ 579 | if cache is not None: 580 | filehash = cache.get(filepath, None) 581 | if filehash is None: 582 | filehash = _get_filehash(filepath, hasher_factory, chunk_size) 583 | cache[filepath] = filehash 584 | return filehash 585 | 586 | hasher = hasher_factory() 587 | with open(filepath, "rb") as f: 588 | for chunk in iter(lambda: f.read(chunk_size), b""): 589 | hasher.update(chunk) 590 | 591 | return hasher.hexdigest() 592 | -------------------------------------------------------------------------------- /src/dirhash/_version.py: -------------------------------------------------------------------------------- 1 | # This file helps to compute a version number in source trees obtained from 2 | # git-archive tarball (such as those provided by githubs download-from-tag 3 | # feature). Distribution tarballs (built by setup.py sdist) and build 4 | # directories (produced by setup.py build) will contain a much shorter file 5 | # that just contains the computed version number. 6 | 7 | # This file is released into the public domain. 8 | # Generated by versioneer-0.29 9 | # https://github.com/python-versioneer/python-versioneer 10 | 11 | # ruff: noqa 12 | 13 | """Git implementation of _version.py.""" 14 | 15 | import errno 16 | import functools 17 | import os 18 | import re 19 | import subprocess 20 | import sys 21 | from typing import Any, Callable, Dict, List, Optional, Tuple 22 | 23 | 24 | def get_keywords() -> Dict[str, str]: 25 | """Get the keywords needed to look up the version information.""" 26 | # these strings will be replaced by git during git-archive. 27 | # setup.py/versioneer.py will grep for the variable names, so they must 28 | # each be defined on a line of their own. _version.py will just call 29 | # get_keywords(). 
30 | git_refnames = " (HEAD -> master, tag: v0.5.0)" 31 | git_full = "1ead28a0ede6c8f039ab8b8107b71b011b3d435d" 32 | git_date = "2024-08-04 00:12:01 +0200" 33 | keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} 34 | return keywords 35 | 36 | 37 | class VersioneerConfig: 38 | """Container for Versioneer configuration parameters.""" 39 | 40 | VCS: str 41 | style: str 42 | tag_prefix: str 43 | parentdir_prefix: str 44 | versionfile_source: str 45 | verbose: bool 46 | 47 | 48 | def get_config() -> VersioneerConfig: 49 | """Create, populate and return the VersioneerConfig() object.""" 50 | # these strings are filled in when 'setup.py versioneer' creates 51 | # _version.py 52 | cfg = VersioneerConfig() 53 | cfg.VCS = "git" 54 | cfg.style = "pep440" 55 | cfg.tag_prefix = "v" 56 | cfg.parentdir_prefix = "dirhash-" 57 | cfg.versionfile_source = "src/dirhash/_version.py" 58 | cfg.verbose = False 59 | return cfg 60 | 61 | 62 | class NotThisMethod(Exception): 63 | """Exception raised if a method is not valid for the current scenario.""" 64 | 65 | 66 | LONG_VERSION_PY: Dict[str, str] = {} 67 | HANDLERS: Dict[str, Dict[str, Callable]] = {} 68 | 69 | 70 | def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator 71 | """Create decorator to mark a method as the handler of a VCS.""" 72 | 73 | def decorate(f: Callable) -> Callable: 74 | """Store f in HANDLERS[vcs][method].""" 75 | if vcs not in HANDLERS: 76 | HANDLERS[vcs] = {} 77 | HANDLERS[vcs][method] = f 78 | return f 79 | 80 | return decorate 81 | 82 | 83 | def run_command( 84 | commands: List[str], 85 | args: List[str], 86 | cwd: Optional[str] = None, 87 | verbose: bool = False, 88 | hide_stderr: bool = False, 89 | env: Optional[Dict[str, str]] = None, 90 | ) -> Tuple[Optional[str], Optional[int]]: 91 | """Call the given command(s).""" 92 | assert isinstance(commands, list) 93 | process = None 94 | 95 | popen_kwargs: Dict[str, Any] = {} 96 | if sys.platform == "win32": 97 | # This hides the console window if pythonw.exe is used 98 | startupinfo = subprocess.STARTUPINFO() 99 | startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW 100 | popen_kwargs["startupinfo"] = startupinfo 101 | 102 | for command in commands: 103 | try: 104 | dispcmd = str([command] + args) 105 | # remember shell=False, so use git.cmd on windows, not just git 106 | process = subprocess.Popen( 107 | [command] + args, 108 | cwd=cwd, 109 | env=env, 110 | stdout=subprocess.PIPE, 111 | stderr=(subprocess.PIPE if hide_stderr else None), 112 | **popen_kwargs, 113 | ) 114 | break 115 | except OSError as e: 116 | if e.errno == errno.ENOENT: 117 | continue 118 | if verbose: 119 | print("unable to run %s" % dispcmd) 120 | print(e) 121 | return None, None 122 | else: 123 | if verbose: 124 | print(f"unable to find command, tried {commands}") 125 | return None, None 126 | stdout = process.communicate()[0].strip().decode() 127 | if process.returncode != 0: 128 | if verbose: 129 | print("unable to run %s (error)" % dispcmd) 130 | print("stdout was %s" % stdout) 131 | return None, process.returncode 132 | return stdout, process.returncode 133 | 134 | 135 | def versions_from_parentdir( 136 | parentdir_prefix: str, 137 | root: str, 138 | verbose: bool, 139 | ) -> Dict[str, Any]: 140 | """Try to determine the version from the parent directory name. 141 | 142 | Source tarballs conventionally unpack into a directory that includes both 143 | the project name and a version string. 
We will also support searching up 144 | two directory levels for an appropriately named parent directory 145 | """ 146 | rootdirs = [] 147 | 148 | for _ in range(3): 149 | dirname = os.path.basename(root) 150 | if dirname.startswith(parentdir_prefix): 151 | return { 152 | "version": dirname[len(parentdir_prefix) :], 153 | "full-revisionid": None, 154 | "dirty": False, 155 | "error": None, 156 | "date": None, 157 | } 158 | rootdirs.append(root) 159 | root = os.path.dirname(root) # up a level 160 | 161 | if verbose: 162 | print( 163 | "Tried directories %s but none started with prefix %s" 164 | % (str(rootdirs), parentdir_prefix) 165 | ) 166 | raise NotThisMethod("rootdir doesn't start with parentdir_prefix") 167 | 168 | 169 | @register_vcs_handler("git", "get_keywords") 170 | def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: 171 | """Extract version information from the given file.""" 172 | # the code embedded in _version.py can just fetch the value of these 173 | # keywords. When used from setup.py, we don't want to import _version.py, 174 | # so we do it with a regexp instead. This function is not used from 175 | # _version.py. 176 | keywords: Dict[str, str] = {} 177 | try: 178 | with open(versionfile_abs) as fobj: 179 | for line in fobj: 180 | if line.strip().startswith("git_refnames ="): 181 | mo = re.search(r'=\s*"(.*)"', line) 182 | if mo: 183 | keywords["refnames"] = mo.group(1) 184 | if line.strip().startswith("git_full ="): 185 | mo = re.search(r'=\s*"(.*)"', line) 186 | if mo: 187 | keywords["full"] = mo.group(1) 188 | if line.strip().startswith("git_date ="): 189 | mo = re.search(r'=\s*"(.*)"', line) 190 | if mo: 191 | keywords["date"] = mo.group(1) 192 | except OSError: 193 | pass 194 | return keywords 195 | 196 | 197 | @register_vcs_handler("git", "keywords") 198 | def git_versions_from_keywords( 199 | keywords: Dict[str, str], 200 | tag_prefix: str, 201 | verbose: bool, 202 | ) -> Dict[str, Any]: 203 | """Get version information from git keywords.""" 204 | if "refnames" not in keywords: 205 | raise NotThisMethod("Short version file found") 206 | date = keywords.get("date") 207 | if date is not None: 208 | # Use only the last line. Previous lines may contain GPG signature 209 | # information. 210 | date = date.splitlines()[-1] 211 | 212 | # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant 213 | # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 214 | # -like" string, which we must then edit to make compliant), because 215 | # it's been around since git-1.5.3, and it's too difficult to 216 | # discover which version we're using, or to work around using an 217 | # older one. 218 | date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 219 | refnames = keywords["refnames"].strip() 220 | if refnames.startswith("$Format"): 221 | if verbose: 222 | print("keywords are unexpanded, not using") 223 | raise NotThisMethod("unexpanded keywords, not a git-archive tarball") 224 | refs = {r.strip() for r in refnames.strip("()").split(",")} 225 | # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of 226 | # just "foo-1.0". If we see a "tag: " prefix, prefer those. 227 | TAG = "tag: " 228 | tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} 229 | if not tags: 230 | # Either we're using git < 1.8.3, or there really are no tags. We use 231 | # a heuristic: assume all version tags have a digit. 
The old git %d 232 | # expansion behaves like git log --decorate=short and strips out the 233 | # refs/heads/ and refs/tags/ prefixes that would let us distinguish 234 | # between branches and tags. By ignoring refnames without digits, we 235 | # filter out many common branch names like "release" and 236 | # "stabilization", as well as "HEAD" and "master". 237 | tags = {r for r in refs if re.search(r"\d", r)} 238 | if verbose: 239 | print("discarding '%s', no digits" % ",".join(refs - tags)) 240 | if verbose: 241 | print("likely tags: %s" % ",".join(sorted(tags))) 242 | for ref in sorted(tags): 243 | # sorting will prefer e.g. "2.0" over "2.0rc1" 244 | if ref.startswith(tag_prefix): 245 | r = ref[len(tag_prefix) :] 246 | # Filter out refs that exactly match prefix or that don't start 247 | # with a number once the prefix is stripped (mostly a concern 248 | # when prefix is '') 249 | if not re.match(r"\d", r): 250 | continue 251 | if verbose: 252 | print("picking %s" % r) 253 | return { 254 | "version": r, 255 | "full-revisionid": keywords["full"].strip(), 256 | "dirty": False, 257 | "error": None, 258 | "date": date, 259 | } 260 | # no suitable tags, so version is "0+unknown", but full hex is still there 261 | if verbose: 262 | print("no suitable tags, using unknown + full revision id") 263 | return { 264 | "version": "0+unknown", 265 | "full-revisionid": keywords["full"].strip(), 266 | "dirty": False, 267 | "error": "no suitable tags", 268 | "date": None, 269 | } 270 | 271 | 272 | @register_vcs_handler("git", "pieces_from_vcs") 273 | def git_pieces_from_vcs( 274 | tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command 275 | ) -> Dict[str, Any]: 276 | """Get version from 'git describe' in the root of the source tree. 277 | 278 | This only gets called if the git-archive 'subst' keywords were *not* 279 | expanded, and _version.py hasn't already been rewritten with a short 280 | version string, meaning we're inside a checked out source tree. 281 | """ 282 | GITS = ["git"] 283 | if sys.platform == "win32": 284 | GITS = ["git.cmd", "git.exe"] 285 | 286 | # GIT_DIR can interfere with correct operation of Versioneer. 287 | # It may be intended to be passed to the Versioneer-versioned project, 288 | # but that should not change where we get our version from. 
289 | env = os.environ.copy() 290 | env.pop("GIT_DIR", None) 291 | runner = functools.partial(runner, env=env) 292 | 293 | _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) 294 | if rc != 0: 295 | if verbose: 296 | print("Directory %s not under git control" % root) 297 | raise NotThisMethod("'git rev-parse --git-dir' returned error") 298 | 299 | # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] 300 | # if there isn't one, this yields HEX[-dirty] (no NUM) 301 | describe_out, rc = runner( 302 | GITS, 303 | [ 304 | "describe", 305 | "--tags", 306 | "--dirty", 307 | "--always", 308 | "--long", 309 | "--match", 310 | f"{tag_prefix}[[:digit:]]*", 311 | ], 312 | cwd=root, 313 | ) 314 | # --long was added in git-1.5.5 315 | if describe_out is None: 316 | raise NotThisMethod("'git describe' failed") 317 | describe_out = describe_out.strip() 318 | full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) 319 | if full_out is None: 320 | raise NotThisMethod("'git rev-parse' failed") 321 | full_out = full_out.strip() 322 | 323 | pieces: Dict[str, Any] = {} 324 | pieces["long"] = full_out 325 | pieces["short"] = full_out[:7] # maybe improved later 326 | pieces["error"] = None 327 | 328 | branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) 329 | # --abbrev-ref was added in git-1.6.3 330 | if rc != 0 or branch_name is None: 331 | raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") 332 | branch_name = branch_name.strip() 333 | 334 | if branch_name == "HEAD": 335 | # If we aren't exactly on a branch, pick a branch which represents 336 | # the current commit. If all else fails, we are on a branchless 337 | # commit. 338 | branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) 339 | # --contains was added in git-1.5.4 340 | if rc != 0 or branches is None: 341 | raise NotThisMethod("'git branch --contains' returned error") 342 | branches = branches.split("\n") 343 | 344 | # Remove the first line if we're running detached 345 | if "(" in branches[0]: 346 | branches.pop(0) 347 | 348 | # Strip off the leading "* " from the list of branches. 349 | branches = [branch[2:] for branch in branches] 350 | if "master" in branches: 351 | branch_name = "master" 352 | elif not branches: 353 | branch_name = None 354 | else: 355 | # Pick the first branch that is returned. Good or bad. 356 | branch_name = branches[0] 357 | 358 | pieces["branch"] = branch_name 359 | 360 | # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] 361 | # TAG might have hyphens. 362 | git_describe = describe_out 363 | 364 | # look for -dirty suffix 365 | dirty = git_describe.endswith("-dirty") 366 | pieces["dirty"] = dirty 367 | if dirty: 368 | git_describe = git_describe[: git_describe.rindex("-dirty")] 369 | 370 | # now we have TAG-NUM-gHEX or HEX 371 | 372 | if "-" in git_describe: 373 | # TAG-NUM-gHEX 374 | mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) 375 | if not mo: 376 | # unparsable. Maybe git-describe is misbehaving? 
377 | pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out 378 | return pieces 379 | 380 | # tag 381 | full_tag = mo.group(1) 382 | if not full_tag.startswith(tag_prefix): 383 | if verbose: 384 | fmt = "tag '%s' doesn't start with prefix '%s'" 385 | print(fmt % (full_tag, tag_prefix)) 386 | pieces["error"] = ( 387 | f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" 388 | ) 389 | return pieces 390 | pieces["closest-tag"] = full_tag[len(tag_prefix) :] 391 | 392 | # distance: number of commits since tag 393 | pieces["distance"] = int(mo.group(2)) 394 | 395 | # commit: short hex revision ID 396 | pieces["short"] = mo.group(3) 397 | 398 | else: 399 | # HEX: no tags 400 | pieces["closest-tag"] = None 401 | out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) 402 | pieces["distance"] = len(out.split()) # total number of commits 403 | 404 | # commit date: see ISO-8601 comment in git_versions_from_keywords() 405 | date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() 406 | # Use only the last line. Previous lines may contain GPG signature 407 | # information. 408 | date = date.splitlines()[-1] 409 | pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) 410 | 411 | return pieces 412 | 413 | 414 | def plus_or_dot(pieces: Dict[str, Any]) -> str: 415 | """Return a + if we don't already have one, else return a .""" 416 | if "+" in pieces.get("closest-tag", ""): 417 | return "." 418 | return "+" 419 | 420 | 421 | def render_pep440(pieces: Dict[str, Any]) -> str: 422 | """Build up version string, with post-release "local version identifier". 423 | 424 | Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you 425 | get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty 426 | 427 | Exceptions: 428 | 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] 429 | """ 430 | if pieces["closest-tag"]: 431 | rendered = pieces["closest-tag"] 432 | if pieces["distance"] or pieces["dirty"]: 433 | rendered += plus_or_dot(pieces) 434 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 435 | if pieces["dirty"]: 436 | rendered += ".dirty" 437 | else: 438 | # exception #1 439 | rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) 440 | if pieces["dirty"]: 441 | rendered += ".dirty" 442 | return rendered 443 | 444 | 445 | def render_pep440_branch(pieces: Dict[str, Any]) -> str: 446 | """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . 447 | 448 | The ".dev0" means not master branch. Note that .dev0 sorts backwards 449 | (a feature branch will appear "older" than the master branch). 450 | 451 | Exceptions: 452 | 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] 453 | """ 454 | if pieces["closest-tag"]: 455 | rendered = pieces["closest-tag"] 456 | if pieces["distance"] or pieces["dirty"]: 457 | if pieces["branch"] != "master": 458 | rendered += ".dev0" 459 | rendered += plus_or_dot(pieces) 460 | rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) 461 | if pieces["dirty"]: 462 | rendered += ".dirty" 463 | else: 464 | # exception #1 465 | rendered = "0" 466 | if pieces["branch"] != "master": 467 | rendered += ".dev0" 468 | rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) 469 | if pieces["dirty"]: 470 | rendered += ".dirty" 471 | return rendered 472 | 473 | 474 | def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: 475 | """Split pep440 version string at the post-release segment. 
476 | 477 | Returns the release segments before the post-release and the 478 | post-release version number (or -1 if no post-release segment is present). 479 | """ 480 | vc = str.split(ver, ".post") 481 | return vc[0], int(vc[1] or 0) if len(vc) == 2 else None 482 | 483 | 484 | def render_pep440_pre(pieces: Dict[str, Any]) -> str: 485 | """TAG[.postN.devDISTANCE] -- No -dirty. 486 | 487 | Exceptions: 488 | 1: no tags. 0.post0.devDISTANCE 489 | """ 490 | if pieces["closest-tag"]: 491 | if pieces["distance"]: 492 | # update the post release segment 493 | tag_version, post_version = pep440_split_post(pieces["closest-tag"]) 494 | rendered = tag_version 495 | if post_version is not None: 496 | rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) 497 | else: 498 | rendered += ".post0.dev%d" % (pieces["distance"]) 499 | else: 500 | # no commits, use the tag as the version 501 | rendered = pieces["closest-tag"] 502 | else: 503 | # exception #1 504 | rendered = "0.post0.dev%d" % pieces["distance"] 505 | return rendered 506 | 507 | 508 | def render_pep440_post(pieces: Dict[str, Any]) -> str: 509 | """TAG[.postDISTANCE[.dev0]+gHEX] . 510 | 511 | The ".dev0" means dirty. Note that .dev0 sorts backwards 512 | (a dirty tree will appear "older" than the corresponding clean one), 513 | but you shouldn't be releasing software with -dirty anyways. 514 | 515 | Exceptions: 516 | 1: no tags. 0.postDISTANCE[.dev0] 517 | """ 518 | if pieces["closest-tag"]: 519 | rendered = pieces["closest-tag"] 520 | if pieces["distance"] or pieces["dirty"]: 521 | rendered += ".post%d" % pieces["distance"] 522 | if pieces["dirty"]: 523 | rendered += ".dev0" 524 | rendered += plus_or_dot(pieces) 525 | rendered += "g%s" % pieces["short"] 526 | else: 527 | # exception #1 528 | rendered = "0.post%d" % pieces["distance"] 529 | if pieces["dirty"]: 530 | rendered += ".dev0" 531 | rendered += "+g%s" % pieces["short"] 532 | return rendered 533 | 534 | 535 | def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: 536 | """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . 537 | 538 | The ".dev0" means not master branch. 539 | 540 | Exceptions: 541 | 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] 542 | """ 543 | if pieces["closest-tag"]: 544 | rendered = pieces["closest-tag"] 545 | if pieces["distance"] or pieces["dirty"]: 546 | rendered += ".post%d" % pieces["distance"] 547 | if pieces["branch"] != "master": 548 | rendered += ".dev0" 549 | rendered += plus_or_dot(pieces) 550 | rendered += "g%s" % pieces["short"] 551 | if pieces["dirty"]: 552 | rendered += ".dirty" 553 | else: 554 | # exception #1 555 | rendered = "0.post%d" % pieces["distance"] 556 | if pieces["branch"] != "master": 557 | rendered += ".dev0" 558 | rendered += "+g%s" % pieces["short"] 559 | if pieces["dirty"]: 560 | rendered += ".dirty" 561 | return rendered 562 | 563 | 564 | def render_pep440_old(pieces: Dict[str, Any]) -> str: 565 | """TAG[.postDISTANCE[.dev0]] . 566 | 567 | The ".dev0" means dirty. 568 | 569 | Exceptions: 570 | 1: no tags. 0.postDISTANCE[.dev0] 571 | """ 572 | if pieces["closest-tag"]: 573 | rendered = pieces["closest-tag"] 574 | if pieces["distance"] or pieces["dirty"]: 575 | rendered += ".post%d" % pieces["distance"] 576 | if pieces["dirty"]: 577 | rendered += ".dev0" 578 | else: 579 | # exception #1 580 | rendered = "0.post%d" % pieces["distance"] 581 | if pieces["dirty"]: 582 | rendered += ".dev0" 583 | return rendered 584 | 585 | 586 | def render_git_describe(pieces: Dict[str, Any]) -> str: 587 | """TAG[-DISTANCE-gHEX][-dirty]. 
588 | 589 | Like 'git describe --tags --dirty --always'. 590 | 591 | Exceptions: 592 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 593 | """ 594 | if pieces["closest-tag"]: 595 | rendered = pieces["closest-tag"] 596 | if pieces["distance"]: 597 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 598 | else: 599 | # exception #1 600 | rendered = pieces["short"] 601 | if pieces["dirty"]: 602 | rendered += "-dirty" 603 | return rendered 604 | 605 | 606 | def render_git_describe_long(pieces: Dict[str, Any]) -> str: 607 | """TAG-DISTANCE-gHEX[-dirty]. 608 | 609 | Like 'git describe --tags --dirty --always -long'. 610 | The distance/hash is unconditional. 611 | 612 | Exceptions: 613 | 1: no tags. HEX[-dirty] (note: no 'g' prefix) 614 | """ 615 | if pieces["closest-tag"]: 616 | rendered = pieces["closest-tag"] 617 | rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) 618 | else: 619 | # exception #1 620 | rendered = pieces["short"] 621 | if pieces["dirty"]: 622 | rendered += "-dirty" 623 | return rendered 624 | 625 | 626 | def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: 627 | """Render the given version pieces into the requested style.""" 628 | if pieces["error"]: 629 | return { 630 | "version": "unknown", 631 | "full-revisionid": pieces.get("long"), 632 | "dirty": None, 633 | "error": pieces["error"], 634 | "date": None, 635 | } 636 | 637 | if not style or style == "default": 638 | style = "pep440" # the default 639 | 640 | if style == "pep440": 641 | rendered = render_pep440(pieces) 642 | elif style == "pep440-branch": 643 | rendered = render_pep440_branch(pieces) 644 | elif style == "pep440-pre": 645 | rendered = render_pep440_pre(pieces) 646 | elif style == "pep440-post": 647 | rendered = render_pep440_post(pieces) 648 | elif style == "pep440-post-branch": 649 | rendered = render_pep440_post_branch(pieces) 650 | elif style == "pep440-old": 651 | rendered = render_pep440_old(pieces) 652 | elif style == "git-describe": 653 | rendered = render_git_describe(pieces) 654 | elif style == "git-describe-long": 655 | rendered = render_git_describe_long(pieces) 656 | else: 657 | raise ValueError("unknown style '%s'" % style) 658 | 659 | return { 660 | "version": rendered, 661 | "full-revisionid": pieces["long"], 662 | "dirty": pieces["dirty"], 663 | "error": None, 664 | "date": pieces.get("date"), 665 | } 666 | 667 | 668 | def get_versions() -> Dict[str, Any]: 669 | """Get version information or return default if unable to do so.""" 670 | # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have 671 | # __file__, we can work backwards from there to the root. Some 672 | # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which 673 | # case we can only use expanded keywords. 674 | 675 | cfg = get_config() 676 | verbose = cfg.verbose 677 | 678 | try: 679 | return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) 680 | except NotThisMethod: 681 | pass 682 | 683 | try: 684 | root = os.path.realpath(__file__) 685 | # versionfile_source is the relative path from the top of the source 686 | # tree (where the .git directory might live) to this file. Invert 687 | # this to find the root from __file__. 
688 | for _ in cfg.versionfile_source.split("/"): 689 | root = os.path.dirname(root) 690 | except NameError: 691 | return { 692 | "version": "0+unknown", 693 | "full-revisionid": None, 694 | "dirty": None, 695 | "error": "unable to find root of source tree", 696 | "date": None, 697 | } 698 | 699 | try: 700 | pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) 701 | return render(pieces, cfg.style) 702 | except NotThisMethod: 703 | pass 704 | 705 | try: 706 | if cfg.parentdir_prefix: 707 | return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) 708 | except NotThisMethod: 709 | pass 710 | 711 | return { 712 | "version": "0+unknown", 713 | "full-revisionid": None, 714 | "dirty": None, 715 | "error": "unable to compute version", 716 | "date": None, 717 | } 718 | -------------------------------------------------------------------------------- /src/dirhash/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Get hash for the content and/or structure of a directory.""" 3 | 4 | import argparse 5 | import sys 6 | 7 | import dirhash 8 | 9 | 10 | def main(): 11 | try: 12 | kwargs = get_kwargs(sys.argv[1:]) 13 | if kwargs.pop("list"): 14 | # kwargs below have no effect when listing 15 | for k in ["algorithm", "chunk_size", "jobs", "entry_properties"]: 16 | kwargs.pop(k) 17 | for leafpath in dirhash.included_paths(**kwargs): 18 | print(leafpath) 19 | else: 20 | print(dirhash.dirhash(**kwargs)) 21 | except Exception as e: # pragma: no cover (not picked up by coverage) 22 | sys.stderr.write(f"dirhash: {e}\n") 23 | sys.exit(1) 24 | 25 | 26 | def get_kwargs(args): 27 | parser = argparse.ArgumentParser(description="Determine the hash for a directory.") 28 | parser.add_argument( 29 | "-v", 30 | "--version", 31 | action="version", 32 | version=f"dirhash {dirhash.__version__}", 33 | ) 34 | parser.add_argument("directory", help="Directory to hash.") 35 | parser.add_argument( 36 | "-a", 37 | "--algorithm", 38 | choices=dirhash.algorithms_available, 39 | default="md5", 40 | help=( 41 | "Hashing algorithm to use, by default 'md5'. " 42 | f"Always available: {sorted(dirhash.algorithms_guaranteed)}. " 43 | f"Additionally available on current platform: " 44 | f"{sorted(dirhash.algorithms_available - dirhash.algorithms_guaranteed)}. " 45 | "Note that the same algorithm may appear multiple times in this set " 46 | "under different names (thanks to OpenSSL) " 47 | "[https://docs.python.org/2/library/hashlib.html]." 48 | ), 49 | metavar="", 50 | ) 51 | 52 | filter_options = parser.add_argument_group( 53 | title="Filtering options", 54 | description=( 55 | "Specify what files and directories to include. All files and " 56 | "directories (including symbolic links) are included by default. The " 57 | "--match/--ignore arguments allows for selection using glob/wildcard " 58 | '(".gitignore style") path matching. Paths relative to the root ' 59 | "`directory` (i.e. excluding the name of the root directory itself) are " 60 | "matched against the provided patterns. For example, to only include " 61 | 'python source files, use: `dirhash path/to/dir -m "*.py"` or to ' 62 | "exclude hidden files and directories use: " 63 | '`dirhash path/to.dir -i ".*" ".*/"` which is short for ' 64 | '`dirhash path/to.dir -m "*" "!.*" "!.*/"`. By adding the --list ' 65 | "argument, all included paths, for the given filtering arguments, are " 66 | "returned instead of the hash value. 
For further details see " 67 | "https://github.com/andhus/dirhash/README.md#filtering" 68 | ), 69 | ) 70 | filter_options.add_argument( 71 | "-m", 72 | "--match", 73 | nargs="+", 74 | default=["*"], 75 | help=( 76 | "One or several patterns for paths to include. NOTE: patterns " 77 | 'with an asterisk must be in quotes ("*") or the asterisk ' 78 | "preceded by an escape character (`*)." 79 | ), 80 | metavar="", 81 | ) 82 | filter_options.add_argument( 83 | "-i", 84 | "--ignore", 85 | nargs="+", 86 | default=None, 87 | help=( 88 | "One or several patterns for paths to exclude. NOTE: patterns " 89 | 'with an asterisk must be in quotes ("*") or the asterisk ' 90 | "preceded by an escape character (`*)." 91 | ), 92 | metavar="", 93 | ) 94 | filter_options.add_argument( 95 | "--empty-dirs", 96 | action="store_true", 97 | default=False, 98 | help="Include empty directories (containing no files that meet the matching " 99 | "criteria and no non-empty sub directories).", 100 | ) 101 | filter_options.add_argument( 102 | "--no-linked-dirs", 103 | dest="linked_dirs", 104 | action="store_false", 105 | help="Do not include symbolic links to other directories.", 106 | ) 107 | filter_options.add_argument( 108 | "--no-linked-files", 109 | dest="linked_files", 110 | action="store_false", 111 | help="Do not include symbolic links to files.", 112 | ) 113 | parser.set_defaults(linked_dirs=True, linked_files=True) 114 | 115 | protocol_options = parser.add_argument_group( 116 | title="Protocol options", 117 | description=( 118 | "Specify what properties of files and directories to include and " 119 | "whether to allow cyclic links. For further details see " 120 | "https://github.com/andhus/dirhash/DIRHASH_STANDARD.md#protocol" 121 | ), 122 | ) 123 | protocol_options.add_argument( 124 | "-p", 125 | "--properties", 126 | nargs="+", 127 | dest="entry_properties", 128 | default=["data", "name"], 129 | help=( 130 | "List of file/directory properties to include in the hash. Available " 131 | f"properties are: {list(dirhash.Protocol.EntryProperties.options)} and at " 132 | "least one of name and data must be included. Default is [data name] which " 133 | "means that both the name/paths and content (actual data) of files and " 134 | "directories will be included" 135 | ), 136 | metavar="", 137 | ) 138 | protocol_options.add_argument( 139 | "-c", 140 | "--allow-cyclic-links", 141 | default=False, 142 | action="store_true", 143 | help=( 144 | "Allow presence of cyclic links (by hashing the relative path to the " 145 | "target directory)." 146 | ), 147 | ) 148 | 149 | implementation_options = parser.add_argument_group( 150 | title="Implementation options", description="" 151 | ) 152 | implementation_options.add_argument( 153 | "-s", 154 | "--chunk-size", 155 | default=2**20, 156 | type=int, 157 | help="The chunk size (in bytes) for reading of files.", 158 | ) 159 | implementation_options.add_argument( 160 | "-j", 161 | "--jobs", 162 | type=int, 163 | default=1, # TODO make default number of cores? 
164 | help="Number of jobs (parallel processes) to use.", 165 | ) 166 | 167 | special_options = parser.add_argument_group(title="Special options") 168 | special_options.add_argument( 169 | "-l", 170 | "--list", 171 | action="store_true", 172 | default=False, 173 | help="List the file paths that will be taken into account, given the " 174 | "provided filtering options.", 175 | ) 176 | 177 | return vars(parser.parse_args(args)) 178 | 179 | 180 | if __name__ == "__main__": # pragma: no cover 181 | main() 182 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shlex 3 | import subprocess 4 | import sys 5 | 6 | import pytest 7 | 8 | import dirhash 9 | 10 | console_script = os.path.join( 11 | os.path.dirname(sys.executable), 12 | "dirhash.exe" if os.name == "nt" else "dirhash", 13 | ) 14 | if not os.path.isfile(console_script): 15 | print(os.listdir(os.path.dirname(sys.executable))) 16 | raise FileNotFoundError(f"Could not find console script at {console_script}.") 17 | if not os.access(console_script, os.X_OK): 18 | raise PermissionError(f"Console script at {console_script} is not executable.") 19 | 20 | 21 | def dirhash_run(argstring, add_env=None): 22 | if add_env: 23 | env = os.environ.copy() 24 | env.update(add_env) 25 | else: 26 | env = None 27 | process = subprocess.Popen( 28 | [console_script] + shlex.split(argstring), 29 | stdout=subprocess.PIPE, 30 | stderr=subprocess.PIPE, 31 | text=True, 32 | env=env, 33 | ) 34 | output, error = process.communicate() 35 | 36 | # in python3 output and error are `bytes` as opposed to `str` in python2 37 | if isinstance(output, bytes): 38 | output = output.decode("utf-8") 39 | if isinstance(error, bytes): 40 | error = error.decode("utf-8") 41 | 42 | return output, error, process.returncode 43 | 44 | 45 | def create_default_tree(tmpdir): 46 | """ 47 | tmpdir/ 48 | |__.dir/ 49 | | |__file 50 | |__.file 51 | |__dir/ 52 | | |__file 53 | |__empty/ 54 | |__file 55 | |__file.ext1 56 | |__file.ext2 57 | """ 58 | dotdir = tmpdir.mkdir(".dir") 59 | dotdir.join("file").write("file in hidden sub-directory") 60 | tmpdir.join(".file").write("hidden file") 61 | dir = tmpdir.mkdir("dir") 62 | dir.join("file").write("file in sub-directory") 63 | tmpdir.mkdir("empty") 64 | tmpdir.join("file").write("file") 65 | tmpdir.join("file.ext1").write("file with extension .ext1") 66 | tmpdir.join("file.ext2").write("file with extension .ext2") 67 | 68 | 69 | def osp(path: str) -> str: 70 | """Normalize path for OS.""" 71 | if os.name == "nt": # pragma: no cover 72 | return path.replace("/", "\\") 73 | return path 74 | 75 | 76 | class TestCLI: 77 | @pytest.mark.parametrize( 78 | "argstring, non_default_kwargs", 79 | [ 80 | (". -a md5", {}), 81 | (".. -a md5", {"directory": ".."}), 82 | ("target-dir -a md5", {"directory": "target-dir"}), 83 | (". -a sha256", {"algorithm": "sha256"}), 84 | # Filtering options 85 | ('. -a md5 -m "*" "!.*"', {"match": ["*", "!.*"]}), 86 | ( 87 | '. -a md5 --match "d1/*" "d2/*" --ignore "*.txt"', 88 | {"match": ["d1/*", "d2/*"], "ignore": ["*.txt"]}, 89 | ), 90 | (". -a md5 --empty-dirs", {"empty_dirs": True}), 91 | (". -a md5 --no-linked-dirs", {"linked_dirs": False}), 92 | (". -a md5 --no-linked-files", {"linked_files": False}), 93 | # Protocol options 94 | (". -a md5 --allow-cyclic-links", {"allow_cyclic_links": True}), 95 | (". -a md5 --properties name", {"entry_properties": ["name"]}), 96 | (". 
-a md5 --properties name data", {"entry_properties": ["name", "data"]}), 97 | # Implementation 98 | (". -a md5 -j 10", {"jobs": 10}), 99 | (". -a md5 -s 32000", {"chunk_size": 32000}), 100 | ], 101 | ) 102 | def test_get_kwargs(self, argstring, non_default_kwargs): 103 | from dirhash.cli import get_kwargs 104 | 105 | kwargs_expected = { 106 | "list": False, 107 | "directory": ".", 108 | "algorithm": "md5", 109 | "match": ["*"], 110 | "ignore": None, 111 | "empty_dirs": False, 112 | "linked_dirs": True, 113 | "linked_files": True, 114 | "entry_properties": ["data", "name"], 115 | "allow_cyclic_links": False, 116 | "chunk_size": 2**20, 117 | "jobs": 1, 118 | } 119 | kwargs_expected.update(non_default_kwargs) 120 | kwargs = get_kwargs(shlex.split(argstring)) 121 | assert kwargs == kwargs_expected 122 | 123 | @pytest.mark.parametrize( 124 | "description, argstrings, output", 125 | [ 126 | ( 127 | "ARGS WITHOUT EFFECT WHEN LISTING", 128 | [ 129 | ". -l", 130 | ". --list", 131 | ". -a md5 --list", 132 | ". -a sha256 --list", 133 | ". --properties name --list", 134 | ". --jobs 2 --list", 135 | ". --chunk-size 2 --list", 136 | ], 137 | ( 138 | ".dir/file\n" 139 | ".file\n" 140 | "dir/file\n" 141 | "file\n" 142 | "file.ext1\n" 143 | "file.ext2\n" 144 | ), 145 | ), 146 | ( 147 | "IGNORE EXTENSION", 148 | [ 149 | '. -i "*.ext1" --list', 150 | '. --ignore "*.ext1" --list', 151 | '. -m "*" "!*.ext1" --list', 152 | '. --match "*" "!*.ext1" --list', 153 | ], 154 | (".dir/file\n" ".file\n" "dir/file\n" "file\n" "file.ext2\n"), 155 | ), 156 | ( 157 | "IGNORE MULTIPLE EXTENSIONS", 158 | ['. -i "*.ext1" "*.ext2" --list', '. -i "*.ext*" --list'], 159 | (".dir/file\n" ".file\n" "dir/file\n" "file\n"), 160 | ), 161 | ( 162 | "IGNORE HIDDEN", 163 | ['. -i ".*" ".*/" --list'], 164 | ("dir/file\n" "file\n" "file.ext1\n" "file.ext2\n"), 165 | ), 166 | ( 167 | "INCLUDE EMPTY", 168 | [". --empty-dirs --list"], 169 | ( 170 | ".dir/file\n" 171 | ".file\n" 172 | "dir/file\n" 173 | "empty/.\n" 174 | "file\n" 175 | "file.ext1\n" 176 | "file.ext2\n" 177 | ), 178 | ), 179 | ], 180 | ) 181 | def test_list(self, description, argstrings, output, tmpdir): 182 | create_default_tree(tmpdir) 183 | with tmpdir.as_cwd(): 184 | for argstring in argstrings: 185 | o, error, returncode = dirhash_run(argstring) 186 | assert returncode == 0 187 | assert error == "" 188 | assert o == osp(output) 189 | 190 | @pytest.mark.parametrize( 191 | "argstring, kwargs, expected_hashes", 192 | [ 193 | ( 194 | ". -a md5", 195 | {"algorithm": "md5"}, 196 | [ 197 | "594c48dde0776b03eddeeb0232190be7", 198 | "d8ab965636d48e407b73b9dbba4cb928", 199 | "050e7bc9ffcb09c15186c04e0f8026df", 200 | ], 201 | ), 202 | ( 203 | ". 
-a sha256", 204 | {"algorithm": "sha256"}, 205 | [ 206 | "23a04964149889e932ba3348fe22442f4f6a3b3fec616a386a70579ee857ab7b", 207 | "7b76bac43e963f9561f37b96b92d7a174094bff230c6efbf1d8bf650e8b40b7a", 208 | "7156da2b2e5a2926eb4b72e65f389343cb6aca0578f0aedcd6f7457abd67d8f5", 209 | ], 210 | ), 211 | ], 212 | ) 213 | def test_hash_result(self, argstring, kwargs, expected_hashes, tmpdir): 214 | # verify same result from cmdline and library + regression test of actual 215 | # hashes 216 | create_default_tree(tmpdir) 217 | with tmpdir.as_cwd(): 218 | for add_argstring, add_kwargs, expected_hash in zip( 219 | ["", " -p data", " -p name"], 220 | [ 221 | {}, 222 | {"entry_properties": ["data"]}, 223 | {"entry_properties": ["name"]}, 224 | ], 225 | expected_hashes, 226 | ): 227 | # run CLI 228 | full_argstring = argstring + add_argstring 229 | cli_out, error, returncode = dirhash_run(full_argstring) 230 | assert error == "" 231 | assert returncode == 0 232 | assert cli_out[-1] == "\n" 233 | cli_hash = cli_out[:-1] 234 | 235 | # run CLI multiproc 236 | full_argstring_mp = argstring + add_argstring + " --jobs 2" 237 | cli_out_mp, error_mp, returncode_mp = dirhash_run(full_argstring_mp) 238 | assert error_mp == "" 239 | assert returncode_mp == 0 240 | assert cli_out_mp[-1] == "\n" 241 | cli_hash_mp = cli_out_mp[:-1] 242 | 243 | # run lib function 244 | full_kwargs = kwargs.copy() 245 | full_kwargs.update(add_kwargs) 246 | lib_hash = dirhash.dirhash(str(tmpdir), **full_kwargs) 247 | 248 | assert cli_hash == cli_hash_mp == lib_hash == expected_hash 249 | 250 | def test_error_bad_argument(self, tmpdir): 251 | with tmpdir.as_cwd(): 252 | o, error, returncode = dirhash_run(". --chunk-size not_an_int") 253 | assert returncode > 0 254 | assert error != "" 255 | -------------------------------------------------------------------------------- /tests/test_dirhash.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import shutil 4 | import tempfile 5 | from time import sleep, time 6 | 7 | import pytest 8 | from scantree import SymlinkRecursionError 9 | 10 | from dirhash import ( 11 | Filter, 12 | Protocol, 13 | _get_hasher_factory, 14 | _parmap, 15 | algorithms_available, 16 | algorithms_guaranteed, 17 | dirhash, 18 | dirhash_impl, 19 | get_match_patterns, 20 | included_paths, 21 | ) 22 | 23 | 24 | def osp(path: str) -> str: 25 | """Normalize path for OS.""" 26 | if os.name == "nt": # pragma: no cover 27 | return path.replace("/", "\\") 28 | return path 29 | 30 | 31 | def map_osp(paths): 32 | return [osp(path) for path in paths] 33 | 34 | 35 | class TestGetHasherFactory: 36 | def test_get_guaranteed(self): 37 | algorithm_and_hasher_factory = [ 38 | ("md5", hashlib.md5), 39 | ("sha1", hashlib.sha1), 40 | ("sha224", hashlib.sha224), 41 | ("sha256", hashlib.sha256), 42 | ("sha384", hashlib.sha384), 43 | ("sha512", hashlib.sha512), 44 | ] 45 | assert algorithms_guaranteed == {a for a, _ in algorithm_and_hasher_factory} 46 | for algorithm, expected_hasher_factory in algorithm_and_hasher_factory: 47 | hasher_factory = _get_hasher_factory(algorithm) 48 | assert hasher_factory == expected_hasher_factory 49 | 50 | def test_get_available(self): 51 | for algorithm in algorithms_available: 52 | hasher_factory = _get_hasher_factory(algorithm) 53 | try: 54 | hasher = hasher_factory() 55 | except ValueError as exc: 56 | # Some "available" algorithms are not necessarily available 57 | # (fails for e.g. 'ripemd160' in github actions for python 3.8). 
58 | # See: https://stackoverflow.com/questions/72409563/unsupported-hash-type-ripemd160-with-hashlib-in-python # noqa: E501 59 | print(f"Failed to create hasher for {algorithm}: {exc}") 60 | assert exc.args[0] == f"unsupported hash type {algorithm}" 61 | hasher = None 62 | 63 | if hasher is not None: 64 | assert hasattr(hasher, "update") 65 | assert hasattr(hasher, "hexdigest") 66 | 67 | def test_not_available(self): 68 | with pytest.raises(ValueError): 69 | _get_hasher_factory("not available") 70 | 71 | def test_bypass_hasher_factory(self): 72 | # test standard hasher 73 | hasher_factory = _get_hasher_factory(hashlib.sha256) 74 | assert hasher_factory is hashlib.sha256 75 | 76 | # test raise on custom hasher with bad interface 77 | class IncompleteMockHasher: 78 | def __init__(self, *args, **kwargs): 79 | pass 80 | 81 | def update(self, *args, **kwargs): 82 | pass 83 | 84 | with pytest.raises(ValueError): 85 | _get_hasher_factory(IncompleteMockHasher) 86 | 87 | # test custom hasher with ok interface 88 | class MockHasher(IncompleteMockHasher): 89 | def hexdigest(self): 90 | return "" 91 | 92 | hasher_factory = _get_hasher_factory(MockHasher) 93 | assert hasher_factory is MockHasher 94 | 95 | 96 | class TestGetMatchPatterns: 97 | def test_default_match_all(self): 98 | ms = get_match_patterns() 99 | assert ms == ["*"] 100 | 101 | def test_only_match(self): 102 | ms = get_match_patterns(match=["a*", "b*"]) 103 | assert ms == ["a*", "b*"] 104 | 105 | def test_only_ignore(self): 106 | ms = get_match_patterns(ignore=["a*", "b*"]) 107 | assert ms == ["*", "!a*", "!b*"] 108 | 109 | def test_match_and_ignore(self): 110 | ms = get_match_patterns(match=["a*"], ignore=["*.ext"]) 111 | assert ms == ["a*", "!*.ext"] 112 | 113 | def test_ignore_hidden(self): 114 | ms = get_match_patterns(ignore_hidden=True) 115 | assert ms == ["*", "!.*", "!.*/"] 116 | 117 | # should not duplicate if present in (general) ignore 118 | ms = get_match_patterns(ignore=[".*"], ignore_hidden=True) 119 | assert ms == ["*", "!.*", "!.*/"] 120 | 121 | ms = get_match_patterns(ignore=[".*/"], ignore_hidden=True) 122 | assert ms == ["*", "!.*/", "!.*"] 123 | 124 | ms = get_match_patterns(ignore=[".*", ".*/"], ignore_hidden=True) 125 | assert ms == ["*", "!.*", "!.*/"] 126 | 127 | def test_ignore_extensions(self): 128 | ms = get_match_patterns(ignore_extensions=[".ext"]) 129 | assert ms == ["*", "!*.ext"] 130 | 131 | # automatically adds '.' 
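        # (i.e. a bare "ext" is normalized to the "*.ext" ignore pattern)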
132 |         ms = get_match_patterns(ignore_extensions=["ext"])
133 |         assert ms == ["*", "!*.ext"]
134 | 
135 |         # mixed also works
136 |         ms = get_match_patterns(ignore_extensions=["ext1", ".ext2"])
137 |         assert ms == ["*", "!*.ext1", "!*.ext2"]
138 | 
139 |         # should not duplicate if present in (general) ignore
140 |         ms = get_match_patterns(ignore=["*.ext"], ignore_extensions=[".ext"])
141 |         assert ms == ["*", "!*.ext"]
142 | 
143 |         ms = get_match_patterns(ignore=["*.ext"], ignore_extensions=["ext"])
144 |         assert ms == ["*", "!*.ext"]
145 | 
146 | 
147 | class TempDirTest:
148 |     def setup_method(self):
149 |         self.dir = tempfile.mkdtemp()
150 | 
151 |     def teardown_method(self):
152 |         if os.path.exists(self.dir):
153 |             shutil.rmtree(self.dir)
154 | 
155 |     def path_to(self, relpath):
156 |         return os.path.join(self.dir, osp(relpath))
157 | 
158 |     def mkdirs(self, dirpath):
159 |         os.makedirs(self.path_to(dirpath))
160 | 
161 |     def mkfile(self, relpath, content=None):
162 |         with open(self.path_to(relpath), "w") as f:
163 |             if content:
164 |                 f.write(content)
165 | 
166 |     def symlink(self, src, dst):
167 |         os.symlink(self.path_to(src), self.path_to(dst))
168 | 
169 |     def remove(self, relpath):
170 |         if os.path.isdir(self.path_to(relpath)):
171 |             return shutil.rmtree(self.path_to(relpath))
172 |         os.remove(self.path_to(relpath))
173 | 
174 | 
175 | class TestGetIncludedPaths(TempDirTest):
176 |     # Integration tests with `pathspec` for basic use cases.
177 | 
178 |     def test_basic(self):
179 |         self.mkdirs("root/d1/d11")
180 |         self.mkdirs("root/d2")
181 | 
182 |         self.mkfile("root/f1")
183 |         self.mkfile("root/d1/f1")
184 |         self.mkfile("root/d1/d11/f1")
185 |         self.mkfile("root/d2/f1")
186 | 
187 |         expected_filepaths = map_osp(["d1/d11/f1", "d1/f1", "d2/f1", "f1"])
188 |         filepaths = included_paths(self.path_to("root"))
189 |         assert filepaths == expected_filepaths
190 | 
191 |         # end with '/' or not should not matter
192 |         filepaths = included_paths(self.path_to("root/"))
193 |         assert filepaths == expected_filepaths
194 | 
195 |     def test_not_a_directory(self):
196 |         self.mkdirs("root")
197 |         self.mkfile("root/f1")
198 |         # does not exist
199 |         with pytest.raises(ValueError):
200 |             included_paths(self.path_to("wrong_root"))
201 |         with pytest.raises(ValueError):
202 |             included_paths(self.path_to("root/f1"))
203 | 
204 |     def test_symlinked_file(self):
205 |         self.mkdirs("root")
206 |         self.mkfile("root/f1")
207 |         self.mkfile("linked_file")
208 |         self.symlink("linked_file", "root/f2")
209 | 
210 |         filepaths = included_paths(self.path_to("root"), linked_files=True)
211 |         assert filepaths == ["f1", "f2"]
212 | 
213 |         filepaths = included_paths(self.path_to("root"), linked_files=False)
214 |         assert filepaths == ["f1"]
215 | 
216 |         # default is 'linked_files': True
217 |         filepaths = included_paths(
218 |             self.path_to("root"),
219 |         )
220 |         assert filepaths == ["f1", "f2"]
221 | 
222 |     def test_symlinked_dir(self):
223 |         self.mkdirs("root")
224 |         self.mkfile("root/f1")
225 |         self.mkdirs("linked_dir")
226 |         self.mkfile("linked_dir/f1")
227 |         self.mkfile("linked_dir/f2")
228 |         self.symlink("linked_dir", "root/d1")
229 | 
230 |         filepaths = included_paths(self.path_to("root"), linked_dirs=False)
231 |         assert filepaths == ["f1"]
232 | 
233 |         filepaths = included_paths(self.path_to("root"), linked_dirs=True)
234 |         assert filepaths == map_osp(["d1/f1", "d1/f2", "f1"])
235 | 
236 |         # default is 'linked_dirs': True
237 |         filepaths = included_paths(self.path_to("root"))
238 |         assert filepaths == map_osp(["d1/f1", "d1/f2", "f1"])
239 | 
240 |     def test_cyclic_link(self):
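        # A symlink pointing back to an ancestor directory would make traversal
        # recurse forever: by default this raises SymlinkRecursionError, while
        # allow_cyclic_links=True includes the link once (as 'd1/link_back/.').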
241 | self.mkdirs("root/d1") 242 | self.symlink("root", "root/d1/link_back") 243 | with pytest.raises(SymlinkRecursionError) as exc_info: 244 | included_paths(self.path_to("root"), allow_cyclic_links=False) 245 | assert exc_info.value.real_path == os.path.realpath(self.path_to("root")) 246 | assert exc_info.value.first_path == self.path_to("root/") 247 | assert exc_info.value.second_path == self.path_to("root/d1/link_back") 248 | assert str(exc_info.value).startswith("Symlink recursion:") 249 | 250 | filepaths = included_paths(self.path_to("root"), allow_cyclic_links=True) 251 | assert filepaths == map_osp(["d1/link_back/."]) 252 | 253 | # default is 'allow_cyclic_links': False 254 | with pytest.raises(SymlinkRecursionError): 255 | filepaths = included_paths(self.path_to("root")) 256 | 257 | def test_ignore_hidden(self): 258 | self.mkdirs("root/d1") 259 | self.mkdirs("root/.d2") 260 | 261 | self.mkfile("root/f1") 262 | self.mkfile("root/.f2") 263 | self.mkfile("root/d1/f1") 264 | self.mkfile("root/d1/.f2") 265 | self.mkfile("root/.d2/f1") 266 | 267 | # no ignore 268 | filepaths = included_paths(self.path_to("root")) 269 | assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) 270 | 271 | # with ignore 272 | filepaths = included_paths(self.path_to("root"), match=["*", "!.*"]) 273 | assert filepaths == map_osp(["d1/f1", "f1"]) 274 | 275 | def test_ignore_hidden_files_only(self): 276 | self.mkdirs("root/d1") 277 | self.mkdirs("root/.d2") 278 | 279 | self.mkfile("root/f1") 280 | self.mkfile("root/.f2") 281 | self.mkfile("root/d1/f1") 282 | self.mkfile("root/d1/.f2") 283 | self.mkfile("root/.d2/f1") 284 | 285 | # no ignore 286 | filepaths = included_paths(self.path_to("root")) 287 | assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) 288 | 289 | # with ignore 290 | filepaths = included_paths( 291 | self.path_to("root"), match=["**/*", "!**/.*", "**/.*/*", "!**/.*/.*"] 292 | ) 293 | assert filepaths == map_osp([".d2/f1", "d1/f1", "f1"]) 294 | 295 | def test_ignore_hidden_explicitly_recursive(self): 296 | self.mkdirs("root/d1") 297 | self.mkdirs("root/.d2") 298 | 299 | self.mkfile("root/f1") 300 | self.mkfile("root/.f2") 301 | self.mkfile("root/d1/f1") 302 | self.mkfile("root/d1/.f2") 303 | self.mkfile("root/.d2/f1") 304 | 305 | # no ignore 306 | filepaths = included_paths(self.path_to("root")) 307 | assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"]) 308 | 309 | # with ignore 310 | filepaths = included_paths(self.path_to("root"), match=["*", "!**/.*"]) 311 | assert filepaths == map_osp(["d1/f1", "f1"]) 312 | 313 | def test_exclude_hidden_dirs(self): 314 | self.mkdirs("root/d1") 315 | self.mkdirs("root/.d2") 316 | self.mkdirs("root/d1/.d1") 317 | 318 | self.mkfile("root/f1") 319 | self.mkfile("root/.f2") 320 | self.mkfile("root/d1/f1") 321 | self.mkfile("root/d1/.f2") 322 | self.mkfile("root/.d2/f1") 323 | 324 | # no ignore 325 | filepaths = included_paths(self.path_to("root"), empty_dirs=True) 326 | assert filepaths == map_osp( 327 | [".d2/f1", ".f2", "d1/.d1/.", "d1/.f2", "d1/f1", "f1"] 328 | ) 329 | 330 | # with ignore 331 | filepaths = included_paths(self.path_to("root"), match=["*", "!.*/"]) 332 | assert filepaths == map_osp([".f2", "d1/.f2", "d1/f1", "f1"]) 333 | 334 | def test_exclude_hidden_dirs_and_files(self): 335 | self.mkdirs("root/d1") 336 | self.mkdirs("root/.d2") 337 | 338 | self.mkfile("root/f1") 339 | self.mkfile("root/.f2") 340 | self.mkfile("root/d1/f1") 341 | self.mkfile("root/d1/.f2") 342 | self.mkfile("root/.d2/f1") 
343 | 
344 |         # no ignore
345 |         filepaths = included_paths(self.path_to("root"))
346 |         assert filepaths == map_osp([".d2/f1", ".f2", "d1/.f2", "d1/f1", "f1"])
347 | 
348 |         # using ignore
349 |         filepaths = included_paths(self.path_to("root"), match=["*", "!.*/", "!.*"])
350 |         assert filepaths == map_osp(["d1/f1", "f1"])
351 | 
352 |     def test_exclude_extensions(self):
353 |         self.mkdirs("root/d1")
354 | 
355 |         self.mkfile("root/f")
356 |         self.mkfile("root/f.txt")
357 |         self.mkfile("root/f.skip1")
358 |         self.mkfile("root/fskip1")
359 |         self.mkfile("root/f.skip2")
360 |         self.mkfile("root/f.skip1.txt")
361 |         self.mkfile("root/f.skip1.skip2")
362 |         self.mkfile("root/f.skip1skip2")
363 |         self.mkfile("root/d1/f.txt")
364 |         self.mkfile("root/d1/f.skip1")
365 | 
366 |         filepaths = included_paths(
367 |             self.path_to("root"), match=["*", "!*.skip1", "!*.skip2"]
368 |         )
369 |         assert filepaths == map_osp(
370 |             [
371 |                 "d1/f.txt",
372 |                 "f",
373 |                 "f.skip1.txt",
374 |                 "f.skip1skip2",
375 |                 "f.txt",
376 |                 "fskip1",
377 |             ]
378 |         )
379 | 
380 |     def test_empty_dirs_include_vs_exclude(self):
381 |         self.mkdirs("root/d1")
382 |         self.mkdirs("root/d2")
383 |         self.mkdirs("root/d3/d31")
384 |         self.mkdirs("root/d4/d41")
385 | 
386 |         self.mkfile("root/d1/f")
387 |         self.mkfile("root/d3/d31/f")
388 | 
389 |         filepaths = included_paths(self.path_to("root"), empty_dirs=False)
390 |         assert filepaths == map_osp(["d1/f", "d3/d31/f"])
391 | 
392 |         # `empty_dirs=False` is the default
393 |         filepaths = included_paths(self.path_to("root"))
394 |         assert filepaths == map_osp(["d1/f", "d3/d31/f"])
395 | 
396 |         filepaths = included_paths(self.path_to("root"), empty_dirs=True)
397 |         assert filepaths == map_osp(["d1/f", "d2/.", "d3/d31/f", "d4/d41/."])
398 | 
399 |     def test_empty_dirs_because_of_filter_include_vs_exclude(self):
400 |         self.mkdirs("root/d1")
401 |         self.mkdirs("root/d2")
402 | 
403 |         self.mkfile("root/d1/f")
404 |         self.mkfile("root/d2/.f")
405 | 
406 |         filepaths = included_paths(
407 |             self.path_to("root"), match=["*", "!.*"], empty_dirs=False
408 |         )
409 |         assert filepaths == map_osp(["d1/f"])
410 | 
411 |         # `empty_dirs=False` is the default
412 |         filepaths = included_paths(
413 |             self.path_to("root"),
414 |             match=["*", "!.*"],
415 |         )
416 |         assert filepaths == map_osp(["d1/f"])
417 | 
418 |         filepaths = included_paths(
419 |             self.path_to("root"), match=["*", "!.*"], empty_dirs=True
420 |         )
421 |         assert filepaths == map_osp(["d1/f", "d2/."])
422 | 
423 |     def test_empty_dir_inclusion_not_affected_by_match(self):
424 |         self.mkdirs("root/d1")
425 |         self.mkdirs("root/.d2")
426 | 
427 |         # NOTE that empty dirs are not excluded by match_patterns:
428 | 
429 |         filepaths = included_paths(
430 |             self.path_to("root"), match=["*", "!.*"], empty_dirs=True
431 |         )
432 |         assert filepaths == map_osp([".d2/.", "d1/."])
433 | 
434 |         filepaths = included_paths(
435 |             self.path_to("root"), match=["*", "!.*/"], empty_dirs=True
436 |         )
437 |         assert filepaths == map_osp([".d2/.", "d1/."])
438 | 
439 |         filepaths = included_paths(
440 |             self.path_to("root"), match=["*", "!d1"], empty_dirs=True
441 |         )
442 |         assert filepaths == map_osp([".d2/.", "d1/."])
443 | 
444 | 
445 | def dirhash_mp_comp(*args, **kwargs):
446 |     res = dirhash(*args, **kwargs)
447 |     res_mp = dirhash(*args, **{**kwargs, "jobs": 2})
448 |     assert res == res_mp
449 |     return res
450 | 
451 | 
452 | class TestDirhash(TempDirTest):
453 |     def test_guaranteed_algorithms(self):
454 |         self.mkdirs("root/d1/d11")
455 |         self.mkdirs("root/d2")
456 |         self.mkfile("root/f1", "a")
457 |         self.mkfile("root/d1/f1", "b")
458
| self.mkfile("root/d1/d11/f1", "c") 459 | self.mkfile("root/d2/f1", "d") 460 | 461 | for algorithm, expected_hash in [ 462 | ("md5", "3c631c7f5771468a2187494f802fad8f"), 463 | ("sha1", "992aa2d00d2ed94f0c19eff7f151f5c6a7e0cc41"), 464 | ("sha224", "18013e1df933d5781b2eddb94aceeb7ab689643f1df24060fb478999"), 465 | ( 466 | "sha256", 467 | "ef7e95269fbc0e3478ad31fddd1c7d08" "907d189c61725332e8a2fd14448fe175", 468 | ), 469 | ( 470 | "sha384", 471 | "64ef4360c172bc68250f9326ea231cd1" 472 | "46a7fa1afe9d386cee0cae0e9f1b4ad2" 473 | "1df050d1df436cff792bbe81d6698026", 474 | ), 475 | ( 476 | "sha512", 477 | "7854226eb0278bc136056998890a8399" 478 | "f85ca383f7c54665026358d28b5dc716" 479 | "0ec654d2bcebf5d60974f82ed820600d" 480 | "8e807ea53d57578d076ec1c82f501208", 481 | ), 482 | ]: 483 | hash_value = dirhash_mp_comp(self.path_to("root"), algorithm) 484 | assert hash_value == expected_hash 485 | 486 | def test_recursive_descriptor(self): 487 | self.mkdirs("root/d1") 488 | self.mkdirs("root/d2") 489 | self.mkfile("root/f1", "a") 490 | self.mkfile("root/d1/f12", "b") 491 | 492 | f1_desc = "data:a\000name:f1" 493 | f12_desc = "data:b\000name:f12" 494 | d1_desc = f"dirhash:{f12_desc}\000name:d1" 495 | d2_desc = "dirhash:\000name:d2" 496 | 497 | empty_dirs_false_expected = "\000\000".join([f1_desc, d1_desc]) 498 | empty_dirs_true_expected = "\000\000".join([f1_desc, d2_desc, d1_desc]) 499 | 500 | empty_dirs_false = dirhash(self.path_to("root"), algorithm=IdentityHasher) 501 | assert empty_dirs_false == empty_dirs_false_expected 502 | 503 | empty_dirs_true = dirhash( 504 | self.path_to("root"), algorithm=IdentityHasher, empty_dirs=True 505 | ) 506 | assert empty_dirs_true == empty_dirs_true_expected 507 | 508 | def test_symlinked_file(self): 509 | self.mkdirs("root1") 510 | self.mkfile("root1/f1", "a") 511 | self.mkfile("linked_file", "b") 512 | self.symlink("linked_file", "root1/f2") 513 | 514 | self.mkdirs("root2") 515 | self.mkfile("root2/f1", "a") 516 | self.mkfile("root2/f2", "b") 517 | 518 | root1_linked_files_true = dirhash_mp_comp( 519 | self.path_to("root1"), algorithm="md5" 520 | ) 521 | root1_linked_files_false = dirhash_mp_comp( 522 | self.path_to("root1"), algorithm="md5", linked_files=False 523 | ) 524 | 525 | root2 = dirhash_mp_comp(self.path_to("root2"), algorithm="md5") 526 | 527 | assert root1_linked_files_false != root1_linked_files_true 528 | assert root1_linked_files_true == root2 529 | 530 | def test_symlinked_dir(self): 531 | self.mkdirs("root1") 532 | self.mkfile("root1/f1", "a") 533 | self.mkdirs("linked_dir") 534 | self.mkfile("linked_dir/f1", "b") 535 | self.mkfile("linked_dir/f2", "c") 536 | self.symlink("linked_dir", "root1/d1") 537 | 538 | self.mkdirs("root2") 539 | self.mkfile("root2/f1", "a") 540 | self.mkdirs("root2/d1") 541 | self.mkfile("root2/d1/f1", "b") 542 | self.mkfile("root2/d1/f2", "c") 543 | 544 | root1_linked_dirs_true = dirhash_mp_comp( 545 | self.path_to("root1"), algorithm="md5", linked_dirs=True 546 | ) 547 | root1_linked_dirs_false = dirhash_mp_comp( 548 | self.path_to("root1"), algorithm="md5", linked_dirs=False 549 | ) 550 | root2 = dirhash_mp_comp(self.path_to("root2"), algorithm="md5") 551 | 552 | assert root1_linked_dirs_false != root1_linked_dirs_true 553 | assert root1_linked_dirs_true == root2 554 | 555 | def test_cache_used_for_symlinks(self): 556 | self.mkdirs("root/dir") 557 | self.mkfile("root/file", "< one chunk content") 558 | for i in range(10): 559 | self.symlink("root/file", f"root/link_{i}") 560 | for i in range(10): 561 | 
self.symlink("root/file", f"root/dir/link_{i}")
562 |         start = time()
563 |         dirhash(self.path_to("root"), algorithm=SlowHasher)
564 |         end = time()
565 |         elapsed = end - start
566 |         assert elapsed < SlowHasher.wait_time * 2
567 | 
568 |     def test_raise_on_empty_root_without_include_empty(self):
569 |         self.mkdirs("root")
570 |         with pytest.raises(ValueError):
571 |             dirhash_mp_comp(self.path_to("root"), "sha256")
572 | 
573 |     def test_empty_root_include_empty(self):
574 |         self.mkdirs("root")
575 |         dirhash_ = dirhash_mp_comp(self.path_to("root"), "sha256", empty_dirs=True)
576 |         expected_dirhash = hashlib.sha256(b"").hexdigest()
577 |         assert dirhash_ == expected_dirhash
578 | 
579 |     def test_include_empty(self):
580 |         self.mkdirs("root/d1")
581 |         self.mkdirs("root/d2")
582 |         self.mkfile("root/d1/f")
583 | 
584 |         args = (self.path_to("root"), "sha256")
585 |         dirhash_ = dirhash_mp_comp(*args, empty_dirs=False)
586 |         dirhash_empty = dirhash_mp_comp(*args, empty_dirs=True)
587 |         assert dirhash_ != dirhash_empty
588 | 
589 |     def test_chunksize(self):
590 |         self.mkdirs("root")
591 |         self.mkfile("root/numbers.txt", str(list(range(1000))))  # spans several chunks
592 | 
593 |         hash_value = dirhash_mp_comp(self.path_to("root"), "sha256")
594 |         for chunk_size in [2**4, 2**8, 2**16]:
595 |             assert (
596 |                 dirhash_mp_comp(self.path_to("root"), "sha256", chunk_size=chunk_size)
597 |                 == hash_value
598 |             )
599 | 
600 |     def test_data_only(self):
601 |         self.mkdirs("root1")
602 |         self.mkfile("root1/a.txt", "abc")
603 |         self.mkfile("root1/b.txt", "def")
604 |         self.mkdirs("root2")
605 |         self.mkfile("root2/a.txt", "abc")
606 |         self.mkfile("root2/c.txt", "def")
607 | 
608 |         hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256")
609 |         hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256")
610 |         assert hash1 != hash2
611 | 
612 |         # with the `data` entry property only, the hash remains the same as
613 |         # long as the file contents (in order) are the same
614 |         [dhash1, dhash2] = [
615 |             dirhash_mp_comp(self.path_to(root), "sha256", entry_properties=["data"])
616 |             for root in ["root1", "root2"]
617 |         ]
618 |         assert dhash1 == dhash2
619 | 
620 |     def test_name_only(self):
621 |         self.mkdirs("root1")
622 |         self.mkfile("root1/a.txt", "abc")
623 |         self.mkfile("root1/b.txt", "def")
624 |         self.mkdirs("root2")
625 |         self.mkfile("root2/a.txt", "abc")
626 |         self.mkfile("root2/b.txt", "___")
627 | 
628 |         hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256")
629 |         hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256")
630 |         assert hash1 != hash2
631 | 
632 |         [dhash1, dhash2] = [
633 |             dirhash_mp_comp(self.path_to(root), "sha256", entry_properties=["name"])
634 |             for root in ["root1", "root2"]
635 |         ]
636 |         assert dhash1 == dhash2
637 | 
638 |     def test_is_link_property(self):
639 |         self.mkdirs("root1")
640 |         self.mkfile("root1/a.txt", "abc")
641 |         self.mkfile("root1/b.txt", "def")
642 |         self.mkdirs("root2")
643 |         self.mkfile("b_target", "def")
644 |         self.mkfile("root2/a.txt", "abc")
645 |         self.symlink("b_target", "root2/b.txt")
646 | 
647 |         hash1 = dirhash_mp_comp(self.path_to("root1"), "sha256")
648 |         hash2 = dirhash_mp_comp(self.path_to("root2"), "sha256")
649 |         assert hash1 == hash2
650 | 
651 |         for entry_properties in [
652 |             ["name", "data", "is_link"],
653 |             ["name", "is_link"],
654 |             ["data", "is_link"],
655 |         ]:
656 |             [hash1, hash2] = [
657 |                 dirhash_mp_comp(
658 |                     self.path_to(root), "sha256", entry_properties=entry_properties
659 |                 )
660 |                 for root in ["root1", "root2"]
661 |             ]
662 |             assert hash1 != hash2
663 | 
664 |     def test_raise_on_not_at_least_one_of_name_and_data(self):
665 |         self.mkdirs("root1")
666 |         self.mkfile("root1/a.txt", "abc")
667 |         dirhash_mp_comp(self.path_to("root1"), "sha256")  # check ok
668 |         with pytest.raises(ValueError):
669 |             dirhash_mp_comp(self.path_to("root1"), "sha256", entry_properties=[])
670 | 
671 |         with pytest.raises(ValueError):
672 |             dirhash_mp_comp(
673 |                 self.path_to("root1"), "sha256", entry_properties=["is_link"]
674 |             )
675 | 
676 |     @pytest.mark.skipif(
677 |         os.name == "nt",
678 |         reason="TODO: not getting expected speedup on Windows.",
679 |         # TODO: see https://github.com/andhus/scantree/issues/25
680 |     )
681 |     def test_multiproc_speedup(self):
682 |         self.mkdirs("root/dir")
683 |         num_files = 10
684 |         for i in range(num_files):
685 |             self.mkfile(f"root/file_{i}", "< one chunk content")
686 | 
687 |         expected_min_elapsed_sequential = SlowHasher.wait_time * num_files
688 | 
689 |         start = time()
690 |         dirhash(self.path_to("root"), algorithm=SlowHasher)
691 |         end = time()
692 |         elapsed_sequential = end - start
693 |         assert elapsed_sequential > expected_min_elapsed_sequential
694 | 
695 |         start = time()
696 |         dirhash(self.path_to("root"), algorithm=SlowHasher, jobs=num_files)
697 |         end = time()
698 |         elapsed_multiproc = end - start
699 |         assert elapsed_multiproc < 0.9 * expected_min_elapsed_sequential
700 |         # just check for "any" speedup; the overhead varies (and is high on CI runners)
701 | 
702 |     def test_cache_by_real_path_speedup(self, tmpdir):
703 |         num_links = 10
704 | 
705 |         # reference run without links
706 |         root1 = tmpdir.join("root1")
707 |         root1.ensure(dir=True)
708 |         for i in range(num_links):
709 |             file_i = root1.join(f"file_{i}")
710 |             file_i.write("< one chunk content", ensure=True)
711 | 
712 |         wait_time = SlowHasher.wait_time
713 |         expected_min_elapsed_no_links = wait_time * num_links
714 |         start = time()
715 |         dirhash(root1, algorithm=SlowHasher)
716 |         end = time()
717 |         elapsed_no_links = end - start
718 |         assert elapsed_no_links > expected_min_elapsed_no_links
719 |         overhead = elapsed_no_links - expected_min_elapsed_no_links
720 | 
721 |         # all links to same file
722 |         root2 = tmpdir.join("root2")
723 |         root2.ensure(dir=True)
724 |         target_file = tmpdir.join("target_file")
725 |         target_file.ensure()
726 |         for i in range(num_links):
727 |             os.symlink(target_file, root2.join(f"link_{i}"))
728 | 
729 |         overhead_margin_factor = 1.5
730 |         expected_max_elapsed_with_links = overhead * overhead_margin_factor + wait_time
731 |         assert expected_max_elapsed_with_links < expected_min_elapsed_no_links
732 |         start = time()
733 |         dirhash(root2, algorithm=SlowHasher)
734 |         end = time()
735 |         elapsed_with_links = end - start
736 |         assert elapsed_with_links < expected_max_elapsed_with_links
737 | 
738 |     def test_cache_together_with_multiprocess_speedup(self, tmpdir):
739 |         target_file_names = ["target_file_1", "target_file_2"]
740 |         num_links_per_file = 10
741 |         num_links = num_links_per_file * len(target_file_names)
742 | 
743 |         # reference run without links
744 |         root1 = tmpdir.join("root1")
745 |         root1.ensure(dir=True)
746 |         for i in range(num_links):
747 |             file_i = root1.join(f"file_{i}")
748 |             file_i.write("< one chunk content", ensure=True)
749 | 
750 |         jobs = 2
751 |         wait_time = SlowHasher.wait_time
752 |         expected_min_elapsed_no_links = wait_time * num_links / jobs
753 |         start = time()
754 |         dirhash(root1, algorithm=SlowHasher, jobs=jobs)
755 |         end = time()
756 |         elapsed_no_links = end - start
757 |         assert elapsed_no_links > expected_min_elapsed_no_links
758 |         overhead = elapsed_no_links - expected_min_elapsed_no_links
759 | 
760 |         root2 = tmpdir.join("root2")
761 |
root2.ensure(dir=True) 762 | for i, target_file_name in enumerate(target_file_names): 763 | target_file = tmpdir.join(target_file_name) 764 | target_file.write("< one chunk content", ensure=True) 765 | for j in range(num_links_per_file): 766 | os.symlink(target_file, root2.join(f"link_{i}_{j}")) 767 | 768 | overhead_margin_factor = 1.5 769 | expected_max_elapsed_with_links = ( 770 | overhead * overhead_margin_factor + wait_time * 2 771 | ) 772 | assert expected_max_elapsed_with_links < expected_min_elapsed_no_links 773 | start = time() 774 | dirhash(root2, algorithm=SlowHasher, jobs=jobs) 775 | end = time() 776 | elapsed_mp_with_links = end - start 777 | assert elapsed_mp_with_links < expected_max_elapsed_with_links 778 | 779 | def test_hash_cyclic_link_to_root(self): 780 | self.mkdirs("root/d1") 781 | self.symlink("root", "root/d1/link_back") 782 | dirhash(self.path_to("root"), "sha256", allow_cyclic_links=True) 783 | 784 | def test_hash_cyclic_link(self): 785 | self.mkdirs("root/d1/d2") 786 | self.symlink("root/d1", "root/d1/d2/link_back") 787 | dirhash(self.path_to("root"), "sha256", allow_cyclic_links=True) 788 | 789 | def test_pass_filtering_instance(self): 790 | self.mkdirs("root") 791 | self.mkfile("root/f1", "") 792 | dirhash_impl(self.path_to("root"), "sha256", filter_=Filter()) 793 | 794 | def test_pass_protocol_instance(self): 795 | self.mkdirs("root") 796 | self.mkfile("root/f1", "") 797 | dirhash_impl(self.path_to("root"), "sha256", protocol=Protocol()) 798 | 799 | def test_raise_on_wrong_type(self): 800 | self.mkdirs("root") 801 | self.mkfile("root/f1", "") 802 | with pytest.raises(TypeError): 803 | dirhash_impl(self.path_to("root"), "sha256", filter_="") 804 | with pytest.raises(TypeError): 805 | dirhash_impl(self.path_to("root"), "sha256", protocol="") 806 | 807 | 808 | class SlowHasher: 809 | wait_time = 0.25 810 | 811 | def __init__(self, *args, **kwargs): 812 | pass 813 | 814 | def update(self, data): 815 | if data != b"": 816 | sleep(self.wait_time) 817 | 818 | def hexdigest(self): 819 | return "" 820 | 821 | 822 | class IdentityHasher: 823 | def __init__(self, initial_data=b""): 824 | self.datas = [initial_data.decode("utf-8")] 825 | 826 | def update(self, data): 827 | self.datas.append(data.decode("utf-8")) 828 | 829 | def hexdigest(self): 830 | return "".join(self.datas) 831 | 832 | 833 | class TestProtocol: 834 | def test_raise_for_invalid_entry_properties(self): 835 | with pytest.raises(ValueError): 836 | Protocol(entry_properties=["not-valid"]) 837 | 838 | def test_raise_for_invalid_allow_cyclic_links(self): 839 | with pytest.raises(ValueError): 840 | Protocol(allow_cyclic_links="not-valid") 841 | 842 | 843 | def mock_func(x): 844 | return x * 2 845 | 846 | 847 | @pytest.mark.parametrize("jobs", [1, 2, 4]) 848 | def test_parmap(jobs): 849 | inputs = [1, 2, 3, 4] 850 | assert _parmap(mock_func, inputs, jobs=jobs) == [2, 4, 6, 8] 851 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = pre-commit,py{38,39,310,311,312} 3 | 4 | [testenv] 5 | deps = 6 | pytest 7 | pytest-cov 8 | commands = 9 | pytest --cov=dirhash --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc {posargs:tests} 10 | 11 | [testenv:pre-commit] 12 | skip_install = true 13 | deps = pre-commit 14 | commands = pre-commit run --all-files --show-diff-on-failure 15 | 16 | [gh-actions] 17 | python = 18 | 3.8: py38 19 | 3.9: py39 20 | 3.10: py310 21 | 
3.11: py311 22 | 3.12: py312 23 | --------------------------------------------------------------------------------