├── .flake8 ├── .git_hooks └── pre-push ├── .github └── workflows │ ├── codacy.yml │ ├── codeql.yml │ └── dependency-review.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pylintrc ├── CHANGELOG.md ├── LICENSE ├── README.md ├── duplicate_images ├── __init__.py ├── common.py ├── duplicate.py ├── function_types.py ├── hash_scanner │ ├── __init__.py │ └── image_hash_scanner.py ├── hash_store.py ├── image_pair_finder.py ├── log.py ├── methods.py ├── pair_finder_options.py ├── parse_commandline.py └── progress_bar_manager.py ├── mypy.ini ├── poetry.lock ├── pyproject.toml └── tests ├── integration ├── __init__.py ├── conftest.py ├── data │ ├── broken │ │ ├── 47ff(1).jpg │ │ └── 47ff(2).jpg │ ├── different │ │ └── pair1 │ │ │ ├── 20221026_124702.jpg │ │ │ └── 20221026_124757.jpg │ ├── equal_but_binary_different.json │ ├── equal_but_binary_different.pickle │ ├── equal_but_binary_different │ │ ├── heic_bit_depth │ │ │ ├── 20221026_124702_10bit.heic │ │ │ └── 20221026_124702_8bit.heic │ │ ├── heic_lossless_vs_lossy │ │ │ ├── 20221026_124702_lossless.heic │ │ │ └── 20221026_124702_q85.heic │ │ ├── jpeg_quality │ │ │ ├── 20221026_124702_q94.jpg │ │ │ └── 20221026_124702_q95.jpg │ │ ├── jpeg_vs_heic │ │ │ ├── 20221026_124702.heic │ │ │ └── 20221026_124702.jpg │ │ └── shrunk10% │ │ │ ├── 20221026_124702.jpg │ │ │ └── 20221026_124702_90%.jpg │ ├── exactly_equal │ │ ├── heif │ │ │ ├── test1.heif │ │ │ └── test2.heif │ │ ├── pair1 │ │ │ ├── 20221026_124702_90%-2.jpg │ │ │ └── 20221026_124702_90%.jpg │ │ ├── pair2 │ │ │ ├── 20220312_124816-2.jpg │ │ │ └── 20220312_124816.jpg │ │ ├── pair3 │ │ │ ├── IMAG0015_small-2.png │ │ │ └── IMAG0015_small.png │ │ └── webp │ │ │ ├── test1.webp │ │ │ └── test2.webp │ ├── garbage.txt │ ├── huge │ │ ├── huge.png │ │ └── huge2.png │ ├── is_image_file │ │ ├── is_image │ │ │ ├── test.heif │ │ │ ├── test.jpg │ │ │ ├── test.png │ │ │ ├── test.tiff │ │ │ └── test.webp │ │ └── is_not_image │ │ │ ├── test.mp3 │ │ │ ├── test.ogg │ │ │ └── test.txt │ ├── jpeg_artifacts │ │ ├── jpeg_10 │ │ │ ├── 20221026_124702_q10.jpg │ │ │ └── 20221026_124702_q95.jpg │ │ ├── jpeg_25 │ │ │ ├── 20221026_124702_q25.jpg │ │ │ └── 20221026_124702_q95.jpg │ │ ├── jpeg_50 │ │ │ ├── 20221026_124702_q50.jpg │ │ │ └── 20221026_124702_q95.jpg │ │ └── jpeg_75 │ │ │ ├── 20221026_124702_q75.jpg │ │ │ └── 20221026_124702_q95.jpg │ └── similar │ │ ├── many │ │ ├── 20220218_135622.jpg │ │ ├── 20220218_135658.jpg │ │ └── 20220218_135708.jpg │ │ ├── pair1 │ │ ├── 20220806_214449.jpg │ │ └── 20220806_214600.jpg │ │ └── pair2 │ │ ├── 20220329_210118.jpg │ │ └── 20220329_210123.jpg ├── test_is_image_file.py ├── test_persistent_storage.py └── test_real_images.py └── unit ├── __init__.py ├── conftest.py ├── test_actions.py ├── test_files_in_dirs.py ├── test_image_hash_scanner.py ├── test_image_pair_finder.py ├── test_imagehash.py ├── test_parse_commandline.py └── test_persistent_storage.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | ignore = F401,S101 4 | -------------------------------------------------------------------------------- /.git_hooks/pre-push: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Run the test suite before pushing to remote, so the GitLab CI fails less 4 | # often 5 | # This hook is called with the following parameters: 6 | # 7 | # $1 -- Name of the remote to which the push is being done 8 | # $2 -- URL to which the push is being done 9 | # 10 | # If pushing 
without using a named remote those arguments will be equal. 11 | # 12 | # If the log message starts with "WIP:" (work in progress) the push is allowed 13 | # even if the tests fail, since it might be necessary to push to share between 14 | # work environments. 15 | 16 | GITLAB_PROJECT_ID=6643206 17 | CI_LINT_OUTPUT=/tmp/gitlab-ci-lint.json 18 | RED_TEXT='\e[38;5;196m' 19 | GREEN_TEXT='\e[38;5;46m' 20 | RESET_TEXT='\e[0m' 21 | STATUS=0 22 | 23 | poetry run pytest -n auto tests/unit || STATUS=1 24 | poetry run pytest -n auto tests/integration || STATUS=1 25 | poetry run mypy duplicate_images tests || STATUS=1 26 | poetry run flake8 duplicate_images tests || STATUS=1 27 | poetry run pylint duplicate_images tests || STATUS=1 28 | poetry run bandit -r duplicate_images -q || STATUS=1 29 | 30 | # lint GitLab CI (nod to https://stackoverflow.com/questions/49090675/how-can-i-test-gitlab-ci-yml#68723161 ) 31 | if [ "$GITLAB_ACCESS_TOKEN" != "" ]; then 32 | rm -f "$CI_LINT_OUTPUT" 33 | jq --null-input --arg yaml "$(cat .gitlab-ci.yml)" '{ content: $yaml }' | \ 34 | curl -s "https://gitlab.com/api/v4/projects/${GITLAB_PROJECT_ID}/ci/lint" \ 35 | --header 'Content-Type: application/json' \ 36 | --header "PRIVATE-TOKEN: $GITLAB_ACCESS_TOKEN" \ 37 | --data @- > "$CI_LINT_OUTPUT" 38 | VALID=$(jq -r .valid < "$CI_LINT_OUTPUT") 39 | if [ "$VALID" = "true" ]; then 40 | echo "GitLab CI valid: ${GREEN_TEXT}$VALID${RESET_TEXT}" 41 | else 42 | echo "GitLab CI valid: ${RED_TEXT}$VALID${RESET_TEXT}" 43 | echo "errors: $(jq .errors < "$CI_LINT_OUTPUT")" 44 | echo "warnings: $(jq .warnings < "$CI_LINT_OUTPUT")" 45 | fi 46 | #rm -f "$CI_LINT_OUTPUT" 47 | else 48 | echo "\$GITLAB_ACCESS_TOKEN not set" 49 | fi 50 | 51 | # check Changelog is updated 52 | VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 53 | if test -z "$VERSION" 54 | then echo "${RED_TEXT}version not found in pyproject.toml${RESET_TEXT}"; STATUS=1 55 | else 56 | echo "${GREEN_TEXT}Version $VERSION${RESET_TEXT}" 57 | if ! grep -q "$VERSION" CHANGELOG.md 58 | then echo "${RED_TEXT}$VERSION not found in changelog${RESET_TEXT}"; STATUS=1 59 | else echo "${GREEN_TEXT}$VERSION found in CHANGELOG.md, cool${RESET_TEXT}" 60 | fi 61 | if ! fgrep -q "...$VERSION" CHANGELOG.md 62 | then echo "${RED_TEXT}link to $VERSION diff not found in changelog${RESET_TEXT}"; STATUS=1 63 | else echo "${GREEN_TEXT}link to $VERSION diff found in CHANGELOG.md, cool${RESET_TEXT}" 64 | fi 65 | if ! fgrep -q "## [$VERSION] - $(date +%Y-%m-%d)" CHANGELOG.md 66 | then echo "${RED_TEXT}date not set correctly in changelog${RESET_TEXT}"; STATUS=1 67 | else echo "${GREEN_TEXT}date in CHANGELOG.md is $(date +%Y-%m-%d), cool${RESET_TEXT}" 68 | fi 69 | fi 70 | 71 | if [ $STATUS -gt 0 ]; then 72 | commitmsg=$(git log --oneline | head -n 1 | cut -d' ' -f 2-) 73 | if echo "$commitmsg" | grep '^WIP:'; then 74 | echo >&2 "Found WIP commit, pushing in spite of failed test suite" 75 | STATUS=0 76 | fi 77 | fi 78 | if [ $STATUS -gt 0 ] 79 | then echo "Status: ${RED_TEXT}${STATUS}${RESET_TEXT}" 80 | else echo "Status: ${GREEN_TEXT}${STATUS}${RESET_TEXT}" 81 | fi 82 | exit $STATUS 83 | -------------------------------------------------------------------------------- /.github/workflows/codacy.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 
5 | 6 | # This workflow checks out code, performs a Codacy security scan 7 | # and integrates the results with the 8 | # GitHub Advanced Security code scanning feature. For more information on 9 | # the Codacy security scan action usage and parameters, see 10 | # https://github.com/codacy/codacy-analysis-cli-action. 11 | # For more information on Codacy Analysis CLI in general, see 12 | # https://github.com/codacy/codacy-analysis-cli. 13 | 14 | name: Codacy Security Scan 15 | 16 | on: 17 | push: 18 | branches: [ "master" ] 19 | pull_request: 20 | # The branches below must be a subset of the branches above 21 | branches: [ "master" ] 22 | schedule: 23 | - cron: '21 5 * * 6' 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | codacy-security-scan: 30 | permissions: 31 | contents: read # for actions/checkout to fetch code 32 | security-events: write # for github/codeql-action/upload-sarif to upload SARIF results 33 | actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status 34 | name: Codacy Security Scan 35 | runs-on: ubuntu-latest 36 | steps: 37 | # Checkout the repository to the GitHub Actions runner 38 | - name: Checkout code 39 | uses: actions/checkout@v3 40 | 41 | # Execute Codacy Analysis CLI and generate a SARIF output with the security issues identified during the analysis 42 | - name: Run Codacy Analysis CLI 43 | uses: codacy/codacy-analysis-cli-action@d840f886c4bd4edc059706d09c6a1586111c540b 44 | with: 45 | # Check https://github.com/codacy/codacy-analysis-cli#project-token to get your project token from your Codacy repository 46 | # You can also omit the token and run the tools that support default configurations 47 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} 48 | verbose: true 49 | output: results.sarif 50 | format: sarif 51 | # Adjust severity of non-security issues 52 | gh-code-scanning-compat: true 53 | # Force 0 exit code to allow SARIF file generation 54 | # This will handover control about PR rejection to the GitHub side 55 | max-allowed-issues: 2147483647 56 | 57 | # Upload the SARIF file generated in the previous step 58 | - name: Upload SARIF results file 59 | uses: github/codeql-action/upload-sarif@v2 60 | with: 61 | sarif_file: results.sarif 62 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "master" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "master" ] 20 | schedule: 21 | - cron: '27 1 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | with: 74 | category: "/language:${{matrix.language}}" 75 | -------------------------------------------------------------------------------- /.github/workflows/dependency-review.yml: -------------------------------------------------------------------------------- 1 | # Dependency Review Action 2 | # 3 | # This Action will scan dependency manifest files that change as part of a Pull Request, surfacing known-vulnerable versions of the packages declared or updated in the PR. Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable packages will be blocked from merging. 
4 | # 5 | # Source repository: https://github.com/actions/dependency-review-action 6 | # Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement 7 | name: 'Dependency Review' 8 | on: 9 | push: 10 | branches: [ "master" ] 11 | pull_request: 12 | # The branches below must be a subset of the branches above 13 | branches: [ "master" ] 14 | schedule: 15 | - cron: '21 5 * * 6' 16 | 17 | permissions: 18 | contents: read 19 | 20 | jobs: 21 | dependency-review: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: 'Checkout Repository' 25 | uses: actions/checkout@v3 26 | - name: 'Dependency Review' 27 | uses: actions/dependency-review-action@v3 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | .mypy_cache 4 | .code-quality.json 5 | *.egg-info 6 | dist 7 | *.db 8 | *.pickle 9 | *.bak 10 | fil-result 11 | .cache 12 | .local 13 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - test 3 | - publish 4 | 5 | include: 6 | - template: Code-Quality.gitlab-ci.yml 7 | - template: Security/SAST.gitlab-ci.yml 8 | 9 | .test: 10 | parallel: 11 | matrix: 12 | - PY_VERSION: [ "3.9", "3.10", "3.11", "3.12" ] 13 | stage: test 14 | image: python:$PY_VERSION 15 | rules: 16 | - if: '$CI_PIPELINE_SOURCE == "push" || $CI_PIPELINE_SOURCE == "merge_request_event"' 17 | before_script: 18 | - test $(echo $PY_VERSION | cut -d . -f 2) -gt 12 && (apt update && apt install -y gfortran libopenblas-dev) 19 | - pip install --root-user-action ignore -q poetry 20 | - poetry install 21 | 22 | unit tests: 23 | extends: .test 24 | script: 25 | - poetry run pytest --junitxml=pytest.xml tests/unit 26 | artifacts: 27 | reports: 28 | junit: 29 | - pytest.xml 30 | 31 | integration tests: 32 | extends: .test 33 | script: 34 | - poetry run pytest --junitxml=pytest.xml tests/integration 35 | artifacts: 36 | reports: 37 | junit: 38 | - pytest.xml 39 | 40 | mypy: 41 | extends: .test 42 | script: 43 | - poetry run mypy duplicate_images tests 44 | 45 | flake8: 46 | extends: .test 47 | script: 48 | - poetry run flake8 duplicate_images tests 49 | 50 | pylint: 51 | extends: .test 52 | script: 53 | - test $(echo $PY_VERSION | cut -d . -f 2) -ge 12 && PYLINT_EXTRA_ARGS="--disable=inconsistent-quotes" 54 | - poetry run pylint $PYLINT_EXTRA_ARGS duplicate_images tests 55 | 56 | bandit: 57 | extends: .test 58 | parallel: 59 | matrix: 60 | - PY_VERSION: [ "3.12" ] 61 | script: 62 | - poetry run bandit -r duplicate_images 63 | 64 | ChangelogIsUpdated: 65 | stage: test 66 | image: alpine:latest 67 | rules: 68 | - if: "$CI_MERGE_REQUEST_ID" 69 | - if: $CI_COMMIT_BRANCH == "master" 70 | script: 71 | - VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 72 | - test -n "$VERSION" 73 | - fgrep "## [$VERSION]" CHANGELOG.md 74 | - fgrep "...$VERSION" CHANGELOG.md 75 | - fgrep "## [$VERSION] - $(date +%Y-%m-%d)" CHANGELOG.md 76 | 77 | 78 | RunAndCheckResults: 79 | extends: .test 80 | variables: 81 | IMAGE_DIR: tests/integration/data/equal_but_binary_different 82 | script: 83 | - NUM_FILES=$(find $IMAGE_DIR -type f | wc -l) 84 | - EXPECTED_PAIRS=$((NUM_FILES*(NUM_FILES-1)/2)) # should be if all files matched... 
85 | - EXPECTED_PAIRS=29 # ...turns out not all files match with each other though 86 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR | wc -l) 87 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 88 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR --progress --quiet | wc -l) 89 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 90 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR --algorithm ahash --quiet | wc -l) 91 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 92 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR --max-distance 1 --quiet | wc -l) 93 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 94 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR --hash-size 8 --quiet | wc -l) 95 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 96 | 97 | RunWithArgs: 98 | extends: .test 99 | image: python:3.12 100 | variables: 101 | IMAGE_DIR: tests/integration/data 102 | HASH_DB: test.pickle 103 | parallel: 104 | matrix: 105 | - ON_EQUAL: [print, quote_inline, none, d<] 106 | ALGORITHM: [ahash, colorhash] 107 | MODE: ["", --slow, --parallel] 108 | script: 109 | # ensure the script runs without any error with the given options 110 | - poetry run find-dups 111 | ${IMAGE_DIR} --hash-db ${HASH_DB} --progress 112 | --algorithm ${ALGORITHM} 113 | --on-equal ${ON_EQUAL} 114 | $MODE 115 | # ensure the hash cache file is written 116 | - test -f ${HASH_DB} 117 | - ls -l ${HASH_DB} 118 | 119 | RunWithArgsExec: 120 | extends: RunWithArgs 121 | parallel: 122 | matrix: 123 | - ON_EQUAL: [exec] 124 | ALGORITHM: [ahash] 125 | MODE: ["", --slow, --parallel] 126 | script: 127 | # ensure the script runs without any error with the given options 128 | - poetry run find-dups 129 | ${IMAGE_DIR} --hash-db ${HASH_DB} --progress --exec "ls -l {1} {2}" 130 | --algorithm ${ALGORITHM} 131 | --on-equal ${ON_EQUAL} 132 | $MODE 133 | # ensure the hash cache file is written 134 | - test -f ${HASH_DB} 135 | - ls -l ${HASH_DB} 136 | 137 | RunWithArgsExecFailure: 138 | extends: RunWithArgs 139 | parallel: 140 | matrix: 141 | - ON_EQUAL: [ "" ] 142 | ALGORITHM: [ "" ] 143 | MODE: [ "" ] 144 | script: 145 | # ensure the script fails when given --exec without --on-equal exec 146 | - poetry run find-dups 147 | ${IMAGE_DIR} --hash-db ${HASH_DB} --progress --exec "ls -l {1} {2}" && exit 1 148 | # ensure the script fails when given --on-equal exec without --exec 149 | - poetry run find-dups 150 | ${IMAGE_DIR} --hash-db ${HASH_DB} --progress --on-equal exec && exit 1 151 | - exit 0 152 | 153 | TagIsNew: 154 | stage: test 155 | image: alpine:latest 156 | rules: 157 | - if: "$CI_MERGE_REQUEST_ID" 158 | - if: $CI_COMMIT_BRANCH == "master" 159 | before_script: 160 | - apk update 161 | - apk add git 162 | script: 163 | - VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 164 | - test -n "$VERSION" 165 | - git tag | ( ! 
grep "^${VERSION}\$" ) 166 | 167 | CreateTag: 168 | stage: publish 169 | image: alpine:latest 170 | rules: 171 | - if: $CI_COMMIT_BRANCH == "master" && $CI_PIPELINE_SOURCE != "schedule" 172 | when: on_success 173 | before_script: 174 | - apk update 175 | - apk add git 176 | - git config user.email "${GITLAB_USER_EMAIL}" 177 | - git config user.name "${GITLAB_USER_NAME}" 178 | script: 179 | - VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 180 | - echo "**** Tagging release as version $VERSION" 181 | - git remote add tag-origin https://oauth2:${GITLAB_ACCESS_TOKEN}@gitlab.com/${CI_PROJECT_PATH} 182 | - git tag -a "${VERSION}" -m "Released $(date +%Y-%m-%d)" 183 | - git push tag-origin "${VERSION}" 184 | 185 | PublishToPyPI: 186 | stage: publish 187 | image: python:3.11 188 | rules: 189 | - if: "$CI_COMMIT_TAG" 190 | when: on_success 191 | script: 192 | - VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 193 | - test "${CI_COMMIT_TAG}" == "${VERSION}" || exit 1 194 | - echo "**** Upgrading to ${VERSION}" 195 | - pip install -q poetry 196 | - poetry build 197 | - poetry config repositories.testpypi https://test.pypi.org/legacy/ 198 | - poetry publish --username __token__ --password ${TESTPYPI_TOKEN} --repository testpypi 199 | - echo "**** Attempting pip install from test PyPI server" 200 | - apt-get -y -qq update 201 | - apt-get -y -q install libsndfile1 ffmpeg > /dev/null 202 | - pip install -q --index-url https://test.pypi.org/simple --extra-index-url https://pypi.org/simple duplicate_images 203 | - echo "**** Publishing on live PyPI server" 204 | - poetry publish --username __token__ --password ${PYPI_TOKEN} 205 | 206 | PushToGithub: 207 | stage: publish 208 | image: alpine:latest 209 | rules: 210 | - if: "$CI_COMMIT_TAG" 211 | when: on_success 212 | before_script: 213 | - apk update 214 | - apk add openssh-client git sshpass 215 | - eval $(ssh-agent -s) 216 | - echo "$GITHUB_SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add - > /dev/null 217 | - mkdir -p ~/.ssh 218 | - chmod 700 ~/.ssh 219 | - ssh-keyscan github.com >> ~/.ssh/known_hosts 220 | - chmod 644 ~/.ssh/known_hosts 221 | - ssh -T git@github.com 2>&1 || true 222 | - git config user.email "${GITLAB_USER_EMAIL}" 223 | - git config user.name "${GITLAB_USER_NAME}" 224 | script: 225 | - git remote add github git@github.com:lene/DuplicateImages.git 226 | - git remote show github 227 | - BRANCH=${CI_COMMIT_BRANCH:-master} 228 | - git checkout $BRANCH 229 | - git push github $BRANCH 230 | - git push github $CI_COMMIT_TAG 231 | 232 | CreateGithubRelease: 233 | stage: publish 234 | needs: 235 | - PushToGithub 236 | image: alpine:latest 237 | rules: 238 | - if: "$CI_COMMIT_TAG" 239 | when: on_success 240 | before_script: 241 | - apk update 242 | - apk add curl 243 | variables: 244 | RELEASE_API_URL: "https://api.github.com/repos/lene/DuplicateImages/releases" 245 | DESCRIPTION: "Full Changelog: https://github.com/lene/DuplicateImages/blob/${CI_COMMIT_TAG}/CHANGELOG.md" 246 | script: 247 | - POST_DATA='{ 248 | "tag_name":"'${CI_COMMIT_TAG}'", 249 | "target_commitish":"master", 250 | "name":"'${CI_COMMIT_TAG}'", 251 | "body":"'${FULL_DESCRIPTION}${DESCRIPTION}'", 252 | "draft":false, 253 | "prerelease":false, 254 | "generate_release_notes":false 255 | }' 256 | - echo $API_URL 257 | - echo $POST_DATA 258 | - 'curl -L -X POST 259 | -H "Accept: application/vnd.github+json" 260 | -H "X-GitHub-Api-Version: 2022-11-28" 261 | -H "Authorization: Bearer ${GITHUB_API_TOKEN}" 262 | ${RELEASE_API_URL} -d "${POST_DATA}"' 263 | 264 
| CreateGitlabRelease: 265 | stage: publish 266 | image: registry.gitlab.com/gitlab-org/release-cli:latest 267 | rules: 268 | - if: $CI_COMMIT_TAG 269 | script: 270 | - echo "running release_job" # dummy, see https://gitlab.com/gitlab-org/gitlab/-/issues/223856 271 | release: 272 | tag_name: '$CI_COMMIT_TAG' 273 | description: './CHANGELOG.md' -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Specify a score threshold to be exceeded before program exits with error. 9 | fail-under=10.0 10 | 11 | # Add files or directories to the blacklist. They should be base names, not 12 | # paths. 13 | ignore=CVS 14 | 15 | # Add files or directories matching the regex patterns to the blacklist. The 16 | # regex matches against base names, not paths. 17 | ignore-patterns= 18 | 19 | # Python code to execute, usually for sys.path manipulation such as 20 | # pygtk.require(). 21 | #init-hook= 22 | 23 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 24 | # number of processors available to use. 25 | jobs=0 26 | 27 | # Control the amount of potential inferred values when inferring a single 28 | # object. This can help the performance when dealing with large functions or 29 | # complex, nested conditions. 30 | limit-inference-results=100 31 | 32 | # List of plugins (as comma separated values of python module names) to load, 33 | # usually to register additional checkers. 34 | load-plugins= 35 | 36 | # Pickle collected data for later comparisons. 37 | persistent=yes 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 63 | disable=raw-checker-failed, 64 | bad-inline-option, 65 | locally-disabled, 66 | file-ignored, 67 | suppressed-message, 68 | useless-suppression, 69 | deprecated-pragma, 70 | use-symbolic-message-instead, 71 | missing-function-docstring, 72 | unsubscriptable-object, 73 | consider-using-with 74 | 75 | # Enable the message, report, category or checker with the given id(s). 
You can 76 | # either give multiple identifier separated by comma (,) or put this option 77 | # multiple time (only on the command line, not in the configuration file where 78 | # it should appear only once). See also the "--disable" option for examples. 79 | enable=c-extension-no-member 80 | 81 | 82 | [REPORTS] 83 | 84 | # Python expression which should return a score less than or equal to 10. You 85 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 86 | # which contain the number of messages in each category, as well as 'statement' 87 | # which is the total number of statements analyzed. This score is used by the 88 | # global evaluation report (RP0004). 89 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 90 | 91 | # Template used to display messages. This is a python new-style format string 92 | # used to format the message information. See doc for all details. 93 | #msg-template= 94 | 95 | # Set the output format. Available formats are text, parseable, colorized, json 96 | # and msvs (visual studio). You can also give a reporter class, e.g. 97 | # mypackage.mymodule.MyReporterClass. 98 | output-format=text 99 | 100 | # Tells whether to display a full report or only the messages. 101 | reports=no 102 | 103 | # Activate the evaluation score. 104 | score=yes 105 | 106 | 107 | [REFACTORING] 108 | 109 | # Maximum number of nested blocks for function / method body 110 | max-nested-blocks=5 111 | 112 | # Complete name of functions that never returns. When checking for 113 | # inconsistent-return-statements if a never returning function is called then 114 | # it will be considered as an explicit return statement and no message will be 115 | # printed. 116 | never-returning-functions=sys.exit 117 | 118 | 119 | [MISCELLANEOUS] 120 | 121 | # List of note tags to take in consideration, separated by a comma. 122 | notes=FIXME, 123 | XXX, 124 | TODO 125 | 126 | # Regular expression of note tags to take in consideration. 127 | #notes-rgx= 128 | 129 | 130 | [BASIC] 131 | 132 | # Naming style matching correct argument names. 133 | argument-naming-style=snake_case 134 | 135 | # Regular expression matching correct argument names. Overrides argument- 136 | # naming-style. 137 | #argument-rgx= 138 | 139 | # Naming style matching correct attribute names. 140 | attr-naming-style=snake_case 141 | 142 | # Regular expression matching correct attribute names. Overrides attr-naming- 143 | # style. 144 | #attr-rgx= 145 | 146 | # Bad variable names which should always be refused, separated by a comma. 147 | bad-names=foo, 148 | bar, 149 | baz, 150 | toto, 151 | tutu, 152 | tata 153 | 154 | # Bad variable names regexes, separated by a comma. If names match any regex, 155 | # they will always be refused 156 | bad-names-rgxs= 157 | 158 | # Naming style matching correct class attribute names. 159 | class-attribute-naming-style=any 160 | 161 | # Regular expression matching correct class attribute names. Overrides class- 162 | # attribute-naming-style. 163 | #class-attribute-rgx= 164 | 165 | # Naming style matching correct class names. 166 | class-naming-style=PascalCase 167 | 168 | # Regular expression matching correct class names. Overrides class-naming- 169 | # style. 170 | #class-rgx= 171 | 172 | # Naming style matching correct constant names. 173 | const-naming-style=UPPER_CASE 174 | 175 | # Regular expression matching correct constant names. Overrides const-naming- 176 | # style. 
177 | #const-rgx=
178 | 
179 | # Minimum line length for functions/classes that require docstrings, shorter
180 | # ones are exempt.
181 | docstring-min-length=-1
182 | 
183 | # Naming style matching correct function names.
184 | function-naming-style=snake_case
185 | 
186 | # Regular expression matching correct function names. Overrides function-
187 | # naming-style.
188 | #function-rgx=
189 | 
190 | # Good variable names which should always be accepted, separated by a comma.
191 | good-names=i,
192 | j,
193 | k,
194 | x,
195 | y,
196 | ex,
197 | Run,
198 | _
199 | 
200 | # Good variable names regexes, separated by a comma. If names match any regex,
201 | # they will always be accepted
202 | good-names-rgxs=
203 | 
204 | # Include a hint for the correct naming format with invalid-name.
205 | include-naming-hint=no
206 | 
207 | # Naming style matching correct inline iteration names.
208 | inlinevar-naming-style=any
209 | 
210 | # Regular expression matching correct inline iteration names. Overrides
211 | # inlinevar-naming-style.
212 | #inlinevar-rgx=
213 | 
214 | # Naming style matching correct method names.
215 | method-naming-style=snake_case
216 | 
217 | # Regular expression matching correct method names. Overrides method-naming-
218 | # style.
219 | #method-rgx=
220 | 
221 | # Naming style matching correct module names.
222 | module-naming-style=snake_case
223 | 
224 | # Regular expression matching correct module names. Overrides module-naming-
225 | # style.
226 | #module-rgx=
227 | 
228 | # Colon-delimited sets of names that determine each other's naming style when
229 | # the name regexes allow several styles.
230 | name-group=
231 | 
232 | # Regular expression which should only match function or class names that do
233 | # not require a docstring.
234 | no-docstring-rgx=^_
235 | 
236 | # List of decorators that produce properties, such as abc.abstractproperty. Add
237 | # to this list to register other decorators that produce valid properties.
238 | # These decorators are taken in consideration only for invalid-name.
239 | property-classes=abc.abstractproperty
240 | 
241 | # Naming style matching correct variable names.
242 | variable-naming-style=snake_case
243 | 
244 | # Regular expression matching correct variable names. Overrides variable-
245 | # naming-style.
246 | #variable-rgx=
247 | 
248 | 
249 | [STRING]
250 | 
251 | # This flag controls whether inconsistent-quotes generates a warning when the
252 | # character used as a quote delimiter is used inconsistently within a module.
253 | check-quote-consistency=yes
254 | 
255 | # This flag controls whether the implicit-str-concat should generate a warning
256 | # on implicit string concatenation in sequences defined over several lines.
257 | check-str-concat-over-line-jumps=no
258 | 
259 | 
260 | [FORMAT]
261 | 
262 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
263 | expected-line-ending-format=
264 | 
265 | # Regexp for a line that is allowed to be longer than the limit.
266 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
267 | 
268 | # Number of spaces of indent required inside a hanging or continued line.
269 | indent-after-paren=4
270 | 
271 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
272 | # tab).
273 | indent-string='    '
274 | 
275 | # Maximum number of characters on a single line.
276 | max-line-length=100
277 | 
278 | # Maximum number of lines in a module.
279 | max-module-lines=1000 280 | 281 | # Allow the body of a class to be on the same line as the declaration if body 282 | # contains single statement. 283 | single-line-class-stmt=no 284 | 285 | # Allow the body of an if to be on the same line as the test if there is no 286 | # else. 287 | single-line-if-stmt=no 288 | 289 | 290 | [SPELLING] 291 | 292 | # Limits count of emitted suggestions for spelling mistakes. 293 | max-spelling-suggestions=4 294 | 295 | # Spelling dictionary name. Available dictionaries: none. To make it work, 296 | # install the python-enchant package. 297 | spelling-dict= 298 | 299 | # List of comma separated words that should not be checked. 300 | spelling-ignore-words= 301 | 302 | # A path to a file that contains the private dictionary; one word per line. 303 | spelling-private-dict-file= 304 | 305 | # Tells whether to store unknown words to the private dictionary (see the 306 | # --spelling-private-dict-file option) instead of raising a message. 307 | spelling-store-unknown-words=no 308 | 309 | 310 | [TYPECHECK] 311 | 312 | # List of decorators that produce context managers, such as 313 | # contextlib.contextmanager. Add to this list to register other decorators that 314 | # produce valid context managers. 315 | contextmanager-decorators=contextlib.contextmanager 316 | 317 | # List of members which are set dynamically and missed by pylint inference 318 | # system, and so shouldn't trigger E1101 when accessed. Python regular 319 | # expressions are accepted. 320 | generated-members= 321 | 322 | # Tells whether missing members accessed in mixin class should be ignored. A 323 | # mixin class is detected if its name ends with "mixin" (case insensitive). 324 | ignore-mixin-members=yes 325 | 326 | # Tells whether to warn about missing members when the owner of the attribute 327 | # is inferred to be None. 328 | ignore-none=yes 329 | 330 | # This flag controls whether pylint should warn about no-member and similar 331 | # checks whenever an opaque object is returned when inferring. The inference 332 | # can return multiple potential results while evaluating a Python object, but 333 | # some branches might not be evaluated, which results in partial inference. In 334 | # that case, it might be useful to still emit no-member and other checks for 335 | # the rest of the inferred objects. 336 | ignore-on-opaque-inference=yes 337 | 338 | # List of class names for which member attributes should not be checked (useful 339 | # for classes with dynamically set attributes). This supports the use of 340 | # qualified names. 341 | ignored-classes=optparse.Values,thread._local,_thread._local 342 | 343 | # List of module names for which member attributes should not be checked 344 | # (useful for modules/projects where namespaces are manipulated during runtime 345 | # and thus existing member attributes cannot be deduced by static analysis). It 346 | # supports qualified module names, as well as Unix pattern matching. 347 | ignored-modules= 348 | 349 | # Show a hint with possible names when a member name was not found. The aspect 350 | # of finding the hint is based on edit distance. 351 | missing-member-hint=yes 352 | 353 | # The minimum edit distance a name should have in order to be considered a 354 | # similar match for a missing member name. 355 | missing-member-hint-distance=1 356 | 357 | # The total number of similar names that should be taken in consideration when 358 | # showing a hint for a missing member. 
359 | missing-member-max-choices=1 360 | 361 | # List of decorators that change the signature of a decorated function. 362 | signature-mutators= 363 | 364 | 365 | [SIMILARITIES] 366 | 367 | # Ignore comments when computing similarities. 368 | ignore-comments=yes 369 | 370 | # Ignore docstrings when computing similarities. 371 | ignore-docstrings=yes 372 | 373 | # Ignore imports when computing similarities. 374 | ignore-imports=no 375 | 376 | # Minimum lines number of a similarity. 377 | min-similarity-lines=4 378 | 379 | 380 | [LOGGING] 381 | 382 | # The type of string formatting that logging methods do. `old` means using % 383 | # formatting, `new` is for `{}` formatting. 384 | logging-format-style=old 385 | 386 | # Logging modules to check that the string format arguments are in logging 387 | # function parameter format. 388 | logging-modules=logging 389 | 390 | 391 | [VARIABLES] 392 | 393 | # List of additional names supposed to be defined in builtins. Remember that 394 | # you should avoid defining new builtins when possible. 395 | additional-builtins= 396 | 397 | # Tells whether unused global variables should be treated as a violation. 398 | allow-global-unused-variables=yes 399 | 400 | # List of strings which can identify a callback function by name. A callback 401 | # name must start or end with one of those strings. 402 | callbacks=cb_, 403 | _cb 404 | 405 | # A regular expression matching the name of dummy variables (i.e. expected to 406 | # not be used). 407 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 408 | 409 | # Argument names that match this expression will be ignored. Default to name 410 | # with leading underscore. 411 | ignored-argument-names=_.*|^ignored_|^unused_ 412 | 413 | # Tells whether we should check for unused import in __init__ files. 414 | init-import=no 415 | 416 | # List of qualified module names which can have objects that can redefine 417 | # builtins. 418 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 419 | 420 | 421 | [DESIGN] 422 | 423 | # Maximum number of arguments for function / method. 424 | max-args=5 425 | 426 | # Maximum number of attributes for a class (see R0902). 427 | max-attributes=7 428 | 429 | # Maximum number of boolean expressions in an if statement (see R0916). 430 | max-bool-expr=5 431 | 432 | # Maximum number of branch for function / method body. 433 | max-branches=12 434 | 435 | # Maximum number of locals for function / method body. 436 | max-locals=15 437 | 438 | # Maximum number of parents for a class (see R0901). 439 | max-parents=7 440 | 441 | # Maximum number of public methods for a class (see R0904). 442 | max-public-methods=20 443 | 444 | # Maximum number of return / yield for function / method body. 445 | max-returns=6 446 | 447 | # Maximum number of statements in function / method body. 448 | max-statements=50 449 | 450 | # Minimum number of public methods for a class (see R0903). 451 | min-public-methods=2 452 | 453 | 454 | [IMPORTS] 455 | 456 | # List of modules that can be imported at any level, not just the top level 457 | # one. 458 | allow-any-import-level= 459 | 460 | # Allow wildcard imports from modules that define __all__. 461 | allow-wildcard-with-all=no 462 | 463 | # Analyse import fallback blocks. This can be used to support both Python 2 and 464 | # 3 compatible code, which means that the block might have code that exists 465 | # only in one or another interpreter, leading to false positives when analysed. 
466 | analyse-fallback-blocks=no 467 | 468 | # Deprecated modules which should not be used, separated by a comma. 469 | deprecated-modules=optparse,tkinter.tix 470 | 471 | # Create a graph of external dependencies in the given file (report RP0402 must 472 | # not be disabled). 473 | ext-import-graph= 474 | 475 | # Create a graph of every (i.e. internal and external) dependencies in the 476 | # given file (report RP0402 must not be disabled). 477 | import-graph= 478 | 479 | # Create a graph of internal dependencies in the given file (report RP0402 must 480 | # not be disabled). 481 | int-import-graph= 482 | 483 | # Force import order to recognize a module as part of the standard 484 | # compatibility libraries. 485 | known-standard-library= 486 | 487 | # Force import order to recognize a module as part of a third party library. 488 | known-third-party=enchant 489 | 490 | # Couples of modules and preferred modules, separated by a comma. 491 | preferred-modules= 492 | 493 | 494 | [CLASSES] 495 | 496 | # List of method names used to declare (i.e. assign) instance attributes. 497 | defining-attr-methods=__init__, 498 | __new__, 499 | setUp, 500 | __post_init__ 501 | 502 | # List of member names, which should be excluded from the protected access 503 | # warning. 504 | exclude-protected=_asdict, 505 | _fields, 506 | _replace, 507 | _source, 508 | _make 509 | 510 | # List of valid names for the first argument in a class method. 511 | valid-classmethod-first-arg=cls 512 | 513 | # List of valid names for the first argument in a metaclass class method. 514 | valid-metaclass-classmethod-first-arg=cls 515 | 516 | 517 | [EXCEPTIONS] 518 | 519 | # Exceptions that will emit a warning when being caught. Defaults to 520 | # "BaseException, Exception". 521 | overgeneral-exceptions=builtins.BaseException, 522 | builtins.Exception 523 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.11.9] - 2025-03-03 4 | 5 | ### Added 6 | - parse config file with `-c|--config-file` to set defaults for CLI options 7 | 8 | ## [0.11.8] - 2025-02-25 9 | 10 | ### Added 11 | - add `symlink-bigger` action to replace bigger files of a group with a symlink to the smallest one 12 | 13 | ## [0.11.7] - 2025-02-25 14 | 15 | ### Added 16 | - add `move-first`, `move-second`, `move-biggest` and `move-smallest` actions as options for 17 | `--on-equal` and their shortcuts `m1`, `m2`, `m>` and `m<` along with the `--move-to` and 18 | `--move-recreate-path` options to move files to a different directory 19 | 20 | ## [0.11.6] - 2025-02-24 21 | 22 | ### Updated 23 | - Print warning when specifying `--exec` without `--on-equal exec` 24 | 25 | ## [0.11.5] - 2025-02-21 26 | 27 | ### Added 28 | - crop-resistant hash algorithm with `--algorithm=crop_resistant` 29 | 30 | ### Updated 31 | - Updated dependencies to fix security vulnerabilities 32 | 33 | ## [0.11.4] - 2024-12-16 34 | 35 | ### Updated 36 | - Check for illegal parameter combination `--group` and `--max-distance` 37 | - Explicit support for Python 3.13 by testing it in CI 38 | - Updated dependencies to fix security vulnerabilities 39 | 40 | ## [0.11.3] - 2024-09-11 41 | 42 | ### Updated 43 | - Updated dependencies to fix security vulnerabilities 44 | - Speed up pylint 45 | 46 | ## [0.11.2] - 2024-05-27 47 | 48 | ### Updated 49 | - Updated dependencies to fix security vulnerabilities 50 | 51 | ## [0.11.1] - 2024-03-14 52 | 53 | ### 
Fixed
54 | - https://github.com/lene/DuplicateImages/issues/11: Guarded against error when using `pillow_heif`
55 | module on Mac OS X 12
56 | 
57 | ## [0.11.0] - 2024-01-25
58 | 
59 | ### Added
60 | - Pydoc for modules and classes
61 | 
62 | ## [0.10.9] - 2024-01-25
63 | 
64 | ### Fixed
65 | - Cache file is only written to disk if it is changed
66 | 
67 | ## [0.10.8] - 2024-01-17
68 | 
69 | ### Added
70 | - optional argument to specify the number of threads with `--parallel`
71 | - `--parallel-actions` option to run actions in parallel
72 | - performance optimization when reading the files to compare
73 | 
74 | ## [0.10.7] - 2024-01-13
75 | 
76 | ### Added
77 | - Check that `hash_size` is a power of 2 for `whash` algorithm
78 | 
79 | ## [0.10.6] - 2024-01-12
80 | 
81 | ### Fixed
82 | - Python 3.12 compatibility
83 | - bugfix: guard against OS failures when determining file type
84 | - small memory optimization
85 | 
86 | ## [0.10.5] - 2024-01-12
87 | 
88 | ### Added
89 | - `--exclude-dir` option to exclude directories from scanning
90 | - `--max-image-pixels` option to allow for huge images to bypass `PIL`'s `DecompressionBombError`
91 | 
92 | ## [0.10.4] - 2024-01-11
93 | 
94 | ### Fixed
95 | - Upgrade dependencies to fix security vulnerabilities
96 | 
97 | ## [0.10.3] - 2023-10-05
98 | - Changes to CI only
99 | 
100 | ## [0.10.2] - 2023-10-05
101 | 
102 | ### Fixed
103 | - Upgrade Pillow dependency to 10.0.1 to fix libWebP security vulnerability
104 | - Upgrade GitPython dependency to 3.1.37 to fix security vulnerability
105 | 
106 | ## [0.10.1] - 2023-09-04
107 | 
108 | ### Added
109 | - Upgrade Python dependency to 3.9 to fix security warning about old SciPy version
110 | - create GitLab release automatically for each new tag
111 | 
112 | ### Fixed
113 | - create GitHub release from the correct state
114 | 
115 | ## [0.10.0] - 2023-09-03
116 | 
117 | ### Added
118 | - Store hashing algorithm and parameters in hash-db file to ensure that the same algorithm is used
119 | across separate runs with the same hash-db file
120 | 
121 | ### Changed
122 | - Breaking change in the hash-db file format - files from previous versions are not compatible
123 | 
124 | ## [0.9.2] - 2023-08-26
125 | 
126 | ### Added
127 | - `symlink-smaller` action to replace the smaller files of a group with a symlink to the biggest one
128 | 
129 | ### Changed
130 | - `delete-smaller` and `delete-bigger` actions to `delete-smallest` and `delete-biggest`
131 | 
132 | ## [0.9.1] - 2023-08-23
133 | 
134 | ### Added
135 | - add documentation for new `--group` option
136 | 
137 | ## [0.9.0] - 2023-08-23
138 | 
139 | ### Added
140 | - CLI option `--group`: instead of pairs, treat similar images as groups of arbitrary size
141 | - refactor `ImagePairFinder` to deal more easily with combinations of options
142 | - test coverage for all supported combinations of `--group`/`--parallel`
143 | 
144 | ## [0.8.9] - 2023-08-23
145 | 
146 | ### Added
147 | - create GitHub release automatically for each new tag
148 | - updated and completed developer documentation
149 | 
150 | ## [0.8.8] - 2023-08-23
151 | 
152 | ### Added
153 | - more info in log about runtime and warn about bad decisions
154 | 
155 | ## [0.8.7] - 2023-08-22
156 | 
157 | ### Added
158 | - run bandit SAST scanner in CI and on every push
159 | - fixed some security warnings, intentionally ignored others
160 | - run GitHub dependency scan in GitHub CI on every merge to master and weekly
161 | 
162 | ## [0.8.6] - 2023-08-22
163 | 
164 | ### Added
165 | - Changelog
166 | 
167 | ## [0.8.5] - 2023-08-21
168 | 
169 | 
### Added 170 | - log execution times for scanning and comparing 171 | - code reorganization 172 | 173 | ### Changed 174 | - renamed `--serial` option to `--slow` 175 | 176 | ## [0.8.4] - 2023-08-21 177 | 178 | ### Fixed 179 | - removed an absolute path in test suite 180 | 181 | ## [0.8.3] - 2023-08-21 182 | 183 | ### Added 184 | - updated dependencies to newest versions 185 | - upped Development Status in metadata to Beta 186 | 187 | ### Removed 188 | - support for Python 3.7 189 | 190 | ## [0.8.2] - 2023-08-21 191 | 192 | ### Added 193 | - JSON file format for the image hash persistent store 194 | 195 | ## [0.8.1] - 2023-08-15 196 | 197 | ### Added 198 | - test WEBP and HEIC image formats 199 | 200 | ## [0.8.0] - 2023-08-11 201 | 202 | ### Added 203 | - change algorithm to run in O(N) instead of O(N^2) by using the image hashes as dict keys 204 | - old algorithm still runs if using `--max-distance` switch 205 | - add `--serial` CLI switch to explicitly select old algorithm 206 | - test run script in CI with most relevant CLI parameter combinations 207 | 208 | ### Removed 209 | - `pre-commit` since it causes more trouble than it's worth 210 | 211 | ## [0.7.4] - 2023-08-10 212 | 213 | ### Added 214 | - experiment with `pre-commit` to run commit hooks in a more standardized way 215 | 216 | ## [0.7.3] - 2023-08-10 217 | 218 | ### Added 219 | - more pedantic linting and tests on all supported Python versions in CI 220 | - add MIT license file 221 | 222 | ## [0.7.1] - 2023-02-03 223 | 224 | ### Added 225 | - contributed by [@mreiche](https://github.com/mreiche): support for running any command passed by 226 | `--on-equal` 227 | - contributed by [@mreiche](https://github.com/mreiche): faster MIME detection 228 | - contributed by [@mreiche](https://github.com/mreiche): `print_inline` and `quote_inline` actions 229 | 230 | ## [0.6.5] - 2023-01-02 231 | 232 | ### Added 233 | - contributed by [@beijingjazzpanda](https://gitlab.com/beijingjazzpanda): ensure hash-db `.bak` 234 | files are created properly 235 | - run Codacy and CodeQL security and dependency scans in CI on GitHub 236 | 237 | ## [0.6.4] - 2022-09-23 238 | 239 | ### Added 240 | - `--hash-size` option to fine tune which images are considered equal 241 | - support new `dhash_vertical` and `phash_simple` image hashing methods 242 | - push to GitHub repository from CI when MR is merged 243 | 244 | ## [0.6.2] - 2022-09-04 245 | 246 | ### Added 247 | - code style: enforce single quotes as default 248 | 249 | ## [0.6.1] - 2022-09-02 250 | 251 | ### Added 252 | - `--max-distance` option to fine tune which images are considered equal 253 | 254 | ## [0.6.0] - 2022-07-22 255 | 256 | ### Added 257 | - support HEIC images 258 | - fix dependabot alerts for insecure dependencies 259 | 260 | ## [0.5.3] - 2021-03-16 261 | 262 | ### Added 263 | - add `--quiet` flag to decrease log level 264 | 265 | ## [0.5.2] - 2021-03-16 266 | 267 | ### Added 268 | - add `d1` and `d2` action shortcuts 269 | 270 | ## [0.5.1] - 2021-03-15 271 | 272 | ### Added 273 | - update documentation for new `--hash-db` CLI parameter 274 | 275 | ## [0.5.0] - 2021-03-15 276 | 277 | ### Added 278 | - store the image hashes in a pickle file between runs for a major speedup 279 | - run tests in parallel 280 | 281 | ## [0.4.1] - 2021-01-17 282 | 283 | ### Added 284 | - display a progress bar while calculating 285 | 286 | ## [0.4.0] - 2021-01-16 287 | 288 | ### Added 289 | - automatically publish to PyPI from CI when MR is merged 290 | - reorganize code 291 | 292 | ## [0.3.6] - 2021-01-16 
293 | 294 | ### Added 295 | - update homepage and description in project metadata 296 | 297 | ## [0.3.5] - 2021-01-16 298 | 299 | ### Added 300 | - change master repository to https://github.com/lene/DuplicateImages.git 301 | 302 | ## [0.3.4] - 2021-01-16 303 | 304 | ### Added 305 | - improve log formatting 306 | - add option to print matching files with quotes, as well as `d>` and `d<` shortcuts 307 | 308 | ## [0.3.2] - 2021-01-16 309 | 310 | ### Added 311 | - use `coloredlogs` and improve log formatting 312 | 313 | ## [0.3.1] - 2021-01-16 314 | 315 | ### Added 316 | - handle error for broken image files 317 | - use `logging` instead of `print()` for output 318 | 319 | ## [0.3.0] - 2021-01-16 320 | 321 | ### Added 322 | - actions to delete bigger/smaller image and view with `eog` 323 | - fuzziness parameter to adjust desired similarity 324 | 325 | ## [0.2.1] - 2021-01-15 326 | 327 | ### Added 328 | - documentation for parallel execution 329 | 330 | ## [0.2.0] - 2021-01-15 331 | 332 | ### Added 333 | - additionally use [ImageHash](https://pypi.org/project/ImageHash) to compare images 334 | - run `pylint` against code 335 | 336 | ## 0.1 - 2021-01-08 337 | 338 | ### Added 339 | - exact and histogram comparison 340 | - actions if equal: delete one of the pics, view with `xv` or print 341 | 342 | 343 | [0.11.9]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.8...0.11.9 344 | [0.11.8]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.7...0.11.8 345 | [0.11.7]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.6...0.11.7 346 | [0.11.6]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.5...0.11.6 347 | [0.11.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.4...0.11.5 348 | [0.11.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.3...0.11.4 349 | [0.11.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.2...0.11.3 350 | [0.11.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.1...0.11.2 351 | [0.11.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.0...0.11.1 352 | [0.11.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.9...0.11.0 353 | [0.10.9]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.8...0.10.9 354 | [0.10.8]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.7...0.10.8 355 | [0.10.7]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.6...0.10.7 356 | [0.10.6]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.5...0.10.6 357 | [0.10.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.4...0.10.5 358 | [0.10.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.3...0.10.4 359 | [0.10.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.2...0.10.3 360 | [0.10.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.1...0.10.2 361 | [0.10.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.0...0.10.1 362 | [0.10.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.9.2...0.10.0 363 | [0.9.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.9.1...0.9.2 364 | [0.9.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.9.0...0.9.1 365 | [0.9.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.9...0.9.0 366 | [0.8.9]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.8...0.8.9 367 | [0.8.8]: 
https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.7...0.8.8
368 | [0.8.7]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.6...0.8.7
369 | [0.8.6]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.5...0.8.6
370 | [0.8.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.4...0.8.5
371 | [0.8.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.3...0.8.4
372 | [0.8.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.2...0.8.3
373 | [0.8.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.1...0.8.2
374 | [0.8.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.0...0.8.1
375 | [0.8.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.7.4...0.8.0
376 | [0.7.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.7.3...0.7.4
377 | [0.7.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.7.1...0.7.3
378 | [0.7.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.5...0.7.1
379 | [0.6.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.4...0.6.5
380 | [0.6.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.2...0.6.4
381 | [0.6.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.1...0.6.2
382 | [0.6.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.0...0.6.1
383 | [0.6.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.5.3...0.6.0
384 | [0.5.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.5.2...0.5.3
385 | [0.5.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.5.1...0.5.2
386 | [0.5.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.5.0...0.5.1
387 | [0.5.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.4.1...0.5.0
388 | [0.4.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.4.0...0.4.1
389 | [0.4.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.6...0.4.0
390 | [0.3.6]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.5...0.3.6
391 | [0.3.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.4...0.3.5
392 | [0.3.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.2...0.3.4
393 | [0.3.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.1...0.3.2
394 | [0.3.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.0...0.3.1
395 | [0.3.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.2.1...0.3.0
396 | [0.2.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.2.0...0.2.1
397 | [0.2.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/tags/0.2.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright © 2023
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
6 | associated documentation files (the “Software”), to deal in the Software without restriction,
7 | including without limitation the rights to use, copy, modify, merge, publish, distribute,
8 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in all copies or substantial
substantial 12 | portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 15 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 16 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 17 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Finding Duplicate Images 2 | 3 | Finds equal or similar images in a directory containing (many) image files. 4 | 5 | Official home page: https://github.com/lene/DuplicateImages 6 | 7 | Development page: https://gitlab.com/lilacashes/DuplicateImages 8 | 9 | PyPI page: https://pypi.org/project/duplicate-images 10 | 11 | ## Usage 12 | 13 | Installing: 14 | ```shell 15 | $ pip install duplicate_images 16 | ``` 17 | 18 | Printing the help screen: 19 | ```shell 20 | $ find-dups -h 21 | ``` 22 | 23 | Quick test run: 24 | ```shell 25 | $ find-dups $IMAGE_ROOT 26 | ``` 27 | 28 | Typical usage: 29 | ```shell 30 | $ find-dups $IMAGE_ROOT --parallel --progress --hash-db hashes.json 31 | ``` 32 | 33 | ### Supported image formats 34 | 35 | * JPEG and PNG (tested quite thoroughly) 36 | * HEIC (experimental support, tested cursorily only) 37 | * All other 38 | [formats supported](https://pillow.readthedocs.io/en/latest/handbook/image-file-formats.html) by 39 | the `pillow` Python Imaging Library should work, but are not specifically tested. 40 | 41 | #### Explicitly allow huge images 42 | 43 | The `PIL` image library, which is used as the backend, limits the size of images to 178956970 pixels by 44 | default, to guard against memory exhaustion. For larger images, specify the maximum image size in 45 | pixels with the `--max-image-pixels` option. 46 | 47 | ### Image comparison algorithms 48 | 49 | Use the `--algorithm` option to select how equal images are found. The default algorithm is `phash`. 50 | 51 | `ahash`, `colorhash`, `dhash`, `dhash_vertical`, `phash`, `phash_simple`, `whash`, `crop_resistant`: 52 | eight different image hashing algorithms. See https://pypi.org/project/ImageHash for an introduction 53 | to image hashing and https://tech.okcupid.com/evaluating-perceptual-image-hashes-at-okcupid-e98a3e74aa3a 54 | for some gory details on which image hashing algorithm performs best in which situation. For a start I 55 | recommend using `phash`, and only evaluating the other algorithms if `phash` does not perform 56 | satisfactorily in your use case. 57 | 58 | ### Image similarity threshold configuration 59 | 60 | Use the `--hash-size` parameter to tune the precision of the hashing algorithms. For the `colorhash` 61 | algorithm the hash size is interpreted as the number of bin bits and defaults to 3. For all other 62 | algorithms the hash size defaults to 8. For `whash` it must be a power of 2. 63 | 64 | Use the `--max-distance` parameter to tune how close images should be to be considered duplicates. 65 | The argument is a positive integer. Its value is highly dependent on the algorithm used and the 66 | nature of the images compared, so the best value for your use case can only be found through 67 | experimentation.
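To get a feel for what these two parameters control, here is a minimal sketch that computes a hash distance directly with the `imagehash` library which `find-dups` builds on. The file names and the distance threshold of 4 are placeholders, not recommendations:

```python
# Minimal sketch: how --hash-size and --max-distance relate, using the
# imagehash library directly. '1.jpg', '2.jpg' and the threshold 4 are
# placeholders, not recommendations.
from PIL import Image
import imagehash

hash1 = imagehash.phash(Image.open('1.jpg'), hash_size=8)  # --hash-size 8
hash2 = imagehash.phash(Image.open('2.jpg'), hash_size=8)
distance = hash1 - hash2  # Hamming distance between the two hashes
print(distance <= 4)      # True if considered duplicates at --max-distance 4
```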
68 | 69 | **NOTE:** using the `--max-distance` parameter slows down the comparison considerably with large 70 | image collections, making the runtime complexity go from O(N) to O(N²). If you want to 71 | scan collections with at least thousands of images, it is highly recommended to tune the desired 72 | similarity threshold with the `--hash-size` parameter alone, if that is at all possible. 73 | 74 | 75 | **NOTE:** the `--max-distance` parameter conflicts with the `--group` parameter. You can only use 76 | one at a time. 77 | 78 | ### Pre-storing and using image hashes to speed up computation 79 | 80 | Use the `--hash-db ${FILE}.json` or `--hash-db ${FILE}.pickle` option to store image hashes in the 81 | file `$FILE` in JSON or Pickle format and read image hashes from that file if they are already 82 | present there. This avoids having to compute the image hashes anew at every run and can 83 | significantly speed up run times. 84 | 85 | ### Handling matching images either as pairs or as groups 86 | 87 | By default, matching images are presented as pairs. With the `--group` CLI option, they are handled 88 | as a group containing all matching images. 89 | 90 | Example: `1.jpg`, `2.jpg` and `3.jpg` in the current folder `.` are equal. 91 | 92 | ```shell 93 | $ find-dups . 94 | 1.jpg 2.jpg 95 | 1.jpg 3.jpg 96 | 2.jpg 3.jpg 97 | $ find-dups . --group 98 | 1.jpg 2.jpg 3.jpg 99 | ``` 100 | 101 | ### Actions for matching image groups 102 | 103 | Use the `--on-equal` option to select what to do with pairs of equal images. The default action is 104 | `print`. 105 | - `delete-first` or `d1`: deletes the first of the files in the group 106 | - `delete-last` or `dl`: deletes the last of the files in the group 107 | - `delete-biggest` or `d>`: deletes the file with the biggest size 108 | - `delete-smallest` or `d<`: deletes the file with the smallest size 109 | - `move-first` or `m1`: moves the first of the files in the group to the folder specified with the 110 | `--move-to` option 111 | - `move-last` or `ml`: moves the last of the files in the group to the folder specified with the 112 | `--move-to` option 113 | - `move-biggest` or `m>`: moves the file with the biggest size to the folder specified with the 114 | `--move-to` option 115 | - `move-smallest` or `m<`: moves the file with the smallest size to the folder specified with the 116 | `--move-to` option 117 | - `symlink-smaller`: deletes the smaller files and replaces them with symlinks to the biggest file 118 | - `symlink-bigger`: deletes the bigger files and replaces them with symlinks to the smallest file 119 | - `eog`: launches the `eog` image viewer to compare the files in the group (*deprecated* by `exec`) 120 | - `xv`: launches the `xv` image viewer to compare the files in the group (*deprecated* by `exec`) 121 | - `print`: prints the files in the group 122 | - `print_inline`: like `print` but without newline 123 | - `quote`: prints the files in the group quoted for POSIX shells 124 | - `quote_inline`: like `quote` but without newline 125 | - `exec`: executes a command (see `--exec` argument below) 126 | - `none`: does nothing; may be useful for benchmarking and testing 127 | 128 | The `move-*` actions require the `--move-to` option to specify the target folder. Additionally, the 129 | `--move-recreate-path` option can be set to reproduce the directory structure of the source files in 130 | the target folder. 131 | 132 | The `--exec` argument allows calling another program when the `--on-equal exec` option is given. 133 | You can pass a command line string like `--exec "program {1} {2}"` where `{1}` and `{2}` are 134 | replaced by the matching pair files (or the first two files in a group), quoted so the shell recognizes 135 | the files properly. The wildcard `{*}` expands to all files in a matching group, which when called 136 | with the `--group` argument may be more than two images considered equal.
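For reference, a simplified sketch of this substitution, mirroring `shell_exec()` in `duplicate_images/methods.py`:

```python
# Simplified sketch of the {1}, {2} and {*} substitution performed for
# --on-equal exec, mirroring shell_exec() in duplicate_images/methods.py.
from shlex import quote
from typing import Tuple

def substitute(cmd: str, group: Tuple[str, ...]) -> str:
    for num, path in enumerate(group):
        cmd = cmd.replace(f'{{{num + 1}}}', quote(path))  # {1}, {2}, ...
    return cmd.replace('{*}', ' '.join(quote(path) for path in group))

print(substitute('program {1} {2}', ('a 1.jpg', 'b.jpg')))
# program 'a 1.jpg' b.jpg
```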
137 | 138 | #### Examples: 139 | * `--exec "open -a Preview -W {1} {2}"`: Opens the files in the macOS Preview app and waits for it. 140 | * `--exec "ls -s {*}"`: Prints the size (in blocks) next to all files. 141 | * `--exec 'for i in {*}; do dirname $i; basename $i; done'`: Shows the directory and the filename 142 | separately for all files. 143 | 144 | ### Parallel execution 145 | 146 | Use the `--parallel` option to utilize all free cores on your system for calculating image hashes. 147 | Optionally, you can specify the number of processes to use with `--parallel $N`. 148 | 149 | To execute the `--on-equal` actions in parallel, use the `--parallel-actions` option, which can also 150 | take an optional number of processes to use as argument. 151 | 152 | ### Excluding subfolders 153 | 154 | Use the `--exclude-dir` option to exclude subfolders of `$IMAGE_ROOT` from the search. The argument 155 | is a regular expression matching the subfolder names to be excluded. Multiple arguments can be 156 | passed to `--exclude-dir` to exclude multiple subfolders. 157 | 158 | The argument(s) given to `--exclude-dir` may be regular expressions. These regular expressions are 159 | matched only against the directory name, not the file name. 160 | 161 | #### Examples 162 | 163 | Exclude subfolder `$IMAGE_ROOT/foo`: 164 | ```shell 165 | $ find-dups $IMAGE_ROOT --exclude-dir $IMAGE_ROOT/foo 166 | ``` 167 | Exclude all subfolders named `foo` or `bar`: 168 | ```shell 169 | $ find-dups $IMAGE_ROOT --exclude-dir foo bar 170 | ``` 171 | 172 | ### Slow execution 173 | 174 | `find-dups` can also use an alternative algorithm which exhaustively compares all images to each 175 | other, being O(N²) in the number of images. This algorithm is selected automatically if 176 | `--max-distance` is not 0. 177 | 178 | You can use the `--slow` option to select this alternative algorithm explicitly. The `--slow` switch 179 | is mutually exclusive with the `--group` switch. 180 | 181 | ### Progress bar and verbosity control 182 | 183 | - `--progress` prints a progress bar each for the process of reading the images and the process of 184 | finding duplicates among the scanned images 185 | - `--debug` prints debugging output 186 | - `--quiet` decreases the log level by 1 for each time it is called; `--debug` and `--quiet` cancel 187 | each other out (see the sketch after this list)
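The level arithmetic behind those two flags is a one-liner; this sketch mirrors `setup_logging()` in `duplicate_images/log.py`:

```python
# How --debug and --quiet combine into a log level, mirroring
# setup_logging() in duplicate_images/log.py.
import logging

def effective_log_level(debug: bool, quiet: int) -> int:
    level = logging.DEBUG if debug else logging.INFO
    return level + quiet * (logging.INFO - logging.DEBUG)  # one step up per --quiet

assert effective_log_level(debug=False, quiet=1) == logging.WARNING
assert effective_log_level(debug=True, quiet=1) == logging.INFO  # they cancel out
```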
188 | 189 | ## Development notes 190 | 191 | Needs Python 3, the Pillow imaging library and the `pillow-heif` HEIF plugin to run, plus Wand for 192 | the test suite. 193 | 194 | Uses Poetry for dependency management. 195 | 196 | ### Installation 197 | 198 | From source: 199 | ```shell 200 | $ git clone https://gitlab.com/lilacashes/DuplicateImages.git 201 | $ cd DuplicateImages 202 | $ pip3 install poetry 203 | $ poetry install 204 | ``` 205 | 206 | ### Running 207 | 208 | ```shell 209 | $ poetry run find-dups $PICTURE_DIR 210 | ``` 211 | or 212 | ```shell 213 | $ poetry run find-dups -h 214 | ``` 215 | for a list of all possible options. 216 | 217 | ### Test suite 218 | 219 | Running it all: 220 | ```shell 221 | $ poetry run pytest 222 | $ poetry run mypy duplicate_images tests 223 | $ poetry run flake8 224 | $ poetry run pylint duplicate_images tests 225 | $ poetry run bandit -r duplicate_images 226 | ``` 227 | or simply 228 | ```shell 229 | $ .git_hooks/pre-push 230 | ``` 231 | Setting the test suite to be run before every push: 232 | ```shell 233 | $ cd .git/hooks 234 | $ ln -s ../../.git_hooks/pre-push . 235 | ``` 236 | 237 | ### Publishing 238 | 239 | A tag is created and the new version is published automatically by GitLab CI on every successful 240 | merge to `master`. 241 | 242 | #### Prerequisites 243 | 244 | For every Merge Request to `master` it is checked that: 245 | - the `version` number in `pyproject.toml` is not an already existing git tag 246 | - the `CHANGELOG.md` contains an entry for the current version number 247 | 248 | #### PyPI 249 | 250 | There is a job in GitLab CI for publishing to `pypi.org` that runs as soon as a new tag is added, 251 | which happens automatically whenever an MR is merged. The tag is the same as the `version` in the 252 | `pyproject.toml` file. For every MR it needs to be ensured that the `version` is not the same as an 253 | already existing tag. 254 | 255 | To publish the package on PyPI manually: 256 | ```shell 257 | $ poetry config repositories.testpypi https://test.pypi.org/legacy/ 258 | $ poetry build 259 | $ poetry publish --username $PYPI_USER --password $PYPI_PASSWORD --repository testpypi && \ 260 | poetry publish --username $PYPI_USER --password $PYPI_PASSWORD 261 | ``` 262 | (obviously assuming here that username and password are the same on PyPI and TestPyPI) 263 | 264 | #### Updating GitHub mirror 265 | 266 | The GitHub repo `git@github.com:lene/DuplicateImages.git` is set up as a push mirror in GitLab CI, 267 | but mirroring is flaky at the time of writing and may or may not succeed. The CI job `PushToGithub` 268 | should take care of mirroring to GitHub after every merge to `master`. 269 | 270 | To push to the GitHub repository manually (assuming the GitHub repo is set up as remote `github`): 271 | ```shell 272 | $ git checkout master 273 | $ git fetch 274 | $ git pull --rebase 275 | $ git tag # to check that the latest tag is present 276 | $ git push --tags github master 277 | ``` 278 | 279 | #### Creating Releases on GitHub 280 | 281 | The CI job `CreateGithubRelease` creates a Release on GitHub, which can then be found under 282 | https://github.com/lene/DuplicateImages/releases.
283 | 284 | ### Profiling 285 | 286 | #### CPU time 287 | To show the top functions by time spent, including called functions: 288 | ```shell 289 | $ poetry run python -m cProfile -s cumtime ./duplicate_images/duplicate.py \ 290 | --algorithm $ALGORITHM --on-equal none $IMAGE_DIR 2>&1 | head -n 15 291 | ``` 292 | or, to show the top functions by time spent in the function alone: 293 | ```shell 294 | $ poetry run python -m cProfile -s tottime ./duplicate_images/duplicate.py \ 295 | --algorithm $ALGORITHM --on-equal none $IMAGE_DIR 2>&1 | head -n 15 296 | ``` 297 | 298 | #### Memory usage 299 | ```shell 300 | $ poetry run fil-profile run ./duplicate_images/duplicate.py \ 301 | --algorithm $ALGORITHM --on-equal none $IMAGE_DIR 2>&1 302 | ``` 303 | This will open a browser window showing the functions using the most memory (see 304 | https://pypi.org/project/filprofiler for more details). 305 | 306 | ## Contributors 307 | 308 | - Lene Preuss (https://github.com/lene): primary developer 309 | - Mike Reiche (https://github.com/mreiche): support for arbitrary actions, speedups 310 | - https://github.com/beijingjazzpanda: bug fix 311 | -------------------------------------------------------------------------------- /duplicate_images/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Given a big set of images, find duplicate and similar images 3 | """ 4 | -------------------------------------------------------------------------------- /duplicate_images/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions used in multiple places 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | from functools import wraps 8 | from pathlib import Path 9 | from time import time 10 | 11 | 12 | def path_with_parent(path: Path) -> str: 13 | return '/'.join(str(path).rstrip('/').split('/')[-2:]) 14 | 15 | 16 | def log_execution_time(): 17 | def actual_decorator(method): 18 | @wraps(method) 19 | def allow_fail(self, *args, **kwargs): 20 | start_time = time() 21 | return_value = method(self, *args, **kwargs) 22 | logging.info( 23 | '%s.%s() run in %.2fs', 24 | type(self).__name__, method.__name__, 25 | time() - start_time 26 | ) 27 | return return_value 28 | 29 | return allow_fail 30 | 31 | return actual_decorator 32 | -------------------------------------------------------------------------------- /duplicate_images/duplicate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | The main script for the `find-dups` command line tool.
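Typical invocation: find-dups $IMAGE_ROOT --parallel --progress --hash-db hashes.json (see README.md for the full set of options).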
4 | """ 5 | 6 | import logging 7 | import re 8 | from argparse import Namespace 9 | from multiprocessing.pool import ThreadPool 10 | from os import walk, access, R_OK 11 | from pathlib import Path 12 | from typing import Callable, List, Optional 13 | 14 | import PIL.Image 15 | from filetype import guess 16 | from pillow_heif import register_heif_opener 17 | 18 | from duplicate_images.common import path_with_parent, log_execution_time 19 | from duplicate_images.function_types import Results, ImageGroup, ActionFunction 20 | from duplicate_images.hash_store import FileHashStore 21 | from duplicate_images.image_pair_finder import ImagePairFinder, PairFinderOptions 22 | from duplicate_images.log import setup_logging 23 | from duplicate_images.methods import ACTIONS_ON_EQUALITY, IMAGE_HASH_ALGORITHM, get_hash_size_kwargs 24 | from duplicate_images.parse_commandline import parse_command_line 25 | 26 | try: 27 | register_heif_opener() 28 | except ImportError as error: 29 | logging.warning('HEIF support not available: %s', error) 30 | logging.warning('See https://github.com/lene/DuplicateImages/issues/11 for details') 31 | 32 | 33 | def is_image_file(filename: Path) -> bool: 34 | """Returns True if filename is a readable image file""" 35 | try: 36 | if access(filename, R_OK) and not filename.is_symlink(): 37 | kind = guess(filename) 38 | return kind is not None and kind.mime.startswith('image/') 39 | except OSError as err: 40 | logging.warning('Skipping %s: %s', path_with_parent(filename), err) 41 | return False 42 | 43 | 44 | def folder_matches(filename: Path, regex: re.Pattern) -> bool: 45 | return bool(re.search(regex, str(filename.parent))) 46 | 47 | 48 | @log_execution_time() 49 | def files_in_dirs( 50 | dir_names: List[Path], is_relevant: Callable[[Path], bool] = lambda f: f.is_file(), 51 | exclude_regexes: Optional[List[str]] = None 52 | ) -> List[Path]: 53 | """ 54 | Returns a list of all files in the directories dir_names (recursively scanning subdirectories), 55 | which satisfy the condition is_relevant. If exclude_regexes is given, files in directories 56 | matching any of the regular expressions are excluded.
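Illustrative example: files_in_dirs([Path('/pics')], is_image_file, [r'\.git']) lists all readable image files under /pics whose parent directory path does not match the regex '\.git'.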
57 | """ 58 | exclude_compiled = [re.compile(regex) for regex in exclude_regexes or []] 59 | unfiltered = ( 60 | Path(root) / filename 61 | for dir_name in dir_names 62 | for root, _, filenames in walk(dir_name) 63 | for filename in filenames 64 | if not any(folder_matches(Path(root) / filename, regex) for regex in exclude_compiled) 65 | ) 66 | # astonishingly, filtering in a separate step is faster than in the generator expression 67 | return [file for file in unfiltered if is_relevant(file)] 68 | 69 | 70 | def get_matches( 71 | root_directories: List[Path], algorithm: str, 72 | options: PairFinderOptions = PairFinderOptions(), 73 | hash_store_path: Optional[Path] = None, 74 | exclude_regexes: Optional[List[str]] = None 75 | ) -> Results: 76 | hash_algorithm = IMAGE_HASH_ALGORITHM[algorithm] 77 | hash_size_kwargs = get_hash_size_kwargs(hash_algorithm, options.hash_size) 78 | image_files = files_in_dirs(root_directories, is_image_file, exclude_regexes) 79 | logging.info('%d total files', len(image_files)) 80 | image_files.sort() 81 | logging.info('Computing image hashes') 82 | 83 | with FileHashStore.create(hash_store_path, algorithm, hash_size_kwargs) as hash_store: 84 | return ImagePairFinder.create( 85 | image_files, hash_algorithm, options=options, hash_store=hash_store, 86 | ).get_equal_groups() 87 | 88 | 89 | def execute_actions(matches: Results, args: Namespace) -> None: 90 | action_equal = ACTIONS_ON_EQUALITY[args.on_equal] 91 | if args.parallel_actions: 92 | with ThreadPool(args.parallel_actions) as pool: 93 | pool.map(lambda group: execute_action(action_equal, group, args), matches) 94 | else: 95 | for group in sorted(matches): 96 | execute_action(action_equal, group, args) 97 | 98 | 99 | def execute_action(action: ActionFunction, group: ImageGroup, args: Namespace) -> None: 100 | try: 101 | action(args, group) 102 | except FileNotFoundError: 103 | pass 104 | 105 | 106 | def set_max_image_pixels(args: Namespace) -> None: 107 | if args.max_image_pixels is not None: 108 | PIL.Image.MAX_IMAGE_PIXELS = args.max_image_pixels 109 | 110 | 111 | def main() -> None: 112 | args = parse_command_line() 113 | setup_logging(args) 114 | set_max_image_pixels(args) 115 | options = PairFinderOptions.from_args(args) 116 | for folder in args.root_directory: 117 | logging.info( 118 | 'Scanning %s %s', path_with_parent(folder), 119 | f'(excluding {", ".join(args.exclude_dir)})' if args.exclude_dir else '' 120 | ) 121 | try: 122 | matches = get_matches( 123 | [Path(folder) for folder in args.root_directory], args.algorithm, 124 | options=options, hash_store_path=Path(args.hash_db) if args.hash_db else None, 125 | exclude_regexes=list(args.exclude_dir) if args.exclude_dir else None 126 | ) 127 | logging.info('%d matches', len(matches)) 128 | execute_actions(matches, args) 129 | except KeyboardInterrupt: 130 | pass 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /duplicate_images/function_types.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shorter and more descriptive type aliases used in static type checking for the 3 | `duplicate_images` package. 
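The central alias is Hash, a union of ImageHash and ImageMultiHash as returned by the imagehash library; Results is the list of matching image groups produced by the pair finders.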
4 | """ 5 | __author__ = 'Lene Preuss ' 6 | 7 | from argparse import Namespace 8 | from pathlib import Path 9 | from typing import Any, Callable, Dict, List, Optional, Tuple, Generator, Union 10 | 11 | from PIL import Image 12 | from imagehash import ImageHash, ImageMultiHash 13 | 14 | Hash = Union[ImageHash, ImageMultiHash] 15 | HashFunction = Callable[[Image.Image], Hash] 16 | ImageGroup = Tuple[Path, ...] 17 | ActionFunction = Callable[[Namespace, ImageGroup], Any] 18 | Results = List[ImageGroup] 19 | ResultsGenerator = Generator[List[Path], None, None] 20 | ResultsGrouper = Callable[[ResultsGenerator], Results] 21 | CacheEntry = Tuple[Path, Optional[Hash]] 22 | Cache = Dict[Path, Hash] 23 | 24 | 25 | def is_hash(x: Any) -> bool: 26 | return isinstance(x, (ImageHash, ImageMultiHash)) 27 | -------------------------------------------------------------------------------- /duplicate_images/hash_scanner/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functionality to compute and store the image hashes of a set of images 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | from duplicate_images.hash_scanner.image_hash_scanner import ( 7 | ImageHashScanner, ParallelImageHashScanner 8 | ) 9 | -------------------------------------------------------------------------------- /duplicate_images/hash_scanner/image_hash_scanner.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calculate the image hashes of a given set of images 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | import os 8 | 9 | from multiprocessing.pool import ThreadPool 10 | from pathlib import Path 11 | from typing import List, Optional, Dict 12 | 13 | from PIL import Image 14 | from PIL.Image import DecompressionBombError 15 | 16 | from duplicate_images.common import path_with_parent 17 | from duplicate_images.function_types import CacheEntry, HashFunction 18 | from duplicate_images.hash_store import HashStore, NullHashStore 19 | from duplicate_images.methods import get_hash_size_kwargs 20 | from duplicate_images.pair_finder_options import PairFinderOptions 21 | from duplicate_images.progress_bar_manager import ProgressBarManager, NullProgressBarManager 22 | 23 | 24 | class ImageHashScanner: 25 | """ 26 | Reads images from the given list of files and calculates their image hashes, 27 | using a single thread only 28 | """ 29 | 30 | @staticmethod 31 | def create( 32 | files: List[Path], hash_algorithm: HashFunction, 33 | options: PairFinderOptions, 34 | hash_store: HashStore = NullHashStore(), 35 | progress_bars: ProgressBarManager = NullProgressBarManager() 36 | ) -> 'ImageHashScanner': 37 | hash_size_kwargs = get_hash_size_kwargs(hash_algorithm, options.hash_size) 38 | if not options.parallel: 39 | return ImageHashScanner( 40 | files, hash_algorithm, hash_size_kwargs, hash_store, progress_bars 41 | ) 42 | return ParallelImageHashScanner( 43 | files, hash_algorithm, hash_size_kwargs, hash_store, progress_bars, 44 | options.parallel 45 | ) 46 | 47 | def __init__( # pylint: disable = too-many-arguments,too-many-positional-arguments 48 | self, files: List[Path], hash_algorithm: HashFunction, 49 | hash_size_kwargs: Optional[Dict] = None, 50 | hash_store: HashStore = NullHashStore(), 51 | progress_bars: ProgressBarManager = NullProgressBarManager() 52 | ) -> None: 53 | self.files = files 54 | self.algorithm = hash_algorithm 55 | self.hash_size_kwargs = hash_size_kwargs if hash_size_kwargs is not None else {} 56 | 
self.hash_store = hash_store 57 | self.progress_bars = progress_bars 58 | logging.info('Using %s', self.class_string()) 59 | 60 | def class_string(self) -> str: 61 | return self.__class__.__name__ 62 | 63 | def precalculate_hashes(self) -> List[CacheEntry]: 64 | return [self.get_hash(file) for file in self.files] 65 | 66 | def get_hash(self, file: Path) -> CacheEntry: 67 | self.progress_bars.update_reader() 68 | try: 69 | cached = self.hash_store.get(file) 70 | if cached is not None: 71 | return file, cached 72 | 73 | image_hash = self.algorithm(Image.open(file), **self.hash_size_kwargs) 74 | self.hash_store.add(file, image_hash) 75 | return file, image_hash 76 | except OSError as err: 77 | logging.warning('%s: %s', path_with_parent(file), err) 78 | return file, None 79 | except DecompressionBombError as err: 80 | logging.warning('%s: %s', path_with_parent(file), err) 81 | logging.warning('To process this file, use the --max-image-pixels option') 82 | return file, None 83 | 84 | 85 | class ParallelImageHashScanner(ImageHashScanner): 86 | """ 87 | Reads images from the given list of files and calculates their image hashes, 88 | using a specified number of threads in parallel 89 | """ 90 | 91 | def __init__( # pylint: disable = too-many-arguments,too-many-positional-arguments 92 | self, 93 | files: List[Path], hash_algorithm: HashFunction, 94 | hash_size_kwargs: Optional[Dict] = None, 95 | hash_store: HashStore = NullHashStore(), 96 | progress_bars: ProgressBarManager = NullProgressBarManager(), 97 | parallel: int = os.cpu_count() or 1 98 | ) -> None: 99 | self.num_threads = parallel 100 | super().__init__(files, hash_algorithm, hash_size_kwargs, hash_store, progress_bars) 101 | 102 | def class_string(self) -> str: 103 | return f'{self.__class__.__name__} with {self.num_threads} threads' 104 | 105 | def precalculate_hashes(self) -> List[CacheEntry]: 106 | with ThreadPool(self.num_threads) as pool: 107 | return pool.map(self.get_hash, self.files) 108 | -------------------------------------------------------------------------------- /duplicate_images/hash_store.py: -------------------------------------------------------------------------------- 1 | """ 2 | Persistent storage for calculated image hashes 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import json 7 | import logging 8 | import pickle # nosec 9 | from pathlib import Path 10 | from typing import Any, IO, Callable, Optional, Union, Dict, Tuple 11 | 12 | from imagehash import hex_to_hash 13 | 14 | from duplicate_images.common import log_execution_time 15 | from duplicate_images.function_types import Cache, Hash, is_hash 16 | 17 | 18 | class NullHashStore: 19 | """ 20 | Hash store that does not store anything but can be used as a drop-in 21 | replacement for `FileHashStore` and `PickleHashStore` when no persistent 22 | storage is desired 23 | """ 24 | 25 | def __init__(self) -> None: 26 | logging.info('No persistent storage for calculated image hashes set up') 27 | 28 | def __enter__(self) -> 'NullHashStore': 29 | return self 30 | 31 | def __exit__(self, _: Any, __: Any, ___: Any) -> None: 32 | pass 33 | 34 | def get(self, _: Path) -> Optional[Hash]: 35 | return None 36 | 37 | def add(self, _: Path, __: Hash) -> None: 38 | pass 39 | 40 | 41 | HashStore = Union[NullHashStore, 'FileHashStore', 'PickleHashStore', 'JSONHashStore'] 42 | 43 | 44 | class FileHashStore: 45 | """ 46 | Base class for persistent storage of calculated image hashes, providing all 47 | necessary functionality except for reading and writing data to various file 48 | 
formats 49 | """ 50 | @staticmethod 51 | def create( 52 | store_path: Optional[Path], algorithm: str, hash_size_kwargs: Dict 53 | ) -> Union['FileHashStore', NullHashStore]: 54 | if store_path is None: 55 | return NullHashStore() 56 | if store_path.suffix == '.pickle': 57 | return PickleHashStore(store_path, algorithm, hash_size_kwargs) 58 | return JSONHashStore(store_path, algorithm, hash_size_kwargs) 59 | 60 | def __init__(self, store_path: Path, algorithm: str, hash_size_kwargs: Dict) -> None: 61 | self.store_path = store_path 62 | self.algorithm = algorithm 63 | self.hash_size_kwargs = hash_size_kwargs 64 | self.values: Cache = {} 65 | self.dirty: bool = False 66 | try: 67 | self.load() 68 | logging.info( 69 | 'Opened persistent storage %s with %d entries', store_path, len(self.values) 70 | ) 71 | except (FileNotFoundError, EOFError, pickle.PickleError): 72 | logging.info('Creating new %s at %s', self.__class__.__name__, store_path) 73 | 74 | def __enter__(self) -> 'FileHashStore': 75 | return self 76 | 77 | def __exit__(self, _: Any, __: Any, ___: Any) -> None: 78 | if not self.dirty: 79 | return 80 | if self.store_path.is_file(): 81 | if self.store_path.with_suffix('.bak').is_file(): 82 | self.store_path.with_suffix('.bak').unlink() 83 | self.store_path.rename(self.store_path.with_suffix('.bak')) 84 | self.dump() 85 | 86 | def add(self, file: Path, image_hash: Hash) -> None: 87 | self.values[file] = image_hash 88 | self.dirty = True 89 | 90 | def get(self, file: Path) -> Optional[Hash]: 91 | return self.values.get(file) 92 | 93 | def metadata(self) -> Dict: 94 | return {'algorithm': self.algorithm, **self.hash_size_kwargs} 95 | 96 | def values_with_metadata(self) -> Tuple[Dict, Dict]: 97 | return self.values, self.metadata() 98 | 99 | def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> None: 100 | try: 101 | values, metadata = load(file) # nosec 102 | except IndexError as error: 103 | raise ValueError('Save file not in format: [values, metadata]') from error 104 | if not isinstance(values, dict): 105 | raise ValueError(f'Not a dict: {values}') 106 | if not metadata: 107 | raise ValueError('Metadata empty') 108 | if not isinstance(metadata, dict): 109 | raise ValueError(f'Metadata not a dict: {metadata}') 110 | bad_keys = [key for key in values.keys() if not isinstance(key, Path)] 111 | if bad_keys: 112 | raise ValueError(f'Not a Path: {bad_keys}') 113 | bad_values = [value for value in values.values() if not is_hash(value)] 114 | if bad_values: 115 | raise ValueError(f'Not an image hash: {bad_values}') 116 | if metadata['algorithm'] != self.algorithm: 117 | raise ValueError(f'Algorithm mismatch: {metadata["algorithm"]} != {self.algorithm}') 118 | if metadata.keys() != self.metadata().keys(): 119 | raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}') 120 | if metadata != self.metadata(): 121 | raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}') 122 | self.values = values 123 | 124 | def load(self) -> None: 125 | raise NotImplementedError() 126 | 127 | def dump(self) -> None: 128 | raise NotImplementedError() 129 | 130 | 131 | class PickleHashStore(FileHashStore): 132 | """ 133 | Implementation of `FileHashStore` that reads and stores the calculated 134 | image hashes in Pickle format 135 | """ 136 | 137 | @log_execution_time() 138 | def load(self) -> None: 139 | with self.store_path.open('rb') as file: 140 | self.checked_load(file, pickle.load) 141 | 142 | @log_execution_time() 143 | def dump(self) -> None: 144 | with 
self.store_path.open('wb') as file: 145 | pickle.dump(self.values_with_metadata(), file) # nosec 146 | 147 | 148 | def load_values_and_metadata(file: IO) -> Tuple[Cache, Dict]: 149 | try: 150 | valds = json.load(file) 151 | except json.JSONDecodeError as error: 152 | raise ValueError('Save file not in JSON format') from error 153 | if not isinstance(valds, list): 154 | raise ValueError('Save file not in format: [values, metadata]') 155 | if not isinstance(valds[0], dict): 156 | raise ValueError(f'Not a dict: {valds[0]}') 157 | if not isinstance(valds[1], dict): 158 | raise ValueError(f'Metadata not a dict: {valds[1]}') 159 | return {Path(k): hex_to_hash(str(v)) for k, v in valds[0].items()}, valds[1] 160 | 161 | 162 | class JSONHashStore(FileHashStore): 163 | """ 164 | Implementation of `FileHashStore` that reads and stores the calculated 165 | image hashes in JSON format 166 | """ 167 | 168 | @log_execution_time() 169 | def load(self) -> None: 170 | with self.store_path.open('r') as file: 171 | self.checked_load(file, load_values_and_metadata) 172 | 173 | # see https://bugs.python.org/issue18820 for why this pain is necessary (Python does not 174 | # automatically convert dict keys for JSON export) 175 | def converted_values(self): 176 | return {str(k.resolve()): str(v) for k, v in self.values.items()} 177 | 178 | @log_execution_time() 179 | def dump(self) -> None: 180 | with self.store_path.open('w') as file: 181 | json.dump((self.converted_values(), self.metadata()), file) 182 | -------------------------------------------------------------------------------- /duplicate_images/image_pair_finder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Finds duplicate images by comparing their image hashes using the given hash 3 | algorithm 4 | """ 5 | __author__ = 'Lene Preuss ' 6 | 7 | import logging 8 | from itertools import combinations 9 | from pathlib import Path 10 | from time import time 11 | from typing import Dict, List, Iterator 12 | 13 | from duplicate_images.common import log_execution_time 14 | from duplicate_images.function_types import ( 15 | Hash, HashFunction, ImageGroup, Results, ResultsGenerator, ResultsGrouper 16 | ) 17 | from duplicate_images.hash_scanner import ImageHashScanner 18 | from duplicate_images.hash_store import HashStore, NullHashStore 19 | from duplicate_images.pair_finder_options import PairFinderOptions 20 | from duplicate_images.progress_bar_manager import ProgressBarManager, NullProgressBarManager 21 | 22 | 23 | def group_results_as_pairs(results: ResultsGenerator) -> Results: 24 | return [ 25 | pair 26 | for result in results 27 | for pair in combinations(list(result), 2) 28 | ] 29 | 30 | 31 | def group_results_as_tuples(results: ResultsGenerator) -> Results: 32 | return [tuple(result) for result in results] 33 | 34 | 35 | class ImagePairFinder: 36 | """ 37 | Finds duplicate images by comparing their image hashes 38 | """ 39 | 40 | @classmethod 41 | def create( 42 | cls, files: List[Path], hash_algorithm: HashFunction, 43 | options: PairFinderOptions = PairFinderOptions(), 44 | hash_store: HashStore = NullHashStore() 45 | ) -> 'ImagePairFinder': 46 | group_results = group_results_as_tuples if options.group else group_results_as_pairs 47 | progress_bars = ProgressBarManager.create(len(files), options.show_progress_bars) 48 | scanner = ImageHashScanner.create(files, hash_algorithm, options, hash_store, progress_bars) 49 | 50 | if options.max_distance == 0 and not options.slow: 51 | return
DictImagePairFinder( 52 | scanner, group_results, options=options, progress_bars=progress_bars 53 | ) 54 | if len(files) > 1000: 55 | logging.warning( 56 | 'Using %s with a big number of images. Expect slow performance.', 57 | SlowImagePairFinder.__name__ 58 | ) 59 | logging.warning('Consider using [Parallel]DictImagePairFinder instead.') 60 | return SlowImagePairFinder(scanner, group_results, options, progress_bars) 61 | 62 | def __init__( # pylint: disable = too-many-arguments 63 | self, scanner: ImageHashScanner, 64 | group_results: ResultsGrouper, 65 | progress_bars: ProgressBarManager = NullProgressBarManager() 66 | ) -> None: 67 | self.precalculated_hashes: Dict = {} 68 | self.group_results = group_results 69 | self.scanner = scanner 70 | self.progress_bars = progress_bars 71 | self.scan_start_time = time() 72 | logging.info('Using %s', self.__class__.__name__) 73 | 74 | def get_equal_groups(self) -> Results: 75 | raise NotImplementedError() 76 | 77 | def log_scan_finished(self) -> None: 78 | logging.info( 79 | '%d distinct hashes calculated in %.2fs', 80 | len(self.precalculated_hashes), time() - self.scan_start_time 81 | ) 82 | 83 | 84 | class DictImagePairFinder(ImagePairFinder): 85 | """ 86 | Searches by storing the image hashes as keys to a dict. 87 | Works only if max_distance == 0. 88 | """ 89 | def __init__( # pylint: disable = too-many-arguments 90 | self, scanner: ImageHashScanner, 91 | group_results: ResultsGrouper, 92 | options: PairFinderOptions = PairFinderOptions(), 93 | progress_bars: ProgressBarManager = NullProgressBarManager() 94 | ) -> None: 95 | super().__init__(scanner, group_results, progress_bars) 96 | if options.max_distance != 0: 97 | raise ValueError('DictImagePairFinder only works if max_distance == 0!') 98 | self.precalculated_hashes = self.get_hashes() 99 | self.progress_bars.close_reader() 100 | 101 | @log_execution_time() 102 | def get_equal_groups(self) -> Results: 103 | self.progress_bars.close() 104 | self.log_scan_finished() 105 | return self.group_results( 106 | (result for result in self.precalculated_hashes.values() if len(result) > 1) 107 | ) 108 | 109 | def get_hashes(self) -> Dict[Hash, List[Path]]: 110 | hash_dict: Dict[Hash, List[Path]] = {} 111 | for file, image_hash in self.scanner.precalculate_hashes(): 112 | if image_hash is not None: 113 | hash_dict.setdefault(image_hash, []).append(file) 114 | return hash_dict 115 | 116 | 117 | class SlowImagePairFinder(ImagePairFinder): 118 | """ 119 | Searches by comparing the image hashes of each image to every other, giving O(N^2) performance. 120 | Does not allow returning the results in groups, only pairs. 121 | The only option if max_distance != 0. 
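Selected by ImagePairFinder.create() whenever the --slow option is given or max_distance is nonzero.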
122 | """ 123 | 124 | def __init__( # pylint: disable = too-many-arguments 125 | self, scanner: ImageHashScanner, 126 | group_results: ResultsGrouper, 127 | options: PairFinderOptions = PairFinderOptions(), 128 | progress_bars: ProgressBarManager = NullProgressBarManager() 129 | ) -> None: 130 | if group_results is group_results_as_tuples: 131 | raise ValueError(f'{self.__class__.__name__} only works with pairs, not groups') 132 | super().__init__(scanner, group_results, progress_bars) 133 | self.max_distance = options.max_distance or 0 134 | self.precalculated_hashes = self.get_hashes() 135 | self.progress_bars.close_reader() 136 | 137 | @log_execution_time() 138 | def get_equal_groups(self) -> Results: 139 | self.log_scan_finished() 140 | image_files = list(self.precalculated_hashes.keys()) 141 | logging.info('Filtering duplicates') 142 | matches = self.filter_matches(combinations(image_files, 2)) 143 | self.progress_bars.close() 144 | return matches 145 | 146 | def get_hashes(self) -> Dict[Path, Hash]: 147 | return { 148 | file: image_hash for file, image_hash in self.scanner.precalculate_hashes() 149 | if image_hash is not None 150 | } 151 | 152 | def filter_matches(self, all_pairs: Iterator[ImageGroup]) -> Results: 153 | self.progress_bars.create_filter_bar(len(self.precalculated_hashes)) 154 | return [ 155 | (file, other_file) for file, other_file in all_pairs 156 | if self.are_images_equal(file, other_file) 157 | ] 158 | 159 | def are_images_equal(self, file: Path, other_file: Path) -> bool: 160 | self.progress_bars.update_filter() 161 | hash_distance = self.precalculated_hashes[file] - self.precalculated_hashes[other_file] 162 | logging.debug( 163 | '%-30s - %-30s = %d', file.stem, other_file.stem, hash_distance 164 | ) 165 | return hash_distance <= self.max_distance 166 | -------------------------------------------------------------------------------- /duplicate_images/log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging setup 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | from argparse import Namespace 8 | 9 | import coloredlogs 10 | 11 | 12 | def setup_logging(args: Namespace) -> None: 13 | log_level = logging.DEBUG if args.debug else logging.INFO 14 | for _ in range(args.quiet): 15 | log_level += (logging.INFO - logging.DEBUG) 16 | coloredlogs.install( 17 | level=log_level, fmt='%(asctime)s %(levelname)s: %(message)s', 18 | datefmt='%H:%M:%S' 19 | ) 20 | -------------------------------------------------------------------------------- /duplicate_images/methods.py: -------------------------------------------------------------------------------- 1 | """ 2 | Definition of the possible actions run on sets of equal images 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | from argparse import Namespace 8 | from pathlib import Path 9 | from shlex import quote 10 | from subprocess import call # nosec 11 | from typing import Callable, Dict, List, Optional, Union 12 | 13 | import imagehash 14 | 15 | from duplicate_images.common import path_with_parent 16 | from duplicate_images.function_types import ActionFunction, HashFunction, ImageGroup 17 | 18 | __all__ = [ 19 | 'call', 'quote', 'get_hash_size_kwargs', 'IMAGE_HASH_ALGORITHM', 'ALGORITHM_DEFAULTS', 20 | 'ACTIONS_ON_EQUALITY' 21 | ] 22 | 23 | 24 | def ascending_by_size(group: ImageGroup) -> List[Path]: 25 | return sorted(group, key=lambda path: (path.stat().st_size, str(path))) 26 | 27 | 28 | def delete_with_log_message(file: Path) -> None: 29 | 
file.unlink() 30 | logging.info('Deleted %s', path_with_parent(file)) 31 | 32 | 33 | def move_with_log_message(file: Path, destination: Path, recreate_path: bool) -> None: 34 | target = destination / (file.relative_to(file.anchor) if recreate_path else file.name) 35 | if recreate_path: 36 | target.parent.mkdir(parents=True, exist_ok=True) 37 | file.rename(target) 38 | logging.info('Moved %s to %s', file, target) 39 | 40 | 41 | def symlink_to_nth_smallest_file(group: ImageGroup, index: int) -> None: 42 | biggest = ascending_by_size(group)[index] 43 | others = set(group) - {biggest} 44 | for file in others: 45 | delete_with_log_message(file) 46 | file.symlink_to(biggest) 47 | 48 | 49 | def shell_exec(args: Namespace, group: ImageGroup) -> None: 50 | cmd = args.exec 51 | for num, path in enumerate(group): 52 | cmd = cmd.replace(f"{'{'}{num + 1}{'}'}", f'{quote(str(path))}') 53 | cmd = cmd.replace('{*}', ' '.join([quote(str(path)) for path in group])) 54 | call(cmd, shell=True) # nosec 55 | 56 | 57 | def get_hash_size_kwargs(algorithm: HashFunction, size: Optional[int]) -> Dict: 58 | if size is None: 59 | return ALGORITHM_DEFAULTS.get(algorithm, {'hash_size': 8}) 60 | kwarg = next(iter(ALGORITHM_DEFAULTS.get(algorithm, {'hash_size': 8}))) 61 | return {} if kwarg == 'hash_func' else {kwarg: size} 62 | 63 | 64 | IMAGE_HASH_ALGORITHM = { 65 | 'ahash': imagehash.average_hash, 66 | 'phash': imagehash.phash, 67 | 'phash_simple': imagehash.phash_simple, 68 | 'dhash': imagehash.dhash, 69 | 'dhash_vertical': imagehash.dhash_vertical, 70 | 'whash': imagehash.whash, 71 | 'colorhash': imagehash.colorhash, 72 | 'crop_resistant': imagehash.crop_resistant_hash, 73 | } # type: Dict[str, HashFunction] 74 | 75 | ALGORITHM_DEFAULTS: Dict[Callable, Dict[str, Union[int, HashFunction]]] = { 76 | imagehash.average_hash: {'hash_size': 8}, 77 | imagehash.phash: {'hash_size': 8}, 78 | imagehash.phash_simple: {'hash_size': 8}, 79 | imagehash.dhash: {'hash_size': 8}, 80 | imagehash.dhash_vertical: {'hash_size': 8}, 81 | imagehash.whash: {'hash_size': 8}, 82 | imagehash.colorhash: {'binbits': 3}, 83 | imagehash.crop_resistant_hash: {'hash_func': imagehash.phash}, 84 | } 85 | 86 | ACTIONS_ON_EQUALITY: Dict[str, ActionFunction] = { 87 | 'delete-first': lambda args, group: delete_with_log_message(group[0]), 88 | 'd1': lambda args, group: delete_with_log_message(group[0]), 89 | 'delete-last': lambda args, group: delete_with_log_message(group[-1]), 90 | 'dl': lambda args, group: delete_with_log_message(group[-1]), 91 | 'delete-biggest': lambda args, group: delete_with_log_message(ascending_by_size(group)[-1]), 92 | 'd>': lambda args, group: delete_with_log_message(ascending_by_size(group)[-1]), 93 | 'delete-smallest': lambda args, group: delete_with_log_message(ascending_by_size(group)[0]), 94 | 'd<': lambda args, group: delete_with_log_message(ascending_by_size(group)[0]), 95 | 'move-first': lambda args, group: move_with_log_message( 96 | group[0], Path(args.move_to), args.move_recreate_path 97 | ), 98 | 'm1': lambda args, group: move_with_log_message( 99 | group[0], Path(args.move_to), args.move_recreate_path 100 | ), 101 | 'move-last': lambda args, group: move_with_log_message( 102 | group[-1], Path(args.move_to), args.move_recreate_path 103 | ), 104 | 'ml': lambda args, group: move_with_log_message( 105 | group[-1], Path(args.move_to), args.move_recreate_path 106 | ), 107 | 'move-biggest': lambda args, group: move_with_log_message( 108 | ascending_by_size(group)[-1], Path(args.move_to), args.move_recreate_path 109 | ), 110 
| 'm>': lambda args, group: move_with_log_message( 111 | ascending_by_size(group)[-1], Path(args.move_to), args.move_recreate_path 112 | ), 113 | 'move-smallest': lambda args, group: move_with_log_message( 114 | ascending_by_size(group)[0], Path(args.move_to), args.move_recreate_path 115 | ), 116 | 'm<': lambda args, group: move_with_log_message( 117 | ascending_by_size(group)[0], Path(args.move_to), args.move_recreate_path 118 | ), 119 | 'symlink-smaller': lambda args, group: symlink_to_nth_smallest_file(group, -1), 120 | 'symlink-bigger': lambda args, group: symlink_to_nth_smallest_file(group, 0), 121 | 'eog': lambda args, group: call(['eog'] + [str(pic) for pic in group]), # nosec 122 | 'xv': lambda args, group: call(['xv', '-nolim'] + [str(pic) for pic in group]), # nosec 123 | 'print': lambda args, group: print(*group), 124 | 'print_inline': lambda args, group: print(*group, end=' '), 125 | 'quote': lambda args, group: print(' '.join([quote(str(pic)) for pic in group])), 126 | 'quote_inline': lambda args, group: print( 127 | ' '.join([quote(str(pic)) for pic in group]), end=' ' 128 | ), 129 | 'exec': lambda args, group: shell_exec(args, group), # pylint: disable=unnecessary-lambda 130 | 'none': lambda args, group: None, 131 | } 132 | 133 | MOVE_ACTIONS = ['move-first', 'm1', 'move-last', 'ml', 'move-biggest', 'm>', 'move-smallest', 'm<'] 134 | -------------------------------------------------------------------------------- /duplicate_images/pair_finder_options.py: -------------------------------------------------------------------------------- 1 | """ 2 | Encapsulates the options for scanning images and detecting duplicates 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | from argparse import Namespace 7 | from dataclasses import dataclass 8 | from typing import Optional 9 | 10 | 11 | @dataclass(frozen=True) 12 | class PairFinderOptions: 13 | """ 14 | Encapsulates the options for scanning images and detecting duplicates and 15 | reads them from an `argparse.Namespace` object 16 | """ 17 | max_distance: int = 0 18 | hash_size: Optional[int] = None 19 | show_progress_bars: bool = False 20 | parallel: Optional[int] = None 21 | slow: bool = False 22 | group: bool = False 23 | 24 | @classmethod 25 | def from_args(cls, args: Namespace): 26 | return cls( 27 | args.max_distance, args.hash_size, args.progress, args.parallel, args.slow, args.group 28 | ) 29 | -------------------------------------------------------------------------------- /duplicate_images/parse_commandline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define and parse command line arguments for the `find-dups` command line tool 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | from os import cpu_count 8 | from argparse import ArgumentParser, Namespace, RawDescriptionHelpFormatter 9 | from configparser import ConfigParser 10 | from typing import List, Optional, Dict, Union 11 | 12 | from PIL import Image 13 | 14 | from duplicate_images.methods import ACTIONS_ON_EQUALITY, IMAGE_HASH_ALGORITHM, MOVE_ACTIONS 15 | 16 | DefaultsDict = Dict[str, Union[str, int, bool, None]] 17 | DEFAULTS: DefaultsDict = { 18 | 'root_directory': '.', 19 | 'exclude_dir': None, 20 | 'algorithm': 'phash', 21 | 'max_distance': 0, 22 | 'hash_size': None, 23 | 'on_equal': 'print', 24 | 'exec': None, 25 | 'move_to': None, 26 | 'move_recreate_path': False, 27 | 'parallel': None, 28 | 'parallel_actions': None, 29 | 'slow': False, 30 | 'group': False, 31 | 'progress': False, 32 | 'debug': False, 
33 | 'quiet': 0, 34 | 'hash_db': None, 35 | 'max_image_pixels': None 36 | } 37 | 38 | 39 | def is_power_of_2(n: int) -> bool: 40 | # https://stackoverflow.com/questions/57025836/how-to-check-if-a-given-number-is-a-power-of-two 41 | return (n != 0) and (n & (n - 1) == 0) 42 | 43 | 44 | def parse_command_line(args: Optional[List[str]] = None) -> Namespace: 45 | conf_parser = create_config_file_parser() 46 | conf_namespace, remaining_argv = conf_parser.parse_known_args(args) 47 | defaults = read_defaults_from_config(conf_namespace) 48 | 49 | parser = create_main_parser(conf_parser, defaults) 50 | namespace = parser.parse_args(remaining_argv) 51 | 52 | check_complex_errors(namespace, parser) 53 | return namespace 54 | 55 | 56 | def create_config_file_parser() -> ArgumentParser: 57 | conf_parser = ArgumentParser( 58 | description=__doc__, 59 | formatter_class=RawDescriptionHelpFormatter, 60 | add_help=False 61 | ) 62 | conf_parser.add_argument('-c', '--config-file', help='Specify config file', metavar='FILE') 63 | return conf_parser 64 | 65 | 66 | def read_defaults_from_config(conf_namespace: Namespace) -> DefaultsDict: 67 | defaults = DEFAULTS.copy() 68 | if conf_namespace.config_file: 69 | config = ConfigParser() 70 | config.read([conf_namespace.config_file]) 71 | logging.warning(config.sections()) 72 | defaults.update(dict(config.items('Defaults'))) 73 | return defaults 74 | 75 | 76 | def create_main_parser(parent_parser: ArgumentParser, defaults: DefaultsDict) -> ArgumentParser: 77 | parser = ArgumentParser( 78 | description='Find pairs of equal or similar images.', 79 | # Inherit options from config_parser 80 | parents=[parent_parser] 81 | ) 82 | parser.set_defaults(**defaults) 83 | parser.add_argument( 84 | 'root_directory', default='.', nargs='+', 85 | help='The root of the directory tree under which images are compared' 86 | ) 87 | parser.add_argument( 88 | '--exclude-dir', nargs='*', 89 | help='Directories to exclude from the search (can be given as regular expressions)' 90 | ) 91 | parser.add_argument( 92 | '--algorithm', choices=IMAGE_HASH_ALGORITHM.keys(), 93 | help='Method used to determine if two images are considered equal' 94 | ) 95 | parser.add_argument( 96 | '--max-distance', type=int, 97 | help='Maximum hash distance for images to be considered equal' 98 | ) 99 | parser.add_argument( 100 | '--hash-size', type=int, 101 | help='Hash size (or number of bin bits for colorhash)' 102 | ) 103 | parser.add_argument( 104 | '--on-equal', choices=ACTIONS_ON_EQUALITY.keys(), 105 | help='Command to be run on each pair of images found to be equal' 106 | ) 107 | parser.add_argument( 108 | '--exec', type=str, 109 | help='Command to execute (replaces {1}, {2} or {*} with file paths)' 110 | ) 111 | parser.add_argument( 112 | '--move-to', type=str, 113 | help='Destination directory for moving duplicate images' 114 | ) 115 | parser.add_argument( 116 | '--move-recreate-path', action='store_true', 117 | help='recreate the path the original images are under in the destination directory' 118 | ) 119 | parser.add_argument( 120 | '--parallel', nargs='?', type=int, const=cpu_count(), 121 | help=f'Calculate hashes using PARALLEL threads (default: {cpu_count()})' 122 | ) 123 | parser.add_argument( 124 | '--parallel-actions', nargs='?', type=int, const=cpu_count(), 125 | help=f'Execute actions on equal images using PARALLEL threads (default: {cpu_count()})' 126 | ) 127 | group = parser.add_mutually_exclusive_group() 128 | group.add_argument( 129 | '--slow', action='store_true', help='Use slow (O(N^2)) 
algorithm' 130 | ) 131 | group.add_argument( 132 | '--group', action='store_true', 133 | help='Handle equal images in a group instead of multiple pairs' 134 | ) 135 | parser.add_argument( 136 | '--progress', action='store_true', help='Show progress bars during processing' 137 | ) 138 | parser.add_argument( 139 | '--debug', action='store_true', help='Print lots of debugging info' 140 | ) 141 | parser.add_argument( 142 | '--quiet', '-q', action='count', help='Decrease log level by one for each occurrence' 143 | ) 144 | parser.add_argument( 145 | '--hash-db', help='File storing precomputed hashes' 146 | ) 147 | parser.add_argument( 148 | '--max-image-pixels', type=int, 149 | help=f'Maximum size of image in pixels (default: {Image.MAX_IMAGE_PIXELS})' 150 | ) 151 | return parser 152 | 153 | 154 | def check_complex_errors(namespace, parser): 155 | if namespace.on_equal == 'exec' and not namespace.exec: 156 | parser.error('--exec argument is required with --on-equal exec') 157 | if namespace.exec and namespace.on_equal != 'exec': 158 | parser.error('--exec is only allowed with --on-equal exec') 159 | # hash_size may still be None here; the default of 8 is a power of 2, so only check set values 160 | if namespace.algorithm == 'whash' and namespace.hash_size is not None \ 161 | and not is_power_of_2(namespace.hash_size): 162 | parser.error('whash requires hash_size to be a power of 2') 163 | if namespace.group and namespace.max_distance: 164 | parser.error('--max-distance: not allowed with argument --group') 165 | if namespace.move_to and namespace.on_equal not in MOVE_ACTIONS: 166 | parser.error(f'--move-to requires --on-equal to be one of: {", ".join(MOVE_ACTIONS)}') 167 | if namespace.on_equal in MOVE_ACTIONS and not namespace.move_to: 168 | parser.error(f'--on-equal {namespace.on_equal} requires --move-to to be set') 169 | if namespace.move_recreate_path and namespace.on_equal not in MOVE_ACTIONS: 170 | parser.error( 171 | f'--move-recreate-path requires --on-equal to be one of: {", ".join(MOVE_ACTIONS)}' 172 | ) 173 | -------------------------------------------------------------------------------- /duplicate_images/progress_bar_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | Controlling the progress bars shown while processing images 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | from typing import Optional 7 | 8 | from tqdm import tqdm 9 | 10 | 11 | class ProgressBarManager: 12 | """ 13 | Manages the progress bars shown during image scan and, optionally, duplicate 14 | detection 15 | """ 16 | @classmethod 17 | def create(cls, files_length: int, active: bool = False) -> 'ProgressBarManager': 18 | return ProgressBarManager(files_length) if active else NullProgressBarManager() 19 | 20 | def __init__(self, files_length: int) -> None: 21 | self.reader_progress: Optional[tqdm] = tqdm( 22 | total=files_length, miniters=max(files_length / 100, 5), smoothing=0.1, unit='', 23 | delay=0.1 24 | ) if files_length else None 25 | self.filter_progress: Optional[tqdm] = None 26 | 27 | def create_filter_bar(self, hashes_length: int) -> None: 28 | self.close_reader() 29 | total_items = int(hashes_length * (hashes_length - 1) / 2) 30 | self.filter_progress = tqdm( 31 | total=total_items, unit='', 32 | unit_scale=True, miniters=max(total_items / 5000, 20000) 33 | ) 34 | 35 | def update_reader(self) -> None: 36 | if self.reader_progress is not None: 37 | self.reader_progress.update(1) 38 | 39 | def update_filter(self) -> None: 40 | if self.filter_progress is not None: 41 | self.filter_progress.update(1) 42 | 43 | def close_reader(self) -> None: 44 | if self.reader_progress is not None: 45 |
self.reader_progress.close() 46 | 47 | def close(self) -> None: 48 | if self.filter_progress is not None: 49 | self.filter_progress.close() 50 | 51 | 52 | class NullProgressBarManager(ProgressBarManager): 53 | """ 54 | Implementation of `ProgressBarManager` that does nothing but can be used in 55 | place of one 56 | """ 57 | def __init__(self) -> None: 58 | super().__init__(0) 59 | 60 | def create_filter_bar(self, hashes_length: int) -> None: 61 | pass 62 | 63 | def update_reader(self) -> None: 64 | pass 65 | 66 | def update_filter(self) -> None: 67 | pass 68 | 69 | def close_reader(self) -> None: 70 | pass 71 | 72 | def close(self) -> None: 73 | pass 74 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | warn_redundant_casts = True 4 | strict_optional = True 5 | warn_unused_ignores = True 6 | disallow_subclassing_any = False 7 | no_warn_return_any = True 8 | 9 | [mypy-*] 10 | disallow_untyped_calls = True 11 | disallow_untyped_defs = True 12 | check_untyped_defs = True 13 | no_implicit_optional = True 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "duplicate_images" 3 | version = "0.11.9" 4 | description = "Finds equal or similar images in a directory containing (many) image files" 5 | authors = ["Lene Preuss "] 6 | repository = "https://github.com/lene/DuplicateImages.git" 7 | homepage = "https://github.com/lene/DuplicateImages" 8 | readme = "README.md" 9 | classifiers = [ 10 | "Development Status :: 4 - Beta", 11 | "Environment :: Console", 12 | "Programming Language :: Python :: 3", 13 | "Topic :: Multimedia :: Graphics", 14 | "Topic :: Utilities" 15 | ] 16 | 17 | [tool.poetry.dependencies] 18 | python = ">=3.9" 19 | Wand = ">=0.6" 20 | pillow = ">=11.0" 21 | imagehash = ">=4.3" 22 | coloredlogs = ">=15.0" 23 | tqdm = ">=4.67" 24 | pillow-heif = ">=0.21" 25 | six = ">=1.17" 26 | numpy = ">=2.0" 27 | filetype = ">=1.2" 28 | setuptools = ">=75.6" 29 | 30 | [tool.poetry.group.dev.dependencies] 31 | bandit = ">=1.8" 32 | lz4 = ">=4.0" 33 | mypy = ">=1.14" 34 | flake8 = ">=7.1" 35 | pytest = ">=7.1" 36 | pylint = ">=3.3" 37 | pytest-xdist = ">=3.6" 38 | ptpython = ">=3.0" 39 | 40 | [tool.poetry.scripts] 41 | find-dups = "duplicate_images.duplicate:main" 42 | 43 | 44 | [build-system] 45 | requires = ["poetry-core>=1.6"] 46 | build-backend = "poetry.core.masonry.api" 47 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | -------------------------------------------------------------------------------- /tests/integration/conftest.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from tempfile import TemporaryDirectory 6 | from typing import Generator 7 | 8 | import pytest 9 | 10 | 11 | @pytest.fixture 12 | def data_dir() -> Path: 13 | return Path(__file__).resolve().parent / 'data' 14 | 15 | 16 | @pytest.fixture 17 | def tmp_dir() -> Generator[Path, None, None]: 18 | testdir = TemporaryDirectory() 19 | 
yield Path(testdir.name) 20 | try: 21 | testdir.cleanup() 22 | except FileNotFoundError: 23 | pass 24 | -------------------------------------------------------------------------------- /tests/integration/data/broken/47ff(1).jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/broken/47ff(1).jpg -------------------------------------------------------------------------------- /tests/integration/data/broken/47ff(2).jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/broken/47ff(2).jpg -------------------------------------------------------------------------------- /tests/integration/data/different/pair1/20221026_124702.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/different/pair1/20221026_124702.jpg -------------------------------------------------------------------------------- /tests/integration/data/different/pair1/20221026_124757.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/different/pair1/20221026_124757.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different.json: -------------------------------------------------------------------------------- 1 | [{"/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/61983_camillabelle_g-50%.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/camilla_belle_1297719284.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/61983_camillabelle_g-90%.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/camilla_belle_1297719284.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/61983_camillabelle_g-90%.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/61983_camillabelle_g-50%.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/61983_camillabelle_g.jpg": "00183c1c3898981e", 
"/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/61983_camillabelle_g.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair3/61983_camillabelle_g_2_10bit.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair3/61983_camillabelle_g_2_8bit.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/61983_camillabelle_g_2_q=50.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/61983_camillabelle_g_lossless.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/20221026_124702_q94.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/20221026_124702.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair3/20221026_124702_10bit.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair3/20221026_124702_8bit.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/20221026_124702_lossless.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/20221026_124702_q50.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/20221026_124702_q80.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/20221026_124702_50%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/20221026_124702_90%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/20221026_124702_50%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/20221026_124702_80%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk20%/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk20%/20221026_124702_80%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk30%/20221026_124702.jpg": 
"cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk30%/20221026_124702_70%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk20%/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/20221026_124702_q85.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk20%/20221026_124702_85%.jpg": "cc3c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic": "cc3c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic": "cc3c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossles_vs_lossy/20221026_124702_lossless.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossles_vs_lossy/20221026_124702_q85.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic": "cc3c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic": "cc3c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic": "cc7c6c071f1f3fae", 
"/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg": "cc7c6c071f1f3fae", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic": "9452538c3de569da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic": "9452538c3de569da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg": "9452d38c3de169da"}, {"algorithm": "phash", "hash_size": 8}] -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different.pickle -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/heif/test1.heif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/heif/test1.heif -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/heif/test2.heif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/heif/test2.heif -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair1/20221026_124702_90%-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair1/20221026_124702_90%-2.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair1/20221026_124702_90%.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair1/20221026_124702_90%.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair2/20220312_124816-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair2/20220312_124816-2.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair2/20220312_124816.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair2/20220312_124816.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair3/IMAG0015_small-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair3/IMAG0015_small-2.png -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair3/IMAG0015_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair3/IMAG0015_small.png -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/webp/test1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/webp/test1.webp -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/webp/test2.webp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/webp/test2.webp -------------------------------------------------------------------------------- /tests/integration/data/garbage.txt: -------------------------------------------------------------------------------- 1 | garbage 2 | -------------------------------------------------------------------------------- /tests/integration/data/huge/huge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/huge/huge.png -------------------------------------------------------------------------------- /tests/integration/data/huge/huge2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/huge/huge2.png -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.heif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.heif -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.jpg -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.png -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.tiff -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.webp -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_not_image/test.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_not_image/test.mp3 -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_not_image/test.ogg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_not_image/test.ogg -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_not_image/test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_not_image/test.txt -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_10/20221026_124702_q10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_10/20221026_124702_q10.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_10/20221026_124702_q95.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_10/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_25/20221026_124702_q25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_25/20221026_124702_q25.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_25/20221026_124702_q95.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_25/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_50/20221026_124702_q50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_50/20221026_124702_q50.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_50/20221026_124702_q95.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_50/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_75/20221026_124702_q75.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_75/20221026_124702_q75.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_75/20221026_124702_q95.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_75/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/many/20220218_135622.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/many/20220218_135622.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/many/20220218_135658.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/many/20220218_135658.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/many/20220218_135708.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/many/20220218_135708.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/pair1/20220806_214449.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/pair1/20220806_214449.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/pair1/20220806_214600.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/pair1/20220806_214600.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/pair2/20220329_210118.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/pair2/20220329_210118.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/pair2/20220329_210123.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/pair2/20220329_210123.jpg -------------------------------------------------------------------------------- /tests/integration/test_is_image_file.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | import pytest 8 | 9 | from duplicate_images.duplicate import is_image_file 10 | 11 | 12 | def base_dir() -> Path: 13 | return Path(__file__).resolve().parent / 'data' / 'is_image_file' 14 | 15 | 16 | def image_files() -> List[Path]: 17 | return list((base_dir() / 'is_image').glob('test.*')) 18 | 19 | 20 | def not_image_files() -> List[Path]: 21 | 
return list((base_dir() / 'is_not_image').glob('test.*')) 22 | 23 | 24 | @pytest.mark.parametrize('image_file', image_files()) 25 | def test_image_files_are_recognized(image_file: Path) -> None: 26 | assert is_image_file(image_file) 27 | 28 | 29 | @pytest.mark.parametrize('not_image_file', not_image_files()) 30 | def test_non_image_files_are_recognized(not_image_file: Path) -> None: 31 | assert not is_image_file(not_image_file) 32 | -------------------------------------------------------------------------------- /tests/integration/test_persistent_storage.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | import json 5 | import pickle 6 | from pathlib import Path 7 | from typing import Any, Tuple, Optional 8 | from unittest.mock import Mock, patch 9 | 10 | import pytest 11 | 12 | from duplicate_images.duplicate import get_matches 13 | from duplicate_images.pair_finder_options import PairFinderOptions 14 | 15 | 16 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 17 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 18 | @patch('imagehash.average_hash', return_value=0) 19 | def test_open_hash_store_with_filename( 20 | average_hash: Mock, data_dir: Path, test_set: str, file_type: str 21 | ) -> None: 22 | folder = data_dir / test_set 23 | cache_file = folder.with_suffix(f'.{file_type}') 24 | creation_time = cache_file.stat().st_ctime 25 | get_matches([folder], 'phash', hash_store_path=cache_file) 26 | assert average_hash.call_count == 0 27 | assert cache_file.stat().st_atime > creation_time 28 | 29 | 30 | @pytest.mark.parametrize('test_set', ['different', 'equal_but_binary_different']) 31 | def test_open_bad_file_format(data_dir: Path, test_set: str) -> None: 32 | folder = data_dir / test_set 33 | cache_file = data_dir / 'garbage.txt' 34 | creation_time = cache_file.stat().st_ctime 35 | with pytest.raises(ValueError): 36 | get_matches([folder], 'phash', hash_store_path=cache_file) 37 | assert cache_file.stat().st_ctime == creation_time 38 | 39 | 40 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 41 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 42 | def test_open_correct_file_format_but_not_a_tuple( 43 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 44 | ) -> None: 45 | check_garbage( 46 | tmp_dir, data_dir / test_set, file_type, garbage_data='garbage', message=None 47 | ) 48 | 49 | 50 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 51 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 52 | def test_open_correct_file_format_but_values_not_a_dict( 53 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 54 | ) -> None: 55 | check_garbage( 56 | tmp_dir, data_dir / test_set, file_type, garbage_data=('garbage', {}), message='Not a dict' 57 | ) 58 | 59 | 60 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 61 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 62 | def test_open_correct_file_format_but_metadata_not_a_dict( 63 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 64 | ) -> None: 65 | check_garbage( 66 | tmp_dir, data_dir / test_set, file_type, garbage_data=({}, 'garbage'), 67 | message='Metadata not a dict' 68 | ) 69 | 70 | 71 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 72 | @pytest.mark.parametrize('file_type', ['pickle']) 73 | def test_open_correct_file_format_but_keys_not_paths( 74 | 
tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 75 | ) -> None: 76 | folder = data_dir / test_set 77 | check_garbage( 78 | tmp_dir, folder, file_type, 79 | garbage_data=( 80 | {str(path): 0 for path in folder.glob('**')}, {'algorithm': 'phash', 'hash_size': 8} 81 | ), 82 | message='Not a Path' 83 | ) 84 | 85 | 86 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 87 | @pytest.mark.parametrize('file_type', ['pickle']) 88 | def test_open_correct_file_format_but_values_not_image_hashes( 89 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 90 | ) -> None: 91 | folder = data_dir / test_set 92 | check_garbage( 93 | tmp_dir, folder, file_type, 94 | garbage_data=( 95 | {path: 0 for path in folder.glob('**')}, {'algorithm': 'phash', 'hash_size': 8} 96 | ), message='Not an image hash' 97 | ) 98 | 99 | 100 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 101 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 102 | def test_open_correct_file_format_but_metadata_missing( 103 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 104 | ) -> None: 105 | folder = data_dir / test_set 106 | check_garbage( 107 | tmp_dir, folder, file_type, 108 | garbage_data=({path: 0 for path in folder.glob('**')}, ), message=None 109 | ) 110 | 111 | 112 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 113 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 114 | def test_open_correct_file_format_but_metadata_empty( 115 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 116 | ) -> None: 117 | folder = data_dir / test_set 118 | check_garbage( 119 | tmp_dir, folder, file_type, 120 | garbage_data=({path: 0 for path in folder.glob('**')}, {}), message='Metadata empty' 121 | ) 122 | 123 | 124 | @pytest.mark.parametrize('test_set', ['different', 'equal_but_binary_different']) 125 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 126 | @pytest.mark.parametrize('algorithms', [('phash', 'ahash')]) 127 | def test_opening_with_different_algorithm_leads_to_error( 128 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str, algorithms: Tuple[str, str] 129 | ) -> None: 130 | cache_file = tmp_dir / f'hash_store.{file_type}' 131 | get_matches([data_dir / test_set], algorithms[0], hash_store_path=cache_file) 132 | with pytest.raises(ValueError, match='Algorithm mismatch'): 133 | get_matches([data_dir / test_set], algorithms[1], hash_store_path=cache_file) 134 | 135 | 136 | @pytest.mark.parametrize('test_set', ['different', 'equal_but_binary_different']) 137 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 138 | @pytest.mark.parametrize('hash_size', [(8, 9)]) 139 | def test_opening_with_different_algorithm_parameters_leads_to_error( 140 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str, hash_size: Tuple[int, int] 141 | ) -> None: 142 | cache_file = tmp_dir / f'hash_store.{file_type}' 143 | assert not cache_file.is_file() 144 | get_matches( 145 | [data_dir / test_set], 'phash', options=PairFinderOptions(hash_size=hash_size[0]), 146 | hash_store_path=cache_file 147 | ) 148 | assert cache_file.is_file() 149 | with pytest.raises(ValueError, match='Metadata mismatch'): 150 | get_matches( 151 | [data_dir / test_set], 'phash', options=PairFinderOptions(hash_size=hash_size[1]), 152 | hash_store_path=cache_file 153 | ) 154 | 155 | 156 | def check_garbage( 157 | temp_dir: Path, folder: Path, file_type: str, garbage_data: Any, message: Optional[str] 158 | ) -> None: 159 | cache_file = 
temp_dir / f'garbage.{file_type}' 160 | if file_type == 'pickle': 161 | dump_pickle(cache_file, garbage_data) 162 | else: 163 | dump_json(cache_file, garbage_data) 164 | creation_time = cache_file.stat().st_ctime 165 | with pytest.raises(ValueError, match=message): 166 | get_matches([folder], 'phash', hash_store_path=cache_file) 167 | assert cache_file.stat().st_ctime == creation_time 168 | 169 | 170 | def dump_pickle(cache_file: Path, garbage_data: Any): 171 | with cache_file.open('wb') as file: 172 | pickle.dump(garbage_data, file) 173 | 174 | 175 | def dump_json(cache_file: Path, garbage_data: Any): 176 | with cache_file.open('w') as file: 177 | json.dump(encode_dict_keys_to_str(garbage_data), file) 178 | 179 | 180 | def encode_dict_keys_to_str(obj: Any) -> Any: 181 | if isinstance(obj, dict): 182 | return {str(key): value for key, value in obj.items()} 183 | if isinstance(obj, tuple): 184 | return tuple(encode_dict_keys_to_str(item) for item in obj) 185 | return obj 186 | -------------------------------------------------------------------------------- /tests/integration/test_real_images.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | import pytest 8 | 9 | from PIL import Image 10 | from PIL.Image import DecompressionBombError 11 | 12 | from duplicate_images.image_pair_finder import PairFinderOptions 13 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM 14 | from duplicate_images.duplicate import ( 15 | files_in_dirs, is_image_file, get_matches, set_max_image_pixels 16 | ) 17 | from duplicate_images.parse_commandline import parse_command_line 18 | 19 | HUGE_IMAGE_SIZE = 20000 * 20000 20 | 21 | 22 | @pytest.mark.parametrize('parallel', [True, False]) 23 | @pytest.mark.parametrize('slow', [True, False]) 24 | @pytest.mark.parametrize( 25 | 'algorithm,expected_pairs', 26 | [('ahash', 0), ('dhash', 0), ('phash', 0), ('whash', 0)] 27 | ) 28 | @pytest.mark.parametrize('image_pair', ['pair1', 'pair2']) 29 | def test_similar( # pylint:disable = too-many-arguments,too-many-positional-arguments 30 | data_dir: Path, image_pair: str, algorithm: str, expected_pairs: int, 31 | slow: bool, parallel: bool 32 | ) -> None: 33 | folder = data_dir / 'similar' / image_pair 34 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 35 | assert len(matches) == expected_pairs 36 | 37 | 38 | @pytest.mark.parametrize( 39 | 'algorithm,min_distance', 40 | [('ahash', 2), ('dhash', 10), ('phash', 14), ('whash', 2), ('colorhash', 0)] 41 | ) 42 | def test_hash_distance( 43 | data_dir: Path, algorithm: str, min_distance: int 44 | ) -> None: 45 | folder = data_dir / 'similar' / 'pair1' 46 | hash_algorithm = IMAGE_HASH_ALGORITHM[algorithm] 47 | image_files = sorted(files_in_dirs([folder], is_image_file)) 48 | assert len(image_files) == 2 49 | hashes = [hash_algorithm(Image.open(file)) for file in image_files] 50 | assert hashes[0] - hashes[1] == min_distance, str(hashes[0] - hashes[1]) # type: ignore 51 | 52 | 53 | @pytest.mark.parametrize('parallel', [True, False]) 54 | @pytest.mark.parametrize('slow', [True, False]) 55 | @pytest.mark.parametrize( 56 | 'algorithm,max_distance', 57 | [('ahash', 14), ('dhash', 12), ('phash', 14), ('whash', 16), ('colorhash', 0)] 58 | ) 59 | def test_similar_distance_matches( 60 | data_dir: Path, algorithm: str, max_distance: int, slow: bool, parallel: bool 61 | ) -> 
None: 62 | folder = data_dir / 'similar' / 'pair1' 63 | matches = get_matches( 64 | [folder], algorithm, PairFinderOptions( 65 | slow=slow, parallel=parallel, max_distance=max_distance 66 | ) 67 | ) 68 | assert len(matches) == 1 69 | 70 | 71 | @pytest.mark.parametrize('parallel', [True, False]) 72 | @pytest.mark.parametrize('slow', [True, False]) 73 | @pytest.mark.parametrize( 74 | 'algorithm,hash_size', 75 | [('ahash', 4), ('whash', 2), ('colorhash', 4)] 76 | ) 77 | def test_similar_matches_with_smaller_hash_size( 78 | data_dir: Path, algorithm: str, hash_size: int, slow: bool, parallel: bool 79 | ) -> None: 80 | folder = data_dir / 'similar' / 'pair1' 81 | matches = get_matches( 82 | [folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel, hash_size=hash_size) 83 | ) 84 | assert len(matches) == 1 85 | 86 | 87 | @pytest.mark.parametrize('parallel', [True, False]) 88 | @pytest.mark.parametrize('slow', [True, False]) 89 | @pytest.mark.parametrize( 90 | 'algorithm,expected_pairs', 91 | [('ahash', 0), ('dhash', 0), ('colorhash', 0), ('phash', 0), ('whash', 0)] 92 | ) 93 | @pytest.mark.parametrize('image_pair', ['many']) 94 | def test_similar_many( # pylint:disable = too-many-arguments,too-many-positional-arguments 95 | data_dir: Path, image_pair: str, algorithm: str, expected_pairs: int, 96 | slow: bool, parallel: bool 97 | ) -> None: 98 | folder = data_dir / 'similar' / image_pair 99 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 100 | assert len(matches) == expected_pairs 101 | 102 | 103 | @pytest.mark.parametrize('parallel', [True, False]) 104 | @pytest.mark.parametrize('slow', [True, False]) 105 | @pytest.mark.parametrize( 106 | 'algorithm,expected_pairs', 107 | [('ahash', 1), ('dhash', 1), ('colorhash', 1), ('phash', 1), ('whash', 1)] 108 | ) 109 | @pytest.mark.parametrize( 110 | 'image_pair', [ 111 | 'jpeg_quality', 'jpeg_vs_heic', 'heic_bit_depth', 'heic_lossless_vs_lossy', 'shrunk10%' 112 | ] 113 | ) 114 | def test_equal_but_binary_different( # pylint:disable=R0913,R0917 115 | data_dir: Path, image_pair: str, algorithm: str, expected_pairs: int, 116 | slow: bool, parallel: bool 117 | ) -> None: 118 | folder = data_dir / 'equal_but_binary_different' / image_pair 119 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 120 | assert len(matches) == expected_pairs 121 | 122 | 123 | @pytest.mark.parametrize('parallel', [True, False]) 124 | @pytest.mark.parametrize('slow', [True, False]) 125 | @pytest.mark.parametrize( 126 | 'algorithm,expected_pairs', 127 | [('ahash', 0), ('dhash', 0), ('colorhash', 0), ('phash', 0), ('whash', 0)] 128 | ) 129 | @pytest.mark.parametrize('image_pair', ['jpeg_75', 'jpeg_50', 'jpeg_25', 'jpeg_10']) 130 | def test_jpeg_artifacts( # pylint:disable = too-many-arguments,too-many-positional-arguments 131 | data_dir: Path, image_pair: str, algorithm: str, expected_pairs: int, 132 | slow: bool, parallel: bool 133 | ) -> None: 134 | folder = data_dir / 'equal_but_binary_different' / image_pair 135 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 136 | assert len(matches) == expected_pairs 137 | 138 | 139 | @pytest.mark.parametrize('parallel', [True, False]) 140 | @pytest.mark.parametrize('slow', [True, False]) 141 | @pytest.mark.parametrize('algorithm', ['ahash', 'dhash', 'colorhash', 'phash', 'whash']) 142 | @pytest.mark.parametrize('image_pair', ['pair1', 'pair2', 'pair3', 'webp', 'heif']) 143 | def test_exactly_equal( 144 
| data_dir: Path, image_pair: str, algorithm: str, slow: bool, parallel: bool 145 | ) -> None: 146 | folder = data_dir / 'exactly_equal' / image_pair 147 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 148 | assert len(matches) == 1 149 | 150 | 151 | @pytest.mark.parametrize('parallel', [True, False]) 152 | @pytest.mark.parametrize('slow', [True, False]) 153 | @pytest.mark.parametrize('algorithm', ['ahash', 'dhash', 'colorhash', 'phash', 'whash']) 154 | @pytest.mark.parametrize('image_pair', ['pair1']) 155 | def test_different( 156 | data_dir: Path, image_pair: str, algorithm: str, slow: bool, parallel: bool 157 | ) -> None: 158 | folder = data_dir / 'different' / image_pair 159 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 160 | assert not matches 161 | 162 | 163 | @pytest.mark.parametrize('parallel', [True, False]) 164 | @pytest.mark.parametrize('slow', [True, False]) 165 | @pytest.mark.parametrize( 166 | 'test_case,image_pair,algorithm,expected_pairs', 167 | [ 168 | ('similar', 'pair2', 'ahash', 0), 169 | ('similar', 'pair2', 'dhash', 0), 170 | ('similar', 'pair2', 'colorhash', 1), 171 | ('similar', 'pair2', 'phash', 0), 172 | ('similar', 'pair2', 'whash', 0), 173 | ] 174 | ) 175 | def test_inconsistent_results_for_different_algorithms( # pylint:disable=R0913,R0917 176 | data_dir: Path, test_case: str, image_pair: str, algorithm: str, expected_pairs: int, 177 | slow: bool, parallel: bool 178 | ) -> None: 179 | folder = data_dir / test_case / image_pair 180 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 181 | assert len(matches) == expected_pairs 182 | 183 | 184 | @pytest.mark.parametrize('parallel', [True, False]) 185 | @pytest.mark.parametrize('slow', [True, False]) 186 | @pytest.mark.parametrize('algorithm', ['ahash', 'dhash', 'colorhash', 'phash', 'whash']) 187 | def test_broken_image_files_do_not_raise_os_error( 188 | data_dir: Path, algorithm: str, slow: bool, parallel: bool 189 | ) -> None: 190 | folder = data_dir / 'broken' 191 | get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 192 | 193 | 194 | @pytest.mark.parametrize('parallel', [True, False]) 195 | @pytest.mark.parametrize('algorithm', ['ahash', 'dhash', 'colorhash', 'phash', 'whash']) 196 | @pytest.mark.parametrize( 197 | 'folders', [ 198 | ['heic_bit_depth'], # images in this folder appear different to those in the following 199 | ['heic_lossless_vs_lossy', 'jpeg_quality', 'jpeg_vs_heic', 'shrunk10%'] 200 | ] 201 | ) 202 | def test_multiple_images_appear_as_group( 203 | data_dir: Path, folders: List[Path], algorithm: str, parallel: bool 204 | ) -> None: 205 | folders = [data_dir / 'equal_but_binary_different' / folder for folder in folders] 206 | matches = get_matches(folders, algorithm, PairFinderOptions(group=True, parallel=parallel)) 207 | assert len(matches) == 1 208 | assert len(matches[0]) == len(files_in_dirs(folders)) 209 | 210 | 211 | @pytest.mark.parametrize('algorithm', ['ahash']) # only one of each is needed, it works the same 212 | @pytest.mark.parametrize('folders', [['heic_bit_depth']]) # in all cases 213 | def test_slow_image_finder_fails_with_group_option( 214 | data_dir: Path, folders: List[Path], algorithm: str 215 | ) -> None: 216 | folders = [data_dir / 'equal_but_binary_different' / folder for folder in folders] 217 | with pytest.raises(ValueError): 218 | get_matches(folders, algorithm, PairFinderOptions(slow=True, group=True)) 219 
| 220 |
221 | @pytest.mark.parametrize('algorithm', ['ahash'])
222 | @pytest.mark.parametrize('folder', ['huge'])
223 | def test_huge_image_fails_loading_per_default(
224 | data_dir: Path, algorithm: str, folder: str
225 | ) -> None:
226 | hash_algorithm = IMAGE_HASH_ALGORITHM[algorithm]
227 | image_files = sorted(files_in_dirs([(data_dir / folder)], is_image_file))
228 | with pytest.raises(DecompressionBombError):
229 | for file in image_files:
230 | hash_algorithm(Image.open(file))
231 |
232 |
233 | @pytest.mark.parametrize('algorithm', ['ahash'])
234 | @pytest.mark.parametrize('folder', ['huge'])
235 | def test_huge_image_succeeds_with_max_image_size_set(
236 | data_dir: Path, algorithm: str, folder: str
237 | ) -> None:
238 | sub_folder = data_dir / folder
239 | args = parse_command_line([str(sub_folder), '--max-image-pixels', str(HUGE_IMAGE_SIZE)])
240 | set_max_image_pixels(args)
241 | matches = get_matches([sub_folder], algorithm, PairFinderOptions.from_args(args))
242 | assert len(matches) == 1
243 |
-------------------------------------------------------------------------------- /tests/unit/__init__.py: --------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | __author__ = 'Lene Preuss '
3 |
-------------------------------------------------------------------------------- /tests/unit/conftest.py: --------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | __author__ = 'Lene Preuss '
3 |
4 | import random
5 | import shutil
6 | from pathlib import Path
7 | from tempfile import NamedTemporaryFile, TemporaryDirectory, mkdtemp
8 | from typing import Generator, List, Tuple
9 | from unittest.mock import Mock
10 |
11 | import pillow_heif
12 | import pytest
13 | from imagehash import ImageHash
14 | from numpy import array
15 | from wand.color import Color
16 | from wand.drawing import Drawing
17 | from wand.image import Image
18 |
19 | IMAGE_WIDTH = 40
20 | MOCK_IMAGE_HASH_VALUE = ImageHash(array([[True, True], [True, True]])) # just some arbitrary value
21 | mock_algorithm = Mock(return_value=MOCK_IMAGE_HASH_VALUE)
22 |
23 |
24 | def create_image(file: Path, width: int) -> Path:
25 | if file.suffix == '.heif':
26 | return create_heif_image(file, width)
27 | height = int(width * 3 / 4)
28 | color = Color('Black')
29 | image = Image(width=width, height=height, background=color)
30 | image.save(filename=file)
31 | return file
32 |
33 |
34 | def create_heif_image(file_path: Path, width: int) -> Path:
35 | height = int(width * 3 / 4)
36 | heif_file = pillow_heif.from_bytes(
37 | mode='BGR;16',
38 | size=(height, width),
39 | data=bytes([0] * 3 * 2 * width * height)
40 | )
41 | with open(file_path, 'wb') as file:
42 | heif_file.save(fp=file, quality=-1)
43 | return file_path
44 |
45 |
46 | def fill_image_with_random_pixels(file: Path, seed: int = 0) -> None:
47 | random.seed(seed)
48 | image = Image(filename=file)
49 | with Drawing() as draw:
50 | for x in range(0, image.size[0]):
51 | for y in range(0, image.size[1]):
52 | color = Color(f'rgb({random_short()},{random_short()},{random_short()})')
53 | draw.fill_color = color
54 | draw.point(x, y)
55 | draw(image)
56 | image.save(filename=file)
57 |
58 |
59 | def named_file(name: str, images: List[Path]) -> Path:
60 | return next(filter(lambda f: name + '_' in f.name, images))
61 |
62 |
63 | def random_short() -> int:
64 | return random.randrange(65535) # noqa: S311
65 |
66 |
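# A short sketch of how the helpers above combine (hypothetical usage; the
# path is made up): create_image() produces a uniform black image, and
# fill_image_with_random_pixels() then gives it unique but reproducible
# content, since the pixel values derive from a fixed random seed.
#
#     example = create_image(Path('/tmp/example.jpg'), IMAGE_WIDTH)
#     fill_image_with_random_pixels(example, seed=1)  # same seed, same pixels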
67 | def save(image: Image, path: Path) -> None:
68 | """
69 | Save image without letting the wand module create a backup file (which would
70 | confuse tearDownClass())
71 | """
72 | with path.open('wb') as file:
73 | image.save(file=file)
74 |
75 |
76 | def delete_image_file(file: Path, images: List[Path]) -> None:
77 | file.unlink()
78 | images.remove(file)
79 |
80 |
81 | def copy_image_file(file: Path, images: List[Path]) -> Path:
82 | copied_file = file.with_suffix('.bak')
83 | shutil.copyfile(file, copied_file)
84 | images.append(copied_file)
85 | return copied_file
86 |
87 |
88 | def is_pair_found(element1: Path, element2: Path, matches: List[Tuple[Path, Path]]) -> bool:
89 | return (element1, element2) in matches or (element2, element1) in matches
90 |
91 |
92 | @pytest.fixture(name='top_directory', scope='session')
93 | def fixture_top_directory() -> Generator[TemporaryDirectory, None, None]:
94 | top_dir = TemporaryDirectory()
95 | yield top_dir
96 | try:
97 | top_dir.cleanup()
98 | except FileNotFoundError:
99 | pass
100 |
101 |
102 | @pytest.fixture(name='sub_directory', scope='session')
103 | def fixture_sub_directory(top_directory: TemporaryDirectory) -> TemporaryDirectory:
104 | return TemporaryDirectory(dir=top_directory.name)
105 |
106 |
107 | def create_jpg_and_png(top_directory: TemporaryDirectory) -> List[Path]:
108 | images = []
109 | jpeg_file = create_image(
110 | Path(NamedTemporaryFile(dir=top_directory.name, prefix='jpeg_', suffix='.jpg').name),
111 | IMAGE_WIDTH
112 | )
113 | images.append(jpeg_file)
114 | png_file = create_image(
115 | Path(NamedTemporaryFile(dir=top_directory.name, prefix='png_', suffix='.png').name),
116 | IMAGE_WIDTH
117 | )
118 | images.append(png_file)
119 | return images
120 |
121 |
122 | def create_half_jpg(top_directory: TemporaryDirectory) -> Path:
123 | half_file = create_image(
124 | Path(
125 | NamedTemporaryFile(dir=top_directory.name, prefix='test_half_', suffix='.jpg').name
126 | ),
127 | IMAGE_WIDTH
128 | )
129 | image = Image(filename=half_file)
130 | image.transform(f'{int(IMAGE_WIDTH / 2)}x{int(IMAGE_WIDTH * 3 / 8)}')
131 | save(image, half_file)
132 | return half_file
133 |
134 |
135 | @pytest.fixture(scope='session')
136 | def image_files(
137 | top_directory: TemporaryDirectory, sub_directory: TemporaryDirectory
138 | ) -> Generator[List[Path], None, None]:
139 | images = create_jpg_and_png(top_directory)
140 | heif_file = create_heif_image(
141 | Path(NamedTemporaryFile(dir=top_directory.name, prefix='heif_', suffix='.heif').name),
142 | IMAGE_WIDTH
143 | )
144 | images.append(heif_file)
145 | subdir_file = create_image(
146 | Path(NamedTemporaryFile(dir=sub_directory.name, prefix='subdir_', suffix='.jpg').name),
147 | IMAGE_WIDTH
148 | )
149 | fill_image_with_random_pixels(subdir_file)
150 | images.append(subdir_file)
151 | half_file = create_half_jpg(top_directory)
152 | images.append(half_file)
153 | yield images
154 | for file in images:
155 | file.unlink(missing_ok=False)
156 |
157 |
158 | @pytest.fixture
159 | def reset_call_count():
160 | mock_algorithm.call_count = 0
161 |
162 |
163 | @pytest.fixture
164 | def hash_store_path(file_type: str) -> Path:
165 | top_directory = Path(mkdtemp())
166 | return Path(NamedTemporaryFile(dir=top_directory, suffix=f'.{file_type}').name)
167 |
-------------------------------------------------------------------------------- /tests/unit/test_actions.py: --------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | __author__ = 'Lene Preuss '
3 |
4 | import shlex
5
| from argparse import Namespace 6 | from datetime import datetime, timedelta 7 | from math import factorial 8 | from pathlib import Path 9 | from tempfile import TemporaryDirectory, NamedTemporaryFile 10 | from typing import List, Generator, Tuple, Callable 11 | from unittest.mock import Mock, patch 12 | 13 | import pytest 14 | 15 | from duplicate_images import duplicate 16 | from duplicate_images.function_types import Results 17 | from duplicate_images.image_pair_finder import ImagePairFinder 18 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM, quote 19 | from duplicate_images.pair_finder_options import PairFinderOptions 20 | from duplicate_images.parse_commandline import parse_command_line 21 | from .conftest import create_jpg_and_png, create_half_jpg, create_image, IMAGE_WIDTH 22 | 23 | HASH_ALGORITHM = IMAGE_HASH_ALGORITHM['phash'] 24 | 25 | 26 | @pytest.fixture(name='equal_images') 27 | def fixture_equal_images( 28 | top_directory: TemporaryDirectory, group: bool 29 | ) -> Generator[List[Path], None, None]: 30 | images = create_jpg_and_png(top_directory) 31 | if group: 32 | half_file = create_half_jpg(top_directory) 33 | images.append(half_file) 34 | yield images 35 | for file in images: 36 | file.unlink(missing_ok=True) 37 | 38 | 39 | @pytest.fixture(name='many_equal_images') 40 | def fixture_many_equal_images( 41 | top_directory: TemporaryDirectory, num_images: int 42 | ) -> Generator[List[Path], None, None]: 43 | images = [] 44 | for _ in range(num_images): 45 | file_name = Path( 46 | NamedTemporaryFile(dir=top_directory.name, prefix='jpeg_', suffix='.jpg').name 47 | ) 48 | create_image(file_name, IMAGE_WIDTH) 49 | images.append(file_name) 50 | yield images 51 | for file in images: 52 | file.unlink(missing_ok=True) 53 | 54 | 55 | def get_equals(equal_images: List[Path], group: bool) -> List[Tuple[Path, ...]]: 56 | equals = ImagePairFinder.create( 57 | equal_images, HASH_ALGORITHM, options=PairFinderOptions(group=group) 58 | ).get_equal_groups() 59 | assert len(equals) == 1 60 | return equals 61 | 62 | 63 | def paths_ascending_by_size(equals: Results): 64 | return sorted(sum(equals, ()), key=lambda path: (path.stat().st_size, str(path))) 65 | 66 | 67 | def get_biggest(equals: Results) -> Path: 68 | return paths_ascending_by_size(equals)[-1] 69 | 70 | 71 | def get_smallest(equals: Results) -> Path: 72 | return paths_ascending_by_size(equals)[0] 73 | 74 | 75 | def check_relevant_is_deleted_and_others_are_present( 76 | equals: Results, option: str, relevant: Path 77 | ) -> None: 78 | others = set(path[0] for path in equals) - {relevant} 79 | duplicate.execute_actions(equals, parse_command_line(['/', '--on-equal', option])) 80 | assert not relevant.is_file() 81 | for other in others: 82 | assert other.is_file() 83 | 84 | 85 | def check_relevant_is_moved(equals: Results, option: str, relevant: Path) -> None: 86 | with TemporaryDirectory() as destination: 87 | args = parse_command_line(['/', '--on-equal', option, '--move-to', destination]) 88 | duplicate.execute_actions(equals, args) 89 | assert not relevant.is_file() 90 | assert Path(destination, relevant.name).is_file() 91 | 92 | 93 | @pytest.mark.parametrize('option', ['delete-first', 'd1']) 94 | @pytest.mark.parametrize('group', [True, False]) 95 | def test_delete_first(equal_images: List[Path], option: str, group: bool) -> None: 96 | equals = get_equals(equal_images, group) 97 | relevant = equals[0][0] 98 | check_relevant_is_deleted_and_others_are_present(equals, option, relevant) 99 | 100 | 101 | 
@pytest.mark.parametrize('option', ['delete-last', 'dl']) 102 | @pytest.mark.parametrize('group', [True, False]) 103 | def test_delete_last(equal_images: List[Path], option: str, group: bool) -> None: 104 | equals = get_equals(equal_images, group) 105 | relevant = equals[0][-1] 106 | check_relevant_is_deleted_and_others_are_present(equals, option, relevant) 107 | 108 | 109 | @pytest.mark.parametrize('option', ['delete-biggest', 'd>']) 110 | @pytest.mark.parametrize('group', [True, False]) 111 | def test_delete_biggest(equal_images: List[Path], option: str, group: bool) -> None: 112 | equals = get_equals(equal_images, group) 113 | relevant = get_biggest(equals) 114 | check_relevant_is_deleted_and_others_are_present(equals, option, relevant) 115 | 116 | 117 | @pytest.mark.parametrize('option', ['delete-smallest', 'd<']) 118 | @pytest.mark.parametrize('group', [True, False]) 119 | def test_delete_smallest(equal_images: List[Path], option: str, group: bool) -> None: 120 | equals = get_equals(equal_images, group) 121 | relevant = get_smallest(equals) 122 | check_relevant_is_deleted_and_others_are_present(equals, option, relevant) 123 | 124 | 125 | @pytest.mark.parametrize('option', ['move-first', 'm1']) 126 | @pytest.mark.parametrize('group', [True, False]) 127 | def test_move_first(equal_images: List[Path], option: str, group: bool) -> None: 128 | equals = get_equals(equal_images, group) 129 | relevant = equals[0][0] 130 | check_relevant_is_moved(equals, option, relevant) 131 | 132 | 133 | @pytest.mark.parametrize('option', ['move-last', 'ml']) 134 | @pytest.mark.parametrize('group', [True, False]) 135 | def test_move_last(equal_images: List[Path], option: str, group: bool) -> None: 136 | equals = get_equals(equal_images, group) 137 | relevant = equals[0][-1] 138 | check_relevant_is_moved(equals, option, relevant) 139 | 140 | 141 | @pytest.mark.parametrize('option', ['move-biggest', 'm>']) 142 | @pytest.mark.parametrize('group', [True, False]) 143 | def test_move_biggest(equal_images: List[Path], option: str, group: bool) -> None: 144 | equals = get_equals(equal_images, group) 145 | relevant = get_biggest(equals) 146 | check_relevant_is_moved(equals, option, relevant) 147 | 148 | 149 | @pytest.mark.parametrize('option', ['move-smallest', 'm<']) 150 | @pytest.mark.parametrize('group', [True, False]) 151 | def test_move_smallest(equal_images: List[Path], option: str, group: bool) -> None: 152 | equals = get_equals(equal_images, group) 153 | relevant = get_smallest(equals) 154 | check_relevant_is_moved(equals, option, relevant) 155 | 156 | 157 | @pytest.mark.parametrize('option', ['move-first']) 158 | @pytest.mark.parametrize('group', [True, False]) 159 | def test_move_with_recreate_path_recreates_path_under_target_folder( 160 | equal_images: List[Path], option: str, group: bool 161 | ) -> None: 162 | equals = get_equals(equal_images, group) 163 | relevant = equals[0][0] 164 | with TemporaryDirectory() as destination: 165 | args = parse_command_line( 166 | ['/', '--on-equal', option, '--move-to', destination, '--move-recreate-path'] 167 | ) 168 | duplicate.execute_actions(equals, args) 169 | assert not relevant.is_file() 170 | assert (Path(destination) / relevant.relative_to(relevant.anchor)).is_file() 171 | 172 | 173 | def check_command_is_called( 174 | mock_call: Mock, args: Namespace, equal_images: List[Path], group: bool 175 | ) -> None: 176 | equals = get_equals(equal_images, group) 177 | duplicate.execute_actions(equals, args) 178 | mock_call.assert_called_once() 179 | assert args.on_equal 
in mock_call.call_args_list[0].args[0] 180 | 181 | 182 | def check_all_equal_images_are_present(mock_call: Mock, equal_images: List[Path]): 183 | paths_as_set = set(str(path) for path in equal_images) 184 | assert set(mock_call.call_args.args[0]) & paths_as_set == paths_as_set 185 | 186 | 187 | @patch('duplicate_images.methods.call') 188 | @pytest.mark.parametrize('option', ['xv', 'eog']) 189 | @pytest.mark.parametrize('group', [True, False]) 190 | def test_xv(mock_call: Mock, equal_images: List[Path], option: str, group: bool) -> None: 191 | check_command_is_called( 192 | mock_call, parse_command_line(['/', '--on-equal', option]), equal_images, group 193 | ) 194 | assert option in set(mock_call.call_args.args[0]) 195 | check_all_equal_images_are_present(mock_call, equal_images) 196 | 197 | 198 | @patch('builtins.print') 199 | @pytest.mark.parametrize('option', ['print', 'print_inline']) 200 | @pytest.mark.parametrize('group', [True, False]) 201 | def test_print(mock_print: Mock, equal_images: List[Path], option: str, group: bool) -> None: 202 | equals = get_equals(equal_images, group) 203 | duplicate.execute_actions(equals, parse_command_line(['/', '--on-equal', option])) 204 | assert mock_print.call_count == len(equals) 205 | for path in equals[0]: 206 | assert path in mock_print.call_args_list[0].args 207 | 208 | 209 | def test_quote_string(): 210 | quoted = shlex.quote('string with "quotes"') 211 | assert quoted == "'string with \"quotes\"'" 212 | 213 | quoted = shlex.quote('/path/with/one space.jpg') 214 | assert quoted == "'/path/with/one space.jpg'" 215 | 216 | 217 | @patch('builtins.print') 218 | @pytest.mark.parametrize('option', ['quote', 'quote_inline']) 219 | @pytest.mark.parametrize('group', [True, False]) 220 | def test_quote(mock_print: Mock, equal_images: List[Path], option: str, group: bool) -> None: 221 | equals = get_equals(equal_images, group) 222 | duplicate.execute_actions(equals, parse_command_line(['/', '--on-equal', option])) 223 | assert mock_print.call_count == len(equals) 224 | for path in equals[0]: 225 | assert str(path) in mock_print.call_args_list[0].args[0] 226 | assert quote(str(path)) in mock_print.call_args_list[0].args[0] 227 | 228 | 229 | @patch('duplicate_images.methods.shell_exec') 230 | @pytest.mark.parametrize('option', ['exec']) 231 | @pytest.mark.parametrize('exec_cmd', ['ls {1} {2}', 'ls {*}']) 232 | @pytest.mark.parametrize('group', [True, False]) 233 | def test_shell_exec( 234 | mock_call: Mock, equal_images: List[Path], option: str, exec_cmd: str, group: bool 235 | ) -> None: 236 | check_command_is_called( 237 | mock_call, parse_command_line(['/', '--on-equal', option, '--exec', exec_cmd]), 238 | equal_images, group 239 | ) 240 | 241 | 242 | @patch('duplicate_images.methods.call') 243 | @pytest.mark.parametrize('option', ['exec']) 244 | @pytest.mark.parametrize('exec_cmd', ['ls {*}']) 245 | @pytest.mark.parametrize('group', [True, False]) 246 | def test_wildcard_exec_parameter( 247 | mock_call: Mock, equal_images: List[Path], option: str, exec_cmd: str, group: bool 248 | ) -> None: 249 | equals = get_equals(equal_images, group) 250 | args = parse_command_line(['/', '--on-equal', option, '--exec', exec_cmd]) 251 | duplicate.execute_actions(equals, args) 252 | mock_call.assert_called_once() 253 | for path in (str(path) for path in equal_images): 254 | assert path in mock_call.call_args.args[0] 255 | 256 | 257 | @pytest.mark.parametrize('option', ['symlink-smaller']) 258 | @pytest.mark.parametrize('group', [True, False]) 259 | def 
test_symlink_smaller(equal_images: List[Path], option: str, group: bool): 260 | check_symlink(equal_images, option, group, get_biggest) 261 | 262 | 263 | @pytest.mark.parametrize('option', ['symlink-bigger']) 264 | @pytest.mark.parametrize('group', [True, False]) 265 | def test_symlink_bigger(equal_images: List[Path], option: str, group: bool): 266 | check_symlink(equal_images, option, group, get_smallest) 267 | 268 | 269 | def check_symlink( 270 | equal_images: List[Path], option: str, group: bool, get_relevant: Callable[[Results], Path] 271 | ) -> None: 272 | equals = get_equals(equal_images, group) 273 | relevant = get_relevant(equals) 274 | args = parse_command_line(['/', '--on-equal', option]) 275 | duplicate.execute_actions(equals, args) 276 | assert relevant.is_file() 277 | others = set(equal_images) - {relevant} 278 | for path in others: 279 | assert path.is_symlink() 280 | assert path.resolve() == relevant.resolve() 281 | 282 | 283 | @pytest.mark.parametrize('num_images', [7]) 284 | @pytest.mark.parametrize('parallel', [4, 10, 20]) 285 | @pytest.mark.parametrize('sleep_time', [0.005]) 286 | def test_parallel_actions( 287 | many_equal_images: List[Path], num_images: int, parallel: int, sleep_time: float 288 | ) -> None: 289 | equals = ImagePairFinder.create( 290 | many_equal_images, HASH_ALGORITHM, options=PairFinderOptions(group=False) 291 | ).get_equal_groups() 292 | assert len(equals) == factorial(num_images) / (factorial(2) * factorial(num_images - 2)) 293 | 294 | execution_time_single = actions_execution_time( 295 | equals, sleep_time, [] 296 | ) 297 | execution_time_parallel = actions_execution_time( 298 | equals, sleep_time, ['--parallel-actions', str(parallel)] 299 | ) 300 | assert execution_time_parallel < execution_time_single 301 | 302 | 303 | def actions_execution_time(equals: Results, sleep_time: float, extra_args: List[str]) -> timedelta: 304 | args = parse_command_line( 305 | ['.', '--on-equal', 'exec', '--exec', f'sleep {sleep_time}'] + extra_args 306 | ) 307 | start_time = datetime.now() 308 | duplicate.execute_actions(equals, args) 309 | return datetime.now() - start_time 310 | 311 | 312 | @pytest.mark.parametrize('option', ['unknown-option']) 313 | def test_unknown_option(option: str) -> None: 314 | with pytest.raises(SystemExit): 315 | parse_command_line(['/', '--on-equal', option]) 316 | -------------------------------------------------------------------------------- /tests/unit/test_files_in_dirs.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from tempfile import TemporaryDirectory 6 | from typing import Generator 7 | from unittest.mock import patch 8 | 9 | import pytest 10 | 11 | from duplicate_images.duplicate import files_in_dirs, is_image_file 12 | from .conftest import create_image 13 | 14 | NUM_NUMBERED_FILES = 3 15 | TEST_IMAGE_WIDTH = 40 16 | 17 | 18 | @pytest.fixture(name='temp_dir', scope='session') 19 | def top_folder() -> Generator[Path, None, None]: 20 | with TemporaryDirectory() as temp_dir: 21 | yield Path(temp_dir) 22 | 23 | 24 | @pytest.fixture(name='filled_folder', scope='session') 25 | def filled_temp_dir(temp_dir: Path) -> Generator[Path, None, None]: 26 | for i in range(NUM_NUMBERED_FILES): 27 | (temp_dir / str(i)).mkdir() 28 | (temp_dir / str(i) / f'{i}.txt').open('w').close() 29 | yield temp_dir 30 | 31 | 32 | def test_files_in_dirs_finds_created_empty_files(filled_folder: Path) -> None: 33 | found = 
files_in_dirs([filled_folder]) 34 | assert NUM_NUMBERED_FILES == len(found) 35 | assert sorted(found) == sorted(filled_folder.glob('?/?.txt')) 36 | 37 | 38 | def test_files_in_dirs_ignores_empty_files_if_looking_for_images(filled_folder: Path) -> None: 39 | found = files_in_dirs([filled_folder], is_image_file) 40 | assert not found 41 | 42 | 43 | def test_files_in_dirs_ignores_subdir_matching_number_regex(filled_folder: Path) -> None: 44 | found = files_in_dirs([filled_folder], exclude_regexes=[r'/\d$']) 45 | assert not found 46 | 47 | 48 | def test_files_in_dirs_ignores_subdir_matching_explicit_name(filled_folder: Path) -> None: 49 | assert NUM_NUMBERED_FILES >= 1 50 | found = files_in_dirs([filled_folder], exclude_regexes=['/1$']) 51 | assert NUM_NUMBERED_FILES - 1 == len(found) 52 | 53 | 54 | def test_files_in_dirs_ignores_subdirs_matching_multiple_names(filled_folder: Path) -> None: 55 | assert NUM_NUMBERED_FILES >= 2 56 | found = files_in_dirs([filled_folder], exclude_regexes=['/1$', '/2$']) 57 | assert NUM_NUMBERED_FILES - 2 == len(found) 58 | 59 | 60 | def test_files_in_dirs_ignores_patterns_in_file_names(filled_folder: Path) -> None: 61 | assert NUM_NUMBERED_FILES >= 1 62 | found = files_in_dirs([filled_folder], exclude_regexes=['/1.txt$']) 63 | assert NUM_NUMBERED_FILES == len(found) 64 | 65 | 66 | def test_files_in_dirs_with_arbitrary_condition(filled_folder: Path) -> None: 67 | assert NUM_NUMBERED_FILES >= 2 68 | found = files_in_dirs([filled_folder], is_relevant=lambda f: '2.txt' == f.name) 69 | assert 1 == len(found) 70 | assert '2.txt' == found[0].name 71 | 72 | 73 | def test_is_image_file_empty_file(filled_folder: Path) -> None: 74 | assert not is_image_file(filled_folder / '1' / '1.txt') 75 | 76 | 77 | @pytest.mark.parametrize('extension', ['jpg', 'png', 'heif']) 78 | def test_is_image_file_image_file(temp_dir: Path, extension: str) -> None: 79 | create_image(temp_dir / f'1.{extension}', TEST_IMAGE_WIDTH) 80 | assert is_image_file(temp_dir / f'1.{extension}') 81 | 82 | 83 | @pytest.mark.parametrize('extension', ['jpg', 'png', 'heif']) 84 | def test_is_image_file_with_os_failure(temp_dir: Path, extension: str) -> None: 85 | create_image(temp_dir / f'1.{extension}', TEST_IMAGE_WIDTH) 86 | with patch('builtins.open', side_effect=OSError()): 87 | assert not is_image_file(temp_dir / f'1.{extension}') 88 | -------------------------------------------------------------------------------- /tests/unit/test_image_hash_scanner.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from typing import Callable, List 6 | 7 | import pytest 8 | 9 | from duplicate_images.hash_scanner import ImageHashScanner, ParallelImageHashScanner 10 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM, ALGORITHM_DEFAULTS, get_hash_size_kwargs 11 | from .conftest import mock_algorithm, MOCK_IMAGE_HASH_VALUE 12 | 13 | 14 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 15 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 16 | @pytest.mark.parametrize('hash_size', [4, 7, 9]) 17 | def test_different_hash_size_sets_options( 18 | algorithm: str, scanner_class: Callable, hash_size: int 19 | ) -> None: 20 | if algorithm == 'crop_resistant': 21 | return # crop_resistant does not support hash_size 22 | hash_size_kwargs = get_hash_size_kwargs(IMAGE_HASH_ALGORITHM[algorithm], hash_size) 23 | scanner = 
scanner_class([], IMAGE_HASH_ALGORITHM[algorithm], hash_size_kwargs) 24 | assert isinstance(scanner.hash_size_kwargs, dict) 25 | assert len(scanner.hash_size_kwargs) == 1 26 | assert list(scanner.hash_size_kwargs.values())[0] == hash_size 27 | assert list(scanner.hash_size_kwargs.keys())[0] == next(iter( 28 | ALGORITHM_DEFAULTS[IMAGE_HASH_ALGORITHM[algorithm]] 29 | )) 30 | assert scanner.hash_size_kwargs == hash_size_kwargs 31 | 32 | 33 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 34 | def test_hash_values_correct(image_files: List[Path], scanner_class: Callable) -> None: 35 | scanner = scanner_class(image_files, mock_algorithm) 36 | for cache_entry in scanner.precalculate_hashes(): 37 | assert cache_entry[1] == MOCK_IMAGE_HASH_VALUE 38 | -------------------------------------------------------------------------------- /tests/unit/test_image_pair_finder.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from tempfile import TemporaryDirectory 6 | from typing import Any, Callable, List, Tuple 7 | from unittest.mock import Mock 8 | 9 | import pytest 10 | 11 | from duplicate_images.duplicate import files_in_dirs 12 | from duplicate_images.hash_scanner import ImageHashScanner, ParallelImageHashScanner 13 | from duplicate_images.image_pair_finder import ( 14 | DictImagePairFinder, PairFinderOptions, SlowImagePairFinder, group_results_as_pairs 15 | ) 16 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM, get_hash_size_kwargs 17 | from .conftest import is_pair_found, copy_image_file, delete_image_file, named_file 18 | 19 | 20 | def element_in_list_of_tuples(element: Any, tuples: List[Tuple[Any, Any]]) -> bool: 21 | return any(element in tuple for tuple in tuples) 22 | 23 | 24 | def test_get_files(top_directory: TemporaryDirectory, image_files: List[Path]) -> None: 25 | files = files_in_dirs([top_directory.name]) 26 | assert set(files) == set(image_files) 27 | 28 | 29 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 30 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 31 | @pytest.mark.parametrize( 32 | 'finder_class,max_distance', [ 33 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 34 | ] 35 | ) 36 | def test_hashes_equal_for_copied_image( 37 | image_files: List[Path], algorithm: str, 38 | scanner_class: Callable, finder_class: Callable, max_distance: int 39 | ) -> None: 40 | jpeg_file = named_file('jpeg', image_files) 41 | copied_file = copy_image_file(jpeg_file, image_files) 42 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 43 | equals = finder_class( 44 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 45 | ).get_equal_groups() 46 | try: 47 | assert is_pair_found(jpeg_file, copied_file, equals) 48 | finally: 49 | delete_image_file(copied_file, image_files) 50 | 51 | 52 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 53 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 54 | @pytest.mark.parametrize( 55 | 'finder_class,max_distance', [ 56 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 57 | ] 58 | ) 59 | def test_hashes_not_equal_for_noisy_image( 60 | image_files: List[Path], algorithm: str, 61 | scanner_class: Callable, finder_class: Callable, 
max_distance: int 62 | ) -> None: 63 | if algorithm == 'crop_resistant': 64 | return # crop_resistant gives false results for noisy images 65 | subdir_file = named_file('subdir', image_files) 66 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 67 | equals = finder_class( 68 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 69 | ).get_equal_groups() 70 | assert not element_in_list_of_tuples(subdir_file, equals) 71 | 72 | 73 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 74 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 75 | @pytest.mark.parametrize( 76 | 'finder_class,max_distance', [ 77 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 78 | ] 79 | ) 80 | def test_hashes_equal_for_different_image_format( 81 | image_files: List[Path], algorithm: str, 82 | scanner_class: Callable, finder_class: Callable, max_distance: int 83 | ) -> None: 84 | jpeg_file = named_file('jpeg', image_files) 85 | png_file = named_file('png', image_files) 86 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 87 | equals = finder_class( 88 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 89 | ).get_equal_groups() 90 | assert (jpeg_file, png_file) in equals 91 | 92 | 93 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 94 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 95 | @pytest.mark.parametrize( 96 | 'finder_class,max_distance', [ 97 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 98 | ] 99 | ) 100 | def test_hashes_equal_for_scaled_image( 101 | image_files: List[Path], algorithm: str, 102 | scanner_class: Callable, finder_class: Callable, max_distance: int 103 | ) -> None: 104 | jpeg_file = named_file('jpeg', image_files) 105 | half_file = named_file('half', image_files) 106 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 107 | equals = finder_class( 108 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 109 | ).get_equal_groups() 110 | assert (jpeg_file, half_file) in equals 111 | 112 | 113 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 114 | @pytest.mark.parametrize('scanner_class', [ParallelImageHashScanner]) 115 | @pytest.mark.parametrize( 116 | 'finder_class,max_distance', [ 117 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 118 | ] 119 | ) 120 | def test_parallel_filtering_gives_same_results( 121 | image_files: List[Path], algorithm: str, 122 | scanner_class: Callable, finder_class: Callable, max_distance: int 123 | ) -> None: 124 | if algorithm == 'crop_resistant': 125 | return # crop_resistant does not support parallel scanning 126 | jpeg_file = named_file('jpeg', image_files) 127 | png_file = named_file('png', image_files) 128 | half_file = named_file('half', image_files) 129 | heif_file = named_file('heif', image_files) 130 | subdir_file = named_file('subdir', image_files) 131 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 132 | equals = finder_class( 133 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 134 | ).get_equal_groups() 135 | assert is_pair_found(jpeg_file, png_file, equals) 136 | assert is_pair_found(jpeg_file, heif_file, equals) 137 | assert is_pair_found(jpeg_file, half_file, equals) 138 | assert 
is_pair_found(png_file, half_file, equals) 139 | assert is_pair_found(png_file, heif_file, equals) 140 | assert is_pair_found(half_file, heif_file, equals) 141 | assert not is_pair_found(jpeg_file, subdir_file, equals) 142 | assert not is_pair_found(png_file, subdir_file, equals) 143 | assert not is_pair_found(heif_file, subdir_file, equals) 144 | assert not is_pair_found(half_file, subdir_file, equals) 145 | 146 | 147 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 148 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 149 | @pytest.mark.parametrize('finder_class', [DictImagePairFinder, SlowImagePairFinder]) 150 | @pytest.mark.parametrize('hash_size', [4, 16]) 151 | def test_different_hash_size_finds_scaled_images( 152 | image_files: List[Path], algorithm: str, scanner_class: Callable, finder_class: Callable, 153 | hash_size: int 154 | ) -> None: 155 | jpeg_file = named_file('jpeg', image_files) 156 | half_file = named_file('half', image_files) 157 | scanner = scanner_class( 158 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 159 | get_hash_size_kwargs(IMAGE_HASH_ALGORITHM[algorithm], hash_size) 160 | ) 161 | equals = finder_class(scanner, group_results=group_results_as_pairs).get_equal_groups() 162 | assert (jpeg_file, half_file) in equals 163 | 164 | 165 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 166 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 167 | @pytest.mark.parametrize('finder_class', [DictImagePairFinder, SlowImagePairFinder]) 168 | @pytest.mark.parametrize('hash_size', [4, 16]) 169 | def test_smaller_hash_size_finds_similar_images( 170 | image_files: List[Path], algorithm: str, scanner_class: Callable, finder_class: Callable, 171 | hash_size: int 172 | ) -> None: 173 | jpeg_file = named_file('jpeg', image_files) 174 | half_file = named_file('half', image_files) 175 | scanner = scanner_class( 176 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 177 | get_hash_size_kwargs(IMAGE_HASH_ALGORITHM[algorithm], hash_size) 178 | ) 179 | equals = finder_class(scanner, group_results=group_results_as_pairs).get_equal_groups() 180 | assert (jpeg_file, half_file) in equals 181 | 182 | 183 | @pytest.mark.parametrize('max_distance', [1, 2]) 184 | def test_dict_image_finder_fails_for_max_distance_greater_0(max_distance: int) -> None: 185 | with pytest.raises(ValueError): 186 | DictImagePairFinder( 187 | scanner=Mock(), group_results=Mock(), 188 | options=PairFinderOptions(max_distance=max_distance) 189 | ) 190 | -------------------------------------------------------------------------------- /tests/unit/test_imagehash.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | import pytest 8 | 9 | from duplicate_images.function_types import Results 10 | from duplicate_images.image_pair_finder import ImagePairFinder, PairFinderOptions 11 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM 12 | 13 | 14 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 15 | def test_sequential(image_files: List[Path], algorithm: str) -> None: 16 | equals = ImagePairFinder.create( 17 | image_files, IMAGE_HASH_ALGORITHM[algorithm] 18 | ).get_equal_groups() 19 | check_results(equals, algorithm) 20 | 21 | 22 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 23 
| def test_parallel(image_files: List[Path], algorithm: str) -> None: 24 | equals = ImagePairFinder.create( 25 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 26 | options=PairFinderOptions(parallel=True) 27 | ).get_equal_groups() 28 | check_results(equals, algorithm) 29 | 30 | 31 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 32 | def test_max_distance(image_files: List[Path], algorithm: str) -> None: 33 | equals = ImagePairFinder.create( 34 | image_files, IMAGE_HASH_ALGORITHM[algorithm], options=PairFinderOptions(max_distance=1) 35 | ).get_equal_groups() 36 | check_results(equals, algorithm) 37 | 38 | 39 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 40 | def test_explicit_hash_size_works(image_files: List[Path], algorithm: str) -> None: 41 | equals = ImagePairFinder.create( 42 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 43 | options=PairFinderOptions(hash_size=8) 44 | ).get_equal_groups() 45 | check_results(equals, algorithm) 46 | 47 | 48 | def test_bad_hash_size_whash(image_files: List[Path]) -> None: 49 | with pytest.raises(AssertionError): 50 | ImagePairFinder.create( 51 | image_files, IMAGE_HASH_ALGORITHM['whash'], options=PairFinderOptions(hash_size=9) 52 | ).get_equal_groups() 53 | 54 | 55 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 56 | def test_max_distance_parallel(image_files: List[Path], algorithm: str) -> None: 57 | equals = ImagePairFinder.create( 58 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 59 | options=PairFinderOptions(parallel=True, max_distance=1) 60 | ).get_equal_groups() 61 | check_results(equals, algorithm) 62 | 63 | 64 | @pytest.mark.parametrize('parallel', [False, True]) 65 | @pytest.mark.parametrize('max_distance', [0, 1]) 66 | @pytest.mark.parametrize('hash_size', [4, 8]) 67 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 68 | def test_create_with_all_parameters( 69 | image_files: List[Path], parallel: bool, max_distance: int, hash_size: int, algorithm: str 70 | ) -> None: 71 | equals = ImagePairFinder.create( 72 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 73 | options=PairFinderOptions(max_distance=max_distance, hash_size=hash_size, parallel=parallel) 74 | ).get_equal_groups() 75 | check_results(equals, algorithm) 76 | 77 | 78 | def check_results(equals: Results, algorithm: str) -> None: 79 | assert any('jpeg_' in pair[0].name and 'half_' in pair[1].name for pair in equals) 80 | assert any('png_' in pair[0].name and 'half_' in pair[1].name for pair in equals) 81 | assert any('jpeg_' in pair[0].name and 'png_' in pair[1].name for pair in equals) 82 | if algorithm != 'crop_resistant': 83 | assert not any('jpeg_' in pair[0].name and 'subdir_' in pair[1].name for pair in equals), [ 84 | ['/'.join(p[0].parts[3:]), '/'.join(p[1].parts[3:])] 85 | for p in equals if 'subdir_' in p[1].name 86 | ] 87 | -------------------------------------------------------------------------------- /tests/unit/test_parse_commandline.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from os import cpu_count 5 | from pathlib import Path 6 | from tempfile import TemporaryDirectory 7 | 8 | import pytest 9 | from duplicate_images.methods import ACTIONS_ON_EQUALITY, MOVE_ACTIONS 10 | 11 | from duplicate_images.parse_commandline import parse_command_line 12 | 13 | NON_MOVE_ACTIONS = sorted(list(ACTIONS_ON_EQUALITY.keys() - set(MOVE_ACTIONS))) 14 | 
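# Sentinel values for the config-file tests below. Each value is deliberately
# bogus ('mock' strings, -1 numbers) so the assertions can tell that a parsed
# option really came from the file rather than from an argparse default. The
# config_file fixture writes them out as INI, roughly:
#
#     [Defaults]
#     exclude_dir = /tmp/mock
#     algorithm = mock
#     ...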
MOCK_CONFIG_VALUES = { 15 | 'exclude_dir': '/tmp/mock', 16 | 'algorithm': 'mock', 17 | 'max_distance': -1, 18 | 'hash_size': -1, 19 | 'on_equal': 'mock', 20 | 'parallel': -1, 21 | 'parallel_actions': -1, 22 | 'hash_db': '/tmp/mock.json', 23 | 'max_image_pixels': -1 24 | } 25 | 26 | 27 | def test_root_dir_required() -> None: 28 | with pytest.raises(SystemExit): 29 | parse_command_line([]) 30 | 31 | 32 | def test_one_root_dir_exists() -> None: 33 | args = parse_command_line(['.']) 34 | assert args.root_directory 35 | 36 | 37 | def test_one_root_dir_recognized() -> None: 38 | args = parse_command_line(['.']) 39 | assert len(args.root_directory) == 1 40 | 41 | 42 | def test_one_root_dir_parsed() -> None: 43 | args = parse_command_line(['.']) 44 | assert args.root_directory == ['.'] 45 | 46 | 47 | def test_two_root_dirs_recognized() -> None: 48 | args = parse_command_line(['.', '/home']) 49 | assert len(args.root_directory) == 2 50 | 51 | 52 | def test_two_root_dirs_parsed() -> None: 53 | args = parse_command_line(['.', '/home']) 54 | assert args.root_directory == ['.', '/home'] 55 | 56 | 57 | def test_parallel_unspecified() -> None: 58 | args = parse_command_line(['.']) 59 | assert args.parallel is None 60 | 61 | 62 | def test_parallel_default_arg() -> None: 63 | args = parse_command_line(['.', '--parallel']) 64 | assert args.parallel == cpu_count() 65 | 66 | 67 | @pytest.mark.parametrize('parallel', ['1', '2', '4', '8', '16']) 68 | def test_parallel_explicit_arg(parallel) -> None: 69 | args = parse_command_line(['.', '--parallel', parallel]) 70 | assert args.parallel == int(parallel) 71 | 72 | 73 | def test_parallel_actions_unspecified() -> None: 74 | args = parse_command_line(['.']) 75 | assert args.parallel_actions is None 76 | 77 | 78 | def test_parallel_actions_default_arg() -> None: 79 | args = parse_command_line(['.', '--parallel-actions']) 80 | assert args.parallel_actions == cpu_count() 81 | 82 | 83 | @pytest.mark.parametrize('parallel', ['1', '2', '4', '8', '16']) 84 | def test_parallel_actions_explicit_arg(parallel) -> None: 85 | args = parse_command_line(['.', '--parallel-actions', parallel]) 86 | assert args.parallel_actions == int(parallel) 87 | 88 | 89 | def test_exclude_dir_unspecified() -> None: 90 | args = parse_command_line(['.']) 91 | assert args.exclude_dir is None 92 | 93 | 94 | def test_one_exclude_dir() -> None: 95 | args = parse_command_line(['.', '--exclude-dir', 'foo']) 96 | assert args.exclude_dir == ['foo'] 97 | 98 | 99 | def test_two_exclude_dirs() -> None: 100 | args = parse_command_line(['.', '--exclude-dir', 'foo', 'bar']) 101 | assert args.exclude_dir == ['foo', 'bar'] 102 | 103 | 104 | def test_one_exclude_dir_with_space() -> None: 105 | args = parse_command_line(['.', '--exclude-dir', 'foo bar']) 106 | assert args.exclude_dir == ['foo bar'] 107 | 108 | 109 | def test_exec_fails_without_on_equal() -> None: 110 | with pytest.raises(SystemExit): 111 | parse_command_line(['.', '--exec', 'command']) 112 | 113 | 114 | @pytest.mark.parametrize('option', MOVE_ACTIONS) 115 | def test_move_fails_without_target_folder_specified(option: str) -> None: 116 | with pytest.raises(SystemExit): 117 | parse_command_line(['/', '--on-equal', option]) 118 | 119 | 120 | @pytest.mark.parametrize('option', NON_MOVE_ACTIONS) 121 | def test_non_move_action_fails_with_target_folder_specified(option: str) -> None: 122 | with pytest.raises(SystemExit): 123 | parse_command_line(['/', '--on-equal', option, '--move-to', '/']) 124 | 125 | 126 | @pytest.mark.parametrize('option', 
NON_MOVE_ACTIONS) 127 | def test_non_move_action_fails_with_recreate_path_specified(option: str) -> None: 128 | with pytest.raises(SystemExit): 129 | parse_command_line(['/', '--on-equal', option, '--move-recreate-path']) 130 | 131 | 132 | @pytest.fixture(name='config_file', scope='session') 133 | def fixture_config_file(top_directory: TemporaryDirectory) -> Path: 134 | config_file = Path(top_directory.name) / 'duplicate_images.cfg' 135 | with config_file.open('w') as file: 136 | file.write('[Defaults]\n') 137 | for key, value in MOCK_CONFIG_VALUES.items(): 138 | file.write(f'{key} = {value}\n') 139 | return config_file 140 | 141 | 142 | def test_config_file_is_read(config_file: Path) -> None: 143 | args = parse_command_line(['--config-file', str(config_file), '/tmp']) 144 | assert args.root_directory == ['/tmp'] 145 | 146 | 147 | def test_read_options_from_config_file(config_file: Path) -> None: 148 | args = parse_command_line(['--config-file', str(config_file), '/tmp']) 149 | assert args.exclude_dir == MOCK_CONFIG_VALUES['exclude_dir'] 150 | 151 | 152 | @pytest.mark.parametrize('option', MOCK_CONFIG_VALUES.keys()) 153 | def test_all_options_are_read_from_config_file( 154 | config_file: Path, option: str 155 | ) -> None: 156 | args = parse_command_line(['--config-file', str(config_file), '/tmp']) 157 | assert vars(args)[option] == MOCK_CONFIG_VALUES[option] 158 | -------------------------------------------------------------------------------- /tests/unit/test_persistent_storage.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | import json 5 | import logging 6 | import pickle 7 | from itertools import combinations 8 | from pathlib import Path 9 | from tempfile import TemporaryDirectory 10 | from time import sleep 11 | from typing import List 12 | 13 | import pytest 14 | 15 | from duplicate_images.duplicate import files_in_dirs, is_image_file 16 | from duplicate_images.function_types import Cache 17 | from duplicate_images.image_pair_finder import ImagePairFinder 18 | from duplicate_images.pair_finder_options import PairFinderOptions 19 | from duplicate_images.hash_store import ( 20 | PickleHashStore, JSONHashStore, FileHashStore, HashStore, NullHashStore 21 | ) 22 | from .conftest import MOCK_IMAGE_HASH_VALUE, mock_algorithm, create_jpg_and_png 23 | 24 | DEFAULT_ALGORITHM = 'phash' 25 | DEFAULT_HASH_SIZE = {'hash_size': 8} 26 | DEFAULT_METADATA = {'algorithm': DEFAULT_ALGORITHM, **DEFAULT_HASH_SIZE} 27 | 28 | 29 | class MockHashStore(FileHashStore): # pylint: disable=abstract-method 30 | def __init__(self, values: Cache) -> None: # pylint: disable=super-init-not-called 31 | self.values = values 32 | 33 | 34 | def test_empty_hash_store_calculates_hash_values( 35 | top_directory: TemporaryDirectory, image_files: List[Path], 36 | reset_call_count # pylint: disable=unused-argument 37 | ) -> None: 38 | finder = generate_pair_finder(top_directory, NullHashStore()) 39 | assert mock_algorithm.call_count > 0 40 | check_correct_results(finder, image_files) 41 | 42 | 43 | def test_filled_hash_store_does_not_calculate_hash_values( 44 | top_directory: TemporaryDirectory, image_files: List[Path], 45 | reset_call_count # pylint: disable=unused-argument 46 | ) -> None: 47 | hash_store = MockHashStore({path: MOCK_IMAGE_HASH_VALUE for path in image_files}) 48 | generate_pair_finder(top_directory, hash_store) 49 | assert mock_algorithm.call_count == 0 50 | 51 | 52 | def 
test_empty_hash_store_is_filled( 53 | top_directory: TemporaryDirectory, reset_call_count # pylint: disable=unused-argument 54 | ) -> None: 55 | finder = generate_pair_finder(top_directory, NullHashStore()) 56 | original_call_number = mock_algorithm.call_count 57 | finder.get_equal_groups() 58 | assert mock_algorithm.call_count == original_call_number 59 | 60 | 61 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 62 | def test_hash_store_is_written( 63 | top_directory: TemporaryDirectory, hash_store_path: Path 64 | ) -> None: 65 | create_verified_hash_store(top_directory, hash_store_path) 66 | assert hash_store_path.is_file() 67 | 68 | 69 | @pytest.mark.parametrize('file_type', ['pickle']) 70 | def test_pickle_file_contains_correct_hashes( 71 | top_directory: TemporaryDirectory, image_files: List[Path], hash_store_path 72 | ) -> None: 73 | create_verified_hash_store(top_directory, hash_store_path) 74 | with hash_store_path.open('rb') as pickle_file: 75 | written_hashes = pickle.load(pickle_file)[0] 76 | for file_name in image_files: 77 | assert file_name in written_hashes 78 | assert written_hashes[file_name] == MOCK_IMAGE_HASH_VALUE 79 | 80 | 81 | @pytest.mark.parametrize('file_type', ['json']) 82 | def test_json_file_contains_correct_hashes( 83 | top_directory: TemporaryDirectory, image_files: List[Path], hash_store_path 84 | ) -> None: 85 | create_verified_hash_store(top_directory, hash_store_path) 86 | with hash_store_path.open('r') as json_file: 87 | written_hashes = json.load(json_file)[0] 88 | for file_name in image_files: 89 | assert str(file_name) in written_hashes 90 | assert written_hashes[str(file_name)] == str(MOCK_IMAGE_HASH_VALUE) 91 | 92 | 93 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 94 | def test_hash_store_load_loads( 95 | top_directory: TemporaryDirectory, image_files: List[Path], hash_store_path 96 | ) -> None: 97 | create_verified_hash_store(top_directory, hash_store_path) 98 | hash_store_class = PickleHashStore if hash_store_path.suffix == '.pickle' else JSONHashStore 99 | hash_store = hash_store_class(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) 100 | hash_store.load() 101 | written_hashes = hash_store.values 102 | for file_name in image_files: 103 | assert str(file_name) in map(str, written_hashes.keys()) 104 | assert str(written_hashes[file_name]) == str(MOCK_IMAGE_HASH_VALUE) 105 | 106 | 107 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 108 | def test_backup_file_created( 109 | top_directory: TemporaryDirectory, hash_store_path: Path 110 | ) -> None: 111 | create_verified_hash_store(top_directory, hash_store_path) 112 | assert not hash_store_path.with_suffix('.bak').is_file() 113 | create_verified_hash_store(top_directory, hash_store_path) 114 | assert hash_store_path.with_suffix('.bak').is_file() 115 | 116 | 117 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 118 | def test_existing_backup_file_does_not_lead_to_error( 119 | top_directory: TemporaryDirectory, hash_store_path: Path 120 | ) -> None: 121 | create_verified_hash_store(top_directory, hash_store_path) # create hash store 122 | create_verified_hash_store(top_directory, hash_store_path) # create backup file 123 | create_verified_hash_store(top_directory, hash_store_path) # check it works still 124 | 125 | 126 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 127 | def test_checked_load_sets_values(top_directory: TemporaryDirectory, hash_store_path: Path) -> None: 128 | create_verified_hash_store(top_directory, hash_store_path) 129 | 
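    # Pick the reader class matching the store's serialization format; these tests
    # assume the format follows purely from the file suffix ('.pickle' vs. '.json').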
hash_store_class = PickleHashStore if hash_store_path.suffix == '.pickle' else JSONHashStore 130 | hash_store = hash_store_class(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) 131 | hash_store.load() 132 | logging.debug(image_list(top_directory)) 133 | assert hash_store.values == {path: MOCK_IMAGE_HASH_VALUE for path in image_list(top_directory)} 134 | 135 | 136 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 137 | def test_checked_load_sets_metadata( 138 | top_directory: TemporaryDirectory, hash_store_path: Path 139 | ) -> None: 140 | create_verified_hash_store(top_directory, hash_store_path) 141 | hash_store_class = PickleHashStore if hash_store_path.suffix == '.pickle' else JSONHashStore 142 | hash_store = hash_store_class(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) 143 | hash_store.load() 144 | assert hash_store.metadata() == DEFAULT_METADATA 145 | 146 | 147 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 148 | def test_hash_store_not_written_if_not_changed( 149 | top_directory: TemporaryDirectory, hash_store_path: Path 150 | ) -> None: 151 | create_verified_hash_store(top_directory, hash_store_path) 152 | assert hash_store_path.is_file() 153 | creation_time = hash_store_path.stat().st_ctime 154 | scan_images_with_hash_store(top_directory, hash_store_path) 155 | assert hash_store_path.stat().st_ctime == creation_time 156 | assert hash_store_path.stat().st_mtime == creation_time 157 | 158 | 159 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 160 | def test_hash_store_is_accessed_even_if_not_changed( 161 | top_directory: TemporaryDirectory, hash_store_path: Path 162 | ) -> None: 163 | create_verified_hash_store(top_directory, hash_store_path) 164 | assert hash_store_path.is_file() 165 | sleep(0.01) # ensure the access time is different 166 | creation_time = hash_store_path.stat().st_ctime 167 | scan_images_with_hash_store(top_directory, hash_store_path) 168 | assert hash_store_path.stat().st_atime > creation_time 169 | 170 | 171 | def image_list(top_directory: TemporaryDirectory) -> List[Path]: 172 | return sorted(files_in_dirs([top_directory.name], is_relevant=is_image_file)) 173 | 174 | 175 | def generate_pair_finder( 176 | top_directory: TemporaryDirectory, hash_store: HashStore 177 | ) -> ImagePairFinder: 178 | return ImagePairFinder.create( 179 | image_list(top_directory), mock_algorithm, options=PairFinderOptions(slow=True), 180 | hash_store=hash_store 181 | ) 182 | 183 | 184 | def create_verified_hash_store(top_directory: TemporaryDirectory, store_path: Path) -> None: 185 | create_jpg_and_png(top_directory) 186 | scan_images_with_hash_store(top_directory, store_path) 187 | 188 | 189 | def scan_images_with_hash_store(top_directory: TemporaryDirectory, store_path: Path) -> None: 190 | with FileHashStore.create(store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) as hash_store: 191 | finder = generate_pair_finder(top_directory, hash_store) 192 | finder.get_equal_groups() 193 | 194 | 195 | def check_correct_results(finder: ImagePairFinder, images: List[Path]) -> None: 196 | pairs = finder.get_equal_groups() 197 | expected_pairs = combinations(images, 2) 198 | found_pairs_string = f'{[(p[0].name, p[1].name) for p in pairs]}' 199 | for pair in expected_pairs: 200 | assert pair in pairs or (pair[1], pair[0]) in pairs, \ 201 | f'{pair[0].name}, {pair[1].name} not in {found_pairs_string}' 202 | --------------------------------------------------------------------------------
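A minimal sketch of the library workflow these unit tests exercise, assembled from the imports and calls above; treat the exact signatures as assumptions read off the tests, not as documented API:

    from pathlib import Path

    from duplicate_images.image_pair_finder import ImagePairFinder
    from duplicate_images.methods import IMAGE_HASH_ALGORITHM
    from duplicate_images.pair_finder_options import PairFinderOptions

    # hypothetical input; any list of image paths works
    images = [Path('a.jpg'), Path('b.jpg'), Path('c.png')]
    finder = ImagePairFinder.create(
        images, IMAGE_HASH_ALGORITHM['phash'],
        options=PairFinderOptions(max_distance=1, parallel=True)
    )
    groups = finder.get_equal_groups()  # tuples of paths considered duplicates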