├── .flake8 ├── .git_hooks └── pre-push ├── .github └── workflows │ ├── codacy.yml │ ├── codeql.yml │ └── dependency-review.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pylintrc ├── CHANGELOG.md ├── LICENSE ├── README.md ├── duplicate_images ├── __init__.py ├── common.py ├── duplicate.py ├── function_types.py ├── hash_scanner │ ├── __init__.py │ └── image_hash_scanner.py ├── hash_store.py ├── image_pair_finder.py ├── log.py ├── methods.py ├── pair_finder_options.py ├── parse_commandline.py └── progress_bar_manager.py ├── mypy.ini ├── poetry.lock ├── pyproject.toml └── tests ├── integration ├── __init__.py ├── conftest.py ├── data │ ├── broken │ │ ├── 47ff(1).jpg │ │ └── 47ff(2).jpg │ ├── different │ │ └── pair1 │ │ │ ├── 20221026_124702.jpg │ │ │ └── 20221026_124757.jpg │ ├── equal_but_binary_different.json │ ├── equal_but_binary_different.pickle │ ├── equal_but_binary_different │ │ ├── heic_bit_depth │ │ │ ├── 20221026_124702_10bit.heic │ │ │ └── 20221026_124702_8bit.heic │ │ ├── heic_lossless_vs_lossy │ │ │ ├── 20221026_124702_lossless.heic │ │ │ └── 20221026_124702_q85.heic │ │ ├── jpeg_quality │ │ │ ├── 20221026_124702_q94.jpg │ │ │ └── 20221026_124702_q95.jpg │ │ ├── jpeg_vs_heic │ │ │ ├── 20221026_124702.heic │ │ │ └── 20221026_124702.jpg │ │ └── shrunk10% │ │ │ ├── 20221026_124702.jpg │ │ │ └── 20221026_124702_90%.jpg │ ├── exactly_equal │ │ ├── heif │ │ │ ├── test1.heif │ │ │ └── test2.heif │ │ ├── pair1 │ │ │ ├── 20221026_124702_90%-2.jpg │ │ │ └── 20221026_124702_90%.jpg │ │ ├── pair2 │ │ │ ├── 20220312_124816-2.jpg │ │ │ └── 20220312_124816.jpg │ │ ├── pair3 │ │ │ ├── IMAG0015_small-2.png │ │ │ └── IMAG0015_small.png │ │ └── webp │ │ │ ├── test1.webp │ │ │ └── test2.webp │ ├── garbage.txt │ ├── huge │ │ ├── huge.png │ │ └── huge2.png │ ├── is_image_file │ │ ├── is_image │ │ │ ├── test.heif │ │ │ ├── test.jpg │ │ │ ├── test.png │ │ │ ├── test.tiff │ │ │ └── test.webp │ │ └── is_not_image │ │ │ ├── test.mp3 │ │ │ ├── test.ogg │ │ │ └── test.txt │ ├── jpeg_artifacts │ │ ├── jpeg_10 │ │ │ ├── 20221026_124702_q10.jpg │ │ │ └── 20221026_124702_q95.jpg │ │ ├── jpeg_25 │ │ │ ├── 20221026_124702_q25.jpg │ │ │ └── 20221026_124702_q95.jpg │ │ ├── jpeg_50 │ │ │ ├── 20221026_124702_q50.jpg │ │ │ └── 20221026_124702_q95.jpg │ │ └── jpeg_75 │ │ │ ├── 20221026_124702_q75.jpg │ │ │ └── 20221026_124702_q95.jpg │ └── similar │ │ ├── many │ │ ├── 20220218_135622.jpg │ │ ├── 20220218_135658.jpg │ │ └── 20220218_135708.jpg │ │ ├── pair1 │ │ ├── 20220806_214449.jpg │ │ └── 20220806_214600.jpg │ │ └── pair2 │ │ ├── 20220329_210118.jpg │ │ └── 20220329_210123.jpg ├── test_is_image_file.py ├── test_persistent_storage.py └── test_real_images.py └── unit ├── __init__.py ├── conftest.py ├── test_actions.py ├── test_files_in_dirs.py ├── test_image_hash_scanner.py ├── test_image_pair_finder.py ├── test_imagehash.py ├── test_parse_commandline.py └── test_persistent_storage.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | ignore = F401,S101 4 | -------------------------------------------------------------------------------- /.git_hooks/pre-push: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Run the test suite before pushing to remote, so the GitLab CI fails less 4 | # often 5 | # This hook is called with the following parameters: 6 | # 7 | # $1 -- Name of the remote to which the push is being done 8 | # $2 -- URL to which the push is being done 9 | # 10 | # If pushing 
without using a named remote those arguments will be equal. 11 | # 12 | # If the log message starts with "WIP:" (work in progress) the push is allowed 13 | # even if the tests fail, since it might be necessary to push to share between 14 | # work environments. 15 | 16 | GITLAB_PROJECT_ID=6643206 17 | CI_LINT_OUTPUT=/tmp/gitlab-ci-lint.json 18 | RED_TEXT='\e[38;5;196m' 19 | GREEN_TEXT='\e[38;5;46m' 20 | RESET_TEXT='\e[0m' 21 | STATUS=0 22 | 23 | poetry run pytest -n auto tests/unit || STATUS=1 24 | poetry run pytest -n auto tests/integration || STATUS=1 25 | poetry run mypy duplicate_images tests || STATUS=1 26 | poetry run flake8 duplicate_images tests || STATUS=1 27 | poetry run pylint duplicate_images tests || STATUS=1 28 | poetry run bandit -r duplicate_images -q || STATUS=1 29 | 30 | # lint GitLab CI (nod to https://stackoverflow.com/questions/49090675/how-can-i-test-gitlab-ci-yml#68723161 ) 31 | if [ "$GITLAB_ACCESS_TOKEN" != "" ]; then 32 | rm -f "$CI_LINT_OUTPUT" 33 | jq --null-input --arg yaml "$(cat .gitlab-ci.yml)" '{ content: $yaml }' | \ 34 | curl -s "https://gitlab.com/api/v4/projects/${GITLAB_PROJECT_ID}/ci/lint" \ 35 | --header 'Content-Type: application/json' \ 36 | --header "PRIVATE-TOKEN: $GITLAB_ACCESS_TOKEN" \ 37 | --data @- > "$CI_LINT_OUTPUT" 38 | VALID=$(jq -r .valid < "$CI_LINT_OUTPUT") 39 | if [ "$VALID" = "true" ]; then 40 | echo "GitLab CI valid: ${GREEN_TEXT}$VALID${RESET_TEXT}" 41 | else 42 | echo "GitLab CI valid: ${RED_TEXT}$VALID${RESET_TEXT}" 43 | echo "errors: $(jq .errors < "$CI_LINT_OUTPUT")" 44 | echo "warnings: $(jq .warnings < "$CI_LINT_OUTPUT")" 45 | fi 46 | #rm -f "$CI_LINT_OUTPUT" 47 | else 48 | echo "\$GITLAB_ACCESS_TOKEN not set" 49 | fi 50 | 51 | # check Changelog is updated 52 | VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 53 | if test -z "$VERSION" 54 | then echo "${RED_TEXT}version not found in pyproject.toml${RESET_TEXT}"; STATUS=1 55 | else 56 | echo "${GREEN_TEXT}Version $VERSION${RESET_TEXT}" 57 | if ! grep -q "$VERSION" CHANGELOG.md 58 | then echo "${RED_TEXT}$VERSION not found in changelog${RESET_TEXT}"; STATUS=1 59 | else echo "${GREEN_TEXT}$VERSION found in CHANGELOG.md, cool${RESET_TEXT}" 60 | fi 61 | if ! fgrep -q "...$VERSION" CHANGELOG.md 62 | then echo "${RED_TEXT}link to $VERSION diff not found in changelog${RESET_TEXT}"; STATUS=1 63 | else echo "${GREEN_TEXT}link to $VERSION diff found in CHANGELOG.md, cool${RESET_TEXT}" 64 | fi 65 | if ! fgrep -q "## [$VERSION] - $(date +%Y-%m-%d)" CHANGELOG.md 66 | then echo "${RED_TEXT}date not set correctly in changelog${RESET_TEXT}"; STATUS=1 67 | else echo "${GREEN_TEXT}date in CHANGELOG.md is $(date +%Y-%m-%d), cool${RESET_TEXT}" 68 | fi 69 | fi 70 | 71 | if [ $STATUS -gt 0 ]; then 72 | commitmsg=$(git log --oneline | head -n 1 | cut -d' ' -f 2-) 73 | if echo "$commitmsg" | grep '^WIP:'; then 74 | echo >&2 "Found WIP commit, pushing in spite of failed test suite" 75 | STATUS=0 76 | fi 77 | fi 78 | if [ $STATUS -gt 0 ] 79 | then echo "Status: ${RED_TEXT}${STATUS}${RESET_TEXT}" 80 | else echo "Status: ${GREEN_TEXT}${STATUS}${RESET_TEXT}" 81 | fi 82 | exit $STATUS 83 | -------------------------------------------------------------------------------- /.github/workflows/codacy.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 
5 | 6 | # This workflow checks out code, performs a Codacy security scan 7 | # and integrates the results with the 8 | # GitHub Advanced Security code scanning feature. For more information on 9 | # the Codacy security scan action usage and parameters, see 10 | # https://github.com/codacy/codacy-analysis-cli-action. 11 | # For more information on Codacy Analysis CLI in general, see 12 | # https://github.com/codacy/codacy-analysis-cli. 13 | 14 | name: Codacy Security Scan 15 | 16 | on: 17 | push: 18 | branches: [ "master" ] 19 | pull_request: 20 | # The branches below must be a subset of the branches above 21 | branches: [ "master" ] 22 | schedule: 23 | - cron: '21 5 * * 6' 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | codacy-security-scan: 30 | permissions: 31 | contents: read # for actions/checkout to fetch code 32 | security-events: write # for github/codeql-action/upload-sarif to upload SARIF results 33 | actions: read # only required for a private repository by github/codeql-action/upload-sarif to get the Action run status 34 | name: Codacy Security Scan 35 | runs-on: ubuntu-latest 36 | steps: 37 | # Checkout the repository to the GitHub Actions runner 38 | - name: Checkout code 39 | uses: actions/checkout@v3 40 | 41 | # Execute Codacy Analysis CLI and generate a SARIF output with the security issues identified during the analysis 42 | - name: Run Codacy Analysis CLI 43 | uses: codacy/codacy-analysis-cli-action@d840f886c4bd4edc059706d09c6a1586111c540b 44 | with: 45 | # Check https://github.com/codacy/codacy-analysis-cli#project-token to get your project token from your Codacy repository 46 | # You can also omit the token and run the tools that support default configurations 47 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} 48 | verbose: true 49 | output: results.sarif 50 | format: sarif 51 | # Adjust severity of non-security issues 52 | gh-code-scanning-compat: true 53 | # Force 0 exit code to allow SARIF file generation 54 | # This will handover control about PR rejection to the GitHub side 55 | max-allowed-issues: 2147483647 56 | 57 | # Upload the SARIF file generated in the previous step 58 | - name: Upload SARIF results file 59 | uses: github/codeql-action/upload-sarif@v2 60 | with: 61 | sarif_file: results.sarif 62 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "master" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "master" ] 20 | schedule: 21 | - cron: '27 1 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | with: 74 | category: "/language:${{matrix.language}}" 75 | -------------------------------------------------------------------------------- /.github/workflows/dependency-review.yml: -------------------------------------------------------------------------------- 1 | # Dependency Review Action 2 | # 3 | # This Action will scan dependency manifest files that change as part of a Pull Request, surfacing known-vulnerable versions of the packages declared or updated in the PR. Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable packages will be blocked from merging. 
4 | # 5 | # Source repository: https://github.com/actions/dependency-review-action 6 | # Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement 7 | name: 'Dependency Review' 8 | on: 9 | push: 10 | branches: [ "master" ] 11 | pull_request: 12 | # The branches below must be a subset of the branches above 13 | branches: [ "master" ] 14 | schedule: 15 | - cron: '21 5 * * 6' 16 | 17 | permissions: 18 | contents: read 19 | 20 | jobs: 21 | dependency-review: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: 'Checkout Repository' 25 | uses: actions/checkout@v3 26 | - name: 'Dependency Review' 27 | uses: actions/dependency-review-action@v3 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | .mypy_cache 4 | .code-quality.json 5 | *.egg-info 6 | dist 7 | *.db 8 | *.pickle 9 | *.bak 10 | fil-result 11 | .cache 12 | .local 13 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - test 3 | - publish 4 | 5 | include: 6 | - template: Code-Quality.gitlab-ci.yml 7 | - template: Security/SAST.gitlab-ci.yml 8 | 9 | .test: 10 | parallel: 11 | matrix: 12 | - PY_VERSION: [ "3.9", "3.10", "3.11", "3.12" ] 13 | stage: test 14 | image: python:$PY_VERSION 15 | rules: 16 | - if: '$CI_PIPELINE_SOURCE == "push" || $CI_PIPELINE_SOURCE == "merge_request_event"' 17 | before_script: 18 | - test $(echo $PY_VERSION | cut -d . -f 2) -gt 12 && (apt update && apt install -y gfortran libopenblas-dev) 19 | - pip install --root-user-action ignore -q poetry 20 | - poetry install 21 | 22 | unit tests: 23 | extends: .test 24 | script: 25 | - poetry run pytest --junitxml=pytest.xml tests/unit 26 | artifacts: 27 | reports: 28 | junit: 29 | - pytest.xml 30 | 31 | integration tests: 32 | extends: .test 33 | script: 34 | - poetry run pytest --junitxml=pytest.xml tests/integration 35 | artifacts: 36 | reports: 37 | junit: 38 | - pytest.xml 39 | 40 | mypy: 41 | extends: .test 42 | script: 43 | - poetry run mypy duplicate_images tests 44 | 45 | flake8: 46 | extends: .test 47 | script: 48 | - poetry run flake8 duplicate_images tests 49 | 50 | pylint: 51 | extends: .test 52 | script: 53 | - test $(echo $PY_VERSION | cut -d . -f 2) -ge 12 && PYLINT_EXTRA_ARGS="--disable=inconsistent-quotes" 54 | - poetry run pylint $PYLINT_EXTRA_ARGS duplicate_images tests 55 | 56 | bandit: 57 | extends: .test 58 | parallel: 59 | matrix: 60 | - PY_VERSION: [ "3.12" ] 61 | script: 62 | - poetry run bandit -r duplicate_images 63 | 64 | ChangelogIsUpdated: 65 | stage: test 66 | image: alpine:latest 67 | rules: 68 | - if: "$CI_MERGE_REQUEST_ID" 69 | - if: $CI_COMMIT_BRANCH == "master" 70 | script: 71 | - VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 72 | - test -n "$VERSION" 73 | - fgrep "## [$VERSION]" CHANGELOG.md 74 | - fgrep "...$VERSION" CHANGELOG.md 75 | - fgrep "## [$VERSION] - $(date +%Y-%m-%d)" CHANGELOG.md 76 | 77 | 78 | RunAndCheckResults: 79 | extends: .test 80 | variables: 81 | IMAGE_DIR: tests/integration/data/equal_but_binary_different 82 | script: 83 | - NUM_FILES=$(find $IMAGE_DIR -type f | wc -l) 84 | - EXPECTED_PAIRS=$((NUM_FILES*(NUM_FILES-1)/2)) # should be if all files matched... 
85 | - EXPECTED_PAIRS=29 # ...turns out not all files match with each other though 86 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR | wc -l) 87 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 88 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR --progress --quiet | wc -l) 89 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 90 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR --algorithm ahash --quiet | wc -l) 91 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 92 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR --max-distance 1 --quiet | wc -l) 93 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 94 | - NUM_PAIRS=$(poetry run find-dups $IMAGE_DIR --hash-size 8 --quiet | wc -l) 95 | - test $NUM_PAIRS -eq $EXPECTED_PAIRS 96 | 97 | RunWithArgs: 98 | extends: .test 99 | image: python:3.12 100 | variables: 101 | IMAGE_DIR: tests/integration/data 102 | HASH_DB: test.pickle 103 | parallel: 104 | matrix: 105 | - ON_EQUAL: [print, quote_inline, none, d<] 106 | ALGORITHM: [ahash, colorhash] 107 | MODE: ["", --slow, --parallel] 108 | script: 109 | # ensure the script runs without any error with the given options 110 | - poetry run find-dups 111 | ${IMAGE_DIR} --hash-db ${HASH_DB} --progress 112 | --algorithm ${ALGORITHM} 113 | --on-equal ${ON_EQUAL} 114 | $MODE 115 | # ensure the hash cache file is written 116 | - test -f ${HASH_DB} 117 | - ls -l ${HASH_DB} 118 | 119 | RunWithArgsExec: 120 | extends: RunWithArgs 121 | parallel: 122 | matrix: 123 | - ON_EQUAL: [exec] 124 | ALGORITHM: [ahash] 125 | MODE: ["", --slow, --parallel] 126 | script: 127 | # ensure the script runs without any error with the given options 128 | - poetry run find-dups 129 | ${IMAGE_DIR} --hash-db ${HASH_DB} --progress --exec "ls -l {1} {2}" 130 | --algorithm ${ALGORITHM} 131 | --on-equal ${ON_EQUAL} 132 | $MODE 133 | # ensure the hash cache file is written 134 | - test -f ${HASH_DB} 135 | - ls -l ${HASH_DB} 136 | 137 | RunWithArgsExecFailure: 138 | extends: RunWithArgs 139 | parallel: 140 | matrix: 141 | - ON_EQUAL: [ "" ] 142 | ALGORITHM: [ "" ] 143 | MODE: [ "" ] 144 | script: 145 | # ensure the script fails when given --exec without --on-equal exec 146 | - poetry run find-dups 147 | ${IMAGE_DIR} --hash-db ${HASH_DB} --progress --exec "ls -l {1} {2}" && exit 1 148 | # ensure the script fails when given --on-equal exec without --exec 149 | - poetry run find-dups 150 | ${IMAGE_DIR} --hash-db ${HASH_DB} --progress --on-equal exec && exit 1 151 | - exit 0 152 | 153 | TagIsNew: 154 | stage: test 155 | image: alpine:latest 156 | rules: 157 | - if: "$CI_MERGE_REQUEST_ID" 158 | - if: $CI_COMMIT_BRANCH == "master" 159 | before_script: 160 | - apk update 161 | - apk add git 162 | script: 163 | - VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 164 | - test -n "$VERSION" 165 | - git tag | ( ! 
grep "^${VERSION}\$" ) 166 | 167 | CreateTag: 168 | stage: publish 169 | image: alpine:latest 170 | rules: 171 | - if: $CI_COMMIT_BRANCH == "master" && $CI_PIPELINE_SOURCE != "schedule" 172 | when: on_success 173 | before_script: 174 | - apk update 175 | - apk add git 176 | - git config user.email "${GITLAB_USER_EMAIL}" 177 | - git config user.name "${GITLAB_USER_NAME}" 178 | script: 179 | - VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 180 | - echo "**** Tagging release as version $VERSION" 181 | - git remote add tag-origin https://oauth2:${GITLAB_ACCESS_TOKEN}@gitlab.com/${CI_PROJECT_PATH} 182 | - git tag -a "${VERSION}" -m "Released $(date +%Y-%m-%d)" 183 | - git push tag-origin "${VERSION}" 184 | 185 | PublishToPyPI: 186 | stage: publish 187 | image: python:3.11 188 | rules: 189 | - if: "$CI_COMMIT_TAG" 190 | when: on_success 191 | script: 192 | - VERSION=$(egrep 'version = ".*"' pyproject.toml | cut -d \" -f 2) 193 | - test "${CI_COMMIT_TAG}" == "${VERSION}" || exit 1 194 | - echo "**** Upgrading to ${VERSION}" 195 | - pip install -q poetry 196 | - poetry build 197 | - poetry config repositories.testpypi https://test.pypi.org/legacy/ 198 | - poetry publish --username __token__ --password ${TESTPYPI_TOKEN} --repository testpypi 199 | - echo "**** Attempting pip install from test PyPI server" 200 | - apt-get -y -qq update 201 | - apt-get -y -q install libsndfile1 ffmpeg > /dev/null 202 | - pip install -q --index-url https://test.pypi.org/simple --extra-index-url https://pypi.org/simple duplicate_images 203 | - echo "**** Publishing on live PyPI server" 204 | - poetry publish --username __token__ --password ${PYPI_TOKEN} 205 | 206 | PushToGithub: 207 | stage: publish 208 | image: alpine:latest 209 | rules: 210 | - if: "$CI_COMMIT_TAG" 211 | when: on_success 212 | before_script: 213 | - apk update 214 | - apk add openssh-client git sshpass 215 | - eval $(ssh-agent -s) 216 | - echo "$GITHUB_SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add - > /dev/null 217 | - mkdir -p ~/.ssh 218 | - chmod 700 ~/.ssh 219 | - ssh-keyscan github.com >> ~/.ssh/known_hosts 220 | - chmod 644 ~/.ssh/known_hosts 221 | - ssh -T git@github.com 2>&1 || true 222 | - git config user.email "${GITLAB_USER_EMAIL}" 223 | - git config user.name "${GITLAB_USER_NAME}" 224 | script: 225 | - git remote add github git@github.com:lene/DuplicateImages.git 226 | - git remote show github 227 | - BRANCH=${CI_COMMIT_BRANCH:-master} 228 | - git checkout $BRANCH 229 | - git push github $BRANCH 230 | - git push github $CI_COMMIT_TAG 231 | 232 | CreateGithubRelease: 233 | stage: publish 234 | needs: 235 | - PushToGithub 236 | image: alpine:latest 237 | rules: 238 | - if: "$CI_COMMIT_TAG" 239 | when: on_success 240 | before_script: 241 | - apk update 242 | - apk add curl 243 | variables: 244 | RELEASE_API_URL: "https://api.github.com/repos/lene/DuplicateImages/releases" 245 | DESCRIPTION: "Full Changelog: https://github.com/lene/DuplicateImages/blob/${CI_COMMIT_TAG}/CHANGELOG.md" 246 | script: 247 | - POST_DATA='{ 248 | "tag_name":"'${CI_COMMIT_TAG}'", 249 | "target_commitish":"master", 250 | "name":"'${CI_COMMIT_TAG}'", 251 | "body":"'${FULL_DESCRIPTION}${DESCRIPTION}'", 252 | "draft":false, 253 | "prerelease":false, 254 | "generate_release_notes":false 255 | }' 256 | - echo $API_URL 257 | - echo $POST_DATA 258 | - 'curl -L -X POST 259 | -H "Accept: application/vnd.github+json" 260 | -H "X-GitHub-Api-Version: 2022-11-28" 261 | -H "Authorization: Bearer ${GITHUB_API_TOKEN}" 262 | ${RELEASE_API_URL} -d "${POST_DATA}"' 263 | 264 
| CreateGitlabRelease: 265 | stage: publish 266 | image: registry.gitlab.com/gitlab-org/release-cli:latest 267 | rules: 268 | - if: $CI_COMMIT_TAG 269 | script: 270 | - echo "running release_job" # dummy, see https://gitlab.com/gitlab-org/gitlab/-/issues/223856 271 | release: 272 | tag_name: '$CI_COMMIT_TAG' 273 | description: './CHANGELOG.md' -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Specify a score threshold to be exceeded before program exits with error. 9 | fail-under=10.0 10 | 11 | # Add files or directories to the blacklist. They should be base names, not 12 | # paths. 13 | ignore=CVS 14 | 15 | # Add files or directories matching the regex patterns to the blacklist. The 16 | # regex matches against base names, not paths. 17 | ignore-patterns= 18 | 19 | # Python code to execute, usually for sys.path manipulation such as 20 | # pygtk.require(). 21 | #init-hook= 22 | 23 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 24 | # number of processors available to use. 25 | jobs=0 26 | 27 | # Control the amount of potential inferred values when inferring a single 28 | # object. This can help the performance when dealing with large functions or 29 | # complex, nested conditions. 30 | limit-inference-results=100 31 | 32 | # List of plugins (as comma separated values of python module names) to load, 33 | # usually to register additional checkers. 34 | load-plugins= 35 | 36 | # Pickle collected data for later comparisons. 37 | persistent=yes 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 63 | disable=raw-checker-failed, 64 | bad-inline-option, 65 | locally-disabled, 66 | file-ignored, 67 | suppressed-message, 68 | useless-suppression, 69 | deprecated-pragma, 70 | use-symbolic-message-instead, 71 | missing-function-docstring, 72 | unsubscriptable-object, 73 | consider-using-with 74 | 75 | # Enable the message, report, category or checker with the given id(s). 
You can 76 | # either give multiple identifier separated by comma (,) or put this option 77 | # multiple time (only on the command line, not in the configuration file where 78 | # it should appear only once). See also the "--disable" option for examples. 79 | enable=c-extension-no-member 80 | 81 | 82 | [REPORTS] 83 | 84 | # Python expression which should return a score less than or equal to 10. You 85 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 86 | # which contain the number of messages in each category, as well as 'statement' 87 | # which is the total number of statements analyzed. This score is used by the 88 | # global evaluation report (RP0004). 89 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 90 | 91 | # Template used to display messages. This is a python new-style format string 92 | # used to format the message information. See doc for all details. 93 | #msg-template= 94 | 95 | # Set the output format. Available formats are text, parseable, colorized, json 96 | # and msvs (visual studio). You can also give a reporter class, e.g. 97 | # mypackage.mymodule.MyReporterClass. 98 | output-format=text 99 | 100 | # Tells whether to display a full report or only the messages. 101 | reports=no 102 | 103 | # Activate the evaluation score. 104 | score=yes 105 | 106 | 107 | [REFACTORING] 108 | 109 | # Maximum number of nested blocks for function / method body 110 | max-nested-blocks=5 111 | 112 | # Complete name of functions that never returns. When checking for 113 | # inconsistent-return-statements if a never returning function is called then 114 | # it will be considered as an explicit return statement and no message will be 115 | # printed. 116 | never-returning-functions=sys.exit 117 | 118 | 119 | [MISCELLANEOUS] 120 | 121 | # List of note tags to take in consideration, separated by a comma. 122 | notes=FIXME, 123 | XXX, 124 | TODO 125 | 126 | # Regular expression of note tags to take in consideration. 127 | #notes-rgx= 128 | 129 | 130 | [BASIC] 131 | 132 | # Naming style matching correct argument names. 133 | argument-naming-style=snake_case 134 | 135 | # Regular expression matching correct argument names. Overrides argument- 136 | # naming-style. 137 | #argument-rgx= 138 | 139 | # Naming style matching correct attribute names. 140 | attr-naming-style=snake_case 141 | 142 | # Regular expression matching correct attribute names. Overrides attr-naming- 143 | # style. 144 | #attr-rgx= 145 | 146 | # Bad variable names which should always be refused, separated by a comma. 147 | bad-names=foo, 148 | bar, 149 | baz, 150 | toto, 151 | tutu, 152 | tata 153 | 154 | # Bad variable names regexes, separated by a comma. If names match any regex, 155 | # they will always be refused 156 | bad-names-rgxs= 157 | 158 | # Naming style matching correct class attribute names. 159 | class-attribute-naming-style=any 160 | 161 | # Regular expression matching correct class attribute names. Overrides class- 162 | # attribute-naming-style. 163 | #class-attribute-rgx= 164 | 165 | # Naming style matching correct class names. 166 | class-naming-style=PascalCase 167 | 168 | # Regular expression matching correct class names. Overrides class-naming- 169 | # style. 170 | #class-rgx= 171 | 172 | # Naming style matching correct constant names. 173 | const-naming-style=UPPER_CASE 174 | 175 | # Regular expression matching correct constant names. Overrides const-naming- 176 | # style. 
177 | #const-rgx=
178 | 
179 | # Minimum line length for functions/classes that require docstrings, shorter
180 | # ones are exempt.
181 | docstring-min-length=-1
182 | 
183 | # Naming style matching correct function names.
184 | function-naming-style=snake_case
185 | 
186 | # Regular expression matching correct function names. Overrides function-
187 | # naming-style.
188 | #function-rgx=
189 | 
190 | # Good variable names which should always be accepted, separated by a comma.
191 | good-names=i,
192 | j,
193 | k,
194 | x,
195 | y,
196 | ex,
197 | Run,
198 | _
199 | 
200 | # Good variable names regexes, separated by a comma. If names match any regex,
201 | # they will always be accepted
202 | good-names-rgxs=
203 | 
204 | # Include a hint for the correct naming format with invalid-name.
205 | include-naming-hint=no
206 | 
207 | # Naming style matching correct inline iteration names.
208 | inlinevar-naming-style=any
209 | 
210 | # Regular expression matching correct inline iteration names. Overrides
211 | # inlinevar-naming-style.
212 | #inlinevar-rgx=
213 | 
214 | # Naming style matching correct method names.
215 | method-naming-style=snake_case
216 | 
217 | # Regular expression matching correct method names. Overrides method-naming-
218 | # style.
219 | #method-rgx=
220 | 
221 | # Naming style matching correct module names.
222 | module-naming-style=snake_case
223 | 
224 | # Regular expression matching correct module names. Overrides module-naming-
225 | # style.
226 | #module-rgx=
227 | 
228 | # Colon-delimited sets of names that determine each other's naming style when
229 | # the name regexes allow several styles.
230 | name-group=
231 | 
232 | # Regular expression which should only match function or class names that do
233 | # not require a docstring.
234 | no-docstring-rgx=^_
235 | 
236 | # List of decorators that produce properties, such as abc.abstractproperty. Add
237 | # to this list to register other decorators that produce valid properties.
238 | # These decorators are taken in consideration only for invalid-name.
239 | property-classes=abc.abstractproperty
240 | 
241 | # Naming style matching correct variable names.
242 | variable-naming-style=snake_case
243 | 
244 | # Regular expression matching correct variable names. Overrides variable-
245 | # naming-style.
246 | #variable-rgx=
247 | 
248 | 
249 | [STRING]
250 | 
251 | # This flag controls whether inconsistent-quotes generates a warning when the
252 | # character used as a quote delimiter is used inconsistently within a module.
253 | check-quote-consistency=yes
254 | 
255 | # This flag controls whether the implicit-str-concat should generate a warning
256 | # on implicit string concatenation in sequences defined over several lines.
257 | check-str-concat-over-line-jumps=no
258 | 
259 | 
260 | [FORMAT]
261 | 
262 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
263 | expected-line-ending-format=
264 | 
265 | # Regexp for a line that is allowed to be longer than the limit.
266 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
267 | 
268 | # Number of spaces of indent required inside a hanging or continued line.
269 | indent-after-paren=4
270 | 
271 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
272 | # tab).
273 | indent-string='    '
274 | 
275 | # Maximum number of characters on a single line.
276 | max-line-length=100
277 | 
278 | # Maximum number of lines in a module.
279 | max-module-lines=1000 280 | 281 | # Allow the body of a class to be on the same line as the declaration if body 282 | # contains single statement. 283 | single-line-class-stmt=no 284 | 285 | # Allow the body of an if to be on the same line as the test if there is no 286 | # else. 287 | single-line-if-stmt=no 288 | 289 | 290 | [SPELLING] 291 | 292 | # Limits count of emitted suggestions for spelling mistakes. 293 | max-spelling-suggestions=4 294 | 295 | # Spelling dictionary name. Available dictionaries: none. To make it work, 296 | # install the python-enchant package. 297 | spelling-dict= 298 | 299 | # List of comma separated words that should not be checked. 300 | spelling-ignore-words= 301 | 302 | # A path to a file that contains the private dictionary; one word per line. 303 | spelling-private-dict-file= 304 | 305 | # Tells whether to store unknown words to the private dictionary (see the 306 | # --spelling-private-dict-file option) instead of raising a message. 307 | spelling-store-unknown-words=no 308 | 309 | 310 | [TYPECHECK] 311 | 312 | # List of decorators that produce context managers, such as 313 | # contextlib.contextmanager. Add to this list to register other decorators that 314 | # produce valid context managers. 315 | contextmanager-decorators=contextlib.contextmanager 316 | 317 | # List of members which are set dynamically and missed by pylint inference 318 | # system, and so shouldn't trigger E1101 when accessed. Python regular 319 | # expressions are accepted. 320 | generated-members= 321 | 322 | # Tells whether missing members accessed in mixin class should be ignored. A 323 | # mixin class is detected if its name ends with "mixin" (case insensitive). 324 | ignore-mixin-members=yes 325 | 326 | # Tells whether to warn about missing members when the owner of the attribute 327 | # is inferred to be None. 328 | ignore-none=yes 329 | 330 | # This flag controls whether pylint should warn about no-member and similar 331 | # checks whenever an opaque object is returned when inferring. The inference 332 | # can return multiple potential results while evaluating a Python object, but 333 | # some branches might not be evaluated, which results in partial inference. In 334 | # that case, it might be useful to still emit no-member and other checks for 335 | # the rest of the inferred objects. 336 | ignore-on-opaque-inference=yes 337 | 338 | # List of class names for which member attributes should not be checked (useful 339 | # for classes with dynamically set attributes). This supports the use of 340 | # qualified names. 341 | ignored-classes=optparse.Values,thread._local,_thread._local 342 | 343 | # List of module names for which member attributes should not be checked 344 | # (useful for modules/projects where namespaces are manipulated during runtime 345 | # and thus existing member attributes cannot be deduced by static analysis). It 346 | # supports qualified module names, as well as Unix pattern matching. 347 | ignored-modules= 348 | 349 | # Show a hint with possible names when a member name was not found. The aspect 350 | # of finding the hint is based on edit distance. 351 | missing-member-hint=yes 352 | 353 | # The minimum edit distance a name should have in order to be considered a 354 | # similar match for a missing member name. 355 | missing-member-hint-distance=1 356 | 357 | # The total number of similar names that should be taken in consideration when 358 | # showing a hint for a missing member. 
359 | missing-member-max-choices=1 360 | 361 | # List of decorators that change the signature of a decorated function. 362 | signature-mutators= 363 | 364 | 365 | [SIMILARITIES] 366 | 367 | # Ignore comments when computing similarities. 368 | ignore-comments=yes 369 | 370 | # Ignore docstrings when computing similarities. 371 | ignore-docstrings=yes 372 | 373 | # Ignore imports when computing similarities. 374 | ignore-imports=no 375 | 376 | # Minimum lines number of a similarity. 377 | min-similarity-lines=4 378 | 379 | 380 | [LOGGING] 381 | 382 | # The type of string formatting that logging methods do. `old` means using % 383 | # formatting, `new` is for `{}` formatting. 384 | logging-format-style=old 385 | 386 | # Logging modules to check that the string format arguments are in logging 387 | # function parameter format. 388 | logging-modules=logging 389 | 390 | 391 | [VARIABLES] 392 | 393 | # List of additional names supposed to be defined in builtins. Remember that 394 | # you should avoid defining new builtins when possible. 395 | additional-builtins= 396 | 397 | # Tells whether unused global variables should be treated as a violation. 398 | allow-global-unused-variables=yes 399 | 400 | # List of strings which can identify a callback function by name. A callback 401 | # name must start or end with one of those strings. 402 | callbacks=cb_, 403 | _cb 404 | 405 | # A regular expression matching the name of dummy variables (i.e. expected to 406 | # not be used). 407 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 408 | 409 | # Argument names that match this expression will be ignored. Default to name 410 | # with leading underscore. 411 | ignored-argument-names=_.*|^ignored_|^unused_ 412 | 413 | # Tells whether we should check for unused import in __init__ files. 414 | init-import=no 415 | 416 | # List of qualified module names which can have objects that can redefine 417 | # builtins. 418 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 419 | 420 | 421 | [DESIGN] 422 | 423 | # Maximum number of arguments for function / method. 424 | max-args=5 425 | 426 | # Maximum number of attributes for a class (see R0902). 427 | max-attributes=7 428 | 429 | # Maximum number of boolean expressions in an if statement (see R0916). 430 | max-bool-expr=5 431 | 432 | # Maximum number of branch for function / method body. 433 | max-branches=12 434 | 435 | # Maximum number of locals for function / method body. 436 | max-locals=15 437 | 438 | # Maximum number of parents for a class (see R0901). 439 | max-parents=7 440 | 441 | # Maximum number of public methods for a class (see R0904). 442 | max-public-methods=20 443 | 444 | # Maximum number of return / yield for function / method body. 445 | max-returns=6 446 | 447 | # Maximum number of statements in function / method body. 448 | max-statements=50 449 | 450 | # Minimum number of public methods for a class (see R0903). 451 | min-public-methods=2 452 | 453 | 454 | [IMPORTS] 455 | 456 | # List of modules that can be imported at any level, not just the top level 457 | # one. 458 | allow-any-import-level= 459 | 460 | # Allow wildcard imports from modules that define __all__. 461 | allow-wildcard-with-all=no 462 | 463 | # Analyse import fallback blocks. This can be used to support both Python 2 and 464 | # 3 compatible code, which means that the block might have code that exists 465 | # only in one or another interpreter, leading to false positives when analysed. 
466 | analyse-fallback-blocks=no 467 | 468 | # Deprecated modules which should not be used, separated by a comma. 469 | deprecated-modules=optparse,tkinter.tix 470 | 471 | # Create a graph of external dependencies in the given file (report RP0402 must 472 | # not be disabled). 473 | ext-import-graph= 474 | 475 | # Create a graph of every (i.e. internal and external) dependencies in the 476 | # given file (report RP0402 must not be disabled). 477 | import-graph= 478 | 479 | # Create a graph of internal dependencies in the given file (report RP0402 must 480 | # not be disabled). 481 | int-import-graph= 482 | 483 | # Force import order to recognize a module as part of the standard 484 | # compatibility libraries. 485 | known-standard-library= 486 | 487 | # Force import order to recognize a module as part of a third party library. 488 | known-third-party=enchant 489 | 490 | # Couples of modules and preferred modules, separated by a comma. 491 | preferred-modules= 492 | 493 | 494 | [CLASSES] 495 | 496 | # List of method names used to declare (i.e. assign) instance attributes. 497 | defining-attr-methods=__init__, 498 | __new__, 499 | setUp, 500 | __post_init__ 501 | 502 | # List of member names, which should be excluded from the protected access 503 | # warning. 504 | exclude-protected=_asdict, 505 | _fields, 506 | _replace, 507 | _source, 508 | _make 509 | 510 | # List of valid names for the first argument in a class method. 511 | valid-classmethod-first-arg=cls 512 | 513 | # List of valid names for the first argument in a metaclass class method. 514 | valid-metaclass-classmethod-first-arg=cls 515 | 516 | 517 | [EXCEPTIONS] 518 | 519 | # Exceptions that will emit a warning when being caught. Defaults to 520 | # "BaseException, Exception". 521 | overgeneral-exceptions=builtins.BaseException, 522 | builtins.Exception 523 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.11.9] - 2025-03-03 4 | 5 | ### Added 6 | - parse config file with `-c|--config-file` to set defaults for CLI options 7 | 8 | ## [0.11.8] - 2025-02-25 9 | 10 | ### Added 11 | - add `symlink-bigger` action to replace bigger files of a group with a symlink to the smallest one 12 | 13 | ## [0.11.7] - 2025-02-25 14 | 15 | ### Added 16 | - add `move-first`, `move-second`, `move-biggest` and `move-smallest` actions as options for 17 | `--on-equal` and their shortcuts `m1`, `m2`, `m>` and `m<` along with the `--move-to` and 18 | `--move-recreate-path` options to move files to a different directory 19 | 20 | ## [0.11.6] - 2025-02-24 21 | 22 | ### Updated 23 | - Print warning when specifying `--exec` without `--on-equal exec` 24 | 25 | ## [0.11.5] - 2025-02-21 26 | 27 | ### Added 28 | - crop-resistant hash algorithm with `--algorithm=crop_resistant` 29 | 30 | ### Updated 31 | - Updated dependencies to fix security vulnerabilities 32 | 33 | ## [0.11.4] - 2024-12-16 34 | 35 | ### Updated 36 | - Check for illegal parameter combination `--group` and `--max-distance` 37 | - Explicit support for Python 3.13 by testing it in CI 38 | - Updated dependencies to fix security vulnerabilities 39 | 40 | ## [0.11.3] - 2024-09-11 41 | 42 | ### Updated 43 | - Updated dependencies to fix security vulnerabilities 44 | - Speed up pylint 45 | 46 | ## [0.11.2] - 2024-05-27 47 | 48 | ### Updated 49 | - Updated dependencies to fix security vulnerabilities 50 | 51 | ## [0.11.1] - 2024-03-14 52 | 53 | ### 
Fixed
54 | - https://github.com/lene/DuplicateImages/issues/11: Guarded against error when using `pillow_heif`
55 | module on Mac OS X 12
56 | 
57 | ## [0.11.0] - 2024-01-25
58 | 
59 | ### Added
60 | - Pydoc for modules and classes
61 | 
62 | ## [0.10.9] - 2024-01-25
63 | 
64 | ### Fixed
65 | - Cache file is only written to disk if it is changed
66 | 
67 | ## [0.10.8] - 2024-01-17
68 | 
69 | ### Added
70 | - optional argument to specify the number of threads with `--parallel`
71 | - `--parallel-actions` option to run actions in parallel
72 | - performance optimization when reading the files to compare
73 | 
74 | ## [0.10.7] - 2024-01-13
75 | 
76 | ### Added
77 | - Check that `hash_size` is a power of 2 for `whash` algorithm
78 | 
79 | ## [0.10.6] - 2024-01-12
80 | 
81 | ### Fixed
82 | - Python 3.12 compatibility
83 | - bugfix: guard against OS failures when determining file type
84 | - small memory optimization
85 | 
86 | ## [0.10.5] - 2024-01-12
87 | 
88 | ### Added
89 | - `--exclude-dir` option to exclude directories from scanning
90 | - `--max-image-pixels` option to allow for huge images to bypass `PIL`'s `DecompressionBombError`
91 | 
92 | ## [0.10.4] - 2024-01-11
93 | 
94 | ### Fixed
95 | - Upgrade dependencies to fix security vulnerabilities
96 | 
97 | ## [0.10.3] - 2023-10-05
98 | - Changes to CI only
99 | 
100 | ## [0.10.2] - 2023-10-05
101 | 
102 | ### Fixed
103 | - Upgrade Pillow dependency to 10.0.1 to fix libWebP security vulnerability
104 | - Upgrade GitPython dependency to 3.1.37 to fix security vulnerability
105 | 
106 | ## [0.10.1] - 2023-09-04
107 | 
108 | ### Added
109 | - Upgrade Python dependency to 3.9 to fix security warning about old SciPy version
110 | - create GitLab release automatically for each new tag
111 | 
112 | ### Fixed
113 | - create GitHub release from the correct state
114 | 
115 | ## [0.10.0] - 2023-09-03
116 | 
117 | ### Added
118 | - Store hashing algorithm and parameters in hash-db file to ensure that the same algorithm is used
119 | across separate runs with the same hash-db file
120 | 
121 | ### Changed
122 | - Breaking change in the hash-db file format - files from previous versions are not compatible
123 | 
124 | ## [0.9.2] - 2023-08-26
125 | 
126 | ### Added
127 | - `symlink-smaller` action to replace the smaller files of a group with a symlink to the biggest one
128 | 
129 | ### Changed
130 | - `delete-smaller` and `delete-bigger` actions to `delete-smallest` and `delete-biggest`
131 | 
132 | ## [0.9.1] - 2023-08-23
133 | 
134 | ### Added
135 | - add documentation for new `--group` option
136 | 
137 | ## [0.9.0] - 2023-08-23
138 | 
139 | ### Added
140 | - CLI option `--group`: instead of pairs, treat similar images as groups of arbitrary size
141 | - refactor `ImagePairFinder` to deal more easily with combinations of options
142 | - test coverage for all supported combinations of `--group`/`--parallel`
143 | 
144 | ## [0.8.9] - 2023-08-23
145 | 
146 | ### Added
147 | - create GitHub release automatically for each new tag
148 | - updated and completed developer documentation
149 | 
150 | ## [0.8.8] - 2023-08-23
151 | 
152 | ### Added
153 | - more info in log about runtime and warn about bad decisions
154 | 
155 | ## [0.8.7] - 2023-08-22
156 | 
157 | ### Added
158 | - run bandit SAST scanner in CI and on every push
159 | - fixed some security warnings, intentionally ignored others
160 | - run GitHub dependency scan in GitHub CI on every merge to master and weekly
161 | 
162 | ## [0.8.6] - 2023-08-22
163 | 
164 | ### Added
165 | - Changelog
166 | 
167 | ## [0.8.5] - 2023-08-21
168 | 
169 | 
### Added 170 | - log execution times for scanning and comparing 171 | - code reorganization 172 | 173 | ### Changed 174 | - renamed `--serial` option to `--slow` 175 | 176 | ## [0.8.4] - 2023-08-21 177 | 178 | ### Fixed 179 | - removed an absolute path in test suite 180 | 181 | ## [0.8.3] - 2023-08-21 182 | 183 | ### Added 184 | - updated dependencies to newest versions 185 | - upped Development Status in metadata to Beta 186 | 187 | ### Removed 188 | - support for Python 3.7 189 | 190 | ## [0.8.2] - 2023-08-21 191 | 192 | ### Added 193 | - JSON file format for the image hash persistent store 194 | 195 | ## [0.8.1] - 2023-08-15 196 | 197 | ### Added 198 | - test WEBP and HEIC image formats 199 | 200 | ## [0.8.0] - 2023-08-11 201 | 202 | ### Added 203 | - change algorithm to run in O(N) instead of O(N^2) by using the image hashes as dict keys 204 | - old algorithm still runs if using `--max-distance` switch 205 | - add `--serial` CLI switch to explicitly select old algorithm 206 | - test run script in CI with most relevant CLI parameter combinations 207 | 208 | ### Removed 209 | - `pre-commit` since it causes more trouble than it's worth 210 | 211 | ## [0.7.4] - 2023-08-10 212 | 213 | ### Added 214 | - experiment with `pre-commit` to run commit hooks in a more standardized way 215 | 216 | ## [0.7.3] - 2023-08-10 217 | 218 | ### Added 219 | - more pedantic linting and tests on all supported Python versions in CI 220 | - add MIT license file 221 | 222 | ## [0.7.1] - 2023-02-03 223 | 224 | ### Added 225 | - contributed by [@mreiche](https://github.com/mreiche): support for running any command passed by 226 | `--on-equal` 227 | - contributed by [@mreiche](https://github.com/mreiche): faster MIME detection 228 | - contributed by [@mreiche](https://github.com/mreiche): `print_inline` and `quote_inline` actions 229 | 230 | ## [0.6.5] - 2023-01-02 231 | 232 | ### Added 233 | - contributed by [@beijingjazzpanda](https://gitlab.com/beijingjazzpanda): ensure hash-db `.bak` 234 | files are created properly 235 | - run Codacy and CodeQL security and dependency scans in CI on GitHub 236 | 237 | ## [0.6.4] - 2022-09-23 238 | 239 | ### Added 240 | - `--hash-size` option to fine tune which images are considered equal 241 | - support new `dhash_vertical` and `phash_simple` image hashing methods 242 | - push to GitHub repository from CI when MR is merged 243 | 244 | ## [0.6.2] - 2022-09-04 245 | 246 | ### Added 247 | - code style: enforce single quotes as default 248 | 249 | ## [0.6.1] - 2022-09-02 250 | 251 | ### Added 252 | - `--max-distance` option to fine tune which images are considered equal 253 | 254 | ## [0.6.0] - 2022-07-22 255 | 256 | ### Added 257 | - support HEIC images 258 | - fix dependabot alerts for insecure dependencies 259 | 260 | ## [0.5.3] - 2021-03-16 261 | 262 | ### Added 263 | - add `--quiet` flag to decrease log level 264 | 265 | ## [0.5.2] - 2021-03-16 266 | 267 | ### Added 268 | - add `d1` and `d2` action shortcuts 269 | 270 | ## [0.5.1] - 2021-03-15 271 | 272 | ### Added 273 | - update documentation for new `--hash-db` CLI parameter 274 | 275 | ## [0.5.0] - 2021-03-15 276 | 277 | ### Added 278 | - store the image hashes in a pickle file between runs for a major speedup 279 | - run tests in parallel 280 | 281 | ## [0.4.1] - 2021-01-17 282 | 283 | ### Added 284 | - display a progress bar while calculating 285 | 286 | ## [0.4.0] - 2021-01-16 287 | 288 | ### Added 289 | - automatically publish to PyPI from CI when MR is merged 290 | - reorganize code 291 | 292 | ## [0.3.6] - 2021-01-16 
293 | 294 | ### Added 295 | - update homepage and description in project metadata 296 | 297 | ## [0.3.5] - 2021-01-16 298 | 299 | ### Added 300 | - change master repository to https://github.com/lene/DuplicateImages.git 301 | 302 | ## [0.3.4] - 2021-01-16 303 | 304 | ### Added 305 | - improve log formatting 306 | - add option to print matching files with quotes, as well as `d>` and `d<` shortcuts 307 | 308 | ## [0.3.2] - 2021-01-16 309 | 310 | ### Added 311 | - use `coloredlogs` and improve log formatting 312 | 313 | ## [0.3.1] - 2021-01-16 314 | 315 | ### Added 316 | - handle error for broken image files 317 | - use `logging` instead of `print()` for output 318 | 319 | ## [0.3.0] - 2021-01-16 320 | 321 | ### Added 322 | - actions to delete bigger/smaller image and view with `eog` 323 | - fuzziness parameter to adjust desired similarity 324 | 325 | ## [0.2.1] - 2021-01-15 326 | 327 | ### Added 328 | - documentation for parallel execution 329 | 330 | ## [0.2.0] - 2021-01-15 331 | 332 | ### Added 333 | - additionally use [ImageHash](https://pypi.org/project/ImageHash) to compare images 334 | - run `pylint` against code 335 | 336 | ## 0.1 - 2021-01-08 337 | 338 | ### Added 339 | - exact and histogram comparison 340 | - actions if equal: delete one of the pics, view with `xv` or print 341 | 342 | 343 | [0.11.9]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.8...0.11.9 344 | [0.11.8]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.7...0.11.8 345 | [0.11.7]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.6...0.11.7 346 | [0.11.6]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.5...0.11.6 347 | [0.11.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.4...0.11.5 348 | [0.11.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.3...0.11.4 349 | [0.11.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.2...0.11.3 350 | [0.11.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.1...0.11.2 351 | [0.11.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.11.0...0.11.1 352 | [0.11.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.9...0.11.0 353 | [0.10.9]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.8...0.10.9 354 | [0.10.8]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.7...0.10.8 355 | [0.10.7]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.6...0.10.7 356 | [0.10.6]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.5...0.10.6 357 | [0.10.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.4...0.10.5 358 | [0.10.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.3...0.10.4 359 | [0.10.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.2...0.10.3 360 | [0.10.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.1...0.10.2 361 | [0.10.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.10.0...0.10.1 362 | [0.10.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.9.2...0.10.0 363 | [0.9.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.9.1...0.9.2 364 | [0.9.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.9.0...0.9.1 365 | [0.9.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.9...0.9.0 366 | [0.8.9]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.8...0.8.9 367 | [0.8.8]: 
https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.7...0.8.8
368 | [0.8.7]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.6...0.8.7
369 | [0.8.6]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.5...0.8.6
370 | [0.8.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.4...0.8.5
371 | [0.8.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.3...0.8.4
372 | [0.8.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.2...0.8.3
373 | [0.8.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.1...0.8.2
374 | [0.8.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.8.0...0.8.1
375 | [0.8.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.7.4...0.8.0
376 | [0.7.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.7.3...0.7.4
377 | [0.7.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.7.1...0.7.3
378 | [0.7.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.5...0.7.1
379 | [0.6.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.4...0.6.5
380 | [0.6.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.2...0.6.4
381 | [0.6.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.1...0.6.2
382 | [0.6.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.6.0...0.6.1
383 | [0.6.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.5.3...0.6.0
384 | [0.5.3]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.5.2...0.5.3
385 | [0.5.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.5.1...0.5.2
386 | [0.5.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.5.0...0.5.1
387 | [0.5.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.4.1...0.5.0
388 | [0.4.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.4.0...0.4.1
389 | [0.4.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.6...0.4.0
390 | [0.3.6]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.5...0.3.6
391 | [0.3.5]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.4...0.3.5
392 | [0.3.4]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.2...0.3.4
393 | [0.3.2]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.1...0.3.2
394 | [0.3.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.3.0...0.3.1
395 | [0.3.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.2.1...0.3.0
396 | [0.2.1]: https://gitlab.com/duplicateimages/DuplicateImages/-/compare/0.2.0...0.2.1
397 | [0.2.0]: https://gitlab.com/duplicateimages/DuplicateImages/-/tags/0.2.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright © 2023
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
6 | associated documentation files (the “Software”), to deal in the Software without restriction,
7 | including without limitation the rights to use, copy, modify, merge, publish, distribute,
8 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
9 | furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in all copies or substantial
substantial 12 | portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 15 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 16 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 17 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Finding Duplicate Images 2 | 3 | Finds equal or similar images in a directory containing (many) image files. 4 | 5 | Official home page: https://github.com/lene/DuplicateImages 6 | 7 | Development page: https://gitlab.com/lilacashes/DuplicateImages 8 | 9 | PyPI page: https://pypi.org/project/duplicate-images 10 | 11 | ## Usage 12 | 13 | Installing: 14 | ```shell 15 | $ pip install duplicate_images 16 | ``` 17 | 18 | Printing the help screen: 19 | ```shell 20 | $ find-dups -h 21 | ``` 22 | 23 | Quick test run: 24 | ```shell 25 | $ find-dups $IMAGE_ROOT 26 | ``` 27 | 28 | Typical usage: 29 | ```shell 30 | $ find-dups $IMAGE_ROOT --parallel --progress --hash-db hashes.json 31 | ``` 32 | 33 | ### Supported image formats 34 | 35 | * JPEG and PNG (tested quite thoroughly) 36 | * HEIC (experimental support, tested cursorily only) 37 | * All other 38 | [formats supported](https://pillow.readthedocs.io/en/latest/handbook/image-file-formats.html) by 39 | the `pillow` Python Imaging Library should work, but are not specifically tested. 40 | 41 | #### Explicitly allow huge images 42 | 43 | The `PIL` image library, which is used as the backend, limits the size of images to 178956970 pixels by 44 | default, to guard against memory exhaustion. For larger images, specify the maximum image size in 45 | pixels with the `--max-image-pixels` option. 46 | 47 | ### Image comparison algorithms 48 | 49 | Use the `--algorithm` option to select how equal images are found. The default algorithm is `phash`. 50 | 51 | `ahash`, `colorhash`, `dhash`, `dhash_vertical`, `phash`, `phash_simple`, `whash`, `crop_resistant`: 52 | eight different image hashing algorithms. See https://pypi.org/project/ImageHash for an introduction 53 | to image hashing and https://tech.okcupid.com/evaluating-perceptual-image-hashes-at-okcupid-e98a3e74aa3a 54 | for some gory details on which image hashing algorithm performs best in which situation. For a start I 55 | recommend using `phash`, and only evaluating the other algorithms if `phash` does not perform 56 | satisfactorily in your use case. 57 | 58 | ### Image similarity threshold configuration 59 | 60 | Use the `--hash-size` parameter to tune the precision of the hashing algorithms. For the `colorhash` 61 | algorithm the hash size is interpreted as the number of bin bits and defaults to 3. For all other 62 | algorithms the hash size defaults to 8. For `whash` it must be a power of 2. 63 | 64 | Use the `--max-distance` parameter to tune how close images should be to be considered duplicates. 65 | The argument is a positive integer. Its value is highly dependent on the algorithm used and the 66 | nature of the images compared, so the best value for your use case can only be found through 67 | experimentation.
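To get a feel for what these two parameters control, here is a minimal sketch that computes a hash distance directly with the `imagehash` library which `find-dups` builds on. The file names and the distance threshold of 4 are placeholders, not recommendations:

```python
# Minimal sketch: how --hash-size and --max-distance relate, using the
# imagehash library directly. '1.jpg', '2.jpg' and the threshold 4 are
# placeholders, not recommendations.
from PIL import Image
import imagehash

hash1 = imagehash.phash(Image.open('1.jpg'), hash_size=8)  # --hash-size 8
hash2 = imagehash.phash(Image.open('2.jpg'), hash_size=8)
distance = hash1 - hash2  # Hamming distance between the two hashes
print(distance <= 4)      # True if considered duplicates at --max-distance 4
```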
68 | 69 | **NOTE:** using the `--max-distance` parameter slows down the comparison considerably with large 70 | image collections, making the runtime complexity go from O(N) to O(N²). If you want to 71 | scan collections with at least thousands of images, it is highly recommended to tune the desired 72 | similarity threshold with the `--hash-size` parameter alone, if that is at all possible. 73 | 74 | 75 | **NOTE:** the `--max-distance` parameter conflicts with the `--group` parameter. You can only use 76 | one at a time. 77 | 78 | ### Pre-storing and using image hashes to speed up computation 79 | 80 | Use the `--hash-db ${FILE}.json` or `--hash-db ${FILE}.pickle` option to store image hashes in the 81 | file `$FILE` in JSON or Pickle format and read image hashes from that file if they are already 82 | present there. This avoids having to compute the image hashes anew at every run and can 83 | significantly speed up run times. 84 | 85 | ### Handling matching images either as pairs or as groups 86 | 87 | By default, matching images are presented as pairs. With the `--group` CLI option, they are handled 88 | as a group containing all matching images. 89 | 90 | Example: `1.jpg`, `2.jpg` and `3.jpg` in the current folder `.` are equal. 91 | 92 | ```shell 93 | $ find-dups . 94 | 1.jpg 2.jpg 95 | 1.jpg 3.jpg 96 | 2.jpg 3.jpg 97 | $ find-dups . --group 98 | 1.jpg 2.jpg 3.jpg 99 | ``` 100 | 101 | ### Actions for matching image groups 102 | 103 | Use the `--on-equal` option to select what to do with pairs of equal images. The default action is 104 | `print`. 105 | - `delete-first` or `d1`: deletes the first of the files in the group 106 | - `delete-last` or `dl`: deletes the last of the files in the group 107 | - `delete-biggest` or `d>`: deletes the file with the biggest size 108 | - `delete-smallest` or `d<`: deletes the file with the smallest size 109 | - `move-first` or `m1`: moves the first of the files in the group to the folder specified with the 110 | `--move-to` option 111 | - `move-last` or `ml`: moves the last of the files in the group to the folder specified with the 112 | `--move-to` option 113 | - `move-biggest` or `m>`: moves the file with the biggest size to the folder specified with the 114 | `--move-to` option 115 | - `move-smallest` or `m<`: moves the file with the smallest size to the folder specified with the 116 | `--move-to` option 117 | - `symlink-smaller`: deletes the smaller files and replaces them with symlinks to the biggest file 118 | - `symlink-bigger`: deletes the bigger files and replaces them with symlinks to the smallest file 119 | - `eog`: launches the `eog` image viewer to compare the files in the group (*deprecated* by `exec`) 120 | - `xv`: launches the `xv` image viewer to compare the files in the group (*deprecated* by `exec`) 121 | - `print`: prints the files in the group 122 | - `print_inline`: like `print` but without newline 123 | - `quote`: prints the files in the group quoted for POSIX shells 124 | - `quote_inline`: like `quote` but without newline 125 | - `exec`: executes a command (see `--exec` argument below) 126 | - `none`: does nothing; may be useful for benchmarking and testing 127 | 128 | The `move-*` actions require the `--move-to` option to specify the target folder. Additionally, the 129 | `--move-recreate-path` option can be set to reproduce the directory structure of the source files in 130 | the target folder. 131 | 132 | The `--exec` argument allows calling another program when the `--on-equal exec` option is given. 133 | You can pass a command line string like `--exec "program {1} {2}"` where `{1}` and `{2}` are 134 | replaced by the matching pair files (or the first two files in a group), quoted so the shell recognizes 135 | the files properly. The wildcard `{*}` expands to all files in a matching group, which when called 136 | with the `--group` argument may be more than two images considered equal.
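For reference, a simplified sketch of this substitution, mirroring `shell_exec()` in `duplicate_images/methods.py`:

```python
# Simplified sketch of the {1}, {2} and {*} substitution performed for
# --on-equal exec, mirroring shell_exec() in duplicate_images/methods.py.
from shlex import quote
from typing import Tuple

def substitute(cmd: str, group: Tuple[str, ...]) -> str:
    for num, path in enumerate(group):
        cmd = cmd.replace(f'{{{num + 1}}}', quote(path))  # {1}, {2}, ...
    return cmd.replace('{*}', ' '.join(quote(path) for path in group))

print(substitute('program {1} {2}', ('a 1.jpg', 'b.jpg')))
# program 'a 1.jpg' b.jpg
```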
137 | 138 | #### Examples: 139 | * `--exec "open -a Preview -W {1} {2}"`: Opens the files in the macOS Preview app and waits for it. 140 | * `--exec "ls -s {*}"`: Prints the size (in blocks) next to all files. 141 | * `--exec 'for i in {*}; do dirname $i; basename $i; done'`: Shows the directory and the filename 142 | separately for all files. 143 | 144 | ### Parallel execution 145 | 146 | Use the `--parallel` option to utilize all free cores on your system for calculating image hashes. 147 | Optionally, you can specify the number of processes to use with `--parallel $N`. 148 | 149 | To execute the `--on-equal` actions in parallel, use the `--parallel-actions` option, which can also 150 | take an optional number of processes to use as argument. 151 | 152 | ### Excluding subfolders 153 | 154 | Use the `--exclude-dir` option to exclude subfolders of `$IMAGE_ROOT` from the search. The argument 155 | is a regular expression matching the subfolder names to be excluded. Multiple arguments can be 156 | passed to `--exclude-dir` to exclude multiple subfolders. 157 | 158 | The argument(s) given to `--exclude-dir` may be regular expressions. These regular expressions are 159 | matched only against the directory name, not the file name. 160 | 161 | #### Examples 162 | 163 | Exclude subfolder `$IMAGE_ROOT/foo`: 164 | ```shell 165 | $ find-dups $IMAGE_ROOT --exclude-dir $IMAGE_ROOT/foo 166 | ``` 167 | Exclude all subfolders named `foo` or `bar`: 168 | ```shell 169 | $ find-dups $IMAGE_ROOT --exclude-dir foo bar 170 | ``` 171 | 172 | ### Slow execution 173 | 174 | `find-dups` can also use an alternative algorithm which exhaustively compares all images to each 175 | other, being O(N²) in the number of images. This algorithm is selected automatically if 176 | `--max-distance` is not 0. 177 | 178 | You can use the `--slow` option to select this alternative algorithm explicitly. The `--slow` switch 179 | is mutually exclusive with the `--group` switch. 180 | 181 | ### Progress bar and verbosity control 182 | 183 | - `--progress` prints a progress bar each for the process of reading the images and the process of 184 | finding duplicates among the scanned images 185 | - `--debug` prints debugging output 186 | - `--quiet` decreases the log level by 1 for each time it is called; `--debug` and `--quiet` cancel 187 | each other out (see the sketch after this list)
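The level arithmetic behind those two flags is a one-liner; this sketch mirrors `setup_logging()` in `duplicate_images/log.py`:

```python
# How --debug and --quiet combine into a log level, mirroring
# setup_logging() in duplicate_images/log.py.
import logging

def effective_log_level(debug: bool, quiet: int) -> int:
    level = logging.DEBUG if debug else logging.INFO
    return level + quiet * (logging.INFO - logging.DEBUG)  # one step up per --quiet

assert effective_log_level(debug=False, quiet=1) == logging.WARNING
assert effective_log_level(debug=True, quiet=1) == logging.INFO  # they cancel out
```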
188 | 189 | ## Development notes 190 | 191 | Needs Python 3, the Pillow imaging library and the `pillow-heif` HEIF plugin to run, plus Wand for 192 | the test suite. 193 | 194 | Uses Poetry for dependency management. 195 | 196 | ### Installation 197 | 198 | From source: 199 | ```shell 200 | $ git clone https://gitlab.com/lilacashes/DuplicateImages.git 201 | $ cd DuplicateImages 202 | $ pip3 install poetry 203 | $ poetry install 204 | ``` 205 | 206 | ### Running 207 | 208 | ```shell 209 | $ poetry run find-dups $PICTURE_DIR 210 | ``` 211 | or 212 | ```shell 213 | $ poetry run find-dups -h 214 | ``` 215 | for a list of all possible options. 216 | 217 | ### Test suite 218 | 219 | Running it all: 220 | ```shell 221 | $ poetry run pytest 222 | $ poetry run mypy duplicate_images tests 223 | $ poetry run flake8 224 | $ poetry run pylint duplicate_images tests 225 | $ poetry run bandit -r duplicate_images 226 | ``` 227 | or simply 228 | ```shell 229 | $ .git_hooks/pre-push 230 | ``` 231 | Setting the test suite to be run before every push: 232 | ```shell 233 | $ cd .git/hooks 234 | $ ln -s ../../.git_hooks/pre-push . 235 | ``` 236 | 237 | ### Publishing 238 | 239 | A tag is created and the new version is published automatically by GitLab CI on every successful 240 | merge to `master`. 241 | 242 | #### Prerequisites 243 | 244 | For every Merge Request to `master` it is checked that: 245 | - the `version` number in `pyproject.toml` is not an already existing git tag 246 | - the `CHANGELOG.md` contains an entry for the current version number 247 | 248 | #### PyPI 249 | 250 | There is a job in GitLab CI for publishing to `pypi.org` that runs as soon as a new tag is added, 251 | which happens automatically whenever an MR is merged. The tag is the same as the `version` in the 252 | `pyproject.toml` file. For every MR it needs to be ensured that the `version` is not the same as an 253 | already existing tag. 254 | 255 | To publish the package on PyPI manually: 256 | ```shell 257 | $ poetry config repositories.testpypi https://test.pypi.org/legacy/ 258 | $ poetry build 259 | $ poetry publish --username $PYPI_USER --password $PYPI_PASSWORD --repository testpypi && \ 260 | poetry publish --username $PYPI_USER --password $PYPI_PASSWORD 261 | ``` 262 | (obviously assuming here that username and password are the same on PyPI and TestPyPI) 263 | 264 | #### Updating GitHub mirror 265 | 266 | The GitHub repo `git@github.com:lene/DuplicateImages.git` is set up as a push mirror in GitLab CI, 267 | but mirroring is flaky at the time of writing and may or may not succeed. The CI job `PushToGithub` 268 | should take care of mirroring to GitHub after every merge to `master`. 269 | 270 | To push to the GitHub repository manually (assuming the GitHub repo is set up as remote `github`): 271 | ```shell 272 | $ git checkout master 273 | $ git fetch 274 | $ git pull --rebase 275 | $ git tag # to check that the latest tag is present 276 | $ git push --tags github master 277 | ``` 278 | 279 | #### Creating Releases on GitHub 280 | 281 | The CI job `CreateGithubRelease` creates a Release on GitHub, which can then be found under 282 | https://github.com/lene/DuplicateImages/releases.
283 | 284 | ### Profiling 285 | 286 | #### CPU time 287 | To show the top functions by time spent, including called functions: 288 | ```shell 289 | $ poetry run python -m cProfile -s cumtime ./duplicate_images/duplicate.py \ 290 | --algorithm $ALGORITHM --on-equal none $IMAGE_DIR 2>&1 | head -n 15 291 | ``` 292 | or, to show the top functions by time spent in the function alone: 293 | ```shell 294 | $ poetry run python -m cProfile -s tottime ./duplicate_images/duplicate.py \ 295 | --algorithm $ALGORITHM --on-equal none $IMAGE_DIR 2>&1 | head -n 15 296 | ``` 297 | 298 | #### Memory usage 299 | ```shell 300 | $ poetry run fil-profile run ./duplicate_images/duplicate.py \ 301 | --algorithm $ALGORITHM --on-equal none $IMAGE_DIR 2>&1 302 | ``` 303 | This will open a browser window showing the functions using the most memory (see 304 | https://pypi.org/project/filprofiler for more details). 305 | 306 | ## Contributors 307 | 308 | - Lene Preuss (https://github.com/lene): primary developer 309 | - Mike Reiche (https://github.com/mreiche): support for arbitrary actions, speedups 310 | - https://github.com/beijingjazzpanda: bug fix 311 | -------------------------------------------------------------------------------- /duplicate_images/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Given a big set of images, find duplicate and similar images 3 | """ 4 | -------------------------------------------------------------------------------- /duplicate_images/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions used in multiple places 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | from functools import wraps 8 | from pathlib import Path 9 | from time import time 10 | 11 | 12 | def path_with_parent(path: Path) -> str: 13 | return '/'.join(str(path).rstrip('/').split('/')[-2:]) 14 | 15 | 16 | def log_execution_time(): 17 | def actual_decorator(method): 18 | @wraps(method) 19 | def allow_fail(self, *args, **kwargs): 20 | start_time = time() 21 | return_value = method(self, *args, **kwargs) 22 | logging.info( 23 | '%s.%s() run in %.2fs', 24 | type(self).__name__, method.__name__, 25 | time() - start_time 26 | ) 27 | return return_value 28 | 29 | return allow_fail 30 | 31 | return actual_decorator 32 | -------------------------------------------------------------------------------- /duplicate_images/duplicate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | The main script for the `find-dups` command line tool.
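Typical invocation: find-dups $IMAGE_ROOT --parallel --progress --hash-db hashes.json (see README.md for the full set of options).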
4 | """ 5 | 6 | import logging 7 | import re 8 | from argparse import Namespace 9 | from multiprocessing.pool import ThreadPool 10 | from os import walk, access, R_OK 11 | from pathlib import Path 12 | from typing import Callable, List, Optional 13 | 14 | import PIL.Image 15 | from filetype import guess 16 | from pillow_heif import register_heif_opener 17 | 18 | from duplicate_images.common import path_with_parent, log_execution_time 19 | from duplicate_images.function_types import Results, ImageGroup, ActionFunction 20 | from duplicate_images.hash_store import FileHashStore 21 | from duplicate_images.image_pair_finder import ImagePairFinder, PairFinderOptions 22 | from duplicate_images.log import setup_logging 23 | from duplicate_images.methods import ACTIONS_ON_EQUALITY, IMAGE_HASH_ALGORITHM, get_hash_size_kwargs 24 | from duplicate_images.parse_commandline import parse_command_line 25 | 26 | try: 27 | register_heif_opener() 28 | except ImportError as error: 29 | logging.warning('HEIF support not available: %s', error) 30 | logging.warning('See https://github.com/lene/DuplicateImages/issues/11 for details') 31 | 32 | 33 | def is_image_file(filename: Path) -> bool: 34 | """Returns True if filename is a readable image file""" 35 | try: 36 | if access(filename, R_OK) and not filename.is_symlink(): 37 | kind = guess(filename) 38 | return kind is not None and kind.mime.startswith('image/') 39 | except OSError as err: 40 | logging.warning('Skipping %s: %s', path_with_parent(filename), err) 41 | return False 42 | 43 | 44 | def folder_matches(filename: Path, regex: re.Pattern) -> bool: 45 | return bool(re.search(regex, str(filename.parent))) 46 | 47 | 48 | @log_execution_time() 49 | def files_in_dirs( 50 | dir_names: List[Path], is_relevant: Callable[[Path], bool] = lambda f: f.is_file(), 51 | exclude_regexes: Optional[List[str]] = None 52 | ) -> List[Path]: 53 | """ 54 | Returns a list of all files in the directories dir_names (recursively scanning subdirectories), 55 | which satisfy the condition is_relevant. If exclude_regexes is given, files in directories 56 | matching any of the regular expressions are excluded.
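Illustrative example: files_in_dirs([Path('/pics')], is_image_file, [r'\.git']) lists all readable image files under /pics whose parent directory path does not match the regex '\.git'.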
57 | """ 58 | exclude_compiled = [re.compile(regex) for regex in exclude_regexes or []] 59 | unfiltered = ( 60 | Path(root) / filename 61 | for dir_name in dir_names 62 | for root, _, filenames in walk(dir_name) 63 | for filename in filenames 64 | if not any(folder_matches(Path(root) / filename, regex) for regex in exclude_compiled) 65 | ) 66 | # astonishingly, filtering in a separate step is faster than in the generator expression 67 | return [file for file in unfiltered if is_relevant(file)] 68 | 69 | 70 | def get_matches( 71 | root_directories: List[Path], algorithm: str, 72 | options: PairFinderOptions = PairFinderOptions(), 73 | hash_store_path: Optional[Path] = None, 74 | exclude_regexes: Optional[List[str]] = None 75 | ) -> Results: 76 | hash_algorithm = IMAGE_HASH_ALGORITHM[algorithm] 77 | hash_size_kwargs = get_hash_size_kwargs(hash_algorithm, options.hash_size) 78 | image_files = files_in_dirs(root_directories, is_image_file, exclude_regexes) 79 | logging.info('%d total files', len(image_files)) 80 | image_files.sort() 81 | logging.info('Computing image hashes') 82 | 83 | with FileHashStore.create(hash_store_path, algorithm, hash_size_kwargs) as hash_store: 84 | return ImagePairFinder.create( 85 | image_files, hash_algorithm, options=options, hash_store=hash_store, 86 | ).get_equal_groups() 87 | 88 | 89 | def execute_actions(matches: Results, args: Namespace) -> None: 90 | action_equal = ACTIONS_ON_EQUALITY[args.on_equal] 91 | if args.parallel_actions: 92 | with ThreadPool(args.parallel_actions) as pool: 93 | pool.map(lambda group: execute_action(action_equal, group, args), matches) 94 | else: 95 | for group in sorted(matches): 96 | execute_action(action_equal, group, args) 97 | 98 | 99 | def execute_action(action: ActionFunction, group: ImageGroup, args: Namespace) -> None: 100 | try: 101 | action(args, group) 102 | except FileNotFoundError: 103 | pass 104 | 105 | 106 | def set_max_image_pixels(args: Namespace) -> None: 107 | if args.max_image_pixels is not None: 108 | PIL.Image.MAX_IMAGE_PIXELS = args.max_image_pixels 109 | 110 | 111 | def main() -> None: 112 | args = parse_command_line() 113 | setup_logging(args) 114 | set_max_image_pixels(args) 115 | options = PairFinderOptions.from_args(args) 116 | for folder in args.root_directory: 117 | logging.info( 118 | 'Scanning %s %s', path_with_parent(folder), 119 | f'(excluding {", ".join(args.exclude_dir)})' if args.exclude_dir else '' 120 | ) 121 | try: 122 | matches = get_matches( 123 | [Path(folder) for folder in args.root_directory], args.algorithm, 124 | options=options, hash_store_path=Path(args.hash_db) if args.hash_db else None, 125 | exclude_regexes=list(args.exclude_dir) if args.exclude_dir else None 126 | ) 127 | logging.info('%d matches', len(matches)) 128 | execute_actions(matches, args) 129 | except KeyboardInterrupt: 130 | pass 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /duplicate_images/function_types.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shorter and more descriptive type aliases used in static type checking for the 3 | `duplicate_images` package. 
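The central alias is Hash, a union of ImageHash and ImageMultiHash as returned by the imagehash library; Results is the list of matching image groups produced by the pair finders.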
4 | """ 5 | __author__ = 'Lene Preuss ' 6 | 7 | from argparse import Namespace 8 | from pathlib import Path 9 | from typing import Any, Callable, Dict, List, Optional, Tuple, Generator, Union 10 | 11 | from PIL import Image 12 | from imagehash import ImageHash, ImageMultiHash 13 | 14 | Hash = Union[ImageHash, ImageMultiHash] 15 | HashFunction = Callable[[Image.Image], Hash] 16 | ImageGroup = Tuple[Path, ...] 17 | ActionFunction = Callable[[Namespace, ImageGroup], Any] 18 | Results = List[ImageGroup] 19 | ResultsGenerator = Generator[List[Path], None, None] 20 | ResultsGrouper = Callable[[ResultsGenerator], Results] 21 | CacheEntry = Tuple[Path, Optional[Hash]] 22 | Cache = Dict[Path, Hash] 23 | 24 | 25 | def is_hash(x: Any) -> bool: 26 | return isinstance(x, (ImageHash, ImageMultiHash)) 27 | -------------------------------------------------------------------------------- /duplicate_images/hash_scanner/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functionality to compute and store the image hashes of a set of images 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | from duplicate_images.hash_scanner.image_hash_scanner import ( 7 | ImageHashScanner, ParallelImageHashScanner 8 | ) 9 | -------------------------------------------------------------------------------- /duplicate_images/hash_scanner/image_hash_scanner.py: -------------------------------------------------------------------------------- 1 | """ 2 | Calculate the image hashes of a given set of images 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | import os 8 | 9 | from multiprocessing.pool import ThreadPool 10 | from pathlib import Path 11 | from typing import List, Optional, Dict 12 | 13 | from PIL import Image 14 | from PIL.Image import DecompressionBombError 15 | 16 | from duplicate_images.common import path_with_parent 17 | from duplicate_images.function_types import CacheEntry, HashFunction 18 | from duplicate_images.hash_store import HashStore, NullHashStore 19 | from duplicate_images.methods import get_hash_size_kwargs 20 | from duplicate_images.pair_finder_options import PairFinderOptions 21 | from duplicate_images.progress_bar_manager import ProgressBarManager, NullProgressBarManager 22 | 23 | 24 | class ImageHashScanner: 25 | """ 26 | Reads images from the given list of files and calculates their image hashes, 27 | using a single thread only 28 | """ 29 | 30 | @staticmethod 31 | def create( 32 | files: List[Path], hash_algorithm: HashFunction, 33 | options: PairFinderOptions, 34 | hash_store: HashStore = NullHashStore(), 35 | progress_bars: ProgressBarManager = NullProgressBarManager() 36 | ) -> 'ImageHashScanner': 37 | hash_size_kwargs = get_hash_size_kwargs(hash_algorithm, options.hash_size) 38 | if not options.parallel: 39 | return ImageHashScanner( 40 | files, hash_algorithm, hash_size_kwargs, hash_store, progress_bars 41 | ) 42 | return ParallelImageHashScanner( 43 | files, hash_algorithm, hash_size_kwargs, hash_store, progress_bars, 44 | options.parallel 45 | ) 46 | 47 | def __init__( # pylint: disable = too-many-arguments,too-many-positional-arguments 48 | self, files: List[Path], hash_algorithm: HashFunction, 49 | hash_size_kwargs: Optional[Dict] = None, 50 | hash_store: HashStore = NullHashStore(), 51 | progress_bars: ProgressBarManager = NullProgressBarManager() 52 | ) -> None: 53 | self.files = files 54 | self.algorithm = hash_algorithm 55 | self.hash_size_kwargs = hash_size_kwargs if hash_size_kwargs is not None else {} 56 | 
self.hash_store = hash_store 57 | self.progress_bars = progress_bars 58 | logging.info('Using %s', self.class_string()) 59 | 60 | def class_string(self) -> str: 61 | return self.__class__.__name__ 62 | 63 | def precalculate_hashes(self) -> List[CacheEntry]: 64 | return [self.get_hash(file) for file in self.files] 65 | 66 | def get_hash(self, file: Path) -> CacheEntry: 67 | self.progress_bars.update_reader() 68 | try: 69 | cached = self.hash_store.get(file) 70 | if cached is not None: 71 | return file, cached 72 | 73 | image_hash = self.algorithm(Image.open(file), **self.hash_size_kwargs) 74 | self.hash_store.add(file, image_hash) 75 | return file, image_hash 76 | except OSError as err: 77 | logging.warning('%s: %s', path_with_parent(file), err) 78 | return file, None 79 | except DecompressionBombError as err: 80 | logging.warning('%s: %s', path_with_parent(file), err) 81 | logging.warning('To process this file, use the --max-image-pixels option') 82 | return file, None 83 | 84 | 85 | class ParallelImageHashScanner(ImageHashScanner): 86 | """ 87 | Reads images from the given list of files and calculates their image hashes, 88 | using a specified number of threads in parallel 89 | """ 90 | 91 | def __init__( # pylint: disable = too-many-arguments,too-many-positional-arguments 92 | self, 93 | files: List[Path], hash_algorithm: HashFunction, 94 | hash_size_kwargs: Optional[Dict] = None, 95 | hash_store: HashStore = NullHashStore(), 96 | progress_bars: ProgressBarManager = NullProgressBarManager(), 97 | parallel: int = os.cpu_count() or 1 98 | ) -> None: 99 | self.num_threads = parallel 100 | super().__init__(files, hash_algorithm, hash_size_kwargs, hash_store, progress_bars) 101 | 102 | def class_string(self) -> str: 103 | return f'{self.__class__.__name__} with {self.num_threads} threads' 104 | 105 | def precalculate_hashes(self) -> List[CacheEntry]: 106 | with ThreadPool(self.num_threads) as pool: 107 | return pool.map(self.get_hash, self.files) 108 | -------------------------------------------------------------------------------- /duplicate_images/hash_store.py: -------------------------------------------------------------------------------- 1 | """ 2 | Persistent storage for calculated image hashes 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import json 7 | import logging 8 | import pickle # nosec 9 | from pathlib import Path 10 | from typing import Any, IO, Callable, Optional, Union, Dict, Tuple 11 | 12 | from imagehash import hex_to_hash 13 | 14 | from duplicate_images.common import log_execution_time 15 | from duplicate_images.function_types import Cache, Hash, is_hash 16 | 17 | 18 | class NullHashStore: 19 | """ 20 | Hash store that does not store anything but can be used as a drop-in 21 | replacement for `FileHashStore` and `PickleHashStore` when no persistent 22 | storage is desired 23 | """ 24 | 25 | def __init__(self) -> None: 26 | logging.info('No persistent storage for calculated image hashes set up') 27 | 28 | def __enter__(self) -> 'NullHashStore': 29 | return self 30 | 31 | def __exit__(self, _: Any, __: Any, ___: Any) -> None: 32 | pass 33 | 34 | def get(self, _: Path) -> Optional[Hash]: 35 | return None 36 | 37 | def add(self, _: Path, __: Hash) -> None: 38 | pass 39 | 40 | 41 | HashStore = Union[NullHashStore, 'FileHashStore', 'PickleHashStore', 'JSONHashStore'] 42 | 43 | 44 | class FileHashStore: 45 | """ 46 | Base class for persistent storage of calculated image hashes, providing all 47 | necessary functionality except for reading and writing data to various file 48 | 
formats 49 | """ 50 | @staticmethod 51 | def create( 52 | store_path: Optional[Path], algorithm: str, hash_size_kwargs: Dict 53 | ) -> Union['FileHashStore', NullHashStore]: 54 | if store_path is None: 55 | return NullHashStore() 56 | if store_path.suffix == '.pickle': 57 | return PickleHashStore(store_path, algorithm, hash_size_kwargs) 58 | return JSONHashStore(store_path, algorithm, hash_size_kwargs) 59 | 60 | def __init__(self, store_path: Path, algorithm: str, hash_size_kwargs: Dict) -> None: 61 | self.store_path = store_path 62 | self.algorithm = algorithm 63 | self.hash_size_kwargs = hash_size_kwargs 64 | self.values: Cache = {} 65 | self.dirty: bool = False 66 | try: 67 | self.load() 68 | logging.info( 69 | 'Opened persistent storage %s with %d entries', store_path, len(self.values) 70 | ) 71 | except (FileNotFoundError, EOFError, pickle.PickleError): 72 | logging.info('Creating new %s at %s', self.__class__.__name__, store_path) 73 | 74 | def __enter__(self) -> 'FileHashStore': 75 | return self 76 | 77 | def __exit__(self, _: Any, __: Any, ___: Any) -> None: 78 | if not self.dirty: 79 | return 80 | if self.store_path.is_file(): 81 | if self.store_path.with_suffix('.bak').is_file(): 82 | self.store_path.with_suffix('.bak').unlink() 83 | self.store_path.rename(self.store_path.with_suffix('.bak')) 84 | self.dump() 85 | 86 | def add(self, file: Path, image_hash: Hash) -> None: 87 | self.values[file] = image_hash 88 | self.dirty = True 89 | 90 | def get(self, file: Path) -> Optional[Hash]: 91 | return self.values.get(file) 92 | 93 | def metadata(self) -> Dict: 94 | return {'algorithm': self.algorithm, **self.hash_size_kwargs} 95 | 96 | def values_with_metadata(self) -> Tuple[Dict, Dict]: 97 | return self.values, self.metadata() 98 | 99 | def checked_load(self, file: IO, load: Callable[[IO], Tuple[Cache, Dict]]) -> None: 100 | try: 101 | values, metadata = load(file) # nosec 102 | except IndexError as error: 103 | raise ValueError('Save file not in format: [values, metadata]') from error 104 | if not isinstance(values, dict): 105 | raise ValueError(f'Not a dict: {values}') 106 | if not metadata: 107 | raise ValueError('Metadata empty') 108 | if not isinstance(metadata, dict): 109 | raise ValueError(f'Metadata not a dict: {metadata}') 110 | bad_keys = [key for key in values.keys() if not isinstance(key, Path)] 111 | if bad_keys: 112 | raise ValueError(f'Not a Path: {bad_keys}') 113 | bad_values = [value for value in values.values() if not is_hash(value)] 114 | if bad_values: 115 | raise ValueError(f'Not an image hash: {bad_values}') 116 | if metadata['algorithm'] != self.algorithm: 117 | raise ValueError(f'Algorithm mismatch: {metadata["algorithm"]} != {self.algorithm}') 118 | if metadata.keys() != self.metadata().keys(): 119 | raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}') 120 | if metadata != self.metadata(): 121 | raise ValueError(f'Metadata mismatch: {metadata} != {self.metadata()}') 122 | self.values = values 123 | 124 | def load(self) -> None: 125 | raise NotImplementedError() 126 | 127 | def dump(self) -> None: 128 | raise NotImplementedError() 129 | 130 | 131 | class PickleHashStore(FileHashStore): 132 | """ 133 | Implementation of `FileHashStore` that reads and stores the calculated 134 | image hashes in Pickle format 135 | """ 136 | 137 | @log_execution_time() 138 | def load(self) -> None: 139 | with self.store_path.open('rb') as file: 140 | self.checked_load(file, pickle.load) 141 | 142 | @log_execution_time() 143 | def dump(self) -> None: 144 | with 
self.store_path.open('wb') as file: 145 | pickle.dump(self.values_with_metadata(), file) # nosec 146 | 147 | 148 | def load_values_and_metadata(file: IO) -> Tuple[Cache, Dict]: 149 | try: 150 | valds = json.load(file) 151 | except json.JSONDecodeError as error: 152 | raise ValueError('Save file not in JSON format') from error 153 | if not isinstance(valds, list): 154 | raise ValueError('Save file not in format: [values, metadata]') 155 | if not isinstance(valds[0], dict): 156 | raise ValueError(f'Not a dict: {valds[0]}') 157 | if not isinstance(valds[1], dict): 158 | raise ValueError(f'Metadata not a dict: {valds[1]}') 159 | return {Path(k): hex_to_hash(str(v)) for k, v in valds[0].items()}, valds[1] 160 | 161 | 162 | class JSONHashStore(FileHashStore): 163 | """ 164 | Implementation of `FileHashStore` that reads and stores the calculated 165 | image hashes in JSON format 166 | """ 167 | 168 | @log_execution_time() 169 | def load(self) -> None: 170 | with self.store_path.open('r') as file: 171 | self.checked_load(file, load_values_and_metadata) 172 | 173 | # see https://bugs.python.org/issue18820 for why this pain is necessary (Python does not 174 | # automatically convert dict keys for JSON export) 175 | def converted_values(self): 176 | return {str(k.resolve()): str(v) for k, v in self.values.items()} 177 | 178 | @log_execution_time() 179 | def dump(self) -> None: 180 | with self.store_path.open('w') as file: 181 | json.dump((self.converted_values(), self.metadata()), file) 182 | -------------------------------------------------------------------------------- /duplicate_images/image_pair_finder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Finds duplicate images by comparing their image hashes using the given hash 3 | algorithm 4 | """ 5 | __author__ = 'Lene Preuss ' 6 | 7 | import logging 8 | from itertools import combinations 9 | from pathlib import Path 10 | from time import time 11 | from typing import Dict, List, Iterator 12 | 13 | from duplicate_images.common import log_execution_time 14 | from duplicate_images.function_types import ( 15 | Hash, HashFunction, ImageGroup, Results, ResultsGenerator, ResultsGrouper 16 | ) 17 | from duplicate_images.hash_scanner import ImageHashScanner 18 | from duplicate_images.hash_store import HashStore, NullHashStore 19 | from duplicate_images.pair_finder_options import PairFinderOptions 20 | from duplicate_images.progress_bar_manager import ProgressBarManager, NullProgressBarManager 21 | 22 | 23 | def group_results_as_pairs(results: ResultsGenerator) -> Results: 24 | return [ 25 | pair 26 | for result in results 27 | for pair in combinations(list(result), 2) 28 | ] 29 | 30 | 31 | def group_results_as_tuples(results: ResultsGenerator) -> Results: 32 | return [tuple(result) for result in results] 33 | 34 | 35 | class ImagePairFinder: 36 | """ 37 | Finds duplicate images by comparing their image hashes 38 | """ 39 | 40 | @classmethod 41 | def create( 42 | cls, files: List[Path], hash_algorithm: HashFunction, 43 | options: PairFinderOptions = PairFinderOptions(), 44 | hash_store: HashStore = NullHashStore() 45 | ) -> 'ImagePairFinder': 46 | group_results = group_results_as_tuples if options.group else group_results_as_pairs 47 | progress_bars = ProgressBarManager.create(len(files), options.show_progress_bars) 48 | scanner = ImageHashScanner.create(files, hash_algorithm, options, hash_store, progress_bars) 49 | 50 | if options.max_distance == 0 and not options.slow: 51 | return
DictImagePairFinder( 52 | scanner, group_results, options=options, progress_bars=progress_bars 53 | ) 54 | if len(files) > 1000: 55 | logging.warning( 56 | 'Using %s with a big number of images. Expect slow performance.', 57 | SlowImagePairFinder.__name__ 58 | ) 59 | logging.warning('Consider using [Parallel]DictImagePairFinder instead.') 60 | return SlowImagePairFinder(scanner, group_results, options, progress_bars) 61 | 62 | def __init__( # pylint: disable = too-many-arguments 63 | self, scanner: ImageHashScanner, 64 | group_results: ResultsGrouper, 65 | progress_bars: ProgressBarManager = NullProgressBarManager() 66 | ) -> None: 67 | self.precalculated_hashes: Dict = {} 68 | self.group_results = group_results 69 | self.scanner = scanner 70 | self.progress_bars = progress_bars 71 | self.scan_start_time = time() 72 | logging.info('Using %s', self.__class__.__name__) 73 | 74 | def get_equal_groups(self) -> Results: 75 | raise NotImplementedError() 76 | 77 | def log_scan_finished(self) -> None: 78 | logging.info( 79 | '%d distinct hashes calculated in %.2fs', 80 | len(self.precalculated_hashes), time() - self.scan_start_time 81 | ) 82 | 83 | 84 | class DictImagePairFinder(ImagePairFinder): 85 | """ 86 | Searches by storing the image hashes as keys to a dict. 87 | Works only if max_distance == 0. 88 | """ 89 | def __init__( # pylint: disable = too-many-arguments 90 | self, scanner: ImageHashScanner, 91 | group_results: ResultsGrouper, 92 | options: PairFinderOptions = PairFinderOptions(), 93 | progress_bars: ProgressBarManager = NullProgressBarManager() 94 | ) -> None: 95 | super().__init__(scanner, group_results, progress_bars) 96 | if options.max_distance != 0: 97 | raise ValueError('DictImagePairFinder only works if max_distance == 0!') 98 | self.precalculated_hashes = self.get_hashes() 99 | self.progress_bars.close_reader() 100 | 101 | @log_execution_time() 102 | def get_equal_groups(self) -> Results: 103 | self.progress_bars.close() 104 | self.log_scan_finished() 105 | return self.group_results( 106 | (result for result in self.precalculated_hashes.values() if len(result) > 1) 107 | ) 108 | 109 | def get_hashes(self) -> Dict[Hash, List[Path]]: 110 | hash_dict: Dict[Hash, List[Path]] = {} 111 | for file, image_hash in self.scanner.precalculate_hashes(): 112 | if image_hash is not None: 113 | hash_dict.setdefault(image_hash, []).append(file) 114 | return hash_dict 115 | 116 | 117 | class SlowImagePairFinder(ImagePairFinder): 118 | """ 119 | Searches by comparing the image hashes of each image to every other, giving O(N^2) performance. 120 | Does not allow returning the results in groups, only pairs. 121 | The only option if max_distance != 0. 
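Selected by ImagePairFinder.create() whenever the --slow option is given or max_distance is nonzero.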
122 | """ 123 | 124 | def __init__( # pylint: disable = too-many-arguments 125 | self, scanner: ImageHashScanner, 126 | group_results: ResultsGrouper, 127 | options: PairFinderOptions = PairFinderOptions(), 128 | progress_bars: ProgressBarManager = NullProgressBarManager() 129 | ) -> None: 130 | if group_results is group_results_as_tuples: 131 | raise ValueError(f'{self.__class__.__name__} only works with pairs, not groups') 132 | super().__init__(scanner, group_results, progress_bars) 133 | self.max_distance = options.max_distance or 0 134 | self.precalculated_hashes = self.get_hashes() 135 | self.progress_bars.close_reader() 136 | 137 | @log_execution_time() 138 | def get_equal_groups(self) -> Results: 139 | self.log_scan_finished() 140 | image_files = list(self.precalculated_hashes.keys()) 141 | logging.info('Filtering duplicates') 142 | matches = self.filter_matches(combinations(image_files, 2)) 143 | self.progress_bars.close() 144 | return matches 145 | 146 | def get_hashes(self) -> Dict[Path, Hash]: 147 | return { 148 | file: image_hash for file, image_hash in self.scanner.precalculate_hashes() 149 | if image_hash is not None 150 | } 151 | 152 | def filter_matches(self, all_pairs: Iterator[ImageGroup]) -> Results: 153 | self.progress_bars.create_filter_bar(len(self.precalculated_hashes)) 154 | return [ 155 | (file, other_file) for file, other_file in all_pairs 156 | if self.are_images_equal(file, other_file) 157 | ] 158 | 159 | def are_images_equal(self, file: Path, other_file: Path) -> bool: 160 | self.progress_bars.update_filter() 161 | hash_distance = self.precalculated_hashes[file] - self.precalculated_hashes[other_file] 162 | logging.debug( 163 | '%-30s - %-30s = %d', file.stem, other_file.stem, hash_distance 164 | ) 165 | return hash_distance <= self.max_distance 166 | -------------------------------------------------------------------------------- /duplicate_images/log.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging setup 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | from argparse import Namespace 8 | 9 | import coloredlogs 10 | 11 | 12 | def setup_logging(args: Namespace) -> None: 13 | log_level = logging.DEBUG if args.debug else logging.INFO 14 | for _ in range(args.quiet): 15 | log_level += (logging.INFO - logging.DEBUG) 16 | coloredlogs.install( 17 | level=log_level, fmt='%(asctime)s %(levelname)s: %(message)s', 18 | datefmt='%H:%M:%S' 19 | ) 20 | -------------------------------------------------------------------------------- /duplicate_images/methods.py: -------------------------------------------------------------------------------- 1 | """ 2 | Definition of the possible actions run on sets of equal images 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | from argparse import Namespace 8 | from pathlib import Path 9 | from shlex import quote 10 | from subprocess import call # nosec 11 | from typing import Callable, Dict, List, Optional, Union 12 | 13 | import imagehash 14 | 15 | from duplicate_images.common import path_with_parent 16 | from duplicate_images.function_types import ActionFunction, HashFunction, ImageGroup 17 | 18 | __all__ = [ 19 | 'call', 'quote', 'get_hash_size_kwargs', 'IMAGE_HASH_ALGORITHM', 'ALGORITHM_DEFAULTS', 20 | 'ACTIONS_ON_EQUALITY' 21 | ] 22 | 23 | 24 | def ascending_by_size(group: ImageGroup) -> List[Path]: 25 | return sorted(group, key=lambda path: (path.stat().st_size, str(path))) 26 | 27 | 28 | def delete_with_log_message(file: Path) -> None: 29 | 
file.unlink() 30 | logging.info('Deleted %s', path_with_parent(file)) 31 | 32 | 33 | def move_with_log_message(file: Path, destination: Path, recreate_path: bool) -> None: 34 | target = destination / (file.relative_to(file.anchor) if recreate_path else file.name) 35 | if recreate_path: 36 | target.parent.mkdir(parents=True, exist_ok=True) 37 | file.rename(target) 38 | logging.info('Moved %s to %s', file, target) 39 | 40 | 41 | def symlink_to_nth_smallest_file(group: ImageGroup, index: int) -> None: 42 | biggest = ascending_by_size(group)[index] 43 | others = set(group) - {biggest} 44 | for file in others: 45 | delete_with_log_message(file) 46 | file.symlink_to(biggest) 47 | 48 | 49 | def shell_exec(args: Namespace, group: ImageGroup) -> None: 50 | cmd = args.exec 51 | for num, path in enumerate(group): 52 | cmd = cmd.replace(f"{'{'}{num + 1}{'}'}", f'{quote(str(path))}') 53 | cmd = cmd.replace('{*}', ' '.join([quote(str(path)) for path in group])) 54 | call(cmd, shell=True) # nosec 55 | 56 | 57 | def get_hash_size_kwargs(algorithm: HashFunction, size: Optional[int]) -> Dict: 58 | if size is None: 59 | return ALGORITHM_DEFAULTS.get(algorithm, {'hash_size': 8}) 60 | kwarg = next(iter(ALGORITHM_DEFAULTS.get(algorithm, {'hash_size': 8}))) 61 | return {} if kwarg == 'hash_func' else {kwarg: size} 62 | 63 | 64 | IMAGE_HASH_ALGORITHM = { 65 | 'ahash': imagehash.average_hash, 66 | 'phash': imagehash.phash, 67 | 'phash_simple': imagehash.phash_simple, 68 | 'dhash': imagehash.dhash, 69 | 'dhash_vertical': imagehash.dhash_vertical, 70 | 'whash': imagehash.whash, 71 | 'colorhash': imagehash.colorhash, 72 | 'crop_resistant': imagehash.crop_resistant_hash, 73 | } # type: Dict[str, HashFunction] 74 | 75 | ALGORITHM_DEFAULTS: Dict[Callable, Dict[str, Union[int, HashFunction]]] = { 76 | imagehash.average_hash: {'hash_size': 8}, 77 | imagehash.phash: {'hash_size': 8}, 78 | imagehash.phash_simple: {'hash_size': 8}, 79 | imagehash.dhash: {'hash_size': 8}, 80 | imagehash.dhash_vertical: {'hash_size': 8}, 81 | imagehash.whash: {'hash_size': 8}, 82 | imagehash.colorhash: {'binbits': 3}, 83 | imagehash.crop_resistant_hash: {'hash_func': imagehash.phash}, 84 | } 85 | 86 | ACTIONS_ON_EQUALITY: Dict[str, ActionFunction] = { 87 | 'delete-first': lambda args, group: delete_with_log_message(group[0]), 88 | 'd1': lambda args, group: delete_with_log_message(group[0]), 89 | 'delete-last': lambda args, group: delete_with_log_message(group[-1]), 90 | 'dl': lambda args, group: delete_with_log_message(group[-1]), 91 | 'delete-biggest': lambda args, group: delete_with_log_message(ascending_by_size(group)[-1]), 92 | 'd>': lambda args, group: delete_with_log_message(ascending_by_size(group)[-1]), 93 | 'delete-smallest': lambda args, group: delete_with_log_message(ascending_by_size(group)[0]), 94 | 'd<': lambda args, group: delete_with_log_message(ascending_by_size(group)[0]), 95 | 'move-first': lambda args, group: move_with_log_message( 96 | group[0], Path(args.move_to), args.move_recreate_path 97 | ), 98 | 'm1': lambda args, group: move_with_log_message( 99 | group[0], Path(args.move_to), args.move_recreate_path 100 | ), 101 | 'move-last': lambda args, group: move_with_log_message( 102 | group[-1], Path(args.move_to), args.move_recreate_path 103 | ), 104 | 'ml': lambda args, group: move_with_log_message( 105 | group[-1], Path(args.move_to), args.move_recreate_path 106 | ), 107 | 'move-biggest': lambda args, group: move_with_log_message( 108 | ascending_by_size(group)[-1], Path(args.move_to), args.move_recreate_path 109 | ), 110 
| 'm>': lambda args, group: move_with_log_message( 111 | ascending_by_size(group)[-1], Path(args.move_to), args.move_recreate_path 112 | ), 113 | 'move-smallest': lambda args, group: move_with_log_message( 114 | ascending_by_size(group)[0], Path(args.move_to), args.move_recreate_path 115 | ), 116 | 'm<': lambda args, group: move_with_log_message( 117 | ascending_by_size(group)[0], Path(args.move_to), args.move_recreate_path 118 | ), 119 | 'symlink-smaller': lambda args, group: symlink_to_nth_smallest_file(group, -1), 120 | 'symlink-bigger': lambda args, group: symlink_to_nth_smallest_file(group, 0), 121 | 'eog': lambda args, group: call(['eog'] + [str(pic) for pic in group]), # nosec 122 | 'xv': lambda args, group: call(['xv', '-nolim'] + [str(pic) for pic in group]), # nosec 123 | 'print': lambda args, group: print(*group), 124 | 'print_inline': lambda args, group: print(*group, end=' '), 125 | 'quote': lambda args, group: print(' '.join([quote(str(pic)) for pic in group])), 126 | 'quote_inline': lambda args, group: print( 127 | ' '.join([quote(str(pic)) for pic in group]), end=' ' 128 | ), 129 | 'exec': lambda args, group: shell_exec(args, group), # pylint: disable=unnecessary-lambda 130 | 'none': lambda args, group: None, 131 | } 132 | 133 | MOVE_ACTIONS = ['move-first', 'm1', 'move-last', 'ml', 'move-biggest', 'm>', 'move-smallest', 'm<'] 134 | -------------------------------------------------------------------------------- /duplicate_images/pair_finder_options.py: -------------------------------------------------------------------------------- 1 | """ 2 | Encapsulates the options for scanning images and detecting duplicates 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | from argparse import Namespace 7 | from dataclasses import dataclass 8 | from typing import Optional 9 | 10 | 11 | @dataclass(frozen=True) 12 | class PairFinderOptions: 13 | """ 14 | Encapsulates the options for scanning images and detecting duplicates and 15 | reads them from an `argparse.Namespace` object 16 | """ 17 | max_distance: int = 0 18 | hash_size: Optional[int] = None 19 | show_progress_bars: bool = False 20 | parallel: Optional[int] = None 21 | slow: bool = False 22 | group: bool = False 23 | 24 | @classmethod 25 | def from_args(cls, args: Namespace): 26 | return cls( 27 | args.max_distance, args.hash_size, args.progress, args.parallel, args.slow, args.group 28 | ) 29 | -------------------------------------------------------------------------------- /duplicate_images/parse_commandline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define and parse command line arguments for the `find-dups` command line tool 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | import logging 7 | from os import cpu_count 8 | from argparse import ArgumentParser, Namespace, RawDescriptionHelpFormatter 9 | from configparser import ConfigParser 10 | from typing import List, Optional, Dict, Union 11 | 12 | from PIL import Image 13 | 14 | from duplicate_images.methods import ACTIONS_ON_EQUALITY, IMAGE_HASH_ALGORITHM, MOVE_ACTIONS 15 | 16 | DefaultsDict = Dict[str, Union[str, int, bool, None]] 17 | DEFAULTS: DefaultsDict = { 18 | 'root_directory': '.', 19 | 'exclude_dir': None, 20 | 'algorithm': 'phash', 21 | 'max_distance': 0, 22 | 'hash_size': None, 23 | 'on_equal': 'print', 24 | 'exec': None, 25 | 'move_to': None, 26 | 'move_recreate_path': False, 27 | 'parallel': None, 28 | 'parallel_actions': None, 29 | 'slow': False, 30 | 'group': False, 31 | 'progress': False, 32 | 'debug': False, 
33 | 'quiet': 0, 34 | 'hash_db': None, 35 | 'max_image_pixels': None 36 | } 37 | 38 | 39 | def is_power_of_2(n: int) -> bool: 40 | # https://stackoverflow.com/questions/57025836/how-to-check-if-a-given-number-is-a-power-of-two 41 | return (n != 0) and (n & (n - 1) == 0) 42 | 43 | 44 | def parse_command_line(args: Optional[List[str]] = None) -> Namespace: 45 | conf_parser = create_config_file_parser() 46 | conf_namespace, remaining_argv = conf_parser.parse_known_args(args) 47 | defaults = read_defaults_from_config(conf_namespace) 48 | 49 | parser = create_main_parser(conf_parser, defaults) 50 | namespace = parser.parse_args(remaining_argv) 51 | 52 | check_complex_errors(namespace, parser) 53 | return namespace 54 | 55 | 56 | def create_config_file_parser() -> ArgumentParser: 57 | conf_parser = ArgumentParser( 58 | description=__doc__, 59 | formatter_class=RawDescriptionHelpFormatter, 60 | add_help=False 61 | ) 62 | conf_parser.add_argument('-c', '--config-file', help='Specify config file', metavar='FILE') 63 | return conf_parser 64 | 65 | 66 | def read_defaults_from_config(conf_namespace: Namespace) -> DefaultsDict: 67 | defaults = DEFAULTS.copy() 68 | if conf_namespace.config_file: 69 | config = ConfigParser() 70 | config.read([conf_namespace.config_file]) 71 | logging.warning(config.sections()) 72 | defaults.update(dict(config.items('Defaults'))) 73 | return defaults 74 | 75 | 76 | def create_main_parser(parent_parser: ArgumentParser, defaults: DefaultsDict) -> ArgumentParser: 77 | parser = ArgumentParser( 78 | description='Find pairs of equal or similar images.', 79 | # Inherit options from config_parser 80 | parents=[parent_parser] 81 | ) 82 | parser.set_defaults(**defaults) 83 | parser.add_argument( 84 | 'root_directory', default='.', nargs='+', 85 | help='The root of the directory tree under which images are compared' 86 | ) 87 | parser.add_argument( 88 | '--exclude-dir', nargs='*', 89 | help='Directories to exclude from the search (can be given as regular expressions)' 90 | ) 91 | parser.add_argument( 92 | '--algorithm', choices=IMAGE_HASH_ALGORITHM.keys(), 93 | help='Method used to determine if two images are considered equal' 94 | ) 95 | parser.add_argument( 96 | '--max-distance', type=int, 97 | help='Maximum hash distance for images to be considered equal' 98 | ) 99 | parser.add_argument( 100 | '--hash-size', type=int, 101 | help='Hash size (or number of bin bits for colorhash)' 102 | ) 103 | parser.add_argument( 104 | '--on-equal', choices=ACTIONS_ON_EQUALITY.keys(), 105 | help='Command to be run on each pair of images found to be equal' 106 | ) 107 | parser.add_argument( 108 | '--exec', type=str, 109 | help='Command to execute (replaces {1}, {2} or {*} with file paths)' 110 | ) 111 | parser.add_argument( 112 | '--move-to', type=str, 113 | help='Destination directory for moving duplicate images' 114 | ) 115 | parser.add_argument( 116 | '--move-recreate-path', action='store_true', 117 | help='recreate the path the original images are under in the destination directory' 118 | ) 119 | parser.add_argument( 120 | '--parallel', nargs='?', type=int, const=cpu_count(), 121 | help=f'Calculate hashes using PARALLEL threads (default: {cpu_count()})' 122 | ) 123 | parser.add_argument( 124 | '--parallel-actions', nargs='?', type=int, const=cpu_count(), 125 | help=f'Execute actions on equal images using PARALLEL threads (default: {cpu_count()})' 126 | ) 127 | group = parser.add_mutually_exclusive_group() 128 | group.add_argument( 129 | '--slow', action='store_true', help='Use slow (O(N^2)) 
algorithm' 130 | ) 131 | group.add_argument( 132 | '--group', action='store_true', 133 | help='Handle equal images in a group instead of multiple pairs' 134 | ) 135 | parser.add_argument( 136 | '--progress', action='store_true', help='Show progress bars during processing' 137 | ) 138 | parser.add_argument( 139 | '--debug', action='store_true', help='Print lots of debugging info' 140 | ) 141 | parser.add_argument( 142 | '--quiet', '-q', action='count', help='Decrease log level by one for each occurrence' 143 | ) 144 | parser.add_argument( 145 | '--hash-db', help='File storing precomputed hashes' 146 | ) 147 | parser.add_argument( 148 | '--max-image-pixels', type=int, 149 | help=f'Maximum size of image in pixels (default: {Image.MAX_IMAGE_PIXELS})' 150 | ) 151 | return parser 152 | 153 | 154 | def check_complex_errors(namespace, parser): 155 | if namespace.on_equal == 'exec' and not namespace.exec: 156 | parser.error('--exec argument is required with --on-equal exec') 157 | if namespace.exec and namespace.on_equal != 'exec': 158 | parser.error('--exec is only allowed with --on-equal exec') 159 | # hash_size may still be None here; the default of 8 is a power of 2, so only check set values 160 | if namespace.algorithm == 'whash' and namespace.hash_size is not None \ 161 | and not is_power_of_2(namespace.hash_size): 162 | parser.error('whash requires hash_size to be a power of 2') 163 | if namespace.group and namespace.max_distance: 164 | parser.error('--max-distance: not allowed with argument --group') 165 | if namespace.move_to and namespace.on_equal not in MOVE_ACTIONS: 166 | parser.error(f'--move-to requires --on-equal to be one of: {", ".join(MOVE_ACTIONS)}') 167 | if namespace.on_equal in MOVE_ACTIONS and not namespace.move_to: 168 | parser.error(f'--on-equal {namespace.on_equal} requires --move-to to be set') 169 | if namespace.move_recreate_path and namespace.on_equal not in MOVE_ACTIONS: 170 | parser.error( 171 | f'--move-recreate-path requires --on-equal to be one of: {", ".join(MOVE_ACTIONS)}' 172 | ) 173 | -------------------------------------------------------------------------------- /duplicate_images/progress_bar_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | Controlling the progress bars shown while processing images 3 | """ 4 | __author__ = 'Lene Preuss ' 5 | 6 | from typing import Optional 7 | 8 | from tqdm import tqdm 9 | 10 | 11 | class ProgressBarManager: 12 | """ 13 | Manages the progress bars shown during image scan and, optionally, duplicate 14 | detection 15 | """ 16 | @classmethod 17 | def create(cls, files_length: int, active: bool = False) -> 'ProgressBarManager': 18 | return ProgressBarManager(files_length) if active else NullProgressBarManager() 19 | 20 | def __init__(self, files_length: int) -> None: 21 | self.reader_progress: Optional[tqdm] = tqdm( 22 | total=files_length, miniters=max(files_length / 100, 5), smoothing=0.1, unit='', 23 | delay=0.1 24 | ) if files_length else None 25 | self.filter_progress: Optional[tqdm] = None 26 | 27 | def create_filter_bar(self, hashes_length: int) -> None: 28 | self.close_reader() 29 | total_items = int(hashes_length * (hashes_length - 1) / 2) 30 | self.filter_progress = tqdm( 31 | total=total_items, unit='', 32 | unit_scale=True, miniters=max(total_items / 5000, 20000) 33 | ) 34 | 35 | def update_reader(self) -> None: 36 | if self.reader_progress is not None: 37 | self.reader_progress.update(1) 38 | 39 | def update_filter(self) -> None: 40 | if self.filter_progress is not None: 41 | self.filter_progress.update(1) 42 | 43 | def close_reader(self) -> None: 44 | if self.reader_progress is not None: 45 |
self.reader_progress.close() 46 | 47 | def close(self) -> None: 48 | if self.filter_progress is not None: 49 | self.filter_progress.close() 50 | 51 | 52 | class NullProgressBarManager(ProgressBarManager): 53 | """ 54 | Implementation of `ProgressBarManager` that does nothing but can be used in 55 | place of one 56 | """ 57 | def __init__(self) -> None: 58 | super().__init__(0) 59 | 60 | def create_filter_bar(self, hashes_length: int) -> None: 61 | pass 62 | 63 | def update_reader(self) -> None: 64 | pass 65 | 66 | def update_filter(self) -> None: 67 | pass 68 | 69 | def close_reader(self) -> None: 70 | pass 71 | 72 | def close(self) -> None: 73 | pass 74 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | warn_redundant_casts = True 4 | strict_optional = True 5 | warn_unused_ignores = True 6 | disallow_subclassing_any = False 7 | no_warn_return_any = True 8 | 9 | [mypy-*] 10 | disallow_untyped_calls = True 11 | disallow_untyped_defs = True 12 | check_untyped_defs = True 13 | no_implicit_optional = True 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "duplicate_images" 3 | version = "0.11.9" 4 | description = "Finds equal or similar images in a directory containing (many) image files" 5 | authors = ["Lene Preuss "] 6 | repository = "https://github.com/lene/DuplicateImages.git" 7 | homepage = "https://github.com/lene/DuplicateImages" 8 | readme = "README.md" 9 | classifiers = [ 10 | "Development Status :: 4 - Beta", 11 | "Environment :: Console", 12 | "Programming Language :: Python :: 3", 13 | "Topic :: Multimedia :: Graphics", 14 | "Topic :: Utilities" 15 | ] 16 | 17 | [tool.poetry.dependencies] 18 | python = ">=3.9" 19 | Wand = ">=0.6" 20 | pillow = ">=11.0" 21 | imagehash = ">=4.3" 22 | coloredlogs = ">=15.0" 23 | tqdm = ">=4.67" 24 | pillow-heif = ">=0.21" 25 | six = ">=1.17" 26 | numpy = ">=2.0" 27 | filetype = ">=1.2" 28 | setuptools = ">=75.6" 29 | 30 | [tool.poetry.group.dev.dependencies] 31 | bandit = ">=1.8" 32 | lz4 = ">=4.0" 33 | mypy = ">=1.14" 34 | flake8 = ">=7.1" 35 | pytest = ">=7.1" 36 | pylint = ">=3.3" 37 | pytest-xdist = ">=3.6" 38 | ptpython = ">=3.0" 39 | 40 | [tool.poetry.scripts] 41 | find-dups = "duplicate_images.duplicate:main" 42 | 43 | 44 | [build-system] 45 | requires = ["poetry-core>=1.6"] 46 | build-backend = "poetry.core.masonry.api" 47 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | -------------------------------------------------------------------------------- /tests/integration/conftest.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from tempfile import TemporaryDirectory 6 | from typing import Generator 7 | 8 | import pytest 9 | 10 | 11 | @pytest.fixture 12 | def data_dir() -> Path: 13 | return Path(__file__).resolve().parent / 'data' 14 | 15 | 16 | @pytest.fixture 17 | def tmp_dir() -> Generator[Path, None, None]: 18 | testdir = TemporaryDirectory() 19 | 
yield Path(testdir.name) 20 | try: 21 | testdir.cleanup() 22 | except FileNotFoundError: 23 | pass 24 | -------------------------------------------------------------------------------- /tests/integration/data/broken/47ff(1).jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/broken/47ff(1).jpg -------------------------------------------------------------------------------- /tests/integration/data/broken/47ff(2).jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/broken/47ff(2).jpg -------------------------------------------------------------------------------- /tests/integration/data/different/pair1/20221026_124702.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/different/pair1/20221026_124702.jpg -------------------------------------------------------------------------------- /tests/integration/data/different/pair1/20221026_124757.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/different/pair1/20221026_124757.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different.json: -------------------------------------------------------------------------------- 1 | [{"/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/61983_camillabelle_g-50%.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/camilla_belle_1297719284.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/61983_camillabelle_g-90%.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/camilla_belle_1297719284.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/61983_camillabelle_g-90%.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/61983_camillabelle_g-50%.jpg": "00183c1c3898981e", "/home/preuss/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/61983_camillabelle_g.jpg": "00183c1c3898981e", 
"/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/61983_camillabelle_g.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/61983_camillabelle_g.jpg": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair3/61983_camillabelle_g_2_10bit.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair3/61983_camillabelle_g_2_8bit.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/61983_camillabelle_g_2_q=50.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/61983_camillabelle_g_lossless.heic": "00183c1c3898981e", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/20221026_124702_q94.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/20221026_124702.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair3/20221026_124702_10bit.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair3/20221026_124702_8bit.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/20221026_124702_lossless.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/20221026_124702_q50.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/20221026_124702_q80.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/20221026_124702_50%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair1/20221026_124702_90%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/20221026_124702_50%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk50%/20221026_124702_80%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk20%/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk20%/20221026_124702_80%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk30%/20221026_124702.jpg": 
"cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk30%/20221026_124702_70%.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair2/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk20%/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/pair4/20221026_124702_q85.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk20%/20221026_124702_85%.jpg": "cc3c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic": "cc3c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic": "cc3c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossles_vs_lossy/20221026_124702_lossless.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossles_vs_lossy/20221026_124702_q85.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic": "cc7c6c071f1f3fae", "/home/lene/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic": "cc3c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic": "cc3c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic": "cc7c6c071f1f3fae", 
"/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg": "cc7c6c071f1f3fae", "/Users/mikereiche/IdeaProjects/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg": "cc7c6c071f1f3fae", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic": "9452538c3de569da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic": "9452538c3de569da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg": "9452d38c3de169da", "/home/lepr/workspace/DuplicateImages/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg": "9452d38c3de169da"}, {"algorithm": "phash", "hash_size": 8}] -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different.pickle -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_10bit.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/heic_bit_depth/20221026_124702_8bit.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_lossless.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/heic_lossless_vs_lossy/20221026_124702_q85.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q94.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/jpeg_quality/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.heic -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/jpeg_vs_heic/20221026_124702.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702.jpg -------------------------------------------------------------------------------- /tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/equal_but_binary_different/shrunk10%/20221026_124702_90%.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/heif/test1.heif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/heif/test1.heif -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/heif/test2.heif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/heif/test2.heif -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair1/20221026_124702_90%-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair1/20221026_124702_90%-2.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair1/20221026_124702_90%.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair1/20221026_124702_90%.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair2/20220312_124816-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair2/20220312_124816-2.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair2/20220312_124816.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair2/20220312_124816.jpg -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair3/IMAG0015_small-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair3/IMAG0015_small-2.png -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/pair3/IMAG0015_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/pair3/IMAG0015_small.png -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/webp/test1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/webp/test1.webp -------------------------------------------------------------------------------- /tests/integration/data/exactly_equal/webp/test2.webp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/exactly_equal/webp/test2.webp -------------------------------------------------------------------------------- /tests/integration/data/garbage.txt: -------------------------------------------------------------------------------- 1 | garbage 2 | -------------------------------------------------------------------------------- /tests/integration/data/huge/huge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/huge/huge.png -------------------------------------------------------------------------------- /tests/integration/data/huge/huge2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/huge/huge2.png -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.heif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.heif -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.jpg -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.png -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.tiff -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_image/test.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_image/test.webp -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_not_image/test.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_not_image/test.mp3 -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_not_image/test.ogg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_not_image/test.ogg -------------------------------------------------------------------------------- /tests/integration/data/is_image_file/is_not_image/test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/is_image_file/is_not_image/test.txt -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_10/20221026_124702_q10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_10/20221026_124702_q10.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_10/20221026_124702_q95.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_10/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_25/20221026_124702_q25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_25/20221026_124702_q25.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_25/20221026_124702_q95.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_25/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_50/20221026_124702_q50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_50/20221026_124702_q50.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_50/20221026_124702_q95.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_50/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_75/20221026_124702_q75.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_75/20221026_124702_q75.jpg -------------------------------------------------------------------------------- /tests/integration/data/jpeg_artifacts/jpeg_75/20221026_124702_q95.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/jpeg_artifacts/jpeg_75/20221026_124702_q95.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/many/20220218_135622.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/many/20220218_135622.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/many/20220218_135658.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/many/20220218_135658.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/many/20220218_135708.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/many/20220218_135708.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/pair1/20220806_214449.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/pair1/20220806_214449.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/pair1/20220806_214600.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/pair1/20220806_214600.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/pair2/20220329_210118.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/pair2/20220329_210118.jpg -------------------------------------------------------------------------------- /tests/integration/data/similar/pair2/20220329_210123.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lene/DuplicateImages/fa179fa63fbd27be2812bf30e8157956ef3abd6e/tests/integration/data/similar/pair2/20220329_210123.jpg -------------------------------------------------------------------------------- /tests/integration/test_is_image_file.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | import pytest 8 | 9 | from duplicate_images.duplicate import is_image_file 10 | 11 | 12 | def base_dir() -> Path: 13 | return Path(__file__).resolve().parent / 'data' / 'is_image_file' 14 | 15 | 16 | def image_files() -> List[Path]: 17 | return list((base_dir() / 'is_image').glob('test.*')) 18 | 19 | 20 | def not_image_files() -> List[Path]: 21 | 
return list((base_dir() / 'is_not_image').glob('test.*')) 22 | 23 | 24 | @pytest.mark.parametrize('image_file', image_files()) 25 | def test_image_files_are_recognized(image_file: Path) -> None: 26 | assert is_image_file(image_file) 27 | 28 | 29 | @pytest.mark.parametrize('not_image_file', not_image_files()) 30 | def test_non_image_files_are_recognized(not_image_file: Path) -> None: 31 | assert not is_image_file(not_image_file) 32 | -------------------------------------------------------------------------------- /tests/integration/test_persistent_storage.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | import json 5 | import pickle 6 | from pathlib import Path 7 | from typing import Any, Tuple, Optional 8 | from unittest.mock import Mock, patch 9 | 10 | import pytest 11 | 12 | from duplicate_images.duplicate import get_matches 13 | from duplicate_images.pair_finder_options import PairFinderOptions 14 | 15 | 16 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 17 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 18 | @patch('imagehash.average_hash', return_value=0) 19 | def test_open_hash_store_with_filename( 20 | average_hash: Mock, data_dir: Path, test_set: str, file_type: str 21 | ) -> None: 22 | folder = data_dir / test_set 23 | cache_file = folder.with_suffix(f'.{file_type}') 24 | creation_time = cache_file.stat().st_ctime 25 | get_matches([folder], 'phash', hash_store_path=cache_file) 26 | assert average_hash.call_count == 0 27 | assert cache_file.stat().st_atime > creation_time 28 | 29 | 30 | @pytest.mark.parametrize('test_set', ['different', 'equal_but_binary_different']) 31 | def test_open_bad_file_format(data_dir: Path, test_set: str) -> None: 32 | folder = data_dir / test_set 33 | cache_file = data_dir / 'garbage.txt' 34 | creation_time = cache_file.stat().st_ctime 35 | with pytest.raises(ValueError): 36 | get_matches([folder], 'phash', hash_store_path=cache_file) 37 | assert cache_file.stat().st_ctime == creation_time 38 | 39 | 40 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 41 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 42 | def test_open_correct_file_format_but_not_a_tuple( 43 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 44 | ) -> None: 45 | check_garbage( 46 | tmp_dir, data_dir / test_set, file_type, garbage_data='garbage', message=None 47 | ) 48 | 49 | 50 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 51 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 52 | def test_open_correct_file_format_but_values_not_a_dict( 53 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 54 | ) -> None: 55 | check_garbage( 56 | tmp_dir, data_dir / test_set, file_type, garbage_data=('garbage', {}), message='Not a dict' 57 | ) 58 | 59 | 60 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 61 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 62 | def test_open_correct_file_format_but_metadata_not_a_dict( 63 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 64 | ) -> None: 65 | check_garbage( 66 | tmp_dir, data_dir / test_set, file_type, garbage_data=({}, 'garbage'), 67 | message='Metadata not a dict' 68 | ) 69 | 70 | 71 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 72 | @pytest.mark.parametrize('file_type', ['pickle']) 73 | def test_open_correct_file_format_but_keys_not_paths( 74 | 
tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 75 | ) -> None: 76 | folder = data_dir / test_set 77 | check_garbage( 78 | tmp_dir, folder, file_type, 79 | garbage_data=( 80 | {str(path): 0 for path in folder.glob('**')}, {'algorithm': 'phash', 'hash_size': 8} 81 | ), 82 | message='Not a Path' 83 | ) 84 | 85 | 86 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 87 | @pytest.mark.parametrize('file_type', ['pickle']) 88 | def test_open_correct_file_format_but_values_not_image_hashes( 89 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 90 | ) -> None: 91 | folder = data_dir / test_set 92 | check_garbage( 93 | tmp_dir, folder, file_type, 94 | garbage_data=( 95 | {path: 0 for path in folder.glob('**')}, {'algorithm': 'phash', 'hash_size': 8} 96 | ), message='Not an image hash' 97 | ) 98 | 99 | 100 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 101 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 102 | def test_open_correct_file_format_but_metadata_missing( 103 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 104 | ) -> None: 105 | folder = data_dir / test_set 106 | check_garbage( 107 | tmp_dir, folder, file_type, 108 | garbage_data=({path: 0 for path in folder.glob('**')}, ), message=None 109 | ) 110 | 111 | 112 | @pytest.mark.parametrize('test_set', ['equal_but_binary_different']) 113 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 114 | def test_open_correct_file_format_but_metadata_empty( 115 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str 116 | ) -> None: 117 | folder = data_dir / test_set 118 | check_garbage( 119 | tmp_dir, folder, file_type, 120 | garbage_data=({path: 0 for path in folder.glob('**')}, {}), message='Metadata empty' 121 | ) 122 | 123 | 124 | @pytest.mark.parametrize('test_set', ['different', 'equal_but_binary_different']) 125 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 126 | @pytest.mark.parametrize('algorithms', [('phash', 'ahash')]) 127 | def test_opening_with_different_algorithm_leads_to_error( 128 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str, algorithms: Tuple[str, str] 129 | ) -> None: 130 | cache_file = tmp_dir / f'hash_store.{file_type}' 131 | get_matches([data_dir / test_set], algorithms[0], hash_store_path=cache_file) 132 | with pytest.raises(ValueError, match='Algorithm mismatch'): 133 | get_matches([data_dir / test_set], algorithms[1], hash_store_path=cache_file) 134 | 135 | 136 | @pytest.mark.parametrize('test_set', ['different', 'equal_but_binary_different']) 137 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 138 | @pytest.mark.parametrize('hash_size', [(8, 9)]) 139 | def test_opening_with_different_algorithm_parameters_leads_to_error( 140 | tmp_dir: Path, data_dir: Path, test_set: str, file_type: str, hash_size: Tuple[int, int] 141 | ) -> None: 142 | cache_file = tmp_dir / f'hash_store.{file_type}' 143 | assert not cache_file.is_file() 144 | get_matches( 145 | [data_dir / test_set], 'phash', options=PairFinderOptions(hash_size=hash_size[0]), 146 | hash_store_path=cache_file 147 | ) 148 | assert cache_file.is_file() 149 | with pytest.raises(ValueError, match='Metadata mismatch'): 150 | get_matches( 151 | [data_dir / test_set], 'phash', options=PairFinderOptions(hash_size=hash_size[1]), 152 | hash_store_path=cache_file 153 | ) 154 | 155 | 156 | def check_garbage( 157 | temp_dir: Path, folder: Path, file_type: str, garbage_data: Any, message: Optional[str] 158 | ) -> None: 159 | cache_file = 
temp_dir / f'garbage.{file_type}' 160 | if file_type == 'pickle': 161 | dump_pickle(cache_file, garbage_data) 162 | else: 163 | dump_json(cache_file, garbage_data) 164 | creation_time = cache_file.stat().st_ctime 165 | with pytest.raises(ValueError, match=message): 166 | get_matches([folder], 'phash', hash_store_path=cache_file) 167 | assert cache_file.stat().st_ctime == creation_time 168 | 169 | 170 | def dump_pickle(cache_file: Path, garbage_data: Any): 171 | with cache_file.open('wb') as file: 172 | pickle.dump(garbage_data, file) 173 | 174 | 175 | def dump_json(cache_file: Path, garbage_data: Any): 176 | with cache_file.open('w') as file: 177 | json.dump(encode_dict_keys_to_str(garbage_data), file) 178 | 179 | 180 | def encode_dict_keys_to_str(obj: Any) -> Any: 181 | if isinstance(obj, dict): 182 | return {str(key): value for key, value in obj.items()} 183 | if isinstance(obj, tuple): 184 | return tuple(encode_dict_keys_to_str(item) for item in obj) 185 | return obj 186 | -------------------------------------------------------------------------------- /tests/integration/test_real_images.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | import pytest 8 | 9 | from PIL import Image 10 | from PIL.Image import DecompressionBombError 11 | 12 | from duplicate_images.image_pair_finder import PairFinderOptions 13 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM 14 | from duplicate_images.duplicate import ( 15 | files_in_dirs, is_image_file, get_matches, set_max_image_pixels 16 | ) 17 | from duplicate_images.parse_commandline import parse_command_line 18 | 19 | HUGE_IMAGE_SIZE = 20000 * 20000 20 | 21 | 22 | @pytest.mark.parametrize('parallel', [True, False]) 23 | @pytest.mark.parametrize('slow', [True, False]) 24 | @pytest.mark.parametrize( 25 | 'algorithm,expected_pairs', 26 | [('ahash', 0), ('dhash', 0), ('phash', 0), ('whash', 0)] 27 | ) 28 | @pytest.mark.parametrize('image_pair', ['pair1', 'pair2']) 29 | def test_similar( # pylint:disable = too-many-arguments,too-many-positional-arguments 30 | data_dir: Path, image_pair: str, algorithm: str, expected_pairs: int, 31 | slow: bool, parallel: bool 32 | ) -> None: 33 | folder = data_dir / 'similar' / image_pair 34 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 35 | assert len(matches) == expected_pairs 36 | 37 | 38 | @pytest.mark.parametrize( 39 | 'algorithm,min_distance', 40 | [('ahash', 2), ('dhash', 10), ('phash', 14), ('whash', 2), ('colorhash', 0)] 41 | ) 42 | def test_hash_distance( 43 | data_dir: Path, algorithm: str, min_distance: int 44 | ) -> None: 45 | folder = data_dir / 'similar' / 'pair1' 46 | hash_algorithm = IMAGE_HASH_ALGORITHM[algorithm] 47 | image_files = sorted(files_in_dirs([folder], is_image_file)) 48 | assert len(image_files) == 2 49 | hashes = [hash_algorithm(Image.open(file)) for file in image_files] 50 | assert hashes[0] - hashes[1] == min_distance, str(hashes[0] - hashes[1]) # type: ignore 51 | 52 | 53 | @pytest.mark.parametrize('parallel', [True, False]) 54 | @pytest.mark.parametrize('slow', [True, False]) 55 | @pytest.mark.parametrize( 56 | 'algorithm,max_distance', 57 | [('ahash', 14), ('dhash', 12), ('phash', 14), ('whash', 16), ('colorhash', 0)] 58 | ) 59 | def test_similar_distance_matches( 60 | data_dir: Path, algorithm: str, max_distance: int, slow: bool, parallel: bool 61 | ) -> 
None: 62 | folder = data_dir / 'similar' / 'pair1' 63 | matches = get_matches( 64 | [folder], algorithm, PairFinderOptions( 65 | slow=slow, parallel=parallel, max_distance=max_distance 66 | ) 67 | ) 68 | assert len(matches) == 1 69 | 70 | 71 | @pytest.mark.parametrize('parallel', [True, False]) 72 | @pytest.mark.parametrize('slow', [True, False]) 73 | @pytest.mark.parametrize( 74 | 'algorithm,hash_size', 75 | [('ahash', 4), ('whash', 2), ('colorhash', 4)] 76 | ) 77 | def test_similar_matches_with_smaller_hash_size( 78 | data_dir: Path, algorithm: str, hash_size: int, slow: bool, parallel: bool 79 | ) -> None: 80 | folder = data_dir / 'similar' / 'pair1' 81 | matches = get_matches( 82 | [folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel, hash_size=hash_size) 83 | ) 84 | assert len(matches) == 1 85 | 86 | 87 | @pytest.mark.parametrize('parallel', [True, False]) 88 | @pytest.mark.parametrize('slow', [True, False]) 89 | @pytest.mark.parametrize( 90 | 'algorithm,expected_pairs', 91 | [('ahash', 0), ('dhash', 0), ('colorhash', 0), ('phash', 0), ('whash', 0)] 92 | ) 93 | @pytest.mark.parametrize('image_pair', ['many']) 94 | def test_similar_many( # pylint:disable = too-many-arguments,too-many-positional-arguments 95 | data_dir: Path, image_pair: str, algorithm: str, expected_pairs: int, 96 | slow: bool, parallel: bool 97 | ) -> None: 98 | folder = data_dir / 'similar' / image_pair 99 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 100 | assert len(matches) == expected_pairs 101 | 102 | 103 | @pytest.mark.parametrize('parallel', [True, False]) 104 | @pytest.mark.parametrize('slow', [True, False]) 105 | @pytest.mark.parametrize( 106 | 'algorithm,expected_pairs', 107 | [('ahash', 1), ('dhash', 1), ('colorhash', 1), ('phash', 1), ('whash', 1)] 108 | ) 109 | @pytest.mark.parametrize( 110 | 'image_pair', [ 111 | 'jpeg_quality', 'jpeg_vs_heic', 'heic_bit_depth', 'heic_lossless_vs_lossy', 'shrunk10%' 112 | ] 113 | ) 114 | def test_equal_but_binary_different( # pylint:disable=R0913,R0917 115 | data_dir: Path, image_pair: str, algorithm: str, expected_pairs: int, 116 | slow: bool, parallel: bool 117 | ) -> None: 118 | folder = data_dir / 'equal_but_binary_different' / image_pair 119 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 120 | assert len(matches) == expected_pairs 121 | 122 | 123 | @pytest.mark.parametrize('parallel', [True, False]) 124 | @pytest.mark.parametrize('slow', [True, False]) 125 | @pytest.mark.parametrize( 126 | 'algorithm,expected_pairs', 127 | [('ahash', 0), ('dhash', 0), ('colorhash', 0), ('phash', 0), ('whash', 0)] 128 | ) 129 | @pytest.mark.parametrize('image_pair', ['jpeg_75', 'jpeg_50', 'jpeg_25', 'jpeg_10']) 130 | def test_jpeg_artifacts( # pylint:disable = too-many-arguments,too-many-positional-arguments 131 | data_dir: Path, image_pair: str, algorithm: str, expected_pairs: int, 132 | slow: bool, parallel: bool 133 | ) -> None: 134 | folder = data_dir / 'equal_but_binary_different' / image_pair 135 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 136 | assert len(matches) == expected_pairs 137 | 138 | 139 | @pytest.mark.parametrize('parallel', [True, False]) 140 | @pytest.mark.parametrize('slow', [True, False]) 141 | @pytest.mark.parametrize('algorithm', ['ahash', 'dhash', 'colorhash', 'phash', 'whash']) 142 | @pytest.mark.parametrize('image_pair', ['pair1', 'pair2', 'pair3', 'webp', 'heif']) 143 | def test_exactly_equal( 144 
| data_dir: Path, image_pair: str, algorithm: str, slow: bool, parallel: bool 145 | ) -> None: 146 | folder = data_dir / 'exactly_equal' / image_pair 147 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 148 | assert len(matches) == 1 149 | 150 | 151 | @pytest.mark.parametrize('parallel', [True, False]) 152 | @pytest.mark.parametrize('slow', [True, False]) 153 | @pytest.mark.parametrize('algorithm', ['ahash', 'dhash', 'colorhash', 'phash', 'whash']) 154 | @pytest.mark.parametrize('image_pair', ['pair1']) 155 | def test_different( 156 | data_dir: Path, image_pair: str, algorithm: str, slow: bool, parallel: bool 157 | ) -> None: 158 | folder = data_dir / 'different' / image_pair 159 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 160 | assert not matches 161 | 162 | 163 | @pytest.mark.parametrize('parallel', [True, False]) 164 | @pytest.mark.parametrize('slow', [True, False]) 165 | @pytest.mark.parametrize( 166 | 'test_case,image_pair,algorithm,expected_pairs', 167 | [ 168 | ('similar', 'pair2', 'ahash', 0), 169 | ('similar', 'pair2', 'dhash', 0), 170 | ('similar', 'pair2', 'colorhash', 1), 171 | ('similar', 'pair2', 'phash', 0), 172 | ('similar', 'pair2', 'whash', 0), 173 | ] 174 | ) 175 | def test_inconsistent_results_for_different_algorithms( # pylint:disable=R0913,R0917 176 | data_dir: Path, test_case: str, image_pair: str, algorithm: str, expected_pairs: int, 177 | slow: bool, parallel: bool 178 | ) -> None: 179 | folder = data_dir / test_case / image_pair 180 | matches = get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 181 | assert len(matches) == expected_pairs 182 | 183 | 184 | @pytest.mark.parametrize('parallel', [True, False]) 185 | @pytest.mark.parametrize('slow', [True, False]) 186 | @pytest.mark.parametrize('algorithm', ['ahash', 'dhash', 'colorhash', 'phash', 'whash']) 187 | def test_broken_image_files_do_not_raise_os_error( 188 | data_dir: Path, algorithm: str, slow: bool, parallel: bool 189 | ) -> None: 190 | folder = data_dir / 'broken' 191 | get_matches([folder], algorithm, PairFinderOptions(slow=slow, parallel=parallel)) 192 | 193 | 194 | @pytest.mark.parametrize('parallel', [True, False]) 195 | @pytest.mark.parametrize('algorithm', ['ahash', 'dhash', 'colorhash', 'phash', 'whash']) 196 | @pytest.mark.parametrize( 197 | 'folders', [ 198 | ['heic_bit_depth'], # images in this folder appear different to those in the following 199 | ['heic_lossless_vs_lossy', 'jpeg_quality', 'jpeg_vs_heic', 'shrunk10%'] 200 | ] 201 | ) 202 | def test_multiple_images_appear_as_group( 203 | data_dir: Path, folders: List[Path], algorithm: str, parallel: bool 204 | ) -> None: 205 | folders = [data_dir / 'equal_but_binary_different' / folder for folder in folders] 206 | matches = get_matches(folders, algorithm, PairFinderOptions(group=True, parallel=parallel)) 207 | assert len(matches) == 1 208 | assert len(matches[0]) == len(files_in_dirs(folders)) 209 | 210 | 211 | @pytest.mark.parametrize('algorithm', ['ahash']) # only one of each is needed, it works the same 212 | @pytest.mark.parametrize('folders', [['heic_bit_depth']]) # in all cases 213 | def test_slow_image_finder_fails_with_group_option( 214 | data_dir: Path, folders: List[Path], algorithm: str 215 | ) -> None: 216 | folders = [data_dir / 'equal_but_binary_different' / folder for folder in folders] 217 | with pytest.raises(ValueError): 218 | get_matches(folders, algorithm, PairFinderOptions(slow=True, group=True)) 219 
| 220 |
221 | @pytest.mark.parametrize('algorithm', ['ahash'])
222 | @pytest.mark.parametrize('folder', ['huge'])
223 | def test_huge_image_fails_loading_per_default(
224 | data_dir: Path, algorithm: str, folder: str
225 | ) -> None:
226 | hash_algorithm = IMAGE_HASH_ALGORITHM[algorithm]
227 | image_files = sorted(files_in_dirs([(data_dir / folder)], is_image_file))
228 | with pytest.raises(DecompressionBombError):
229 | for file in image_files:
230 | hash_algorithm(Image.open(file))
231 |
232 |
233 | @pytest.mark.parametrize('algorithm', ['ahash'])
234 | @pytest.mark.parametrize('folder', ['huge'])
235 | def test_huge_image_succeeds_with_max_image_size_set(
236 | data_dir: Path, algorithm: str, folder: str
237 | ) -> None:
238 | sub_folder = data_dir / folder
239 | args = parse_command_line([str(sub_folder), '--max-image-pixels', str(HUGE_IMAGE_SIZE)])
240 | set_max_image_pixels(args)
241 | matches = get_matches([sub_folder], algorithm, PairFinderOptions.from_args(args))
242 | assert len(matches) == 1
243 |
-------------------------------------------------------------------------------- /tests/unit/__init__.py: --------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | __author__ = 'Lene Preuss '
3 |
-------------------------------------------------------------------------------- /tests/unit/conftest.py: --------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | __author__ = 'Lene Preuss '
3 |
4 | import random
5 | import shutil
6 | from pathlib import Path
7 | from tempfile import NamedTemporaryFile, TemporaryDirectory, mkdtemp
8 | from typing import Generator, List, Tuple
9 | from unittest.mock import Mock
10 |
11 | import pillow_heif
12 | import pytest
13 | from imagehash import ImageHash
14 | from numpy import array
15 | from wand.color import Color
16 | from wand.drawing import Drawing
17 | from wand.image import Image
18 |
19 | IMAGE_WIDTH = 40
20 | MOCK_IMAGE_HASH_VALUE = ImageHash(array([[True, True], [True, True]])) # just some arbitrary value
21 | mock_algorithm = Mock(return_value=MOCK_IMAGE_HASH_VALUE)
22 |
23 |
24 | def create_image(file: Path, width: int) -> Path:
25 | if file.suffix == '.heif':
26 | return create_heif_image(file, width)
27 | height = int(width * 3 / 4)
28 | color = Color('Black')
29 | image = Image(width=width, height=height, background=color)
30 | image.save(filename=file)
31 | return file
32 |
33 |
34 | def create_heif_image(file_path: Path, width: int) -> Path:
35 | height = int(width * 3 / 4)
36 | heif_file = pillow_heif.from_bytes(
37 | mode='BGR;16',
38 | size=(height, width),
39 | data=bytes([0] * 3 * 2 * width * height)
40 | )
41 | with open(file_path, 'wb') as file:
42 | heif_file.save(fp=file, quality=-1)
43 | return file_path
44 |
45 |
46 | def fill_image_with_random_pixels(file: Path, seed: int = 0) -> None:
47 | random.seed(seed)
48 | image = Image(filename=file)
49 | with Drawing() as draw:
50 | for x in range(0, image.size[0]):
51 | for y in range(0, image.size[1]):
52 | color = Color(f'rgb({random_short()},{random_short()},{random_short()})')
53 | draw.fill_color = color
54 | draw.point(x, y)
55 | draw(image)
56 | image.save(filename=file)
57 |
58 |
59 | def named_file(name: str, images: List[Path]) -> Path:
60 | return next(filter(lambda f: name + '_' in f.name, images))
61 |
62 |
63 | def random_short() -> int:
64 | return random.randrange(65535) # noqa: S311
65 |
66 |
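# A short sketch of how the helpers above combine (hypothetical usage; the
# path is made up): create_image() produces a uniform black image, and
# fill_image_with_random_pixels() then gives it unique but reproducible
# content, since the pixel values derive from a fixed random seed.
#
#     example = create_image(Path('/tmp/example.jpg'), IMAGE_WIDTH)
#     fill_image_with_random_pixels(example, seed=1)  # same seed, same pixels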
67 | def save(image: Image, path: Path) -> None:
68 | """
69 | Save image without letting the wand module create a backup file (which would
70 | confuse tearDownClass())
71 | """
72 | with path.open('wb') as file:
73 | image.save(file=file)
74 |
75 |
76 | def delete_image_file(file: Path, images: List[Path]) -> None:
77 | file.unlink()
78 | images.remove(file)
79 |
80 |
81 | def copy_image_file(file: Path, images: List[Path]) -> Path:
82 | copied_file = file.with_suffix('.bak')
83 | shutil.copyfile(file, copied_file)
84 | images.append(copied_file)
85 | return copied_file
86 |
87 |
88 | def is_pair_found(element1: Path, element2: Path, matches: List[Tuple[Path, Path]]) -> bool:
89 | return (element1, element2) in matches or (element2, element1) in matches
90 |
91 |
92 | @pytest.fixture(name='top_directory', scope='session')
93 | def fixture_top_directory() -> Generator[TemporaryDirectory, None, None]:
94 | top_dir = TemporaryDirectory()
95 | yield top_dir
96 | try:
97 | top_dir.cleanup()
98 | except FileNotFoundError:
99 | pass
100 |
101 |
102 | @pytest.fixture(name='sub_directory', scope='session')
103 | def fixture_sub_directory(top_directory: TemporaryDirectory) -> TemporaryDirectory:
104 | return TemporaryDirectory(dir=top_directory.name)
105 |
106 |
107 | def create_jpg_and_png(top_directory: TemporaryDirectory) -> List[Path]:
108 | images = []
109 | jpeg_file = create_image(
110 | Path(NamedTemporaryFile(dir=top_directory.name, prefix='jpeg_', suffix='.jpg').name),
111 | IMAGE_WIDTH
112 | )
113 | images.append(jpeg_file)
114 | png_file = create_image(
115 | Path(NamedTemporaryFile(dir=top_directory.name, prefix='png_', suffix='.png').name),
116 | IMAGE_WIDTH
117 | )
118 | images.append(png_file)
119 | return images
120 |
121 |
122 | def create_half_jpg(top_directory: TemporaryDirectory) -> Path:
123 | half_file = create_image(
124 | Path(
125 | NamedTemporaryFile(dir=top_directory.name, prefix='test_half_', suffix='.jpg').name
126 | ),
127 | IMAGE_WIDTH
128 | )
129 | image = Image(filename=half_file)
130 | image.transform(f'{int(IMAGE_WIDTH / 2)}x{int(IMAGE_WIDTH * 3 / 8)}')
131 | save(image, half_file)
132 | return half_file
133 |
134 |
135 | @pytest.fixture(scope='session')
136 | def image_files(
137 | top_directory: TemporaryDirectory, sub_directory: TemporaryDirectory
138 | ) -> Generator[List[Path], None, None]:
139 | images = create_jpg_and_png(top_directory)
140 | heif_file = create_heif_image(
141 | Path(NamedTemporaryFile(dir=top_directory.name, prefix='heif_', suffix='.heif').name),
142 | IMAGE_WIDTH
143 | )
144 | images.append(heif_file)
145 | subdir_file = create_image(
146 | Path(NamedTemporaryFile(dir=sub_directory.name, prefix='subdir_', suffix='.jpg').name),
147 | IMAGE_WIDTH
148 | )
149 | fill_image_with_random_pixels(subdir_file)
150 | images.append(subdir_file)
151 | half_file = create_half_jpg(top_directory)
152 | images.append(half_file)
153 | yield images
154 | for file in images:
155 | file.unlink(missing_ok=False)
156 |
157 |
158 | @pytest.fixture
159 | def reset_call_count():
160 | mock_algorithm.call_count = 0
161 |
162 |
163 | @pytest.fixture
164 | def hash_store_path(file_type: str) -> Path:
165 | top_directory = Path(mkdtemp())
166 | return Path(NamedTemporaryFile(dir=top_directory, suffix=f'.{file_type}').name)
167 |
-------------------------------------------------------------------------------- /tests/unit/test_actions.py: --------------------------------------------------------------------------------
1 | # pylint: disable=missing-docstring
2 | __author__ = 'Lene Preuss '
3 |
4 | import shlex
5
| from argparse import Namespace 6 | from datetime import datetime, timedelta 7 | from math import factorial 8 | from pathlib import Path 9 | from tempfile import TemporaryDirectory, NamedTemporaryFile 10 | from typing import List, Generator, Tuple, Callable 11 | from unittest.mock import Mock, patch 12 | 13 | import pytest 14 | 15 | from duplicate_images import duplicate 16 | from duplicate_images.function_types import Results 17 | from duplicate_images.image_pair_finder import ImagePairFinder 18 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM, quote 19 | from duplicate_images.pair_finder_options import PairFinderOptions 20 | from duplicate_images.parse_commandline import parse_command_line 21 | from .conftest import create_jpg_and_png, create_half_jpg, create_image, IMAGE_WIDTH 22 | 23 | HASH_ALGORITHM = IMAGE_HASH_ALGORITHM['phash'] 24 | 25 | 26 | @pytest.fixture(name='equal_images') 27 | def fixture_equal_images( 28 | top_directory: TemporaryDirectory, group: bool 29 | ) -> Generator[List[Path], None, None]: 30 | images = create_jpg_and_png(top_directory) 31 | if group: 32 | half_file = create_half_jpg(top_directory) 33 | images.append(half_file) 34 | yield images 35 | for file in images: 36 | file.unlink(missing_ok=True) 37 | 38 | 39 | @pytest.fixture(name='many_equal_images') 40 | def fixture_many_equal_images( 41 | top_directory: TemporaryDirectory, num_images: int 42 | ) -> Generator[List[Path], None, None]: 43 | images = [] 44 | for _ in range(num_images): 45 | file_name = Path( 46 | NamedTemporaryFile(dir=top_directory.name, prefix='jpeg_', suffix='.jpg').name 47 | ) 48 | create_image(file_name, IMAGE_WIDTH) 49 | images.append(file_name) 50 | yield images 51 | for file in images: 52 | file.unlink(missing_ok=True) 53 | 54 | 55 | def get_equals(equal_images: List[Path], group: bool) -> List[Tuple[Path, ...]]: 56 | equals = ImagePairFinder.create( 57 | equal_images, HASH_ALGORITHM, options=PairFinderOptions(group=group) 58 | ).get_equal_groups() 59 | assert len(equals) == 1 60 | return equals 61 | 62 | 63 | def paths_ascending_by_size(equals: Results): 64 | return sorted(sum(equals, ()), key=lambda path: (path.stat().st_size, str(path))) 65 | 66 | 67 | def get_biggest(equals: Results) -> Path: 68 | return paths_ascending_by_size(equals)[-1] 69 | 70 | 71 | def get_smallest(equals: Results) -> Path: 72 | return paths_ascending_by_size(equals)[0] 73 | 74 | 75 | def check_relevant_is_deleted_and_others_are_present( 76 | equals: Results, option: str, relevant: Path 77 | ) -> None: 78 | others = set(path[0] for path in equals) - {relevant} 79 | duplicate.execute_actions(equals, parse_command_line(['/', '--on-equal', option])) 80 | assert not relevant.is_file() 81 | for other in others: 82 | assert other.is_file() 83 | 84 | 85 | def check_relevant_is_moved(equals: Results, option: str, relevant: Path) -> None: 86 | with TemporaryDirectory() as destination: 87 | args = parse_command_line(['/', '--on-equal', option, '--move-to', destination]) 88 | duplicate.execute_actions(equals, args) 89 | assert not relevant.is_file() 90 | assert Path(destination, relevant.name).is_file() 91 | 92 | 93 | @pytest.mark.parametrize('option', ['delete-first', 'd1']) 94 | @pytest.mark.parametrize('group', [True, False]) 95 | def test_delete_first(equal_images: List[Path], option: str, group: bool) -> None: 96 | equals = get_equals(equal_images, group) 97 | relevant = equals[0][0] 98 | check_relevant_is_deleted_and_others_are_present(equals, option, relevant) 99 | 100 | 101 | 
@pytest.mark.parametrize('option', ['delete-last', 'dl']) 102 | @pytest.mark.parametrize('group', [True, False]) 103 | def test_delete_last(equal_images: List[Path], option: str, group: bool) -> None: 104 | equals = get_equals(equal_images, group) 105 | relevant = equals[0][-1] 106 | check_relevant_is_deleted_and_others_are_present(equals, option, relevant) 107 | 108 | 109 | @pytest.mark.parametrize('option', ['delete-biggest', 'd>']) 110 | @pytest.mark.parametrize('group', [True, False]) 111 | def test_delete_biggest(equal_images: List[Path], option: str, group: bool) -> None: 112 | equals = get_equals(equal_images, group) 113 | relevant = get_biggest(equals) 114 | check_relevant_is_deleted_and_others_are_present(equals, option, relevant) 115 | 116 | 117 | @pytest.mark.parametrize('option', ['delete-smallest', 'd<']) 118 | @pytest.mark.parametrize('group', [True, False]) 119 | def test_delete_smallest(equal_images: List[Path], option: str, group: bool) -> None: 120 | equals = get_equals(equal_images, group) 121 | relevant = get_smallest(equals) 122 | check_relevant_is_deleted_and_others_are_present(equals, option, relevant) 123 | 124 | 125 | @pytest.mark.parametrize('option', ['move-first', 'm1']) 126 | @pytest.mark.parametrize('group', [True, False]) 127 | def test_move_first(equal_images: List[Path], option: str, group: bool) -> None: 128 | equals = get_equals(equal_images, group) 129 | relevant = equals[0][0] 130 | check_relevant_is_moved(equals, option, relevant) 131 | 132 | 133 | @pytest.mark.parametrize('option', ['move-last', 'ml']) 134 | @pytest.mark.parametrize('group', [True, False]) 135 | def test_move_last(equal_images: List[Path], option: str, group: bool) -> None: 136 | equals = get_equals(equal_images, group) 137 | relevant = equals[0][-1] 138 | check_relevant_is_moved(equals, option, relevant) 139 | 140 | 141 | @pytest.mark.parametrize('option', ['move-biggest', 'm>']) 142 | @pytest.mark.parametrize('group', [True, False]) 143 | def test_move_biggest(equal_images: List[Path], option: str, group: bool) -> None: 144 | equals = get_equals(equal_images, group) 145 | relevant = get_biggest(equals) 146 | check_relevant_is_moved(equals, option, relevant) 147 | 148 | 149 | @pytest.mark.parametrize('option', ['move-smallest', 'm<']) 150 | @pytest.mark.parametrize('group', [True, False]) 151 | def test_move_smallest(equal_images: List[Path], option: str, group: bool) -> None: 152 | equals = get_equals(equal_images, group) 153 | relevant = get_smallest(equals) 154 | check_relevant_is_moved(equals, option, relevant) 155 | 156 | 157 | @pytest.mark.parametrize('option', ['move-first']) 158 | @pytest.mark.parametrize('group', [True, False]) 159 | def test_move_with_recreate_path_recreates_path_under_target_folder( 160 | equal_images: List[Path], option: str, group: bool 161 | ) -> None: 162 | equals = get_equals(equal_images, group) 163 | relevant = equals[0][0] 164 | with TemporaryDirectory() as destination: 165 | args = parse_command_line( 166 | ['/', '--on-equal', option, '--move-to', destination, '--move-recreate-path'] 167 | ) 168 | duplicate.execute_actions(equals, args) 169 | assert not relevant.is_file() 170 | assert (Path(destination) / relevant.relative_to(relevant.anchor)).is_file() 171 | 172 | 173 | def check_command_is_called( 174 | mock_call: Mock, args: Namespace, equal_images: List[Path], group: bool 175 | ) -> None: 176 | equals = get_equals(equal_images, group) 177 | duplicate.execute_actions(equals, args) 178 | mock_call.assert_called_once() 179 | assert args.on_equal 
in mock_call.call_args_list[0].args[0] 180 | 181 | 182 | def check_all_equal_images_are_present(mock_call: Mock, equal_images: List[Path]): 183 | paths_as_set = set(str(path) for path in equal_images) 184 | assert set(mock_call.call_args.args[0]) & paths_as_set == paths_as_set 185 | 186 | 187 | @patch('duplicate_images.methods.call') 188 | @pytest.mark.parametrize('option', ['xv', 'eog']) 189 | @pytest.mark.parametrize('group', [True, False]) 190 | def test_xv(mock_call: Mock, equal_images: List[Path], option: str, group: bool) -> None: 191 | check_command_is_called( 192 | mock_call, parse_command_line(['/', '--on-equal', option]), equal_images, group 193 | ) 194 | assert option in set(mock_call.call_args.args[0]) 195 | check_all_equal_images_are_present(mock_call, equal_images) 196 | 197 | 198 | @patch('builtins.print') 199 | @pytest.mark.parametrize('option', ['print', 'print_inline']) 200 | @pytest.mark.parametrize('group', [True, False]) 201 | def test_print(mock_print: Mock, equal_images: List[Path], option: str, group: bool) -> None: 202 | equals = get_equals(equal_images, group) 203 | duplicate.execute_actions(equals, parse_command_line(['/', '--on-equal', option])) 204 | assert mock_print.call_count == len(equals) 205 | for path in equals[0]: 206 | assert path in mock_print.call_args_list[0].args 207 | 208 | 209 | def test_quote_string(): 210 | quoted = shlex.quote('string with "quotes"') 211 | assert quoted == "'string with \"quotes\"'" 212 | 213 | quoted = shlex.quote('/path/with/one space.jpg') 214 | assert quoted == "'/path/with/one space.jpg'" 215 | 216 | 217 | @patch('builtins.print') 218 | @pytest.mark.parametrize('option', ['quote', 'quote_inline']) 219 | @pytest.mark.parametrize('group', [True, False]) 220 | def test_quote(mock_print: Mock, equal_images: List[Path], option: str, group: bool) -> None: 221 | equals = get_equals(equal_images, group) 222 | duplicate.execute_actions(equals, parse_command_line(['/', '--on-equal', option])) 223 | assert mock_print.call_count == len(equals) 224 | for path in equals[0]: 225 | assert str(path) in mock_print.call_args_list[0].args[0] 226 | assert quote(str(path)) in mock_print.call_args_list[0].args[0] 227 | 228 | 229 | @patch('duplicate_images.methods.shell_exec') 230 | @pytest.mark.parametrize('option', ['exec']) 231 | @pytest.mark.parametrize('exec_cmd', ['ls {1} {2}', 'ls {*}']) 232 | @pytest.mark.parametrize('group', [True, False]) 233 | def test_shell_exec( 234 | mock_call: Mock, equal_images: List[Path], option: str, exec_cmd: str, group: bool 235 | ) -> None: 236 | check_command_is_called( 237 | mock_call, parse_command_line(['/', '--on-equal', option, '--exec', exec_cmd]), 238 | equal_images, group 239 | ) 240 | 241 | 242 | @patch('duplicate_images.methods.call') 243 | @pytest.mark.parametrize('option', ['exec']) 244 | @pytest.mark.parametrize('exec_cmd', ['ls {*}']) 245 | @pytest.mark.parametrize('group', [True, False]) 246 | def test_wildcard_exec_parameter( 247 | mock_call: Mock, equal_images: List[Path], option: str, exec_cmd: str, group: bool 248 | ) -> None: 249 | equals = get_equals(equal_images, group) 250 | args = parse_command_line(['/', '--on-equal', option, '--exec', exec_cmd]) 251 | duplicate.execute_actions(equals, args) 252 | mock_call.assert_called_once() 253 | for path in (str(path) for path in equal_images): 254 | assert path in mock_call.call_args.args[0] 255 | 256 | 257 | @pytest.mark.parametrize('option', ['symlink-smaller']) 258 | @pytest.mark.parametrize('group', [True, False]) 259 | def 
test_symlink_smaller(equal_images: List[Path], option: str, group: bool): 260 | check_symlink(equal_images, option, group, get_biggest) 261 | 262 | 263 | @pytest.mark.parametrize('option', ['symlink-bigger']) 264 | @pytest.mark.parametrize('group', [True, False]) 265 | def test_symlink_bigger(equal_images: List[Path], option: str, group: bool): 266 | check_symlink(equal_images, option, group, get_smallest) 267 | 268 | 269 | def check_symlink( 270 | equal_images: List[Path], option: str, group: bool, get_relevant: Callable[[Results], Path] 271 | ) -> None: 272 | equals = get_equals(equal_images, group) 273 | relevant = get_relevant(equals) 274 | args = parse_command_line(['/', '--on-equal', option]) 275 | duplicate.execute_actions(equals, args) 276 | assert relevant.is_file() 277 | others = set(equal_images) - {relevant} 278 | for path in others: 279 | assert path.is_symlink() 280 | assert path.resolve() == relevant.resolve() 281 | 282 | 283 | @pytest.mark.parametrize('num_images', [7]) 284 | @pytest.mark.parametrize('parallel', [4, 10, 20]) 285 | @pytest.mark.parametrize('sleep_time', [0.005]) 286 | def test_parallel_actions( 287 | many_equal_images: List[Path], num_images: int, parallel: int, sleep_time: float 288 | ) -> None: 289 | equals = ImagePairFinder.create( 290 | many_equal_images, HASH_ALGORITHM, options=PairFinderOptions(group=False) 291 | ).get_equal_groups() 292 | assert len(equals) == factorial(num_images) / (factorial(2) * factorial(num_images - 2)) 293 | 294 | execution_time_single = actions_execution_time( 295 | equals, sleep_time, [] 296 | ) 297 | execution_time_parallel = actions_execution_time( 298 | equals, sleep_time, ['--parallel-actions', str(parallel)] 299 | ) 300 | assert execution_time_parallel < execution_time_single 301 | 302 | 303 | def actions_execution_time(equals: Results, sleep_time: float, extra_args: List[str]) -> timedelta: 304 | args = parse_command_line( 305 | ['.', '--on-equal', 'exec', '--exec', f'sleep {sleep_time}'] + extra_args 306 | ) 307 | start_time = datetime.now() 308 | duplicate.execute_actions(equals, args) 309 | return datetime.now() - start_time 310 | 311 | 312 | @pytest.mark.parametrize('option', ['unknown-option']) 313 | def test_unknown_option(option: str) -> None: 314 | with pytest.raises(SystemExit): 315 | parse_command_line(['/', '--on-equal', option]) 316 | -------------------------------------------------------------------------------- /tests/unit/test_files_in_dirs.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from tempfile import TemporaryDirectory 6 | from typing import Generator 7 | from unittest.mock import patch 8 | 9 | import pytest 10 | 11 | from duplicate_images.duplicate import files_in_dirs, is_image_file 12 | from .conftest import create_image 13 | 14 | NUM_NUMBERED_FILES = 3 15 | TEST_IMAGE_WIDTH = 40 16 | 17 | 18 | @pytest.fixture(name='temp_dir', scope='session') 19 | def top_folder() -> Generator[Path, None, None]: 20 | with TemporaryDirectory() as temp_dir: 21 | yield Path(temp_dir) 22 | 23 | 24 | @pytest.fixture(name='filled_folder', scope='session') 25 | def filled_temp_dir(temp_dir: Path) -> Generator[Path, None, None]: 26 | for i in range(NUM_NUMBERED_FILES): 27 | (temp_dir / str(i)).mkdir() 28 | (temp_dir / str(i) / f'{i}.txt').open('w').close() 29 | yield temp_dir 30 | 31 | 32 | def test_files_in_dirs_finds_created_empty_files(filled_folder: Path) -> None: 33 | found = 
files_in_dirs([filled_folder]) 34 | assert NUM_NUMBERED_FILES == len(found) 35 | assert sorted(found) == sorted(filled_folder.glob('?/?.txt')) 36 | 37 | 38 | def test_files_in_dirs_ignores_empty_files_if_looking_for_images(filled_folder: Path) -> None: 39 | found = files_in_dirs([filled_folder], is_image_file) 40 | assert not found 41 | 42 | 43 | def test_files_in_dirs_ignores_subdir_matching_number_regex(filled_folder: Path) -> None: 44 | found = files_in_dirs([filled_folder], exclude_regexes=[r'/\d$']) 45 | assert not found 46 | 47 | 48 | def test_files_in_dirs_ignores_subdir_matching_explicit_name(filled_folder: Path) -> None: 49 | assert NUM_NUMBERED_FILES >= 1 50 | found = files_in_dirs([filled_folder], exclude_regexes=['/1$']) 51 | assert NUM_NUMBERED_FILES - 1 == len(found) 52 | 53 | 54 | def test_files_in_dirs_ignores_subdirs_matching_multiple_names(filled_folder: Path) -> None: 55 | assert NUM_NUMBERED_FILES >= 2 56 | found = files_in_dirs([filled_folder], exclude_regexes=['/1$', '/2$']) 57 | assert NUM_NUMBERED_FILES - 2 == len(found) 58 | 59 | 60 | def test_files_in_dirs_ignores_patterns_in_file_names(filled_folder: Path) -> None: 61 | assert NUM_NUMBERED_FILES >= 1 62 | found = files_in_dirs([filled_folder], exclude_regexes=['/1.txt$']) 63 | assert NUM_NUMBERED_FILES == len(found) 64 | 65 | 66 | def test_files_in_dirs_with_arbitrary_condition(filled_folder: Path) -> None: 67 | assert NUM_NUMBERED_FILES >= 2 68 | found = files_in_dirs([filled_folder], is_relevant=lambda f: '2.txt' == f.name) 69 | assert 1 == len(found) 70 | assert '2.txt' == found[0].name 71 | 72 | 73 | def test_is_image_file_empty_file(filled_folder: Path) -> None: 74 | assert not is_image_file(filled_folder / '1' / '1.txt') 75 | 76 | 77 | @pytest.mark.parametrize('extension', ['jpg', 'png', 'heif']) 78 | def test_is_image_file_image_file(temp_dir: Path, extension: str) -> None: 79 | create_image(temp_dir / f'1.{extension}', TEST_IMAGE_WIDTH) 80 | assert is_image_file(temp_dir / f'1.{extension}') 81 | 82 | 83 | @pytest.mark.parametrize('extension', ['jpg', 'png', 'heif']) 84 | def test_is_image_file_with_os_failure(temp_dir: Path, extension: str) -> None: 85 | create_image(temp_dir / f'1.{extension}', TEST_IMAGE_WIDTH) 86 | with patch('builtins.open', side_effect=OSError()): 87 | assert not is_image_file(temp_dir / f'1.{extension}') 88 | -------------------------------------------------------------------------------- /tests/unit/test_image_hash_scanner.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from typing import Callable, List 6 | 7 | import pytest 8 | 9 | from duplicate_images.hash_scanner import ImageHashScanner, ParallelImageHashScanner 10 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM, ALGORITHM_DEFAULTS, get_hash_size_kwargs 11 | from .conftest import mock_algorithm, MOCK_IMAGE_HASH_VALUE 12 | 13 | 14 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 15 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 16 | @pytest.mark.parametrize('hash_size', [4, 7, 9]) 17 | def test_different_hash_size_sets_options( 18 | algorithm: str, scanner_class: Callable, hash_size: int 19 | ) -> None: 20 | if algorithm == 'crop_resistant': 21 | return # crop_resistant does not support hash_size 22 | hash_size_kwargs = get_hash_size_kwargs(IMAGE_HASH_ALGORITHM[algorithm], hash_size) 23 | scanner = 
scanner_class([], IMAGE_HASH_ALGORITHM[algorithm], hash_size_kwargs) 24 | assert isinstance(scanner.hash_size_kwargs, dict) 25 | assert len(scanner.hash_size_kwargs) == 1 26 | assert list(scanner.hash_size_kwargs.values())[0] == hash_size 27 | assert list(scanner.hash_size_kwargs.keys())[0] == next(iter( 28 | ALGORITHM_DEFAULTS[IMAGE_HASH_ALGORITHM[algorithm]] 29 | )) 30 | assert scanner.hash_size_kwargs == hash_size_kwargs 31 | 32 | 33 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 34 | def test_hash_values_correct(image_files: List[Path], scanner_class: Callable) -> None: 35 | scanner = scanner_class(image_files, mock_algorithm) 36 | for cache_entry in scanner.precalculate_hashes(): 37 | assert cache_entry[1] == MOCK_IMAGE_HASH_VALUE 38 | -------------------------------------------------------------------------------- /tests/unit/test_image_pair_finder.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from tempfile import TemporaryDirectory 6 | from typing import Any, Callable, List, Tuple 7 | from unittest.mock import Mock 8 | 9 | import pytest 10 | 11 | from duplicate_images.duplicate import files_in_dirs 12 | from duplicate_images.hash_scanner import ImageHashScanner, ParallelImageHashScanner 13 | from duplicate_images.image_pair_finder import ( 14 | DictImagePairFinder, PairFinderOptions, SlowImagePairFinder, group_results_as_pairs 15 | ) 16 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM, get_hash_size_kwargs 17 | from .conftest import is_pair_found, copy_image_file, delete_image_file, named_file 18 | 19 | 20 | def element_in_list_of_tuples(element: Any, tuples: List[Tuple[Any, Any]]) -> bool: 21 | return any(element in tuple for tuple in tuples) 22 | 23 | 24 | def test_get_files(top_directory: TemporaryDirectory, image_files: List[Path]) -> None: 25 | files = files_in_dirs([top_directory.name]) 26 | assert set(files) == set(image_files) 27 | 28 | 29 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 30 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 31 | @pytest.mark.parametrize( 32 | 'finder_class,max_distance', [ 33 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 34 | ] 35 | ) 36 | def test_hashes_equal_for_copied_image( 37 | image_files: List[Path], algorithm: str, 38 | scanner_class: Callable, finder_class: Callable, max_distance: int 39 | ) -> None: 40 | jpeg_file = named_file('jpeg', image_files) 41 | copied_file = copy_image_file(jpeg_file, image_files) 42 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 43 | equals = finder_class( 44 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 45 | ).get_equal_groups() 46 | try: 47 | assert is_pair_found(jpeg_file, copied_file, equals) 48 | finally: 49 | delete_image_file(copied_file, image_files) 50 | 51 | 52 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 53 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 54 | @pytest.mark.parametrize( 55 | 'finder_class,max_distance', [ 56 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 57 | ] 58 | ) 59 | def test_hashes_not_equal_for_noisy_image( 60 | image_files: List[Path], algorithm: str, 61 | scanner_class: Callable, finder_class: Callable, 
max_distance: int 62 | ) -> None: 63 | if algorithm == 'crop_resistant': 64 | return # crop_resistant gives false results for noisy images 65 | subdir_file = named_file('subdir', image_files) 66 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 67 | equals = finder_class( 68 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 69 | ).get_equal_groups() 70 | assert not element_in_list_of_tuples(subdir_file, equals) 71 | 72 | 73 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 74 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 75 | @pytest.mark.parametrize( 76 | 'finder_class,max_distance', [ 77 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 78 | ] 79 | ) 80 | def test_hashes_equal_for_different_image_format( 81 | image_files: List[Path], algorithm: str, 82 | scanner_class: Callable, finder_class: Callable, max_distance: int 83 | ) -> None: 84 | jpeg_file = named_file('jpeg', image_files) 85 | png_file = named_file('png', image_files) 86 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 87 | equals = finder_class( 88 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 89 | ).get_equal_groups() 90 | assert (jpeg_file, png_file) in equals 91 | 92 | 93 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 94 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 95 | @pytest.mark.parametrize( 96 | 'finder_class,max_distance', [ 97 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 98 | ] 99 | ) 100 | def test_hashes_equal_for_scaled_image( 101 | image_files: List[Path], algorithm: str, 102 | scanner_class: Callable, finder_class: Callable, max_distance: int 103 | ) -> None: 104 | jpeg_file = named_file('jpeg', image_files) 105 | half_file = named_file('half', image_files) 106 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 107 | equals = finder_class( 108 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 109 | ).get_equal_groups() 110 | assert (jpeg_file, half_file) in equals 111 | 112 | 113 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 114 | @pytest.mark.parametrize('scanner_class', [ParallelImageHashScanner]) 115 | @pytest.mark.parametrize( 116 | 'finder_class,max_distance', [ 117 | (DictImagePairFinder, 0), (SlowImagePairFinder, 0), (SlowImagePairFinder, 1) 118 | ] 119 | ) 120 | def test_parallel_filtering_gives_same_results( 121 | image_files: List[Path], algorithm: str, 122 | scanner_class: Callable, finder_class: Callable, max_distance: int 123 | ) -> None: 124 | if algorithm == 'crop_resistant': 125 | return # crop_resistant does not support parallel scanning 126 | jpeg_file = named_file('jpeg', image_files) 127 | png_file = named_file('png', image_files) 128 | half_file = named_file('half', image_files) 129 | heif_file = named_file('heif', image_files) 130 | subdir_file = named_file('subdir', image_files) 131 | scanner = scanner_class(image_files, IMAGE_HASH_ALGORITHM[algorithm]) 132 | equals = finder_class( 133 | scanner, group_results_as_pairs, options=PairFinderOptions(max_distance=max_distance) 134 | ).get_equal_groups() 135 | assert is_pair_found(jpeg_file, png_file, equals) 136 | assert is_pair_found(jpeg_file, heif_file, equals) 137 | assert is_pair_found(jpeg_file, half_file, equals) 138 | assert 
is_pair_found(png_file, half_file, equals) 139 | assert is_pair_found(png_file, heif_file, equals) 140 | assert is_pair_found(half_file, heif_file, equals) 141 | assert not is_pair_found(jpeg_file, subdir_file, equals) 142 | assert not is_pair_found(png_file, subdir_file, equals) 143 | assert not is_pair_found(heif_file, subdir_file, equals) 144 | assert not is_pair_found(half_file, subdir_file, equals) 145 | 146 | 147 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 148 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 149 | @pytest.mark.parametrize('finder_class', [DictImagePairFinder, SlowImagePairFinder]) 150 | @pytest.mark.parametrize('hash_size', [4, 16]) 151 | def test_different_hash_size_finds_scaled_images( 152 | image_files: List[Path], algorithm: str, scanner_class: Callable, finder_class: Callable, 153 | hash_size: int 154 | ) -> None: 155 | jpeg_file = named_file('jpeg', image_files) 156 | half_file = named_file('half', image_files) 157 | scanner = scanner_class( 158 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 159 | get_hash_size_kwargs(IMAGE_HASH_ALGORITHM[algorithm], hash_size) 160 | ) 161 | equals = finder_class(scanner, group_results=group_results_as_pairs).get_equal_groups() 162 | assert (jpeg_file, half_file) in equals 163 | 164 | 165 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 166 | @pytest.mark.parametrize('scanner_class', [ImageHashScanner, ParallelImageHashScanner]) 167 | @pytest.mark.parametrize('finder_class', [DictImagePairFinder, SlowImagePairFinder]) 168 | @pytest.mark.parametrize('hash_size', [4, 16]) 169 | def test_smaller_hash_size_finds_similar_images( 170 | image_files: List[Path], algorithm: str, scanner_class: Callable, finder_class: Callable, 171 | hash_size: int 172 | ) -> None: 173 | jpeg_file = named_file('jpeg', image_files) 174 | half_file = named_file('half', image_files) 175 | scanner = scanner_class( 176 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 177 | get_hash_size_kwargs(IMAGE_HASH_ALGORITHM[algorithm], hash_size) 178 | ) 179 | equals = finder_class(scanner, group_results=group_results_as_pairs).get_equal_groups() 180 | assert (jpeg_file, half_file) in equals 181 | 182 | 183 | @pytest.mark.parametrize('max_distance', [1, 2]) 184 | def test_dict_image_finder_fails_for_max_distance_greater_0(max_distance: int) -> None: 185 | with pytest.raises(ValueError): 186 | DictImagePairFinder( 187 | scanner=Mock(), group_results=Mock(), 188 | options=PairFinderOptions(max_distance=max_distance) 189 | ) 190 | -------------------------------------------------------------------------------- /tests/unit/test_imagehash.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | import pytest 8 | 9 | from duplicate_images.function_types import Results 10 | from duplicate_images.image_pair_finder import ImagePairFinder, PairFinderOptions 11 | from duplicate_images.methods import IMAGE_HASH_ALGORITHM 12 | 13 | 14 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 15 | def test_sequential(image_files: List[Path], algorithm: str) -> None: 16 | equals = ImagePairFinder.create( 17 | image_files, IMAGE_HASH_ALGORITHM[algorithm] 18 | ).get_equal_groups() 19 | check_results(equals, algorithm) 20 | 21 | 22 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 23 
| def test_parallel(image_files: List[Path], algorithm: str) -> None: 24 | equals = ImagePairFinder.create( 25 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 26 | options=PairFinderOptions(parallel=True) 27 | ).get_equal_groups() 28 | check_results(equals, algorithm) 29 | 30 | 31 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 32 | def test_max_distance(image_files: List[Path], algorithm: str) -> None: 33 | equals = ImagePairFinder.create( 34 | image_files, IMAGE_HASH_ALGORITHM[algorithm], options=PairFinderOptions(max_distance=1) 35 | ).get_equal_groups() 36 | check_results(equals, algorithm) 37 | 38 | 39 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 40 | def test_explicit_hash_size_works(image_files: List[Path], algorithm: str) -> None: 41 | equals = ImagePairFinder.create( 42 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 43 | options=PairFinderOptions(hash_size=8) 44 | ).get_equal_groups() 45 | check_results(equals, algorithm) 46 | 47 | 48 | def test_bad_hash_size_whash(image_files: List[Path]) -> None: 49 | with pytest.raises(AssertionError): 50 | ImagePairFinder.create( 51 | image_files, IMAGE_HASH_ALGORITHM['whash'], options=PairFinderOptions(hash_size=9) 52 | ).get_equal_groups() 53 | 54 | 55 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 56 | def test_max_distance_parallel(image_files: List[Path], algorithm: str) -> None: 57 | equals = ImagePairFinder.create( 58 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 59 | options=PairFinderOptions(parallel=True, max_distance=1) 60 | ).get_equal_groups() 61 | check_results(equals, algorithm) 62 | 63 | 64 | @pytest.mark.parametrize('parallel', [False, True]) 65 | @pytest.mark.parametrize('max_distance', [0, 1]) 66 | @pytest.mark.parametrize('hash_size', [4, 8]) 67 | @pytest.mark.parametrize('algorithm', list(IMAGE_HASH_ALGORITHM.keys())) 68 | def test_create_with_all_parameters( 69 | image_files: List[Path], parallel: bool, max_distance: int, hash_size: int, algorithm: str 70 | ) -> None: 71 | equals = ImagePairFinder.create( 72 | image_files, IMAGE_HASH_ALGORITHM[algorithm], 73 | options=PairFinderOptions(max_distance=max_distance, hash_size=hash_size, parallel=parallel) 74 | ).get_equal_groups() 75 | check_results(equals, algorithm) 76 | 77 | 78 | def check_results(equals: Results, algorithm: str) -> None: 79 | assert any('jpeg_' in pair[0].name and 'half_' in pair[1].name for pair in equals) 80 | assert any('png_' in pair[0].name and 'half_' in pair[1].name for pair in equals) 81 | assert any('jpeg_' in pair[0].name and 'png_' in pair[1].name for pair in equals) 82 | if algorithm != 'crop_resistant': 83 | assert not any('jpeg_' in pair[0].name and 'subdir_' in pair[1].name for pair in equals), [ 84 | ['/'.join(p[0].parts[3:]), '/'.join(p[1].parts[3:])] 85 | for p in equals if 'subdir_' in p[1].name 86 | ] 87 | -------------------------------------------------------------------------------- /tests/unit/test_parse_commandline.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | from os import cpu_count 5 | from pathlib import Path 6 | from tempfile import TemporaryDirectory 7 | 8 | import pytest 9 | from duplicate_images.methods import ACTIONS_ON_EQUALITY, MOVE_ACTIONS 10 | 11 | from duplicate_images.parse_commandline import parse_command_line 12 | 13 | NON_MOVE_ACTIONS = sorted(list(ACTIONS_ON_EQUALITY.keys() - set(MOVE_ACTIONS))) 14 | 
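# Sentinel values for the config-file tests below. Each value is deliberately
# bogus ('mock' strings, -1 numbers) so the assertions can tell that a parsed
# option really came from the file rather than from an argparse default. The
# config_file fixture writes them out as INI, roughly:
#
#     [Defaults]
#     exclude_dir = /tmp/mock
#     algorithm = mock
#     ...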
MOCK_CONFIG_VALUES = { 15 | 'exclude_dir': '/tmp/mock', 16 | 'algorithm': 'mock', 17 | 'max_distance': -1, 18 | 'hash_size': -1, 19 | 'on_equal': 'mock', 20 | 'parallel': -1, 21 | 'parallel_actions': -1, 22 | 'hash_db': '/tmp/mock.json', 23 | 'max_image_pixels': -1 24 | } 25 | 26 | 27 | def test_root_dir_required() -> None: 28 | with pytest.raises(SystemExit): 29 | parse_command_line([]) 30 | 31 | 32 | def test_one_root_dir_exists() -> None: 33 | args = parse_command_line(['.']) 34 | assert args.root_directory 35 | 36 | 37 | def test_one_root_dir_recognized() -> None: 38 | args = parse_command_line(['.']) 39 | assert len(args.root_directory) == 1 40 | 41 | 42 | def test_one_root_dir_parsed() -> None: 43 | args = parse_command_line(['.']) 44 | assert args.root_directory == ['.'] 45 | 46 | 47 | def test_two_root_dirs_recognized() -> None: 48 | args = parse_command_line(['.', '/home']) 49 | assert len(args.root_directory) == 2 50 | 51 | 52 | def test_two_root_dirs_parsed() -> None: 53 | args = parse_command_line(['.', '/home']) 54 | assert args.root_directory == ['.', '/home'] 55 | 56 | 57 | def test_parallel_unspecified() -> None: 58 | args = parse_command_line(['.']) 59 | assert args.parallel is None 60 | 61 | 62 | def test_parallel_default_arg() -> None: 63 | args = parse_command_line(['.', '--parallel']) 64 | assert args.parallel == cpu_count() 65 | 66 | 67 | @pytest.mark.parametrize('parallel', ['1', '2', '4', '8', '16']) 68 | def test_parallel_explicit_arg(parallel) -> None: 69 | args = parse_command_line(['.', '--parallel', parallel]) 70 | assert args.parallel == int(parallel) 71 | 72 | 73 | def test_parallel_actions_unspecified() -> None: 74 | args = parse_command_line(['.']) 75 | assert args.parallel_actions is None 76 | 77 | 78 | def test_parallel_actions_default_arg() -> None: 79 | args = parse_command_line(['.', '--parallel-actions']) 80 | assert args.parallel_actions == cpu_count() 81 | 82 | 83 | @pytest.mark.parametrize('parallel', ['1', '2', '4', '8', '16']) 84 | def test_parallel_actions_explicit_arg(parallel) -> None: 85 | args = parse_command_line(['.', '--parallel-actions', parallel]) 86 | assert args.parallel_actions == int(parallel) 87 | 88 | 89 | def test_exclude_dir_unspecified() -> None: 90 | args = parse_command_line(['.']) 91 | assert args.exclude_dir is None 92 | 93 | 94 | def test_one_exclude_dir() -> None: 95 | args = parse_command_line(['.', '--exclude-dir', 'foo']) 96 | assert args.exclude_dir == ['foo'] 97 | 98 | 99 | def test_two_exclude_dirs() -> None: 100 | args = parse_command_line(['.', '--exclude-dir', 'foo', 'bar']) 101 | assert args.exclude_dir == ['foo', 'bar'] 102 | 103 | 104 | def test_one_exclude_dir_with_space() -> None: 105 | args = parse_command_line(['.', '--exclude-dir', 'foo bar']) 106 | assert args.exclude_dir == ['foo bar'] 107 | 108 | 109 | def test_exec_fails_without_on_equal() -> None: 110 | with pytest.raises(SystemExit): 111 | parse_command_line(['.', '--exec', 'command']) 112 | 113 | 114 | @pytest.mark.parametrize('option', MOVE_ACTIONS) 115 | def test_move_fails_without_target_folder_specified(option: str) -> None: 116 | with pytest.raises(SystemExit): 117 | parse_command_line(['/', '--on-equal', option]) 118 | 119 | 120 | @pytest.mark.parametrize('option', NON_MOVE_ACTIONS) 121 | def test_non_move_action_fails_with_target_folder_specified(option: str) -> None: 122 | with pytest.raises(SystemExit): 123 | parse_command_line(['/', '--on-equal', option, '--move-to', '/']) 124 | 125 | 126 | @pytest.mark.parametrize('option', 
NON_MOVE_ACTIONS) 127 | def test_non_move_action_fails_with_recreate_path_specified(option: str) -> None: 128 | with pytest.raises(SystemExit): 129 | parse_command_line(['/', '--on-equal', option, '--move-recreate-path']) 130 | 131 | 132 | @pytest.fixture(name='config_file', scope='session') 133 | def fixture_config_file(top_directory: TemporaryDirectory) -> Path: 134 | config_file = Path(top_directory.name) / 'duplicate_images.cfg' 135 | with config_file.open('w') as file: 136 | file.write('[Defaults]\n') 137 | for key, value in MOCK_CONFIG_VALUES.items(): 138 | file.write(f'{key} = {value}\n') 139 | return config_file 140 | 141 | 142 | def test_config_file_is_read(config_file: Path) -> None: 143 | args = parse_command_line(['--config-file', str(config_file), '/tmp']) 144 | assert args.root_directory == ['/tmp'] 145 | 146 | 147 | def test_read_options_from_config_file(config_file: Path) -> None: 148 | args = parse_command_line(['--config-file', str(config_file), '/tmp']) 149 | assert args.exclude_dir == MOCK_CONFIG_VALUES['exclude_dir'] 150 | 151 | 152 | @pytest.mark.parametrize('option', MOCK_CONFIG_VALUES.keys()) 153 | def test_all_options_are_read_from_config_file( 154 | config_file: Path, option: str 155 | ) -> None: 156 | args = parse_command_line(['--config-file', str(config_file), '/tmp']) 157 | assert vars(args)[option] == MOCK_CONFIG_VALUES[option] 158 | -------------------------------------------------------------------------------- /tests/unit/test_persistent_storage.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=missing-docstring 2 | __author__ = 'Lene Preuss ' 3 | 4 | import json 5 | import logging 6 | import pickle 7 | from itertools import combinations 8 | from pathlib import Path 9 | from tempfile import TemporaryDirectory 10 | from time import sleep 11 | from typing import List 12 | 13 | import pytest 14 | 15 | from duplicate_images.duplicate import files_in_dirs, is_image_file 16 | from duplicate_images.function_types import Cache 17 | from duplicate_images.image_pair_finder import ImagePairFinder 18 | from duplicate_images.pair_finder_options import PairFinderOptions 19 | from duplicate_images.hash_store import ( 20 | PickleHashStore, JSONHashStore, FileHashStore, HashStore, NullHashStore 21 | ) 22 | from .conftest import MOCK_IMAGE_HASH_VALUE, mock_algorithm, create_jpg_and_png 23 | 24 | DEFAULT_ALGORITHM = 'phash' 25 | DEFAULT_HASH_SIZE = {'hash_size': 8} 26 | DEFAULT_METADATA = {'algorithm': DEFAULT_ALGORITHM, **DEFAULT_HASH_SIZE} 27 | 28 | 29 | class MockHashStore(FileHashStore): # pylint: disable=abstract-method 30 | def __init__(self, values: Cache) -> None: # pylint: disable=super-init-not-called 31 | self.values = values 32 | 33 | 34 | def test_empty_hash_store_calculates_hash_values( 35 | top_directory: TemporaryDirectory, image_files: List[Path], 36 | reset_call_count # pylint: disable=unused-argument 37 | ) -> None: 38 | finder = generate_pair_finder(top_directory, NullHashStore()) 39 | assert mock_algorithm.call_count > 0 40 | check_correct_results(finder, image_files) 41 | 42 | 43 | def test_filled_hash_store_does_not_calculate_hash_values( 44 | top_directory: TemporaryDirectory, image_files: List[Path], 45 | reset_call_count # pylint: disable=unused-argument 46 | ) -> None: 47 | hash_store = MockHashStore({path: MOCK_IMAGE_HASH_VALUE for path in image_files}) 48 | generate_pair_finder(top_directory, hash_store) 49 | assert mock_algorithm.call_count == 0 50 | 51 | 52 | def 
test_empty_hash_store_is_filled( 53 | top_directory: TemporaryDirectory, reset_call_count # pylint: disable=unused-argument 54 | ) -> None: 55 | finder = generate_pair_finder(top_directory, NullHashStore()) 56 | original_call_number = mock_algorithm.call_count 57 | finder.get_equal_groups() 58 | assert mock_algorithm.call_count == original_call_number 59 | 60 | 61 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 62 | def test_hash_store_is_written( 63 | top_directory: TemporaryDirectory, hash_store_path: Path 64 | ) -> None: 65 | create_verified_hash_store(top_directory, hash_store_path) 66 | assert hash_store_path.is_file() 67 | 68 | 69 | @pytest.mark.parametrize('file_type', ['pickle']) 70 | def test_pickle_file_contains_correct_hashes( 71 | top_directory: TemporaryDirectory, image_files: List[Path], hash_store_path 72 | ) -> None: 73 | create_verified_hash_store(top_directory, hash_store_path) 74 | with hash_store_path.open('rb') as pickle_file: 75 | written_hashes = pickle.load(pickle_file)[0] 76 | for file_name in image_files: 77 | assert file_name in written_hashes 78 | assert written_hashes[file_name] == MOCK_IMAGE_HASH_VALUE 79 | 80 | 81 | @pytest.mark.parametrize('file_type', ['json']) 82 | def test_json_file_contains_correct_hashes( 83 | top_directory: TemporaryDirectory, image_files: List[Path], hash_store_path 84 | ) -> None: 85 | create_verified_hash_store(top_directory, hash_store_path) 86 | with hash_store_path.open('r') as json_file: 87 | written_hashes = json.load(json_file)[0] 88 | for file_name in image_files: 89 | assert str(file_name) in written_hashes 90 | assert written_hashes[str(file_name)] == str(MOCK_IMAGE_HASH_VALUE) 91 | 92 | 93 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 94 | def test_hash_store_load_loads( 95 | top_directory: TemporaryDirectory, image_files: List[Path], hash_store_path 96 | ) -> None: 97 | create_verified_hash_store(top_directory, hash_store_path) 98 | hash_store_class = PickleHashStore if hash_store_path.suffix == '.pickle' else JSONHashStore 99 | hash_store = hash_store_class(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) 100 | hash_store.load() 101 | written_hashes = hash_store.values 102 | for file_name in image_files: 103 | assert str(file_name) in map(str, written_hashes.keys()) 104 | assert str(written_hashes[file_name]) == str(MOCK_IMAGE_HASH_VALUE) 105 | 106 | 107 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 108 | def test_backup_file_created( 109 | top_directory: TemporaryDirectory, hash_store_path: Path 110 | ) -> None: 111 | create_verified_hash_store(top_directory, hash_store_path) 112 | assert not hash_store_path.with_suffix('.bak').is_file() 113 | create_verified_hash_store(top_directory, hash_store_path) 114 | assert hash_store_path.with_suffix('.bak').is_file() 115 | 116 | 117 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 118 | def test_existing_backup_file_does_not_lead_to_error( 119 | top_directory: TemporaryDirectory, hash_store_path: Path 120 | ) -> None: 121 | create_verified_hash_store(top_directory, hash_store_path) # create hash store 122 | create_verified_hash_store(top_directory, hash_store_path) # create backup file 123 | create_verified_hash_store(top_directory, hash_store_path) # check it works still 124 | 125 | 126 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 127 | def test_checked_load_sets_values(top_directory: TemporaryDirectory, hash_store_path: Path) -> None: 128 | create_verified_hash_store(top_directory, hash_store_path) 129 | 
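    # Pick the reader class matching the store's serialization format; these tests
    # assume the format follows purely from the file suffix ('.pickle' vs. '.json').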
hash_store_class = PickleHashStore if hash_store_path.suffix == '.pickle' else JSONHashStore 130 | hash_store = hash_store_class(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) 131 | hash_store.load() 132 | logging.debug(image_list(top_directory)) 133 | assert hash_store.values == {path: MOCK_IMAGE_HASH_VALUE for path in image_list(top_directory)} 134 | 135 | 136 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 137 | def test_checked_load_sets_metadata( 138 | top_directory: TemporaryDirectory, hash_store_path: Path 139 | ) -> None: 140 | create_verified_hash_store(top_directory, hash_store_path) 141 | hash_store_class = PickleHashStore if hash_store_path.suffix == '.pickle' else JSONHashStore 142 | hash_store = hash_store_class(hash_store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) 143 | hash_store.load() 144 | assert hash_store.metadata() == DEFAULT_METADATA 145 | 146 | 147 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 148 | def test_hash_store_not_written_if_not_changed( 149 | top_directory: TemporaryDirectory, hash_store_path: Path 150 | ) -> None: 151 | create_verified_hash_store(top_directory, hash_store_path) 152 | assert hash_store_path.is_file() 153 | creation_time = hash_store_path.stat().st_ctime 154 | scan_images_with_hash_store(top_directory, hash_store_path) 155 | assert hash_store_path.stat().st_ctime == creation_time 156 | assert hash_store_path.stat().st_mtime == creation_time 157 | 158 | 159 | @pytest.mark.parametrize('file_type', ['pickle', 'json']) 160 | def test_hash_store_is_accessed_even_if_not_changed( 161 | top_directory: TemporaryDirectory, hash_store_path: Path 162 | ) -> None: 163 | create_verified_hash_store(top_directory, hash_store_path) 164 | assert hash_store_path.is_file() 165 | sleep(0.01) # ensure the access time is different 166 | creation_time = hash_store_path.stat().st_ctime 167 | scan_images_with_hash_store(top_directory, hash_store_path) 168 | assert hash_store_path.stat().st_atime > creation_time 169 | 170 | 171 | def image_list(top_directory: TemporaryDirectory) -> List[Path]: 172 | return sorted(files_in_dirs([top_directory.name], is_relevant=is_image_file)) 173 | 174 | 175 | def generate_pair_finder( 176 | top_directory: TemporaryDirectory, hash_store: HashStore 177 | ) -> ImagePairFinder: 178 | return ImagePairFinder.create( 179 | image_list(top_directory), mock_algorithm, options=PairFinderOptions(slow=True), 180 | hash_store=hash_store 181 | ) 182 | 183 | 184 | def create_verified_hash_store(top_directory: TemporaryDirectory, store_path: Path) -> None: 185 | create_jpg_and_png(top_directory) 186 | scan_images_with_hash_store(top_directory, store_path) 187 | 188 | 189 | def scan_images_with_hash_store(top_directory: TemporaryDirectory, store_path: Path) -> None: 190 | with FileHashStore.create(store_path, DEFAULT_ALGORITHM, DEFAULT_HASH_SIZE) as hash_store: 191 | finder = generate_pair_finder(top_directory, hash_store) 192 | finder.get_equal_groups() 193 | 194 | 195 | def check_correct_results(finder: ImagePairFinder, images: List[Path]) -> None: 196 | pairs = finder.get_equal_groups() 197 | expected_pairs = combinations(images, 2) 198 | found_pairs_string = f'{[(p[0].name, p[1].name) for p in pairs]}' 199 | for pair in expected_pairs: 200 | assert pair in pairs or (pair[1], pair[0]) in pairs, \ 201 | f'{pair[0].name}, {pair[1].name} not in {found_pairs_string}' 202 | --------------------------------------------------------------------------------
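A minimal sketch of the library workflow these unit tests exercise, assembled from the imports and calls above; treat the exact signatures as assumptions read off the tests, not as documented API:

    from pathlib import Path

    from duplicate_images.image_pair_finder import ImagePairFinder
    from duplicate_images.methods import IMAGE_HASH_ALGORITHM
    from duplicate_images.pair_finder_options import PairFinderOptions

    # hypothetical input; any list of image paths works
    images = [Path('a.jpg'), Path('b.jpg'), Path('c.png')]
    finder = ImagePairFinder.create(
        images, IMAGE_HASH_ALGORITHM['phash'],
        options=PairFinderOptions(max_distance=1, parallel=True)
    )
    groups = finder.get_equal_groups()  # tuples of paths considered duplicates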