├── .codespellignore
├── .dockerignore
├── .github
    ├── FUNDING.yml
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
    ├── config.yml
    ├── dependabot.yml
    ├── reaction.yml
    ├── release-drafter.yml
    ├── stale.yml
    └── workflows
    │   ├── codeql-analysis.yml
    │   ├── codespell.yml
    │   ├── dockerimage-latest.yml
    │   ├── dockerimage-release.yml
    │   └── dockerimage-test.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docker-compose.yml
├── docker
    └── entrypoint.sh
├── example.py
├── poetry.lock
├── py_image_dedup
    ├── __init__.py
    ├── cli.py
    ├── config.py
    ├── library
    │   ├── __init__.py
    │   ├── deduplication_result.py
    │   ├── deduplicator.py
    │   ├── file_watch.py
    │   ├── processing_manager.py
    │   └── progress_manager.py
    ├── persistence
    │   ├── __init__.py
    │   ├── elasticsearchstorebackend.py
    │   └── metadata_key.py
    ├── stats.py
    └── util
    │   ├── __init__.py
    │   ├── file.py
    │   └── image.py
├── py_image_dedup_reference.yaml
├── pyproject.toml
├── pytest.ini
└── tests
    ├── __init__.py
    ├── images
        ├── bottles
        │   ├── IMG_20190903_193151-edited.jpg
        │   ├── IMG_20190903_193151-grayscale.jpg
        │   ├── IMG_20190903_193151-telegram-compression.jpg
        │   └── IMG_20190903_193151.jpg
        ├── building
        │   ├── IMG_20190903_193508-edited.jpg
        │   ├── IMG_20190903_193508-grayscale.jpg
        │   ├── IMG_20190903_193508-telegram-compression.jpg
        │   └── IMG_20190903_193508.jpg
        └── clouds
        │   ├── IMG_20190903_193537-edited.jpg
        │   ├── IMG_20190903_193537-grayscale.jpg
        │   ├── IMG_20190903_193537-telegram-compression.jpg
        │   └── IMG_20190903_193537.jpg
    ├── py_image_dedup.yaml
    ├── test_file_extension.py
    └── test_select_images_to_delete.py


/.codespellignore:
--------------------------------------------------------------------------------
1 | Archiv


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .github
2 | .pytest_cache
3 | duplicates
4 | mnt
5 | venv
6 | *.yaml


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: [markusressel]
 4 | # patreon: # Replace with a single Patreon username
 5 | # open_collective: # Replace with a single Open Collective username
 6 | # ko_fi: # Replace with a single Ko-fi username
 7 | # tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | # community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | # liberapay: TheAlgorithms
10 | # issuehunt: # Replace with a single IssueHunt username
11 | # otechie: # Replace with a single Otechie username
12 | # custom: ['https://paypal.me/markusressel/1']


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | 
 5 | ---
 6 | 
 7 | **Describe the bug**
 8 | A clear and concise description of what the bug is.
 9 | 
10 | **To Reproduce**
11 | Steps to reproduce the behavior:
12 | 1. Go to '...'
13 | 2. Click on '....'
14 | 3. Scroll down to '....'
15 | 4. See error
16 | 
17 | **Expected behavior**
18 | A clear and concise description of what you expected to happen.
19 | 
20 | **Screenshots**
21 | If applicable, add screenshots to help explain your problem.
22 | 
23 | **Desktop (please complete the following information):**
24 |  - OS: [e.g. Linux]
25 | 
26 | **Additional context**
27 | Add any other context about the problem here.
28 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | 
 5 | ---
 6 | 
 7 | **Is your feature request related to a problem? Please describe.**
 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
 9 | 
10 | **Describe the solution you'd like**
11 | A clear and concise description of what you want to happen.
12 | 
13 | **Describe alternatives you've considered**
14 | A clear and concise description of any alternative solutions or features you've considered.
15 | 
16 | **Additional context**
17 | Add any other context or screenshots about the feature request here.
18 | 


--------------------------------------------------------------------------------
/.github/config.yml:
--------------------------------------------------------------------------------
 1 | # Configuration for new-issue-welcome - https://github.com/behaviorbot/new-issue-welcome
 2 | 
 3 | # Comment to be posted on first-time issues
 4 | newIssueWelcomeComment: >
 5 |   Thanks for opening your first issue here! :tada:
 6 | 
 7 | # Configuration for new-pr-welcome - https://github.com/behaviorbot/new-pr-welcome
 8 | 
 9 | # Comment to be posted on PRs from first-time contributors in your repository
10 | newPRWelcomeComment: >
11 |   Thanks for opening this pull request! :nerd_face:
12 | 
13 | # Configuration for first-pr-merge - https://github.com/behaviorbot/first-pr-merge
14 | 
15 | # Comment to be posted on pull requests merged by a first-time user
16 | firstPRMergeComment: >
17 |   Congrats on merging your first pull request here! You should be proud of yourself :1st_place_medal: 
18 |   ![Congratulations](https://media.giphy.com/media/ehhuGD0nByYxO/giphy.gif)
19 | 
20 | # It is recommended to include as many gifs and emojis as possible
21 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | registries:
 3 |   python-index-pypi-python-org-simple:
 4 |     type: python-index
 5 |     url: https://pypi.python.org/simple/
 6 |     username: "${{secrets.PYTHON_INDEX_PYPI_PYTHON_ORG_SIMPLE_USERNAME}}"
 7 |     password: "${{secrets.PYTHON_INDEX_PYPI_PYTHON_ORG_SIMPLE_PASSWORD}}"
 8 | 
 9 | updates:
10 |   - package-ecosystem: github-actions
11 |     directory: "/"
12 |     schedule:
13 |       # Check for updates to GitHub Actions every week
14 |       interval: "weekly"
15 |   - package-ecosystem: pip
16 |     insecure-external-code-execution: allow
17 |     directory: "/"
18 |     schedule:
19 |       interval: daily
20 |       time: "16:00"
21 |       timezone: Europe/Berlin
22 |     open-pull-requests-limit: 10
23 |     registries:
24 |       - python-index-pypi-python-org-simple
25 | 


--------------------------------------------------------------------------------
/.github/reaction.yml:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
 1 | categories:
 2 |   - title: 🚀 Features and ✨ Enhancements
 3 |     label: enhancement
 4 |   - title: 🐛 Bugfixes
 5 |     label: bug
 6 | change-template: "* $TITLE (#$NUMBER) by @$AUTHOR"
 7 | template: |
 8 |   ## What’s Changed
 9 | 
10 |   $CHANGES
11 | 


--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
 1 | # Configuration for probot-stale - https://github.com/probot/stale
 2 | 
 3 | # Number of days of inactivity before an Issue or Pull Request becomes stale
 4 | daysUntilStale: 60
 5 | 
 6 | # Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
 7 | # Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
 8 | daysUntilClose: 14
 9 | 
10 | # Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
11 | exemptLabels:
12 |   - pinned
13 |   - security
14 |   - bug
15 |   - enhancement
16 | 
17 | # Set to true to ignore issues in a project (defaults to false)
18 | exemptProjects: false
19 | 
20 | # Set to true to ignore issues in a milestone (defaults to false)
21 | exemptMilestones: false
22 | 
23 | # Set to true to ignore issues with an assignee (defaults to false)
24 | exemptAssignees: true
25 | 
26 | # Label to use when marking as stale
27 | staleLabel: wontfix
28 | 
29 | # Comment to post when marking as stale. Set to `false` to disable
30 | markComment: >
31 |   This issue has been automatically marked as stale because it has not had
32 |   recent activity. It will be closed if no further activity occurs. Thank you
33 |   for your contributions.
34 | 
35 | # Comment to post when removing the stale label.
36 | # unmarkComment: >
37 | #   Your comment here.
38 | 
39 | # Comment to post when closing a stale Issue or Pull Request.
40 | closeComment: >
41 |   There has been no incentive by contributors or maintainers to revive this stale issue and it will now be closed.
42 | 
43 | # Limit the number of actions per hour, from 1-30. Default is 30
44 | limitPerRun: 30
45 | 
46 | # Limit to only `issues` or `pulls`
47 | only: issues
48 | 
49 | # Optionally, specify configuration settings that are specific to just 'issues' or 'pulls':
50 | # pulls:
51 | #   daysUntilStale: 30
52 | #   markComment: >
53 | #     This pull request has been automatically marked as stale because it has not had
54 | #     recent activity. It will be closed if no further activity occurs. Thank you
55 | #     for your contributions.
56 | 
57 | # issues:
58 | #   exemptLabels:
59 | #     - confirmed
60 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | name: "CodeQL"
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master, ]
 6 |   pull_request:
 7 |     # The branches below must be a subset of the branches above
 8 |     branches: [master]
 9 |   schedule:
10 |     - cron: '0 1 * * 4'
11 | 
12 | jobs:
13 |   analyze:
14 |     name: Analyze
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |     - name: Checkout repository
19 |       uses: actions/checkout@v4
20 |       with:
21 |         # We must fetch at least the immediate parents so that if this is
22 |         # a pull request then we can checkout the head.
23 |         fetch-depth: 2
24 | 
25 |     # If this run was triggered by a pull request event, then checkout
26 |     # the head of the pull request instead of the merge commit.
27 |     - run: git checkout HEAD^2
28 |       if: ${{ github.event_name == 'pull_request' }}
29 | 
30 |     # Initializes the CodeQL tools for scanning.
31 |     - name: Initialize CodeQL
32 |       uses: github/codeql-action/init@v3
33 |       # Override language selection by uncommenting this and choosing your languages
34 |       # with:
35 |       #   languages: go, javascript, csharp, python, cpp, java
36 | 
37 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
38 |     # If this step fails, then you should remove it and run the build manually (see below)
39 |     - name: Autobuild
40 |       uses: github/codeql-action/autobuild@v3
41 | 
42 |     # ℹ️ Command-line programs to run using the OS shell.
43 |     # 📚 https://git.io/JvXDl
44 | 
45 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
46 |     #    and modify them (or add more) to build your code if your project
47 |     #    uses a compiled language
48 | 
49 |     #- run: |
50 |     #   make bootstrap
51 |     #   make release
52 | 
53 |     - name: Perform CodeQL Analysis
54 |       uses: github/codeql-action/analyze@v3
55 | 


--------------------------------------------------------------------------------
/.github/workflows/codespell.yml:
--------------------------------------------------------------------------------
 1 | # Codespell configuration is within pyproject.toml
 2 | ---
 3 | name: Codespell
 4 | 
 5 | on:
 6 |   push:
 7 |     branches: [ master ]
 8 |   pull_request:
 9 |     branches: [ master ]
10 | 
11 | permissions:
12 |   contents: read
13 | 
14 | jobs:
15 |   codespell:
16 |     name: Check for spelling errors
17 |     runs-on: ubuntu-latest
18 | 
19 |     steps:
20 |       - name: Checkout
21 |         uses: actions/checkout@v4
22 |       - name: Annotate locations with typos
23 |         uses: codespell-project/codespell-problem-matcher@v1
24 |       - name: Codespell
25 |         uses: codespell-project/actions-codespell@v2
26 |         with:
27 |           ignore_words_file: .codespellignore
28 |           skip: "*.svg"


--------------------------------------------------------------------------------
/.github/workflows/dockerimage-latest.yml:
--------------------------------------------------------------------------------
 1 | name: Docker Image latest
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 | 
 7 | jobs:
 8 |   build:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: actions/checkout@v4
12 |       - name: Build the Docker image
13 |         run: docker build . --file Dockerfile --tag markusressel/py-image-dedup:latest
14 |       - name: Login to DockerHub Registry
15 |         run: echo ${{ secrets.DOCKERHUB_PASSWORD }} | docker login -u ${{ secrets.DOCKERHUB_USERNAME }} --password-stdin
16 |       - name: Push the Docker image
17 |         if: github.ref_name == 'master'
18 |         run: docker push markusressel/py-image-dedup:latest
19 | 
20 |   dockerHubDescription:
21 |     runs-on: ubuntu-latest
22 |     steps:
23 |       - uses: actions/checkout@v4
24 |       - name: Docker Hub Description
25 |         uses: peter-evans/dockerhub-description@v4.0.2
26 |         env:
27 |           DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
28 |           DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
29 |           DOCKERHUB_REPOSITORY: markusressel/py-image-dedup
30 | 


--------------------------------------------------------------------------------
/.github/workflows/dockerimage-release.yml:
--------------------------------------------------------------------------------
 1 | name: Docker Image Release
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - "*.*.*"
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       # this writes the tag name into GIT_TAG_NAME
14 |       - name: Get tag name
15 |         uses: little-core-labs/get-git-tag@v3.0.2
16 |       - name: Build the Docker image
17 |         run: docker build . --file Dockerfile --tag markusressel/py-image-dedup:$GIT_TAG_NAME
18 |       - name: Login to DockerHub Registry
19 |         run: echo ${{ secrets.DOCKERHUB_PASSWORD }} | docker login -u ${{ secrets.DOCKERHUB_USERNAME }} --password-stdin
20 |       - name: Push the Docker image
21 |         run: docker push markusressel/py-image-dedup:$GIT_TAG_NAME
22 | 


--------------------------------------------------------------------------------
/.github/workflows/dockerimage-test.yml:
--------------------------------------------------------------------------------
 1 | name: Test Docker Image
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     # Only run for pull requests that target the branches below
 6 |     branches: [ master ]
 7 | 
 8 | jobs:
 9 |   test:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - name: Build the Docker image
14 |         run: docker build . --file Dockerfile --tag markusressel/py-image-dedup:latest
15 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by .ignore support plugin (hsz.mobi)
  2 | ### VirtualEnv template
  3 | # Virtualenv
  4 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
  5 | .Python
  6 | [Bb]in
  7 | [Ii]nclude
  8 | [Ll]ib
  9 | [Ll]ib64
 10 | [Ll]ocal
 11 | [Ss]cripts
 12 | pyvenv.cfg
 13 | .venv
 14 | pip-selfcheck.json
 15 | ### JetBrains template
 16 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
 17 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
 18 | 
 19 | # User-specific stuff:
 20 | .idea/**/workspace.xml
 21 | .idea/**/tasks.xml
 22 | .idea/dictionaries
 23 | 
 24 | # Sensitive or high-churn files:
 25 | .idea/**/dataSources/
 26 | .idea/**/dataSources.ids
 27 | .idea/**/dataSources.xml
 28 | .idea/**/dataSources.local.xml
 29 | .idea/**/sqlDataSources.xml
 30 | .idea/**/dynamic.xml
 31 | .idea/**/uiDesigner.xml
 32 | 
 33 | # Gradle:
 34 | .idea/**/gradle.xml
 35 | .idea/**/libraries
 36 | 
 37 | # CMake
 38 | cmake-build-debug/
 39 | cmake-build-release/
 40 | 
 41 | # Mongo Explorer plugin:
 42 | .idea/**/mongoSettings.xml
 43 | 
 44 | ## File-based project format:
 45 | *.iws
 46 | 
 47 | ## Plugin-specific files:
 48 | 
 49 | # IntelliJ
 50 | out/
 51 | 
 52 | # mpeltonen/sbt-idea plugin
 53 | .idea_modules/
 54 | 
 55 | # JIRA plugin
 56 | atlassian-ide-plugin.xml
 57 | 
 58 | # Cursive Clojure plugin
 59 | .idea/replstate.xml
 60 | 
 61 | # Crashlytics plugin (for Android Studio and IntelliJ)
 62 | com_crashlytics_export_strings.xml
 63 | crashlytics.properties
 64 | crashlytics-build.properties
 65 | fabric.properties
 66 | ### Python template
 67 | # Byte-compiled / optimized / DLL files
 68 | __pycache__/
 69 | *.py[cod]
 70 | *$py.class
 71 | 
 72 | # C extensions
 73 | *.so
 74 | 
 75 | # Distribution / packaging
 76 | .Python
 77 | build/
 78 | develop-eggs/
 79 | dist/
 80 | downloads/
 81 | eggs/
 82 | .eggs/
 83 | lib/
 84 | lib64/
 85 | parts/
 86 | sdist/
 87 | var/
 88 | wheels/
 89 | *.egg-info/
 90 | .installed.cfg
 91 | *.egg
 92 | MANIFEST
 93 | 
 94 | # PyInstaller
 95 | #  Usually these files are written by a python script from a template
 96 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 97 | *.manifest
 98 | *.spec
 99 | 
100 | # Installer logs
101 | pip-log.txt
102 | pip-delete-this-directory.txt
103 | 
104 | # Unit test / coverage reports
105 | htmlcov/
106 | .tox/
107 | .coverage
108 | .coverage.*
109 | .cache
110 | nosetests.xml
111 | coverage.xml
112 | *.cover
113 | .hypothesis/
114 | 
115 | # Translations
116 | *.mo
117 | *.pot
118 | 
119 | # Django stuff:
120 | *.log
121 | .static_storage/
122 | .media/
123 | local_settings.py
124 | 
125 | # Flask stuff:
126 | instance/
127 | .webassets-cache
128 | 
129 | # Scrapy stuff:
130 | .scrapy
131 | 
132 | # Sphinx documentation
133 | docs/_build/
134 | 
135 | # PyBuilder
136 | target/
137 | 
138 | # Jupyter Notebook
139 | .ipynb_checkpoints
140 | 
141 | # pyenv
142 | .python-version
143 | 
144 | # celery beat schedule file
145 | celerybeat-schedule
146 | 
147 | # SageMath parsed files
148 | *.sage.py
149 | 
150 | # Environments
151 | .env
152 | .venv
153 | env/
154 | venv/
155 | ENV/
156 | env.bak/
157 | venv.bak/
158 | 
159 | # Spyder project settings
160 | .spyderproject
161 | .spyproject
162 | 
163 | # Rope project settings
164 | .ropeproject
165 | 
166 | # mkdocs documentation
167 | /site
168 | 
169 | # mypy
170 | .mypy_cache/
171 | 
172 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Don't use Alpine for Python builds: https://pythonspeed.com/articles/alpine-docker-python/
 2 | FROM python:3.11-slim-buster
 3 | 
 4 | ENV PYTHONUNBUFFERED=1
 5 | ENV POETRY_VERSION="2.1.2"
 6 | ENV PIP_DISABLE_PIP_VERSION_CHECK=on
 7 | 
 8 | RUN apt-get update \
 9 | && apt-get -y install sudo git python-skimage
10 | 
11 | WORKDIR /app
12 | 
13 | COPY . .
14 | 
15 | COPY poetry.lock pyproject.toml ./
16 | 
17 | RUN apt-get update && \
18 |     apt-get install -y libatlas-base-dev gfortran && \
19 |     apt-get clean && rm -rf /var/lib/apt/lists/*
20 | 
21 | RUN pip install "poetry==$POETRY_VERSION" \
22 |  && POETRY_VIRTUALENVS_CREATE=false poetry install --without dev \
23 |  && pip uninstall -y poetry
24 | 
25 | ENV PUID=1000 PGID=1000
26 | 
27 | ENTRYPOINT [ "docker/entrypoint.sh", "py-image-dedup" ]
28 | CMD [ "daemon" ]
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU AFFERO GENERAL PUBLIC LICENSE
  2 |                        Version 3, 19 November 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU Affero General Public License is a free, copyleft license for
 11 | software and other kinds of works, specifically designed to ensure
 12 | cooperation with the community in the case of network server software.
 13 | 
 14 |   The licenses for most software and other practical works are designed
 15 | to take away your freedom to share and change the works.  By contrast,
 16 | our General Public Licenses are intended to guarantee your freedom to
 17 | share and change all versions of a program--to make sure it remains free
 18 | software for all its users.
 19 | 
 20 |   When we speak of free software, we are referring to freedom, not
 21 | price.  Our General Public Licenses are designed to make sure that you
 22 | have the freedom to distribute copies of free software (and charge for
 23 | them if you wish), that you receive source code or can get it if you
 24 | want it, that you can change the software or use pieces of it in new
 25 | free programs, and that you know you can do these things.
 26 | 
 27 |   Developers that use our General Public Licenses protect your rights
 28 | with two steps: (1) assert copyright on the software, and (2) offer
 29 | you this License which gives you legal permission to copy, distribute
 30 | and/or modify the software.
 31 | 
 32 |   A secondary benefit of defending all users' freedom is that
 33 | improvements made in alternate versions of the program, if they
 34 | receive widespread use, become available for other developers to
 35 | incorporate.  Many developers of free software are heartened and
 36 | encouraged by the resulting cooperation.  However, in the case of
 37 | software used on network servers, this result may fail to come about.
 38 | The GNU General Public License permits making a modified version and
 39 | letting the public access it on a server without ever releasing its
 40 | source code to the public.
 41 | 
 42 |   The GNU Affero General Public License is designed specifically to
 43 | ensure that, in such cases, the modified source code becomes available
 44 | to the community.  It requires the operator of a network server to
 45 | provide the source code of the modified version running there to the
 46 | users of that server.  Therefore, public use of a modified version, on
 47 | a publicly accessible server, gives the public access to the source
 48 | code of the modified version.
 49 | 
 50 |   An older license, called the Affero General Public License and
 51 | published by Affero, was designed to accomplish similar goals.  This is
 52 | a different license, not a version of the Affero GPL, but Affero has
 53 | released a new version of the Affero GPL which permits relicensing under
 54 | this license.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                        TERMS AND CONDITIONS
 60 | 
 61 |   0. Definitions.
 62 | 
 63 |   "This License" refers to version 3 of the GNU Affero General Public License.
 64 | 
 65 |   "Copyright" also means copyright-like laws that apply to other kinds of
 66 | works, such as semiconductor masks.
 67 | 
 68 |   "The Program" refers to any copyrightable work licensed under this
 69 | License.  Each licensee is addressed as "you".  "Licensees" and
 70 | "recipients" may be individuals or organizations.
 71 | 
 72 |   To "modify" a work means to copy from or adapt all or part of the work
 73 | in a fashion requiring copyright permission, other than the making of an
 74 | exact copy.  The resulting work is called a "modified version" of the
 75 | earlier work or a work "based on" the earlier work.
 76 | 
 77 |   A "covered work" means either the unmodified Program or a work based
 78 | on the Program.
 79 | 
 80 |   To "propagate" a work means to do anything with it that, without
 81 | permission, would make you directly or secondarily liable for
 82 | infringement under applicable copyright law, except executing it on a
 83 | computer or modifying a private copy.  Propagation includes copying,
 84 | distribution (with or without modification), making available to the
 85 | public, and in some countries other activities as well.
 86 | 
 87 |   To "convey" a work means any kind of propagation that enables other
 88 | parties to make or receive copies.  Mere interaction with a user through
 89 | a computer network, with no transfer of a copy, is not conveying.
 90 | 
 91 |   An interactive user interface displays "Appropriate Legal Notices"
 92 | to the extent that it includes a convenient and prominently visible
 93 | feature that (1) displays an appropriate copyright notice, and (2)
 94 | tells the user that there is no warranty for the work (except to the
 95 | extent that warranties are provided), that licensees may convey the
 96 | work under this License, and how to view a copy of this License.  If
 97 | the interface presents a list of user commands or options, such as a
 98 | menu, a prominent item in the list meets this criterion.
 99 | 
100 |   1. Source Code.
101 | 
102 |   The "source code" for a work means the preferred form of the work
103 | for making modifications to it.  "Object code" means any non-source
104 | form of a work.
105 | 
106 |   A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 | 
111 |   The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form.  A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 | 
122 |   The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities.  However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work.  For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 | 
135 |   The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 | 
139 |   The Corresponding Source for a work in source code form is that
140 | same work.
141 | 
142 |   2. Basic Permissions.
143 | 
144 |   All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met.  This License explicitly affirms your unlimited
147 | permission to run the unmodified Program.  The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work.  This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 | 
152 |   You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force.  You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright.  Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 | 
163 |   Conveying under any other circumstances is permitted solely under
164 | the conditions stated below.  Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 | 
167 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 | 
169 |   No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 | 
175 |   When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 | 
183 |   4. Conveying Verbatim Copies.
184 | 
185 |   You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 | 
193 |   You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 | 
196 |   5. Conveying Modified Source Versions.
197 | 
198 |   You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 | 
202 |     a) The work must carry prominent notices stating that you modified
203 |     it, and giving a relevant date.
204 | 
205 |     b) The work must carry prominent notices stating that it is
206 |     released under this License and any conditions added under section
207 |     7.  This requirement modifies the requirement in section 4 to
208 |     "keep intact all notices".
209 | 
210 |     c) You must license the entire work, as a whole, under this
211 |     License to anyone who comes into possession of a copy.  This
212 |     License will therefore apply, along with any applicable section 7
213 |     additional terms, to the whole of the work, and all its parts,
214 |     regardless of how they are packaged.  This License gives no
215 |     permission to license the work in any other way, but it does not
216 |     invalidate such permission if you have separately received it.
217 | 
218 |     d) If the work has interactive user interfaces, each must display
219 |     Appropriate Legal Notices; however, if the Program has interactive
220 |     interfaces that do not display Appropriate Legal Notices, your
221 |     work need not make them do so.
222 | 
223 |   A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit.  Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 | 
233 |   6. Conveying Non-Source Forms.
234 | 
235 |   You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 | 
240 |     a) Convey the object code in, or embodied in, a physical product
241 |     (including a physical distribution medium), accompanied by the
242 |     Corresponding Source fixed on a durable physical medium
243 |     customarily used for software interchange.
244 | 
245 |     b) Convey the object code in, or embodied in, a physical product
246 |     (including a physical distribution medium), accompanied by a
247 |     written offer, valid for at least three years and valid for as
248 |     long as you offer spare parts or customer support for that product
249 |     model, to give anyone who possesses the object code either (1) a
250 |     copy of the Corresponding Source for all the software in the
251 |     product that is covered by this License, on a durable physical
252 |     medium customarily used for software interchange, for a price no
253 |     more than your reasonable cost of physically performing this
254 |     conveying of source, or (2) access to copy the
255 |     Corresponding Source from a network server at no charge.
256 | 
257 |     c) Convey individual copies of the object code with a copy of the
258 |     written offer to provide the Corresponding Source.  This
259 |     alternative is allowed only occasionally and noncommercially, and
260 |     only if you received the object code with such an offer, in accord
261 |     with subsection 6b.
262 | 
263 |     d) Convey the object code by offering access from a designated
264 |     place (gratis or for a charge), and offer equivalent access to the
265 |     Corresponding Source in the same way through the same place at no
266 |     further charge.  You need not require recipients to copy the
267 |     Corresponding Source along with the object code.  If the place to
268 |     copy the object code is a network server, the Corresponding Source
269 |     may be on a different server (operated by you or a third party)
270 |     that supports equivalent copying facilities, provided you maintain
271 |     clear directions next to the object code saying where to find the
272 |     Corresponding Source.  Regardless of what server hosts the
273 |     Corresponding Source, you remain obligated to ensure that it is
274 |     available for as long as needed to satisfy these requirements.
275 | 
276 |     e) Convey the object code using peer-to-peer transmission, provided
277 |     you inform other peers where the object code and Corresponding
278 |     Source of the work are being offered to the general public at no
279 |     charge under subsection 6d.
280 | 
281 |   A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 | 
285 |   A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling.  In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage.  For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product.  A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 | 
298 |   "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source.  The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 | 
306 |   If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information.  But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 | 
317 |   The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed.  Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 | 
325 |   Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 | 
331 |   7. Additional Terms.
332 | 
333 |   "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law.  If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 | 
342 |   When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it.  (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.)  You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 | 
349 |   Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 | 
353 |     a) Disclaiming warranty or limiting liability differently from the
354 |     terms of sections 15 and 16 of this License; or
355 | 
356 |     b) Requiring preservation of specified reasonable legal notices or
357 |     author attributions in that material or in the Appropriate Legal
358 |     Notices displayed by works containing it; or
359 | 
360 |     c) Prohibiting misrepresentation of the origin of that material, or
361 |     requiring that modified versions of such material be marked in
362 |     reasonable ways as different from the original version; or
363 | 
364 |     d) Limiting the use for publicity purposes of names of licensors or
365 |     authors of the material; or
366 | 
367 |     e) Declining to grant rights under trademark law for use of some
368 |     trade names, trademarks, or service marks; or
369 | 
370 |     f) Requiring indemnification of licensors and authors of that
371 |     material by anyone who conveys the material (or modified versions of
372 |     it) with contractual assumptions of liability to the recipient, for
373 |     any liability that these contractual assumptions directly impose on
374 |     those licensors and authors.
375 | 
376 |   All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10.  If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term.  If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 | 
386 |   If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 | 
391 |   Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 | 
395 |   8. Termination.
396 | 
397 |   You may not propagate or modify a covered work except as expressly
398 | provided under this License.  Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 | 
403 |   However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 | 
410 |   Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 | 
417 |   Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License.  If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 | 
423 |   9. Acceptance Not Required for Having Copies.
424 | 
425 |   You are not required to accept this License in order to receive or
426 | run a copy of the Program.  Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance.  However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work.  These actions infringe copyright if you do
431 | not accept this License.  Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 | 
434 |   10. Automatic Licensing of Downstream Recipients.
435 | 
436 |   Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License.  You are not responsible
439 | for enforcing compliance by third parties with this License.
440 | 
441 |   An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations.  If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 | 
451 |   You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License.  For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 | 
459 |   11. Patents.
460 | 
461 |   A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based.  The
463 | work thus licensed is called the contributor's "contributor version".
464 | 
465 |   A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version.  For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 | 
475 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 | 
480 |   In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement).  To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 | 
487 |   If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients.  "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 | 
501 |   If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 | 
509 |   A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License.  You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 | 
524 |   Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 | 
528 |   12. No Surrender of Others' Freedom.
529 | 
530 |   If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License.  If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all.  For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 | 
540 |   13. Remote Network Interaction; Use with the GNU General Public License.
541 | 
542 |   Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software.  This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 | 
553 |   Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work.  The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 | 
561 |   14. Revised Versions of this License.
562 | 
563 |   The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time.  Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 | 
568 |   Each version is given a distinguishing version number.  If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation.  If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 | 
577 |   If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 | 
582 |   Later license versions may give you additional or different
583 | permissions.  However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 | 
587 |   15. Disclaimer of Warranty.
588 | 
589 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 | 
598 |   16. Limitation of Liability.
599 | 
600 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 | 
610 |   17. Interpretation of Sections 15 and 16.
611 | 
612 |   If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 | 
619 |                      END OF TERMS AND CONDITIONS
620 | 
621 |             How to Apply These Terms to Your New Programs
622 | 
623 |   If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 | 
627 |   To do so, attach the following notices to the program.  It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 | 
632 |     <one line to give the program's name and a brief idea of what it does.>
633 |     Copyright (C) <year>  <name of author>
634 | 
635 |     This program is free software: you can redistribute it and/or modify
636 |     it under the terms of the GNU Affero General Public License as published
637 |     by the Free Software Foundation, either version 3 of the License, or
638 |     (at your option) any later version.
639 | 
640 |     This program is distributed in the hope that it will be useful,
641 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
642 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
643 |     GNU Affero General Public License for more details.
644 | 
645 |     You should have received a copy of the GNU Affero General Public License
646 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
647 | 
648 | Also add information on how to contact you by electronic and paper mail.
649 | 
650 |   If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source.  For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code.  There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 | 
658 |   You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
662 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include Pipfile.lock


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: all docker clean test
2 | 
3 | docker:
4 | 	sudo docker build . --file Dockerfile --tag markusressel/py-image-dedup:latest
5 | 
6 | test:
7 | 	cd tests; pytest


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # py-image-dedup [![Build Status](https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2Fmarkusressel%2Fpy-image-dedup%2Fbadge%3Fref%3Dmaster&style=flat)](https://actions-badge.atrox.dev/markusressel/py-image-dedup/goto?ref=master) [![Code Climate](https://codeclimate.com/github/markusressel/py-image-dedup.svg)](https://codeclimate.com/github/markusressel/py-image-dedup) [![PyPI version](https://badge.fury.io/py/py-image-dedup.svg)](https://badge.fury.io/py/py-image-dedup)
  2 | 
  3 | **py-image-dedup** is a tool to sort out or remove duplicates within a photo library. 
  4 | Unlike most other solutions, **py-image-dedup** 
  5 | intentionally uses an approximate image comparison to also detect 
  6 | duplicates of images that slightly differ in resolution, color or other minor details.
  7 | 
  8 | It is built upon [Image-Match](https://github.com/ascribe/image-match), a very popular library to compute
  9 | a pHash for an image and store the result in an ElasticSearch backend for very high scalability.
 10 | 
 11 | [![asciicast](https://asciinema.org/a/3WbBxMXnZyT1QnuTP9fm37wkS.svg)](https://asciinema.org/a/3WbBxMXnZyT1QnuTP9fm37wkS)
 12 | 
 13 | # How it works
 14 | 
 15 | ### Phase 1 - Database cleanup
 16 | 
 17 | In the first phase the elasticsearch backend is checked against the 
 18 | current filesystem state, cleaning up database entries of files that 
 19 | no longer exist. This will speed up queries made later on.
 20 | 
 21 | ### Phase 2 - Counting files
 22 | 
 23 | Although not necessary for the deduplication process it is very convenient
 24 | to have some kind of progress indication while the deduplication process
 25 | is at work. To be able to provide that, available files must be counted beforehand.
 26 | 
 27 | ### Phase 3 - Analysing files
 28 | 
 29 | In this phase every image file is analysed. This means generating a signature (pHash)
 30 | to quickly compare it to other images and adding other metadata of the image
 31 | to the elasticsearch backend that is used in the next phase.
 32 | 
 33 | This phase is quite CPU intensive and the first run can take quite
 34 | some time. Using as many threads as feasible (using the `-t` parameter)
 35 | is advised to get the best performance.
 36 | 
 37 | Since the database might already contain an entry for a file from a previous run,
 38 | the file's modification time is compared to the stored one before analysing it.
 39 | If the database content still seems to be correct, the signature
 40 | for this file will **not** be recalculated. Because of this, subsequent
 41 | runs will be much faster. Some file access still has to happen though,
 42 | so performance is probably limited by that.
 43 | 
 44 | ### Phase 4 - Finding duplicates
 45 | 
 46 | Every file is now processed again - but only by means of querying the
 47 | database backend for similar images (within the given `max_dist`).
 48 | If there are images found that match the similarity criteria they are considered
 49 | duplicate candidates. All candidates are then ordered according to the `prioritization_rules`,
 50 | which you can specify yourself in the configuration, see [Configuration](#Configuration).
 51 | 
 52 | If you do not specify `prioritization_rules` yourself, the following order will
 53 | be used:
 54 | 
 55 | 1. pixel count (more is better)
 56 | 1. EXIF data (more exif data is better)
 57 | 1. file size (bigger is better)
 58 | 1. file modification time (newer is better)
 59 | 1. distance (lower is better)
 60 | 1. filename contains "copy" (False is better)
 61 | 1. filename length (longer is better) - (for "edited" versions)
 62 | 1. parent folder path length (shorter is better)
 63 | 1. score (higher is better)
 64 | 
 65 | The first candidate in the resulting list is considered to be the best
 66 | available version of all candidates.
 67 |  
 68 | ### Phase 5 - Moving/Deleting duplicates
 69 | 
 70 | All but the best version of duplicate candidates identified in the previous
 71 | phase are now deleted from the file system (if you didn't specify `--dry-run` of course).
 72 | 
 73 | If `duplicates_target_directory` is set, the specified folder will be used as
 74 | a root directory to move duplicates to, instead of deleting them, replicating their original 
 75 | folder structure.
 76 |  
 77 | ### Phase 6 - Removing empty folders (Optional)
 78 | 
 79 | In the last phase, folders that are empty due to the deduplication 
 80 | process are deleted, cleaning up the directory structure (if turned on in configuration).
 81 | 
 82 | # How to use
 83 | 
 84 | ## Install
 85 | 
 86 | Install **py-image-dedup** using pip:
 87 | 
 88 | ```shell
 89 | pip3 install py-image-dedup
 90 | ```
 91 | 
 92 | ## Configuration
 93 | 
 94 | **py-image-dedup** uses [container-app-conf](https://github.com/markusressel/container-app-conf)
 95 | to provide configuration via a YAML file as well as ENV variables. It also
 96 | generates a reference config on startup. Have a look at the
 97 | [documentation about it](https://github.com/markusressel/container-app-conf#generate-reference-config).
 98 | 
 99 | See [py_image_dedup_reference.yaml](/py_image_dedup_reference.yaml)
100 | for an example in this repo.
101 | 
102 | ## Setup elasticsearch backend
103 | 
104 | Since this library is based on [Image-Match](https://github.com/ascribe/image-match) 
105 | you need a running elasticsearch instance for efficient storing and 
106 | querying of image signatures.
107 | 
108 | ### Elasticsearch version
109 | 
110 | This library requires elasticsearch version 5 or later. Sadly the
111 | [Image-Match](https://github.com/ascribe/image-match) library still 
112 | specifies version 2, so [a fork of the original project](https://github.com/markusressel/image-match)
113 |  is used instead. This fork is maintained by me, and any contributions
114 |  are very much appreciated.
115 | 
116 | ### Set up the index
117 | 
118 | **py-image-dedup** uses a single index (called `images` by default).
119 | When configured, this index will be created automatically for you. 
120 | 
121 | ## Command line usage
122 | 
123 | **py-image-dedup** can be used from the command line like this:
124 | 
125 | ```shell
126 | py-image-dedup deduplicate --help
127 | ```
128 | 
129 | Have a look at the help output to see how you can customize it.
130 | 
131 | ### Daemon
132 | 
133 | **CAUTION!** This feature is still very much a work in progress. 
134 | **Always** have a backup of your data! 
135 | 
136 | **py-image-dedup** has a built in daemon that allows you to continuously
137 | monitor your source directories and deduplicate them on the fly.
138 | 
139 | When running the daemon (and enabled in configuration) a prometheus reporter
140 | is used to allow you to gather some statistical insights.
141 | 
142 | ```shell
143 | py-image-dedup daemon
144 | ```
145 | 
146 | ## Dry run
147 | 
148 | To analyze images and get an overview of which images would be deleted,
149 | be sure to make a dry run first.
150 | 
151 | ```shell
152 | py-image-dedup deduplicate --dry-run
153 | ```
154 | 
155 | 
156 | ## FreeBSD
157 | 
158 | If you want to run this on a FreeBSD host make sure you have an up
159 | to date release that is able to install ports.
160 | 
161 | Since [Image-Match](https://github.com/ascribe/image-match) does a lot of
162 | math it relies on `numpy` and `scipy`. To get those working on FreeBSD
163 | you have to install them as a port:
164 | 
165 | ```shell
166 | pkg install pkgconf
167 | pkg install py38-numpy
168 | pkg install py27-scipy
169 | ```
170 | 
171 | For `.png` support you also need to install:
172 | ```shell
173 | pkg install png
174 | ```
175 | 
176 | I still ran into issues after installing all these and just threw those
177 | two in the mix and it finally worked:
178 | ```shell
179 | pkg install freetype
180 | pkg install py27-matplotlib  # this has a LOT of dependencies
181 | ```
182 | 
183 | ### Encoding issues
184 | 
185 | When using the python library `click` on FreeBSD you might run into
186 | encoding issues. To mitigate this, change your locale from `ANSI` to `UTF-8`
187 | if possible.
188 | 
189 | This can be achieved, for example, by creating a file `~/.login_conf` with the following content:
190 | ```text
191 | me:\
192 | 	:charset=UTF-8:\
193 | 	:lang=de_DE.UTF-8:
194 | ```
195 | 
196 | ## Docker
197 | 
198 | To run **py-image-dedup** using docker you can use the [markusressel/py-image-dedup](https://hub.docker.com/r/markusressel/py-image-dedup) 
199 | image from DockerHub:
200 | 
201 | ```
202 | sudo docker run -t \
203 |     -p 8000:8000 \
204 |     -v /where/the/original/photolibrary/is/located:/data/in \
205 |     -v /where/duplicates/should/be/moved/to:/data/out \
206 |     -e PY_IMAGE_DEDUP_DRY_RUN=False \
207 |     -e PY_IMAGE_DEDUP_ANALYSIS_SOURCE_DIRECTORIES=/data/in/ \
208 |     -e PY_IMAGE_DEDUP_ANALYSIS_RECURSIVE=True \
209 |     -e PY_IMAGE_DEDUP_ANALYSIS_ACROSS_DIRS=True \
210 |     -e PY_IMAGE_DEDUP_ANALYSIS_FILE_EXTENSIONS=.png,.jpg,.jpeg \
211 |     -e PY_IMAGE_DEDUP_ANALYSIS_THREADS=8 \
212 |     -e PY_IMAGE_DEDUP_ANALYSIS_USE_EXIF_DATA=True \
213 |     -e PY_IMAGE_DEDUP_DEDUPLICATION_DUPLICATES_TARGET_DIRECTORY=/data/out/ \
215 |     -e PY_IMAGE_DEDUP_ELASTICSEARCH_HOST=elasticsearch \
216 |     -e PY_IMAGE_DEDUP_ELASTICSEARCH_PORT=9200 \
217 |     -e PY_IMAGE_DEDUP_ELASTICSEARCH_INDEX=images \
218 |     -e PY_IMAGE_DEDUP_ELASTICSEARCH_AUTO_CREATE_INDEX=True \
219 |     -e PY_IMAGE_DEDUP_ELASTICSEARCH_MAX_DISTANCE=0.1 \
220 |     -e PY_IMAGE_DEDUP_REMOVE_EMPTY_FOLDERS=False \
221 |     -e PY_IMAGE_DEDUP_STATS_ENABLED=True \
222 |     -e PY_IMAGE_DEDUP_STATS_PORT=8000 \
223 |     markusressel/py-image-dedup:latest
224 | ```
225 | 
226 | Since an elasticsearch instance is required as well, you can
227 | also use the `docker-compose.yml` file included in this repo, which will
228 | set up a single-node elasticsearch cluster for you:
229 | 
230 | ```shell script
231 | sudo docker-compose up
232 | ```
233 | 
234 | ### UID and GID
235 | 
236 | To run **py-image-dedup** inside the container using a specific user id 
237 | and group id you can use the env variables `PUID=1000` and `PGID=1000`.
238 | 
239 | # Contributing
240 | 
241 | GitHub is for social coding: if you want to write code, I encourage contributions through pull requests from forks
242 | of this repository. Create GitHub tickets for bugs and new features and comment on the ones that you are interested in.
243 | 
244 | # License
245 | 
246 | ```text
247 | py-image-dedup by Markus Ressel
248 | Copyright (C) 2018  Markus Ressel
249 | 
250 | This program is free software: you can redistribute it and/or modify
251 | it under the terms of the GNU General Public License as published by
252 | the Free Software Foundation, either version 3 of the License, or
253 | (at your option) any later version.
254 | 
255 | This program is distributed in the hope that it will be useful,
256 | but WITHOUT ANY WARRANTY; without even the implied warranty of
257 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
258 | GNU General Public License for more details.
259 | 
260 | You should have received a copy of the GNU General Public License
261 | along with this program.  If not, see <http://www.gnu.org/licenses/>.
262 | ```
263 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3.7'
 2 | 
 3 | services:
 4 |   elasticsearch:
 5 |     image: docker.elastic.co/elasticsearch/elasticsearch:7.9.2
 6 |     ports:
 7 |       - "9200:9200"
 8 |       - "9300:9300"
 9 |     environment:
10 |       - discovery.type=single-node
11 |     networks:
12 |       - docker-elk
13 |     restart: on-failure
14 |   py-image-dedup:
15 |     # build: .
16 |     image: markusressel/py-image-dedup:latest
17 |     environment:
18 |       - PUID=1000
19 |       - PGID=1000
20 |       # change configuration to your liking
21 |       - PY_IMAGE_DEDUP_DRY_RUN=True
22 |       - PY_IMAGE_DEDUP_ANALYSIS_SOURCE_DIRECTORIES=/mnt/source/
23 |       - PY_IMAGE_DEDUP_ANALYSIS_RECURSIVE=True
24 |       - PY_IMAGE_DEDUP_ANALYSIS_ACROSS_DIRS=True
25 |       - PY_IMAGE_DEDUP_ANALYSIS_FILE_EXTENSIONS=.png,.jpg,.jpeg
26 |       - PY_IMAGE_DEDUP_ANALYSIS_THREADS=8
27 |       - PY_IMAGE_DEDUP_ANALYSIS_USE_EXIF_DATA=True
28 |       - PY_IMAGE_DEDUP_DEDUPLICATION_DUPLICATES_TARGET_DIRECTORY=/mnt/duplicates/
29 |       - PY_IMAGE_DEDUP_ELASTICSEARCH_HOST=elasticsearch
30 |       - PY_IMAGE_DEDUP_ELASTICSEARCH_PORT=9200
31 |       - PY_IMAGE_DEDUP_ELASTICSEARCH_INDEX=images
32 |       - PY_IMAGE_DEDUP_ELASTICSEARCH_AUTO_CREATE_INDEX=True
33 |       - PY_IMAGE_DEDUP_ELASTICSEARCH_MAX_DISTANCE=0.1
34 |       - PY_IMAGE_DEDUP_REMOVE_EMPTY_FOLDERS=False
35 |       - PY_IMAGE_DEDUP_STATS_ENABLED=True
36 |       - PY_IMAGE_DEDUP_STATS_PORT=8000
37 |     volumes:
38 |       # optionally mount a YAML configuration file
39 |       # into /app/py_image_dedup.yaml instead of using environment:
40 |       # - /mnt/data3/py_image_dedup.yaml:/app/py_image_dedup.yaml
41 |       # change this to your local source directory:
42 |       - /mnt/data3/py-image-dedup_testdata:/mnt/source
43 |       # change this to your local duplicates directory:
44 |       - /mnt/data3/py-image-dedup_duplicates:/mnt/duplicates
45 |     links:
46 |       - elasticsearch
47 |     networks:
48 |       - docker-elk
49 |     ports:
50 |       - "8000:8000"
51 |     depends_on:
52 |       - elasticsearch
53 |     restart: on-failure
54 | networks:
55 |   docker-elk:
56 |     driver: bridge


--------------------------------------------------------------------------------
/docker/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | set -eu
4 | sudo -E -u "#${PUID}" -g "#${PGID}" "$@"
5 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
 1 | from py_image_dedup.config import DeduplicatorConfig
 2 | from py_image_dedup.library.deduplicator import ImageMatchDeduplicator
 3 | 
# Example script: configures py-image-dedup programmatically (instead of via
# YAML file or ENV variables) and runs a deduplication as a dry run.

config = DeduplicatorConfig()
config.DRY_RUN.value = True
# config.ELASTICSEARCH_HOST.value = "192.168.2.24"
config.SOURCE_DIRECTORIES.value = [
    # r'/home/markus/py-image-dedup/dir1/',
    # r'/home/markus/py-image-dedup/dir2/'
    # r'/mnt/data/py-dedup-test/Syncthing/',
    # r'/mnt/sdb2/Sample/',
    r'./tests/images/'
]
config.SEARCH_ACROSS_ROOT_DIRS.value = True

config.ANALYSIS_THREADS.value = 8
config.ANALYSIS_USE_EXIF_DATA.value = False

config.ELASTICSEARCH_MAX_DISTANCE.value = 0.30
# config.MAX_FILE_MODIFICATION_TIME_DELTA.value = timedelta(minutes=5)
config.DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY.value = "./duplicates/"
config.REMOVE_EMPTY_FOLDERS.value = True

# NOTE(review): the config object is not passed in here; presumably the
# deduplicator reads the same (shared) DeduplicatorConfig state — confirm
deduplicator = ImageMatchDeduplicator()

# max_file_modification_time_diff=1 * 1000 * 60 * 5,

result = deduplicator.deduplicate_all(
    skip_analyze_phase=False,
)

# print the summary of the run
result.print_to_console()
33 | 


--------------------------------------------------------------------------------
/py_image_dedup/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | logging.basicConfig()
4 | 


--------------------------------------------------------------------------------
/py_image_dedup/cli.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | import click
 4 | 
 5 | from py_image_dedup.config import DeduplicatorConfig
 6 | from py_image_dedup.library.deduplicator import ImageMatchDeduplicator
 7 | from py_image_dedup.library.processing_manager import ProcessingManager
 8 | from py_image_dedup.util import echo
 9 | 
# NOTE(review): not referenced anywhere else in this module — possibly a leftover
IMAGE_HASH_MAP = {}

# keys used to look up the option names of a parameter in CMD_OPTION_NAMES
PARAM_SKIP_ANALYSE_PHASE = "skip-analyse-phase"
PARAM_DRY_RUN = "dry-run"

# long and short command line option names for each parameter
CMD_OPTION_NAMES = {
    PARAM_SKIP_ANALYSE_PHASE: ['--skip-analyse-phase', '-sap'],
    PARAM_DRY_RUN: ['--dry-run', '-dr']
}

# accept "-h" in addition to "--help" for the help option
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
21 | 
22 | 
@click.group(context_settings=CONTEXT_SETTINGS)
@click.version_option()
def cli():
    """
    Find and remove duplicate images in your image library.
    """
    # The docstring above is shown by click as the help text of the command group.
    # The group itself does nothing, all work is done by its sub-commands.
27 | 
28 | 
def get_option_names(parameter: str) -> list:
    """
    Looks up the command line option names registered for a parameter.

    :param parameter: key of the parameter in CMD_OPTION_NAMES
    :return: the list of option names that can be used for this parameter
    :raises KeyError: if there is no entry for the parameter in CMD_OPTION_NAMES
    """
    option_names = CMD_OPTION_NAMES[parameter]
    return option_names
36 | 
37 | 
@cli.command(name="analyse")
def c_analyse():
    """
    Run the image analysis phase.
    """
    # the docstring above is shown by click as the help text of this command
    deduplicator = ImageMatchDeduplicator(interactive=True)
    deduplicator.analyse_all()
42 | 
43 | 
@cli.command(name="deduplicate")
@click.option(*get_option_names(PARAM_SKIP_ANALYSE_PHASE), required=False, default=False, is_flag=True,
              help='When set the image analysis phase will be skipped. Useful if you already did a dry-run.')
@click.option(*get_option_names(PARAM_DRY_RUN), required=False, default=None, is_flag=True,
              help='When set no files or folders will actually be deleted but a preview of '
                   'what WOULD be done will be printed.')
def c_deduplicate(skip_analyse_phase: bool,
                  dry_run: bool):
    """
    Deduplicate images and print a summary of the result.
    """
    # the docstring above is shown by click as the help text of this command
    config = DeduplicatorConfig()
    # dry_run is None when the flag was not passed, in that case
    # the value from the configuration is left untouched
    if dry_run is not None:
        config.DRY_RUN.value = dry_run
    # NOTE(review): the config object is not passed in here; presumably the
    # deduplicator reads the same (shared) DeduplicatorConfig state — confirm
    deduplicator = ImageMatchDeduplicator(interactive=True)
    result = deduplicator.deduplicate_all(
        skip_analyze_phase=skip_analyse_phase,
    )

    echo()
    result.print_to_console()
62 | 
63 | 
@cli.command(name="daemon")
@click.option(*get_option_names(PARAM_DRY_RUN), required=False, default=None, is_flag=True,
              help='When set no files or folders will actually be deleted but a preview of '
                   'what WOULD be done will be printed.')
def c_daemon(dry_run: bool):
    """
    Run an initial deduplication, then keep processing in the background until interrupted.
    """
    # the docstring above is shown by click as the help text of this command
    echo("Starting daemon...")

    config: DeduplicatorConfig = DeduplicatorConfig()
    # dry_run is None when the flag was not passed, in that case
    # the value from the configuration is left untouched
    if dry_run is not None:
        config.DRY_RUN.value = dry_run

    if config.STATS_ENABLED.value:
        # imported lazily so prometheus is only loaded when stats are enabled
        from prometheus_client import start_http_server
        echo("Starting prometheus reporter...")
        start_http_server(config.STATS_PORT.value)

    deduplicator = ImageMatchDeduplicator(interactive=False)
    processing_manager = ProcessingManager(deduplicator)

    # one full run first, after that the processing manager takes over
    deduplicator.deduplicate_all()
    processing_manager.start()

    # keep the main thread alive until the user interrupts (Ctrl+C)
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        processing_manager.stop()
91 | 
92 | 
93 | if __name__ == '__main__':
94 |     cli()
95 | 


--------------------------------------------------------------------------------
/py_image_dedup/config.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from datetime import timedelta
  3 | 
  4 | from container_app_conf import ConfigBase
  5 | from container_app_conf.entry.bool import BoolConfigEntry
  6 | from container_app_conf.entry.dict import DictConfigEntry
  7 | from container_app_conf.entry.file import DirectoryConfigEntry
  8 | from container_app_conf.entry.float import FloatConfigEntry
  9 | from container_app_conf.entry.int import IntConfigEntry
 10 | from container_app_conf.entry.list import ListConfigEntry
 11 | from container_app_conf.entry.regex import RegexConfigEntry
 12 | from container_app_conf.entry.string import StringConfigEntry
 13 | from container_app_conf.entry.timedelta import TimeDeltaConfigEntry
 14 | from container_app_conf.source.env_source import EnvSource
 15 | from container_app_conf.source.yaml_source import YamlSource
 16 | from py_range_parse import Range
 17 | 
# Names of the nodes that make up the key paths of the config entries below.

# root node of all entries
NODE_MAIN = "py_image_dedup"

# daemon section
NODE_DAEMON = "daemon"
NODE_PROCESSING_TIMEOUT = "processing_timeout"

NODE_FILE_OBSERVER_TYPE = "file_observer"
# allowed values of the file observer type
FILE_OBSERVER_TYPE_POLLING = "polling"
FILE_OBSERVER_TYPE_INOTIFY = "inotify"

NODE_DRY_RUN = "dry_run"

# elasticsearch section
NODE_ELASTICSEARCH = "elasticsearch"

NODE_HOST = "host"
NODE_MAX_DISTANCE = "max_distance"
NODE_AUTO_CREATE_INDEX = "auto_create_index"
NODE_INDEX = "index"

# analysis section
NODE_ANALYSIS = "analysis"

NODE_EXCLUSIONS = "exclusions"
NODE_SOURCE_DIRECTORIES = "source_directories"
NODE_RECURSIVE = "recursive"
NODE_SEARCH_ACROSS_ROOT_DIRS = "across_dirs"
NODE_FILE_EXTENSIONS = "file_extensions"
NODE_USE_EXIF_DATA = "use_exif_data"
NODE_THREADS = "threads"

# deduplication section
NODE_DEDUPLICATION = "deduplication"

NODE_PRIORITIZATION_RULES = "prioritization_rules"
NODE_MAX_FILE_MODIFICATION_TIME_DIFF = "max_file_modification_time_diff"
NODE_REMOVE_EMPTY_FOLDERS = "remove_empty_folders"
NODE_DUPLICATES_TARGET_DIRECTORY = "duplicates_target_directory"

# stats section; NODE_PORT is also used for the elasticsearch port
NODE_STATS = "stats"
NODE_ENABLED = "enabled"
NODE_PORT = "port"
 56 | 
 57 | 
class DeduplicatorConfig(ConfigBase):
    """
    Configuration of py-image-dedup.

    Every class attribute is one config entry, addressed by its key path
    below the "py_image_dedup" root node. Values are read from
    ENV variables and a YAML file called "py_image_dedup".
    """

    def __new__(cls, *args, **kwargs):
        """
        Creates the config using an ENV source and a YAML source.
        """
        yaml_source = YamlSource("py_image_dedup")
        # NOTE(review): the order of this list presumably defines the priority
        # of the sources — confirm against the container-app-conf documentation
        data_sources = [
            EnvSource(),
            yaml_source
        ]
        return super(DeduplicatorConfig, cls).__new__(cls, *args, data_sources=data_sources, **kwargs)

    # dry run is enabled by default
    DRY_RUN = BoolConfigEntry(
        description="If enabled no source file will be touched",
        key_path=[
            NODE_MAIN,
            NODE_DRY_RUN
        ],
        default=True
    )

    ELASTICSEARCH_HOST = StringConfigEntry(
        description="Hostname of the elasticsearch backend instance to use.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_HOST
        ],
        default="127.0.0.1"
    )

    ELASTICSEARCH_PORT = IntConfigEntry(
        description="Port of the elasticsearch backend instance to use.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_PORT
        ],
        range=Range(1, 65535),
        default=9200
    )

    ELASTICSEARCH_MAX_DISTANCE = FloatConfigEntry(
        description="Maximum signature distance [0..1] to query from elasticsearch backend.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_MAX_DISTANCE
        ],
        default=0.10
    )

    ELASTICSEARCH_AUTO_CREATE_INDEX = BoolConfigEntry(
        description="Whether to automatically create an index in the target database.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_AUTO_CREATE_INDEX
        ],
        default=True
    )

    ELASTICSEARCH_INDEX = StringConfigEntry(
        description="The index name to use for storing and querying image analysis data.",
        key_path=[
            NODE_MAIN,
            NODE_ELASTICSEARCH,
            NODE_INDEX
        ],
        default="images"
    )

    ANALYSIS_USE_EXIF_DATA = BoolConfigEntry(
        description="Whether to scan for EXIF data or not.",
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_USE_EXIF_DATA
        ],
        default=True
    )

    # required and without a default, so it has to be set by the user
    SOURCE_DIRECTORIES = ListConfigEntry(
        description="Comma separated list of source paths to analyse and deduplicate.",
        item_type=DirectoryConfigEntry,
        item_args={
            "check_existence": True
        },
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_SOURCE_DIRECTORIES
        ],
        required=True,
        example=[
            "/home/myuser/pictures/"
        ]
    )

    RECURSIVE = BoolConfigEntry(
        description="When set all directories will be recursively analyzed.",
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_RECURSIVE
        ],
        default=True
    )

    SEARCH_ACROSS_ROOT_DIRS = BoolConfigEntry(
        description="When set duplicates will be found even if they are located in different root directories.",
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_SEARCH_ACROSS_ROOT_DIRS
        ],
        default=False
    )

    FILE_EXTENSION_FILTER = ListConfigEntry(
        description="Comma separated list of file extensions.",
        item_type=StringConfigEntry,
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_FILE_EXTENSIONS
        ],
        required=True,
        default=[
            ".png",
            ".jpg",
            ".jpeg"
        ]
    )

    EXCLUSIONS = ListConfigEntry(
        description="Comma separated list of regular expression filters.",
        item_type=RegexConfigEntry,
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_EXCLUSIONS
        ],
        default=[]
    )

    # the default is evaluated once, when this module is imported
    ANALYSIS_THREADS = IntConfigEntry(
        description="Number of threads to use for image analysis phase.",
        key_path=[
            NODE_MAIN,
            NODE_ANALYSIS,
            NODE_THREADS
        ],
        default=os.cpu_count()
    )

    MAX_FILE_MODIFICATION_TIME_DELTA = TimeDeltaConfigEntry(
        description="Maximum file modification date difference between multiple "
                    "duplicates to be considered the same image",
        key_path=[
            NODE_MAIN,
            NODE_DEDUPLICATION,
            NODE_MAX_FILE_MODIFICATION_TIME_DIFF
        ],
        default=None,
        example=timedelta(minutes=5)
    )

    # the default rules match the order documented in the README
    PRIORITIZATION_RULES = ListConfigEntry(
        description="Comma separated list of prioritization rules to use for ordering duplicate "
                    "images before proceeding with the deduplication process.",
        item_type=DictConfigEntry,
        key_path=[
            NODE_MAIN,
            NODE_DEDUPLICATION,
            NODE_PRIORITIZATION_RULES
        ],
        required=False,
        default=[
            {"name": "higher-pixel-count"},
            {"name": "more-exif-data"},
            {"name": "bigger-file-size"},
            {"name": "newer-file-modification-date"},
            {"name": "smaller-distance"},
            {"name": "doesnt-contain-copy-in-file-name"},
            {"name": "longer-file-name"},
            {"name": "shorter-folder-path"},
            {"name": "higher-score"},
        ]
    )

    # NOTE: located directly below the root node, not in the deduplication section
    REMOVE_EMPTY_FOLDERS = BoolConfigEntry(
        description="Whether to remove empty folders or not.",
        key_path=[
            NODE_MAIN,
            NODE_REMOVE_EMPTY_FOLDERS
        ],
        default=False
    )

    DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY = DirectoryConfigEntry(
        description="Directory path to move duplicates to instead of deleting them.",
        key_path=[
            NODE_MAIN,
            NODE_DEDUPLICATION,
            NODE_DUPLICATES_TARGET_DIRECTORY
        ],
        check_existence=True,
        default=None,
        example="/home/myuser/pictures/duplicates/"
    )

    # NOTE(review): the default is given as a string, presumably it is
    # parsed into a timedelta by TimeDeltaConfigEntry — confirm
    DAEMON_PROCESSING_TIMEOUT = TimeDeltaConfigEntry(
        description="Time to wait for filesystems changes to settle before analysing.",
        key_path=[
            NODE_MAIN,
            NODE_DAEMON,
            NODE_PROCESSING_TIMEOUT
        ],
        default="30s"
    )

    # the regex only allows the two known observer types
    DAEMON_FILE_OBSERVER_TYPE = StringConfigEntry(
        description="Type of file observer to use.",
        key_path=[
            NODE_MAIN,
            NODE_DAEMON,
            NODE_FILE_OBSERVER_TYPE
        ],
        regex="|".join([FILE_OBSERVER_TYPE_POLLING, FILE_OBSERVER_TYPE_INOTIFY]),
        default=FILE_OBSERVER_TYPE_POLLING,
        required=True
    )

    STATS_ENABLED = BoolConfigEntry(
        description="Whether to enable prometheus statistics or not.",
        key_path=[
            NODE_MAIN,
            NODE_STATS,
            NODE_ENABLED
        ],
        default=True
    )

    # NOTE(review): unlike ELASTICSEARCH_PORT this entry has no range restriction
    STATS_PORT = IntConfigEntry(
        description="The port to expose statistics on.",
        key_path=[
            NODE_MAIN,
            NODE_STATS,
            NODE_PORT
        ],
        default=8000
    )
309 | 


--------------------------------------------------------------------------------
/py_image_dedup/library/__init__.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import threading
 3 | 
 4 | LOGGER = logging.getLogger(__name__)
 5 | LOGGER.setLevel(logging.DEBUG)
 6 | 
 7 | 
 8 | class Action:
 9 |     def __init__(self, name, color, ):
10 |         self.name = name
11 |         self.color = color
12 | 
13 | 
class ActionEnum:
    """
    The actions that can be assigned to a file.
    This is a plain class holding Action instances, not an enum.Enum.
    """
    # the file is left untouched
    NONE = Action("-", "green")
    DELETE = Action("delete", "red")
    MOVE = Action("move", "yellow")
18 | 
19 | 
class RegularIntervalWorker:
    """
    Base class for a worker that executes a specific task in a regular interval.

    Subclasses implement the task in _run(). The interval is the time
    between the end of one run and the start of the next one.
    """

    def __init__(self, interval: float):
        """
        :param interval: time in seconds to wait before each run
        """
        self._interval = interval
        # timer of the next scheduled run, None while the worker is stopped
        self._timer = None

    def start(self):
        """
        Starts the worker
        """
        if self._timer is None:
            LOGGER.debug(f"Starting worker: {self.__class__.__name__}")
            self._schedule_next_run()
        else:
            LOGGER.debug("Already running, ignoring start() call")

    def stop(self):
        """
        Stops the worker
        """
        if self._timer is not None:
            self._timer.cancel()
        self._timer = None

    def _schedule_next_run(self):
        """
        Schedules the next run
        """
        if self._timer is not None:
            self._timer.cancel()
        self._timer = threading.Timer(self._interval, self._worker_job)
        self._timer.start()

    def _worker_job(self):
        """
        Timer callback: executes the task once and schedules the next run.
        Exceptions raised by the task are logged and do not stop the worker.
        """
        try:
            self._run()
        except Exception as e:
            LOGGER.error(e, exc_info=True)
        finally:
            # stop() resets the timer to None. If that happened while the task
            # was running, scheduling another run would silently restart the worker.
            if self._timer is not None:
                self._schedule_next_run()

    def _run(self):
        """
        The regularly executed task. Override this method.
        """
        raise NotImplementedError()
72 | 


--------------------------------------------------------------------------------
/py_image_dedup/library/deduplication_result.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from typing import List
  3 | 
  4 | import click
  5 | from tabulate import tabulate
  6 | 
  7 | from py_image_dedup.library import ActionEnum
  8 | from py_image_dedup.persistence import MetadataKey
  9 | from py_image_dedup.util import echo
 10 | 
 11 | BYTE_IN_A_MB = 1048576
 12 | 
 13 | 
 14 | class DeduplicationResult:
 15 |     def __init__(self):
 16 |         self.item_actions = {}
 17 |         self._removed_folders = set()
 18 |         self._reference_files = {}
 19 |         self._file_duplicates = {}
 20 | 
 21 |     def add_file_action(self, file_path: Path, action: ActionEnum):
 22 |         if file_path in self.item_actions and self.item_actions[file_path] != action:
 23 |             raise ValueError("File path already in result "
 24 |                              "but with different action: {}, {}, {}".format(file_path,
 25 |                                                                             self.item_actions[file_path],
 26 |                                                                             action))
 27 |         self.item_actions[file_path] = action
 28 | 
 29 |     def get_file_with_action(self, action: ActionEnum) -> []:
 30 |         return list({k: v for k, v in self.item_actions.items() if v == action}.keys())
 31 | 
 32 |     def get_duplicate_count(self) -> int:
 33 |         """
 34 |         :return: amount of files that have at least one duplicate
 35 |         """
 36 |         count = 0
 37 |         for key, value in self._file_duplicates.items():
 38 |             if len(value) > 0:
 39 |                 count += 1
 40 | 
 41 |         return count
 42 | 
 43 |     def get_removed_or_moved_files(self):
 44 |         return self.get_file_with_action(ActionEnum.MOVE) + self.get_file_with_action(ActionEnum.DELETE)
 45 | 
    def get_removed_empty_folders(self) -> set:
        """
        :return: the set of empty folders that have been deleted
                 (the internal set itself, not a copy)
        """
        return self._removed_folders
 51 | 
    def add_removed_empty_folder(self, folder: Path):
        """
        Adds a folder to the set of removed empty folders.
        Adding the same folder more than once has no further effect.
        :param folder: the folder to add
        """
        self._removed_folders.add(folder)
 58 | 
 59 |     def set_file_duplicates(self, reference_files: List[dict], duplicate_files: []):
 60 |         """
 61 |         Set a list of files that are duplicates of the reference file
 62 |         :param reference_files: the file that is used as a baseline
 63 |         :param duplicate_files: duplicates of the reference_file
 64 |         """
 65 |         reference_file = reference_files[0]
 66 |         reference_file_path = Path(reference_file[MetadataKey.PATH.value])
 67 |         self._reference_files[reference_file_path] = reference_file
 68 |         self._file_duplicates[reference_file_path] = reference_files[1:] + duplicate_files
 69 | 
    def get_file_duplicates(self) -> dict:
        """
        :return: dictionary that maps the path of each reference file to the
                 list of its duplicates (the internal dictionary itself, not a copy)
        """
        return self._file_duplicates
 75 | 
 76 |     def print_to_console(self):
 77 |         title = "" * 7 + "Summary"
 78 |         echo(title, color='cyan')
 79 |         echo('=' * 21, color='cyan')
 80 |         echo(f"Files with duplicates: {self.get_duplicate_count()}")
 81 |         echo(f"Files moved: {len(self.get_file_with_action(ActionEnum.MOVE))}")
 82 |         echo(f"Files deleted: {len(self.get_file_with_action(ActionEnum.DELETE))}")
 83 | 
 84 |         headers = ("Action", "File path", "Dist", "Filesize", "Pixels")
 85 | 
 86 |         for reference_file_path, folder in self.get_file_duplicates().items():
 87 |             duplicate_count = len(folder)
 88 |             if duplicate_count > 0:
 89 |                 columns = []
 90 |                 echo()
 91 | 
 92 |                 for item in [self._reference_files[reference_file_path]] + folder:
 93 |                     file_path = Path(item[MetadataKey.PATH.value])
 94 |                     distance = item[MetadataKey.DISTANCE.value]
 95 |                     distance_rounded = round(distance, 3)
 96 |                     file_size = item[MetadataKey.METADATA.value][MetadataKey.FILE_SIZE.value]
 97 |                     file_size_mb = round(file_size / BYTE_IN_A_MB, 3)
 98 |                     pixel_count = item[MetadataKey.METADATA.value][MetadataKey.PIXELCOUNT.value]
 99 | 
100 |                     action = self.item_actions.get(file_path, ActionEnum.NONE)
101 |                     row = [
102 |                         action.name,
103 |                         file_path,
104 |                         distance_rounded,
105 |                         file_size_mb,
106 |                         pixel_count
107 |                     ]
108 | 
109 |                     # apply action style
110 |                     row = list(map(lambda x: str(click.style(str(x), action.color)), row))
111 |                     columns.append(row)
112 | 
113 |                 self._echo_table(
114 |                     tabulate(columns, headers=headers, colalign=['center', 'left', 'left', 'right', 'right']))
115 | 
116 |         echo()
117 |         echo(f"Removed (empty) folders ({len(self.get_removed_empty_folders())}):")
118 |         for folder in self.get_removed_empty_folders():
119 |             echo(f"{folder}", color='red')
120 | 
121 |     @staticmethod
122 |     def _echo_table(table: str):
123 |         lines = table.splitlines()
124 | 
125 |         for line in lines[:2]:
126 |             echo(line, color='cyan')
127 | 
128 |         for line in lines[2:]:
129 |             echo(line)
130 | 


--------------------------------------------------------------------------------
/py_image_dedup/library/deduplicator.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import filecmp
  3 | import logging
  4 | import os
  5 | import shutil
  6 | import sys
  7 | from concurrent.futures import ThreadPoolExecutor
  8 | from pathlib import Path
  9 | from typing import List
 10 | 
 11 | import click
 12 | from ordered_set import OrderedSet
 13 | 
 14 | from py_image_dedup import util
 15 | from py_image_dedup.config import DeduplicatorConfig
 16 | from py_image_dedup.library import ActionEnum
 17 | from py_image_dedup.library.deduplication_result import DeduplicationResult
 18 | from py_image_dedup.library.progress_manager import ProgressManager
 19 | from py_image_dedup.persistence import ImageSignatureStore
 20 | from py_image_dedup.persistence.elasticsearchstorebackend import ElasticSearchStoreBackend
 21 | from py_image_dedup.persistence.metadata_key import MetadataKey
 22 | from py_image_dedup.stats import DUPLICATE_ACTION_MOVE_COUNT, DUPLICATE_ACTION_DELETE_COUNT, ANALYSIS_TIME, \
 23 |     FIND_DUPLICATES_TIME
 24 | from py_image_dedup.util import file, echo
 25 | from py_image_dedup.util.file import get_files_count, file_has_extension
 26 | 
# logger of this module, configured to let DEBUG (and higher) records pass
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)
 29 | 
 30 | 
 31 | class ImageMatchDeduplicator:
    # default executor; __walk_directory_files binds a fresh pool to self.EXECUTOR on every call
    EXECUTOR = ThreadPoolExecutor()

    # configuration, assigned in __init__
    _config: DeduplicatorConfig
    # progress reporting helper, assigned in __init__
    _progress_manager: ProgressManager

    # files that were already handled while searching for duplicates, replaced by reset_result()
    _processed_files: dict = {}
    # result of the current run, created by reset_result()
    _deduplication_result: DeduplicationResult = None
 39 | 
 40 |     def __init__(self, interactive: bool):
 41 |         """
 42 | 
 43 |         :param interactive: whether cli output should be interactive or not
 44 |         """
 45 |         self.interactive = interactive
 46 | 
 47 |         self._progress_manager = ProgressManager()
 48 |         self._config = DeduplicatorConfig()
 49 |         self._persistence: ImageSignatureStore = ElasticSearchStoreBackend(
 50 |             host=self._config.ELASTICSEARCH_HOST.value,
 51 |             port=self._config.ELASTICSEARCH_PORT.value,
 52 |             connections_per_node=self._config.ANALYSIS_THREADS.value,
 53 |             el_index=self._config.ELASTICSEARCH_INDEX.value,
 54 |             use_exif_data=self._config.ANALYSIS_USE_EXIF_DATA.value,
 55 |             max_dist=self._config.ELASTICSEARCH_MAX_DISTANCE.value,
 56 |             setup_database=self._config.ELASTICSEARCH_AUTO_CREATE_INDEX.value
 57 |         )
 58 | 
 59 |     def reset_result(self):
 60 |         self._deduplication_result = DeduplicationResult()
 61 |         self._processed_files = {}
 62 | 
 63 |     def analyse_all(self):
 64 |         """
 65 |         Runs the analysis phase independently.
 66 |         """
 67 |         directories = self._config.SOURCE_DIRECTORIES.value
 68 | 
 69 |         echo("Phase 1/2: Counting files ...", color='cyan')
 70 |         directory_map = self._count_files(directories)
 71 | 
 72 |         echo("Phase 2/2: Analyzing files ...", color='cyan')
 73 |         self.analyze_directories(directory_map)
 74 | 
 75 |     def deduplicate_all(self, skip_analyze_phase: bool = False) -> DeduplicationResult:
 76 |         """
 77 |         Runs the full 6 deduplication phases.
 78 |         :param skip_analyze_phase: useful if you already did a dry run and want to do a real run afterwards
 79 |         :return: result of the operation
 80 |         """
 81 |         # see: https://stackoverflow.com/questions/14861891/runtimewarning-invalid-value-encountered-in-divide
 82 |         # and: https://stackoverflow.com/questions/29347987/why-cant-i-suppress-numpy-warnings
 83 |         import warnings
 84 |         warnings.filterwarnings('ignore')
 85 | 
 86 |         directories = self._config.SOURCE_DIRECTORIES.value
 87 |         if len(directories) <= 0:
 88 |             raise ValueError("No root directories to scan")
 89 | 
 90 |         if self._config.DRY_RUN.value:
 91 |             echo("==> DRY RUN! No files or folders will actually be deleted! <==", color='yellow')
 92 | 
 93 |         echo("Phase 1/6: Cleaning up database ...", color='cyan')
 94 |         self.cleanup_database(directories)
 95 | 
 96 |         echo("Phase 2/6: Counting files ...", color='cyan')
 97 |         directory_map = self._count_files(directories)
 98 | 
 99 |         phase_3_text = "Phase 3/6: Analyzing files"
100 |         if skip_analyze_phase:
101 |             echo(phase_3_text + " - Skipping", color='yellow')
102 |         else:
103 |             echo(phase_3_text, color='cyan')
104 |             self.analyze_directories(directory_map)
105 | 
106 |         echo("Phase 4/6: Finding duplicate files ...", color='cyan')
107 |         self.find_duplicates_in_directories(directory_map)
108 | 
109 |         # Phase 5/6: Move or Delete duplicate files
110 |         self.process_duplicates()
111 | 
112 |         self.remove_empty_folders()
113 | 
114 |         return self._deduplication_result
115 | 
116 |     def analyze_directories(self, directory_map: dict):
117 |         """
118 |         Analyzes all files, generates identifiers (if necessary) and stores them for later access
119 |         """
120 |         threads = self._config.ANALYSIS_THREADS.value
121 | 
122 |         # load truncated images too
123 |         # TODO: this causes an infinite loop on some (truncated) images
124 |         # ImageFile.LOAD_TRUNCATED_IMAGES = True
125 | 
126 |         for directory, file_count in directory_map.items():
127 |             self._progress_manager.start(f"Analyzing files in '{directory}'", file_count, "Files", self.interactive)
128 |             self.__walk_directory_files(
129 |                 root_directory=directory,
130 |                 threads=threads,
131 |                 command=lambda root_dir, file_dir, file_path: self.analyze_file(file_path))
132 |             self._progress_manager.clear()
133 | 
134 |     def find_duplicates_in_directories(self, directory_map: dict):
135 |         """
136 |         Finds duplicates in the given directories
137 |         :param directory_map: map of directory path -> file count
138 |         """
139 |         self.reset_result()
140 | 
141 |         for directory, file_count in directory_map.items():
142 |             self._progress_manager.start(f"Finding duplicates in '{directory}' ...", file_count, "Files",
143 |                                          self.interactive)
144 |             self.__walk_directory_files(
145 |                 root_directory=directory,
146 |                 threads=1,  # there seems to be no performance advantage in using multiple threads here
147 |                 command=lambda root_dir, _, file_path: self.find_duplicates_of_file(
148 |                     root_directories=self._config.SOURCE_DIRECTORIES.value,
149 |                     root_directory=root_dir,
150 |                     reference_file_path=file_path
151 |                 )
152 |             )
153 |             self._progress_manager.clear()
154 | 
155 |     def cleanup_database(self, directories: List[Path]):
156 |         """
157 |         Removes database entries of files that don't exist on disk.
158 |         Note that this cleanup will only consider files within one
159 |         of the root directories specified in constructor, as other file paths
160 |         might have been added on other machines.
161 |         :param directories: directories in this run
162 |         """
163 |         # TODO: This iterates through all db entries - even the ones we are ignoring.
164 |         # The db query should be improved to speed this up
165 | 
166 |         count, entries = self._persistence.get_all()
167 |         if count <= 0:
168 |             return
169 | 
170 |         self._progress_manager.start(f"Cleanup database", count, "entries", self.interactive)
171 |         for entry in entries:
172 |             try:
173 |                 image_entry = entry['_source']
174 |                 metadata = image_entry.get(MetadataKey.METADATA.value, {})
175 | 
176 |                 file_path = Path(image_entry[MetadataKey.PATH.value])
177 |                 self._progress_manager.set_postfix(self._truncate_middle(str(file_path)))
178 | 
179 |                 if MetadataKey.DATAMODEL_VERSION.value not in metadata:
180 |                     echo(f"Removing db entry with missing db model version number: {file_path}")
181 |                     self._persistence.remove(str(file_path))
182 |                     continue
183 | 
184 |                 data_version = metadata.get(MetadataKey.DATAMODEL_VERSION.value, -1)
185 |                 if data_version != self._persistence.DATAMODEL_VERSION:
186 |                     echo(f"Removing db entry with old db model version: {file_path}")
187 |                     self._persistence.remove(str(file_path))
188 |                     continue
189 | 
190 |                 # filter by files in at least one of the specified root directories
191 |                 # this is necessary because the database might hold items for other paths already
192 |                 # and those are not interesting to us
193 |                 if not any(root_dir in file_path.parents for root_dir in directories):
194 |                     continue
195 | 
196 |                 if not file_path.exists():
197 |                     echo(f"Removing db entry for missing file: {file_path}")
198 |                     self._persistence.remove(str(file_path))
199 |             except Exception as e:
200 |                 logging.exception(e)
201 |                 echo(f"Error while cleaning up database entry {entry}: {e}")
202 |                 try:
203 |                     image_entry = entry['_source']
204 |                     file_path = Path(image_entry[MetadataKey.PATH.value])
205 |                     self._persistence.remove(str(file_path))
206 |                 except Exception as e:
207 |                     logging.exception(e)
208 |                     echo(f"Error removing db entry: {e}")
209 |             finally:
210 |                 self._progress_manager.inc()
211 |         self._progress_manager.clear()
212 | 
213 |     def _remove_empty_folders(self, directories: List[Path], recursive: bool):
214 |         """
215 |         Searches for empty folders and removes them
216 |         :param directories: directories to scan
217 |         """
218 |         dry_run = self._config.DRY_RUN.value
219 | 
220 |         # remove empty folders
221 |         for directory in directories:
222 |             empty_folders = self._find_empty_folders(directory, recursive, dry_run)
223 |             self._remove_folders(directory, empty_folders, dry_run)
224 | 
225 |     def _count_files(self, directories: List[Path]) -> dict:
226 |         """
227 |         Counts the amount of files to analyze (used in progress) and stores them in a map
228 |         :return map "directory path" -> "directory file count"
229 |         """
230 |         directory_map = {}
231 | 
232 |         self._progress_manager.start(f"Counting files", len(directories), "Dirs", self.interactive)
233 |         for directory in directories:
234 |             self._progress_manager.set_postfix(self._truncate_middle(directory))
235 | 
236 |             file_count = get_files_count(
237 |                 directory,
238 |                 self._config.RECURSIVE.value,
239 |                 self._config.FILE_EXTENSION_FILTER.value,
240 |                 self._config.EXCLUSIONS.value
241 |             )
242 |             directory_map[directory] = file_count
243 | 
244 |             self._progress_manager.inc()
245 |         self._progress_manager.clear()
246 | 
247 |         return directory_map
248 | 
249 |     def __walk_directory_files(self, root_directory: Path, command, threads: int):
250 |         """
251 |         Walks through the files of the given directory
252 |         :param root_directory: the directory to start with
253 |         :param command: the method to execute for every file found
254 |         :return: file_path -> identifier
255 |         """
256 |         with ThreadPoolExecutor(max_workers=threads, thread_name_prefix="py-image-dedup-walker") as self.EXECUTOR:
257 |             for (root, dirs, files) in os.walk(str(root_directory)):
258 |                 # root is the place you're listing
259 |                 # dirs is a list of directories directly under root
260 |                 # files is a list of files directly under root
261 |                 root = Path(root)
262 | 
263 |                 for file in files:
264 |                     file_path = Path(root, file)
265 | 
266 |                     # skip file in exclusion
267 |                     if any(list(map(lambda x: x.search(str(file_path.absolute())), self._config.EXCLUSIONS.value))):
268 |                         continue
269 | 
270 |                     # skip file with unwanted file extension
271 |                     if not file_has_extension(file_path, self._config.FILE_EXTENSION_FILTER.value):
272 |                         continue
273 | 
274 |                     # skip if not existent (probably already deleted)
275 |                     if not file_path.exists():
276 |                         self._progress_manager.inc()
277 |                         continue
278 | 
279 |                     try:
280 |                         self.EXECUTOR.submit(util.reraise_with_stack(command), root_directory, root, file_path)
281 |                     except Exception as e:
282 |                         click.echo(e, err=True)
283 |                         sys.exit(1)
284 | 
285 |                 if not self._config.RECURSIVE.value:
286 |                     return
287 | 
288 |     @ANALYSIS_TIME.time()
289 |     def analyze_file(self, file_path: Path):
290 |         """
291 |         Analyzes a single file
292 |         :param file_path: the file path
293 |         """
294 |         self._progress_manager.set_postfix(self._truncate_middle(file_path))
295 | 
296 |         try:
297 |             self._persistence.add(str(file_path))
298 |         except Exception as e:
299 |             logging.exception(e)
300 |             echo(f"Error analyzing file '{file_path}': {e}")
301 |         finally:
302 |             self._progress_manager.inc()
303 | 
304 |     @FIND_DUPLICATES_TIME.time()
305 |     def find_duplicates_of_file(self, root_directories: List[Path], root_directory: Path, reference_file_path: Path):
306 |         """
307 |         Finds duplicates and marks all but the best copy as "to-be-deleted".
308 |         :param root_directories: valid root directories
309 |         :param root_directory: root directory of reference_file_path
310 |         :param reference_file_path: the file to check for duplicates
311 |         """
312 |         self._progress_manager.inc()
313 |         self._progress_manager.set_postfix(self._truncate_middle(reference_file_path))
314 | 
315 |         # remember processed files to prevent processing files in multiple directions
316 |         if reference_file_path in self._processed_files:
317 |             # already found a better candidate for this file
318 |             return
319 | 
320 |         duplicate_candidates = self._persistence.find_similar(str(reference_file_path))
321 | 
322 |         if self._config.SEARCH_ACROSS_ROOT_DIRS.value:
323 |             # filter by files in at least one of the specified root directories
324 |             # this is necessary because the database might hold items for other paths already
325 |             # and those are not interesting to us
326 |             duplicate_candidates = [
327 |                 candidate for candidate in duplicate_candidates if
328 |                 any(root_dir in Path(candidate[MetadataKey.PATH.value]).parents for root_dir in root_directories)
329 |             ]
330 |         else:
331 |             # filter by files in the same root directory
332 |             duplicate_candidates = [
333 |                 candidate for candidate in duplicate_candidates if
334 |                 root_directory in Path(candidate[MetadataKey.PATH.value]).parents
335 |             ]
336 | 
337 |         if len(duplicate_candidates) <= 0:
338 |             echo(f"No duplication candidates found in database for '{reference_file_path}'. "
339 |                  "This is an indication that the file has not been analysed yet or "
340 |                  "there was an issue analysing it.",
341 |                  color='yellow')
342 | 
343 |         if len(duplicate_candidates) <= 1:
344 |             for candidate in duplicate_candidates:
345 |                 candidate_path = Path(candidate[MetadataKey.PATH.value])
346 | 
347 |                 if candidate_path != reference_file_path:
348 |                     echo(f"Unexpected unique duplication candidate '{candidate_path}' for "
349 |                          f"reference file '{reference_file_path}'", color='yellow')
350 | 
351 |                 self._processed_files[candidate_path] = True
352 | 
353 |             # nothing to do here since the result is unique
354 |             return
355 | 
356 |         # sort by quality criteria and redo the search to use the best candidate as the reference image
357 |         sorted_duplicate_candidates = self._sort_by_quality_descending(duplicate_candidates)
358 |         new_reference_file_path = sorted_duplicate_candidates[0][MetadataKey.PATH.value]
359 |         duplicate_candidates = self._persistence.find_similar(new_reference_file_path)
360 | 
361 |         candidates_to_keep, candidates_to_delete = self._select_images_to_delete(duplicate_candidates)
362 |         self._save_duplicates_for_result(candidates_to_keep, candidates_to_delete)
363 | 
364 |     def _save_duplicates_for_result(self, files_to_keep: List[dict], duplicates: List[dict]) -> None:
365 |         """
366 |         Saves the comparison result for the final summary
367 | 
368 |         :param files_to_keep: list of image that shall be kept
369 |         :param duplicates: less good duplicates
370 |         """
371 |         self._deduplication_result.set_file_duplicates(files_to_keep, duplicates)
372 | 
373 |         for file_to_keep in files_to_keep:
374 |             file_path = Path(file_to_keep[MetadataKey.PATH.value])
375 |             self._deduplication_result.add_file_action(file_path, ActionEnum.NONE)
376 | 
377 |         if self._config.DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY.value is None:
378 |             action = ActionEnum.DELETE
379 |         else:
380 |             action = ActionEnum.MOVE
381 |         for duplicate in duplicates:
382 |             file_path = Path(duplicate[MetadataKey.PATH.value])
383 |             self._deduplication_result.add_file_action(file_path, action)
384 | 
385 |     def _select_images_to_delete(self, duplicate_candidates: [{}]) -> tuple:
386 |         """
387 |         Selects which image to keep and which to remove
388 |         :return: tuple (image to keep, list of images to remove)
389 |         """
390 |         duplicate_candidates = self._sort_by_quality_descending(duplicate_candidates)
391 | 
392 |         # keep first and mark others for removal
393 |         keep = [duplicate_candidates[0]]
394 |         dont_keep = duplicate_candidates[1:]
395 | 
396 |         # move files that don't fit criteria to "keep" list
397 |         max_mod_time_diff = self._config.MAX_FILE_MODIFICATION_TIME_DELTA.value
398 |         if max_mod_time_diff is not None:
399 |             # filter files that don't match max mod time diff criteria
400 |             best_candidate = keep[0]
401 |             best_match_mod_timestamp = best_candidate[MetadataKey.METADATA.value][
402 |                 MetadataKey.FILE_MODIFICATION_DATE.value]
403 | 
404 |             for c in dont_keep:
405 |                 c_timestamp = c[MetadataKey.METADATA.value][MetadataKey.FILE_MODIFICATION_DATE.value]
406 |                 timestamp_diff = abs(c_timestamp - best_match_mod_timestamp)
407 |                 difference = datetime.timedelta(seconds=timestamp_diff)
408 |                 if difference > max_mod_time_diff:
409 |                     keep.append(c)
410 |             dont_keep = list(filter(lambda x: x not in keep, dont_keep))
411 | 
412 |         # remember that we have processed these files
413 |         for candidate in duplicate_candidates:
414 |             self._processed_files[candidate[MetadataKey.PATH.value]] = True
415 | 
416 |         return keep, dont_keep
417 | 
418 |     @staticmethod
419 |     def _sort_by_quality_descending(duplicate_candidates) -> []:
420 |         """
421 |         Sorts images according to the desired priorities.
422 |         The first item in the list will be the most preferred one of all found duplicates.
423 | 
424 |         :param duplicate_candidates: the images to analyze
425 |         :return: duplicate candidates sorted by given criteria
426 |         """
427 | 
428 |         def sort_criteria(candidate: dict) -> ():
429 |             criteria = []
430 | 
431 |             for rule in DeduplicatorConfig.PRIORITIZATION_RULES.value:
432 |                 rule_name = rule.get("name")
433 |                 if rule_name == "more-exif-data":
434 |                     if MetadataKey.EXIF_DATA.value in candidate[MetadataKey.METADATA.value]:
435 |                         # more exif data is better
436 |                         criteria.append(len(candidate[MetadataKey.METADATA.value][MetadataKey.EXIF_DATA.value]) * -1)
437 |                 elif rule_name == "less-exif-data":
438 |                     if MetadataKey.EXIF_DATA.value in candidate[MetadataKey.METADATA.value]:
439 |                         # more exif data is better
440 |                         criteria.append(len(candidate[MetadataKey.METADATA.value][MetadataKey.EXIF_DATA.value]) * 1)
441 |                 elif rule_name == "bigger-file-size":
442 |                     # reverse, bigger is better
443 |                     criteria.append(candidate[MetadataKey.METADATA.value][MetadataKey.FILE_SIZE.value] * -1)
444 |                 elif rule_name == "smaller-file-size":
445 |                     # smaller is better
446 |                     criteria.append(candidate[MetadataKey.METADATA.value][MetadataKey.FILE_SIZE.value] * 1)
447 |                 elif rule_name == "newer-file-modification-date":
448 |                     # reverse, bigger (later time) is better
449 |                     criteria.append(
450 |                         candidate[MetadataKey.METADATA.value][MetadataKey.FILE_MODIFICATION_DATE.value] * -1)
451 |                 elif rule_name == "older-file-modification-date":
452 |                     # smaller (earlier time) is better
453 |                     criteria.append(
454 |                         candidate[MetadataKey.METADATA.value][MetadataKey.FILE_MODIFICATION_DATE.value] * 1)
455 |                 elif rule_name == "smaller-distance":
456 |                     # smaller distance is better
457 |                     criteria.append(candidate[MetadataKey.DISTANCE.value])
458 |                 elif rule_name == "bigger-distance":
459 |                     # bigger distance is better
460 |                     criteria.append(candidate[MetadataKey.DISTANCE.value] * -1)
461 |                 # elif rule_name == "longer-path":
462 |                 # elif rule_name == "shorter-path":
463 |                 elif rule_name == "contains-copy-in-file-name":
464 |                     # if the filename contains "copy" it is less good
465 |                     criteria.append("copy" in file.get_file_name(candidate[MetadataKey.PATH.value]).lower())
466 |                 elif rule_name == "doesnt-contain-copy-in-file-name":
467 |                     # if the filename contains "copy" it is better
468 |                     criteria.append("copy" not in file.get_file_name(candidate[MetadataKey.PATH.value]).lower())
469 |                 elif rule_name == "longer-file-name":
470 |                     # longer filename is better (for "edited" versions)
471 |                     criteria.append(len(file.get_file_name(candidate[MetadataKey.PATH.value])) * -1)
472 | 
473 |                 elif rule_name == "shorter-file-name":
474 |                     # shorter filename is better (for "edited" versions)
475 |                     criteria.append(len(file.get_file_name(candidate[MetadataKey.PATH.value])) * 1)
476 | 
477 |                 elif rule_name == "longer-folder-path":
478 |                     # shorter folder path is better
479 |                     criteria.append(len(file.get_containing_folder(candidate[MetadataKey.PATH.value])) * -1)
480 |                 elif rule_name == "shorter-folder-path":
481 |                     # shorter folder path is better
482 |                     criteria.append(len(file.get_containing_folder(candidate[MetadataKey.PATH.value])))
483 |                 elif rule_name == "higher-score":
484 |                     # reverse, bigger is better
485 |                     criteria.append(candidate[MetadataKey.SCORE.value] * -1)
486 |                 elif rule_name == "lower-score":
487 |                     # lower is better
488 |                     criteria.append(candidate[MetadataKey.SCORE.value] * 1)
489 |                 elif rule_name == "higher-pixel-count":
490 |                     # higher pixel count is better
491 |                     criteria.append(candidate[MetadataKey.METADATA.value][MetadataKey.PIXELCOUNT.value] * -1)
492 |                 elif rule_name == "lower-pixel-count":
493 |                     # lower pixel count is better
494 |                     criteria.append(candidate[MetadataKey.METADATA.value][MetadataKey.PIXELCOUNT.value] * 1)
495 | 
496 |             # just to assure the order in the result is the same
497 |             # if all other criteria (above) are equal
498 |             # and recurring runs will result in the same order
499 |             # (although they shouldn't be compared twice to begin with)
500 |             criteria.append(candidate[MetadataKey.PATH.value])
501 | 
502 |             return tuple(criteria)
503 | 
504 |         duplicate_candidates = sorted(duplicate_candidates, key=sort_criteria)
505 | 
506 |         return duplicate_candidates
507 | 
508 |     def process_duplicates(self):
509 |         """
510 |         Moves or removes duplicates based on the configuration
511 |         """
512 |         dry_run = self._config.DRY_RUN.value
513 |         duplicate_target_directory = self._config.DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY.value
514 |         if duplicate_target_directory:
515 |             echo("Phase 5/6: Moving duplicates ...", color='cyan')
516 |             self._move_files_marked_as_delete(duplicate_target_directory, dry_run)
517 |         else:
518 |             echo("Phase 5/6: Removing duplicates ...", color='cyan')
519 |             self._remove_files_marked_as_delete(dry_run)
520 | 
521 |     def _find_empty_folders(self, root_path: Path, recursive: bool, dry_run: bool) -> [str]:
522 |         """
523 |         Finds empty folders within the given root_path
524 |         :param root_path: folder to search in
525 |         """
526 |         result = OrderedSet()
527 | 
528 |         # traverse bottom-up to remove folders that are empty due to file removal
529 |         for root, directories, files in os.walk(str(root_path), topdown=False):
530 |             # get absolute paths of all files and folders in the current root directory
531 |             abs_file_paths = list(map(lambda x: os.path.abspath(os.path.join(root, x)), files))
532 |             abs_folder_paths = list(map(lambda x: os.path.abspath(os.path.join(root, x)), directories))
533 | 
534 |             # find out which of those files were deleted by the deduplication process
535 |             files_deleted = list(
536 |                 map(lambda x: Path(x), filter(
537 |                     lambda x: Path(x) in self._deduplication_result.get_removed_or_moved_files(),
538 |                     abs_file_paths)))
539 |             files_deleted = list(set(files_deleted + list(
540 |                 filter(lambda x: x.parent == Path(root), self._deduplication_result.get_removed_or_moved_files()))))
541 | 
542 |             folders_deleted = list(filter(lambda x: x in result, abs_folder_paths))
543 |             filtered_directories = list(filter(lambda x: x not in folders_deleted, abs_folder_paths))
544 | 
545 |             if dry_run:
546 |                 if len(files_deleted) > 0 and len(files_deleted) == len(files) and len(folders_deleted) == len(
547 |                     directories):
548 |                     result.append(root)
549 |             else:
550 |                 if len(files_deleted) > 0 and len(files) <= 0 and len(directories) <= 0:
551 |                     result.append(root)
552 | 
553 |             if not recursive:
554 |                 break
555 | 
556 |         return result
557 | 
558 |     def _remove_folders(self, root_path: Path, folders: [str], dry_run: bool):
559 |         """
560 |         Function to remove empty folders
561 |         :param root_path:
562 |         """
563 |         echo(f"Removing empty folders ({len(folders)}) in: '{root_path}' ...")
564 | 
565 |         if len(folders) == 0:
566 |             return
567 | 
568 |         self._progress_manager.start("Removing empty folders", len(folders), "Folder", self.interactive)
569 |         for folder in folders:
570 |             self._progress_manager.set_postfix(self._truncate_middle(folder))
571 | 
572 |             if not dry_run:
573 |                 os.rmdir(folder)
574 | 
575 |             self._deduplication_result.add_removed_empty_folder(folder)
576 |             self._progress_manager.inc()
577 |         self._progress_manager.clear()
578 | 
579 |     def _remove_files_marked_as_delete(self, dry_run: bool):
580 |         """
581 |         Removes files that were marked to be deleted in previous deduplication step
582 |         :param dry_run: set to true to simulate this action
583 |         """
584 |         items_to_remove = self._deduplication_result.get_file_with_action(ActionEnum.DELETE)
585 |         marked_files_count = len(items_to_remove)
586 |         if marked_files_count == 0:
587 |             return
588 | 
589 |         self._progress_manager.start("Removing files", marked_files_count, "File", self.interactive)
590 |         self._delete_files(items_to_remove, dry_run)
591 |         self._progress_manager.clear()
592 | 
593 |     def _move_files_marked_as_delete(self, target_dir: Path, dry_run: bool):
594 |         """
595 |         Moves files that were marked to be deleted in previous deduplication step to the target directory
596 |         :param target_dir: the directory to move duplicates to
597 |         :param dry_run: set to true to simulate this action
598 |         """
599 |         items_to_move = self._deduplication_result.get_file_with_action(ActionEnum.MOVE)
600 |         marked_files_count = len(items_to_move)
601 |         if marked_files_count == 0:
602 |             return
603 | 
604 |         self._progress_manager.start("Moving files", marked_files_count, "File", self.interactive)
605 |         self._move_files(items_to_move, target_dir, dry_run)
606 |         self._progress_manager.clear()
607 | 
608 |     def _delete_files(self, files_to_delete: [str], dry_run: bool):
609 |         """
610 |         Deletes files on disk
611 |         :param files_to_delete: list of absolute file paths
612 |         :param dry_run: set to true to simulate this action
613 |         """
614 |         for file_path in files_to_delete:
615 |             self._progress_manager.set_postfix(self._truncate_middle(file_path))
616 | 
617 |             if dry_run:
618 |                 pass
619 |             else:
620 |                 # remove from file system
621 |                 if os.path.exists(file_path):
622 |                     os.remove(file_path)
623 | 
624 |                 # remove from persistence
625 |                 self._persistence.remove(file_path)
626 | 
627 |                 DUPLICATE_ACTION_DELETE_COUNT.inc()
628 | 
629 |             self._progress_manager.inc()
630 | 
631 |     def _move_files(self, files_to_move: List[Path], target_dir: Path, dry_run: bool):
632 |         """
633 |         Moves files on disk
634 |         :param files_to_move: list of absolute file paths
635 |         :param target_dir: directory to move files to
636 |         """
637 |         for file_path in files_to_move:
638 |             self._progress_manager.set_postfix(self._truncate_middle(file_path))
639 | 
640 |             try:
641 |                 if dry_run:
642 |                     continue
643 | 
644 |                 # move file
645 |                 if not file_path.exists():
646 |                     continue
647 | 
648 |                 target_file = Path(str(target_dir), *file_path.parts[1:])
649 |                 if target_file.exists():
650 |                     if filecmp.cmp(file_path, target_file, shallow=False):
651 |                         os.remove(file_path)
652 |                     else:
653 |                         raise ValueError(f"Can't move duplicate file because the target already exists: {target_file}")
654 |                 else:
655 |                     target_file.parent.mkdir(parents=True, exist_ok=True)
656 |                     shutil.move(file_path, target_file)
657 | 
658 |                 # remove from persistence
659 |                 self._persistence.remove(str(file_path))
660 | 
661 |                 DUPLICATE_ACTION_MOVE_COUNT.inc()
662 |             except Exception as ex:
663 |                 logging.exception(ex)
664 |                 # LOGGER.log(ex)
665 |             finally:
666 |                 self._progress_manager.inc()
667 | 
668 |     @staticmethod
669 |     def _truncate_middle(text: any, max_length: int = 50):
670 |         text = str(text)
671 |         if len(text) <= max_length:
672 |             # string is already short-enough, fill up with spaces
673 |             return text + ((max_length - len(text)) * " ")
674 |         # half of the size, minus the 3 .'s
675 |         n_2 = int(max_length / 2) - 3
676 |         # whatever's left
677 |         n_1 = max_length - n_2 - 3
678 |         return '{0}...{1}'.format(text[:n_1], text[-n_2:])
679 | 
680 |     def remove_empty_folders(self):
681 |         phase_6_text = "Phase 6/6: Removing empty folders"
682 |         if not self._config.REMOVE_EMPTY_FOLDERS.value:
683 |             echo(phase_6_text + " - Skipping", color='yellow')
684 |         else:
685 |             echo(phase_6_text, color='cyan')
686 |             self._remove_empty_folders(self._config.SOURCE_DIRECTORIES.value, self._config.RECURSIVE.value)
687 | 


--------------------------------------------------------------------------------
/py_image_dedup/library/file_watch.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from pathlib import Path
 3 | 
 4 | from watchdog.events import FileSystemEventHandler, EVENT_TYPE_MODIFIED, EVENT_TYPE_MOVED, EVENT_TYPE_CREATED, \
 5 |     EVENT_TYPE_DELETED
 6 | 
 7 | from py_image_dedup.config import DeduplicatorConfig
 8 | from py_image_dedup.stats import FILE_EVENT_COUNT
 9 | from py_image_dedup.util import echo
10 | 
11 | 
class EventHandler(FileSystemEventHandler):
    """
    Watchdog event handler that forwards file events to a processing manager.

    Only file (not directory) events are handled, and only for paths that start with
    one of the configured source directories and end with one of the configured file extensions.
    """

    def __init__(self, processing_manager):
        """
        :param processing_manager: receiver of the events, must provide add(path) and remove(path)
        """
        super().__init__()
        self.processing_manager = processing_manager

        self.config = DeduplicatorConfig()

        # directories and extensions are literal text, so they are escaped before being
        # embedded in a pattern (a "." in ".jpg" must not match arbitrary characters)
        directories = '|'.join(re.escape(str(x)) for x in self.config.SOURCE_DIRECTORIES.value)
        extensions = '|'.join(re.escape(x) for x in self.config.FILE_EXTENSION_FILTER.value)
        self.directory_regex = re.compile(rf"^({directories}).*$")
        self.file_regex = re.compile(rf"^.*({extensions})$", re.IGNORECASE)

    def on_any_event(self, event):
        """
        Counts and logs every event that passes the filter and dispatches it by event type
        :param event: the watchdog file system event
        """
        if not self._event_matches_filter(event):
            return

        FILE_EVENT_COUNT.labels(type=event.event_type).inc()

        echo("FileSystemEvent: {} {} {}".format(event.event_type,
                                                "directory" if event.is_directory else "file",
                                                event.src_path))

        _actions = {
            EVENT_TYPE_CREATED: self.created,
            EVENT_TYPE_MODIFIED: self.modified,
            EVENT_TYPE_MOVED: self.moved,
            EVENT_TYPE_DELETED: self.deleted,
        }
        action = _actions.get(event.event_type)
        if action is None:
            # event types other than the four handled above are ignored instead of raising KeyError
            return
        action(event)

    def created(self, event):
        """Queues the created file for processing"""
        self._process(event.src_path)

    def modified(self, event):
        """Queues the modified file for processing"""
        self._process(event.src_path)

    def moved(self, event):
        """
        Drops the old path and queues the new one, each only if it passes the path filters
        """
        if self._path_matches_filter(event.src_path):
            self._cleanup(event.src_path)
        if self._path_matches_filter(event.dest_path):
            self._process(event.dest_path)

    def deleted(self, event):
        """Drops the deleted file"""
        self._cleanup(event.src_path)

    def _process(self, path: str):
        self.processing_manager.add(Path(path))

    def _cleanup(self, path: str):
        self.processing_manager.remove(Path(path))

    def _path_matches_filter(self, path: str) -> bool:
        """
        :param path: path of a file
        :return: true if the path is inside a source directory and has an accepted extension
        """
        return bool(self.directory_regex.match(path)) and bool(self.file_regex.match(path))

    def _event_matches_filter(self, event) -> bool:
        """
        :param event: the watchdog file system event
        :return: true if the event concerns a file whose source or destination path passes the filters
        """
        if event.is_directory:
            return False

        if self._path_matches_filter(event.src_path):
            return True

        # a rename can turn a filtered-out name (e.g. a temporary download name) into a relevant one
        dest_path = getattr(event, "dest_path", None)
        return bool(dest_path) and self._path_matches_filter(dest_path)
67 | 


--------------------------------------------------------------------------------
/py_image_dedup/library/processing_manager.py:
--------------------------------------------------------------------------------
  1 | from collections import OrderedDict
  2 | from datetime import datetime, timedelta
  3 | from pathlib import Path
  4 | from threading import Lock
  5 | from typing import List
  6 | 
  7 | from watchdog.observers.inotify import InotifyObserver
  8 | from watchdog.observers.polling import PollingObserver
  9 | 
 10 | from py_image_dedup.config import DeduplicatorConfig, FILE_OBSERVER_TYPE_INOTIFY, FILE_OBSERVER_TYPE_POLLING
 11 | from py_image_dedup.library import ActionEnum, RegularIntervalWorker
 12 | from py_image_dedup.library.file_watch import EventHandler
 13 | from py_image_dedup.library.progress_manager import ProgressManager
 14 | from py_image_dedup.util.file import get_files_count
 15 | 
 16 | 
 17 | class ProcessingManager(RegularIntervalWorker):
 18 |     lock = Lock()
 19 |     queue = OrderedDict()
 20 | 
 21 |     progress_manager: ProgressManager
 22 | 
 23 |     latest_event_time = None
 24 | 
 25 |     def __init__(self, deduplicator):
 26 |         self.config = DeduplicatorConfig()
 27 |         timeout = self.config.DAEMON_PROCESSING_TIMEOUT.value
 28 |         interval = timeout.total_seconds()
 29 |         super().__init__(interval)
 30 |         self.progress_manager = ProgressManager()
 31 |         self.deduplicator = deduplicator
 32 |         self.event_handler = EventHandler(self)
 33 |         self.observers = []
 34 | 
 35 |     def start(self):
 36 |         observer_type = self.config.DAEMON_FILE_OBSERVER_TYPE.value
 37 |         directories = self.config.SOURCE_DIRECTORIES.value
 38 |         self.observers = self._setup_file_observers(observer_type, directories)
 39 |         super().start()
 40 | 
 41 |     def stop(self):
 42 |         for observer in self.observers:
 43 |             observer.stop()
 44 |             observer.join()
 45 | 
 46 |         self.observers.clear()
 47 | 
 48 |     def _setup_file_observers(self, observer_type: str, source_directories: List[Path]):
 49 |         observers = []
 50 | 
 51 |         for directory in source_directories:
 52 |             if observer_type == FILE_OBSERVER_TYPE_INOTIFY:
 53 |                 observer = InotifyObserver()
 54 |             elif observer_type == FILE_OBSERVER_TYPE_POLLING:
 55 |                 observer = PollingObserver()
 56 |             else:
 57 |                 raise ValueError(f"Unexpected file observer type {observer_type}")
 58 | 
 59 |             observer.schedule(self.event_handler, str(directory), recursive=True)
 60 |             observer.start()
 61 |             observers.append(observer)
 62 | 
 63 |         return observers
 64 | 
 65 |     def add(self, path: Path):
 66 |         with self.lock:
 67 |             self.latest_event_time = datetime.now()
 68 |             if path not in self.queue:
 69 |                 self.queue[path] = path
 70 | 
 71 |     def remove(self, path: Path):
 72 |         if path in self.queue:
 73 |             self.queue.pop(path)
 74 |         self.deduplicator._persistence.remove(str(path))
 75 | 
 76 |     def _should_process(self):
 77 |         return len(self.queue) > 0 and (
 78 |                 self.latest_event_time is None or
 79 |                 (datetime.now() - timedelta(seconds=self._interval) > self.latest_event_time)
 80 |         )
 81 | 
 82 |     def _run(self):
 83 |         with self.lock:
 84 |             self.process_queue()
 85 | 
 86 |     def process_queue(self):
 87 |         if not self._should_process():
 88 |             return
 89 | 
 90 |         self.progress_manager.start("Processing", len(self.queue), "Files", False)
 91 |         while True:
 92 |             try:
 93 |                 path, value = self.queue.popitem()
 94 |                 self._process_queue_item(path, value)
 95 |                 self.progress_manager.inc()
 96 |             except KeyError:
 97 |                 break
 98 |         self.progress_manager.clear()
 99 | 
100 |     def _process_queue_item(self, path, value):
101 |         self.deduplicator.reset_result()
102 | 
103 |         # TODO: only a workaround until files can be processed too
104 |         if path.is_file():
105 |             path = path.parent
106 | 
107 |         if path.is_dir():
108 |             files_count = get_files_count(
109 |                 path,
110 |                 self.config.RECURSIVE.value,
111 |                 self.config.FILE_EXTENSION_FILTER.value,
112 |                 self.config.EXCLUSIONS.value
113 |             )
114 |             directory_map = {
115 |                 path: files_count
116 |             }
117 | 
118 |             self.deduplicator.analyze_directories(directory_map)
119 |             self.deduplicator.find_duplicates_in_directories(directory_map)
120 | 
121 |         # TODO: allow processing individual files
122 |         # if path.is_file():
123 |         #     self.deduplicator.analyze_file(path)
124 |         #     root_dir = Path(os.path.commonpath([path] + self.config.SOURCE_DIRECTORIES.value))
125 |         #     self.deduplicator.find_duplicates_of_file(self.config.SOURCE_DIRECTORIES.value, root_dir, path)
126 | 
127 |         self.deduplicator.process_duplicates()
128 | 
129 |         # TODO: this needs rethinking
130 |         # remove items that have been (re-)moved already from the event queue
131 |         removed_items = self.deduplicator._deduplication_result.get_file_with_action(ActionEnum.DELETE)
132 |         moved_items = self.deduplicator._deduplication_result.get_file_with_action(ActionEnum.MOVE)
133 |         for item in set(removed_items + moved_items):
134 |             if item in self.queue:
135 |                 self.queue.pop(item)
136 |                 self.progress_manager.inc()
137 | 


--------------------------------------------------------------------------------
/py_image_dedup/library/progress_manager.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from tqdm import tqdm
 4 | 
 5 | LOGGER = logging.getLogger(__name__)
 6 | LOGGER.setLevel(logging.DEBUG)
 7 | 
 8 | 
 9 | class ProgressManager:
10 | 
11 |     def __init__(self):
12 |         self._progress_bar = None
13 |         self._task = None
14 |         self._n = None
15 |         self._total = None
16 |         self._unit = None
17 |         self._last_percentage = None
18 | 
19 |     def start(self, task: str, total: int, unit: str, interactive: bool):
20 |         if self._task is not None:
21 |             LOGGER.warning(f"Starting new progress without explicitly closing the current one '{self._task}'")
22 |             self.clear()
23 | 
24 |         self._task = task
25 |         self._total = total
26 |         self._unit = unit
27 |         if interactive:
28 |             self._progress_bar = self._create_progressbar(total, unit)
29 | 
30 |     def set_postfix(self, postfix: str):
31 |         if self._progress_bar is not None and postfix is not None:
32 |             self._progress_bar.set_postfix_str(postfix)
33 | 
34 |     def inc(self, n: int = 1):
35 |         if self._task is None:
36 |             raise AssertionError(
37 |                 "Can't increase before start. "
38 |                 "Please start a new task progress using start() before incrementing it.")
39 | 
40 |         if self._n is None:
41 |             self._n = n
42 |         else:
43 |             self._n += n
44 | 
45 |         if self._progress_bar is not None:
46 |             self._progress_bar.update(n)
47 | 
48 |         new_percentage = int((self._n / self._total) * 100)
49 |         if self._last_percentage is None or self._last_percentage != new_percentage:
50 |             self._last_percentage = new_percentage
51 |             LOGGER.info(f"{self._task}: {new_percentage}% ({self._n}/{self._total})")
52 | 
53 |     def clear(self):
54 |         if self._progress_bar is not None:
55 |             self._progress_bar.close()
56 |             self._progress_bar = None
57 |         self._last_percentage = None
58 |         self._n = None
59 |         self._total = None
60 |         self._task = None
61 |         self._unit = None
62 | 
63 |     def _create_progressbar(self, total_count: int, unit: str) -> tqdm:
64 |         """
65 |         Creates a new progress bar
66 |         :param total_count: target for 100%
67 |         :param unit: "Things" that are counted
68 |         :return: progress bar
69 |         """
70 |         self._progress_bar = tqdm(total=total_count, unit=unit, unit_scale=True, mininterval=1)
71 |         return self._progress_bar
72 | 


--------------------------------------------------------------------------------
/py_image_dedup/persistence/__init__.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | 
  4 | from PIL import TiffImagePlugin
  5 | 
  6 | from py_image_dedup.persistence.metadata_key import MetadataKey
  7 | 
  8 | 
  9 | class ImageSignatureStore:
 10 |     """
 11 |     Base class for Persistence implementations
 12 |     """
 13 | 
 14 |     DATAMODEL_VERSION = 5
 15 | 
 16 |     def __init__(self, use_exif_data: bool = True):
 17 |         self._use_exif_data = use_exif_data
 18 | 
 19 |     def add(self, image_file_path: str):
 20 |         """
 21 |         Analyze an image file and add it to the store
 22 | 
 23 |         :param image_file_path: path to the image file
 24 |         """
 25 |         image_data = self._create_metadata_dict(image_file_path)
 26 | 
 27 |         # check if the file has already been analyzed (and didn't change in the meantime)
 28 |         existing_entity = self.get(image_file_path)
 29 |         if existing_entity is not None:
 30 |             is_data_version_ok = False
 31 |             try:
 32 |                 if MetadataKey.DATAMODEL_VERSION.value in existing_entity[MetadataKey.METADATA.value]:
 33 |                     is_data_version_ok = existing_entity[MetadataKey.METADATA.value][
 34 |                                              MetadataKey.DATAMODEL_VERSION.value] == self.DATAMODEL_VERSION
 35 | 
 36 |                 if is_data_version_ok and \
 37 |                     existing_entity[MetadataKey.METADATA.value][MetadataKey.FILE_SIZE.value] == image_data[
 38 |                     MetadataKey.FILE_SIZE.value] and \
 39 |                     existing_entity[MetadataKey.METADATA.value][
 40 |                         MetadataKey.FILE_MODIFICATION_DATE.value] == image_data[
 41 |                     MetadataKey.FILE_MODIFICATION_DATE.value]:
 42 |                     # print("File is the same, not adding again")
 43 |                     return
 44 |             except Exception as ex:
 45 |                 logging.exception(ex)
 46 |                 try:
 47 |                     self.remove(image_file_path)
 48 |                 except Exception as ex:
 49 |                     logging.exception(ex)
 50 |                     return
 51 | 
 52 |         self._add(image_file_path, image_data)
 53 | 
 54 |     def _create_metadata_dict(self, image_file_path: str) -> dict:
 55 |         """
 56 |         Creates a dictionary that should be stored in persistence
 57 | 
 58 |         :param image_file_path: path to the image file
 59 |         :return: dictionary containing all relevant information
 60 |         """
 61 |         from py_image_dedup.util import image
 62 | 
 63 |         image_data = {}
 64 |         image_data[MetadataKey.PATH.value] = image_file_path
 65 | 
 66 |         # get some metadata
 67 |         file_size = os.stat(image_file_path).st_size
 68 |         file_modification_date = os.path.getmtime(image_file_path)
 69 | 
 70 |         image_data[MetadataKey.DATAMODEL_VERSION.value] = self.DATAMODEL_VERSION
 71 |         image_data[MetadataKey.FILE_SIZE.value] = file_size
 72 |         image_data[MetadataKey.FILE_MODIFICATION_DATE.value] = file_modification_date
 73 | 
 74 |         image_data[MetadataKey.PIXELCOUNT.value] = image.get_pixel_count(image_file_path)
 75 | 
 76 |         if self._use_exif_data:
 77 |             exif_data = image.get_exif_data(image_file_path)
 78 |             exif_data = self._normalize_meta_data_for_db(exif_data)
 79 |             image_data[MetadataKey.EXIF_DATA.value] = exif_data
 80 | 
 81 |         return image_data
 82 | 
 83 |     def _normalize_meta_data_for_db(self, dictionary: dict) -> dict:
 84 |         """
 85 |         :param dictionary:
 86 |         :return:
 87 |         """
 88 |         result = {}
 89 |         for k, v in dictionary.items():
 90 |             if isinstance(v, dict):
 91 |                 result[k] = self._normalize_meta_data_for_db(v)
 92 |                 continue
 93 | 
 94 |             normalized_value = v
 95 |             if isinstance(v, bytes) or isinstance(v, tuple):
 96 |                 normalized_value = str(v)
 97 |             elif isinstance(v, TiffImagePlugin.IFDRational):
 98 |                 if v._denominator != 0:
 99 |                     normalized_value = v._numerator / v._denominator
100 |                 else:
101 |                     normalized_value = float(v._numerator)
102 | 
103 |             result[k] = normalized_value
104 | 
105 |         return result
106 | 
107 |     def _add(self, image_file_path: str, image_data: dict) -> None:
108 |         """
109 |         Saves image data for the specified image file path
110 | 
111 |         :param image_file_path: image file path
112 |         :param image_data: metadata for the image
113 |         """
114 |         raise NotImplementedError()
115 | 
116 |     def get(self, image_file_path: str) -> dict or None:
117 |         """
118 |         Get a store entry by it's file_path
119 |         :param image_file_path: file path to search for
120 |         :return: store entry or None
121 |         """
122 |         raise NotImplementedError()
123 | 
124 |     def get_all(self) -> (int, object):
125 |         """
126 |         :return: item count, stored entries as a generator function
127 |         """
128 |         raise NotImplementedError()
129 | 
130 |     def find_similar(self, reference_image_file_path: str) -> []:
131 |         """
132 |         Search for similar images to the specified one
133 | 
134 |         :param reference_image_file_path: the reference image file
135 |         :return: list of images that are similar to the reference file
136 |         """
137 |         raise NotImplementedError()
138 | 
139 |     def remove(self, image_file_path: str) -> None:
140 |         """
141 |         Remove all entries with the given file path
142 | 
143 |         :param image_file_path: the path of an image file
144 |         """
145 |         raise NotImplementedError()
146 | 
147 |     def remove_entries_of_missing_files(self):
148 |         """
149 |         Remove all entries with files that don't exist
150 |         """
151 |         entries = self.get_all()
152 |         for entry in entries:
153 |             file_path = entry['path']
154 |             if not os.path.exists(file_path):
155 |                 self.remove(file_path)
156 | 
157 |     def remove_all(self) -> None:
158 |         """
159 |         Remove all entries from Database
160 |         """
161 |         raise NotImplementedError()
162 | 


--------------------------------------------------------------------------------
/py_image_dedup/persistence/elasticsearchstorebackend.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import time
  3 | 
  4 | import requests
  5 | from elasticsearch import Elasticsearch
  6 | from image_match.elasticsearch_driver import SignatureES
  7 | 
  8 | from py_image_dedup.persistence import ImageSignatureStore
  9 | from py_image_dedup.util import echo
 10 | 
 11 | 
 12 | class ElasticSearchStoreBackend(ImageSignatureStore):
 13 |     DEFAULT_EL_DOC_TYPE_EL_6 = 'image'
 14 |     DEFAULT_EL_DOC_TYPE_EL_7 = '_doc'
 15 | 
 16 |     def __init__(self,
 17 |                  host: str,
 18 |                  port: int,
 19 |                  connections_per_node: int,
 20 |                  el_index: str,
 21 |                  el_version: int = None,
 22 |                  el_doctype: str = None,
 23 |                  max_dist: float = 0.03,
 24 |                  use_exif_data: bool = True,
 25 |                  setup_database: bool = True,
 26 |                  ):
 27 |         """
 28 |         Image signature persistence backed by image_match and elasticsearch
 29 | 
 30 |         :param host: host address of the elasticsearch server
 31 |         :param port: port of the elasticsearch server
 32 |         :param el_version: elasticsearch version
 33 |         :param el_index: elasticsearch index where the data is stored
 34 |         :param el_doctype: elasticsearch document type of the stored data
 35 |         :param max_dist: maximum "difference" allowed, ranging from [0 .. 1] where 0.2 is still a pretty similar image
 36 |         """
 37 |         super().__init__(use_exif_data)
 38 | 
 39 |         self.host = host
 40 |         self.port = port
 41 |         self._connections_per_node = connections_per_node
 42 | 
 43 |         detected_version = None
 44 |         while detected_version is None:
 45 |             time.sleep(2)
 46 |             detected_version = self._detect_db_version()
 47 | 
 48 |         self._el_version = el_version
 49 |         if self._el_version is not None and detected_version is not None and self._el_version != detected_version:
 50 |             raise AssertionError(
 51 |                 "Detected database version ({}) does not match expected version ({})".format(detected_version,
 52 |                                                                                              self._el_version))
 53 | 
 54 |         if detected_version is not None:
 55 |             self._el_version = detected_version
 56 |         elif self._el_version is None:
 57 |             # assume version 6 by default
 58 |             self._el_version = 6
 59 | 
 60 |         self._el_index = el_index
 61 |         if el_doctype is not None:
 62 |             self._el_doctype = el_doctype
 63 |         else:
 64 |             self._el_doctype = self.DEFAULT_EL_DOC_TYPE_EL_6 if self._el_version < 7 else self.DEFAULT_EL_DOC_TYPE_EL_7
 65 | 
 66 |         self.setup_database = setup_database
 67 |         if setup_database:
 68 |             try:
 69 |                 # self._clear_database()
 70 |                 self._setup_database()
 71 |             except Exception as e:
 72 |                 logging.exception(e)
 73 |                 raise AssertionError("Could not setup database")
 74 | 
 75 |         # noinspection PyTypeChecker
 76 |         self._store = SignatureES(
 77 |             es=Elasticsearch(
 78 |                 hosts=[
 79 |                     {'host': self.host, 'port': self.port}
 80 |                 ],
 81 |                 maxsize=self._connections_per_node,
 82 |             ),
 83 |             # el_version=self._el_version,
 84 |             index=self._el_index,
 85 |             doc_type=self._el_doctype,
 86 |             distance_cutoff=max_dist,
 87 |         )
 88 | 
 89 |     def _detect_db_version(self) -> int or None:
 90 |         try:
 91 |             response = requests.get('http://{}:{}'.format(self.host, self.port))
 92 |             response.raise_for_status()
 93 |             return int(str(response.json()["version"]['number']).split(".")[0])
 94 |         except Exception as ex:
 95 |             logging.exception(ex)
 96 |             return None
 97 | 
 98 |     def _setup_database(self):
 99 |         """
100 |         Creates the expected index, if it does not exist
101 |         """
102 |         response = requests.get('http://{}:{}/{}'.format(self.host, self.port, self._el_index))
103 |         if response.status_code == 200:
104 |             return
105 |         elif response.status_code == 404:
106 | 
107 |             properties = {
108 |                 "properties": {
109 |                     "path": {
110 |                         "type": "keyword",
111 |                         "ignore_above": 256
112 |                     }
113 |                 }
114 |             }
115 | 
116 |             if self._el_version == 7:
117 |                 json_data = {
118 |                     "mappings": properties
119 |                 }
120 |             else:
121 |                 json_data = {
122 |                     "mappings": {
123 |                         self._el_doctype: properties
124 |                     }
125 |                 }
126 | 
127 |             response = requests.put(
128 |                 url='http://{}:{}/{}'.format(self.host, self.port, self._el_index),
129 |                 json=json_data
130 |             )
131 | 
132 |             response.raise_for_status()
133 |         else:
134 |             response.raise_for_status()
135 | 
136 |     def _clear_database(self):
137 |         """
138 |         Removes the index and all data it contains
139 |         """
140 |         requests.delete('http://{}:{}/{}'.format(self.host, self.port, self._el_index))
141 | 
142 |     def _add(self, image_file_path: str, image_data: dict) -> None:
143 |         # remove existing entries
144 |         self.remove(image_file_path)
145 |         self._store.add_image(image_file_path, metadata=image_data)
146 | 
147 |     def get(self, image_file_path: str) -> dict or None:
148 |         """
149 |         Get a store entry by it's file_path
150 |         :param image_file_path: file path to search for
151 |         :return:
152 |         """
153 |         db_entity = self._get(image_file_path)
154 |         return db_entity
155 | 
156 |     def _get(self, image_file_path: str) -> dict or None:
157 |         """
158 |         Get a store entry by it's file_path
159 |         :param image_file_path: file path to search for
160 |         :return: elasticsearch result dictionary
161 |         """
162 |         es_query = {
163 |             'query': {
164 |                 "constant_score": {
165 |                     "filter": {
166 |                         "term": {'path': image_file_path}
167 |                     }
168 |                 }
169 |             }
170 |         }
171 | 
172 |         query_result = self._store.es.search(index=self._el_index, body=es_query)
173 | 
174 |         hits = query_result['hits']['hits']
175 | 
176 |         if len(hits) > 1:
177 |             echo(f"WARNING: More than a single entry for a file, cleaning up: {image_file_path}", color='yellow')
178 |             self.remove(image_file_path)
179 |             self.add(image_file_path)
180 | 
181 |         if len(hits) == 0:
182 |             return None
183 |         else:
184 |             return hits[0]['_source']
185 | 
186 |     def get_all(self) -> (int, object):
187 |         es_query = {
188 |             "track_total_hits": True,
189 |             'query': {'match_all': {}}
190 |         }
191 | 
192 |         item_count = self._store.es.search(index=self._el_index, body=es_query, size=0)['hits']['total']
193 |         if self._el_version >= 7:
194 |             item_count = item_count['value']
195 | 
196 |         from elasticsearch.helpers import scan
197 | 
198 |         el6_params = {
199 |             "doc_type": self._el_doctype
200 |         }
201 |         return item_count, scan(
202 |             self._store.es,
203 |             index=self._el_index,
204 |             preserve_order=True,
205 |             query=es_query,
206 |             **(el6_params if self._el_version < 7 else {})
207 |         )
208 | 
209 |     def find_similar(self, reference_image_file_path: str) -> []:
210 |         try:
211 |             entry = self._get(reference_image_file_path)
212 |             if entry is not None:
213 |                 result = []
214 |                 rec = self._store.search_single_record(entry)
215 |                 result.extend(rec)
216 | 
217 |                 return result
218 |             else:
219 |                 return self._store.search_image(reference_image_file_path, all_orientations=True)
220 |         except Exception as e:
221 |             echo(f"Error querying database for similar images of '{reference_image_file_path}': {e}", color="red")
222 |             return []
223 | 
224 |     def search_metadata(self, metadata: dict) -> []:
225 |         """
226 |         Search for images with metadata properties.
227 | 
228 |         Note: Metadata will be empty if you did not provide it when adding an image
229 |         :param metadata:
230 |         :return:
231 |         """
232 |         search_dict = {}
233 |         for key, value in metadata.items():
234 |             search_dict[f"metadata.{key}"] = value
235 | 
236 |         es_query = {
237 |             'query': {'match': search_dict}
238 |         }
239 | 
240 |         return self._store.es.search(index=self._el_index, body=es_query)
241 | 
242 |     def remove(self, image_file_path: str) -> None:
243 |         # NOTE: this query will only work if the index has been created
244 |         # with a custom mapping for the path property:
245 | 
246 |         # # remove existing index
247 |         # curl -X DELETE "192.168.2.24:9200/images"
248 |         #
249 |         # # create index with custom mapping for "path"
250 |         # curl -X PUT "192.168.2.24:9200/images?pretty" -H "Content-Type: application/json" -d
251 |         # "
252 |         # {
253 |         #   "mappings": {
254 |         #     "image": {
255 |         #       "properties": {
256 |         #         "path": {
257 |         #           "type": "keyword",
258 |         #           "ignore_above": 256
259 |         #         }
260 |         #       }
261 |         #     }
262 |         #   }
263 |         # }
264 |         # "
265 | 
266 |         es_query = {
267 |             'query': {
268 |                 "constant_score": {
269 |                     "filter": {
270 |                         "term": {'path': image_file_path}
271 |                     }
272 |                 }
273 |             }
274 |         }
275 | 
276 |         self._remove_by_query(es_query)
277 | 
278 |     def remove_all(self) -> None:
279 |         es_query = {
280 |             'query': {'match_all': {}}
281 |         }
282 | 
283 |         self._remove_by_query(es_query)
284 | 
285 |     def _remove_by_query(self, es_query: dict):
286 |         el6_params = {
287 |             "doc_type": self._el_doctype
288 |         }
289 | 
290 |         return self._store.es.delete_by_query(
291 |             index=self._el_index,
292 |             body=es_query,
293 |             conflicts="proceed",
294 |             **(el6_params if self._el_version < 7 else {})
295 |         )
296 | 


--------------------------------------------------------------------------------
/py_image_dedup/persistence/metadata_key.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | 
 4 | class MetadataKey(Enum):
 5 |     METADATA = "metadata"
 6 | 
 7 |     DATAMODEL_VERSION = "py-image-dedup_datamodel-version"
 8 | 
 9 |     PATH = "path"
10 |     DISTANCE = "dist"
11 |     SCORE = "score"
12 | 
13 |     FILE_SIZE = "filesize"
14 |     FILE_MODIFICATION_DATE = "file_modification_date"
15 | 
16 |     PIXELCOUNT = "pixelcount"
17 |     EXIF_DATA = "exif_data"
18 | 


--------------------------------------------------------------------------------
/py_image_dedup/stats.py:
--------------------------------------------------------------------------------
 1 | from prometheus_client import Gauge, Summary
 2 | 
 3 | DUPLICATE_ACTION_COUNT = Gauge(
 4 |     'duplicate_action_total',
 5 |     'Number of images per action',
 6 |     ['action']
 7 | )
 8 | DUPLICATE_ACTION_NONE_COUNT = DUPLICATE_ACTION_COUNT.labels(action="none")
 9 | DUPLICATE_ACTION_MOVE_COUNT = DUPLICATE_ACTION_COUNT.labels(action="move")
10 | DUPLICATE_ACTION_DELETE_COUNT = DUPLICATE_ACTION_COUNT.labels(action="delete")
11 | 
12 | FILE_EVENT_COUNT = Gauge(
13 |     'file_event_total',
14 |     'Number of file events per event type',
15 |     ['type']
16 | )
17 | 
18 | ANALYSIS_TIME = Summary('analyse_file_summary', 'Time spent analysing a file')
19 | 
20 | FIND_DUPLICATES_TIME = Summary('find_duplicates_summary', 'Time spent finding duplicates of a file')
21 | 


--------------------------------------------------------------------------------
/py_image_dedup/util/__init__.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | import logging
 3 | import traceback
 4 | 
 5 | import click
 6 | 
# Logger named after this module; its level is set to DEBUG here.
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)
 9 | 
10 | 
def echo(text: str = "", color=None):
    """
    Prints a text to the console and, if it is not empty, logs it at DEBUG level
    :param text: the text; anything that is not a string is converted using str()
    :param color: an optional color
    """
    # The former check compared the value by identity against the click.termui
    # module and the str type, which is true for every value; what is meant is
    # "convert anything that is not a string".
    if not isinstance(text, str):
        text = str(text)
    if color:
        text = click.style(text, fg=color)
    if text:
        LOGGER.debug(text)
    click.echo(text)
24 | 
25 | 
def reraise_with_stack(func):
    """
    Decorator used to reraise exceptions occurring within a future.

    Any exception raised by the decorated function is replaced by a ValueError
    whose message contains the formatted traceback of the original exception.
    The original exception is attached as the cause of the ValueError.

    :param func: function to decorate
    :return: decorated function
    :raises ValueError: (from the decorated function) if ``func`` raised an exception
    """

    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            traceback_str = traceback.format_exc()
            # chain explicitly so the original exception stays reachable via __cause__
            raise ValueError("Error occurred. Original traceback is\n%s\n" % traceback_str) from e

    return wrapped
43 | 


--------------------------------------------------------------------------------
/py_image_dedup/util/file.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from pathlib import Path
 3 | from typing import List
 4 | 
 5 | 
 6 | def get_file_name(file_path: str) -> str:
 7 |     folder, file = os.path.split(file_path)
 8 |     return file
 9 | 
10 | 
def get_containing_folder(file_path: str) -> str:
    """
    :param file_path: path of a file
    :return: everything of the path before its last component, as split off by os.path.split
    """
    return os.path.split(file_path)[0]
14 | 
15 | 
def file_has_extension(file: Path, extensions: List[str] or None) -> bool:
    """
    Checks if a file has one of the given extensions
    :param file: the file to check
    :param extensions: allowed extensions (compared against the file's suffix, so
                       including the leading dot); a single string is treated as a
                       list with one entry, None or an empty collection allows every file
    :return: true if it matches (case insensitive), false otherwise
    """
    if extensions is None:
        # no filter given: previously this was wrapped into [None] and crashed on .lower()
        return True
    if isinstance(extensions, str):
        extensions = [extensions]

    allowed = {ext.lower() for ext in extensions}
    if not allowed:
        return True

    return file.suffix.lower() in allowed
33 | 
34 | 
def get_files_count(directory: Path, recursive: bool, file_extensions: List[str] or None, exclusions: List) -> int:
    """
    :param directory: the directory to analyze
    :param recursive: whether to search the directory recursively
    :param file_extensions: file extensions to include
    :param exclusions: compiled regex patterns; a file is skipped if any of them
                       is found in its absolute path
    :return: number of files in the given directory that match the currently set file filter
    """
    exclusions = exclusions or []

    files_count = 0
    for root, _dirs, file_names in os.walk(str(directory)):
        for file_name in file_names:
            # os.walk yields bare file names; join them with the walked folder so that
            # the exclusion patterns see the real location of the file and not
            # "<current working directory>/<file name>"
            file = Path(root) / file_name
            absolute_path = str(file.absolute())
            if any(pattern.search(absolute_path) for pattern in exclusions):
                continue
            if not file_has_extension(file, file_extensions):
                continue
            files_count += 1
        if not recursive:
            # the first item yielded by os.walk is the directory itself
            break

    return files_count
55 | 


--------------------------------------------------------------------------------
/py_image_dedup/util/image.py:
--------------------------------------------------------------------------------
 1 | import PIL.ExifTags
 2 | from PIL import Image
 3 | 
 4 | 
 5 | def get_exif_data(image_file_path: str) -> {}:
 6 |     """
 7 |     Tries to extract all exif data from the given image file
 8 |     :param image_file_path: path of the image file
 9 |     :return: dictionary containing all available exif data entries and their values
10 |     """
11 | 
12 |     result = {}
13 |     try:
14 |         img = Image.open(image_file_path)
15 | 
16 |         exif_data = img._getexif()
17 |         if not exif_data:
18 |             return result
19 | 
20 |         for k, v in exif_data.items():
21 |             if k in PIL.ExifTags.TAGS:
22 |                 tag_name = PIL.ExifTags.TAGS[k]
23 |                 result[tag_name] = v
24 |     except Exception as e:
25 |         pass
26 |     return result
27 | 
28 | 
def get_pixel_count(image_file_path: str) -> int:
    """
    Tries to determine the number of pixels of the given image file
    :param image_file_path: path of the image file
    :return: width * height of the image, or 0 if the image could not be read
    """
    try:
        # the context manager closes the image file again once its size is read
        with Image.open(image_file_path) as img:
            width, height = img.size
        return width * height
    except Exception:
        # best effort: any error results in a pixel count of 0
        return 0
37 | 


--------------------------------------------------------------------------------
/py_image_dedup_reference.yaml:
--------------------------------------------------------------------------------
 1 | # This is a reference configuration file explaining all the options
 2 | # of py-image-dedup.
 3 | 
 4 | py_image_dedup:
 5 |   # Configuration for the analysis phase, see README.md
 6 |   analysis:
 7 |     # Whether to search for duplicates across directories when
 8 |     # specifying more than one image source directory
 9 |     across_dirs: false
10 |     # A filter for the file extensions to analyse
11 |     file_extensions:
12 |       - .png
13 |       - .jpg
14 |       - .jpeg
15 |     # Whether to search recursively in each of the source directories
16 |     recursive: true
17 |     # A list of source directories to analyse
18 |     source_directories:
19 |       - /home/myuser/pictures/
20 |     # A list of regex patterns to ignore when traversing any of the
21 |     # source directories
22 |     exclusions:
23 |       - ".*/excluded/.*"
24 |     # The number of threads to use for image analysis.
25 |     # If unset, this will default to `os.cpu_count()`.
26 |     threads: 1
27 |     # Whether to include EXIF data of images in the analysis
28 |     use_exif_data: true
29 | 
30 |   # Deduplication phase specific configuration options, see README.md
31 |   deduplication:
32 |     # The target directory to move duplicate images to
33 |     duplicates_target_directory: /home/myuser/pictures/duplicates/
34 |     # Upper limit on the modification date difference between
35 |     # two duplicate images to be considered the same image.
36 |     max_file_modification_time_diff: 0:05:00
37 |     # Specifies the criteria and their order for ordering the list of duplicates
38 |     # to select the best copy.
39 |     prioritization_rules:
40 |       - name: "more-exif-data"
41 |       - name: "less-exif-data"
42 |       - name: "bigger-file-size"
43 |       - name: "smaller-file-size"
44 |       - name: "newer-file-modification-date"
45 |       - name: "older-file-modification-date"
46 |       - name: "smaller-distance"
47 |       - name: "bigger-distance"
48 |       - name: "longer-path"
49 |       - name: "shorter-path"
50 |       - name: "contains-copy-in-file-name"
51 |       - name: "longer-file-name"
52 |       - name: "shorter-file-name"
53 |       - name: "longer-folder-path"
54 |       - name: "shorter-folder-path"
55 |       - name: "higher-score"
56 |       - name: "lower-score"
57 | 
58 |   # Daemon specific configuration options, see README.md
59 |   daemon:
60 |     # Time for waiting on filesystems changes to settle before analysing.
61 |     timeout: 30s
62 |     # The type of file observer to use.
63 |     # One of: polling, inotify
64 |     file_observer: polling
65 |   # A dry run can be used to validate the log output of a specific configuration
66 |   # before actually deleting or removing any images in any of the source
67 |   # directories.
68 |   dry_run: true
69 | 
70 |   # Elasticsearch specific configuration options, see README.md
71 |   elasticsearch:
72 |     # Whether to automatically create an index in the target database.
73 |     auto_create_index: true
74 |     # Hostname of the elasticsearch backend instance to use
75 |     host: 127.0.0.1
76 |     # Port of the elasticsearch backend instance to use.
77 |     port: 9200
78 |     # The index name to use for storing and querying image analysis data.
79 |     index: images
80 |     # Maximum signature distance [0..1] to query from elasticsearch backend.
81 |     max_distance: 0.1
82 |   # Whether to remove empty folders or not.
83 |   remove_empty_folders: false
84 | 
85 |   # Prometheus exporter specific configuration options, see README.md
86 |   stats:
87 |     # Whether to enable prometheus statistics or not.
88 |     enabled: true
89 |     # The port to expose statistics on.
90 |     port: 8000
91 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "py-image-dedup"
 3 | version = "2.0.1"
 4 | description = "A library to find duplicate images and delete unwanted ones"
 5 | 
 6 | license = "AGPLv3+"
 7 | 
 8 | authors = [
 9 |     "Markus Ressel <mail@markusressel.de>",
10 | ]
11 | 
12 | readme = 'README.md'
13 | 
14 | repository = "https://github.com/markusressel/py-image-dedup"
15 | homepage = "https://github.com/markusressel/py-image-dedup"
16 | 
17 | keywords = ['deduplication', 'py-image-dedup']
18 | 
19 | classifiers = [
20 |     "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
21 |     "Programming Language :: Python :: 3 :: Only",
22 |     "Programming Language :: Python :: 3",
23 |     "Programming Language :: Python :: 3.8",
24 |     "Programming Language :: Python :: 3.9",
25 |     "Development Status :: 5 - Production/Stable"
26 | ]
27 | 
28 | [build-system]
29 | requires = ["poetry-core>=1.0.0"]
30 | build-backend = "poetry.core.masonry.api"
31 | 
32 | [tool.poetry.dependencies]
33 | python = "^3.11"  # Compatible python versions must be declared here
34 | 
35 | Pillow = "*"
36 | ordered-set = "*"
37 | watchdog = ">=0.10.2,<6.1.0"
38 | elasticsearch = "^7"
39 | scipy = "*"
40 | numpy = "*"
41 | container-app-conf = "^5.0.0"
42 | requests = "^2.20.0"
43 | click = "*"
44 | tabulate = ">=0.8.3,<0.10.0"
45 | tqdm = "*"
46 | prometheus-client = "*"
47 | image-match = { git = "https://github.com/markusressel/image-match.git", tag = "3.0.0" }
48 | 
49 | [tool.poetry.group.dev.dependencies]
50 | pytest = "*"
51 | 
52 | [tool.poetry.scripts]
53 | py-image-dedup = 'py_image_dedup.cli:cli'
54 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths = tests
3 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from py_image_dedup.config import DeduplicatorConfig
 4 | 
 5 | 
 6 | class TestBase(unittest.TestCase):
 7 | 
 8 |     def setUp(self):
 9 |         self.config = DeduplicatorConfig()
10 |         from py_image_dedup.library.deduplicator import ImageMatchDeduplicator
11 |         self.under_test = ImageMatchDeduplicator(interactive=False)
12 | 
13 |     def tearDown(self):
14 |         pass
15 | 
16 |     if __name__ == '__main__':
17 |         unittest.main()
18 | 


--------------------------------------------------------------------------------
/tests/images/bottles/IMG_20190903_193151-edited.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/bottles/IMG_20190903_193151-edited.jpg


--------------------------------------------------------------------------------
/tests/images/bottles/IMG_20190903_193151-grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/bottles/IMG_20190903_193151-grayscale.jpg


--------------------------------------------------------------------------------
/tests/images/bottles/IMG_20190903_193151-telegram-compression.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/bottles/IMG_20190903_193151-telegram-compression.jpg


--------------------------------------------------------------------------------
/tests/images/bottles/IMG_20190903_193151.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/bottles/IMG_20190903_193151.jpg


--------------------------------------------------------------------------------
/tests/images/building/IMG_20190903_193508-edited.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/building/IMG_20190903_193508-edited.jpg


--------------------------------------------------------------------------------
/tests/images/building/IMG_20190903_193508-grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/building/IMG_20190903_193508-grayscale.jpg


--------------------------------------------------------------------------------
/tests/images/building/IMG_20190903_193508-telegram-compression.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/building/IMG_20190903_193508-telegram-compression.jpg


--------------------------------------------------------------------------------
/tests/images/building/IMG_20190903_193508.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/building/IMG_20190903_193508.jpg


--------------------------------------------------------------------------------
/tests/images/clouds/IMG_20190903_193537-edited.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/clouds/IMG_20190903_193537-edited.jpg


--------------------------------------------------------------------------------
/tests/images/clouds/IMG_20190903_193537-grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/clouds/IMG_20190903_193537-grayscale.jpg


--------------------------------------------------------------------------------
/tests/images/clouds/IMG_20190903_193537-telegram-compression.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/clouds/IMG_20190903_193537-telegram-compression.jpg


--------------------------------------------------------------------------------
/tests/images/clouds/IMG_20190903_193537.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/clouds/IMG_20190903_193537.jpg


--------------------------------------------------------------------------------
/tests/py_image_dedup.yaml:
--------------------------------------------------------------------------------
 1 | py_image_dedup:
 2 |   dry_run: true
 3 |   analysis:
 4 |     across_dirs: true
 5 |     file_extensions:
 6 |       - .png
 7 |       - .jpg
 8 |       - .jpeg
 9 |     recursive: true
10 |     source_directories:
11 |       - ./images/
12 |     threads: 8
13 |     use_exif_data: true
14 |   deduplication:
15 |     # duplicates_target_directory:
16 |     max_file_modification_time_diff: 0:01:40
17 |     prioritization_rules:
18 |       - name: "more-exif-data"
19 |       - name: "less-exif-data"
20 |       - name: "bigger-file-size"
21 |       - name: "smaller-file-size"
22 |       - name: "newer-file-modification-date"
23 |       - name: "older-file-modification-date"
24 |       - name: "smaller-distance"
25 |       - name: "bigger-distance"
26 |       - name: "longer-path"
27 |       - name: "shorter-path"
28 |       - name: "contains-copy-in-file-name"
29 |       - name: "longer-file-name"
30 |       - name: "shorter-file-name"
31 |       - name: "longer-folder-path"
32 |       - name: "shorter-folder-path"
33 |       - name: "higher-score"
34 |       - name: "lower-score"
35 | 
36 |   elasticsearch:
37 |     auto_create_index: true
38 |     host: 127.0.0.1
39 |     max_distance: 0.1
40 |   remove_empty_folders: false
41 | 
42 | 


--------------------------------------------------------------------------------
/tests/test_file_extension.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | 
 3 | from py_image_dedup.util.file import file_has_extension
 4 | from tests import TestBase
 5 | 
 6 | 
 7 | class FileExtensionTest(TestBase):
 8 | 
 9 |     def test_png(self):
10 |         paths = [
11 |             "file.png",
12 |             "file.PNG"
13 |         ]
14 |         for path in paths:
15 |             path = Path(path)
16 |             self.assertTrue(file_has_extension(path, [".png"]))
17 | 


--------------------------------------------------------------------------------
/tests/test_select_images_to_delete.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import unittest
  3 | from random import shuffle
  4 | from random import uniform
  5 | 
  6 | from py_image_dedup.persistence.metadata_key import MetadataKey
  7 | from tests import TestBase
  8 | 
  9 | 
 10 | class SelectImagesToDeleteTest(TestBase):
 11 | 
 12 |     def test_select_images_to_delete__filter_max_mod_time_diff(self):
 13 |         keep = [
 14 |             self._create_default_candidate(modification_date=1500),
 15 |             # file modification time is too far apart from the optimal candidate for these
 16 |             self._create_default_candidate(modification_date=1001),
 17 |         ]
 18 | 
 19 |         max_delta_seconds = int(self.config.MAX_FILE_MODIFICATION_TIME_DELTA.value.total_seconds())
 20 |         for i in range(50):
 21 |             c = self._create_default_candidate(modification_date=random.choice(range(0, 1500 - max_delta_seconds)))
 22 |             keep.append(c)
 23 | 
 24 |         dont_keep = [
 25 |             self._create_default_candidate(modification_date=1450)
 26 |         ]
 27 | 
 28 |         self._run_test(keep, dont_keep)
 29 | 
 30 |     def test_select_images_to_delete__contains_copy(self):
 31 |         keep = [self._create_default_candidate(path="C:/1.jpg")]
 32 | 
 33 |         dont_keep = []
 34 |         for i in range(50):
 35 |             c = self._create_default_candidate(path=f"C:/1{i}-Copy.jpg")
 36 |             dont_keep.append(c)
 37 | 
 38 |         self._run_test(keep, dont_keep)
 39 | 
 40 |     def test_select_images_to_delete__newer_and_bigger(self):
 41 |         keep = [self._create_default_candidate(filesize=100, modification_date=100)]
 42 | 
 43 |         dont_keep = []
 44 |         for i in range(50):
 45 |             c = self._create_default_candidate(filesize=i, modification_date=i)
 46 |             dont_keep.append(c)
 47 | 
 48 |         self._run_test(keep, dont_keep)
 49 | 
 50 |     def test_select_images_to_delete__newer(self):
 51 |         keep = [self._create_default_candidate(modification_date=100)]
 52 | 
 53 |         dont_keep = []
 54 |         for i in range(50):
 55 |             c = self._create_default_candidate(modification_date=i)
 56 |             dont_keep.append(c)
 57 | 
 58 |         self._run_test(keep, dont_keep)
 59 | 
 60 |     def test_select_images_to_delete__bigger(self):
 61 |         keep = [self._create_default_candidate(filesize=100)]
 62 | 
 63 |         dont_keep = []
 64 |         for i in range(50):
 65 |             c = self._create_default_candidate(filesize=i)
 66 |             dont_keep.append(c)
 67 | 
 68 |         self._run_test(keep, dont_keep)
 69 | 
 70 |     def test_select_images_to_delete__all_the_same(self):
 71 |         keep = [self._create_default_candidate(path="C:/00000.jpg")]
 72 | 
 73 |         dont_keep = []
 74 |         for i in range(50):
 75 |             c = self._create_default_candidate(path=f"C:/1{i}.jpg")
 76 |             dont_keep.append(c)
 77 | 
 78 |         self._run_test(keep, dont_keep)
 79 | 
 80 |     def test_select_images_to_delete__all_the_same_2(self):
 81 |         keep = [self._create_default_candidate(path="C:/50-edited.jpg")]
 82 | 
 83 |         dont_keep = []
 84 |         for i in range(50):
 85 |             c = self._create_default_candidate(path=f"C:/{i}.jpg")
 86 |             dont_keep.append(c)
 87 | 
 88 |         self._run_test(keep, dont_keep)
 89 | 
 90 |     def test_select_images_to_delete__higher_score(self):
 91 |         keep = [self._create_default_candidate(score=100)]
 92 | 
 93 |         dont_keep = []
 94 |         for i in range(50):
 95 |             c = self._create_default_candidate()
 96 |             dont_keep.append(c)
 97 | 
 98 |         self._run_test(keep, dont_keep)
 99 | 
100 |     def test_select_images_to_delete__lower_dist(self):
101 |         keep = [self._create_default_candidate(dist=0)]
102 | 
103 |         dont_keep = []
104 |         for i in range(50):
105 |             c = self._create_default_candidate(dist=uniform(0.1, 1.0))
106 |             dont_keep.append(c)
107 | 
108 |         self._run_test(keep, dont_keep)
109 | 
110 |     def test_select_images_to_delete__real_example(self):
111 |         keep = [self._create_default_candidate(
112 |             path=r"M:\Fotos\Markus\Google Photos Archiv\Takeout\Google Photos\2017-06-17\20170617_153437.jpg",
113 |             filesize=10000000, modification_date=1)]
114 | 
115 |         dont_keep = []
116 |         for i in range(50):
117 |             c = self._create_default_candidate(
118 |                 path=r"M:\Fotos\Iris\Syncthing\Telegram Empfangen\223023133_644761%i.jpg" % i,
119 |                 filesize=270000, modification_date=2)
120 |             dont_keep.append(c)
121 | 
122 |         self._run_test(keep, dont_keep)
123 | 
124 |     def _run_test(
125 |         self, keep: [{}], dont_keep: [{}], test_reversed_order: bool = True,
126 |         test_random_input_order: bool = True
127 |     ):
128 |         candidates = keep + dont_keep
129 | 
130 |         kept, not_kept = self.under_test._select_images_to_delete(candidates)
131 |         self._test_result_outcome(kept, not_kept, keep, dont_keep)
132 | 
133 |         if test_reversed_order:
134 |             kept, not_kept = self.under_test._select_images_to_delete(reversed(candidates))
135 |             self._test_result_outcome(kept, not_kept, keep, dont_keep)
136 | 
137 |         if test_random_input_order:
138 |             # test random sort orders of input just to be sure
139 |             for i in range(50):
140 |                 shuffle(candidates)
141 |                 kept, not_kept = self.under_test._select_images_to_delete(candidates)
142 |                 self._test_result_outcome(kept, not_kept, keep, dont_keep)
143 | 
144 |     def _test_result_outcome(self, kept, not_kept, keep: [{}], dont_keep: [{}]):
145 |         for c in keep:
146 |             self.assertIn(c, kept)
147 |         for c in dont_keep:
148 |             self.assertIn(c, not_kept)
149 | 
150 |     @staticmethod
151 |     def _create_default_candidate(
152 |         path: str = "C:/test", dist: float = 0.05, filesize: int = 100,
153 |         modification_date: int = 1, pixel_count: int = 10000, exif_tags: {} = {},
154 |         score: int = 64
155 |     ) -> {}:
156 |         return {
157 |             MetadataKey.PATH.value: path,
158 |             MetadataKey.DISTANCE.value: dist,
159 |             MetadataKey.METADATA.value: {
160 |                 MetadataKey.FILE_SIZE.value: filesize,
161 |                 MetadataKey.FILE_MODIFICATION_DATE.value: modification_date,
162 |                 MetadataKey.PIXELCOUNT.value: pixel_count,
163 |                 MetadataKey.EXIF_DATA.value: exif_tags
164 |             },
165 |             MetadataKey.SCORE.value: score
166 |         }
167 | 
168 | 
# Run the tests of this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
171 | 


--------------------------------------------------------------------------------