├── .codespellignore
├── .dockerignore
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── config.yml
│   ├── dependabot.yml
│   ├── reaction.yml
│   ├── release-drafter.yml
│   ├── stale.yml
│   └── workflows
│       ├── codeql-analysis.yml
│       ├── codespell.yml
│       ├── dockerimage-latest.yml
│       ├── dockerimage-release.yml
│       └── dockerimage-test.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── docker-compose.yml
├── docker
│   └── entrypoint.sh
├── example.py
├── poetry.lock
├── py_image_dedup
│   ├── __init__.py
│   ├── cli.py
│   ├── config.py
│   ├── library
│   │   ├── __init__.py
│   │   ├── deduplication_result.py
│   │   ├── deduplicator.py
│   │   ├── file_watch.py
│   │   ├── processing_manager.py
│   │   └── progress_manager.py
│   ├── persistence
│   │   ├── __init__.py
│   │   ├── elasticsearchstorebackend.py
│   │   └── metadata_key.py
│   ├── stats.py
│   └── util
│       ├── __init__.py
│       ├── file.py
│       └── image.py
├── py_image_dedup_reference.yaml
├── pyproject.toml
├── pytest.ini
└── tests
    ├── __init__.py
    ├── images
    │   ├── bottles
    │   │   ├── IMG_20190903_193151-edited.jpg
    │   │   ├── IMG_20190903_193151-grayscale.jpg
    │   │   ├── IMG_20190903_193151-telegram-compression.jpg
    │   │   └── IMG_20190903_193151.jpg
    │   ├── building
    │   │   ├── IMG_20190903_193508-edited.jpg
    │   │   ├── IMG_20190903_193508-grayscale.jpg
    │   │   ├── IMG_20190903_193508-telegram-compression.jpg
    │   │   └── IMG_20190903_193508.jpg
    │   └── clouds
    │       ├── IMG_20190903_193537-edited.jpg
    │       ├── IMG_20190903_193537-grayscale.jpg
    │       ├── IMG_20190903_193537-telegram-compression.jpg
    │       └── IMG_20190903_193537.jpg
    ├── py_image_dedup.yaml
    ├── test_file_extension.py
    └── test_select_images_to_delete.py
/.codespellignore:
--------------------------------------------------------------------------------
1 | Archiv
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | .github
2 | .pytest_cache
3 | duplicates
4 | mnt
5 | venv
6 | *.yaml
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [markusressel]
4 | # patreon: # Replace with a single Patreon username
5 | # open_collective: # Replace with a single Open Collective username
6 | # ko_fi: # Replace with a single Ko-fi username
7 | # tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | # community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | # liberapay: TheAlgorithms
10 | # issuehunt: # Replace with a single IssueHunt username
11 | # otechie: # Replace with a single Otechie username
12 | # custom: ['https://paypal.me/markusressel/1']
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 |
5 | ---
6 |
7 | **Describe the bug**
8 | A clear and concise description of what the bug is.
9 |
10 | **To Reproduce**
11 | Steps to reproduce the behavior:
12 | 1. Go to '...'
13 | 2. Click on '....'
14 | 3. Scroll down to '....'
15 | 4. See error
16 |
17 | **Expected behavior**
18 | A clear and concise description of what you expected to happen.
19 |
20 | **Screenshots**
21 | If applicable, add screenshots to help explain your problem.
22 |
23 | **Desktop (please complete the following information):**
24 | - OS: [e.g. Linux]
25 |
26 | **Additional context**
27 | Add any other context about the problem here.
28 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 |
5 | ---
6 |
7 | **Is your feature request related to a problem? Please describe.**
8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
9 |
10 | **Describe the solution you'd like**
11 | A clear and concise description of what you want to happen.
12 |
13 | **Describe alternatives you've considered**
14 | A clear and concise description of any alternative solutions or features you've considered.
15 |
16 | **Additional context**
17 | Add any other context or screenshots about the feature request here.
18 |
--------------------------------------------------------------------------------
/.github/config.yml:
--------------------------------------------------------------------------------
1 | # Configuration for new-issue-welcome - https://github.com/behaviorbot/new-issue-welcome
2 |
3 | # Comment to be posted on first-time issues
4 | newIssueWelcomeComment: >
5 |   Thanks for opening your first issue here! :tada:
6 |
7 | # Configuration for new-pr-welcome - https://github.com/behaviorbot/new-pr-welcome
8 |
9 | # Comment to be posted on PRs from first-time contributors in your repository
10 | newPRWelcomeComment: >
11 |   Thanks for opening this pull request! :nerd_face:
12 |
13 | # Configuration for first-pr-merge - https://github.com/behaviorbot/first-pr-merge
14 |
15 | # Comment to be posted on pull requests merged by a first-time user
16 | firstPRMergeComment: >
17 |   Congrats on merging your first pull request here! You should be proud of yourself :1st_place_medal:
18 |
19 |
20 | # It is recommended to include as many gifs and emojis as possible
21 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | registries:
3 |   python-index-pypi-python-org-simple:
4 |     type: python-index
5 |     url: https://pypi.python.org/simple/
6 |     username: "${{secrets.PYTHON_INDEX_PYPI_PYTHON_ORG_SIMPLE_USERNAME}}"
7 |     password: "${{secrets.PYTHON_INDEX_PYPI_PYTHON_ORG_SIMPLE_PASSWORD}}"
8 |
9 | updates:
10 |   - package-ecosystem: github-actions
11 |     directory: "/"
12 |     schedule:
13 |       # Check for updates to GitHub Actions every week
14 |       interval: "weekly"
15 |   - package-ecosystem: pip
16 |     insecure-external-code-execution: allow
17 |     directory: "/"
18 |     schedule:
19 |       interval: daily
20 |       time: "16:00"
21 |       timezone: Europe/Berlin
22 |     open-pull-requests-limit: 10
23 |     registries:
24 |       - python-index-pypi-python-org-simple
25 |
--------------------------------------------------------------------------------
/.github/reaction.yml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
1 | categories:
2 |   - title: 🚀 Features and ✨ Enhancements
3 |     label: enhancement
4 |   - title: 🐛 Bugfixes
5 |     label: bug
6 | change-template: "* $TITLE (#$NUMBER) by @$AUTHOR"
7 | template: |
8 |   ## What’s Changed
9 |
10 |   $CHANGES
11 |
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Configuration for probot-stale - https://github.com/probot/stale
2 |
3 | # Number of days of inactivity before an Issue or Pull Request becomes stale
4 | daysUntilStale: 60
5 |
6 | # Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
7 | # Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
8 | daysUntilClose: 14
9 |
10 | # Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
11 | exemptLabels:
12 | - pinned
13 | - security
14 | - bug
15 | - enhancement
16 |
17 | # Set to true to ignore issues in a project (defaults to false)
18 | exemptProjects: false
19 |
20 | # Set to true to ignore issues in a milestone (defaults to false)
21 | exemptMilestones: false
22 |
23 | # Set to true to ignore issues with an assignee (defaults to false)
24 | exemptAssignees: true
25 |
26 | # Label to use when marking as stale
27 | staleLabel: wontfix
28 |
29 | # Comment to post when marking as stale. Set to `false` to disable
30 | markComment: >
31 |   This issue has been automatically marked as stale because it has not had
32 |   recent activity. It will be closed if no further activity occurs. Thank you
33 |   for your contributions.
34 |
35 | # Comment to post when removing the stale label.
36 | # unmarkComment: >
37 | # Your comment here.
38 |
39 | # Comment to post when closing a stale Issue or Pull Request.
40 | closeComment: >
41 |   There has been no initiative from contributors or maintainers to revive this stale issue, so it will now be closed.
42 |
43 | # Limit the number of actions per hour, from 1-30. Default is 30
44 | limitPerRun: 30
45 |
46 | # Limit to only `issues` or `pulls`
47 | only: issues
48 |
49 | # Optionally, specify configuration settings that are specific to just 'issues' or 'pulls':
50 | # pulls:
51 | # daysUntilStale: 30
52 | # markComment: >
53 | # This pull request has been automatically marked as stale because it has not had
54 | # recent activity. It will be closed if no further activity occurs. Thank you
55 | # for your contributions.
56 |
57 | # issues:
58 | # exemptLabels:
59 | # - confirmed
60 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | name: "CodeQL"
2 |
3 | on:
4 |   push:
5 |     branches: [master]
6 |   pull_request:
7 |     # The branches below must be a subset of the branches above
8 |     branches: [master]
9 |   schedule:
10 |     - cron: '0 1 * * 4'
11 |
12 | jobs:
13 |   analyze:
14 |     name: Analyze
15 |     runs-on: ubuntu-latest
16 |
17 |     steps:
18 |       - name: Checkout repository
19 |         uses: actions/checkout@v4
20 |         with:
21 |           # We must fetch at least the immediate parents so that if this is
22 |           # a pull request then we can checkout the head.
23 |           fetch-depth: 2
24 |
25 |       # If this run was triggered by a pull request event, then checkout
26 |       # the head of the pull request instead of the merge commit.
27 |       - run: git checkout HEAD^2
28 |         if: ${{ github.event_name == 'pull_request' }}
29 |
30 |       # Initializes the CodeQL tools for scanning.
31 |       - name: Initialize CodeQL
32 |         uses: github/codeql-action/init@v3
33 |         # Override language selection by uncommenting this and choosing your languages
34 |         # with:
35 |         #   languages: go, javascript, csharp, python, cpp, java
36 |
37 |       # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
38 |       # If this step fails, then you should remove it and run the build manually (see below)
39 |       - name: Autobuild
40 |         uses: github/codeql-action/autobuild@v3
41 |
42 |       # ℹ️ Command-line programs to run using the OS shell.
43 |       # 📚 https://git.io/JvXDl
44 |
45 |       # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
46 |       #    and modify them (or add more) to build your code if your project
47 |       #    uses a compiled language
48 |
49 |       #- run: |
50 |       #    make bootstrap
51 |       #    make release
52 |
53 |       - name: Perform CodeQL Analysis
54 |         uses: github/codeql-action/analyze@v3
55 |
--------------------------------------------------------------------------------
/.github/workflows/codespell.yml:
--------------------------------------------------------------------------------
1 | # Codespell configuration is within pyproject.toml
2 | ---
3 | name: Codespell
4 |
5 | on:
6 |   push:
7 |     branches: [ master ]
8 |   pull_request:
9 |     branches: [ master ]
10 |
11 | permissions:
12 |   contents: read
13 |
14 | jobs:
15 |   codespell:
16 |     name: Check for spelling errors
17 |     runs-on: ubuntu-latest
18 |
19 |     steps:
20 |       - name: Checkout
21 |         uses: actions/checkout@v4
22 |       - name: Annotate locations with typos
23 |         uses: codespell-project/codespell-problem-matcher@v1
24 |       - name: Codespell
25 |         uses: codespell-project/actions-codespell@v2
26 |         with:
27 |           ignore_words_file: .codespellignore
28 |           skip: "*.svg"
--------------------------------------------------------------------------------
/.github/workflows/dockerimage-latest.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image latest
2 |
3 | on:
4 |   push:
5 |     branches: [ master ]
6 |
7 | jobs:
8 |   build:
9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: actions/checkout@v4
12 |       - name: Build the Docker image
13 |         run: docker build . --file Dockerfile --tag markusressel/py-image-dedup:latest
14 |       - name: Login to DockerHub Registry
15 |         run: echo ${{ secrets.DOCKERHUB_PASSWORD }} | docker login -u ${{ secrets.DOCKERHUB_USERNAME }} --password-stdin
16 |       - name: Push the Docker image
17 |         if: github.ref_name == 'master'
18 |         run: docker push markusressel/py-image-dedup:latest
19 |
20 |   dockerHubDescription:
21 |     runs-on: ubuntu-latest
22 |     steps:
23 |       - uses: actions/checkout@v4
24 |       - name: Docker Hub Description
25 |         uses: peter-evans/dockerhub-description@v4.0.2
26 |         env:
27 |           DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
28 |           DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
29 |           DOCKERHUB_REPOSITORY: markusressel/py-image-dedup
30 |
--------------------------------------------------------------------------------
/.github/workflows/dockerimage-release.yml:
--------------------------------------------------------------------------------
1 | name: Docker Image Release
2 |
3 | on:
4 |   push:
5 |     tags:
6 |       - "*.*.*"
7 |
8 | jobs:
9 |   build:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       # this writes the tag name into GIT_TAG_NAME
14 |       - name: Get tag name
15 |         uses: little-core-labs/get-git-tag@v3.0.2
16 |       - name: Build the Docker image
17 |         run: docker build . --file Dockerfile --tag markusressel/py-image-dedup:$GIT_TAG_NAME
18 |       - name: Login to DockerHub Registry
19 |         run: echo ${{ secrets.DOCKERHUB_PASSWORD }} | docker login -u ${{ secrets.DOCKERHUB_USERNAME }} --password-stdin
20 |       - name: Push the Docker image
21 |         run: docker push markusressel/py-image-dedup:$GIT_TAG_NAME
22 |
--------------------------------------------------------------------------------
/.github/workflows/dockerimage-test.yml:
--------------------------------------------------------------------------------
1 | name: Test Docker Image
2 |
3 | on:
4 |   pull_request:
5 |     # The branches below must be a subset of the branches above
6 |     branches: [ master ]
7 |
8 | jobs:
9 |   test:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - name: Build the Docker image
14 |         run: docker build . --file Dockerfile --tag markusressel/py-image-dedup:latest
15 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### VirtualEnv template
3 | # Virtualenv
4 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
5 | .Python
6 | [Bb]in
7 | [Ii]nclude
8 | [Ll]ib
9 | [Ll]ib64
10 | [Ll]ocal
11 | [Ss]cripts
12 | pyvenv.cfg
13 | .venv
14 | pip-selfcheck.json
15 | ### JetBrains template
16 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
17 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
18 |
19 | # User-specific stuff:
20 | .idea/**/workspace.xml
21 | .idea/**/tasks.xml
22 | .idea/dictionaries
23 |
24 | # Sensitive or high-churn files:
25 | .idea/**/dataSources/
26 | .idea/**/dataSources.ids
27 | .idea/**/dataSources.xml
28 | .idea/**/dataSources.local.xml
29 | .idea/**/sqlDataSources.xml
30 | .idea/**/dynamic.xml
31 | .idea/**/uiDesigner.xml
32 |
33 | # Gradle:
34 | .idea/**/gradle.xml
35 | .idea/**/libraries
36 |
37 | # CMake
38 | cmake-build-debug/
39 | cmake-build-release/
40 |
41 | # Mongo Explorer plugin:
42 | .idea/**/mongoSettings.xml
43 |
44 | ## File-based project format:
45 | *.iws
46 |
47 | ## Plugin-specific files:
48 |
49 | # IntelliJ
50 | out/
51 |
52 | # mpeltonen/sbt-idea plugin
53 | .idea_modules/
54 |
55 | # JIRA plugin
56 | atlassian-ide-plugin.xml
57 |
58 | # Cursive Clojure plugin
59 | .idea/replstate.xml
60 |
61 | # Crashlytics plugin (for Android Studio and IntelliJ)
62 | com_crashlytics_export_strings.xml
63 | crashlytics.properties
64 | crashlytics-build.properties
65 | fabric.properties
66 | ### Python template
67 | # Byte-compiled / optimized / DLL files
68 | __pycache__/
69 | *.py[cod]
70 | *$py.class
71 |
72 | # C extensions
73 | *.so
74 |
75 | # Distribution / packaging
76 | .Python
77 | build/
78 | develop-eggs/
79 | dist/
80 | downloads/
81 | eggs/
82 | .eggs/
83 | lib/
84 | lib64/
85 | parts/
86 | sdist/
87 | var/
88 | wheels/
89 | *.egg-info/
90 | .installed.cfg
91 | *.egg
92 | MANIFEST
93 |
94 | # PyInstaller
95 | # Usually these files are written by a python script from a template
96 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
97 | *.manifest
98 | *.spec
99 |
100 | # Installer logs
101 | pip-log.txt
102 | pip-delete-this-directory.txt
103 |
104 | # Unit test / coverage reports
105 | htmlcov/
106 | .tox/
107 | .coverage
108 | .coverage.*
109 | .cache
110 | nosetests.xml
111 | coverage.xml
112 | *.cover
113 | .hypothesis/
114 |
115 | # Translations
116 | *.mo
117 | *.pot
118 |
119 | # Django stuff:
120 | *.log
121 | .static_storage/
122 | .media/
123 | local_settings.py
124 |
125 | # Flask stuff:
126 | instance/
127 | .webassets-cache
128 |
129 | # Scrapy stuff:
130 | .scrapy
131 |
132 | # Sphinx documentation
133 | docs/_build/
134 |
135 | # PyBuilder
136 | target/
137 |
138 | # Jupyter Notebook
139 | .ipynb_checkpoints
140 |
141 | # pyenv
142 | .python-version
143 |
144 | # celery beat schedule file
145 | celerybeat-schedule
146 |
147 | # SageMath parsed files
148 | *.sage.py
149 |
150 | # Environments
151 | .env
152 | .venv
153 | env/
154 | venv/
155 | ENV/
156 | env.bak/
157 | venv.bak/
158 |
159 | # Spyder project settings
160 | .spyderproject
161 | .spyproject
162 |
163 | # Rope project settings
164 | .ropeproject
165 |
166 | # mkdocs documentation
167 | /site
168 |
169 | # mypy
170 | .mypy_cache/
171 |
172 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # don't use Alpine for Python builds: https://pythonspeed.com/articles/alpine-docker-python/
2 | FROM python:3.11-slim-buster
3 |
4 | ENV PYTHONUNBUFFERED=1
5 | ENV POETRY_VERSION="2.1.2"
6 | ENV PIP_DISABLE_PIP_VERSION_CHECK=on
7 |
8 | RUN apt-get update \
9 |     && apt-get -y install sudo git python-skimage
10 |
11 | WORKDIR /app
12 |
13 | COPY . .
14 |
15 | COPY poetry.lock pyproject.toml ./
16 |
17 | RUN apt-get update && \
18 |     apt-get install -y libatlas-base-dev gfortran && \
19 |     apt-get clean && rm -rf /var/lib/apt/lists/*
20 |
21 | RUN pip install "poetry==$POETRY_VERSION" \
22 |     && POETRY_VIRTUALENVS_CREATE=false poetry install --without dev \
23 |     && pip uninstall -y poetry
24 |
25 | ENV PUID=1000 PGID=1000
26 |
27 | ENTRYPOINT [ "docker/entrypoint.sh", "py-image-dedup" ]
28 | CMD [ "daemon" ]
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU AFFERO GENERAL PUBLIC LICENSE
2 | Version 3, 19 November 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU Affero General Public License is a free, copyleft license for
11 | software and other kinds of works, specifically designed to ensure
12 | cooperation with the community in the case of network server software.
13 |
14 | The licenses for most software and other practical works are designed
15 | to take away your freedom to share and change the works. By contrast,
16 | our General Public Licenses are intended to guarantee your freedom to
17 | share and change all versions of a program--to make sure it remains free
18 | software for all its users.
19 |
20 | When we speak of free software, we are referring to freedom, not
21 | price. Our General Public Licenses are designed to make sure that you
22 | have the freedom to distribute copies of free software (and charge for
23 | them if you wish), that you receive source code or can get it if you
24 | want it, that you can change the software or use pieces of it in new
25 | free programs, and that you know you can do these things.
26 |
27 | Developers that use our General Public Licenses protect your rights
28 | with two steps: (1) assert copyright on the software, and (2) offer
29 | you this License which gives you legal permission to copy, distribute
30 | and/or modify the software.
31 |
32 | A secondary benefit of defending all users' freedom is that
33 | improvements made in alternate versions of the program, if they
34 | receive widespread use, become available for other developers to
35 | incorporate. Many developers of free software are heartened and
36 | encouraged by the resulting cooperation. However, in the case of
37 | software used on network servers, this result may fail to come about.
38 | The GNU General Public License permits making a modified version and
39 | letting the public access it on a server without ever releasing its
40 | source code to the public.
41 |
42 | The GNU Affero General Public License is designed specifically to
43 | ensure that, in such cases, the modified source code becomes available
44 | to the community. It requires the operator of a network server to
45 | provide the source code of the modified version running there to the
46 | users of that server. Therefore, public use of a modified version, on
47 | a publicly accessible server, gives the public access to the source
48 | code of the modified version.
49 |
50 | An older license, called the Affero General Public License and
51 | published by Affero, was designed to accomplish similar goals. This is
52 | a different license, not a version of the Affero GPL, but Affero has
53 | released a new version of the Affero GPL which permits relicensing under
54 | this license.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | TERMS AND CONDITIONS
60 |
61 | 0. Definitions.
62 |
63 | "This License" refers to version 3 of the GNU Affero General Public License.
64 |
65 | "Copyright" also means copyright-like laws that apply to other kinds of
66 | works, such as semiconductor masks.
67 |
68 | "The Program" refers to any copyrightable work licensed under this
69 | License. Each licensee is addressed as "you". "Licensees" and
70 | "recipients" may be individuals or organizations.
71 |
72 | To "modify" a work means to copy from or adapt all or part of the work
73 | in a fashion requiring copyright permission, other than the making of an
74 | exact copy. The resulting work is called a "modified version" of the
75 | earlier work or a work "based on" the earlier work.
76 |
77 | A "covered work" means either the unmodified Program or a work based
78 | on the Program.
79 |
80 | To "propagate" a work means to do anything with it that, without
81 | permission, would make you directly or secondarily liable for
82 | infringement under applicable copyright law, except executing it on a
83 | computer or modifying a private copy. Propagation includes copying,
84 | distribution (with or without modification), making available to the
85 | public, and in some countries other activities as well.
86 |
87 | To "convey" a work means any kind of propagation that enables other
88 | parties to make or receive copies. Mere interaction with a user through
89 | a computer network, with no transfer of a copy, is not conveying.
90 |
91 | An interactive user interface displays "Appropriate Legal Notices"
92 | to the extent that it includes a convenient and prominently visible
93 | feature that (1) displays an appropriate copyright notice, and (2)
94 | tells the user that there is no warranty for the work (except to the
95 | extent that warranties are provided), that licensees may convey the
96 | work under this License, and how to view a copy of this License. If
97 | the interface presents a list of user commands or options, such as a
98 | menu, a prominent item in the list meets this criterion.
99 |
100 | 1. Source Code.
101 |
102 | The "source code" for a work means the preferred form of the work
103 | for making modifications to it. "Object code" means any non-source
104 | form of a work.
105 |
106 | A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 |
111 | The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form. A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 |
122 | The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities. However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work. For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 |
135 | The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 |
139 | The Corresponding Source for a work in source code form is that
140 | same work.
141 |
142 | 2. Basic Permissions.
143 |
144 | All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met. This License explicitly affirms your unlimited
147 | permission to run the unmodified Program. The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work. This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 |
152 | You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force. You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright. Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 |
163 | Conveying under any other circumstances is permitted solely under
164 | the conditions stated below. Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 |
167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 |
169 | No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 |
175 | When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 |
183 | 4. Conveying Verbatim Copies.
184 |
185 | You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 |
193 | You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 |
196 | 5. Conveying Modified Source Versions.
197 |
198 | You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 |
202 | a) The work must carry prominent notices stating that you modified
203 | it, and giving a relevant date.
204 |
205 | b) The work must carry prominent notices stating that it is
206 | released under this License and any conditions added under section
207 | 7. This requirement modifies the requirement in section 4 to
208 | "keep intact all notices".
209 |
210 | c) You must license the entire work, as a whole, under this
211 | License to anyone who comes into possession of a copy. This
212 | License will therefore apply, along with any applicable section 7
213 | additional terms, to the whole of the work, and all its parts,
214 | regardless of how they are packaged. This License gives no
215 | permission to license the work in any other way, but it does not
216 | invalidate such permission if you have separately received it.
217 |
218 | d) If the work has interactive user interfaces, each must display
219 | Appropriate Legal Notices; however, if the Program has interactive
220 | interfaces that do not display Appropriate Legal Notices, your
221 | work need not make them do so.
222 |
223 | A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit. Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 |
233 | 6. Conveying Non-Source Forms.
234 |
235 | You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 |
240 | a) Convey the object code in, or embodied in, a physical product
241 | (including a physical distribution medium), accompanied by the
242 | Corresponding Source fixed on a durable physical medium
243 | customarily used for software interchange.
244 |
245 | b) Convey the object code in, or embodied in, a physical product
246 | (including a physical distribution medium), accompanied by a
247 | written offer, valid for at least three years and valid for as
248 | long as you offer spare parts or customer support for that product
249 | model, to give anyone who possesses the object code either (1) a
250 | copy of the Corresponding Source for all the software in the
251 | product that is covered by this License, on a durable physical
252 | medium customarily used for software interchange, for a price no
253 | more than your reasonable cost of physically performing this
254 | conveying of source, or (2) access to copy the
255 | Corresponding Source from a network server at no charge.
256 |
257 | c) Convey individual copies of the object code with a copy of the
258 | written offer to provide the Corresponding Source. This
259 | alternative is allowed only occasionally and noncommercially, and
260 | only if you received the object code with such an offer, in accord
261 | with subsection 6b.
262 |
263 | d) Convey the object code by offering access from a designated
264 | place (gratis or for a charge), and offer equivalent access to the
265 | Corresponding Source in the same way through the same place at no
266 | further charge. You need not require recipients to copy the
267 | Corresponding Source along with the object code. If the place to
268 | copy the object code is a network server, the Corresponding Source
269 | may be on a different server (operated by you or a third party)
270 | that supports equivalent copying facilities, provided you maintain
271 | clear directions next to the object code saying where to find the
272 | Corresponding Source. Regardless of what server hosts the
273 | Corresponding Source, you remain obligated to ensure that it is
274 | available for as long as needed to satisfy these requirements.
275 |
276 | e) Convey the object code using peer-to-peer transmission, provided
277 | you inform other peers where the object code and Corresponding
278 | Source of the work are being offered to the general public at no
279 | charge under subsection 6d.
280 |
281 | A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 |
285 | A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling. In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage. For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product. A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 |
298 | "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source. The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 |
306 | If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information. But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 |
317 | The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed. Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 |
325 | Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 |
331 | 7. Additional Terms.
332 |
333 | "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law. If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 |
342 | When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it. (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.) You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 |
349 | Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 |
353 | a) Disclaiming warranty or limiting liability differently from the
354 | terms of sections 15 and 16 of this License; or
355 |
356 | b) Requiring preservation of specified reasonable legal notices or
357 | author attributions in that material or in the Appropriate Legal
358 | Notices displayed by works containing it; or
359 |
360 | c) Prohibiting misrepresentation of the origin of that material, or
361 | requiring that modified versions of such material be marked in
362 | reasonable ways as different from the original version; or
363 |
364 | d) Limiting the use for publicity purposes of names of licensors or
365 | authors of the material; or
366 |
367 | e) Declining to grant rights under trademark law for use of some
368 | trade names, trademarks, or service marks; or
369 |
370 | f) Requiring indemnification of licensors and authors of that
371 | material by anyone who conveys the material (or modified versions of
372 | it) with contractual assumptions of liability to the recipient, for
373 | any liability that these contractual assumptions directly impose on
374 | those licensors and authors.
375 |
376 | All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10. If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term. If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 |
386 | If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 |
391 | Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 |
395 | 8. Termination.
396 |
397 | You may not propagate or modify a covered work except as expressly
398 | provided under this License. Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 |
403 | However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 |
410 | Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 |
417 | Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License. If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 |
423 | 9. Acceptance Not Required for Having Copies.
424 |
425 | You are not required to accept this License in order to receive or
426 | run a copy of the Program. Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance. However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work. These actions infringe copyright if you do
431 | not accept this License. Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 |
434 | 10. Automatic Licensing of Downstream Recipients.
435 |
436 | Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License. You are not responsible
439 | for enforcing compliance by third parties with this License.
440 |
441 | An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations. If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 |
451 | You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License. For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 |
459 | 11. Patents.
460 |
461 | A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based. The
463 | work thus licensed is called the contributor's "contributor version".
464 |
465 | A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version. For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 |
475 | Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 |
480 | In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement). To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 |
487 | If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients. "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 |
501 | If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 |
509 | A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License. You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 |
524 | Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 |
528 | 12. No Surrender of Others' Freedom.
529 |
530 | If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License. If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all. For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 |
540 | 13. Remote Network Interaction; Use with the GNU General Public License.
541 |
542 | Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software. This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 |
553 | Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work. The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 |
561 | 14. Revised Versions of this License.
562 |
563 | The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time. Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 |
568 | Each version is given a distinguishing version number. If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation. If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 |
577 | If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 |
582 | Later license versions may give you additional or different
583 | permissions. However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 |
587 | 15. Disclaimer of Warranty.
588 |
589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 |
598 | 16. Limitation of Liability.
599 |
600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 |
610 | 17. Interpretation of Sections 15 and 16.
611 |
612 | If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 |
619 | END OF TERMS AND CONDITIONS
620 |
621 | How to Apply These Terms to Your New Programs
622 |
623 | If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 |
627 | To do so, attach the following notices to the program. It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 |
632 | <one line to give the program's name and a brief idea of what it does.>
633 | Copyright (C) <year>  <name of author>
634 |
635 | This program is free software: you can redistribute it and/or modify
636 | it under the terms of the GNU Affero General Public License as published
637 | by the Free Software Foundation, either version 3 of the License, or
638 | (at your option) any later version.
639 |
640 | This program is distributed in the hope that it will be useful,
641 | but WITHOUT ANY WARRANTY; without even the implied warranty of
642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643 | GNU Affero General Public License for more details.
644 |
645 | You should have received a copy of the GNU Affero General Public License
646 | along with this program. If not, see <https://www.gnu.org/licenses/>.
647 |
648 | Also add information on how to contact you by electronic and paper mail.
649 |
650 | If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source. For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code. There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 |
658 | You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <https://www.gnu.org/licenses/>.
662 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include poetry.lock
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: all docker clean test
2 |
3 | docker:
4 | sudo docker build . --file Dockerfile --tag markusressel/py-image-dedup:latest
5 |
6 | test:
7 | cd tests; pytest
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # py-image-dedup [](https://actions-badge.atrox.dev/markusressel/py-image-dedup/goto?ref=master) [](https://codeclimate.com/github/markusressel/py-image-dedup) [](https://badge.fury.io/py/py-image-dedup)
2 |
3 | **py-image-dedup** is a tool to sort out or remove duplicates within a photo library.
4 | Unlike most other solutions, **py-image-dedup**
5 | intentionally uses an approximate image comparison to also detect
6 | duplicates of images that slightly differ in resolution, color or other minor details.
7 |
8 | It is built upon [Image-Match](https://github.com/ascribe/image-match), a very popular library used to compute
9 | a pHash for an image and store the result in an Elasticsearch backend for very high scalability.
10 |
11 | [](https://asciinema.org/a/3WbBxMXnZyT1QnuTP9fm37wkS)
12 |
13 | # How it works
14 |
15 | ### Phase 1 - Database cleanup
16 |
17 | In the first phase the elasticsearch backend is checked against the
18 | current filesystem state, cleaning up database entries of files that
19 | no longer exist. This will speed up queries made later on.
20 |
21 | ### Phase 2 - Counting files
22 |
23 | Although not necessary for the deduplication process itself, it is very convenient
24 | to have some kind of progress indication while the deduplication process
25 | is at work. To be able to provide that, available files must be counted beforehand.
26 |
27 | ### Phase 3 - Analysing files
28 |
29 | In this phase every image file is analysed. This means generating a signature (pHash)
30 | that allows quick comparison with other images, and adding other metadata of the image
31 | to the elasticsearch backend that is used in the next phase.
32 |
33 | This phase is quite CPU intensive and the first run can take quite
34 | some time. Using as many threads as feasible (via the `-t` parameter)
35 | is advised to get the best performance.
36 |
37 | Since a previous version of a given file might already be in the database,
38 | its file modification time is compared to the stored one before the file
39 | is analysed. If the database content still seems to be correct, the signature
40 | for this file will **not** be recalculated. Because of this, subsequent
41 | runs will be much faster. Some file access still has to happen though,
42 | so the phase is probably limited by disk I/O.
43 |
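To illustrate the caching idea, here is a minimal sketch (an illustration only,
not the actual implementation) of such a modification-time check:

```python
from pathlib import Path
from typing import Optional


def needs_reanalysis(file_path: Path, stored_mtime: Optional[float]) -> bool:
    """Decide whether the signature has to be (re)calculated.

    stored_mtime is the modification timestamp persisted in the
    database for this file, or None if it was never analysed.
    """
    # if the stored modification time still matches the file on disk,
    # the persisted signature is assumed to be up to date
    return stored_mtime is None or file_path.stat().st_mtime != stored_mtime
```
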
44 | ### Phase 4 - Finding duplicates
45 |
46 | Every file is now processed again - but only by means of querying the
47 | database backend for similar images (within the given `max_dist`).
48 | If images are found that match the similarity criteria, they are considered
49 | duplicate candidates. All candidates are then ordered according to the `prioritization_rules`,
50 | which you can specify yourself in the configuration, see [Configuration](#Configuration).
51 |
52 | If you do not specify `prioritization_rules` yourself, the following order will
53 | be used:
54 |
55 | 1. pixel count (more is better)
56 | 1. EXIF data (more exif data is better)
57 | 1. file size (bigger is better)
58 | 1. file modification time (newer is better)
59 | 1. distance (lower is better)
60 | 1. filename contains "copy" (False is better)
61 | 1. filename length (longer is better) - (for "edited" versions)
62 | 1. parent folder path length (shorter is better)
63 | 1. score (higher is better)
64 |
65 | The first candidate in the resulting list is considered to be the best
66 | available version of all candidates.
67 |
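When using the Python API, this ordering can also be overridden programmatically;
a minimal sketch, mirroring `example.py` from this repo and using rule names as
defined in `py_image_dedup/config.py`:

```python
from py_image_dedup.config import DeduplicatorConfig

config = DeduplicatorConfig()
# prefer the newest file first, then fall back to pixel count
config.PRIORITIZATION_RULES.value = [
    {"name": "newer-file-modification-date"},
    {"name": "higher-pixel-count"},
]
```
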
68 | ### Phase 5 - Moving/Deleting duplicates
69 |
70 | All but the best version of duplicate candidates identified in the previous
71 | phase are now deleted from the file system (if you didn't specify `--dry-run` of course).
72 |
73 | If `duplicates_target_directory` is set, duplicates are moved into the
74 | specified root directory instead of being deleted, replicating their original
75 | folder structure below it.
76 |
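As a simplified illustration (not the exact implementation), replicating the
folder structure boils down to re-rooting each duplicate's path relative to its
source directory:

```python
from pathlib import Path


def target_path(duplicate: Path, source_root: Path, target_root: Path) -> Path:
    """Map e.g. /data/in/2019/img.jpg -> /data/out/2019/img.jpg."""
    return target_root / duplicate.relative_to(source_root)


assert target_path(
    Path("/data/in/2019/img.jpg"), Path("/data/in"), Path("/data/out")
) == Path("/data/out/2019/img.jpg")
```
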
77 | ### Phase 6 - Removing empty folders (Optional)
78 |
79 | In the last phase, folders that are empty due to the deduplication
80 | process are deleted, cleaning up the directory structure (if turned on in configuration).
81 |
82 | # How to use
83 |
84 | ## Install
85 |
86 | Install **py-image-dedup** using pip:
87 |
88 | ```shell
89 | pip3 install py-image-dedup
90 | ```
91 |
92 | ## Configuration
93 |
94 | **py-image-dedup** uses [container-app-conf](https://github.com/markusressel/container-app-conf)
95 | to provide configuration via a YAML file as well as ENV variables, and
96 | generates a reference config on startup. Have a look at the
97 | [documentation about it](https://github.com/markusressel/container-app-conf#generate-reference-config).
98 |
99 | See [py_image_dedup_reference.yaml](/py_image_dedup_reference.yaml)
100 | for an example in this repo.
101 |
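The resulting configuration can also be inspected (or overridden) from Python;
a minimal sketch, assuming a valid configuration source (ENV variables or a
`py_image_dedup.yaml` file) is in place:

```python
from py_image_dedup.config import DeduplicatorConfig

config = DeduplicatorConfig()  # merges ENV variables and the YAML file
print(config.ELASTICSEARCH_HOST.value)          # e.g. "127.0.0.1"
print(config.ELASTICSEARCH_MAX_DISTANCE.value)  # e.g. 0.1
```
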
102 | ## Setup elasticsearch backend
103 |
104 | Since this library is based on [Image-Match](https://github.com/ascribe/image-match),
105 | you need a running elasticsearch instance for efficient storing and
106 | querying of image signatures.
107 |
108 | ### Elasticsearch version
109 |
110 | This library requires elasticsearch version 5 or later. Sadly, the
111 | [Image-Match](https://github.com/ascribe/image-match) library still
112 | specifies version 2, so [a fork of the original project](https://github.com/markusressel/image-match)
113 | is used instead. This fork is maintained by me, and any contributions
114 | are very much appreciated.
115 |
116 | ### Set up the index
117 |
118 | **py-image-dedup** uses a single index (called `images` by default).
119 | When configured, this index will be created automatically for you.
120 |
121 | ## Command line usage
122 |
123 | **py-image-dedup** can be used from the command line like this:
124 |
125 | ```shell
126 | py-image-dedup deduplicate --help
127 | ```
128 |
129 | Have a look at the help output to see how you can customize it.
130 |
131 | ### Daemon
132 |
133 | **CAUTION!** This feature is still very much a work in progress.
134 | **Always** have a backup of your data!
135 |
136 | **py-image-dedup** has a built-in daemon that allows you to continuously
137 | monitor your source directories and deduplicate them on the fly.
138 |
139 | When running the daemon (and if enabled in the configuration), a prometheus
140 | reporter is started that allows you to gather some statistical insights.
141 |
142 | ```shell
143 | py-image-dedup daemon
144 | ```
145 |
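To sanity-check the reporter, you can fetch the raw metrics; a minimal sketch,
assuming the daemon runs locally with the default stats port:

```python
import urllib.request

# prometheus_client serves the metrics as plain text on the stats port
with urllib.request.urlopen("http://localhost:8000/metrics") as response:
    print(response.read().decode("utf-8")[:500])
```
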
146 | ## Dry run
147 |
148 | To analyze images and get an overview of which images would be deleted,
149 | be sure to do a dry run first.
150 |
151 | ```shell
152 | py-image-dedup deduplicate --dry-run
153 | ```
154 |
155 |
156 | ## FreeBSD
157 |
158 | If you want to run this on a FreeBSD host, make sure you have an
159 | up-to-date release that is able to install ports.
160 |
161 | Since [Image-Match](https://github.com/ascribe/image-match) does a lot of
162 | math, it relies on `numpy` and `scipy`. To get those working on FreeBSD,
163 | you have to install them as ports:
164 |
165 | ```shell
166 | pkg install pkgconf
167 | pkg install py38-numpy
168 | pkg install py27-scipy
169 | ```
170 |
171 | For `.png` support you also need to install:
172 | ```shell
173 | pkg install png
174 | ```
175 |
176 | I still ran into issues after installing all of these, so I threw the
177 | following two into the mix, and it finally worked:
178 | ```shell
179 | pkg install freetype
180 | pkg install py27-matplotlib # this has a LOT of dependencies
181 | ```
182 |
183 | ### Encoding issues
184 |
185 | When using the Python library `click` on FreeBSD you might run into
186 | encoding issues. To mitigate this, change your locale from `ASCII` to `UTF-8`
187 | if possible.
188 |
189 | This can be achieved, for example, by creating a file `~/.login_conf` with the following content:
190 | ```text
191 | me:\
192 | :charset=ISO-8859-1:\
193 | :lang=de_DE.UTF-8:
194 | ```
195 |
196 | ## Docker
197 |
198 | To run **py-image-dedup** using docker you can use the [markusressel/py-image-dedup](https://hub.docker.com/r/markusressel/py-image-dedup)
199 | image from DockerHub:
200 |
201 | ```shell
202 | sudo docker run -t \
203 | -p 8000:8000 \
204 | -v /where/the/original/photolibrary/is/located:/data/in \
205 | -v /where/duplicates/should/be/moved/to:/data/out \
206 | -e PY_IMAGE_DEDUP_DRY_RUN=False \
207 | -e PY_IMAGE_DEDUP_ANALYSIS_SOURCE_DIRECTORIES=/data/in/ \
208 | -e PY_IMAGE_DEDUP_ANALYSIS_RECURSIVE=True \
209 | -e PY_IMAGE_DEDUP_ANALYSIS_ACROSS_DIRS=True \
210 | -e PY_IMAGE_DEDUP_ANALYSIS_FILE_EXTENSIONS=.png,.jpg,.jpeg \
211 | -e PY_IMAGE_DEDUP_ANALYSIS_THREADS=8 \
212 | -e PY_IMAGE_DEDUP_ANALYSIS_USE_EXIF_DATA=True \
213 | -e PY_IMAGE_DEDUP_DEDUPLICATION_DUPLICATES_TARGET_DIRECTORY=/data/out/ \
214 | -e PY_IMAGE_DEDUP_ELASTICSEARCH_AUTO_CREATE_INDEX=True \
215 | -e PY_IMAGE_DEDUP_ELASTICSEARCH_HOST=elasticsearch \
216 | -e PY_IMAGE_DEDUP_ELASTICSEARCH_PORT=9200 \
217 | -e PY_IMAGE_DEDUP_ELASTICSEARCH_INDEX=images \
218 | -e PY_IMAGE_DEDUP_ELASTICSEARCH_MAX_DISTANCE=0.1 \
219 | -e PY_IMAGE_DEDUP_REMOVE_EMPTY_FOLDERS=False \
220 | -e PY_IMAGE_DEDUP_STATS_ENABLED=True \
221 | -e PY_IMAGE_DEDUP_STATS_PORT=8000 \
222 | markusressel/py-image-dedup:latest
223 | ```
225 |
226 | Since an elasticsearch instance is required as well, you can
227 | also use the `docker-compose.yml` file included in this repo, which will
228 | set up a single-node elasticsearch cluster alongside **py-image-dedup**:
229 |
230 | ```shell
231 | sudo docker-compose up
232 | ```
233 |
234 | ### UID and GID
235 |
236 | To run **py-image-dedup** inside the container using a specific user id
237 | and group id you can use the env variables `PUID=1000` and `PGID=1000`.
238 |
239 | # Contributing
240 |
241 | GitHub is for social coding: if you want to write code, I encourage contributions through pull requests from forks
242 | of this repository. Create GitHub tickets for bugs and new features and comment on the ones that you are interested in.
243 |
244 | # License
245 |
246 | ```text
247 | py-image-dedup by Markus Ressel
248 | Copyright (C) 2018 Markus Ressel
249 |
250 | This program is free software: you can redistribute it and/or modify
251 | it under the terms of the GNU General Public License as published by
252 | the Free Software Foundation, either version 3 of the License, or
253 | (at your option) any later version.
254 |
255 | This program is distributed in the hope that it will be useful,
256 | but WITHOUT ANY WARRANTY; without even the implied warranty of
257 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
258 | GNU General Public License for more details.
259 |
260 | You should have received a copy of the GNU General Public License
261 | along with this program. If not, see <https://www.gnu.org/licenses/>.
262 | ```
263 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 |
3 | services:
4 | elasticsearch:
5 | image: docker.elastic.co/elasticsearch/elasticsearch:7.9.2
6 | ports:
7 | - "9200:9200"
8 | - "9300:9300"
9 | environment:
10 | - discovery.type=single-node
11 | networks:
12 | - docker-elk
13 | restart: on-failure
14 | py-image-dedup:
15 | # build: .
16 | image: markusressel/py-image-dedup:latest
17 | environment:
18 | - PUID=1000
19 | - PGID=1000
20 | # change configuration to your liking
21 | - PY_IMAGE_DEDUP_DRY_RUN=True
22 | - PY_IMAGE_DEDUP_ANALYSIS_SOURCE_DIRECTORIES=/mnt/source/
23 | - PY_IMAGE_DEDUP_ANALYSIS_RECURSIVE=True
24 | - PY_IMAGE_DEDUP_ANALYSIS_ACROSS_DIRS=True
25 | - PY_IMAGE_DEDUP_ANALYSIS_FILE_EXTENSIONS=.png,.jpg,.jpeg
26 | - PY_IMAGE_DEDUP_ANALYSIS_THREADS=8
27 | - PY_IMAGE_DEDUP_ANALYSIS_USE_EXIF_DATA=True
28 | - PY_IMAGE_DEDUP_DEDUPLICATION_DUPLICATES_TARGET_DIRECTORY=/mnt/duplicates/
29 | - PY_IMAGE_DEDUP_ELASTICSEARCH_HOST=elasticsearch
30 | - PY_IMAGE_DEDUP_ELASTICSEARCH_PORT=9200
31 | - PY_IMAGE_DEDUP_ELASTICSEARCH_INDEX=images
32 | - PY_IMAGE_DEDUP_ELASTICSEARCH_AUTO_CREATE_INDEX=True
33 | - PY_IMAGE_DEDUP_ELASTICSEARCH_MAX_DISTANCE=0.1
34 | - PY_IMAGE_DEDUP_REMOVE_EMPTY_FOLDERS=False
35 | - PY_IMAGE_DEDUP_STATS_ENABLED=True
36 | - PY_IMAGE_DEDUP_STATS_PORT=8000
37 | volumes:
38 | # optionally mount a YAML configuration file
39 | # into /app/py_image_dedup.yaml instead of using environment:
40 | # - /mnt/data3/py_image_dedup.yaml:/app/py_image_dedup.yaml
41 | # change this to your local source directory:
42 | - /mnt/data3/py-image-dedup_testdata:/mnt/source
43 | # change this to your local duplicates directory:
44 | - /mnt/data3/py-image-dedup_duplicates:/mnt/duplicates
45 | links:
46 | - elasticsearch
47 | networks:
48 | - docker-elk
49 | ports:
50 | - "8000:8000"
51 | depends_on:
52 | - elasticsearch
53 | restart: on-failure
54 | networks:
55 | docker-elk:
56 | driver: bridge
--------------------------------------------------------------------------------
/docker/entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -eu
4 | sudo -E -u "#${PUID}" -g "#${PGID}" "$@"
5 |
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
1 | from py_image_dedup.config import DeduplicatorConfig
2 | from py_image_dedup.library.deduplicator import ImageMatchDeduplicator
3 |
4 | config = DeduplicatorConfig()
5 | config.DRY_RUN.value = True
6 | # config.ELASTICSEARCH_HOST.value = "192.168.2.24"
7 | config.SOURCE_DIRECTORIES.value = [
8 | # r'/home/markus/py-image-dedup/dir1/',
9 | # r'/home/markus/py-image-dedup/dir2/'
10 | # r'/mnt/data/py-dedup-test/Syncthing/',
11 | # r'/mnt/sdb2/Sample/',
12 | r'./tests/images/'
13 | ]
14 | config.SEARCH_ACROSS_ROOT_DIRS.value = True
15 |
16 | config.ANALYSIS_THREADS.value = 8
17 | config.ANALYSIS_USE_EXIF_DATA.value = False
18 |
19 | config.ELASTICSEARCH_MAX_DISTANCE.value = 0.30
20 | # config.MAX_FILE_MODIFICATION_TIME_DELTA.value = timedelta(minutes=5)
21 | config.DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY.value = "./duplicates/"
22 | config.REMOVE_EMPTY_FOLDERS.value = True
23 |
24 | deduplicator = ImageMatchDeduplicator()
25 |
26 | # max_file_modification_time_diff=1 * 1000 * 60 * 5,
27 |
28 | result = deduplicator.deduplicate_all(
29 | skip_analyze_phase=False,
30 | )
31 |
32 | result.print_to_console()
33 |
--------------------------------------------------------------------------------
/py_image_dedup/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | logging.basicConfig()
4 |
--------------------------------------------------------------------------------
/py_image_dedup/cli.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import click
4 |
5 | from py_image_dedup.config import DeduplicatorConfig
6 | from py_image_dedup.library.deduplicator import ImageMatchDeduplicator
7 | from py_image_dedup.library.processing_manager import ProcessingManager
8 | from py_image_dedup.util import echo
9 |
10 | IMAGE_HASH_MAP = {}
11 |
12 | PARAM_SKIP_ANALYSE_PHASE = "skip-analyse-phase"
13 | PARAM_DRY_RUN = "dry-run"
14 |
15 | CMD_OPTION_NAMES = {
16 | PARAM_SKIP_ANALYSE_PHASE: ['--skip-analyse-phase', '-sap'],
17 | PARAM_DRY_RUN: ['--dry-run', '-dr']
18 | }
19 |
20 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
21 |
22 |
23 | @click.group(context_settings=CONTEXT_SETTINGS)
24 | @click.version_option()
25 | def cli():
26 | pass
27 |
28 |
29 | def get_option_names(parameter: str) -> list:
30 | """
31 | Returns a list of all valid console parameter names for a given parameter
32 | :param parameter: the parameter to check
33 | :return: a list of all valid names to use this parameter
34 | """
35 | return CMD_OPTION_NAMES[parameter]
36 |
37 |
38 | @cli.command(name="analyse")
39 | def c_analyse():
40 | deduplicator = ImageMatchDeduplicator(interactive=True)
41 | deduplicator.analyse_all()
42 |
43 |
44 | @cli.command(name="deduplicate")
45 | @click.option(*get_option_names(PARAM_SKIP_ANALYSE_PHASE), required=False, default=False, is_flag=True,
46 | help='When set the image analysis phase will be skipped. Useful if you already did a dry-run.')
47 | @click.option(*get_option_names(PARAM_DRY_RUN), required=False, default=None, is_flag=True,
48 | help='When set no files or folders will actually be deleted but a preview of '
49 | 'what WOULD be done will be printed.')
50 | def c_deduplicate(skip_analyse_phase: bool,
51 | dry_run: bool):
52 | config = DeduplicatorConfig()
53 | if dry_run is not None:
54 | config.DRY_RUN.value = dry_run
55 | deduplicator = ImageMatchDeduplicator(interactive=True)
56 | result = deduplicator.deduplicate_all(
57 | skip_analyze_phase=skip_analyse_phase,
58 | )
59 |
60 | echo()
61 | result.print_to_console()
62 |
63 |
64 | @cli.command(name="daemon")
65 | @click.option(*get_option_names(PARAM_DRY_RUN), required=False, default=None, is_flag=True,
66 | help='When set no files or folders will actually be deleted but a preview of '
67 | 'what WOULD be done will be printed.')
68 | def c_daemon(dry_run: bool):
69 | echo("Starting daemon...")
70 |
71 | config: DeduplicatorConfig = DeduplicatorConfig()
72 | if dry_run is not None:
73 | config.DRY_RUN.value = dry_run
74 |
75 | if config.STATS_ENABLED.value:
76 | from prometheus_client import start_http_server
77 | echo("Starting prometheus reporter...")
78 | start_http_server(config.STATS_PORT.value)
79 |
80 | deduplicator = ImageMatchDeduplicator(interactive=False)
81 | processing_manager = ProcessingManager(deduplicator)
82 |
83 | deduplicator.deduplicate_all()
84 | processing_manager.start()
85 |
86 | try:
87 | while True:
88 | time.sleep(1)
89 | except KeyboardInterrupt:
90 | processing_manager.stop()
91 |
92 |
93 | if __name__ == '__main__':
94 | cli()
95 |
--------------------------------------------------------------------------------
/py_image_dedup/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from datetime import timedelta
3 |
4 | from container_app_conf import ConfigBase
5 | from container_app_conf.entry.bool import BoolConfigEntry
6 | from container_app_conf.entry.dict import DictConfigEntry
7 | from container_app_conf.entry.file import DirectoryConfigEntry
8 | from container_app_conf.entry.float import FloatConfigEntry
9 | from container_app_conf.entry.int import IntConfigEntry
10 | from container_app_conf.entry.list import ListConfigEntry
11 | from container_app_conf.entry.regex import RegexConfigEntry
12 | from container_app_conf.entry.string import StringConfigEntry
13 | from container_app_conf.entry.timedelta import TimeDeltaConfigEntry
14 | from container_app_conf.source.env_source import EnvSource
15 | from container_app_conf.source.yaml_source import YamlSource
16 | from py_range_parse import Range
17 |
18 | NODE_MAIN = "py_image_dedup"
19 |
20 | NODE_DAEMON = "daemon"
21 | NODE_PROCESSING_TIMEOUT = "processing_timeout"
22 |
23 | NODE_FILE_OBSERVER_TYPE = "file_observer"
24 | FILE_OBSERVER_TYPE_POLLING = "polling"
25 | FILE_OBSERVER_TYPE_INOTIFY = "inotify"
26 |
27 | NODE_DRY_RUN = "dry_run"
28 |
29 | NODE_ELASTICSEARCH = "elasticsearch"
30 |
31 | NODE_HOST = "host"
32 | NODE_MAX_DISTANCE = "max_distance"
33 | NODE_AUTO_CREATE_INDEX = "auto_create_index"
34 | NODE_INDEX = "index"
35 |
36 | NODE_ANALYSIS = "analysis"
37 |
38 | NODE_EXCLUSIONS = "exclusions"
39 | NODE_SOURCE_DIRECTORIES = "source_directories"
40 | NODE_RECURSIVE = "recursive"
41 | NODE_SEARCH_ACROSS_ROOT_DIRS = "across_dirs"
42 | NODE_FILE_EXTENSIONS = "file_extensions"
43 | NODE_USE_EXIF_DATA = "use_exif_data"
44 | NODE_THREADS = "threads"
45 |
46 | NODE_DEDUPLICATION = "deduplication"
47 |
48 | NODE_PRIORITIZATION_RULES = "prioritization_rules"
49 | NODE_MAX_FILE_MODIFICATION_TIME_DIFF = "max_file_modification_time_diff"
50 | NODE_REMOVE_EMPTY_FOLDERS = "remove_empty_folders"
51 | NODE_DUPLICATES_TARGET_DIRECTORY = "duplicates_target_directory"
52 |
53 | NODE_STATS = "stats"
54 | NODE_ENABLED = "enabled"
55 | NODE_PORT = "port"
56 |
57 |
58 | class DeduplicatorConfig(ConfigBase):
59 |
60 | def __new__(cls, *args, **kwargs):
61 | yaml_source = YamlSource("py_image_dedup")
62 | data_sources = [
63 | EnvSource(),
64 | yaml_source
65 | ]
66 | return super(DeduplicatorConfig, cls).__new__(cls, *args, data_sources=data_sources, **kwargs)
67 |
68 | DRY_RUN = BoolConfigEntry(
69 | description="If enabled no source file will be touched",
70 | key_path=[
71 | NODE_MAIN,
72 | NODE_DRY_RUN
73 | ],
74 | default=True
75 | )
76 |
77 | ELASTICSEARCH_HOST = StringConfigEntry(
78 | description="Hostname of the elasticsearch backend instance to use.",
79 | key_path=[
80 | NODE_MAIN,
81 | NODE_ELASTICSEARCH,
82 | NODE_HOST
83 | ],
84 | default="127.0.0.1"
85 | )
86 |
87 | ELASTICSEARCH_PORT = IntConfigEntry(
88 | description="Port of the elasticsearch backend instance to use.",
89 | key_path=[
90 | NODE_MAIN,
91 | NODE_ELASTICSEARCH,
92 | NODE_PORT
93 | ],
94 | range=Range(1, 65535),
95 | default=9200
96 | )
97 |
98 | ELASTICSEARCH_MAX_DISTANCE = FloatConfigEntry(
99 | description="Maximum signature distance [0..1] to query from elasticsearch backend.",
100 | key_path=[
101 | NODE_MAIN,
102 | NODE_ELASTICSEARCH,
103 | NODE_MAX_DISTANCE
104 | ],
105 | default=0.10
106 | )
107 |
108 | ELASTICSEARCH_AUTO_CREATE_INDEX = BoolConfigEntry(
109 | description="Whether to automatically create an index in the target database.",
110 | key_path=[
111 | NODE_MAIN,
112 | NODE_ELASTICSEARCH,
113 | NODE_AUTO_CREATE_INDEX
114 | ],
115 | default=True
116 | )
117 |
118 | ELASTICSEARCH_INDEX = StringConfigEntry(
119 | description="The index name to use for storing and querying image analysis data.",
120 | key_path=[
121 | NODE_MAIN,
122 | NODE_ELASTICSEARCH,
123 | NODE_INDEX
124 | ],
125 | default="images"
126 | )
127 |
128 | ANALYSIS_USE_EXIF_DATA = BoolConfigEntry(
129 | description="Whether to scan for EXIF data or not.",
130 | key_path=[
131 | NODE_MAIN,
132 | NODE_ANALYSIS,
133 | NODE_USE_EXIF_DATA
134 | ],
135 | default=True
136 | )
137 |
138 | SOURCE_DIRECTORIES = ListConfigEntry(
139 | description="Comma separated list of source paths to analyse and deduplicate.",
140 | item_type=DirectoryConfigEntry,
141 | item_args={
142 | "check_existence": True
143 | },
144 | key_path=[
145 | NODE_MAIN,
146 | NODE_ANALYSIS,
147 | NODE_SOURCE_DIRECTORIES
148 | ],
149 | required=True,
150 | example=[
151 | "/home/myuser/pictures/"
152 | ]
153 | )
154 |
155 | RECURSIVE = BoolConfigEntry(
156 | description="When set all directories will be recursively analyzed.",
157 | key_path=[
158 | NODE_MAIN,
159 | NODE_ANALYSIS,
160 | NODE_RECURSIVE
161 | ],
162 | default=True
163 | )
164 |
165 | SEARCH_ACROSS_ROOT_DIRS = BoolConfigEntry(
166 | description="When set duplicates will be found even if they are located in different root directories.",
167 | key_path=[
168 | NODE_MAIN,
169 | NODE_ANALYSIS,
170 | NODE_SEARCH_ACROSS_ROOT_DIRS
171 | ],
172 | default=False
173 | )
174 |
175 | FILE_EXTENSION_FILTER = ListConfigEntry(
176 | description="Comma separated list of file extensions.",
177 | item_type=StringConfigEntry,
178 | key_path=[
179 | NODE_MAIN,
180 | NODE_ANALYSIS,
181 | NODE_FILE_EXTENSIONS
182 | ],
183 | required=True,
184 | default=[
185 | ".png",
186 | ".jpg",
187 | ".jpeg"
188 | ]
189 | )
190 |
191 | EXCLUSIONS = ListConfigEntry(
192 | description="Comma separated list of regular expression filters.",
193 | item_type=RegexConfigEntry,
194 | key_path=[
195 | NODE_MAIN,
196 | NODE_ANALYSIS,
197 | NODE_EXCLUSIONS
198 | ],
199 | default=[]
200 | )
201 |
202 | ANALYSIS_THREADS = IntConfigEntry(
203 | description="Number of threads to use for image analysis phase.",
204 | key_path=[
205 | NODE_MAIN,
206 | NODE_ANALYSIS,
207 | NODE_THREADS
208 | ],
209 | default=os.cpu_count()
210 | )
211 |
212 | MAX_FILE_MODIFICATION_TIME_DELTA = TimeDeltaConfigEntry(
213 | description="Maximum file modification date difference between multiple "
214 | "duplicates to be considered the same image",
215 | key_path=[
216 | NODE_MAIN,
217 | NODE_DEDUPLICATION,
218 | NODE_MAX_FILE_MODIFICATION_TIME_DIFF
219 | ],
220 | default=None,
221 | example=timedelta(minutes=5)
222 | )
223 |
224 | PRIORITIZATION_RULES = ListConfigEntry(
225 | description="Comma separated list of prioritization rules to use for ordering duplicate "
226 | "images before proceeding with the deduplication process.",
227 | item_type=DictConfigEntry,
228 | key_path=[
229 | NODE_MAIN,
230 | NODE_DEDUPLICATION,
231 | NODE_PRIORITIZATION_RULES
232 | ],
233 | required=False,
234 | default=[
235 | {"name": "higher-pixel-count"},
236 | {"name": "more-exif-data"},
237 | {"name": "bigger-file-size"},
238 | {"name": "newer-file-modification-date"},
239 | {"name": "smaller-distance"},
240 | {"name": "doesnt-contain-copy-in-file-name"},
241 | {"name": "longer-file-name"},
242 | {"name": "shorter-folder-path"},
243 | {"name": "higher-score"},
244 | ]
245 | )
246 |
247 | REMOVE_EMPTY_FOLDERS = BoolConfigEntry(
248 | description="Whether to remove empty folders or not.",
249 | key_path=[
250 | NODE_MAIN,
251 | NODE_REMOVE_EMPTY_FOLDERS
252 | ],
253 | default=False
254 | )
255 |
256 | DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY = DirectoryConfigEntry(
257 | description="Directory path to move duplicates to instead of deleting them.",
258 | key_path=[
259 | NODE_MAIN,
260 | NODE_DEDUPLICATION,
261 | NODE_DUPLICATES_TARGET_DIRECTORY
262 | ],
263 | check_existence=True,
264 | default=None,
265 | example="/home/myuser/pictures/duplicates/"
266 | )
267 |
268 | DAEMON_PROCESSING_TIMEOUT = TimeDeltaConfigEntry(
269 | description="Time to wait for filesystems changes to settle before analysing.",
270 | key_path=[
271 | NODE_MAIN,
272 | NODE_DAEMON,
273 | NODE_PROCESSING_TIMEOUT
274 | ],
275 | default="30s"
276 | )
277 |
278 | DAEMON_FILE_OBSERVER_TYPE = StringConfigEntry(
279 | description="Type of file observer to use.",
280 | key_path=[
281 | NODE_MAIN,
282 | NODE_DAEMON,
283 | NODE_FILE_OBSERVER_TYPE
284 | ],
285 | regex="|".join([FILE_OBSERVER_TYPE_POLLING, FILE_OBSERVER_TYPE_INOTIFY]),
286 | default=FILE_OBSERVER_TYPE_POLLING,
287 | required=True
288 | )
289 |
290 | STATS_ENABLED = BoolConfigEntry(
291 | description="Whether to enable prometheus statistics or not.",
292 | key_path=[
293 | NODE_MAIN,
294 | NODE_STATS,
295 | NODE_ENABLED
296 | ],
297 | default=True
298 | )
299 |
300 | STATS_PORT = IntConfigEntry(
301 | description="The port to expose statistics on.",
302 | key_path=[
303 | NODE_MAIN,
304 | NODE_STATS,
305 | NODE_PORT
306 | ],
307 | default=8000
308 | )
309 |
--------------------------------------------------------------------------------
/py_image_dedup/library/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import threading
3 |
4 | LOGGER = logging.getLogger(__name__)
5 | LOGGER.setLevel(logging.DEBUG)
6 |
7 |
8 | class Action:
9 | def __init__(self, name, color):
10 | self.name = name
11 | self.color = color
12 |
13 |
14 | class ActionEnum:
15 | NONE = Action("-", "green")
16 | DELETE = Action("delete", "red")
17 | MOVE = Action("move", "yellow")
18 |
19 |
20 | class RegularIntervalWorker:
21 | """
22 | Base class for a worker that executes a specific task in a regular interval.
23 | """
24 |
25 | def __init__(self, interval: float):
26 | self._interval = interval
27 | self._timer = None
28 |
29 | def start(self):
30 | """
31 | Starts the worker
32 | """
33 | if self._timer is None:
34 | LOGGER.debug(f"Starting worker: {self.__class__.__name__}")
35 | self._schedule_next_run()
36 | else:
37 | LOGGER.debug("Already running, ignoring start() call")
38 |
39 | def stop(self):
40 | """
41 | Stops the worker
42 | """
43 | if self._timer is not None:
44 | self._timer.cancel()
45 | self._timer = None
46 |
47 | def _schedule_next_run(self):
48 | """
49 | Schedules the next run
50 | """
51 | if self._timer is not None:
52 | self._timer.cancel()
53 | self._timer = threading.Timer(self._interval, self._worker_job)
54 | self._timer.start()
55 |
56 | def _worker_job(self):
57 | """
58 | The regularly executed task. Override this method.
59 | """
60 | try:
61 | self._run()
62 | except Exception as e:
63 | LOGGER.error(e, exc_info=True)
64 | finally:
65 | self._schedule_next_run()
66 |
67 | def _run(self):
68 | """
69 | The regularly executed task. Override this method.
70 | """
71 | raise NotImplementedError()
72 |
--------------------------------------------------------------------------------
/py_image_dedup/library/deduplication_result.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import List
3 |
4 | import click
5 | from tabulate import tabulate
6 |
7 | from py_image_dedup.library import ActionEnum
8 | from py_image_dedup.persistence import MetadataKey
9 | from py_image_dedup.util import echo
10 |
11 | BYTE_IN_A_MB = 1048576
12 |
13 |
14 | class DeduplicationResult:
15 | def __init__(self):
16 | self.item_actions = {}
17 | self._removed_folders = set()
18 | self._reference_files = {}
19 | self._file_duplicates = {}
20 |
21 | def add_file_action(self, file_path: Path, action: ActionEnum):
22 | if file_path in self.item_actions and self.item_actions[file_path] != action:
23 | raise ValueError("File path already in result "
24 | "but with different action: {}, {}, {}".format(file_path,
25 | self.item_actions[file_path],
26 | action))
27 | self.item_actions[file_path] = action
28 |
29 | def get_file_with_action(self, action: ActionEnum) -> []:
30 | return list({k: v for k, v in self.item_actions.items() if v == action}.keys())
31 |
32 | def get_duplicate_count(self) -> int:
33 | """
34 | :return: amount of files that have at least one duplicate
35 | """
36 | count = 0
37 | for key, value in self._file_duplicates.items():
38 | if len(value) > 0:
39 | count += 1
40 |
41 | return count
42 |
43 | def get_removed_or_moved_files(self):
44 | return self.get_file_with_action(ActionEnum.MOVE) + self.get_file_with_action(ActionEnum.DELETE)
45 |
46 | def get_removed_empty_folders(self) -> []:
47 | """
48 | :return: a list of empty folders that have been deleted
49 | """
50 | return self._removed_folders
51 |
52 | def add_removed_empty_folder(self, folder: Path):
53 | """
54 | Adds a folder to the list of removed empty folders
55 | :param folder: the folder to add
56 | """
57 | self._removed_folders.add(folder)
58 |
59 | def set_file_duplicates(self, reference_files: List[dict], duplicate_files: []):
60 | """
61 | Set a list of files that are duplicates of the reference file
62 | :param reference_files: the files to keep; the first one is used as the baseline reference
63 | :param duplicate_files: duplicates of the reference_file
64 | """
65 | reference_file = reference_files[0]
66 | reference_file_path = Path(reference_file[MetadataKey.PATH.value])
67 | self._reference_files[reference_file_path] = reference_file
68 | self._file_duplicates[reference_file_path] = reference_files[1:] + duplicate_files
69 |
70 | def get_file_duplicates(self) -> {}:
71 | """
72 | Get a list of files that are duplicates of other files
73 | """
74 | return self._file_duplicates
75 |
76 | def print_to_console(self):
77 | title = " " * 7 + "Summary"
78 | echo(title, color='cyan')
79 | echo('=' * 21, color='cyan')
80 | echo(f"Files with duplicates: {self.get_duplicate_count()}")
81 | echo(f"Files moved: {len(self.get_file_with_action(ActionEnum.MOVE))}")
82 | echo(f"Files deleted: {len(self.get_file_with_action(ActionEnum.DELETE))}")
83 |
84 | headers = ("Action", "File path", "Dist", "Filesize", "Pixels")
85 |
86 | for reference_file_path, folder in self.get_file_duplicates().items():
87 | duplicate_count = len(folder)
88 | if duplicate_count > 0:
89 | columns = []
90 | echo()
91 |
92 | for item in [self._reference_files[reference_file_path]] + folder:
93 | file_path = Path(item[MetadataKey.PATH.value])
94 | distance = item[MetadataKey.DISTANCE.value]
95 | distance_rounded = round(distance, 3)
96 | file_size = item[MetadataKey.METADATA.value][MetadataKey.FILE_SIZE.value]
97 | file_size_mb = round(file_size / BYTE_IN_A_MB, 3)
98 | pixel_count = item[MetadataKey.METADATA.value][MetadataKey.PIXELCOUNT.value]
99 |
100 | action = self.item_actions.get(file_path, ActionEnum.NONE)
101 | row = [
102 | action.name,
103 | file_path,
104 | distance_rounded,
105 | file_size_mb,
106 | pixel_count
107 | ]
108 |
109 | # apply action style
110 | row = list(map(lambda x: str(click.style(str(x), action.color)), row))
111 | columns.append(row)
112 |
113 | self._echo_table(
114 | tabulate(columns, headers=headers, colalign=['center', 'left', 'left', 'right', 'right']))
115 |
116 | echo()
117 | echo(f"Removed (empty) folders ({len(self.get_removed_empty_folders())}):")
118 | for folder in self.get_removed_empty_folders():
119 | echo(f"{folder}", color='red')
120 |
121 | @staticmethod
122 | def _echo_table(table: str):
123 | lines = table.splitlines()
124 |
125 | for line in lines[:2]:
126 | echo(line, color='cyan')
127 |
128 | for line in lines[2:]:
129 | echo(line)
130 |
--------------------------------------------------------------------------------
/py_image_dedup/library/deduplicator.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import filecmp
3 | import logging
4 | import os
5 | import shutil
6 | import sys
7 | from concurrent.futures import ThreadPoolExecutor
8 | from pathlib import Path
9 | from typing import List
10 |
11 | import click
12 | from ordered_set import OrderedSet
13 |
14 | from py_image_dedup import util
15 | from py_image_dedup.config import DeduplicatorConfig
16 | from py_image_dedup.library import ActionEnum
17 | from py_image_dedup.library.deduplication_result import DeduplicationResult
18 | from py_image_dedup.library.progress_manager import ProgressManager
19 | from py_image_dedup.persistence import ImageSignatureStore
20 | from py_image_dedup.persistence.elasticsearchstorebackend import ElasticSearchStoreBackend
21 | from py_image_dedup.persistence.metadata_key import MetadataKey
22 | from py_image_dedup.stats import DUPLICATE_ACTION_MOVE_COUNT, DUPLICATE_ACTION_DELETE_COUNT, ANALYSIS_TIME, \
23 | FIND_DUPLICATES_TIME
24 | from py_image_dedup.util import file, echo
25 | from py_image_dedup.util.file import get_files_count, file_has_extension
26 |
27 | LOGGER = logging.getLogger(__name__)
28 | LOGGER.setLevel(logging.DEBUG)
29 |
30 |
31 | class ImageMatchDeduplicator:
32 | EXECUTOR = ThreadPoolExecutor()
33 |
34 | _config: DeduplicatorConfig
35 | _progress_manager: ProgressManager
36 |
37 | _processed_files: dict = {}
38 | _deduplication_result: DeduplicationResult = None
39 |
40 | def __init__(self, interactive: bool):
41 | """
42 |
43 | :param interactive: whether cli output should be interactive or not
44 | """
45 | self.interactive = interactive
46 |
47 | self._progress_manager = ProgressManager()
48 | self._config = DeduplicatorConfig()
49 | self._persistence: ImageSignatureStore = ElasticSearchStoreBackend(
50 | host=self._config.ELASTICSEARCH_HOST.value,
51 | port=self._config.ELASTICSEARCH_PORT.value,
52 | connections_per_node=self._config.ANALYSIS_THREADS.value,
53 | el_index=self._config.ELASTICSEARCH_INDEX.value,
54 | use_exif_data=self._config.ANALYSIS_USE_EXIF_DATA.value,
55 | max_dist=self._config.ELASTICSEARCH_MAX_DISTANCE.value,
56 | setup_database=self._config.ELASTICSEARCH_AUTO_CREATE_INDEX.value
57 | )
58 |
59 | def reset_result(self):
60 | self._deduplication_result = DeduplicationResult()
61 | self._processed_files = {}
62 |
63 | def analyse_all(self):
64 | """
65 | Runs the analysis phase independently.
66 | """
67 | directories = self._config.SOURCE_DIRECTORIES.value
68 |
69 | echo("Phase 1/2: Counting files ...", color='cyan')
70 | directory_map = self._count_files(directories)
71 |
72 | echo("Phase 2/2: Analyzing files ...", color='cyan')
73 | self.analyze_directories(directory_map)
74 |
75 | def deduplicate_all(self, skip_analyze_phase: bool = False) -> DeduplicationResult:
76 | """
77 | Runs the full 6 deduplication phases.
78 | :param skip_analyze_phase: useful if you already did a dry run and want to do a real run afterwards
79 | :return: result of the operation
80 | """
81 | # see: https://stackoverflow.com/questions/14861891/runtimewarning-invalid-value-encountered-in-divide
82 | # and: https://stackoverflow.com/questions/29347987/why-cant-i-suppress-numpy-warnings
83 | import warnings
84 | warnings.filterwarnings('ignore')
85 |
86 | directories = self._config.SOURCE_DIRECTORIES.value
87 | if len(directories) <= 0:
88 | raise ValueError("No root directories to scan")
89 |
90 | if self._config.DRY_RUN.value:
91 | echo("==> DRY RUN! No files or folders will actually be deleted! <==", color='yellow')
92 |
93 | echo("Phase 1/6: Cleaning up database ...", color='cyan')
94 | self.cleanup_database(directories)
95 |
96 | echo("Phase 2/6: Counting files ...", color='cyan')
97 | directory_map = self._count_files(directories)
98 |
99 | phase_3_text = "Phase 3/6: Analyzing files"
100 | if skip_analyze_phase:
101 | echo(phase_3_text + " - Skipping", color='yellow')
102 | else:
103 | echo(phase_3_text, color='cyan')
104 | self.analyze_directories(directory_map)
105 |
106 | echo("Phase 4/6: Finding duplicate files ...", color='cyan')
107 | self.find_duplicates_in_directories(directory_map)
108 |
109 | # Phase 5/6: Move or Delete duplicate files
110 | self.process_duplicates()
111 |
112 | self.remove_empty_folders()
113 |
114 | return self._deduplication_result
115 |
116 | def analyze_directories(self, directory_map: dict):
117 | """
118 | Analyzes all files, generates identifiers (if necessary) and stores them for later access
119 | """
120 | threads = self._config.ANALYSIS_THREADS.value
121 |
122 | # load truncated images too
123 | # TODO: this causes an infinite loop on some (truncated) images
124 | # ImageFile.LOAD_TRUNCATED_IMAGES = True
125 |
126 | for directory, file_count in directory_map.items():
127 | self._progress_manager.start(f"Analyzing files in '{directory}'", file_count, "Files", self.interactive)
128 | self.__walk_directory_files(
129 | root_directory=directory,
130 | threads=threads,
131 | command=lambda root_dir, file_dir, file_path: self.analyze_file(file_path))
132 | self._progress_manager.clear()
133 |
134 | def find_duplicates_in_directories(self, directory_map: dict):
135 | """
136 | Finds duplicates in the given directories
137 | :param directory_map: map of directory path -> file count
138 | """
139 | self.reset_result()
140 |
141 | for directory, file_count in directory_map.items():
142 | self._progress_manager.start(f"Finding duplicates in '{directory}' ...", file_count, "Files",
143 | self.interactive)
144 | self.__walk_directory_files(
145 | root_directory=directory,
146 | threads=1, # there seems to be no performance advantage in using multiple threads here
147 | command=lambda root_dir, _, file_path: self.find_duplicates_of_file(
148 | root_directories=self._config.SOURCE_DIRECTORIES.value,
149 | root_directory=root_dir,
150 | reference_file_path=file_path
151 | )
152 | )
153 | self._progress_manager.clear()
154 |
155 | def cleanup_database(self, directories: List[Path]):
156 | """
157 | Removes database entries of files that don't exist on disk.
158 | Note that this cleanup will only consider files within one
159 | of the root directories specified in constructor, as other file paths
160 | might have been added on other machines.
161 | :param directories: directories in this run
162 | """
163 | # TODO: This iterates through all db entries - even the ones we are ignoring.
164 | # The db query should be improved to speed this up
165 |
166 | count, entries = self._persistence.get_all()
167 | if count <= 0:
168 | return
169 |
170 | self._progress_manager.start(f"Cleanup database", count, "entries", self.interactive)
171 | for entry in entries:
172 | try:
173 | image_entry = entry['_source']
174 | metadata = image_entry.get(MetadataKey.METADATA.value, {})
175 |
176 | file_path = Path(image_entry[MetadataKey.PATH.value])
177 | self._progress_manager.set_postfix(self._truncate_middle(str(file_path)))
178 |
179 | if MetadataKey.DATAMODEL_VERSION.value not in metadata:
180 | echo(f"Removing db entry with missing db model version number: {file_path}")
181 | self._persistence.remove(str(file_path))
182 | continue
183 |
184 | data_version = metadata.get(MetadataKey.DATAMODEL_VERSION.value, -1)
185 | if data_version != self._persistence.DATAMODEL_VERSION:
186 | echo(f"Removing db entry with old db model version: {file_path}")
187 | self._persistence.remove(str(file_path))
188 | continue
189 |
190 | # filter by files in at least one of the specified root directories
191 | # this is necessary because the database might hold items for other paths already
192 | # and those are not interesting to us
193 | if not any(root_dir in file_path.parents for root_dir in directories):
194 | continue
195 |
196 | if not file_path.exists():
197 | echo(f"Removing db entry for missing file: {file_path}")
198 | self._persistence.remove(str(file_path))
199 | except Exception as e:
200 | logging.exception(e)
201 | echo(f"Error while cleaning up database entry {entry}: {e}")
202 | try:
203 | image_entry = entry['_source']
204 | file_path = Path(image_entry[MetadataKey.PATH.value])
205 | self._persistence.remove(str(file_path))
206 | except Exception as e:
207 | logging.exception(e)
208 | echo(f"Error removing db entry: {e}")
209 | finally:
210 | self._progress_manager.inc()
211 | self._progress_manager.clear()
212 |
213 | def _remove_empty_folders(self, directories: List[Path], recursive: bool):
214 | """
215 | Searches for empty folders and removes them
216 | :param directories: directories to scan
217 | """
218 | dry_run = self._config.DRY_RUN.value
219 |
220 | # remove empty folders
221 | for directory in directories:
222 | empty_folders = self._find_empty_folders(directory, recursive, dry_run)
223 | self._remove_folders(directory, empty_folders, dry_run)
224 |
225 | def _count_files(self, directories: List[Path]) -> dict:
226 | """
227 | Counts the amount of files to analyze (used in progress) and stores them in a map
228 | :return map "directory path" -> "directory file count"
229 | """
230 | directory_map = {}
231 |
232 | self._progress_manager.start(f"Counting files", len(directories), "Dirs", self.interactive)
233 | for directory in directories:
234 | self._progress_manager.set_postfix(self._truncate_middle(directory))
235 |
236 | file_count = get_files_count(
237 | directory,
238 | self._config.RECURSIVE.value,
239 | self._config.FILE_EXTENSION_FILTER.value,
240 | self._config.EXCLUSIONS.value
241 | )
242 | directory_map[directory] = file_count
243 |
244 | self._progress_manager.inc()
245 | self._progress_manager.clear()
246 |
247 | return directory_map
248 |
249 | def __walk_directory_files(self, root_directory: Path, command, threads: int):
250 | """
251 | Walks through the files of the given directory
252 | :param root_directory: the directory to start with
253 | :param command: the method to execute for every file found
254 | :return: file_path -> identifier
255 | """
256 | with ThreadPoolExecutor(max_workers=threads, thread_name_prefix="py-image-dedup-walker") as self.EXECUTOR:
257 | for (root, dirs, files) in os.walk(str(root_directory)):
258 | # root is the place you're listing
259 | # dirs is a list of directories directly under root
260 | # files is a list of files directly under root
261 | root = Path(root)
262 |
263 | for file in files:
264 | file_path = Path(root, file)
265 |
266 | # skip file in exclusion
267 | if any(list(map(lambda x: x.search(str(file_path.absolute())), self._config.EXCLUSIONS.value))):
268 | continue
269 |
270 | # skip file with unwanted file extension
271 | if not file_has_extension(file_path, self._config.FILE_EXTENSION_FILTER.value):
272 | continue
273 |
274 | # skip if not existent (probably already deleted)
275 | if not file_path.exists():
276 | self._progress_manager.inc()
277 | continue
278 |
279 | try:
280 | self.EXECUTOR.submit(util.reraise_with_stack(command), root_directory, root, file_path)
281 | except Exception as e:
282 | click.echo(e, err=True)
283 | sys.exit(1)
284 |
285 | if not self._config.RECURSIVE.value:
286 | return
287 |
288 | @ANALYSIS_TIME.time()
289 | def analyze_file(self, file_path: Path):
290 | """
291 | Analyzes a single file
292 | :param file_path: the file path
293 | """
294 | self._progress_manager.set_postfix(self._truncate_middle(file_path))
295 |
296 | try:
297 | self._persistence.add(str(file_path))
298 | except Exception as e:
299 | logging.exception(e)
300 | echo(f"Error analyzing file '{file_path}': {e}")
301 | finally:
302 | self._progress_manager.inc()
303 |
304 | @FIND_DUPLICATES_TIME.time()
305 | def find_duplicates_of_file(self, root_directories: List[Path], root_directory: Path, reference_file_path: Path):
306 | """
307 | Finds duplicates and marks all but the best copy as "to-be-deleted".
308 | :param root_directories: valid root directories
309 | :param root_directory: root directory of reference_file_path
310 | :param reference_file_path: the file to check for duplicates
311 | """
312 | self._progress_manager.inc()
313 | self._progress_manager.set_postfix(self._truncate_middle(reference_file_path))
314 |
315 | # remember processed files to prevent processing files in multiple directions
316 | if reference_file_path in self._processed_files:
317 | # already found a better candidate for this file
318 | return
319 |
320 | duplicate_candidates = self._persistence.find_similar(str(reference_file_path))
321 |
322 | if self._config.SEARCH_ACROSS_ROOT_DIRS.value:
323 | # filter by files in at least one of the specified root directories
324 | # this is necessary because the database might hold items for other paths already
325 | # and those are not interesting to us
326 | duplicate_candidates = [
327 | candidate for candidate in duplicate_candidates if
328 | any(root_dir in Path(candidate[MetadataKey.PATH.value]).parents for root_dir in root_directories)
329 | ]
330 | else:
331 | # filter by files in the same root directory
332 | duplicate_candidates = [
333 | candidate for candidate in duplicate_candidates if
334 | root_directory in Path(candidate[MetadataKey.PATH.value]).parents
335 | ]
336 |
337 | if len(duplicate_candidates) <= 0:
338 | echo(f"No duplication candidates found in database for '{reference_file_path}'. "
339 | "This is an indication that the file has not been analysed yet or "
340 | "there was an issue analysing it.",
341 | color='yellow')
342 |
343 | if len(duplicate_candidates) <= 1:
344 | for candidate in duplicate_candidates:
345 | candidate_path = Path(candidate[MetadataKey.PATH.value])
346 |
347 | if candidate_path != reference_file_path:
348 | echo(f"Unexpected unique duplication candidate '{candidate_path}' for "
349 | f"reference file '{reference_file_path}'", color='yellow')
350 |
351 | self._processed_files[candidate_path] = True
352 |
353 | # nothing to do here since the result is unique
354 | return
355 |
356 | # sort by quality criteria and redo the search to use the best candidate as the reference image
357 | sorted_duplicate_candidates = self._sort_by_quality_descending(duplicate_candidates)
358 | new_reference_file_path = sorted_duplicate_candidates[0][MetadataKey.PATH.value]
359 | duplicate_candidates = self._persistence.find_similar(new_reference_file_path)
360 |
361 | candidates_to_keep, candidates_to_delete = self._select_images_to_delete(duplicate_candidates)
362 | self._save_duplicates_for_result(candidates_to_keep, candidates_to_delete)
363 |
364 | def _save_duplicates_for_result(self, files_to_keep: List[dict], duplicates: List[dict]) -> None:
365 | """
366 | Saves the comparison result for the final summary
367 |
368 | :param files_to_keep: list of images that shall be kept
369 | :param duplicates: the less good duplicates of those images
370 | """
371 | self._deduplication_result.set_file_duplicates(files_to_keep, duplicates)
372 |
373 | for file_to_keep in files_to_keep:
374 | file_path = Path(file_to_keep[MetadataKey.PATH.value])
375 | self._deduplication_result.add_file_action(file_path, ActionEnum.NONE)
376 |
377 | if self._config.DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY.value is None:
378 | action = ActionEnum.DELETE
379 | else:
380 | action = ActionEnum.MOVE
381 | for duplicate in duplicates:
382 | file_path = Path(duplicate[MetadataKey.PATH.value])
383 | self._deduplication_result.add_file_action(file_path, action)
384 |
385 | def _select_images_to_delete(self, duplicate_candidates: [{}]) -> tuple:
386 | """
387 | Selects which image to keep and which to remove
388 | :return: tuple (list of images to keep, list of images to remove)
389 | """
390 | duplicate_candidates = self._sort_by_quality_descending(duplicate_candidates)
391 |
392 | # keep first and mark others for removal
393 | keep = [duplicate_candidates[0]]
394 | dont_keep = duplicate_candidates[1:]
395 |
396 | # move files that don't fit criteria to "keep" list
397 | max_mod_time_diff = self._config.MAX_FILE_MODIFICATION_TIME_DELTA.value
398 | if max_mod_time_diff is not None:
399 | # filter files that don't match max mod time diff criteria
400 | best_candidate = keep[0]
401 | best_match_mod_timestamp = best_candidate[MetadataKey.METADATA.value][
402 | MetadataKey.FILE_MODIFICATION_DATE.value]
403 |
404 | for c in dont_keep:
405 | c_timestamp = c[MetadataKey.METADATA.value][MetadataKey.FILE_MODIFICATION_DATE.value]
406 | timestamp_diff = abs(c_timestamp - best_match_mod_timestamp)
407 | difference = datetime.timedelta(seconds=timestamp_diff)
408 | if difference > max_mod_time_diff:
409 | keep.append(c)
410 | dont_keep = list(filter(lambda x: x not in keep, dont_keep))
411 |
412 | # remember that we have processed these files
413 | for candidate in duplicate_candidates:
414 | self._processed_files[candidate[MetadataKey.PATH.value]] = True
415 |
416 | return keep, dont_keep
417 |
418 | @staticmethod
419 | def _sort_by_quality_descending(duplicate_candidates) -> []:
420 | """
421 | Sorts images according to the desired priorities.
422 | The first item in the list will be the most preferred one of all found duplicates.
423 |
424 | :param duplicate_candidates: the images to analyze
425 | :return: duplicate candidates sorted by given criteria
426 | """
427 |
428 | def sort_criteria(candidate: dict) -> ():
429 | criteria = []
430 |
431 | for rule in DeduplicatorConfig.PRIORITIZATION_RULES.value:
432 | rule_name = rule.get("name")
433 | if rule_name == "more-exif-data":
434 | if MetadataKey.EXIF_DATA.value in candidate[MetadataKey.METADATA.value]:
435 | # more exif data is better
436 | criteria.append(len(candidate[MetadataKey.METADATA.value][MetadataKey.EXIF_DATA.value]) * -1)
437 | elif rule_name == "less-exif-data":
438 | if MetadataKey.EXIF_DATA.value in candidate[MetadataKey.METADATA.value]:
439 | # less exif data is better
440 | criteria.append(len(candidate[MetadataKey.METADATA.value][MetadataKey.EXIF_DATA.value]) * 1)
441 | elif rule_name == "bigger-file-size":
442 | # reverse, bigger is better
443 | criteria.append(candidate[MetadataKey.METADATA.value][MetadataKey.FILE_SIZE.value] * -1)
444 | elif rule_name == "smaller-file-size":
445 | # smaller is better
446 | criteria.append(candidate[MetadataKey.METADATA.value][MetadataKey.FILE_SIZE.value] * 1)
447 | elif rule_name == "newer-file-modification-date":
448 | # reverse, bigger (later time) is better
449 | criteria.append(
450 | candidate[MetadataKey.METADATA.value][MetadataKey.FILE_MODIFICATION_DATE.value] * -1)
451 | elif rule_name == "older-file-modification-date":
452 | # smaller (earlier time) is better
453 | criteria.append(
454 | candidate[MetadataKey.METADATA.value][MetadataKey.FILE_MODIFICATION_DATE.value] * 1)
455 | elif rule_name == "smaller-distance":
456 | # smaller distance is better
457 | criteria.append(candidate[MetadataKey.DISTANCE.value])
458 | elif rule_name == "bigger-distance":
459 | # bigger distance is better
460 | criteria.append(candidate[MetadataKey.DISTANCE.value] * -1)
461 | # elif rule_name == "longer-path":
462 | # elif rule_name == "shorter-path":
463 | elif rule_name == "contains-copy-in-file-name":
464 | # if the filename contains "copy" it is less good
465 | criteria.append("copy" in file.get_file_name(candidate[MetadataKey.PATH.value]).lower())
466 | elif rule_name == "doesnt-contain-copy-in-file-name":
467 | # if the filename contains "copy" it is better
468 | criteria.append("copy" not in file.get_file_name(candidate[MetadataKey.PATH.value]).lower())
469 | elif rule_name == "longer-file-name":
470 | # longer filename is better (for "edited" versions)
471 | criteria.append(len(file.get_file_name(candidate[MetadataKey.PATH.value])) * -1)
472 |
473 | elif rule_name == "shorter-file-name":
474 | # shorter filename is better (prefers originals over "edited" versions)
475 | criteria.append(len(file.get_file_name(candidate[MetadataKey.PATH.value])) * 1)
476 |
477 | elif rule_name == "longer-folder-path":
478 | # longer folder path is better
479 | criteria.append(len(file.get_containing_folder(candidate[MetadataKey.PATH.value])) * -1)
480 | elif rule_name == "shorter-folder-path":
481 | # shorter folder path is better
482 | criteria.append(len(file.get_containing_folder(candidate[MetadataKey.PATH.value])))
483 | elif rule_name == "higher-score":
484 | # reverse, bigger is better
485 | criteria.append(candidate[MetadataKey.SCORE.value] * -1)
486 | elif rule_name == "lower-score":
487 | # lower is better
488 | criteria.append(candidate[MetadataKey.SCORE.value] * 1)
489 | elif rule_name == "higher-pixel-count":
490 | # higher pixel count is better
491 | criteria.append(candidate[MetadataKey.METADATA.value][MetadataKey.PIXELCOUNT.value] * -1)
492 | elif rule_name == "lower-pixel-count":
493 | # lower pixel count is better
494 | criteria.append(candidate[MetadataKey.METADATA.value][MetadataKey.PIXELCOUNT.value] * 1)
495 |
496 | # append the path as a tiebreaker to ensure a deterministic order
497 | # in the result if all other criteria (above) are equal,
498 | # so recurring runs will result in the same order
499 | # (although candidates shouldn't be compared twice to begin with)
500 | criteria.append(candidate[MetadataKey.PATH.value])
501 |
502 | return tuple(criteria)
503 |
504 | duplicate_candidates = sorted(duplicate_candidates, key=sort_criteria)
505 |
506 | return duplicate_candidates
507 |
508 | def process_duplicates(self):
509 | """
510 | Moves or removes duplicates based on the configuration
511 | """
512 | dry_run = self._config.DRY_RUN.value
513 | duplicate_target_directory = self._config.DEDUPLICATOR_DUPLICATES_TARGET_DIRECTORY.value
514 | if duplicate_target_directory:
515 | echo("Phase 5/6: Moving duplicates ...", color='cyan')
516 | self._move_files_marked_as_delete(duplicate_target_directory, dry_run)
517 | else:
518 | echo("Phase 5/6: Removing duplicates ...", color='cyan')
519 | self._remove_files_marked_as_delete(dry_run)
520 |
521 | def _find_empty_folders(self, root_path: Path, recursive: bool, dry_run: bool) -> [str]:
522 | """
523 | Finds empty folders within the given root_path
524 | :param root_path: folder to search in
525 | """
526 | result = OrderedSet()
527 |
528 | # traverse bottom-up to remove folders that are empty due to file removal
529 | for root, directories, files in os.walk(str(root_path), topdown=False):
530 | # get absolute paths of all files and folders in the current root directory
531 | abs_file_paths = list(map(lambda x: os.path.abspath(os.path.join(root, x)), files))
532 | abs_folder_paths = list(map(lambda x: os.path.abspath(os.path.join(root, x)), directories))
533 |
534 | # find out which of those files were deleted by the deduplication process
535 | files_deleted = list(
536 | map(lambda x: Path(x), filter(
537 | lambda x: Path(x) in self._deduplication_result.get_removed_or_moved_files(),
538 | abs_file_paths)))
539 | files_deleted = list(set(files_deleted + list(
540 | filter(lambda x: x.parent == Path(root), self._deduplication_result.get_removed_or_moved_files()))))
541 |
542 | folders_deleted = list(filter(lambda x: x in result, abs_folder_paths))
543 | filtered_directories = list(filter(lambda x: x not in folders_deleted, abs_folder_paths))
544 |
545 | if dry_run:
546 | if len(files_deleted) > 0 and len(files_deleted) == len(files) and len(folders_deleted) == len(
547 | directories):
548 | result.append(root)
549 | else:
550 |                 if len(files_deleted) > 0 and len(files) == 0 and len(directories) == 0:
551 | result.append(root)
552 |
553 | if not recursive:
554 | break
555 |
556 | return result
557 |
558 | def _remove_folders(self, root_path: Path, folders: [str], dry_run: bool):
559 | """
560 |         Removes the given empty folders from disk
561 |         :param root_path: the root path containing the folders (used for logging)
562 | """
563 | echo(f"Removing empty folders ({len(folders)}) in: '{root_path}' ...")
564 |
565 | if len(folders) == 0:
566 | return
567 |
568 | self._progress_manager.start("Removing empty folders", len(folders), "Folder", self.interactive)
569 | for folder in folders:
570 | self._progress_manager.set_postfix(self._truncate_middle(folder))
571 |
572 | if not dry_run:
573 | os.rmdir(folder)
574 |
575 | self._deduplication_result.add_removed_empty_folder(folder)
576 | self._progress_manager.inc()
577 | self._progress_manager.clear()
578 |
579 | def _remove_files_marked_as_delete(self, dry_run: bool):
580 | """
581 | Removes files that were marked to be deleted in previous deduplication step
582 | :param dry_run: set to true to simulate this action
583 | """
584 | items_to_remove = self._deduplication_result.get_file_with_action(ActionEnum.DELETE)
585 | marked_files_count = len(items_to_remove)
586 | if marked_files_count == 0:
587 | return
588 |
589 | self._progress_manager.start("Removing files", marked_files_count, "File", self.interactive)
590 | self._delete_files(items_to_remove, dry_run)
591 | self._progress_manager.clear()
592 |
593 | def _move_files_marked_as_delete(self, target_dir: Path, dry_run: bool):
594 | """
595 | Moves files that were marked to be deleted in previous deduplication step to the target directory
596 | :param target_dir: the directory to move duplicates to
597 | :param dry_run: set to true to simulate this action
598 | """
599 | items_to_move = self._deduplication_result.get_file_with_action(ActionEnum.MOVE)
600 | marked_files_count = len(items_to_move)
601 | if marked_files_count == 0:
602 | return
603 |
604 | self._progress_manager.start("Moving files", marked_files_count, "File", self.interactive)
605 | self._move_files(items_to_move, target_dir, dry_run)
606 | self._progress_manager.clear()
607 |
608 | def _delete_files(self, files_to_delete: [str], dry_run: bool):
609 | """
610 | Deletes files on disk
611 | :param files_to_delete: list of absolute file paths
612 | :param dry_run: set to true to simulate this action
613 | """
614 | for file_path in files_to_delete:
615 | self._progress_manager.set_postfix(self._truncate_middle(file_path))
616 |
617 |             if not dry_run:
618 |                 # remove from file system
619 |                 if os.path.exists(file_path):
620 |                     os.remove(file_path)
621 | 
622 |                 # remove from persistence
623 |                 self._persistence.remove(file_path)
624 | 
625 |             # note that the action counter is updated
626 |             # even during a dry run
627 |             DUPLICATE_ACTION_DELETE_COUNT.inc()
628 |
629 | self._progress_manager.inc()
630 |
631 | def _move_files(self, files_to_move: List[Path], target_dir: Path, dry_run: bool):
632 | """
633 | Moves files on disk
634 | :param files_to_move: list of absolute file paths
635 | :param target_dir: directory to move files to
636 | """
637 | for file_path in files_to_move:
638 | self._progress_manager.set_postfix(self._truncate_middle(file_path))
639 |
640 | try:
641 | if dry_run:
642 | continue
643 |
644 | # move file
645 | if not file_path.exists():
646 | continue
647 |
648 | target_file = Path(str(target_dir), *file_path.parts[1:])
649 | if target_file.exists():
650 | if filecmp.cmp(file_path, target_file, shallow=False):
651 | os.remove(file_path)
652 | else:
653 | raise ValueError(f"Can't move duplicate file because the target already exists: {target_file}")
654 | else:
655 | target_file.parent.mkdir(parents=True, exist_ok=True)
656 | shutil.move(file_path, target_file)
657 |
658 | # remove from persistence
659 | self._persistence.remove(str(file_path))
660 |
661 | DUPLICATE_ACTION_MOVE_COUNT.inc()
662 | except Exception as ex:
663 | logging.exception(ex)
664 |                 # continue with the remaining files
665 | finally:
666 | self._progress_manager.inc()
667 |
668 | @staticmethod
669 | def _truncate_middle(text: any, max_length: int = 50):
670 | text = str(text)
671 | if len(text) <= max_length:
672 |             # string is already short enough, pad it with spaces
673 | return text + ((max_length - len(text)) * " ")
674 |         # half of the maximum length, minus the three dots
675 | n_2 = int(max_length / 2) - 3
676 | # whatever's left
677 | n_1 = max_length - n_2 - 3
678 |         return f"{text[:n_1]}...{text[-n_2:]}"
679 |
680 | def remove_empty_folders(self):
681 | phase_6_text = "Phase 6/6: Removing empty folders"
682 | if not self._config.REMOVE_EMPTY_FOLDERS.value:
683 | echo(phase_6_text + " - Skipping", color='yellow')
684 | else:
685 | echo(phase_6_text, color='cyan')
686 | self._remove_empty_folders(self._config.SOURCE_DIRECTORIES.value, self._config.RECURSIVE.value)
687 |
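688 | # Hedged illustration of the prioritization technique used by
689 | # _select_images_to_delete above: every rule contributes one element to a
690 | # sort key tuple, negated where "bigger is better", with the path appended
691 | # as a stable tie-breaker. The names below are demo-only, not class API.
692 | if __name__ == "__main__":
693 |     demo_candidates = [
694 |         {"path": "b.jpg", "filesize": 100},
695 |         {"path": "a.jpg", "filesize": 200},
696 |         {"path": "c.jpg", "filesize": 200},
697 |     ]
698 |     # bigger file size wins; equal sizes fall back to path order
699 |     best_first = sorted(demo_candidates, key=lambda c: (-c["filesize"], c["path"]))
700 |     assert [c["path"] for c in best_first] == ["a.jpg", "c.jpg", "b.jpg"]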
--------------------------------------------------------------------------------
/py_image_dedup/library/file_watch.py:
--------------------------------------------------------------------------------
1 | import re
2 | from pathlib import Path
3 |
4 | from watchdog.events import FileSystemEventHandler, EVENT_TYPE_MODIFIED, EVENT_TYPE_MOVED, EVENT_TYPE_CREATED, \
5 | EVENT_TYPE_DELETED
6 |
7 | from py_image_dedup.config import DeduplicatorConfig
8 | from py_image_dedup.stats import FILE_EVENT_COUNT
9 | from py_image_dedup.util import echo
10 |
11 |
12 | class EventHandler(FileSystemEventHandler):
13 |
14 | def __init__(self, processing_manager):
15 | super().__init__()
16 | self.processing_manager = processing_manager
17 |
18 | self.config = DeduplicatorConfig()
19 |
20 | self.directory_regex = re.compile(rf"^({'|'.join(list(map(str, self.config.SOURCE_DIRECTORIES.value)))}).*$")
21 | self.file_regex = re.compile(rf"^.*({'|'.join(self.config.FILE_EXTENSION_FILTER.value)})$", re.IGNORECASE)
22 |
23 | def on_any_event(self, event):
24 | if not self._event_matches_filter(event):
25 | return
26 |
27 | FILE_EVENT_COUNT.labels(type=event.event_type).inc()
28 |
29 | echo("FileSystemEvent: {} {} {}".format(event.event_type,
30 | "directory" if event.is_directory else "file",
31 | event.src_path))
32 |
33 | _actions = {
34 | EVENT_TYPE_CREATED: self.created,
35 | EVENT_TYPE_MODIFIED: self.modified,
36 | EVENT_TYPE_MOVED: self.moved,
37 | EVENT_TYPE_DELETED: self.deleted,
38 | }
39 | _actions[event.event_type](event)
40 |
41 | def created(self, event):
42 | self._process(event.src_path)
43 |
44 | def modified(self, event):
45 | self._process(event.src_path)
46 |
47 | def moved(self, event):
48 | self._cleanup(event.src_path)
49 | self._process(event.dest_path)
50 |
51 | def deleted(self, event):
52 | self._cleanup(event.src_path)
53 |
54 | def _process(self, path: str):
55 | self.processing_manager.add(Path(path))
56 |
57 | def _cleanup(self, path: str):
58 | self.processing_manager.remove(Path(path))
59 |
60 | def _event_matches_filter(self, event) -> bool:
61 | if event.is_directory:
62 | return False
63 | else:
64 | result = bool(self.directory_regex.match(event.src_path))
65 | result &= bool(self.file_regex.match(event.src_path))
66 | return result
67 |
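68 | # Hedged usage sketch: wiring the handler to a watchdog observer with a
69 | # stand-in manager (the real one lives in processing_manager.py). Creating
70 | # EventHandler reads DeduplicatorConfig, so a valid configuration with
71 | # source_directories is assumed to be available.
72 | if __name__ == "__main__":
73 |     import time
74 | 
75 |     from watchdog.observers.polling import PollingObserver
76 | 
77 |     class _PrintingManager:
78 |         def add(self, path: Path):
79 |             print(f"queued:  {path}")
80 | 
81 |         def remove(self, path: Path):
82 |             print(f"dropped: {path}")
83 | 
84 |     handler = EventHandler(_PrintingManager())
85 |     observer = PollingObserver()
86 |     for directory in handler.config.SOURCE_DIRECTORIES.value:
87 |         observer.schedule(handler, str(directory), recursive=True)
88 |     observer.start()
89 |     try:
90 |         time.sleep(60)
91 |     finally:
92 |         observer.stop()
93 |         observer.join()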
--------------------------------------------------------------------------------
/py_image_dedup/library/processing_manager.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from datetime import datetime, timedelta
3 | from pathlib import Path
4 | from threading import Lock
5 | from typing import List
6 |
7 | from watchdog.observers.inotify import InotifyObserver
8 | from watchdog.observers.polling import PollingObserver
9 |
10 | from py_image_dedup.config import DeduplicatorConfig, FILE_OBSERVER_TYPE_INOTIFY, FILE_OBSERVER_TYPE_POLLING
11 | from py_image_dedup.library import ActionEnum, RegularIntervalWorker
12 | from py_image_dedup.library.file_watch import EventHandler
13 | from py_image_dedup.library.progress_manager import ProgressManager
14 | from py_image_dedup.util.file import get_files_count
15 |
16 |
17 | class ProcessingManager(RegularIntervalWorker):
18 | lock = Lock()
19 | queue = OrderedDict()
20 |
21 | progress_manager: ProgressManager
22 |
23 | latest_event_time = None
24 |
25 | def __init__(self, deduplicator):
26 | self.config = DeduplicatorConfig()
27 | timeout = self.config.DAEMON_PROCESSING_TIMEOUT.value
28 | interval = timeout.total_seconds()
29 | super().__init__(interval)
30 | self.progress_manager = ProgressManager()
31 | self.deduplicator = deduplicator
32 | self.event_handler = EventHandler(self)
33 | self.observers = []
34 |
35 | def start(self):
36 | observer_type = self.config.DAEMON_FILE_OBSERVER_TYPE.value
37 | directories = self.config.SOURCE_DIRECTORIES.value
38 | self.observers = self._setup_file_observers(observer_type, directories)
39 | super().start()
40 |
41 | def stop(self):
42 | for observer in self.observers:
43 | observer.stop()
44 | observer.join()
45 |
46 | self.observers.clear()
47 |
48 | def _setup_file_observers(self, observer_type: str, source_directories: List[Path]):
49 | observers = []
50 |
51 | for directory in source_directories:
52 | if observer_type == FILE_OBSERVER_TYPE_INOTIFY:
53 | observer = InotifyObserver()
54 | elif observer_type == FILE_OBSERVER_TYPE_POLLING:
55 | observer = PollingObserver()
56 | else:
57 | raise ValueError(f"Unexpected file observer type {observer_type}")
58 |
59 | observer.schedule(self.event_handler, str(directory), recursive=True)
60 | observer.start()
61 | observers.append(observer)
62 |
63 | return observers
64 |
65 | def add(self, path: Path):
66 | with self.lock:
67 | self.latest_event_time = datetime.now()
68 | if path not in self.queue:
69 | self.queue[path] = path
70 |
71 | def remove(self, path: Path):
72 |         with self.lock:  # guard the queue against concurrent add() calls
73 |             self.queue.pop(path, None)
74 |         self.deduplicator._persistence.remove(str(path))
75 |
76 | def _should_process(self):
77 | return len(self.queue) > 0 and (
78 | self.latest_event_time is None or
79 | (datetime.now() - timedelta(seconds=self._interval) > self.latest_event_time)
80 | )
81 |
82 | def _run(self):
83 | with self.lock:
84 | self.process_queue()
85 |
86 | def process_queue(self):
87 | if not self._should_process():
88 | return
89 |
90 | self.progress_manager.start("Processing", len(self.queue), "Files", False)
91 | while True:
92 | try:
93 | path, value = self.queue.popitem()
94 | self._process_queue_item(path, value)
95 | self.progress_manager.inc()
96 | except KeyError:
97 | break
98 | self.progress_manager.clear()
99 |
100 | def _process_queue_item(self, path, value):
101 | self.deduplicator.reset_result()
102 |
103 | # TODO: only a workaround until files can be processed too
104 | if path.is_file():
105 | path = path.parent
106 |
107 | if path.is_dir():
108 | files_count = get_files_count(
109 | path,
110 | self.config.RECURSIVE.value,
111 | self.config.FILE_EXTENSION_FILTER.value,
112 | self.config.EXCLUSIONS.value
113 | )
114 | directory_map = {
115 | path: files_count
116 | }
117 |
118 | self.deduplicator.analyze_directories(directory_map)
119 | self.deduplicator.find_duplicates_in_directories(directory_map)
120 |
121 | # TODO: allow processing individual files
122 | # if path.is_file():
123 | # self.deduplicator.analyze_file(path)
124 | # root_dir = Path(os.path.commonpath([path] + self.config.SOURCE_DIRECTORIES.value))
125 | # self.deduplicator.find_duplicates_of_file(self.config.SOURCE_DIRECTORIES.value, root_dir, path)
126 |
127 | self.deduplicator.process_duplicates()
128 |
129 | # TODO: this needs rethinking
130 | # remove items that have been (re-)moved already from the event queue
131 | removed_items = self.deduplicator._deduplication_result.get_file_with_action(ActionEnum.DELETE)
132 | moved_items = self.deduplicator._deduplication_result.get_file_with_action(ActionEnum.MOVE)
133 | for item in set(removed_items + moved_items):
134 | if item in self.queue:
135 | self.queue.pop(item)
136 | self.progress_manager.inc()
137 |
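138 | # Hedged usage sketch of how daemon mode is expected to wire up (assumes a
139 | # valid configuration and a reachable elasticsearch backend):
140 | if __name__ == "__main__":
141 |     import time
142 | 
143 |     from py_image_dedup.library.deduplicator import ImageMatchDeduplicator
144 | 
145 |     manager = ProcessingManager(ImageMatchDeduplicator(interactive=False))
146 |     manager.start()
147 |     try:
148 |         while True:
149 |             time.sleep(1)
150 |     except KeyboardInterrupt:
151 |         manager.stop()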
--------------------------------------------------------------------------------
/py_image_dedup/library/progress_manager.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from tqdm import tqdm
4 |
5 | LOGGER = logging.getLogger(__name__)
6 | LOGGER.setLevel(logging.DEBUG)
7 |
8 |
9 | class ProgressManager:
10 |
11 | def __init__(self):
12 | self._progress_bar = None
13 | self._task = None
14 | self._n = None
15 | self._total = None
16 | self._unit = None
17 | self._last_percentage = None
18 |
19 | def start(self, task: str, total: int, unit: str, interactive: bool):
20 | if self._task is not None:
21 | LOGGER.warning(f"Starting new progress without explicitly closing the current one '{self._task}'")
22 | self.clear()
23 |
24 | self._task = task
25 | self._total = total
26 | self._unit = unit
27 | if interactive:
28 | self._progress_bar = self._create_progressbar(total, unit)
29 |
30 | def set_postfix(self, postfix: str):
31 | if self._progress_bar is not None and postfix is not None:
32 | self._progress_bar.set_postfix_str(postfix)
33 |
34 | def inc(self, n: int = 1):
35 | if self._task is None:
36 | raise AssertionError(
37 | "Can't increase before start. "
38 | "Please start a new task progress using start() before incrementing it.")
39 |
40 | if self._n is None:
41 | self._n = n
42 | else:
43 | self._n += n
44 |
45 | if self._progress_bar is not None:
46 | self._progress_bar.update(n)
47 |
48 | new_percentage = int((self._n / self._total) * 100)
49 | if self._last_percentage is None or self._last_percentage != new_percentage:
50 | self._last_percentage = new_percentage
51 | LOGGER.info(f"{self._task}: {new_percentage}% ({self._n}/{self._total})")
52 |
53 | def clear(self):
54 | if self._progress_bar is not None:
55 | self._progress_bar.close()
56 | self._progress_bar = None
57 | self._last_percentage = None
58 | self._n = None
59 | self._total = None
60 | self._task = None
61 | self._unit = None
62 |
63 | def _create_progressbar(self, total_count: int, unit: str) -> tqdm:
64 | """
65 | Creates a new progress bar
66 | :param total_count: target for 100%
67 | :param unit: "Things" that are counted
68 | :return: progress bar
69 | """
70 | self._progress_bar = tqdm(total=total_count, unit=unit, unit_scale=True, mininterval=1)
71 | return self._progress_bar
72 |
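73 | # Minimal, self-contained usage sketch: the lifecycle is always
74 | # start() -> inc()/set_postfix() -> clear(). With interactive=True a tqdm
75 | # bar is rendered; otherwise progress is only logged.
76 | if __name__ == "__main__":
77 |     import time
78 | 
79 |     progress = ProgressManager()
80 |     progress.start("Demo", 20, "Item", interactive=True)
81 |     for i in range(20):
82 |         time.sleep(0.05)
83 |         progress.set_postfix(f"item {i}")
84 |         progress.inc()
85 |     progress.clear()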
--------------------------------------------------------------------------------
/py_image_dedup/persistence/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | from PIL import TiffImagePlugin
5 |
6 | from py_image_dedup.persistence.metadata_key import MetadataKey
7 |
8 |
9 | class ImageSignatureStore:
10 | """
11 | Base class for Persistence implementations
12 | """
13 |
14 | DATAMODEL_VERSION = 5
15 |
16 | def __init__(self, use_exif_data: bool = True):
17 | self._use_exif_data = use_exif_data
18 |
19 | def add(self, image_file_path: str):
20 | """
21 | Analyze an image file and add it to the store
22 |
23 | :param image_file_path: path to the image file
24 | """
25 | image_data = self._create_metadata_dict(image_file_path)
26 |
27 | # check if the file has already been analyzed (and didn't change in the meantime)
28 | existing_entity = self.get(image_file_path)
29 | if existing_entity is not None:
30 | is_data_version_ok = False
31 | try:
32 | if MetadataKey.DATAMODEL_VERSION.value in existing_entity[MetadataKey.METADATA.value]:
33 | is_data_version_ok = existing_entity[MetadataKey.METADATA.value][
34 | MetadataKey.DATAMODEL_VERSION.value] == self.DATAMODEL_VERSION
35 |
36 | if is_data_version_ok and \
37 | existing_entity[MetadataKey.METADATA.value][MetadataKey.FILE_SIZE.value] == image_data[
38 | MetadataKey.FILE_SIZE.value] and \
39 | existing_entity[MetadataKey.METADATA.value][
40 | MetadataKey.FILE_MODIFICATION_DATE.value] == image_data[
41 | MetadataKey.FILE_MODIFICATION_DATE.value]:
42 | # print("File is the same, not adding again")
43 | return
44 | except Exception as ex:
45 | logging.exception(ex)
46 | try:
47 | self.remove(image_file_path)
48 | except Exception as ex:
49 | logging.exception(ex)
50 | return
51 |
52 | self._add(image_file_path, image_data)
53 |
54 | def _create_metadata_dict(self, image_file_path: str) -> dict:
55 | """
56 | Creates a dictionary that should be stored in persistence
57 |
58 | :param image_file_path: path to the image file
59 | :return: dictionary containing all relevant information
60 | """
61 | from py_image_dedup.util import image
62 |
63 | image_data = {}
64 | image_data[MetadataKey.PATH.value] = image_file_path
65 |
66 | # get some metadata
67 | file_size = os.stat(image_file_path).st_size
68 | file_modification_date = os.path.getmtime(image_file_path)
69 |
70 | image_data[MetadataKey.DATAMODEL_VERSION.value] = self.DATAMODEL_VERSION
71 | image_data[MetadataKey.FILE_SIZE.value] = file_size
72 | image_data[MetadataKey.FILE_MODIFICATION_DATE.value] = file_modification_date
73 |
74 | image_data[MetadataKey.PIXELCOUNT.value] = image.get_pixel_count(image_file_path)
75 |
76 | if self._use_exif_data:
77 | exif_data = image.get_exif_data(image_file_path)
78 | exif_data = self._normalize_meta_data_for_db(exif_data)
79 | image_data[MetadataKey.EXIF_DATA.value] = exif_data
80 |
81 | return image_data
82 |
83 | def _normalize_meta_data_for_db(self, dictionary: dict) -> dict:
84 | """
85 |         Recursively converts metadata values into types the database can store
86 |         :return: a normalized copy of the given dictionary
87 | """
88 | result = {}
89 | for k, v in dictionary.items():
90 | if isinstance(v, dict):
91 | result[k] = self._normalize_meta_data_for_db(v)
92 | continue
93 |
94 | normalized_value = v
95 | if isinstance(v, bytes) or isinstance(v, tuple):
96 | normalized_value = str(v)
97 | elif isinstance(v, TiffImagePlugin.IFDRational):
98 | if v._denominator != 0:
99 | normalized_value = v._numerator / v._denominator
100 | else:
101 | normalized_value = float(v._numerator)
102 |
103 | result[k] = normalized_value
104 |
105 | return result
106 |
107 | def _add(self, image_file_path: str, image_data: dict) -> None:
108 | """
109 | Saves image data for the specified image file path
110 |
111 | :param image_file_path: image file path
112 | :param image_data: metadata for the image
113 | """
114 | raise NotImplementedError()
115 |
116 | def get(self, image_file_path: str) -> dict or None:
117 | """
118 |         Get a store entry by its file_path
119 | :param image_file_path: file path to search for
120 | :return: store entry or None
121 | """
122 | raise NotImplementedError()
123 |
124 | def get_all(self) -> (int, object):
125 | """
126 | :return: item count, stored entries as a generator function
127 | """
128 | raise NotImplementedError()
129 |
130 | def find_similar(self, reference_image_file_path: str) -> []:
131 | """
132 | Search for similar images to the specified one
133 |
134 | :param reference_image_file_path: the reference image file
135 | :return: list of images that are similar to the reference file
136 | """
137 | raise NotImplementedError()
138 |
139 | def remove(self, image_file_path: str) -> None:
140 | """
141 | Remove all entries with the given file path
142 |
143 | :param image_file_path: the path of an image file
144 | """
145 | raise NotImplementedError()
146 |
147 | def remove_entries_of_missing_files(self):
148 | """
149 | Remove all entries with files that don't exist
150 | """
151 |         entry_count, entries = self.get_all()
152 | for entry in entries:
153 | file_path = entry['path']
154 | if not os.path.exists(file_path):
155 | self.remove(file_path)
156 |
157 | def remove_all(self) -> None:
158 | """
159 | Remove all entries from Database
160 | """
161 | raise NotImplementedError()
162 |
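163 | # Hedged sketch (illustration only, not used anywhere in the project): the
164 | # smallest conceivable concrete store, keeping entries in a dict. Note that
165 | # add() above expects get() to return entities shaped like
166 | # {"path": ..., "metadata": {...}}, which _add() mirrors here.
167 | class InMemorySignatureStore(ImageSignatureStore):
168 | 
169 |     def __init__(self):
170 |         super().__init__(use_exif_data=False)
171 |         self._entries = {}
172 | 
173 |     def _add(self, image_file_path: str, image_data: dict) -> None:
174 |         self._entries[image_file_path] = {
175 |             MetadataKey.PATH.value: image_file_path,
176 |             MetadataKey.METADATA.value: image_data,
177 |         }
178 | 
179 |     def get(self, image_file_path: str) -> dict or None:
180 |         return self._entries.get(image_file_path)
181 | 
182 |     def get_all(self) -> (int, object):
183 |         return len(self._entries), iter(list(self._entries.values()))
184 | 
185 |     def find_similar(self, reference_image_file_path: str) -> []:
186 |         # exact-path lookup only; a real backend searches by image signature
187 |         entry = self.get(reference_image_file_path)
188 |         return [entry] if entry is not None else []
189 | 
190 |     def remove(self, image_file_path: str) -> None:
191 |         self._entries.pop(image_file_path, None)
192 | 
193 |     def remove_all(self) -> None:
194 |         self._entries.clear()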
--------------------------------------------------------------------------------
/py_image_dedup/persistence/elasticsearchstorebackend.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 |
4 | import requests
5 | from elasticsearch import Elasticsearch
6 | from image_match.elasticsearch_driver import SignatureES
7 |
8 | from py_image_dedup.persistence import ImageSignatureStore
9 | from py_image_dedup.util import echo
10 |
11 |
12 | class ElasticSearchStoreBackend(ImageSignatureStore):
13 | DEFAULT_EL_DOC_TYPE_EL_6 = 'image'
14 | DEFAULT_EL_DOC_TYPE_EL_7 = '_doc'
15 |
16 | def __init__(self,
17 | host: str,
18 | port: int,
19 | connections_per_node: int,
20 | el_index: str,
21 | el_version: int = None,
22 | el_doctype: str = None,
23 | max_dist: float = 0.03,
24 | use_exif_data: bool = True,
25 | setup_database: bool = True,
26 | ):
27 | """
28 | Image signature persistence backed by image_match and elasticsearch
29 |
30 | :param host: host address of the elasticsearch server
31 | :param port: port of the elasticsearch server
32 | :param el_version: elasticsearch version
33 | :param el_index: elasticsearch index where the data is stored
34 | :param el_doctype: elasticsearch document type of the stored data
35 |         :param max_dist: maximum "difference" allowed, in the range [0 .. 1]; 0.2 still indicates a fairly similar image
36 | """
37 | super().__init__(use_exif_data)
38 |
39 | self.host = host
40 | self.port = port
41 | self._connections_per_node = connections_per_node
42 |
43 | detected_version = None
44 | while detected_version is None:
45 | time.sleep(2)
46 | detected_version = self._detect_db_version()
47 |
48 | self._el_version = el_version
49 | if self._el_version is not None and detected_version is not None and self._el_version != detected_version:
50 | raise AssertionError(
51 | "Detected database version ({}) does not match expected version ({})".format(detected_version,
52 | self._el_version))
53 |
54 | if detected_version is not None:
55 | self._el_version = detected_version
56 | elif self._el_version is None:
57 | # assume version 6 by default
58 | self._el_version = 6
59 |
60 | self._el_index = el_index
61 | if el_doctype is not None:
62 | self._el_doctype = el_doctype
63 | else:
64 | self._el_doctype = self.DEFAULT_EL_DOC_TYPE_EL_6 if self._el_version < 7 else self.DEFAULT_EL_DOC_TYPE_EL_7
65 |
66 | self.setup_database = setup_database
67 | if setup_database:
68 | try:
69 | # self._clear_database()
70 | self._setup_database()
71 | except Exception as e:
72 | logging.exception(e)
73 |                 raise AssertionError("Could not set up database")
74 |
75 | # noinspection PyTypeChecker
76 | self._store = SignatureES(
77 | es=Elasticsearch(
78 | hosts=[
79 | {'host': self.host, 'port': self.port}
80 | ],
81 | maxsize=self._connections_per_node,
82 | ),
83 | # el_version=self._el_version,
84 | index=self._el_index,
85 | doc_type=self._el_doctype,
86 | distance_cutoff=max_dist,
87 | )
88 |
89 | def _detect_db_version(self) -> int or None:
90 | try:
91 | response = requests.get('http://{}:{}'.format(self.host, self.port))
92 | response.raise_for_status()
93 | return int(str(response.json()["version"]['number']).split(".")[0])
94 | except Exception as ex:
95 | logging.exception(ex)
96 | return None
97 |
98 | def _setup_database(self):
99 | """
100 | Creates the expected index, if it does not exist
101 | """
102 | response = requests.get('http://{}:{}/{}'.format(self.host, self.port, self._el_index))
103 | if response.status_code == 200:
104 | return
105 | elif response.status_code == 404:
106 |
107 | properties = {
108 | "properties": {
109 | "path": {
110 | "type": "keyword",
111 | "ignore_above": 256
112 | }
113 | }
114 | }
115 |
116 | if self._el_version == 7:
117 | json_data = {
118 | "mappings": properties
119 | }
120 | else:
121 | json_data = {
122 | "mappings": {
123 | self._el_doctype: properties
124 | }
125 | }
126 |
127 | response = requests.put(
128 | url='http://{}:{}/{}'.format(self.host, self.port, self._el_index),
129 | json=json_data
130 | )
131 |
132 | response.raise_for_status()
133 | else:
134 | response.raise_for_status()
135 |
136 | def _clear_database(self):
137 | """
138 | Removes the index and all data it contains
139 | """
140 | requests.delete('http://{}:{}/{}'.format(self.host, self.port, self._el_index))
141 |
142 | def _add(self, image_file_path: str, image_data: dict) -> None:
143 | # remove existing entries
144 | self.remove(image_file_path)
145 | self._store.add_image(image_file_path, metadata=image_data)
146 |
147 | def get(self, image_file_path: str) -> dict or None:
148 | """
149 |         Get a store entry by its file_path
150 | :param image_file_path: file path to search for
151 | :return:
152 | """
153 | db_entity = self._get(image_file_path)
154 | return db_entity
155 |
156 | def _get(self, image_file_path: str) -> dict or None:
157 | """
158 |         Get a store entry by its file_path
159 | :param image_file_path: file path to search for
160 | :return: elasticsearch result dictionary
161 | """
162 | es_query = {
163 | 'query': {
164 | "constant_score": {
165 | "filter": {
166 | "term": {'path': image_file_path}
167 | }
168 | }
169 | }
170 | }
171 |
172 | query_result = self._store.es.search(index=self._el_index, body=es_query)
173 |
174 | hits = query_result['hits']['hits']
175 |
176 | if len(hits) > 1:
177 | echo(f"WARNING: More than a single entry for a file, cleaning up: {image_file_path}", color='yellow')
178 | self.remove(image_file_path)
179 | self.add(image_file_path)
180 |
181 | if len(hits) == 0:
182 | return None
183 | else:
184 | return hits[0]['_source']
185 |
186 | def get_all(self) -> (int, object):
187 | es_query = {
188 | "track_total_hits": True,
189 | 'query': {'match_all': {}}
190 | }
191 |
192 | item_count = self._store.es.search(index=self._el_index, body=es_query, size=0)['hits']['total']
193 | if self._el_version >= 7:
194 | item_count = item_count['value']
195 |
196 | from elasticsearch.helpers import scan
197 |
198 | el6_params = {
199 | "doc_type": self._el_doctype
200 | }
201 | return item_count, scan(
202 | self._store.es,
203 | index=self._el_index,
204 | preserve_order=True,
205 | query=es_query,
206 | **(el6_params if self._el_version < 7 else {})
207 | )
208 |
209 | def find_similar(self, reference_image_file_path: str) -> []:
210 | try:
211 | entry = self._get(reference_image_file_path)
212 | if entry is not None:
213 | result = []
214 | rec = self._store.search_single_record(entry)
215 | result.extend(rec)
216 |
217 | return result
218 | else:
219 | return self._store.search_image(reference_image_file_path, all_orientations=True)
220 | except Exception as e:
221 | echo(f"Error querying database for similar images of '{reference_image_file_path}': {e}", color="red")
222 | return []
223 |
224 | def search_metadata(self, metadata: dict) -> []:
225 | """
226 | Search for images with metadata properties.
227 |
228 | Note: Metadata will be empty if you did not provide it when adding an image
229 | :param metadata:
230 | :return:
231 | """
232 | search_dict = {}
233 | for key, value in metadata.items():
234 | search_dict[f"metadata.{key}"] = value
235 |
236 | es_query = {
237 | 'query': {'match': search_dict}
238 | }
239 |
240 | return self._store.es.search(index=self._el_index, body=es_query)
241 |
242 | def remove(self, image_file_path: str) -> None:
243 | # NOTE: this query will only work if the index has been created
244 | # with a custom mapping for the path property:
245 |
246 | # # remove existing index
247 | # curl -X DELETE "192.168.2.24:9200/images"
248 | #
249 | # # create index with custom mapping for "path"
250 | # curl -X PUT "192.168.2.24:9200/images?pretty" -H "Content-Type: application/json" -d
251 | # "
252 | # {
253 | # "mappings": {
254 | # "image": {
255 | # "properties": {
256 | # "path": {
257 | # "type": "keyword",
258 | # "ignore_above": 256
259 | # }
260 | # }
261 | # }
262 | # }
263 | # }
264 | # "
265 |
266 | es_query = {
267 | 'query': {
268 | "constant_score": {
269 | "filter": {
270 | "term": {'path': image_file_path}
271 | }
272 | }
273 | }
274 | }
275 |
276 | self._remove_by_query(es_query)
277 |
278 | def remove_all(self) -> None:
279 | es_query = {
280 | 'query': {'match_all': {}}
281 | }
282 |
283 | self._remove_by_query(es_query)
284 |
285 | def _remove_by_query(self, es_query: dict):
286 | el6_params = {
287 | "doc_type": self._el_doctype
288 | }
289 |
290 | return self._store.es.delete_by_query(
291 | index=self._el_index,
292 | body=es_query,
293 | conflicts="proceed",
294 | **(el6_params if self._el_version < 7 else {})
295 | )
296 |
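297 | # Hedged usage sketch (requires a reachable elasticsearch instance; the
298 | # values mirror the defaults from py_image_dedup_reference.yaml, and the
299 | # image path is a placeholder):
300 | if __name__ == "__main__":
301 |     store = ElasticSearchStoreBackend(
302 |         host="127.0.0.1",
303 |         port=9200,
304 |         connections_per_node=10,
305 |         el_index="images",
306 |         max_dist=0.1,
307 |     )
308 |     store.add("/home/myuser/pictures/IMG_001.jpg")
309 |     for match in store.find_similar("/home/myuser/pictures/IMG_001.jpg"):
310 |         print(match)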
--------------------------------------------------------------------------------
/py_image_dedup/persistence/metadata_key.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class MetadataKey(Enum):
5 | METADATA = "metadata"
6 |
7 | DATAMODEL_VERSION = "py-image-dedup_datamodel-version"
8 |
9 | PATH = "path"
10 | DISTANCE = "dist"
11 | SCORE = "score"
12 |
13 | FILE_SIZE = "filesize"
14 | FILE_MODIFICATION_DATE = "file_modification_date"
15 |
16 | PIXELCOUNT = "pixelcount"
17 | EXIF_DATA = "exif_data"
18 |
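19 | # For reference, a duplicate candidate entity built from these keys looks
20 | # like this (cf. _create_default_candidate in
21 | # tests/test_select_images_to_delete.py):
22 | #
23 | #   {
24 | #       "path": "C:/test.jpg",
25 | #       "dist": 0.05,
26 | #       "score": 64,
27 | #       "metadata": {
28 | #           "filesize": 100,
29 | #           "file_modification_date": 1,
30 | #           "pixelcount": 10000,
31 | #           "exif_data": {},
32 | #       },
33 | #   }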
--------------------------------------------------------------------------------
/py_image_dedup/stats.py:
--------------------------------------------------------------------------------
1 | from prometheus_client import Gauge, Summary
2 |
3 | DUPLICATE_ACTION_COUNT = Gauge(
4 | 'duplicate_action_total',
5 | 'Number of images per action',
6 | ['action']
7 | )
8 | DUPLICATE_ACTION_NONE_COUNT = DUPLICATE_ACTION_COUNT.labels(action="none")
9 | DUPLICATE_ACTION_MOVE_COUNT = DUPLICATE_ACTION_COUNT.labels(action="move")
10 | DUPLICATE_ACTION_DELETE_COUNT = DUPLICATE_ACTION_COUNT.labels(action="delete")
11 |
12 | FILE_EVENT_COUNT = Gauge(
13 | 'file_event_total',
14 | 'Number of file events per event type',
15 | ['type']
16 | )
17 |
18 | ANALYSIS_TIME = Summary('analyse_file_summary', 'Time spent analysing a file')
19 |
20 | FIND_DUPLICATES_TIME = Summary('find_duplicates_summary', 'Time spent finding duplicates of a file')
21 |
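22 | # Hedged usage sketch: serving these metrics over HTTP with the stock
23 | # prometheus_client exporter (port 8000 matches the reference config):
24 | if __name__ == "__main__":
25 |     import time
26 | 
27 |     from prometheus_client import start_http_server
28 | 
29 |     start_http_server(8000)
30 |     DUPLICATE_ACTION_DELETE_COUNT.inc()
31 |     time.sleep(60)  # metrics now visible at http://localhost:8000/metrics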
--------------------------------------------------------------------------------
/py_image_dedup/util/__init__.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import logging
3 | import traceback
4 |
5 | import click
6 |
7 | LOGGER = logging.getLogger(__name__)
8 | LOGGER.setLevel(logging.DEBUG)
9 |
10 |
11 | def echo(text: str = "", color=None):
12 | """
13 | Prints a text to the console
14 | :param text: the text
15 | :param color: an optional color
16 | """
17 |     if not isinstance(text, str):
18 | text = str(text)
19 | if color:
20 | text = click.style(text, fg=color)
21 | if len(text) > 0:
22 | LOGGER.debug(text)
23 | click.echo(text)
24 |
25 |
26 | def reraise_with_stack(func):
27 | """
28 | Decorator used to reraise exceptions occurring within a future.
29 |
30 | :param func: function to decorate
31 | :return: decorated function
32 | """
33 |
34 | @functools.wraps(func)
35 | def wrapped(*args, **kwargs):
36 | try:
37 | return func(*args, **kwargs)
38 | except Exception as e:
39 | traceback_str = traceback.format_exc()
40 |             raise ValueError("Error occurred. Original traceback is\n%s\n" % traceback_str) from e
41 |
42 | return wrapped
43 |
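44 | # Hedged usage sketch for reraise_with_stack: the decorated worker runs in a
45 | # future, and the original traceback text survives as the ValueError message.
46 | if __name__ == "__main__":
47 |     from concurrent.futures import ThreadPoolExecutor
48 | 
49 |     @reraise_with_stack
50 |     def _worker(x):
51 |         return 1 / x
52 | 
53 |     with ThreadPoolExecutor() as executor:
54 |         future = executor.submit(_worker, 0)
55 |         try:
56 |             future.result()
57 |         except ValueError as error:
58 |             echo(str(error), color="red")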
--------------------------------------------------------------------------------
/py_image_dedup/util/file.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from typing import List
4 |
5 |
6 | def get_file_name(file_path: str) -> str:
7 | folder, file = os.path.split(file_path)
8 | return file
9 |
10 |
11 | def get_containing_folder(file_path: str) -> str:
12 | folder, file = os.path.split(file_path)
13 | return folder
14 |
15 |
16 | def file_has_extension(file: Path, extensions: List[str] or None) -> bool:
17 | """
18 | Checks if a file has one of the given extensions
19 | :param file: the file to check
20 | :param extensions: allowed extensions
21 | :return: true if it matches (case insensitive), false otherwise
22 | """
23 |     if not extensions:
24 |         return True
25 |     if not isinstance(extensions, list):
26 |         extensions = [extensions]
27 |
28 | if file.suffix.lower() not in (ext.lower() for ext in extensions):
29 | # skip file with unwanted file extension
30 | return False
31 | else:
32 | return True
33 |
34 |
35 | def get_files_count(directory: Path, recursive: bool, file_extensions: List[str] or None, exclusions: List) -> int:
36 | """
37 | :param directory: the directory to analyze
38 | :param recursive: whether to search the directory recursively
39 | :param file_extensions: file extensions to include
40 |     :return: number of files in the directory that match the extension filter and are not excluded
41 | """
42 | files_count = 0
43 | for r, d, files in os.walk(str(directory)):
44 | for file in files:
45 |             file = Path(r, file)
46 |             if any(map(lambda x: x.search(str(file.absolute())), exclusions)):
47 | continue
48 | if not file_has_extension(file, file_extensions):
49 | continue
50 | files_count += 1
51 | if not recursive:
52 | break
53 |
54 | return files_count
55 |
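56 | # Hedged usage sketch exercising the helpers above; paths and patterns are
57 | # throwaway examples:
58 | if __name__ == "__main__":
59 |     import re
60 | 
61 |     print(get_file_name("/home/user/pictures/IMG_001.jpg"))          # IMG_001.jpg
62 |     print(get_containing_folder("/home/user/pictures/IMG_001.jpg"))  # /home/user/pictures
63 |     print(file_has_extension(Path("photo.JPG"), [".jpg"]))           # True
64 |     excluded = [re.compile(r".*/venv/.*")]
65 |     print(get_files_count(Path("."), recursive=False, file_extensions=[".py"], exclusions=excluded))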
--------------------------------------------------------------------------------
/py_image_dedup/util/image.py:
--------------------------------------------------------------------------------
1 | import PIL.ExifTags
2 | from PIL import Image
3 |
4 |
5 | def get_exif_data(image_file_path: str) -> {}:
6 | """
7 | Tries to extract all exif data from the given image file
8 | :param image_file_path: path of the image file
9 | :return: dictionary containing all available exif data entries and their values
10 | """
11 |
12 | result = {}
13 | try:
14 | img = Image.open(image_file_path)
15 |
16 | exif_data = img._getexif()
17 | if not exif_data:
18 | return result
19 |
20 | for k, v in exif_data.items():
21 | if k in PIL.ExifTags.TAGS:
22 | tag_name = PIL.ExifTags.TAGS[k]
23 | result[tag_name] = v
24 |     except Exception:
25 |         pass  # not a readable image, return what we have so far
26 | return result
27 |
28 |
29 | def get_pixel_count(image_file_path: str) -> int:
30 | try:
31 | img = Image.open(image_file_path)
32 | width, height = img.size
33 | return width * height
34 |     except Exception:
35 |         pass  # not a readable image
36 | return 0
37 |
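38 | # Hedged usage sketch, pointing at one of the test images in this repository
39 | # (assumes it is run from the repository root):
40 | if __name__ == "__main__":
41 |     path = "tests/images/bottles/IMG_20190903_193151.jpg"
42 |     print(get_pixel_count(path))
43 |     for tag, value in get_exif_data(path).items():
44 |         print(f"{tag}: {value}")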
--------------------------------------------------------------------------------
/py_image_dedup_reference.yaml:
--------------------------------------------------------------------------------
1 | # This is a reference configuration file explaining all the options
2 | # of py-image-dedup.
3 |
4 | py_image_dedup:
5 | # Configuration for the analysis phase, see README.md
6 | analysis:
7 | # Whether to search for duplicates across directories when
8 | # specifying more than one image source directory
9 | across_dirs: false
10 | # A filter for the file extensions to analyse
11 | file_extensions:
12 | - .png
13 | - .jpg
14 | - .jpeg
15 | # Whether to search recursively in each of the source directories
16 | recursive: true
17 | # A list of source directories to analyse
18 | source_directories:
19 | - /home/myuser/pictures/
20 | # A list of regex patterns to ignore when traversing any of the
21 | # source directories
22 | exclusions:
23 | - ".*/excluded/.*"
24 | # The number of threads to use for image analysis.
25 | # If unset, this will default to `os.cpu_count()`.
26 | threads: 1
27 | # Whether to include EXIF data of images in the analysis
28 | use_exif_data: true
29 |
30 | # Deduplication phase specific configuration options, see README.md
31 | deduplication:
32 | # The target directory to move duplicate images to
33 | duplicates_target_directory: /home/myuser/pictures/duplicates/
34 |     # Upper limit on the difference between the modification dates of two
35 |     # images for them to still be considered duplicates of each other.
36 | max_file_modification_time_diff: 0:05:00
37 | # Specifies the criteria and their order for ordering the list of duplicates
38 | # to select the best copy.
39 | prioritization_rules:
40 | - name: "more-exif-data"
41 | - name: "less-exif-data"
42 | - name: "bigger-file-size"
43 | - name: "smaller-file-size"
44 | - name: "newer-file-modification-date"
45 | - name: "older-file-modification-date"
46 | - name: "smaller-distance"
47 | - name: "bigger-distance"
48 | - name: "longer-path"
49 | - name: "shorter-path"
50 | - name: "contains-copy-in-file-name"
51 | - name: "longer-file-name"
52 | - name: "shorter-file-name"
53 | - name: "longer-folder-path"
54 | - name: "shorter-folder-path"
55 | - name: "higher-score"
56 | - name: "lower-score"
57 |
58 | # Daemon specific configuration options, see README.md
59 | daemon:
60 |     # Time to wait for filesystem changes to settle before analysing.
61 | timeout: 30s
62 | # The type of file observer to use.
63 | # One of: polling, inotify
64 | file_observer: polling
65 | # A dry run can be used to validate the log output of a specific configuration
66 | # before actually deleting or moving any images in any of the source
67 | # directories.
68 | dry_run: true
69 |
70 | # Elasticsearch specific configuration options, see README.md
71 | elasticsearch:
72 | # Whether to automatically create an index in the target database.
73 | auto_create_index: true
74 | # Hostname of the elasticsearch backend instance to use
75 | host: 127.0.0.1
76 | # Port of the elasticsearch backend instance to use.
77 | port: 9200
78 | # The index name to use for storing and querying image analysis data.
79 | index: images
80 | # Maximum signature distance [0..1] to query from elasticsearch backend.
81 | max_distance: 0.1
82 | # Whether to remove empty folders or not.
83 | remove_empty_folders: false
84 |
85 | # Prometheus exporter specific configuration options, see README.md
86 | stats:
87 | # Whether to enable prometheus statistics or not.
88 | enabled: true
89 | # The port to expose statistics on.
90 | port: 8000
91 |
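92 | # Hedged note: container-app-conf, the configuration library used by
93 | # py-image-dedup, also reads values from environment variables; a key like
94 | # py_image_dedup -> analysis -> recursive is expected to map to
95 | # PY_IMAGE_DEDUP_ANALYSIS_RECURSIVE.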
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "py-image-dedup"
3 | version = "2.0.1"
4 | description = "A library to find duplicate images and delete unwanted ones"
5 |
6 | license = "AGPLv3+"
7 |
8 | authors = [
9 | "Markus Ressel ",
10 | ]
11 |
12 | readme = 'README.md'
13 |
14 | repository = "https://github.com/markusressel/py-image-dedup"
15 | homepage = "https://github.com/markusressel/py-image-dedup"
16 |
17 | keywords = ['deduplication', 'py-image-dedup']
18 |
19 | classifiers = [
20 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
21 | "Programming Language :: Python :: 3 :: Only",
22 | "Programming Language :: Python :: 3",
23 | "Programming Language :: Python :: 3.8",
24 | "Programming Language :: Python :: 3.9",
25 | "Development Status :: 5 - Production/Stable"
26 | ]
27 |
28 | [build-system]
29 | requires = ["poetry-core>=1.0.0"]
30 | build-backend = "poetry.core.masonry.api"
31 |
32 | [tool.poetry.dependencies]
33 | python = "^3.11" # Compatible python versions must be declared here
34 |
35 | Pillow = "*"
36 | ordered-set = "*"
37 | watchdog = ">=0.10.2,<6.1.0"
38 | elasticsearch = "^7"
39 | scipy = "*"
40 | numpy = "*"
41 | container-app-conf = "^5.0.0"
42 | requests = "^2.20.0"
43 | click = "*"
44 | tabulate = ">=0.8.3,<0.10.0"
45 | tqdm = "*"
46 | prometheus-client = "*"
47 | image-match = { git = "https://github.com/markusressel/image-match.git", tag = "3.0.0" }
48 |
49 | [tool.poetry.group.dev.dependencies]
50 | pytest = "*"
51 |
52 | [tool.poetry.scripts]
53 | py-image-dedup = 'py_image_dedup.cli:cli'
54 |
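55 | # Hedged note: with the [tool.poetry.scripts] entry above, a local checkout
56 | # is expected to be runnable via:
57 | #   poetry install
58 | #   poetry run py-image-dedup --help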
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths = tests
3 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from py_image_dedup.config import DeduplicatorConfig
4 |
5 |
6 | class TestBase(unittest.TestCase):
7 |
8 | def setUp(self):
9 | self.config = DeduplicatorConfig()
10 | from py_image_dedup.library.deduplicator import ImageMatchDeduplicator
11 | self.under_test = ImageMatchDeduplicator(interactive=False)
12 |
13 | def tearDown(self):
14 | pass
15 |
16 | if __name__ == '__main__':
17 | unittest.main()
18 |
--------------------------------------------------------------------------------
/tests/images/bottles/IMG_20190903_193151-edited.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/bottles/IMG_20190903_193151-edited.jpg
--------------------------------------------------------------------------------
/tests/images/bottles/IMG_20190903_193151-grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/bottles/IMG_20190903_193151-grayscale.jpg
--------------------------------------------------------------------------------
/tests/images/bottles/IMG_20190903_193151-telegram-compression.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/bottles/IMG_20190903_193151-telegram-compression.jpg
--------------------------------------------------------------------------------
/tests/images/bottles/IMG_20190903_193151.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/bottles/IMG_20190903_193151.jpg
--------------------------------------------------------------------------------
/tests/images/building/IMG_20190903_193508-edited.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/building/IMG_20190903_193508-edited.jpg
--------------------------------------------------------------------------------
/tests/images/building/IMG_20190903_193508-grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/building/IMG_20190903_193508-grayscale.jpg
--------------------------------------------------------------------------------
/tests/images/building/IMG_20190903_193508-telegram-compression.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/building/IMG_20190903_193508-telegram-compression.jpg
--------------------------------------------------------------------------------
/tests/images/building/IMG_20190903_193508.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/building/IMG_20190903_193508.jpg
--------------------------------------------------------------------------------
/tests/images/clouds/IMG_20190903_193537-edited.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/clouds/IMG_20190903_193537-edited.jpg
--------------------------------------------------------------------------------
/tests/images/clouds/IMG_20190903_193537-grayscale.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/clouds/IMG_20190903_193537-grayscale.jpg
--------------------------------------------------------------------------------
/tests/images/clouds/IMG_20190903_193537-telegram-compression.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/clouds/IMG_20190903_193537-telegram-compression.jpg
--------------------------------------------------------------------------------
/tests/images/clouds/IMG_20190903_193537.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/markusressel/py-image-dedup/c24afc64eb339ae339f15ed03b9ddd8e83a799ce/tests/images/clouds/IMG_20190903_193537.jpg
--------------------------------------------------------------------------------
/tests/py_image_dedup.yaml:
--------------------------------------------------------------------------------
1 | py_image_dedup:
2 | dry_run: true
3 | analysis:
4 | across_dirs: true
5 | file_extensions:
6 | - .png
7 | - .jpg
8 | - .jpeg
9 | recursive: true
10 | source_directories:
11 | - ./images/
12 | threads: 8
13 | use_exif_data: true
14 | deduplication:
15 | # duplicates_target_directory:
16 | max_file_modification_time_diff: 0:01:40
17 | prioritization_rules:
18 | - name: "more-exif-data"
19 | - name: "less-exif-data"
20 | - name: "bigger-file-size"
21 | - name: "smaller-file-size"
22 | - name: "newer-file-modification-date"
23 | - name: "older-file-modification-date"
24 | - name: "smaller-distance"
25 | - name: "bigger-distance"
26 | - name: "longer-path"
27 | - name: "shorter-path"
28 | - name: "contains-copy-in-file-name"
29 | - name: "longer-file-name"
30 | - name: "shorter-file-name"
31 | - name: "longer-folder-path"
32 | - name: "shorter-folder-path"
33 | - name: "higher-score"
34 | - name: "lower-score"
35 |
36 | elasticsearch:
37 | auto_create_index: true
38 | host: 127.0.0.1
39 | max_distance: 0.1
40 | remove_empty_folders: false
41 |
42 |
--------------------------------------------------------------------------------
/tests/test_file_extension.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from py_image_dedup.util.file import file_has_extension
4 | from tests import TestBase
5 |
6 |
7 | class FileExtensionTest(TestBase):
8 |
9 | def test_png(self):
10 | paths = [
11 | "file.png",
12 | "file.PNG"
13 | ]
14 | for path in paths:
15 | path = Path(path)
16 | self.assertTrue(file_has_extension(path, [".png"]))
17 |
--------------------------------------------------------------------------------
/tests/test_select_images_to_delete.py:
--------------------------------------------------------------------------------
1 | import random
2 | import unittest
3 | from random import shuffle
4 | from random import uniform
5 |
6 | from py_image_dedup.persistence.metadata_key import MetadataKey
7 | from tests import TestBase
8 |
9 |
10 | class SelectImagesToDeleteTest(TestBase):
11 |
12 | def test_select_images_to_delete__filter_max_mod_time_diff(self):
13 | keep = [
14 | self._create_default_candidate(modification_date=1500),
15 |             # these are too far apart in modification time from the best candidate and are therefore kept
16 | self._create_default_candidate(modification_date=1001),
17 | ]
18 |
19 | max_delta_seconds = int(self.config.MAX_FILE_MODIFICATION_TIME_DELTA.value.total_seconds())
20 | for i in range(50):
21 | c = self._create_default_candidate(modification_date=random.choice(range(0, 1500 - max_delta_seconds)))
22 | keep.append(c)
23 |
24 | dont_keep = [
25 | self._create_default_candidate(modification_date=1450)
26 | ]
27 |
28 | self._run_test(keep, dont_keep)
29 |
30 | def test_select_images_to_delete__contains_copy(self):
31 | keep = [self._create_default_candidate(path="C:/1.jpg")]
32 |
33 | dont_keep = []
34 | for i in range(50):
35 | c = self._create_default_candidate(path=f"C:/1{i}-Copy.jpg")
36 | dont_keep.append(c)
37 |
38 | self._run_test(keep, dont_keep)
39 |
40 | def test_select_images_to_delete__newer_and_bigger(self):
41 | keep = [self._create_default_candidate(filesize=100, modification_date=100)]
42 |
43 | dont_keep = []
44 | for i in range(50):
45 | c = self._create_default_candidate(filesize=i, modification_date=i)
46 | dont_keep.append(c)
47 |
48 | self._run_test(keep, dont_keep)
49 |
50 | def test_select_images_to_delete__newer(self):
51 | keep = [self._create_default_candidate(modification_date=100)]
52 |
53 | dont_keep = []
54 | for i in range(50):
55 | c = self._create_default_candidate(modification_date=i)
56 | dont_keep.append(c)
57 |
58 | self._run_test(keep, dont_keep)
59 |
60 | def test_select_images_to_delete__bigger(self):
61 | keep = [self._create_default_candidate(filesize=100)]
62 |
63 | dont_keep = []
64 | for i in range(50):
65 | c = self._create_default_candidate(filesize=i)
66 | dont_keep.append(c)
67 |
68 | self._run_test(keep, dont_keep)
69 |
70 | def test_select_images_to_delete__all_the_same(self):
71 | keep = [self._create_default_candidate(path="C:/00000.jpg")]
72 |
73 | dont_keep = []
74 | for i in range(50):
75 | c = self._create_default_candidate(path=f"C:/1{i}.jpg")
76 | dont_keep.append(c)
77 |
78 | self._run_test(keep, dont_keep)
79 |
80 | def test_select_images_to_delete__all_the_same_2(self):
81 | keep = [self._create_default_candidate(path="C:/50-edited.jpg")]
82 |
83 | dont_keep = []
84 | for i in range(50):
85 | c = self._create_default_candidate(path=f"C:/{i}.jpg")
86 | dont_keep.append(c)
87 |
88 | self._run_test(keep, dont_keep)
89 |
90 | def test_select_images_to_delete__higher_score(self):
91 | keep = [self._create_default_candidate(score=100)]
92 |
93 | dont_keep = []
94 | for i in range(50):
95 | c = self._create_default_candidate()
96 | dont_keep.append(c)
97 |
98 | self._run_test(keep, dont_keep)
99 |
100 | def test_select_images_to_delete__lower_dist(self):
101 | keep = [self._create_default_candidate(dist=0)]
102 |
103 | dont_keep = []
104 | for i in range(50):
105 | c = self._create_default_candidate(dist=uniform(0.1, 1.0))
106 | dont_keep.append(c)
107 |
108 | self._run_test(keep, dont_keep)
109 |
110 | def test_select_images_to_delete__real_example(self):
111 | keep = [self._create_default_candidate(
112 | path=r"M:\Fotos\Markus\Google Photos Archiv\Takeout\Google Photos\2017-06-17\20170617_153437.jpg",
113 | filesize=10000000, modification_date=1)]
114 |
115 | dont_keep = []
116 | for i in range(50):
117 | c = self._create_default_candidate(
118 | path=r"M:\Fotos\Iris\Syncthing\Telegram Empfangen\223023133_644761%i.jpg" % i,
119 | filesize=270000, modification_date=2)
120 | dont_keep.append(c)
121 |
122 | self._run_test(keep, dont_keep)
123 |
124 | def _run_test(
125 | self, keep: [{}], dont_keep: [{}], test_reversed_order: bool = True,
126 | test_random_input_order: bool = True
127 | ):
128 | candidates = keep + dont_keep
129 |
130 | kept, not_kept = self.under_test._select_images_to_delete(candidates)
131 | self._test_result_outcome(kept, not_kept, keep, dont_keep)
132 |
133 | if test_reversed_order:
134 | kept, not_kept = self.under_test._select_images_to_delete(reversed(candidates))
135 | self._test_result_outcome(kept, not_kept, keep, dont_keep)
136 |
137 | if test_random_input_order:
138 | # test random sort orders of input just to be sure
139 | for i in range(50):
140 | shuffle(candidates)
141 | kept, not_kept = self.under_test._select_images_to_delete(candidates)
142 | self._test_result_outcome(kept, not_kept, keep, dont_keep)
143 |
144 | def _test_result_outcome(self, kept, not_kept, keep: [{}], dont_keep: [{}]):
145 | for c in keep:
146 | self.assertIn(c, kept)
147 | for c in dont_keep:
148 | self.assertIn(c, not_kept)
149 |
150 | @staticmethod
151 | def _create_default_candidate(
152 | path: str = "C:/test", dist: float = 0.05, filesize: int = 100,
153 |             modification_date: int = 1, pixel_count: int = 10000, exif_tags: dict = None,
154 | score: int = 64
155 | ) -> {}:
156 | return {
157 | MetadataKey.PATH.value: path,
158 | MetadataKey.DISTANCE.value: dist,
159 | MetadataKey.METADATA.value: {
160 | MetadataKey.FILE_SIZE.value: filesize,
161 | MetadataKey.FILE_MODIFICATION_DATE.value: modification_date,
162 | MetadataKey.PIXELCOUNT.value: pixel_count,
163 |                 MetadataKey.EXIF_DATA.value: exif_tags if exif_tags is not None else {}
164 | },
165 | MetadataKey.SCORE.value: score
166 | }
167 |
168 |
169 | if __name__ == '__main__':
170 | unittest.main()
171 |
--------------------------------------------------------------------------------