├── requirements-linting-old.txt ├── requirements-conda.txt ├── MANIFEST.in ├── imagehash ├── py.typed └── __init__.py ├── tests ├── data │ ├── peppers.png │ └── imagehash.png ├── __init__.py ├── test_readme.py ├── test_dhash.py ├── test_phash.py ├── test_average_hash.py ├── test_old_hex_conversions.py ├── test_hex_conversions_multihash.py ├── test_colorhash.py ├── utils.py ├── test_whash.py ├── test_hex_conversions.py ├── test_crop_resistant_hash.py └── test_hash_is_constant.py ├── requirements-linting-anaconda.txt ├── requirements-linting.txt ├── .editorconfig ├── .gitignore ├── .bumpversion.cfg ├── .coveragerc ├── examples ├── github-urls.txt ├── run_art.sh ├── crop_resistance.py ├── crop_resistant_segmentation.py ├── run_icons.sh └── hashimages.py ├── setup.cfg ├── setup.py ├── LICENSE ├── find_similar_images.py ├── .github └── workflows │ └── testing.yml ├── Makefile ├── README.rst └── output.html /requirements-linting-old.txt: -------------------------------------------------------------------------------- 1 | autopep8 2 | flake8 3 | mccabe 4 | -------------------------------------------------------------------------------- /requirements-conda.txt: -------------------------------------------------------------------------------- 1 | pillow 2 | numpy 3 | scipy 4 | pywavelets 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include *.txt 3 | include LICENSE 4 | -------------------------------------------------------------------------------- /imagehash/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. The imagehash package uses inline types. 
2 | -------------------------------------------------------------------------------- /tests/data/peppers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JohannesBuchner/imagehash/HEAD/tests/data/peppers.png -------------------------------------------------------------------------------- /tests/data/imagehash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JohannesBuchner/imagehash/HEAD/tests/data/imagehash.png -------------------------------------------------------------------------------- /requirements-linting-anaconda.txt: -------------------------------------------------------------------------------- 1 | autopep8 2 | flake8 3 | isort 4 | mccabe 5 | pillow 6 | numpy 7 | scipy 8 | pywavelets 9 | -------------------------------------------------------------------------------- /requirements-linting.txt: -------------------------------------------------------------------------------- 1 | autopep8 2 | flake8 3 | flake8-bugbear 4 | flake8-isort 5 | flake8-simplify 6 | isort 7 | mccabe 8 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | indent_style = tab 3 | indent_size = 2 4 | [*.yml] 5 | indent_style = space 6 | [*.py] 7 | indent_size = 4 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | env 4 | *.jpg 5 | build 6 | dist 7 | ImageHash.egg-info/ 8 | .eggs 9 | .DS_Store 10 | .python-version 11 | .coverage 12 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, 
print_function 2 | 3 | from .utils import TestImageHash # noqa: F401 Testing the import 4 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 4.3.2 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | 8 | [bumpversion:file:imagehash/__init__.py] 9 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # 2 | # .coveragerc to control coverage.py 3 | # 4 | 5 | [run] 6 | branch = True 7 | include = 8 | imagehash/__init__.py 9 | find_similar_images.py 10 | 11 | 12 | [report] 13 | exclude_lines = 14 | pragma: no cover 15 | def __repr__ 16 | if __name__ == .__main__.: 17 | -------------------------------------------------------------------------------- /examples/github-urls.txt: -------------------------------------------------------------------------------- 1 | https://github.com/akveo/eva-icons 2 | https://github.com/CodeMouse92/VividityIcons 3 | https://github.com/franksouza183/Evolvere-Icons 4 | https://github.com/HackeSta/atom-icons 5 | https://github.com/icons8/welovesvg 6 | https://github.com/joaobborges/minimal-icons 7 | https://github.com/pluwen/awesome-iconjar 8 | https://github.com/simple-icons/simple-icons 9 | https://github.com/synthagency/icons-flat-osx 10 | https://github.com/twbs/icons 11 | -------------------------------------------------------------------------------- /tests/test_readme.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import six 4 | 5 | 6 | def test_run(): 7 | # test code in README.rst file 8 | # find any chunks after :: 9 | # which code lines, which start with >>> 10 | parent_dir = os.path.dirname(os.path.dirname(__file__)) 11 | with open(os.path.join(parent_dir, 
'README.rst')) as f: 12 | chunk = [line.replace('\t>>> ', '') for line in f if line.startswith('\t>>> ')] 13 | 14 | code = ''.join(chunk) 15 | print("running::\n" + code) 16 | print("result:", six.exec_(code, {}, {})) 17 | -------------------------------------------------------------------------------- /examples/run_art.sh: -------------------------------------------------------------------------------- 1 | 2 | for j in 2 3 4 5 6 7 8 9 10 11 3 | do 4 | echo "${j} ..." 5 | paste urls.txt hashes.txt | 6 | grep -v '0000000000000000 0000000000000000 0000000000000000' | 7 | awk '{ print $1,$'$((j+1))'}' > hashesfull.txt 8 | 9 | Cluster $k"; awk '($2 == "'"$k"'"){print $1}' hashesfull.txt | 12 | while read path; do echo ""; done 13 | } 14 | done > art${j}.html 15 | done 16 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [flake8] 5 | count = True 6 | statistics = True 7 | max-line-length = 127 8 | max-complexity = 10 9 | avoid-escape=True 10 | ; This ignore differs than autopep8's ignore as to not autofix tabs to spaces, but still warn when mixed 11 | ; variable "hash" is shadowing a python builtin 12 | ; tabs are prefered indentation; 13 | ; Bug with pycodestyle for Python 2.7 where it thinks everything is over-indented with tabs 14 | ignore= 15 | A001, 16 | W191,E111 17 | E117 18 | per-file-ignores= 19 | ; False positive with multiline strings https://github.com/PyCQA/pycodestyle/issues/376 20 | find_similar_images.py: E101 21 | -------------------------------------------------------------------------------- /tests/test_dhash.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import unittest 4 | 5 | import imagehash 6 | 7 | from .utils import TestImageHash 8 | 9 | 10 | class 
Test(TestImageHash): 11 | def setUp(self): 12 | self.image = self.get_data_image() 13 | self.func = imagehash.dhash 14 | 15 | def test_dhash(self): 16 | self.check_hash_algorithm(self.func, self.image) 17 | 18 | def test_dhash_length(self): 19 | self.check_hash_length(self.func, self.image) 20 | 21 | def test_dhash_stored(self): 22 | self.check_hash_stored(self.func, self.image) 23 | 24 | def test_dhash_size(self): 25 | self.check_hash_size(self.func, self.image) 26 | 27 | 28 | if __name__ == '__main__': 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /tests/test_phash.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import unittest 4 | 5 | import imagehash 6 | 7 | from .utils import TestImageHash 8 | 9 | 10 | class Test(TestImageHash): 11 | def setUp(self): 12 | self.image = self.get_data_image() 13 | self.func = imagehash.phash 14 | 15 | def test_phash(self): 16 | self.check_hash_algorithm(self.func, self.image) 17 | 18 | def test_phash_length(self): 19 | self.check_hash_length(self.func, self.image) 20 | 21 | def test_phash_stored(self): 22 | self.check_hash_stored(self.func, self.image) 23 | 24 | def test_phash_size(self): 25 | self.check_hash_size(self.func, self.image) 26 | 27 | 28 | if __name__ == '__main__': 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /tests/test_average_hash.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import unittest 4 | 5 | import imagehash 6 | 7 | from .utils import TestImageHash 8 | 9 | 10 | class Test(TestImageHash): 11 | def setUp(self): 12 | self.image = self.get_data_image() 13 | self.func = imagehash.average_hash 14 | 15 | def test_average_hash(self): 16 | 
self.check_hash_algorithm(self.func, self.image) 17 | 18 | def test_average_hash_length(self): 19 | self.check_hash_length(self.func, self.image) 20 | 21 | def test_average_hash_stored(self): 22 | self.check_hash_stored(self.func, self.image) 23 | 24 | def test_average_hash_size(self): 25 | self.check_hash_size(self.func, self.image) 26 | 27 | 28 | if __name__ == '__main__': 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /examples/crop_resistance.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | 3 | import imagehash 4 | 5 | SAVE_IMAGES = False 6 | 7 | # Load image 8 | full_image = Image.open('../tests/data/peppers.png') 9 | width, height = full_image.size 10 | # Hash it 11 | full_hash = imagehash.crop_resistant_hash(full_image) 12 | 13 | # Crop it 14 | for x in range(5, 50, 5): 15 | start = x / 100 16 | end = 1 - start 17 | crop_img = full_image.crop((start * width, start * height, end * width, end * height)) 18 | crop_hash = imagehash.crop_resistant_hash(crop_img) 19 | if SAVE_IMAGES: 20 | crop_img.save('crop_{}.png'.format(str(x).zfill(2))) 21 | crop_diff = full_hash.hash_diff(crop_hash) 22 | print( 23 | 'Cropped {}% from each side. 
Hash has {} matching segments with {} total hamming distance'.format( 24 | x, crop_diff[0], crop_diff[1] 25 | ) 26 | ) 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | try: 5 | from setuptools import setup 6 | except BaseException: 7 | from distutils.core import setup 8 | 9 | long_description = '' 10 | with open('README.rst') as f: 11 | long_description = f.read() 12 | 13 | setup( 14 | name='ImageHash', 15 | version='4.3.2', 16 | author='Johannes Buchner', 17 | author_email='buchner.johannes@gmx.at', 18 | packages=['imagehash'], 19 | package_data={'imagehash': ['py.typed']}, 20 | data_files=[('images', ['tests/data/imagehash.png'])], 21 | scripts=['find_similar_images.py'], 22 | url='https://github.com/JohannesBuchner/imagehash', 23 | license='2-clause BSD License', 24 | description='Image Hashing library', 25 | long_description=long_description, 26 | long_description_content_type='text/x-rst', 27 | install_requires=[ 28 | 'numpy', 29 | 'scipy', # for phash 30 | 'pillow', # or PIL 31 | 'PyWavelets', # for whash 32 | ], 33 | test_suite='tests', 34 | tests_require=['pytest>=3'], 35 | ) 36 | -------------------------------------------------------------------------------- /tests/test_old_hex_conversions.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | import imagehash 6 | 7 | # Each row is a test case where the first value is a hexadecimal 8 | # sequence and the second value is the expected bool array for it. 
9 | old_hexadecimal_to_bool_array = [ 10 | ['ffeb89818193ffff', np.array([ 11 | [True, True, True, True, True, True, True, True], 12 | [True, True, False, True, False, True, True, True], 13 | [True, False, False, True, False, False, False, True], 14 | [True, False, False, False, False, False, False, True], 15 | [True, False, False, False, False, False, False, True], 16 | [True, True, False, False, True, False, False, True], 17 | [True, True, True, True, True, True, True, True], 18 | [True, True, True, True, True, True, True, True]])], 19 | ] 20 | 21 | 22 | class TestOldHexConversions(unittest.TestCase): 23 | 24 | def setUp(self): 25 | self.from_hex = imagehash.old_hex_to_hash 26 | 27 | def test_hex_to_hash_output(self): 28 | for case in old_hexadecimal_to_bool_array: 29 | self.assertTrue(np.array_equal(case[1], self.from_hex(case[0]).hash)) 30 | 31 | 32 | if __name__ == '__main__': 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /examples/crop_resistant_segmentation.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from PIL import Image, ImageFilter 3 | 4 | import imagehash 5 | 6 | IMAGE_FILE = '../tests/data/peppers.png' 7 | IMG_SIZE = 300 8 | SEGMENT_THRESHOLD = 128 9 | MIN_SEGMENT_SIZE = 500 10 | RAINBOW = [ 11 | (141, 211, 199), 12 | (255, 255, 179), 13 | (190, 186, 218), 14 | (251, 128, 114), 15 | (128, 177, 211), 16 | (253, 180, 98), 17 | (179, 222, 105), 18 | (252, 205, 229), 19 | (217, 217, 217), 20 | (188, 128, 189) 21 | ] 22 | 23 | # Load image 24 | full_image = Image.open(IMAGE_FILE) 25 | width, height = full_image.size 26 | # Image pre-processing 27 | image = full_image.convert('L').resize((IMG_SIZE, IMG_SIZE), Image.ANTIALIAS) 28 | # Add filters 29 | image = image.filter(ImageFilter.GaussianBlur()).filter(ImageFilter.MedianFilter()) 30 | pixels = numpy.array(image).astype(numpy.float32) 31 | # Split segments 32 | segments = 
imagehash._find_all_segments(pixels, SEGMENT_THRESHOLD, MIN_SEGMENT_SIZE) 33 | # Change back to RGB 34 | image = image.convert('RGB') 35 | # Colour in segments 36 | for num, segment in enumerate(segments): 37 | for x, y in segment: 38 | image.putpixel((y, x), RAINBOW[num % len(RAINBOW)]) 39 | image.show() 40 | -------------------------------------------------------------------------------- /tests/test_hex_conversions_multihash.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import unittest 4 | 5 | import imagehash 6 | 7 | from .utils import TestImageHash 8 | 9 | 10 | class Test (TestImageHash): 11 | def setUp(self): 12 | self.image = self.get_data_image() 13 | 14 | def test_hex_to_multi_hash(self): 15 | generated_hash = imagehash.crop_resistant_hash( 16 | self.image, 17 | # these arguments are required for a true multi image hash with multiple segments 18 | min_segment_size=500, segmentation_image_size=1000 19 | ) 20 | string = str(generated_hash) 21 | emsg = ('Stringified multihash did not match original hash') 22 | self.assertEqual( 23 | generated_hash, 24 | imagehash.hex_to_multihash(string), 25 | emsg 26 | ) 27 | string = '0026273b2b19550e,6286373334662535,6636192c47639573,999d6d67a3e82125,27a327c38191a4ad,938971382b328a46' 28 | emsg = ('Stringified multihash did not match hardcoded original hash') 29 | self.assertEqual( 30 | generated_hash, 31 | imagehash.hex_to_multihash(string), 32 | emsg 33 | ) 34 | 35 | 36 | if __name__ == '__main__': 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013-2022, Johannes Buchner 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 10 | 11 | -------------------------------------------------------------------------------- /examples/run_icons.sh: -------------------------------------------------------------------------------- 1 | cat github-urls.txt | while read url; do git clone $url; done 2 | 3 | find -name '*.svg' | while read path; do convert $path ${path/.svg/.png}; done 4 | 5 | echo "collecting files ..." 6 | for i in */; do pushd $i >/dev/null; prefix=$(git remote get-url origin|sed 's,https://github.com/,https://raw.githubusercontent.com/,g'); find */ -name '*.svg' | while read path; do test -e "${path/.svg/.png}" && echo $prefix/master/$path $prefix; done; popd >/dev/null; done | 7 | grep -Ev '\.min\.' > urls.txt 8 | echo "hashing ..." 
9 | for i in */; do pushd $i >/dev/null; prefix=$(git remote get-url origin|sed 's,https://github.com/,https://raw.githubusercontent.com/,g'); find */ -name '*.svg' | while read path; do test -e "${path/.svg/.png}" && echo $i/${path/.svg/.png}; done; popd >/dev/null; done| 10 | grep -vE '\.min\.' | xargs python3 ~/Downloads/imagehash/hashimage.py > hashes.txt 11 | 12 | for j in 2 3 4 5 6 7 8 9 10 11 12 13 | do 14 | echo "${j} ..." 15 | paste urls.txt hashes.txt | grep -v '0000000000000000 0000000000000000 0000000000000000' | 16 | awk '{print $1,$2,$'$((j+2))'}' > urlhashes.txt 17 | sort -k3,3 urlhashes.txt | uniq -f 2 -D | awk '{print $3}' | uniq | 18 | while read k; do 19 | { echo "

Cluster $k

"; awk '($3 == "'$k'"){print " "}' urlhashes.txt; } 20 | done > index${j}.html 21 | done 22 | -------------------------------------------------------------------------------- /examples/hashimages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import, division, print_function 3 | 4 | import sys 5 | 6 | import numpy as np 7 | from PIL import Image 8 | 9 | import imagehash 10 | 11 | hashfuncs = [ 12 | ('ahash', imagehash.average_hash), 13 | ('phash', imagehash.phash), 14 | ('dhash', imagehash.dhash), 15 | ('whash-haar', imagehash.whash), 16 | ('whash-db4', lambda img: imagehash.whash(img, mode='db4')), 17 | ('colorhash', imagehash.colorhash), 18 | ] 19 | 20 | 21 | def alpharemover(image): 22 | if image.mode != 'RGBA': 23 | return image 24 | canvas = Image.new('RGBA', image.size, (255, 255, 255, 255)) 25 | canvas.paste(image, mask=image) 26 | return canvas.convert('RGB') 27 | 28 | 29 | def image_loader(hashfunc, hash_size=8): 30 | def function(path): 31 | image = alpharemover(Image.open(path)) 32 | return hashfunc(image) 33 | return function 34 | 35 | 36 | def with_ztransform_preprocess(hashfunc, hash_size=8): 37 | def function(path): 38 | image = alpharemover(Image.open(path)) 39 | image = image.convert('L').resize((hash_size, hash_size), Image.ANTIALIAS) 40 | data = image.getdata() 41 | quantiles = np.arange(100) 42 | quantiles_values = np.percentile(data, quantiles) 43 | zdata = (np.interp(data, quantiles_values, quantiles) / 100 * 255).astype(np.uint8) 44 | image.putdata(zdata) 45 | return hashfunc(image) 46 | return function 47 | 48 | 49 | hashfuncopeners = [(name, image_loader(func)) for name, func in hashfuncs] 50 | hashfuncopeners += [(name + '-z', with_ztransform_preprocess(func)) for name, func in hashfuncs if name != 'colorhash'] 51 | 52 | files = sys.argv[1:] 53 | for path in files: 54 | hashes = [str(hashfuncopener(path)) for name, hashfuncopener in 
hashfuncopeners] 55 | print(path, ' '.join(hashes)) 56 | # print(path, colorhash(path)) 57 | -------------------------------------------------------------------------------- /tests/test_colorhash.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import unittest 4 | 5 | import imagehash 6 | 7 | from .utils import TestImageHash 8 | 9 | CHECK_HASH_DEFAULT = range(2, 5) 10 | CHECK_HASH_SIZE_DEFAULT = range(-1, 1) 11 | 12 | 13 | class Test(TestImageHash): 14 | def setUp(self): 15 | self.image = self.get_data_image() 16 | self.func = imagehash.colorhash 17 | 18 | def test_colorhash(self): 19 | self.check_hash_algorithm(self.func, self.image) 20 | 21 | def check_hash_algorithm(self, func, image): 22 | original_hash = func(image) 23 | rotate_image = image.rotate(-1) 24 | rotate_hash = func(rotate_image) 25 | distance = original_hash - rotate_hash 26 | emsg = ('slightly rotated image should have similar hash {} {} {}'.format(original_hash, rotate_hash, distance)) 27 | self.assertTrue(distance <= 10, emsg) 28 | self.assertEqual(original_hash, rotate_hash, emsg) 29 | rotate_image = image.rotate(180) 30 | rotate_hash = func(rotate_image) 31 | emsg = ('flipped image should have same hash {} {}'.format(original_hash, rotate_hash)) 32 | self.assertEqual(original_hash, rotate_hash, emsg) 33 | 34 | def test_colorhash_stored(self): 35 | self.check_hash_stored(self.func, self.image) 36 | 37 | def test_colorhash_length(self): 38 | self.check_hash_length(self.func, self.image) 39 | 40 | def test_colorhash_size(self): 41 | self.check_hash_size(self.func, self.image) 42 | 43 | def check_hash_stored(self, func, image, binbits=CHECK_HASH_DEFAULT): 44 | for bit in binbits: 45 | image_hash = func(image, bit) 46 | other_hash = imagehash.hex_to_flathash(str(image_hash), bit * (2 + 6 * 2)) 47 | emsg = 'stringified hash {} != original hash {}'.format(other_hash, image_hash) 48 | 
self.assertEqual(image_hash, other_hash, emsg) 49 | distance = image_hash - other_hash 50 | emsg = ('unexpected hamming distance {}: original hash {} - stringified hash {}'.format(distance, image_hash, other_hash)) 51 | self.assertEqual(distance, 0, emsg) 52 | 53 | def check_hash_length(self, func, image, binbits=CHECK_HASH_DEFAULT): 54 | for bit in binbits: 55 | image_hash = func(image, bit) 56 | emsg = 'bit={} is not respected'.format(bit) 57 | self.assertEqual(image_hash.hash.size, (2 + 6 * 2) * bit, emsg) 58 | 59 | def check_hash_size(self, func, image, binbits=CHECK_HASH_SIZE_DEFAULT): 60 | for bit in binbits: 61 | with self.assertRaises(ValueError): 62 | func(image, bit) 63 | 64 | 65 | if __name__ == '__main__': 66 | unittest.main() 67 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import os.path 5 | import unittest 6 | 7 | from PIL import Image 8 | 9 | import imagehash 10 | 11 | CHECK_HASH_DEFAULT = range(2, 21) 12 | CHECK_HASH_SIZE_DEFAULT = range(-1, 2) 13 | 14 | 15 | class TestImageHash(unittest.TestCase): 16 | @staticmethod 17 | def get_data_image(fname=None): 18 | if fname is None: 19 | fname = 'imagehash.png' 20 | dname = os.path.abspath(os.path.dirname(__file__)) 21 | target = os.path.join(dname, 'data', fname) 22 | if not os.path.isfile(target): 23 | emsg = 'Unknown test image file: {!r}' 24 | raise ValueError(emsg.format(target)) 25 | return Image.open(target) 26 | 27 | def check_hash_algorithm(self, func, image): 28 | original_hash = func(image) 29 | rotate_image = image.rotate(-1) 30 | rotate_hash = func(rotate_image) 31 | distance = original_hash - rotate_hash 32 | emsg = ('slightly rotated image should have similar hash {} {} {}'.format(original_hash, rotate_hash, distance)) 33 | self.assertTrue(distance <= 10, emsg) 34 | 
rotate_image = image.rotate(-90) 35 | rotate_hash = func(rotate_image) 36 | emsg = ('rotated image should have different hash {} {}'.format(original_hash, rotate_hash)) 37 | self.assertNotEqual(original_hash, rotate_hash, emsg) 38 | distance = original_hash - rotate_hash 39 | emsg = ('rotated image should have larger different hash {} {} {}'.format(original_hash, rotate_hash, distance)) 40 | self.assertTrue(distance > 10, emsg) 41 | 42 | def check_hash_length(self, func, image, sizes=CHECK_HASH_DEFAULT): 43 | for hash_size in sizes: 44 | image_hash = func(image, hash_size=hash_size) 45 | emsg = 'hash_size={} is not respected'.format(hash_size) 46 | self.assertEqual(image_hash.hash.size, hash_size**2, emsg) 47 | 48 | def check_hash_stored(self, func, image, sizes=CHECK_HASH_DEFAULT): 49 | for hash_size in sizes: 50 | image_hash = func(image, hash_size) 51 | other_hash = imagehash.hex_to_hash(str(image_hash)) 52 | emsg = 'stringified hash {} != original hash {}'.format(other_hash, image_hash) 53 | self.assertEqual(image_hash, other_hash, emsg) 54 | distance = image_hash - other_hash 55 | emsg = ('unexpected hamming distance {}: original hash {} - stringified hash {}'.format(distance, image_hash, other_hash)) 56 | self.assertEqual(distance, 0, emsg) 57 | 58 | def check_hash_size(self, func, image, sizes=CHECK_HASH_SIZE_DEFAULT): 59 | for hash_size in sizes: 60 | with self.assertRaises(ValueError): 61 | func(image, hash_size) 62 | -------------------------------------------------------------------------------- /find_similar_images.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import, division, print_function 3 | 4 | from PIL import Image 5 | 6 | import imagehash 7 | 8 | """ 9 | Demo of hashing 10 | """ 11 | 12 | 13 | def find_similar_images(userpaths, hashfunc=imagehash.average_hash): 14 | def is_image(filename): 15 | f = filename.lower() 16 | return f.endswith('.png') 
or f.endswith('.jpg') or \ 17 | f.endswith('.jpeg') or f.endswith('.bmp') or \ 18 | f.endswith('.gif') or '.jpg' in f or f.endswith('.svg') 19 | 20 | image_filenames = [] 21 | for userpath in userpaths: 22 | image_filenames += [os.path.join(userpath, path) for path in os.listdir(userpath) if is_image(path)] 23 | images = {} 24 | for img in sorted(image_filenames): 25 | try: 26 | hash = hashfunc(Image.open(img)) 27 | except Exception as e: 28 | print('Problem:', e, 'with', img) 29 | continue 30 | if hash in images: 31 | print(img, ' already exists as', ' '.join(images[hash])) 32 | if 'dupPictures' in img: 33 | print('rm -v', img) 34 | images[hash] = images.get(hash, []) + [img] 35 | 36 | # for k, img_list in six.iteritems(images): 37 | # if len(img_list) > 1: 38 | # print(" ".join(img_list)) 39 | 40 | 41 | if __name__ == '__main__': # noqa: C901 42 | import os 43 | import sys 44 | 45 | def usage(): 46 | sys.stderr.write("""SYNOPSIS: %s [ahash|phash|dhash|...] [] 47 | 48 | Identifies similar images in the directory. 
49 | 50 | Method: 51 | ahash: Average hash 52 | phash: Perceptual hash 53 | dhash: Difference hash 54 | whash-haar: Haar wavelet hash 55 | whash-db4: Daubechies wavelet hash 56 | colorhash: HSV color hash 57 | crop-resistant: Crop-resistant hash 58 | 59 | (C) Johannes Buchner, 2013-2017 60 | """ % sys.argv[0]) 61 | sys.exit(1) 62 | 63 | hashmethod = sys.argv[1] if len(sys.argv) > 1 else usage() 64 | if hashmethod == 'ahash': 65 | hashfunc = imagehash.average_hash 66 | elif hashmethod == 'phash': 67 | hashfunc = imagehash.phash 68 | elif hashmethod == 'dhash': 69 | hashfunc = imagehash.dhash 70 | elif hashmethod == 'whash-haar': 71 | hashfunc = imagehash.whash 72 | elif hashmethod == 'whash-db4': 73 | def hashfunc(img): 74 | return imagehash.whash(img, mode='db4') 75 | elif hashmethod == 'colorhash': 76 | hashfunc = imagehash.colorhash 77 | elif hashmethod == 'crop-resistant': 78 | hashfunc = imagehash.crop_resistant_hash 79 | else: 80 | usage() 81 | userpaths = sys.argv[2:] if len(sys.argv) > 2 else '.' 
82 | find_similar_images(userpaths=userpaths, hashfunc=hashfunc) 83 | -------------------------------------------------------------------------------- /tests/test_whash.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import unittest 4 | 5 | import six 6 | from PIL import Image 7 | 8 | import imagehash 9 | 10 | from .utils import TestImageHash 11 | 12 | 13 | class TestBasic(TestImageHash): 14 | 15 | def setUp(self): 16 | self.image = self.get_data_image() 17 | self.func = imagehash.whash 18 | 19 | def test_whash(self): 20 | self.check_hash_algorithm(self.func, self.image) 21 | 22 | def test_whash_length(self): 23 | self.check_hash_length(self.func, self.image, sizes=[2, 4, 8, 16, 32, 64]) 24 | 25 | def test_whash_stored(self): 26 | self.check_hash_stored(self.func, self.image, sizes=[2, 4, 8, 16, 32, 64]) 27 | 28 | 29 | class Test(unittest.TestCase): 30 | def setUp(self): 31 | self.image = self._get_white_image() 32 | 33 | def _get_white_image(self, size=None): 34 | if size is None: 35 | size = (512, 512) 36 | return Image.new('RGB', size, 'white') 37 | 38 | def test_hash_size_2power(self): 39 | for hash_size in [4, 8, 16]: 40 | hash = imagehash.whash(self.image, hash_size=hash_size) 41 | self.assertEqual(hash.hash.size, hash_size**2) 42 | 43 | def test_hash_size_for_small_images(self): 44 | default_hash_size = 8 45 | for image_size in [(1, 25), (7, 5)]: 46 | image = self._get_white_image(image_size) 47 | hash = imagehash.whash(image) 48 | self.assertEqual(hash.hash.size, default_hash_size**2) 49 | 50 | def test_hash_size_not_2power(self): 51 | emsg = 'hash_size is not power of 2' 52 | for hash_size in [3, 7, 12]: 53 | with six.assertRaisesRegex(self, AssertionError, emsg): 54 | imagehash.whash(self.image, hash_size=hash_size) 55 | 56 | def test_hash_size_is_less_than_image_scale(self): 57 | image = self._get_white_image((120, 200)) 58 | emsg = 
'hash_size in a wrong range' 59 | for hash_size in [128, 512]: 60 | with six.assertRaisesRegex(self, AssertionError, emsg): 61 | imagehash.whash(image, hash_size=hash_size, image_scale=64) 62 | 63 | def test_custom_hash_size_and_scale(self): 64 | hash_size = 16 65 | hash = imagehash.whash(self.image, hash_size=hash_size, image_scale=64) 66 | self.assertEqual(hash.hash.size, hash_size**2) 67 | 68 | def test_hash_size_more_than_scale(self): 69 | emsg = 'hash_size in a wrong range' 70 | with six.assertRaisesRegex(self, AssertionError, emsg): 71 | imagehash.whash(self.image, hash_size=32, image_scale=16) 72 | 73 | def test_image_scale_not_2power(self): 74 | emsg = 'image_scale is not power of 2' 75 | for image_scale in [4, 8, 16]: 76 | with six.assertRaisesRegex(self, AssertionError, emsg): 77 | imagehash.whash(self.image, image_scale=image_scale + 1) 78 | 79 | 80 | if __name__ == '__main__': 81 | unittest.main() 82 | -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: '42 4 5,20 * *' 8 | env: 9 | CACHE_NUMBER: 0 # increase to reset cache manually 10 | jobs: 11 | run-tests: 12 | name: Run tests 13 | runs-on: ubuntu-latest 14 | defaults: 15 | run: 16 | shell: bash -l {0} 17 | strategy: 18 | matrix: 19 | python-version: ['3.9.1', '3.11'] 20 | fail-fast: false 21 | steps: 22 | - uses: actions/checkout@v2 23 | - uses: conda-incubator/setup-miniconda@v2 24 | with: 25 | auto-update-conda: true 26 | python-version: ${{ matrix.python-version }} 27 | - name: Set cache date 28 | run: echo "DATE=$(date +'%Y%m')" >> $GITHUB_ENV 29 | - name: Conda download cache 30 | id: myconda-download-cache 31 | uses: actions/cache@v4 32 | with: 33 | path: /usr/share/miniconda/pkgs/ 34 | key: ${{ matrix.python-version }}-conda-${{ env.DATE }}-${{ env.CACHE_NUMBER }} 35 | - 
name: Install imagemagick
        run: |
          # fixed: 'sudo' was duplicated and '-y' was passed twice
          sudo apt-get update && sudo apt-get install -y --no-install-recommends imagemagick libstdc++6 || true
      # conda does not support environment markers
      - name: fix libstdc++ for scipy install
        run: ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/share/miniconda/envs/test/lib/libstdc++.so.6
      - name: Install testing dependencies
        run: |
          conda install -c conda-forge --file requirements-conda.txt --file requirements-linting-old.txt six packaging pytest coveralls coverage libstdcxx-ng toml
      - name: Conda info
        run: |
          conda info
          conda list
      - name: Lint with flake8
        # stop the build if there are Python syntax errors or undefined names
        run: flake8 imagehash/ --show-source
      - name: Check typing with mypy
        run: mypy imagehash tests/*.py --follow-imports=silent --ignore-missing-imports || true
      - name: Test install from setup.py
        run: pip install .
      - run: coverage run -m pytest .
56 | - name: Convert coverage output to lcov for coveralls 57 | run: | 58 | coverage lcov -o lcov.info 59 | # make paths relative 60 | sed -i s,$PWD/,,g lcov.info 61 | - name: prepare coveralls partial upload 62 | uses: coverallsapp/github-action@master 63 | with: 64 | github-token: ${{ secrets.github_token }} 65 | path-to-lcov: lcov.info 66 | flag-name: run-${{ matrix.python-version }} 67 | parallel: true 68 | 69 | 70 | finish: 71 | needs: run-tests 72 | runs-on: ubuntu-latest 73 | steps: 74 | - name: Coveralls Finished 75 | uses: coverallsapp/github-action@master 76 | with: 77 | github-token: ${{ secrets.github_token }} 78 | parallel-finished: true 79 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | 4 | define BROWSER_PYSCRIPT 5 | import os, webbrowser, sys 6 | 7 | try: 8 | from urllib import pathname2url 9 | except: 10 | from urllib.request import pathname2url 11 | 12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 13 | endef 14 | export BROWSER_PYSCRIPT 15 | 16 | define PRINT_HELP_PYSCRIPT 17 | import re, sys 18 | 19 | for line in sys.stdin: 20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 21 | if match: 22 | target, help = match.groups() 23 | print("%-20s %s" % (target, help)) 24 | endef 25 | export PRINT_HELP_PYSCRIPT 26 | 27 | PYTHON := python3 28 | 29 | BROWSER := $(PYTHON) -c "$$BROWSER_PYSCRIPT" 30 | 31 | help: 32 | @$(PYTHON) -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 33 | 34 | clean: clean-build clean-pyc clean-test clean-doc ## remove all build, test, coverage and Python artifacts 35 | 36 | clean-build: ## remove build artifacts 37 | rm -fr build/ 38 | rm -fr dist/ 39 | rm -fr .eggs/ 40 | find . -name '*.egg-info' -exec rm -fr {} + 41 | find . 
-name '*.egg' -exec rm -f {} +

clean-pyc: ## remove Python file artifacts
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +
	find . -name '__pycache__' -exec rm -fr {} +
	find . -name '*.so' -exec rm -f {} +
	find . -name '*.c' -exec rm -f {} +

clean-test: ## remove test and coverage artifacts
	rm -fr .tox/
	rm -f .coverage
	rm -fr htmlcov/
	rm -fr .pytest_cache

clean-doc:
	rm -rf docs/build

lint: ## check style with flake8
	flake8 .

lint-fix: ## fix style with autopep8 and isort; ignores to not autofix tabs to spaces, but still warn when mixed
	autopep8 . --in-place --aggressive --aggressive --aggressive --recursive --ignore=W191,E101,E111,E122
	isort .

test: ## run tests quickly with the default Python
	pytest

test-all: ## run tests on every Python version with tox
	tox

coverage: ## check code coverage quickly with the default Python
	# fixed: measured package was 'ultranest', a copy-paste leftover from
	# another project; this repository's package is 'imagehash'
	coverage run --source imagehash -m pytest
	coverage report -m
	coverage html
	$(BROWSER) htmlcov/index.html

docs: ## generate Sphinx HTML documentation, including API docs
	# fixed: module names were 'ultranest' (copy-paste leftover); the
	# ultranest-specific sed patch of mlfriends.html was dropped as well
	rm -f docs/imagehash.rst
	rm -f docs/modules.rst
	#nbstripout docs/*.ipynb
	sphinx-apidoc -H API -o docs/ imagehash
	$(MAKE) -C docs clean
	$(MAKE) -C docs html
	$(BROWSER) docs/build/html/index.html

servedocs: docs ## compile the docs watching for changes
	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
91 | 92 | release: dist ## package and upload a release 93 | twine upload -s dist/*.tar.gz dist/*.whl 94 | 95 | dist: clean ## builds source and wheel package 96 | $(PYTHON) setup.py sdist 97 | $(PYTHON) setup.py bdist_wheel 98 | ls -l dist 99 | 100 | install: clean ## install the package to the active Python's site-packages 101 | $(PYTHON) setup.py install 102 | -------------------------------------------------------------------------------- /tests/test_hex_conversions.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | import imagehash 6 | 7 | # Each row is a test case where the first value is a bit sequence and 8 | # the second value is the expected hexadecimal representation for it. 9 | binary_to_hexadecimal_values = [ 10 | ['1', '1'], 11 | ['11', '3'], 12 | ['111', '7'], 13 | ['1111', 'f'], 14 | ['10000', '10'], 15 | ['110000', '30'], 16 | ['1110000', '70'], 17 | ['11110000', 'f0'], 18 | ['00001', '01'], 19 | ['000011', '03'], 20 | ['0000111', '07'], 21 | ['00001111', '0f'], 22 | ['10000001', '81'], 23 | ['00000000000000001', '00001'], 24 | ['000000000000000011', '00003'], 25 | ['0000000000000000111', '00007'], 26 | ['00000000000000001111', '0000f'], 27 | ['11110000111100001111', 'f0f0f'], 28 | ['00001111000011110000', '0f0f0'], 29 | ['11110000000100100011010001010110011110001001101010111100110111101111', 'f0123456789abcdef'], 30 | ['1001111000111100110000011111000011110000110000111110011111000000', '9e3cc1f0f0c3e7c0'], 31 | ['1000111100001111000011110000111100001111000010110000101101111010', '8f0f0f0f0f0b0b7a'], 32 | ] 33 | 34 | # Each row is a test case where the first value is a hexadecimal sequence 35 | # and the second value is the expected binary representation for it. 
36 | hexadecimal_to_binary_values = [ 37 | ['1', '0001'], 38 | ['2', '0010'], 39 | ['3', '0011'], 40 | ['a', '1010'], 41 | ['f', '1111'], 42 | ['101', '100000001'], 43 | ['1b1', '110110001'], 44 | ['0b1', '010110001'], 45 | ['f0f0', '1111000011110000'], 46 | ['0f0f', '0000111100001111'], 47 | ['000c', '0000000000001100'], 48 | ['100000d', '1000000000000000000001101'], 49 | ['000000d', '0000000000000000000001101'], 50 | ['000000001', '000000000000000000000000000000000001'], 51 | ['800000001', '100000000000000000000000000000000001'], 52 | ['0000000000001', '0000000000000000000000000000000000000000000000001'], 53 | ['1000000000001', '1000000000000000000000000000000000000000000000001'], 54 | ['0000000000000001', '0000000000000000000000000000000000000000000000000000000000000001'], 55 | ['8000000000000001', '1000000000000000000000000000000000000000000000000000000000000001'], 56 | ] 57 | 58 | # Each row is a test case where the first value is a hexadecimal 59 | # sequence and the second value is the expected bool array for it. 
class TestHexConversions(unittest.TestCase):
	"""Round-trip checks between bit arrays and their hexadecimal encoding."""

	def setUp(self):
		# Shorthands for the encoder (private helper) and the decoder under test.
		self.to_hex = imagehash._binary_array_to_hex
		self.from_hex = imagehash.hex_to_hash

	def test_binary_array_to_hex_input(self):
		# Encoding each bool matrix must reproduce its expected hex string.
		for hex_repr, bool_array in hexadecimal_to_bool_array:
			self.assertEqual(hex_repr, self.to_hex(bool_array))

	def test_hex_to_hash_output(self):
		# Decoding each hex string must reproduce the original bool matrix.
		for hex_repr, bool_array in hexadecimal_to_bool_array:
			self.assertTrue(np.array_equal(bool_array, self.from_hex(hex_repr).hash))

	def test_conversion_to_hex(self):
		# Bit strings of various lengths encode to the expected hex digits.
		for bits, expected_hex in binary_to_hexadecimal_values:
			bit_array = np.array([int(digit) for digit in bits])
			self.assertEqual(expected_hex, self.to_hex(bit_array))

	def test_conversion_from_hex(self):
		# Hex strings of various lengths decode to the expected bit strings.
		for hex_repr, expected_bits in hexadecimal_to_binary_values:
			decoded_bits = 1 * self.from_hex(hex_repr).hash.flatten()
			self.assertEqual(expected_bits, ''.join(str(bit) for bit in decoded_bits))
| 11 | class Test(TestImageHash): 12 | def setUp(self): 13 | self.image = self.get_data_image() 14 | self.peppers = self.get_data_image('peppers.png') 15 | 16 | def test_segmented_hash(self): 17 | original_hash = imagehash.crop_resistant_hash(self.image) 18 | rotate_image = self.image.rotate(-1) 19 | small_rotate_hash = imagehash.crop_resistant_hash(rotate_image) 20 | emsg = ('slightly rotated image should have similar hash {} {}'.format(original_hash, small_rotate_hash)) 21 | self.assertTrue(original_hash.matches(small_rotate_hash), emsg) 22 | rotate_image = self.image.rotate(-90) 23 | large_rotate_hash = imagehash.crop_resistant_hash(rotate_image) 24 | emsg = ('rotated image should have different hash {} {}'.format(original_hash, large_rotate_hash)) 25 | self.assertFalse(original_hash.matches(large_rotate_hash), emsg) 26 | 27 | other_hashes = [small_rotate_hash, large_rotate_hash] 28 | self.assertEqual( 29 | original_hash.best_match(other_hashes), 30 | small_rotate_hash, 31 | 'Hash of the slightly rotated image should be a better match than for the more heavily rotated image.' 
32 | ) 33 | 34 | def test_segmented_hash__hash_func(self): 35 | segmented_ahash = imagehash.crop_resistant_hash(self.image, imagehash.average_hash) 36 | segmented_dhash = imagehash.crop_resistant_hash(self.image, imagehash.dhash) 37 | self.assertFalse( 38 | segmented_ahash.matches(segmented_dhash), 39 | 'Segmented hash should not match when the underlying hashing method is not the same' 40 | ) 41 | 42 | def test_segmented_hash__limit_segments(self): 43 | segmented_orig = imagehash.crop_resistant_hash(self.image) 44 | segmented_limit = imagehash.crop_resistant_hash(self.image, limit_segments=1) 45 | self.assertGreaterEqual( 46 | len(segmented_orig.segment_hashes), len(segmented_limit.segment_hashes), 47 | 'Limit segments should mean there are fewer segments' 48 | ) 49 | self.assertEqual( 50 | len(segmented_limit.segment_hashes), 1, 51 | 'Limit segments should correctly limit the segment count' 52 | ) 53 | 54 | def test_segmented_hash__segment_threshold(self): 55 | segmented_low_threshold = imagehash.crop_resistant_hash(self.image, segment_threshold=20) 56 | segmented_high_threshold = imagehash.crop_resistant_hash(self.image, segment_threshold=250) 57 | self.assertFalse( 58 | segmented_low_threshold.matches(segmented_high_threshold, region_cutoff=3), 59 | 'Segmented hash should not match when segment threshold is changed' 60 | ) 61 | 62 | def test_segmentation_image_size(self): 63 | start_time = datetime.now() 64 | imagehash.crop_resistant_hash(self.image, segmentation_image_size=200) 65 | small_timed = datetime.now() - start_time 66 | 67 | start_time = datetime.now() 68 | imagehash.crop_resistant_hash(self.image, segmentation_image_size=400) 69 | large_timed = datetime.now() - start_time 70 | 71 | self.assertGreater(large_timed, small_timed, 'Hashing should take longer when the segmentation image is larger') 72 | 73 | def test_min_segment_size(self): 74 | small_segments_hash = imagehash.crop_resistant_hash(self.peppers, min_segment_size=100) 75 | big_segments_hash = 
imagehash.crop_resistant_hash(self.peppers, min_segment_size=1000) 76 | 77 | self.assertGreater( 78 | len(small_segments_hash.segment_hashes), 79 | len(big_segments_hash.segment_hashes), 80 | 'Small segment size limit should lead to larger number of segments detected.' 81 | ) 82 | self.assertEqual( 83 | small_segments_hash, 84 | big_segments_hash, 85 | 'Hashes should still match, as large segments are present in both' 86 | ) 87 | 88 | def test_crop_resistance(self): 89 | full_image = self.peppers 90 | width, height = full_image.size 91 | crop_10 = full_image.crop((0.05 * width, 0.05 * height, 0.95 * width, 0.95 * height)) 92 | crop_40 = full_image.crop((0.2 * width, 0.2 * height, 0.8 * width, 0.8 * height)) 93 | crop_asymmetric = full_image.crop((0, 0.3 * height, 0.4 * width, 0.75 * height)) 94 | 95 | full_hash = imagehash.crop_resistant_hash(full_image, min_segment_size=200) 96 | crop_hash_10 = imagehash.crop_resistant_hash(crop_10) 97 | crop_hash_40 = imagehash.crop_resistant_hash(crop_40) 98 | crop_hash_asymmetric = imagehash.crop_resistant_hash(crop_asymmetric) 99 | 100 | self.assertEqual(crop_hash_10, full_hash, 'Slightly cropped image hash should match full image hash') 101 | self.assertEqual(crop_hash_40, full_hash, 'Heavily cropped image hash should match full image hash') 102 | self.assertEqual( 103 | crop_hash_asymmetric, full_hash, 'Asymmetrically cropped image hash should match full image hash' 104 | ) 105 | 106 | 107 | if __name__ == '__main__': 108 | unittest.main() 109 | -------------------------------------------------------------------------------- /tests/test_hash_is_constant.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | import numpy 4 | import PIL 5 | from packaging import version 6 | from PIL import ImageFilter 7 | 8 | import imagehash 9 | from tests import TestImageHash 10 | 11 | 12 | def _calculate_segment_properties(segment): 13 | length = len(segment) 14 | min_y = min(coord[0] 
for coord in segment) 15 | min_x = min(coord[1] for coord in segment) 16 | max_y = max(coord[0] for coord in segment) 17 | max_x = max(coord[1] for coord in segment) 18 | return { 19 | 'length': length, 20 | 'min_x': min_x, 21 | 'min_y': min_y, 22 | 'max_x': max_x, 23 | 'max_y': max_y 24 | } 25 | 26 | 27 | def _pillow_has_convert_fix(): 28 | """ 29 | Pillow version 7.0.0 introduced a fix for a rounding error in Image.convert("L") which means that segmentation is 30 | slightly different after this release. 31 | The PR which fixed the rounding and caused this inconsistency is https://github.com/python-pillow/Pillow/pull/4320 32 | """ 33 | return version.parse(PIL.__version__) >= version.parse('7.0.0') 34 | 35 | 36 | class Test(TestImageHash): 37 | def setUp(self): 38 | self.image = self.get_data_image() 39 | self.peppers = self.get_data_image('peppers.png') 40 | 41 | def test_average_hash(self): 42 | result_hash = imagehash.average_hash(self.image) 43 | known_hash = 'ffd7918181c9ffff' 44 | self.assertEqual(str(result_hash), known_hash) 45 | 46 | def test_phash(self): 47 | result_hash = imagehash.phash(self.image) 48 | known_hash = 'ba8c84536bd3c366' 49 | self.assertEqual(str(result_hash), known_hash) 50 | 51 | def test_dhash(self): 52 | result_hash = imagehash.dhash(self.image) 53 | known_hash = '0026273b2b19550e' 54 | self.assertEqual(str(result_hash), known_hash) 55 | 56 | def test_whash(self): 57 | result_hash = imagehash.whash(self.image) 58 | known_hash = 'ffd391818181a5e7' 59 | self.assertEqual(str(result_hash), known_hash) 60 | 61 | def test_color_hash(self): 62 | result_hash = imagehash.colorhash(self.image) 63 | known_hash = '07007000000' 64 | self.assertEqual(str(result_hash), known_hash) 65 | 66 | def test_crop_resistant_hash(self): 67 | result_hash = imagehash.crop_resistant_hash(self.peppers) 68 | if _pillow_has_convert_fix(): 69 | known_hash = ( 70 | 'c4d9f3e3e1c18101,' 71 | '706c6e66464c99b9,' 72 | '98d8f1ecd8f0f0e1,' 73 | 'a082c0c49acc6dbd,' 74 | 
'f1f39b99c1c1b1b1,' 75 | '3a7ece1c9df4fcb9' 76 | ) 77 | else: 78 | known_hash = ( 79 | 'c4d9f1e3e1c18101,' 80 | '706c6e66464c99b9,' 81 | '98d8f1ecd8f0f0e1,' 82 | 'a282c0c49acc6dbd,' 83 | 'b1f39b99e1c1b1b1,' 84 | '3a7ece1c9df4fcb9' 85 | ) 86 | self.assertEqual(str(result_hash), known_hash) 87 | 88 | def test_crop_resistant_segmentation(self): 89 | # Image pre-processing 90 | image = self.peppers.convert('L') 91 | if _pillow_has_convert_fix(): 92 | known_bw_md5 = '61db06218cc8b9aba14812d965869120' 93 | else: 94 | known_bw_md5 = '61442e74c83cfea67d182481c24c5f3e' 95 | self.assertEqual( 96 | hashlib.md5(image.tobytes()).hexdigest(), 97 | known_bw_md5, 98 | "This hash should match, unless pillow have changed Convert('L') again" 99 | ) 100 | image = image.resize((300, 300), imagehash.ANTIALIAS) 101 | # Add filters 102 | image = image.filter(ImageFilter.GaussianBlur()).filter(ImageFilter.MedianFilter()) 103 | pixels = numpy.array(image).astype(numpy.float32) 104 | # Segment 105 | segments = imagehash._find_all_segments(pixels, 128, 500) 106 | known_segment_count = 6 107 | self.assertEqual(len(segments), known_segment_count) 108 | if _pillow_has_convert_fix(): 109 | known_segments = sorted([ 110 | {'length': 595, 'min_x': 20, 'min_y': 0, 'max_x': 60, 'max_y': 31}, 111 | {'length': 1458, 'min_x': 61, 'min_y': 0, 'max_x': 156, 'max_y': 58}, 112 | {'length': 3505, 'min_x': 0, 'min_y': 111, 'max_x': 97, 'max_y': 191}, 113 | {'length': 8789, 'min_x': 112, 'min_y': 145, 'max_x': 299, 'max_y': 260}, 114 | {'length': 12153, 'min_x': 157, 'min_y': 0, 'max_x': 299, 'max_y': 148}, 115 | {'length': 60916, 'min_x': 0, 'min_y': 0, 'max_x': 299, 'max_y': 299} 116 | ], key=lambda x: (x['length'], x['min_x'], x['min_y'], x['max_x'], x['max_y'])) 117 | else: 118 | known_segments = sorted([ 119 | {'length': 591, 'min_x': 20, 'min_y': 0, 'max_x': 60, 'max_y': 31}, 120 | {'length': 1451, 'min_x': 61, 'min_y': 0, 'max_x': 156, 'max_y': 58}, 121 | {'length': 12040, 'min_x': 157, 'min_y': 0, 
'max_x': 299, 'max_y': 147}, 122 | {'length': 3452, 'min_x': 0, 'min_y': 111, 'max_x': 97, 'max_y': 191}, 123 | {'length': 8701, 'min_x': 112, 'min_y': 145, 'max_x': 299, 'max_y': 259}, 124 | {'length': 61179, 'min_x': 0, 'min_y': 0, 'max_x': 299, 'max_y': 299} 125 | ], key=lambda x: (x['length'], x['min_x'], x['min_y'], x['max_x'], x['max_y'])) 126 | segment_properties = sorted([ 127 | _calculate_segment_properties(segment) for segment in segments 128 | ], key=lambda x: (x['length'], x['min_x'], x['min_y'], x['max_x'], x['max_y'])) 129 | self.assertEqual(segment_properties, known_segments) 130 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | ImageHash 3 | =========== 4 | 5 | An image hashing library written in Python. ImageHash supports: 6 | 7 | * Average hashing 8 | * Perceptual hashing 9 | * Difference hashing 10 | * Wavelet hashing 11 | * HSV color hashing (colorhash) 12 | * Crop-resistant hashing 13 | 14 | |CI|_ |Coveralls|_ 15 | 16 | Rationale 17 | ========= 18 | 19 | Image hashes tell whether two images look nearly identical. 20 | This is different from cryptographic hashing algorithms (like MD5, SHA-1) 21 | where tiny changes in the image give completely different hashes. 22 | In image fingerprinting, we actually want our similar inputs to have 23 | similar output hashes as well. 24 | 25 | The image hash algorithms (average, perceptual, difference, wavelet) 26 | analyse the image structure on luminance (without color information). 27 | The color hash algorithm analyses the color distribution and 28 | black & gray fractions (without position information). 
29 | 30 | Installation 31 | ============ 32 | 33 | Based on PIL/Pillow Image, numpy and scipy.fftpack (for pHash) 34 | Easy installation through `pypi`_:: 35 | 36 | pip install imagehash 37 | 38 | Basic usage 39 | =========== 40 | :: 41 | 42 | >>> from PIL import Image 43 | >>> import imagehash 44 | >>> hash = imagehash.average_hash(Image.open('tests/data/imagehash.png')) 45 | >>> print(hash) 46 | ffd7918181c9ffff 47 | >>> otherhash = imagehash.average_hash(Image.open('tests/data/peppers.png')) 48 | >>> print(otherhash) 49 | 9f172786e71f1e00 50 | >>> print(hash == otherhash) 51 | False 52 | >>> print(hash - otherhash) # hamming distance 53 | 33 54 | 55 | Each algorithm can also have its hash size adjusted (or in the case of 56 | colorhash, its :code:`binbits`). Increasing the hash size allows an 57 | algorithm to store more detail in its hash, increasing its sensitivity 58 | to changes in detail. 59 | 60 | The demo script **find_similar_images** illustrates how to find similar 61 | images in a directory. 62 | 63 | Source hosted at GitHub: https://github.com/JohannesBuchner/imagehash 64 | 65 | References 66 | ----------- 67 | 68 | * Average hashing (`aHashref`_) 69 | * Perceptual hashing (`pHashref`_) 70 | * Difference hashing (`dHashref`_) 71 | * Wavelet hashing (`wHashref`_) 72 | * Crop-resistant hashing (`crop_resistant_hashref`_) 73 | 74 | .. _aHashref: https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html 75 | .. _pHashref: https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html 76 | .. _dHashref: https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html 77 | .. _wHashref: https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5 78 | .. _pypi: https://pypi.python.org/pypi/ImageHash 79 | .. 
_crop_resistant_hashref: https://ieeexplore.ieee.org/document/6980335 80 | 81 | Examples 82 | ========= 83 | 84 | To help evaluate how different hashing algorithms behave, below are a few hashes applied 85 | to two datasets. This will let you know what images an algorithm thinks are basically identical. 86 | 87 | Example 1: Icon dataset 88 | ----------------------- 89 | 90 | Source: 7441 free icons on GitHub (see examples/github-urls.txt). 91 | 92 | The following pages show groups of images with the same hash (the hashing method sees them as the same). 93 | 94 | * `phash `__ (or `with z-transform `__) 95 | * `dhash `__ (or `with z-transform `__) 96 | * `colorhash `__ 97 | * `average_hash `__ (`with z-transform `__) 98 | 99 | The hashes use hashsize=8; colorhash uses binbits=3. 100 | You may want to adjust the hashsize or require some manhattan distance (hash1 - hash2 < threshold). 101 | 102 | Example 2: Art dataset 103 | ---------------------- 104 | 105 | Source: 109259 art pieces from https://www.parismuseescollections.paris.fr/en/recherche/image-libre/. 106 | 107 | The following pages show groups of images with the same hash (the hashing method sees them as the same). 108 | 109 | * `phash `__ (or `with z-transform `__) 110 | * `dhash `__ (or `with z-transform `__) 111 | * `colorhash `__ 112 | * `average_hash `__ (`with z-transform `__) 113 | 114 | For understanding hash distances, check out these excellent blog posts: 115 | * https://tech.okcupid.com/evaluating-perceptual-image-hashes-at-okcupid-e98a3e74aa3a 116 | * https://content-blockchain.org/research/testing-different-image-hash-functions/ 117 | 118 | Storing hashes 119 | ============== 120 | 121 | As illustrated above, hashes can be turned into strings. 122 | The strings can be turned back into a ImageHash object as follows. 
123 | 124 | For single perceptual hashes:: 125 | 126 | >>> original_hash = imagehash.phash(Image.open('tests/data/imagehash.png')) 127 | >>> hash_as_str = str(original_hash) 128 | >>> print(hash_as_str) 129 | ffd7918181c9ffff 130 | >>> restored_hash = imagehash.hex_to_hash(hash_as_str) 131 | >>> print(restored_hash) 132 | ffd7918181c9ffff 133 | >>> assert restored_hash == original_hash 134 | >>> assert str(restored_hash) == hash_as_str 135 | 136 | For crop_resistant_hash:: 137 | 138 | >>> original_hash = imagehash.crop_resistant_hash(Image.open('tests/data/imagehash.png'), min_segment_size=500, segmentation_image_size=1000) 139 | >>> hash_as_str = str(original_hash) 140 | >>> restored_hash = imagehash.hex_to_multihash(hash_as_str) 141 | >>> assert restored_hash == original_hash 142 | >>> assert str(restored_hash) == hash_as_str 143 | 144 | For colorhash:: 145 | 146 | >>> original_hash = imagehash.colorhash(Image.open('tests/data/imagehash.png'), binbits=3) 147 | >>> hash_as_str = str(original_hash) 148 | >>> restored_hash = imagehash.hex_to_flathash(hash_as_str, hashsize=3) 149 | 150 | Efficient database search 151 | ------------------------- 152 | 153 | For storing the hashes in a database and using fast hamming distance 154 | searches, see pointers at https://github.com/JohannesBuchner/imagehash/issues/127 155 | (a blog post on how to do this would be a great contribution!) 156 | 157 | @KDJDEV points to https://github.com/KDJDEV/imagehash-reverse-image-search-tutorial and writes: 158 | In this tutorial I use PostgreSQL and `this extension `_, 159 | and show how you can create a reverse image search using hashes generated by this library. 
160 | 161 | 162 | Changelog 163 | ---------- 164 | 165 | * 4.3: typing annotations by @Avasam @SpangleLabs and @nh2 166 | 167 | * 4.2: Cropping-Resistant image hashing added by @SpangleLabs 168 | 169 | * 4.1: Add examples and colorhash 170 | 171 | * 4.0: Changed binary to hex implementation, because the previous one was broken for various hash sizes. This change breaks compatibility to previously stored hashes; to convert them from the old encoding, use the "old_hex_to_hash" function. 172 | 173 | * 3.5: Image data handling speed-up 174 | 175 | * 3.2: whash now also handles smaller-than-hash images 176 | 177 | * 3.0: dhash had a bug: It computed pixel differences vertically, not horizontally. 178 | I modified it to follow `dHashref`_. The old function is available as dhash_vertical. 179 | 180 | * 2.0: Added whash 181 | 182 | * 1.0: Initial ahash, dhash, phash implementations. 183 | 184 | Contributing 185 | ============= 186 | 187 | Pull requests and new features are warmly welcome. 188 | 189 | If you encounter a bug or have a question, please open a GitHub issue. You can also try Stack Overflow. 190 | 191 | Other projects 192 | ============== 193 | 194 | * https://github.com/commonsmachinery/blockhash-python 195 | * https://github.com/acoomans/instagram-filters 196 | * https://pippy360.github.io/transformationInvariantImageSearch/ 197 | * https://www.phash.org/ 198 | * https://pypi.org/project/dhash/ 199 | * https://github.com/thorn-oss/perception (based on imagehash code, depends on opencv) 200 | * https://docs.opencv.org/3.4/d4/d93/group__img__hash.html 201 | 202 | .. |CI| image:: https://github.com/JohannesBuchner/imagehash/actions/workflows/testing.yml/badge.svg 203 | .. _CI: https://github.com/JohannesBuchner/imagehash/actions/workflows/testing.yml 204 | 205 | .. |Coveralls| image:: https://coveralls.io/repos/github/JohannesBuchner/imagehash/badge.svg 206 | .. 
_Coveralls: https://coveralls.io/github/JohannesBuchner/imagehash 207 | -------------------------------------------------------------------------------- /output.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | ImageHash 8 | 324 | 325 | 326 |
327 |

ImageHash

328 | 329 |

A image hashing library written in Python. 330 | Supports:

331 |
    332 |
  • average hashing (aHash)
  • 333 |
  • perceptual hashing (pHash)
  • 334 |
  • difference hashing (dHash)
  • 335 |
336 |
337 |

Requirements

338 |

Based on PIL Image, numpy and scipy.fftpack (for pHash)

339 |
340 |
341 |

Basic usage

342 |
343 | >>> import Image
344 | >>> import ImageHash
345 | >>> hash = ImageHash.average_hash(Image.open('test.png'))
346 | >>> print hash
347 | d879f8f89b1bbf
348 | >>> otherhash = ImageHash.average_hash(Image.open('other.bmp'))
349 | >>> print otherhash
350 | ffff3720200ffff
351 | >>> print hash == otherhash
352 | False
353 | >>> print hash - otherhash
354 | 36
355 | 
356 |

Demo function find_similar_images illustrates finding similar images in a directory.

357 |

References:

358 |
359 | * pHash implementation following http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html
360 | * dHash implementation following http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html
361 | 
362 |
363 |
364 | 365 | 366 | -------------------------------------------------------------------------------- /imagehash/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Perceptual image hashing library 3 | 4 | Example: 5 | 6 | >>> from PIL import Image 7 | >>> import imagehash 8 | >>> hash = imagehash.average_hash(Image.open('test.png')) 9 | >>> print(hash) 10 | d879f8f89b1bbf 11 | >>> otherhash = imagehash.average_hash(Image.open('other.bmp')) 12 | >>> print(otherhash) 13 | ffff3720200ffff 14 | >>> print(hash == otherhash) 15 | False 16 | >>> print(hash - otherhash) 17 | 36 18 | >>> for r in range(1, 30, 5): 19 | ... rothash = imagehash.average_hash(Image.open('test.png').rotate(r)) 20 | ... print('Rotation by %d: %d Hamming difference' % (r, hash - rothash)) 21 | ... 22 | Rotation by 1: 2 Hamming difference 23 | Rotation by 6: 11 Hamming difference 24 | Rotation by 11: 13 Hamming difference 25 | Rotation by 16: 17 Hamming difference 26 | Rotation by 21: 19 Hamming difference 27 | Rotation by 26: 21 Hamming difference 28 | >>> 29 | """ 30 | 31 | from __future__ import absolute_import, division, print_function 32 | 33 | import sys 34 | 35 | import numpy 36 | from PIL import Image, ImageFilter 37 | 38 | try: 39 | ANTIALIAS = Image.Resampling.LANCZOS 40 | except AttributeError: 41 | # deprecated in pillow 10 42 | # https://pillow.readthedocs.io/en/stable/deprecations.html 43 | ANTIALIAS = Image.ANTIALIAS 44 | 45 | __version__ = '4.3.2' 46 | 47 | """ 48 | You may copy this file, if you keep the copyright information below: 49 | 50 | 51 | Copyright (c) 2013-2022, Johannes Buchner 52 | https://github.com/JohannesBuchner/imagehash 53 | 54 | All rights reserved. 
def _binary_array_to_hex(arr):
	"""
	internal function to make a hex string out of a binary array.
	"""
	# Flatten to a '1'/'0' bit string, then render it as zero-padded hex,
	# one hex digit per 4 bits (rounded up).
	bit_string = ''.join(str(b) for b in 1 * arr.flatten())
	width = int(numpy.ceil(len(bit_string) / 4))
	return '{:0>{width}x}'.format(int(bit_string, 2), width=width)


class ImageHash:
	"""
	Hash encapsulation. Can be used for dictionary keys and comparisons.
	"""

	def __init__(self, binary_array):
		# type: (NDArray) -> None
		# The raw boolean array holding the hash bits.
		self.hash = binary_array  # type: NDArray

	def __str__(self):
		return _binary_array_to_hex(self.hash.flatten())

	def __repr__(self):
		return repr(self.hash)

	def __sub__(self, other):
		# type: (ImageHash) -> int
		"""Return the hamming distance (number of differing bits) to *other*."""
		if other is None:
			raise TypeError('Other hash must not be None.')
		if self.hash.size != other.hash.size:
			raise TypeError('ImageHashes must be of the same shape.', self.hash.shape, other.hash.shape)
		return numpy.count_nonzero(self.hash.flatten() != other.hash.flatten())

	def __eq__(self, other):
		# type: (object) -> bool
		# An ImageHash never equals None.
		if other is None:
			return False
		return numpy.array_equal(self.hash.flatten(), other.hash.flatten())  # type: ignore

	def __ne__(self, other):
		# type: (object) -> bool
		# BUG FIX: __eq__ declares an ImageHash never equal to None, so
		# `hash != None` must be True. The previous code returned False
		# here, contradicting __eq__ and making both comparisons falsy.
		if other is None:
			return True
		return not numpy.array_equal(self.hash.flatten(), other.hash.flatten())  # type: ignore

	def __hash__(self):
		# this returns a 8 bit integer, intentionally shortening the information
		return sum([2**(i % 8) for i, v in enumerate(self.hash.flatten()) if v])

	def __len__(self):
		# Returns the bit length of the hash
		return self.hash.size
# type of Callable
if sys.version_info >= (3, 3):
	if sys.version_info >= (3, 9, 0) and sys.version_info <= (3, 9, 1):
		# collections.abc.Callable cannot be subscripted on 3.9.0/3.9.1:
		# https://stackoverflow.com/questions/65858528/is-collections-abc-callable-bugged-in-python-3-9-1
		from typing import Callable
	else:
		from collections.abc import Callable
	try:
		MeanFunc = Callable[[NDArray], float]
		HashFunc = Callable[[Image.Image], ImageHash]
	except TypeError:
		MeanFunc = Callable  # type: ignore
		HashFunc = Callable  # type: ignore
# end of dynamic code for typing


def hex_to_hash(hexstr):
	# type: (str) -> ImageHash
	"""
	Convert a stored hash (hex, as retrieved from str(Imagehash))
	back to an ImageHash object.

	Notes:
	1. This algorithm assumes all hashes are either
	   bidimensional arrays with dimensions hash_size * hash_size,
	   or onedimensional arrays with dimensions binbits * 14.
	2. This algorithm does not work for hash_size < 2.
	"""
	hash_size = int(numpy.sqrt(len(hexstr) * 4))
	# assert hash_size == numpy.sqrt(len(hexstr)*4)
	binary_array = '{:0>{width}b}'.format(int(hexstr, 16), width=hash_size * hash_size)
	bit_rows = [binary_array[i:i + hash_size] for i in range(0, len(binary_array), hash_size)]
	hash_array = numpy.array([[bool(int(d)) for d in row] for row in bit_rows])
	return ImageHash(hash_array)


def hex_to_flathash(hexstr, hashsize):
	# type: (str, int) -> ImageHash
	"""
	Convert a stored flat hash (hex, as retrieved from str(Imagehash))
	back to a one-dimensional ImageHash object of len(hexstr) * 4 bits.
	"""
	hash_size = int(len(hexstr) * 4 / (hashsize))
	binary_array = '{:0>{width}b}'.format(int(hexstr, 16), width=hash_size * hashsize)
	# BUGFIX: build a flat (1-D) boolean array. The previous code wrapped the
	# comprehension in an extra list, producing a (1, N) array on which the
	# trailing slice operated on the length-1 outer axis and was a no-op.
	hash_array = numpy.array([bool(int(d)) for d in binary_array])[-hash_size * hashsize:]
	return ImageHash(hash_array)


def hex_to_multihash(hexstr):
	# type: (str) -> ImageMultiHash
	"""
	Convert a stored multihash (hex, as retrieved from str(ImageMultiHash))
	back to an ImageMultiHash object.

	This function is based on hex_to_hash so the same caveats apply. Namely:

	1. This algorithm assumes all hashes are either
	   bidimensional arrays with dimensions hash_size * hash_size,
	   or onedimensional arrays with dimensions binbits * 14.
	2. This algorithm does not work for hash_size < 2.
	"""
	split = hexstr.split(',')
	hashes = [hex_to_hash(x) for x in split]
	return ImageMultiHash(hashes)


def old_hex_to_hash(hexstr, hash_size=8):
	# type: (str, int) -> ImageHash
	"""
	Convert a stored hash (hex, as retrieved from str(Imagehash))
	back to an ImageHash object. This method should be used for
	hashes generated by ImageHash up to version 3.7. For hashes
	generated by newer versions of ImageHash, hex_to_hash should
	be used instead.
	"""
	arr = []
	count = hash_size * (hash_size // 4)
	if len(hexstr) != count:
		emsg = 'Expected hex string size of {}.'
		raise ValueError(emsg.format(count))
	for i in range(count // 2):
		h = hexstr[i * 2:i * 2 + 2]
		v = int('0x' + h, 16)
		# each byte expands to 8 bits, least significant bit first
		arr.append([v & 2**j > 0 for j in range(8)])
	return ImageHash(numpy.array(arr))


def average_hash(image, hash_size=8, mean=numpy.mean):
	# type: (Image.Image, int, MeanFunc) -> ImageHash
	"""
	Average Hash computation

	Implementation follows https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html

	Step by step explanation: https://web.archive.org/web/20171112054354/https://www.safaribooksonline.com/blog/2013/11/26/image-hashing-with-python/ # noqa: E501

	@image must be a PIL instance.
	@mean how to determine the average luminance. can try numpy.median instead.
	"""
	if hash_size < 2:
		raise ValueError('Hash size must be greater than or equal to 2')

	# reduce size and complexity, then convert to grayscale
	image = image.convert('L').resize((hash_size, hash_size), ANTIALIAS)

	# find average pixel value; 'pixels' is an array of the pixel values, ranging from 0 (black) to 255 (white)
	pixels = numpy.asarray(image)
	avg = mean(pixels)

	# create string of bits
	diff = pixels > avg
	# make a hash
	return ImageHash(diff)
def phash(image, hash_size=8, highfreq_factor=4):
	# type: (Image.Image, int, int) -> ImageHash
	"""
	Perceptual Hash computation.

	Implementation follows https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html

	@image must be a PIL instance.
	"""
	if hash_size < 2:
		raise ValueError('Hash size must be greater than or equal to 2')

	import scipy.fftpack
	img_size = hash_size * highfreq_factor
	image = image.convert('L').resize((img_size, img_size), ANTIALIAS)
	pixels = numpy.asarray(image)
	# 2D DCT, then keep only the top-left (low frequency) block
	dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
	dctlowfreq = dct[:hash_size, :hash_size]
	med = numpy.median(dctlowfreq)
	diff = dctlowfreq > med
	return ImageHash(diff)


def phash_simple(image, hash_size=8, highfreq_factor=4):
	# type: (Image.Image, int, int) -> ImageHash
	"""
	Perceptual Hash computation.

	Implementation follows https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html

	@image must be a PIL instance.
	"""
	# validate input like the other hash functions in this module
	if hash_size < 2:
		raise ValueError('Hash size must be greater than or equal to 2')

	import scipy.fftpack
	img_size = hash_size * highfreq_factor
	image = image.convert('L').resize((img_size, img_size), ANTIALIAS)
	pixels = numpy.asarray(image)
	# 1D DCT; skip the first (DC) coefficient column
	dct = scipy.fftpack.dct(pixels)
	dctlowfreq = dct[:hash_size, 1:hash_size + 1]
	avg = dctlowfreq.mean()
	diff = dctlowfreq > avg
	return ImageHash(diff)


def dhash(image, hash_size=8):
	# type: (Image.Image, int) -> ImageHash
	"""
	Difference Hash computation.

	following https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html

	computes differences horizontally

	@image must be a PIL instance.
	"""
	# resize(w, h), but numpy.array((h, w))
	if hash_size < 2:
		raise ValueError('Hash size must be greater than or equal to 2')

	image = image.convert('L').resize((hash_size + 1, hash_size), ANTIALIAS)
	pixels = numpy.asarray(image)
	# compute differences between columns
	diff = pixels[:, 1:] > pixels[:, :-1]
	return ImageHash(diff)


def dhash_vertical(image, hash_size=8):
	# type: (Image.Image, int) -> ImageHash
	"""
	Difference Hash computation.

	following https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html

	computes differences vertically

	@image must be a PIL instance.
	"""
	# validate input like dhash does
	if hash_size < 2:
		raise ValueError('Hash size must be greater than or equal to 2')

	# resize(w, h), but numpy.array((h, w))
	image = image.convert('L').resize((hash_size, hash_size + 1), ANTIALIAS)
	pixels = numpy.asarray(image)
	# compute differences between rows
	diff = pixels[1:, :] > pixels[:-1, :]
	return ImageHash(diff)


def whash(image, hash_size=8, image_scale=None, mode='haar', remove_max_haar_ll=True):
	# type: (Image.Image, int, int | None, WhashMode, bool) -> ImageHash
	"""
	Wavelet Hash computation.

	based on https://www.kaggle.com/c/avito-duplicate-ads-detection/

	@image must be a PIL instance.
	@hash_size must be a power of 2 and less than @image_scale.
	@image_scale must be power of 2 and less than image size. By default is equal to max
	power of 2 for an input image.
	@mode (see modes in pywt library):
		'haar' - Haar wavelets, by default
		'db4' - Daubechies wavelets
	@remove_max_haar_ll - remove the lowest low level (LL) frequency using Haar wavelet.
	"""
	import pywt
	# NOTE: preconditions are checked with assert (stripped under -O); kept
	# for backward compatibility with callers expecting AssertionError.
	if image_scale is not None:
		assert image_scale & (image_scale - 1) == 0, 'image_scale is not power of 2'
	else:
		image_natural_scale = 2**int(numpy.log2(min(image.size)))
		image_scale = max(image_natural_scale, hash_size)

	ll_max_level = int(numpy.log2(image_scale))

	level = int(numpy.log2(hash_size))
	assert hash_size & (hash_size - 1) == 0, 'hash_size is not power of 2'
	assert level <= ll_max_level, 'hash_size in a wrong range'
	dwt_level = ll_max_level - level

	image = image.convert('L').resize((image_scale, image_scale), ANTIALIAS)
	pixels = numpy.asarray(image) / 255.

	# Remove low level frequency LL(max_ll) if @remove_max_haar_ll using haar filter
	if remove_max_haar_ll:
		coeffs = pywt.wavedec2(pixels, 'haar', level=ll_max_level)
		coeffs = list(coeffs)
		coeffs[0] *= 0
		pixels = pywt.waverec2(coeffs, 'haar')

	# Use LL(K) as freq, where K is log2(@hash_size)
	coeffs = pywt.wavedec2(pixels, mode, level=dwt_level)
	dwt_low = coeffs[0]

	# Subtract median and compute hash
	med = numpy.median(dwt_low)
	diff = dwt_low > med
	return ImageHash(diff)
def colorhash(image, binbits=3):
	# type: (Image.Image, int) -> ImageHash
	"""
	Color Hash computation.

	Computes fractions of image in intensity, hue and saturation bins:

	* the first binbits encode the black fraction of the image
	* the next binbits encode the gray fraction of the remaining image (low saturation)
	* the next 6*binbits encode the fraction in 6 bins of hue, for mildly saturated parts of the remaining image
	* the next 6*binbits encode the fraction in 6 bins of hue, for highly saturated parts of the remaining image

	(DOCFIX: the histogrammed quantity in the 6-bin groups is hue, not
	saturation, and the mildly saturated counts come first, matching the
	append order in the code below.)

	@binbits number of bits to use to encode each pixel fractions
	"""

	# bin in hsv space:
	intensity = numpy.asarray(image.convert('L')).flatten()
	h, s, v = [numpy.asarray(v).flatten() for v in image.convert('HSV').split()]
	# black bin
	mask_black = intensity < 256 // 8
	frac_black = mask_black.mean()
	# gray bin (low saturation, but not black)
	mask_gray = s < 256 // 3
	frac_gray = numpy.logical_and(~mask_black, mask_gray).mean()
	# two color bins (medium and high saturation, not in the two above)
	mask_colors = numpy.logical_and(~mask_black, ~mask_gray)
	mask_faint_colors = numpy.logical_and(mask_colors, s < 256 * 2 // 3)
	# NOTE: pixels with s exactly == 256 * 2 // 3 fall in neither color bin;
	# kept as-is so existing stored hashes remain comparable.
	mask_bright_colors = numpy.logical_and(mask_colors, s > 256 * 2 // 3)

	c = max(1, mask_colors.sum())
	# in the color bins, make sub-bins by hue
	hue_bins = numpy.linspace(0, 255, 6 + 1)
	if mask_faint_colors.any():
		h_faint_counts, _ = numpy.histogram(h[mask_faint_colors], bins=hue_bins)
	else:
		h_faint_counts = numpy.zeros(len(hue_bins) - 1)
	if mask_bright_colors.any():
		h_bright_counts, _ = numpy.histogram(h[mask_bright_colors], bins=hue_bins)
	else:
		h_bright_counts = numpy.zeros(len(hue_bins) - 1)

	# now we have fractions in each category (6*2 + 2 = 14 bins)
	# convert to hash and discretize:
	maxvalue = 2**binbits
	values = [min(maxvalue - 1, int(frac_black * maxvalue)), min(maxvalue - 1, int(frac_gray * maxvalue))]
	for counts in list(h_faint_counts) + list(h_bright_counts):
		values.append(min(maxvalue - 1, int(counts / c * maxvalue)))
	# encode each discretized value as binbits bits, most significant first
	bitarray = []
	for v in values:
		bitarray += [v // (2**(binbits - i - 1)) % 2**(binbits - i) > 0 for i in range(binbits)]
	return ImageHash(numpy.asarray(bitarray).reshape((-1, binbits)))
int(frac_gray * maxvalue))] 440 | for counts in list(h_faint_counts) + list(h_bright_counts): 441 | values.append(min(maxvalue - 1, int(counts / c * maxvalue))) 442 | # print(values) 443 | bitarray = [] 444 | for v in values: 445 | bitarray += [v // (2**(binbits - i - 1)) % 2**(binbits - i) > 0 for i in range(binbits)] 446 | return ImageHash(numpy.asarray(bitarray).reshape((-1, binbits))) 447 | 448 | 449 | class ImageMultiHash: 450 | """ 451 | This is an image hash containing a list of individual hashes for segments of the image. 452 | The matching logic is implemented as described in Efficient Cropping-Resistant Robust Image Hashing 453 | """ 454 | 455 | def __init__(self, hashes): 456 | # type: (list[ImageHash]) -> None 457 | self.segment_hashes = hashes # type: list[ImageHash] 458 | 459 | def __eq__(self, other): 460 | # type: (object) -> bool 461 | if other is None: 462 | return False 463 | return self.matches(other) # type: ignore 464 | 465 | def __ne__(self, other): 466 | # type: (object) -> bool 467 | return not self.matches(other) # type: ignore 468 | 469 | def __sub__(self, other, hamming_cutoff=None, bit_error_rate=None): 470 | # type: (ImageMultiHash, float | None, float | None) -> float 471 | matches, sum_distance = self.hash_diff(other, hamming_cutoff, bit_error_rate) 472 | max_difference = len(self.segment_hashes) 473 | if matches == 0: 474 | return max_difference 475 | max_distance = matches * len(self.segment_hashes[0]) 476 | tie_breaker = 0 - (float(sum_distance) / max_distance) 477 | match_score = matches + tie_breaker 478 | return max_difference - match_score 479 | 480 | def __hash__(self): 481 | return hash(tuple(hash(segment) for segment in self.segment_hashes)) 482 | 483 | def __str__(self): 484 | return ','.join(str(x) for x in self.segment_hashes) 485 | 486 | def __repr__(self): 487 | return repr(self.segment_hashes) 488 | 489 | def hash_diff(self, other_hash, hamming_cutoff=None, bit_error_rate=None): 490 | # type: (ImageMultiHash, float | 
None, float | None) -> tuple[int, int] 491 | """ 492 | Gets the difference between two multi-hashes, as a tuple. The first element of the tuple is the number of 493 | matching segments, and the second element is the sum of the hamming distances of matching hashes. 494 | NOTE: Do not order directly by this tuple, as higher is better for matches, and worse for hamming cutoff. 495 | :param other_hash: The image multi hash to compare against 496 | :param hamming_cutoff: The maximum hamming distance to a region hash in the target hash 497 | :param bit_error_rate: Percentage of bits which can be incorrect, an alternative to the hamming cutoff. The 498 | default of 0.25 means that the segment hashes can be up to 25% different 499 | """ 500 | # Set default hamming cutoff if it's not set. 501 | if hamming_cutoff is None: 502 | if bit_error_rate is None: 503 | bit_error_rate = 0.25 504 | hamming_cutoff = len(self.segment_hashes[0]) * bit_error_rate 505 | # Get the hash distance for each region hash within cutoff 506 | distances = [] 507 | for segment_hash in self.segment_hashes: 508 | lowest_distance = min( 509 | segment_hash - other_segment_hash 510 | for other_segment_hash in other_hash.segment_hashes 511 | ) 512 | if lowest_distance > hamming_cutoff: 513 | continue 514 | distances.append(lowest_distance) 515 | return len(distances), sum(distances) 516 | 517 | def matches(self, other_hash, region_cutoff=1, hamming_cutoff=None, bit_error_rate=None): 518 | # type: (ImageMultiHash, int, float | None, float | None) -> bool 519 | """ 520 | Checks whether this hash matches another crop resistant hash, `other_hash`. 521 | :param other_hash: The image multi hash to compare against 522 | :param region_cutoff: The minimum number of regions which must have a matching hash 523 | :param hamming_cutoff: The maximum hamming distance to a region hash in the target hash 524 | :param bit_error_rate: Percentage of bits which can be incorrect, an alternative to the hamming cutoff. 
The 525 | default of 0.25 means that the segment hashes can be up to 25% different 526 | """ 527 | matches, _ = self.hash_diff(other_hash, hamming_cutoff, bit_error_rate) 528 | return matches >= region_cutoff 529 | 530 | def best_match(self, other_hashes, hamming_cutoff=None, bit_error_rate=None): 531 | # type: (list[ImageMultiHash], float | None, float | None) -> ImageMultiHash 532 | """ 533 | Returns the hash in a list which is the best match to the current hash 534 | :param other_hashes: A list of image multi hashes to compare against 535 | :param hamming_cutoff: The maximum hamming distance to a region hash in the target hash 536 | :param bit_error_rate: Percentage of bits which can be incorrect, an alternative to the hamming cutoff. 537 | Defaults to 0.25 if unset, which means the hash can be 25% different 538 | """ 539 | return min( 540 | other_hashes, 541 | key=lambda other_hash: self.__sub__(other_hash, hamming_cutoff, bit_error_rate) 542 | ) 543 | 544 | 545 | def _find_region(remaining_pixels, segmented_pixels): 546 | """ 547 | Finds a region and returns a set of pixel coordinates for it. 548 | :param remaining_pixels: A numpy bool array, with True meaning the pixels are remaining to segment 549 | :param segmented_pixels: A set of pixel coordinates which have already been assigned to segment. This will be 550 | updated with the new pixels added to the returned segment. 
def _find_all_segments(pixels, segment_threshold, min_segment_size):
	"""
	Finds all the regions within an image pixel array, and returns a list of the regions.

	Note: Slightly different segmentations are produced when using pillow version 6 vs. >=7, due to a change in
	rounding in the greyscale conversion.
	:param pixels: A numpy array of the pixel brightnesses.
	:param segment_threshold: The brightness threshold to use when differentiating between hills and valleys.
	:param min_segment_size: The minimum number of pixels for a segment.
	"""
	# numpy shape is (rows, cols); the previous local names called these
	# (width, height), which only coincides for the square images used here.
	num_rows, num_cols = pixels.shape
	# threshold pixels
	threshold_pixels = pixels > segment_threshold
	unassigned_pixels = numpy.full(pixels.shape, True, dtype=bool)

	segments = []
	already_segmented = set()

	# Add all the pixels around the border outside the image:
	already_segmented.update([(-1, z) for z in range(num_cols)])
	already_segmented.update([(z, -1) for z in range(num_rows)])
	already_segmented.update([(num_rows, z) for z in range(num_cols)])
	already_segmented.update([(z, num_cols) for z in range(num_rows)])

	# Find all the "hill" regions
	while numpy.bitwise_and(threshold_pixels, unassigned_pixels).any():
		remaining_pixels = numpy.bitwise_and(threshold_pixels, unassigned_pixels)
		segment = _find_region(remaining_pixels, already_segmented)
		# Apply segment
		if len(segment) > min_segment_size:
			segments.append(segment)
		for pix in segment:
			unassigned_pixels[pix] = False

	# Invert the threshold matrix, and find "valleys"
	threshold_pixels_i = numpy.invert(threshold_pixels)
	while len(already_segmented) < num_rows * num_cols:
		remaining_pixels = numpy.bitwise_and(threshold_pixels_i, unassigned_pixels)
		segment = _find_region(remaining_pixels, already_segmented)
		# Apply segment
		if len(segment) > min_segment_size:
			segments.append(segment)
		for pix in segment:
			unassigned_pixels[pix] = False

	return segments


def crop_resistant_hash(
	image,  # type: Image.Image
	hash_func=dhash,  # type: HashFunc
	limit_segments=None,  # type: int | None
	segment_threshold=128,  # type: int
	min_segment_size=500,  # type: int
	segmentation_image_size=300  # type: int
):
	# type: (...) -> ImageMultiHash
	"""
	Creates a CropResistantHash object, by the algorithm described in the paper "Efficient Cropping-Resistant Robust
	Image Hashing". DOI 10.1109/ARES.2014.85
	This algorithm partitions the image into bright and dark segments, using a watershed-like algorithm, and then does
	an image hash on each segment. This makes the image much more resistant to cropping than other algorithms, with
	the paper claiming resistance to up to 50% cropping, while most other algorithms stop at about 5% cropping.

	Note: Slightly different segmentations are produced when using pillow version 6 vs. >=7, due to a change in
	rounding in the greyscale conversion. This leads to a slightly different result.
	:param image: The image to hash
	:param hash_func: The hashing function to use
	:param limit_segments: If you have storage requirements, you can limit to hashing only the M largest segments
	:param segment_threshold: Brightness threshold between hills and valleys. This should be static, putting it between
	peak and through dynamically breaks the matching
	:param min_segment_size: Minimum number of pixels for a hashable segment
	:param segmentation_image_size: Size which the image is resized to before segmentation
	"""

	orig_image = image.copy()
	# Convert to gray scale and resize
	image = image.convert('L').resize((segmentation_image_size, segmentation_image_size), ANTIALIAS)
	# Add filters to smooth out noise before segmentation
	image = image.filter(ImageFilter.GaussianBlur()).filter(ImageFilter.MedianFilter())
	pixels = numpy.array(image).astype(numpy.float32)

	segments = _find_all_segments(pixels, segment_threshold, min_segment_size)

	# If there are no segments, have 1 segment including the whole image
	if not segments:
		full_image_segment = {(0, 0), (segmentation_image_size - 1, segmentation_image_size - 1)}
		segments.append(full_image_segment)

	# If segment limit is set, discard the smaller segments
	if limit_segments:
		segments = sorted(segments, key=len, reverse=True)[:limit_segments]

	# Scale factors from the segmentation image back to the original image
	# (loop-invariant, so computed once rather than per segment)
	orig_w, orig_h = orig_image.size
	scale_w = float(orig_w) / segmentation_image_size
	scale_h = float(orig_h) / segmentation_image_size

	# Create bounding box for each segment and hash the cropped region
	hashes = []
	for segment in segments:
		# segment coordinates are (row, col), i.e. (y, x)
		min_y = min(coord[0] for coord in segment) * scale_h
		min_x = min(coord[1] for coord in segment) * scale_w
		max_y = (max(coord[0] for coord in segment) + 1) * scale_h
		max_x = (max(coord[1] for coord in segment) + 1) * scale_w
		# Compute robust hash for each bounding box
		bounding_box = orig_image.crop((min_x, min_y, max_x, max_y))
		hashes.append(hash_func(bounding_box))
	return ImageMultiHash(hashes)