├── .gitignore
├── README.md
└── code
    ├── db.shelve
    ├── gather.py
    ├── index.py
    ├── output.csv
    └── search.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.pyc
3 | post.md


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Check out the blog post here -> https://realpython.com/blog/python/fingerprinting-images-for-near-duplicate-detection/


--------------------------------------------------------------------------------
/code/db.shelve:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/realpython/image-fingerprinting/3a3dcfbd0a08f704c135b77bc909aac41f63ee64/code/db.shelve


--------------------------------------------------------------------------------
/code/gather.py:
--------------------------------------------------------------------------------
 1 | # USAGE
 2 | # python gather.py --input 101_ObjectCategories --output images --csv output.csv
 3 | 
 4 | # import the necessary packages
 5 | from PIL import Image
 6 | import argparse
 7 | import random
 8 | import shutil
 9 | import glob2
10 | import uuid
11 | 
# an image is picked for near-duplication with a 1 in DUPLICATE_ODDS chance
DUPLICATE_ODDS = 500


def scaled_size(size, factor):
    """Return ``size`` scaled by ``factor``, preserving the aspect ratio.

    ``size`` is a ``(width, height)`` pair and ``factor`` the multiplier
    applied to the width. The height is derived from the ratio between
    the truncated new width and the original width, so both sides shrink
    or grow together. Each dimension is clamped to at least 1 so that a
    very small image never ends up with an empty target size.
    """
    width = max(1, int(size[0] * factor))
    ratio = width / float(size[0])
    height = max(1, int(size[1] * ratio))
    return (width, height)


def main():
    """Copy the input images under random names and add near-duplicates.

    Every ``<input>/*/*.jpg`` image is copied into the output directory
    under a random UUID filename. Roughly one image in ``DUPLICATE_ODDS``
    is additionally written out between 1 and 8 times as a slightly
    resized version; the filename of the copy and the number of resized
    versions are recorded as one ``filename,count`` row in the CSV file.
    """
    # imported here because the script's import header does not provide os
    import os

    # construct the argument parse and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--input", required = True,
        help = "input directory of images")
    ap.add_argument("-o", "--output", required = True,
        help = "output directory")
    ap.add_argument("-c", "--csv", required = True,
        help = "path to CSV file for image counts")
    args = vars(ap.parse_args())

    # create the output directory up front instead of failing on the
    # first copy
    if not os.path.isdir(args["output"]):
        os.makedirs(args["output"])

    # open the output file for writing; the with-block closes it even if
    # an image fails to load part-way through
    with open(args["csv"], "w") as output:
        # loop over the input images
        pattern = os.path.join(args["input"], "*", "*.jpg")
        for imagePath in glob2.iglob(pattern):
            # generate a random filename for the image and copy it to
            # the output location
            filename = str(uuid.uuid4()) + ".jpg"
            shutil.copy(imagePath, os.path.join(args["output"], filename))

            # randrange(N) yields 0..N-1, so this is exactly a 1 in N
            # chance (randint(0, N) would include both end points)
            if random.randrange(DUPLICATE_ODDS) != 0:
                continue

            # initialize the number of times the image is being
            # duplicated and write it to the output CSV file
            numTimes = random.randint(1, 8)
            output.write("%s,%d\n" % (filename, numTimes))

            # loop over a random number of times for this image to
            # be duplicated
            for _ in range(numTimes):
                adjFilename = str(uuid.uuid4()) + ".jpg"

                with Image.open(imagePath) as image:
                    # randomly resize the image, preserving aspect ratio;
                    # LANCZOS is the filter formerly named ANTIALIAS
                    factor = random.uniform(0.95, 1.05)
                    resized = image.resize(scaled_size(image.size, factor),
                        Image.LANCZOS)

                    # save the resized image (not a copy of the original)
                    # so the output really is a near-duplicate
                    resized.save(os.path.join(args["output"], adjFilename))


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/code/index.py:
--------------------------------------------------------------------------------
 1 | # USAGE
 2 | # python index.py --dataset images --shelve db.shelve
 3 | 
 4 | # import the necessary packages
 5 | from PIL import Image
 6 | import imagehash
 7 | import argparse
 8 | import shelve
 9 | import glob
10 | 
def merged_filenames(existing, filename):
    """Return ``existing`` with ``filename`` appended if not yet present.

    A new list is returned and ``existing`` is left unmodified. Skipping
    names that are already listed keeps a second indexing run over the
    same dataset from storing every filename twice.
    """
    if filename in existing:
        return list(existing)
    return existing + [filename]


def main():
    """Index every ``<dataset>/*.jpg`` image by its difference hash.

    The shelve database maps the string form of each image's difference
    hash to the list of filenames (without directory) that produced it.
    """
    # imported here because the script's import header does not provide os
    import os

    # construct the argument parse and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-d", "--dataset", required = True,
        help = "path to input dataset of images")
    ap.add_argument("-s", "--shelve", required = True,
        help = "output shelve database")
    args = vars(ap.parse_args())

    # open the shelve database; every update below is a plain assignment,
    # so the in-memory write-back cache is not needed
    db = shelve.open(args["shelve"])

    try:
        # loop over the image dataset
        for imagePath in glob.glob(os.path.join(args["dataset"], "*.jpg")):
            # load the image and compute the difference hash
            with Image.open(imagePath) as image:
                h = str(imagehash.dhash(image))

            # extract the filename from the path (basename handles the
            # platform's path separator) and update the database using
            # the hash as the key and the filename appended to the list
            # of values
            filename = os.path.basename(imagePath)
            db[h] = merged_filenames(db.get(h, []), filename)
    finally:
        # close the shelf database even if an image could not be read
        db.close()


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/code/output.csv:
--------------------------------------------------------------------------------
 1 | 84eba74d-38ae-4bf6-b8bd-79ffa1dad23a.jpg,2
 2 | 329d1e69-9595-4a1e-abaf-eab35cb8aafa.jpg,1
 3 | 0cd147af-86e8-4fe1-9846-b23344c58293.jpg,1
 4 | 4b653480-a0d5-41e4-b91b-f814cd5289eb.jpg,1
 5 | d09513c4-d671-45df-bd78-e2fa08192341.jpg,7
 6 | 9d355a22-3d59-465e-ad14-138a4e3880bc.jpg,3
 7 | 74c5e5ef-7c75-4ef0-bed0-26468f5534a8.jpg,8
 8 | 273fe667-ae47-4b44-bc9a-8682c944e158.jpg,5
 9 | 9c48e7f3-bd19-4650-acd7-6bfdc1490493.jpg,3
10 | 5134e0c2-34d3-40b6-9473-98de8be16c67.jpg,3
11 | df047086-73dc-480a-8de0-5b5d7b0c00b6.jpg,3
12 | cbc423f5-776e-41ba-817b-d108bdc2eec7.jpg,3
13 | 513d2809-afbe-4081-aee7-29bf61c6cf5a.jpg,6
14 | f2d069fe-9494-426a-b8ef-dfebe4d3da03.jpg,1
15 | 5db94304-23c4-42b2-aa17-6f9669c04071.jpg,8
16 | 1912d003-ff59-4585-9eed-58736297051c.jpg,7
17 | f27716c7-c44b-40e0-a754-9c47c50bcc23.jpg,7
18 | 


--------------------------------------------------------------------------------
/code/search.py:
--------------------------------------------------------------------------------
 1 | # USAGE
 2 | # python search.py --dataset images --shelve db.shelve --query images/84eba74d-38ae-4bf6-b8bd-79ffa1dad23a.jpg
 3 | 
 4 | # import the necessary packages
 5 | from PIL import Image
 6 | import imagehash
 7 | import argparse
 8 | import shelve
 9 | 
def matching_filenames(db, imageHash):
    """Return the filenames stored under ``imageHash`` in ``db``.

    An empty list is returned when the hash is not in the database, so a
    query without any match reports zero results instead of raising
    ``KeyError``.
    """
    return db.get(imageHash, [])


def main():
    """Show every indexed image whose difference hash equals the query's."""
    # imported here because the script's import header does not provide os
    import os

    # construct the argument parse and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-d", "--dataset", required = True,
        help = "path to dataset of images")
    ap.add_argument("-s", "--shelve", required = True,
        help = "output shelve database")
    ap.add_argument("-q", "--query", required = True,
        help = "path to the query image")
    args = vars(ap.parse_args())

    # open the shelve database
    db = shelve.open(args["shelve"])

    try:
        # load the query image, compute the difference image hash, and
        # grab the images from the database that have the same hash
        # value
        with Image.open(args["query"]) as query:
            h = str(imagehash.dhash(query))
        filenames = matching_filenames(db, h)

        # a single parenthesized argument prints the same text on
        # Python 2 and Python 3
        print("Found %d images" % (len(filenames)))

        # loop over the images
        for filename in filenames:
            image = Image.open(os.path.join(args["dataset"], filename))
            image.show()
    finally:
        # close the shelve database even if an image could not be opened
        db.close()


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------