├── .gitignore ├── README.md └── code ├── db.shelve ├── gather.py ├── index.py ├── output.csv └── search.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | post.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Check out the blog post here -> https://realpython.com/blog/python/fingerprinting-images-for-near-duplicate-detection/ -------------------------------------------------------------------------------- /code/db.shelve: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/realpython/image-fingerprinting/3a3dcfbd0a08f704c135b77bc909aac41f63ee64/code/db.shelve -------------------------------------------------------------------------------- /code/gather.py: -------------------------------------------------------------------------------- 1 | # USAGE 2 | # python gather.py --input 101_ObjectCategories --output images --csv output.csv 3 | 4 | # import the necessary packages 5 | from PIL import Image 6 | import argparse 7 | import random 8 | import shutil 9 | import glob2 10 | import uuid 11 | 12 | # construct the argument parse and parse the arguments 13 | ap = argparse.ArgumentParser() 14 | ap.add_argument("-i", "--input", required = True, 15 | help = "input directory of images") 16 | ap.add_argument("-o", "--output", required = True, 17 | help = "output directory") 18 | ap.add_argument("-c", "--csv", required = True, 19 | help = "path to CSV file for image counts") 20 | args = vars(ap.parse_args()) 21 | 22 | # open the output file for writing 23 | output = open(args["csv"], "w") 24 | 25 | # loop over the input images 26 | for imagePath in glob2.iglob(args["input"] + "/*/*.jpg"): 27 | # generate a random filename for the image and copy it to 28 | # the output location 29 | filename = str(uuid.uuid4()) + ".jpg" 30 | shutil.copy(imagePath, args["output"] + "/" + filename) 31 | 32 | # there is a 1 in 500 chance that multiple copies of this 33 | # image will be used 34 | if random.randint(0, 500) == 0: 35 | # initialize the number of times the image is being 36 | # duplicated and write it to the output CSV file 37 | numTimes = random.randint(1, 8) 38 | output.write("%s,%d\n" % (filename, numTimes)) 39 | 40 | # loop over a random number of times for this image to 41 | # be duplicated 42 | for i in xrange(0, numTimes): 43 | image = Image.open(imagePath) 44 | 45 | # randomly resize the image, perserving aspect ratio 46 | factor = random.uniform(0.95, 1.05) 47 | width = int(image.size[0] * factor) 48 | ratio = width / float(image.size[0]) 49 | height = int(image.size[1] * ratio) 50 | image = image.resize((width, height), Image.ANTIALIAS) 51 | 52 | # generate a random filename for the image and copy 53 | # it to the output directory 54 | adjFilename = str(uuid.uuid4()) + ".jpg" 55 | shutil.copy(imagePath, args["output"] + "/" + adjFilename) 56 | 57 | # close the output file 58 | output.close() -------------------------------------------------------------------------------- /code/index.py: -------------------------------------------------------------------------------- 1 | # USAGE 2 | # python index.py --dataset images --shelve db.shelve 3 | 4 | # import the necessary packages 5 | from PIL import Image 6 | import imagehash 7 | import argparse 8 | import shelve 9 | import glob 10 | 11 | # construct the argument parse and parse the arguments 12 | ap = argparse.ArgumentParser() 13 | ap.add_argument("-d", "--dataset", required = True, 14 | help = "path to input dataset of images") 15 | ap.add_argument("-s", "--shelve", required = True, 16 | help = "output shelve database") 17 | args = vars(ap.parse_args()) 18 | 19 | # open the shelve database 20 | db = shelve.open(args["shelve"], writeback = True) 21 | 22 | # loop over the image dataset 23 | for imagePath in glob.glob(args["dataset"] + "/*.jpg"): 24 | # load the image and compute the difference hash 25 | image = Image.open(imagePath) 26 | h = str(imagehash.dhash(image)) 27 | 28 | # extract the filename from the path and update the database 29 | # using the hash as the key and the filename append to the 30 | # list of values 31 | filename = imagePath[imagePath.rfind("/") + 1:] 32 | db[h] = db.get(h, []) + [filename] 33 | 34 | # close the shelf database 35 | db.close() -------------------------------------------------------------------------------- /code/output.csv: -------------------------------------------------------------------------------- 1 | 84eba74d-38ae-4bf6-b8bd-79ffa1dad23a.jpg,2 2 | 329d1e69-9595-4a1e-abaf-eab35cb8aafa.jpg,1 3 | 0cd147af-86e8-4fe1-9846-b23344c58293.jpg,1 4 | 4b653480-a0d5-41e4-b91b-f814cd5289eb.jpg,1 5 | d09513c4-d671-45df-bd78-e2fa08192341.jpg,7 6 | 9d355a22-3d59-465e-ad14-138a4e3880bc.jpg,3 7 | 74c5e5ef-7c75-4ef0-bed0-26468f5534a8.jpg,8 8 | 273fe667-ae47-4b44-bc9a-8682c944e158.jpg,5 9 | 9c48e7f3-bd19-4650-acd7-6bfdc1490493.jpg,3 10 | 5134e0c2-34d3-40b6-9473-98de8be16c67.jpg,3 11 | df047086-73dc-480a-8de0-5b5d7b0c00b6.jpg,3 12 | cbc423f5-776e-41ba-817b-d108bdc2eec7.jpg,3 13 | 513d2809-afbe-4081-aee7-29bf61c6cf5a.jpg,6 14 | f2d069fe-9494-426a-b8ef-dfebe4d3da03.jpg,1 15 | 5db94304-23c4-42b2-aa17-6f9669c04071.jpg,8 16 | 1912d003-ff59-4585-9eed-58736297051c.jpg,7 17 | f27716c7-c44b-40e0-a754-9c47c50bcc23.jpg,7 18 | -------------------------------------------------------------------------------- /code/search.py: -------------------------------------------------------------------------------- 1 | # USAGE 2 | # python search.py --dataset images --shelve db.shelve --query images/84eba74d-38ae-4bf6-b8bd-79ffa1dad23a.jpg 3 | 4 | # import the necessary packages 5 | from PIL import Image 6 | import imagehash 7 | import argparse 8 | import shelve 9 | 10 | # construct the argument parse and parse the arguments 11 | ap = argparse.ArgumentParser() 12 | ap.add_argument("-d", "--dataset", required = True, 13 | help = "path to dataset of images") 14 | ap.add_argument("-s", "--shelve", required = True, 15 | help = "output shelve database") 16 | ap.add_argument("-q", "--query", required = True, 17 | help = "path to the query image") 18 | args = vars(ap.parse_args()) 19 | 20 | # open the shelve database 21 | db = shelve.open(args["shelve"]) 22 | 23 | # load the query image, compute the difference image hash, and 24 | # and grab the images from the database that have the same hash 25 | # value 26 | query = Image.open(args["query"]) 27 | h = str(imagehash.dhash(query)) 28 | filenames = db[h] 29 | print "Found %d images" % (len(filenames)) 30 | 31 | # loop over the images 32 | for filename in filenames: 33 | image = Image.open(args["dataset"] + "/" + filename) 34 | image.show() 35 | 36 | # close the shelve database 37 | db.close() --------------------------------------------------------------------------------