├── .gitignore ├── LICENSE ├── README.md ├── imgdup.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | test 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Radu Ioan Fericean 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | imgdup 2 | ====== 3 | 4 | Visual similarity image finder and cleaner (image deduplication tool). 5 | 6 | Install 7 | ------- 8 | 9 | ``` 10 | pip install imgdup 11 | ``` 12 | 13 | or clone the repo and run imgdup.py file directly 14 | 15 | Usage 16 | ----- 17 | 18 | Should be run in the images folder. 19 | 20 | It will create a `duplicates` folder containing similar file pairs indicating which file was kept and which one is gone. You can later review similar files in the `duplicates` folder and decide if you delete or restore each `_GONE_` marked file. 21 | 22 | ```shell 23 | usage: imgdup.py [-h] [-c CMP] [-s SENSITIVITY] [-i] [-d] [-u] 24 | 25 | Compare images base on perceptual similarity. 
#! /usr/bin/env python
# inspired by: http://blog.iconfinder.com/detecting-duplicate-images-using-python/
"""Visual similarity image finder and cleaner (image deduplication tool).

Run it inside the folder that holds the images.  Every image is reduced to a
difference hash; when two hashes are close enough the pair is considered a
duplicate.  For each pair the image that wins the comparison (``--cmp``) is
copied into ``duplicates/`` with a ``_KEPT_`` marker and the loser is moved
there with a ``_GONE_`` marker.  Both get the same 5-character prefix so the
pair sorts together.  ``--undo`` moves the ``_GONE_`` files back.

WARNING: back up the image set before running this script!
"""

import argparse
import os
import shutil
import sys
from glob import glob
from hashlib import md5

try:
    from PIL import Image
except ImportError:
    # Keep the pure-Python helpers importable without Pillow; main() reports
    # the missing dependency before any image is opened.
    Image = None

DUP_FOLDER = 'duplicates'
KEEP_SUFIX = '_KEPT_'
DELETE_SUFIX = '_GONE_'
KEEP = '%s'+KEEP_SUFIX
DELETE = '%s'+DELETE_SUFIX
# Number of md5 hex digits used to tie the two files of a pair together.
PREFIX_LEN = 5
IMAGE_PATTERNS = ('*.jpg', '*.png', '*.gif', '*.jpeg')


def dhash(image, hash_size = 8):
    """Return the difference hash of *image* as a hexadecimal string.

    The image is converted to grayscale and shrunk to
    ``(hash_size + 1) x hash_size`` pixels.  Each bit of the hash records
    whether a pixel is brighter than its right-hand neighbour.  Bits are
    packed least-significant-first into bytes, and each byte is emitted as
    two hex digits.  Trailing bits that do not fill a whole byte are dropped.

    image     -- a PIL image (anything with convert/resize/getpixel)
    hash_size -- number of comparisons per row and number of rows
    """
    # Grayscale and shrink the image in one step.  LANCZOS is the filter that
    # used to be called ANTIALIAS; that alias was removed in Pillow 10.
    image = image.convert('L').resize(
        (hash_size + 1, hash_size),
        Image.LANCZOS,
    )

    # Compare adjacent pixels.
    difference = []
    for row in range(hash_size):
        for col in range(hash_size):
            pixel_left = image.getpixel((col, row))
            pixel_right = image.getpixel((col + 1, row))
            difference.append(pixel_left > pixel_right)

    # Convert the binary array to a hexadecimal string, one byte at a time.
    hex_string = []
    for start in range(0, len(difference) - 7, 8):
        byte = 0
        for bit, is_set in enumerate(difference[start:start + 8]):
            if is_set:
                byte |= 1 << bit
        hex_string.append('%02x' % byte)

    return ''.join(hex_string)


class ImgInfo(object):
    """File name and pixel dimensions of an image, ordered by *cmp_func*.

    name     -- path of the image file
    size     -- (width, height) tuple, stored as ``res``
    cmp_func -- callable taking an ImgInfo and returning a sortable value
                (see ``resolution`` and ``size``)

    Only ``__lt__`` and ``__eq__`` are defined; ``a > b`` works because
    Python falls back to the reflected ``b < a``.
    """

    def __init__(self, name, size, cmp_func):
        self.name = name
        self.res = size
        self.cmp_func = cmp_func

    def __lt__(self, other):
        return self.cmp_func(self) < self.cmp_func(other)

    def __eq__(self, other):
        return self.cmp_func(self) == self.cmp_func(other)


class ImgHash(object):
    """A hex hash string that compares equal to *similar* hashes.

    val         -- hash string as produced by ``dhash``
    info        -- the ImgInfo of the hashed image, stored as ``img_info``
    sensitivity -- maximum number of differing hex characters that still
                   counts as equal

    NOTE: equality is fuzzy while ``__hash__`` is exact, so two "equal"
    objects may hash differently.  Instances are meant to be searched with
    ``list.index``, not used as dict or set keys.
    """

    def __init__(self, val, info, sensitivity=0):
        self.val = val
        self.sensitivity = sensitivity
        self.img_info = info

    def __eq__(self, other):
        if not isinstance(other, ImgHash):
            return NotImplemented
        # Hashes of different length cannot be compared position by position.
        if len(self.val) != len(other.val):
            return False
        # Hamming distance between equal-length sequences, counted per hex
        # character (not per bit).
        hamming_distance = sum(ch1 != ch2 for ch1, ch2 in zip(self.val, other.val))
        return hamming_distance <= self.sensitivity

    def __hash__(self):
        return hash(self.val)

    def __str__(self):
        return self.val


def resolution(self):
    """Comparison key: pixel count (width * height) of an ImgInfo."""
    return self.res[0] * self.res[1]


def size(self):
    """Comparison key: size in bytes of the file behind an ImgInfo."""
    statinfo = os.stat(self.name)
    return statinfo.st_size


# Comparison functions selectable with -c/--cmp.
CMP_FUNCS = {'resolution': resolution, 'size': size}


def compa(v1, v2, invert):
    """Return True when v1 should be kept over v2.

    The higher value wins; with *invert* set the lower value wins.
    """
    return v1 > v2 if not invert else v2 > v1


def _parse_args(argv=None):
    """Build the command line parser and parse *argv* (default: sys.argv)."""
    parser = argparse.ArgumentParser(description='Compare images base on perceptual similarity.')
    parser.add_argument('-c','--cmp', default='resolution', metavar='CMP',
        choices=sorted(CMP_FUNCS),
        help='compare images by function and keep higher (resolution, size [resolution])')
    parser.add_argument('-s','--sensitivity', default=0, type=int,
        help='how similar images must be to be considered duplicates (0 - very similar, 5 - shomehow similar)')
    parser.add_argument('-i','--invert', action='store_true',
        help='invert the compartison function (keep lower)')
    parser.add_argument('-d','--dry_run', action='store_true',
        help='just print the pairs')
    parser.add_argument('-u','--undo', action='store_true',
        help='put the moved files back')
    return parser.parse_args(argv)


def _undo():
    """Delete the _KEPT_ copies and move the _GONE_ files back in place."""
    for img_path in glob(os.path.join(DUP_FOLDER, '*')):
        # Look for the marker right after the pair prefix only, so an
        # original file name that happens to contain a marker is left intact.
        tagged = os.path.basename(img_path)[PREFIX_LEN:]
        if tagged.startswith(KEEP_SUFIX):
            os.remove(img_path)
        elif tagged.startswith(DELETE_SUFIX):
            file_name = tagged[len(DELETE_SUFIX):]
            shutil.move(img_path, file_name)
            print('recovered %s' % file_name)
    try:
        os.rmdir(DUP_FOLDER)
    except OSError:
        # Folder is missing or still holds unrelated files: leave it alone.
        pass


def _find_images():
    """Return the image files of the current folder, each listed once."""
    seen = set()
    images = []
    for pattern in IMAGE_PATTERNS:
        # On case-insensitive filesystems both globs return the same files;
        # without de-duplication a file would be matched against itself.
        for path in glob(pattern) + glob(pattern.upper()):
            if path not in seen:
                seen.add(path)
                images.append(path)
    return images


def _dup_path(template, prefix, name):
    """Return the path inside DUP_FOLDER for *name* tagged with *template*."""
    return os.path.join(DUP_FOLDER, template % prefix + name)


def main(argv=None):
    """Run the command line tool and return the process exit status."""
    args = _parse_args(argv)

    if args.sensitivity < 0 or args.sensitivity > 5:
        print('Invalid sensitivity value %d (0, 5)' % args.sensitivity)
        return 1

    if args.undo:
        _undo()
        return 0

    if Image is None:
        sys.stderr.write('imgdup requires Pillow (pip install pillow)\n')
        return 1

    comp = CMP_FUNCS[args.cmp]
    images = _find_images()
    print('Found %d files.'%len(images))

    img_list = []
    duplicates = 0
    for count, img_path in enumerate(images):
        sys.stdout.write("\r%d%%" % (count * 100 // len(images)))
        sys.stdout.flush()
        try:
            # Close the file before it may be moved below.
            with Image.open(img_path) as img:
                ii1 = ImgInfo(img_path, img.size, comp)
                a = ImgHash(dhash(img), ii1, args.sensitivity)

            try:
                # hamming_distance comparison using specified sensitivity
                index = img_list.index(a)
            except ValueError:
                img_list.append(a)
                continue

            duplicates += 1
            ii2 = img_list[index].img_info
            if not args.dry_run:
                if not os.path.exists(DUP_FOLDER):
                    os.mkdir(DUP_FOLDER)
                # prefix files with the same hash to make them a pair
                prefix = md5((ii1.name + ii2.name).encode('utf-8')).hexdigest()[:PREFIX_LEN]
                if compa(ii1, ii2, args.invert):
                    shutil.copy(ii1.name, _dup_path(KEEP, prefix, ii1.name))
                    shutil.move(ii2.name, _dup_path(DELETE, prefix, ii2.name))
                    img_list[index] = a # new file was kept
                else:
                    shutil.move(ii1.name, _dup_path(DELETE, prefix, ii1.name))
                    shutil.copy(ii2.name, _dup_path(KEEP, prefix, ii2.name))
            print("\r%s and %s are too similar" % (ii2.name, ii1.name))
        except IOError as err:
            print("\rerror processing %s: %s" % (img_path, err))

    print("\rFound %d duplicates"%duplicates)
    return 0


if __name__ == '__main__':
    sys.exit(main())