├── README.md ├── .gitignore └── dHash.py /README.md: -------------------------------------------------------------------------------- 1 | # DHash 2 | Image comparison using the dHash algorithm. 3 | 使用dHash算法实现图片对比、相似图片查重。 4 | 5 | ## dHash简介 6 | dHash算法属于感知哈希算法,用于图片相似度对比。 7 | 8 | 感知哈希算法目前有: 9 | 10 | 1. aHash:平均值哈希,速度快,但准确率较低。 11 | 2. pHash:感知哈希,准确率高,但速度较慢。 12 | 3. `dHash:差异值哈希,速度快,且准确率高。` 13 | 14 | ## Function 15 | #### 1. 获得图片的dHash值: 16 | ```python 17 | hash = DHash.calculate_hash(image) 18 | ``` 19 | 20 | #### 2. 计算两张图片间的汉明距离: 21 | ```python 22 | hamming_distance = DHash.hamming_distance(image1, image2) 23 | ``` 24 | 25 | #### 3. 计算两个dHash值间的汉明距离: 26 | ```python 27 | hamming_distance = DHash.hamming_distance(dHash1, dHash2) 28 | ``` 29 | 30 | 31 | #### dHash算法实现详解: 32 | http://www.jianshu.com/p/193f0089b7a2 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
class DHash(object):
    """Difference hash (dHash) helpers for comparing images.

    All methods are static. An "image" is any object offering the PIL.Image
    methods used here: ``resize``, ``convert`` and ``getdata``. A "hash" is
    the hexadecimal string produced by :meth:`calculate_hash`.
    """

    @staticmethod
    def calculate_hash(image):
        """Return the dHash of *image* as a lowercase hexadecimal string.

        The 64 neighbour comparisons of the shrunken image each yield one
        bit. Bits are packed eight at a time, least-significant bit first,
        and every byte is written as two hex digits (zero padded), giving a
        16-character string.

        :param image: PIL.Image
        :return: dHash value (str)
        """
        difference = DHash.__difference(image)
        hex_bytes = []
        # Only complete groups of 8 bits are emitted.
        for start in range(0, len(difference) - 7, 8):
            byte_value = sum(
                bit << offset
                for offset, bit in enumerate(difference[start:start + 8])
            )
            hex_bytes.append("{:02x}".format(byte_value))
        return "".join(hex_bytes)

    @staticmethod
    def hamming_distance(first, second):
        """Return the Hamming distance between two images and/or dHash strings.

        Each argument may independently be an image or a dHash string. When
        at least one argument is a string, the other one is hashed (if it is
        not a string already) and the two hashes are compared.

        :param first: PIL.Image or dHash value (str)
        :param second: PIL.Image or dHash value (str)
        :return: Hamming distance (int). Larger means more different,
            smaller means more similar.
        :raises ValueError: if a string argument is not valid hexadecimal.
        """
        first_is_hash = isinstance(first, str)
        second_is_hash = isinstance(second, str)

        # A. At least one dHash string: compare in hash space. Checking both
        #    arguments (not only ``first``) lets image/hash pairs work too.
        if first_is_hash or second_is_hash:
            if not first_is_hash:
                first = DHash.calculate_hash(first)
            if not second_is_hash:
                second = DHash.calculate_hash(second)
            return DHash.__hamming_distance_with_hash(first, second)

        # B. Two images: count the positions where the difference bits disagree.
        first_bits = DHash.__difference(first)
        second_bits = DHash.__difference(second)
        return sum(
            bit1 != bit2 for bit1, bit2 in zip(first_bits, second_bits)
        )

    @staticmethod
    def __difference(image):
        """Return the neighbour-difference bits of *image*.

        *Private method*

        The image is resized to 9x8, converted to mode "L" (grayscale), and
        every pixel is compared with its right-hand neighbour in the same
        row. The resize-then-convert order is kept as is so that hashes stay
        comparable with previously computed ones.

        :param image: PIL.Image
        :return: list of bool, row by row; True where the left pixel is
            brighter than the right one (8 comparisons per row, 64 in total).
        """
        resize_width = 9
        resize_height = 8
        # 1. Shrink to 9x8: one extra column so each row yields 8 comparisons.
        smaller_image = image.resize((resize_width, resize_height))
        # 2. Grayscale.
        grayscale_image = smaller_image.convert("L")
        # 3. Compare horizontally adjacent pixels.
        pixels = list(grayscale_image.getdata())
        difference = []
        for row in range(resize_height):
            row_pixels = pixels[row * resize_width:(row + 1) * resize_width]
            difference.extend(
                left > right
                for left, right in zip(row_pixels, row_pixels[1:])
            )
        return difference

    @staticmethod
    def __hamming_distance_with_hash(dhash1, dhash2):
        """Return the Hamming distance between two dHash strings.

        *Private method*

        :param dhash1: hexadecimal str
        :param dhash2: hexadecimal str
        :return: Hamming distance (int)
        :raises ValueError: if either string is not valid hexadecimal.
        """
        # XOR leaves a 1 bit exactly where the two hashes differ.
        difference = int(dhash1, 16) ^ int(dhash2, 16)
        return bin(difference).count("1")