#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Packaging script for the ``fingerprint`` distribution."""
from setuptools import find_packages, setup


# Read the long description as UTF-8 explicitly: without ``encoding`` the
# file is decoded with the platform's locale encoding, which can differ
# between build machines.
with open('README.md', 'r', encoding='utf-8') as fp:
    README = fp.read()


setup(
    name='fingerprint',
    version='0.1.6',
    description='Document fingerprint generator',
    long_description=README,
    long_description_content_type='text/markdown',
    author='Kailash Budhathoki',
    author_email='kailash.buki@gmail.com',
    url='http://github.com/kailashbuki/fingerprint',
    license='MIT License',
    packages=find_packages(),
    platforms='any',
    python_requires='>=3.0',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Generates fingerprints of text document.
"""
import string
import sys

__author__ = 'kailash.buki@gmail.com (Kailash Budhathoki)'
__all__ = [
    'Fingerprint',
    'FingerprintException',
]


def guid(x):
    """Return the integer code point used as the hash digit of character x."""
    return ord(x)


class FingerprintException(Exception):
    """Raised when the input cannot be fingerprinted with the given settings."""


class Fingerprint(object):
    """
    Generates fingerprints of the input text file or plain string. Please
    consider taking a look at
    http://theory.stanford.edu/~aiken/publications/papers/sigmod03.pdf
    for detailed description on how fingerprints are computed.

    Attributes:
        kgram_len (Optional[int]): length of the contiguous substring.
            Defaults to 50.
        base (Optional[int]): base required for computing the rolling hash
            function. Defaults to 101.
        modulo (Optional[int]): hash values are reduced modulo this value.
            Defaults to sys.maxsize.
        window_len (Optional[int]): length of the windows when computing
            fingerprints. Defaults to 100.
        str (str): the text currently being fingerprinted.
        hashes (List(int)): hash values of the k-grams.
        fingerprints (List(tuple(int, int))): selected representative hash
            values along with their positions in ``hashes``.
    """

    def __init__(self, kgram_len=None, base=None, modulo=None, window_len=None):
        # Falsy values (None, 0) fall back to the defaults.
        # ``kgram_len`` and ``modulo`` must be assigned before ``base``:
        # the ``base`` setter derives the cached leading weight from them.
        self.kgram_len = kgram_len or 50
        self.modulo = modulo or sys.maxsize
        self.base = base or 101
        self.window_len = window_len or 100

    @property
    def base(self):
        """Base of the polynomial rolling hash."""
        return self.__base

    @base.setter
    def base(self, value):
        self.__base = value
        self._refresh_most_left()

    def _refresh_most_left(self):
        """Recompute the weight of the left-most character of a k-gram."""
        self.__most_left = pow(self.base, self.kgram_len - 1, self.modulo)

    def get_min_with_pos(self, sequence):
        """Return ``(minimum, position)`` of the right-most minimum value.

        An empty sequence yields ``(sys.maxsize, 0)``.
        """
        min_val = None
        min_pos = 0
        for pos, val in enumerate(sequence):
            # ``<=`` keeps the right-most occurrence on ties. Seeding with
            # None instead of sys.maxsize keeps the result correct when
            # values exceed sys.maxsize (possible with a larger modulo).
            if min_val is None or val <= min_val:
                min_val = val
                min_pos = pos
        if min_val is None:
            return sys.maxsize, 0
        return min_val, min_pos

    def normal_hash(self, kgram):
        """Return the polynomial hash of ``kgram`` computed from scratch."""
        value = 0
        for c in kgram:
            value = (value * self.base + guid(c)) % self.modulo
        return value

    def rolling_hash(self, old_hash, del_char, new_char):
        """Return the hash of the next k-gram derived from ``old_hash``.

        Args:
            old_hash (int): hash of the previous k-gram.
            del_char (str): first character of the previous k-gram.
            new_char (str): last character of the new k-gram.

        The leading weight used here is refreshed when ``base`` is assigned
        and at the start of ``hash_kgrams``.
        """
        value = ((old_hash - guid(del_char) * self.__most_left)
                 * self.base + guid(new_char))
        return value % self.modulo

    def prepare_storage(self):
        """Reset the per-document state before a new run."""
        self.hashes = []
        self.fingerprints = []
        self.str = ""

    def generate_kgrams(self):
        """Yield every contiguous substring of length ``kgram_len``."""
        for i in range(len(self.str) - self.kgram_len + 1):
            yield self.str[i:i + self.kgram_len]

    def hash_kgrams(self):
        """Fill ``self.hashes`` with the hash of every k-gram of ``self.str``."""
        # ``kgram_len`` / ``modulo`` may have been changed since ``base`` was
        # last assigned; refresh so rolling hashes agree with normal_hash.
        self._refresh_most_left()

        kgram_iter = self.generate_kgrams()
        prev_kgram = next(kgram_iter, None)
        if prev_kgram is None:
            # Text shorter than one k-gram: nothing to hash.
            return
        prev_hash = self.normal_hash(prev_kgram)
        self.hashes.append(prev_hash)

        for cur_kgram in kgram_iter:
            prev_hash = self.rolling_hash(
                prev_hash, prev_kgram[0], cur_kgram[-1])
            self.hashes.append(prev_hash)
            prev_kgram = cur_kgram

    def generate_fingerprints(self):
        """Select the right-most minimum hash of every window of hashes.

        A ``(hash, position)`` pair is appended to ``self.fingerprints`` for
        the first window and whenever the selected minimum changes.
        """
        last = None
        for offset in range(len(self.hashes) - self.window_len + 1):
            window = self.hashes[offset:offset + self.window_len]
            min_hash, min_pos = self.get_min_with_pos(window)
            min_pos += offset  # window-relative -> index into self.hashes
            if last is None or min_hash != last[0] or min_pos > last[1]:
                last = (min_hash, min_pos)
                self.fingerprints.append(last)

    def validate_config(self):
        """Check that the text is long enough for the configured lengths.

        Raises:
            FingerprintException: if the text is shorter than the window, or
                if it yields fewer k-grams than one window needs.
        """
        if len(self.str) < self.window_len:
            raise FingerprintException(
                "Length of the string is smaller than the length of the window.")
        # One window needs ``window_len`` hashes, i.e. that many k-grams.
        if len(self.str) - self.kgram_len + 1 < self.window_len:
            raise FingerprintException(
                "The string yields fewer k-grams than the length of the window.")

    def sanitize(self, str):
        """Return ``str`` without ASCII punctuation, newlines and spaces."""
        exclude = set(string.punctuation + '\n\r ')  # or rather + string.whitespace
        return ''.join(c for c in str if c not in exclude)

    def generate(self, str=None, fpath=None):
        """Generate fingerprints of the input.

        Either provide `str` to compute the fingerprint directly from your
        string or `fpath` to compute it from the text of the file. When both
        are given, `fpath` takes precedence.

        NOTE: a string input is passed through ``sanitize``; file contents
        are used verbatim.

        Args:
            str (Optional(str)): string whose fingerprint is to be computed.
            fpath (Optional(str)): path of the text file whose fingerprint is
                to be computed.

        Returns:
            List(tuple(int, int)): ``(hash, position)`` fingerprints of the
            input.

        Raises:
            FingerprintException: If no input is given, or the input does not
                meet the requirements of the parameters provided for
                fingerprinting.
        """
        self.prepare_storage()
        if fpath:
            self.str = self.load_file(fpath)
        elif str is not None:
            self.str = self.sanitize(str)
        else:
            raise FingerprintException(
                "Either `str` or `fpath` must be provided.")
        self.validate_config()
        self.hash_kgrams()
        self.generate_fingerprints()
        return self.fingerprints

    def load_file(self, fpath):
        """Return the content of the text file at ``fpath`` decoded as UTF-8."""
        # Decode at the I/O boundary instead of relying on the locale.
        with open(fpath, 'r', encoding='utf-8') as fp:
            return fp.read()