├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── requirements.txt ├── setup.py ├── shingles ├── __init__.py ├── shingles.py └── util.py └── tests ├── test_shingles.py └── test_util.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pyo 4 | .DS_Store 5 | .Python 6 | .idea/ 7 | venv/ 8 | build/ 9 | dist/ 10 | *.egg-info/ 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Steven Samson 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | init: 2 | pip install -r requirements.txt 3 | 4 | test: 5 | python -m unittest discover -v tests 6 | 7 | clean: 8 | rm -f `find shingles -name '*.pyc'` 9 | rm -f `find shingles -name '*.pyo'` 10 | rm -f `find . -name '*~'` 11 | rm -rf build iso dist api text_shingles-$(VERSION) text_shingles.egg-info 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text Shingles Library 2 | 3 | This is python 3 library to support measuring the similarity of pieces of text based on their [MinHash](https://en.wikipedia.org/wiki/MinHash) signature generated from their k-shingle form. 4 | 5 | ## API 6 | 7 | Text can be represented in MinHash form by creating a new `ShingledText` instance and passing in text as well as optional values for the `random_seed` for hashing (default 5), the `shingle_length` aka the k in k-shingles (default 5), and the `minhash_size` for the size of the MinHash signature (default 200). Variables for the list form of the `minhash` and iterator representation of `shingles` are available for the object. A `similarity` function is also available to compute the [Jaccard similarity](https://en.wikipedia.org/wiki/Jaccard_index) of the two MinHash objects. 8 | 9 | ## Requirements 10 | 11 | This library utilizes Python 3, [NLTK](http://www.nltk.org), and [Murmur Hash](https://pypi.python.org/pypi/mmh3/2.3.1) 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | nltk>=3.1 2 | mmh3>=2.3.1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | with open('README.md') as f: 5 | readme = f.read() 6 | 7 | 8 | with open('LICENSE') as f: 9 | license = f.read() 10 | 11 | 12 | with open('requirements.txt') as f: 13 | requirements = f.read().splitlines() 14 | 15 | 16 | setup( 17 | name='shingles', 18 | version='0.0.1', 19 | description='k-shingles for text', 20 | long_description=readme, 21 | author='Steven Samson', 22 | author_email='steven.a.samson@gmail.com', 23 | url='https://github.com/steven-s/text-shingles', 24 | license=license, 25 | packages=find_packages(exclude=('tests')), 26 | install_requires=requirements 27 | ) 28 | -------------------------------------------------------------------------------- /shingles/__init__.py: -------------------------------------------------------------------------------- 1 | from shingles.shingles import * 2 | from shingles.util import * 3 | -------------------------------------------------------------------------------- /shingles/shingles.py: -------------------------------------------------------------------------------- 1 | import mmh3 2 | from nltk import ngrams 3 | 4 | from shingles.util import generate_random_seeds, minhash_similarity 5 | 6 | 7 | class ShingledText: 8 | def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200): 9 | split_text = text.split() 10 | if len(split_text) < shingle_length: 11 | raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length)) 12 | 13 | self.minhash = [] 14 | self.shingles = ngrams(split_text, shingle_length) 15 | 16 | for hash_seed in generate_random_seeds(minhash_size, random_seed): 17 | min_value = float('inf') 18 | for shingle in ngrams(split_text, shingle_length): 19 | value = mmh3.hash(' '.join(shingle), hash_seed) 20 | min_value = min(min_value, value) 21 | self.minhash.append(min_value) 22 | 23 | def similarity(self, other_shingled_text): 24 | return minhash_similarity(self.minhash, other_shingled_text.minhash) 25 | -------------------------------------------------------------------------------- /shingles/util.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def generate_random_seeds(n, seed=5): 5 | random.seed(seed) 6 | return random.sample(range(1, n + 1), n) 7 | 8 | 9 | def minhash_similarity(minhash_a, minhash_b): 10 | match_count = 0 11 | for a_item, b_item in zip(minhash_a, minhash_b): 12 | if a_item == b_item: 13 | match_count += 1 14 | return match_count / len(minhash_a) 15 | -------------------------------------------------------------------------------- /tests/test_shingles.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from shingles.shingles import * 4 | 5 | paragraph = u""" 6 | Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 7 | """ 8 | 9 | rearranged_paragraph = u""" 10 | Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqual. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo conseua 11 | """ 12 | 13 | another_paragraph = u""" 14 | Apparently she had forgotten her age and by force of habit employed all 15 | the old feminine arts. But as soon as the prince had gone her face 16 | resumed its former cold, artificial expression. She returned to the 17 | group where the vicomte was still talking, and again pretended to 18 | listen, while waiting till it would be time to leave. Her task was 19 | accomplished. 20 | """ 21 | 22 | class ShingledTextTestCase(unittest.TestCase): 23 | 24 | def test_shingling_short_text(self): 25 | short_text = u'hello there, im short' 26 | with self.assertRaises(ValueError): 27 | ShingledText(short_text) 28 | 29 | def test_shingling_paragraph(self): 30 | shingled_text = ShingledText(paragraph, 5, 5, 20) 31 | self.assertEqual(20, len(shingled_text.minhash)) 32 | 33 | def test_similarity(self): 34 | shingled_text = ShingledText(paragraph, 5, 5, 20) 35 | rearranged_shingled_text = ShingledText( 36 | rearranged_paragraph, 5, 5, 20) 37 | shingled_diff_text = ShingledText(another_paragraph, 5, 5, 20) 38 | self.assertTrue(0.6 <= shingled_text.similarity( 39 | rearranged_shingled_text)) 40 | self.assertTrue(0.5 > shingled_text.similarity(shingled_diff_text)) 41 | 42 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from shingles.util import * 4 | 5 | class UtilTestCase(unittest.TestCase): 6 | def test_generate_random_seeds(self): 7 | seeds = generate_random_seeds(20, 8) 8 | same_seeds = generate_random_seeds(20, 8) 9 | diff_seeds = generate_random_seeds(20) 10 | 11 | self.assertEqual(20, len(seeds)) 12 | self.assertEqual(seeds, same_seeds) 13 | self.assertNotEqual(seeds, diff_seeds) 14 | 15 | def test_minhash_similarity(self): 16 | minhash_a = [1, 2, 3, 4, 5, 6, 7, 8, 9] 17 | minhash_b = [1, 2, 3, 7, 8, 9, 10, 11, 12] 18 | 19 | self.assertEqual(3/9, minhash_similarity(minhash_a, minhash_b)) 20 | 21 | --------------------------------------------------------------------------------