├── .gitignore ├── MIT.LICENSE ├── README.md ├── autocomplete ├── __init__.py ├── index.py └── utils.py ├── setup.py └── test ├── input.json └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | 16 | # Installer logs 17 | pip-log.txt 18 | 19 | # Unit test / coverage reports 20 | .coverage 21 | .tox 22 | 23 | #Translations 24 | *.mo 25 | 26 | #Mr Developer 27 | .mr.developer.cfg 28 | -------------------------------------------------------------------------------- /MIT.LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2012 Feng LI 2 | 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining 5 | a copy of this software and associated documentation files (the 6 | "Software"), to deal in the Software without restriction, including 7 | without limitation the rights to use, copy, modify, merge, publish, 8 | distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so, subject to 10 | the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 19 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 20 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 21 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | autocomplete-redis 2 | ============ 3 | 4 | autocomplete-redis is a quora like automatic autocompletion based on redis. 5 | 6 | Installation 7 | --------- 8 | 9 | * install pip (if you haven't yet) `easy_install pip` 10 | 11 | * install pymmseg (support for Chinese Characters): `pip install pymmseg` 12 | 13 | * install autocomplete-redis: `pip install -e git+https://github.com/fengli/autocomplete-redis.git#egg=autocomplete-dev` 14 | 15 | Quick start 16 | ---------- 17 | * Assume you have few items to index. 18 | 19 | ```python 20 | items=[{"uid":'1', "score":9, "term": u"hello world, that's great"}, 21 | {"uid":'2', "score":10, "term": u"what the hell or yell"}, 22 | {"uid":'3', "score":8.5, "term":u"World is like a box of chocolate"}, 23 | ] 24 | ``` 25 | 26 | The code for build the index and search is simple: 27 | 28 | ```python 29 | from autocomplete import Autocomplete 30 | 31 | #build index 32 | au = Autocomplete ("scope") 33 | for item in items: 34 | au.add_item (item) 35 | #search 36 | restuls = au.search_query (u'hel') 37 | 38 | print results 39 | [{'term': 'what the hell or yell', 'score': 10, 'uid': '2'}, {'term': "hello world, that's great", 'score': 9, 'uid': '1'}] 40 | ``` 41 | 42 | 43 | API 44 | --------------- 45 | 46 | * Convention: the item you pass to `autocomplete` should have at least `"uid"` and `"term"`, `"score"` is optional, but it's important if you want to return based on ranking. And you could have other fields as you like. 47 | 48 | ```python 49 | {"uid":'1', "score":9, "term": u"hello world, that's great", 'meta':"1992"} 50 | ``` 51 | * `uid`: the unique identifier for your item 52 | * `score`: the returned items sorted by this value. 53 | * `term`: the string to be indexed. 54 | 55 | * `def __init__ (self, scope, redisaddr="localhost", limits=5, cached=True)` 56 | 57 | * scope: Scope allows you to index multiple independent indexes. 58 | * redisaddr: your redis address 59 | * limits: How many results you want to get. 60 | * cached: Cache multiple keys combination? 61 | 62 | * `def del_index (self)` 63 | 64 | Delete all the indexes. Warning: all data will be deleted. 65 | 66 | * `def add_item (self,item)` 67 | 68 | Add item to index. 69 | 70 | * `def del_item (self,item)` 71 | 72 | Delete item from index. 73 | 74 | * `def update_item (self, item)` 75 | 76 | Update item indexed with item['uid'] with the new version. 77 | 78 | * `def search_query (self,prefix)` 79 | 80 | Search in database for all items that `item['term']` included `PREFIX` 81 | 82 | Bring to you by: 83 | ---------------- 84 | 85 | * http://readpi.com 86 | -------------------------------------------------------------------------------- /autocomplete/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | 3 | from index import * 4 | 5 | -------------------------------------------------------------------------------- /autocomplete/index.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import redis 3 | 4 | try: 5 | import simplejson 6 | except: 7 | from django.utils import simplejson 8 | 9 | try: 10 | from django.core import serializers 11 | from django.db.models.loading import get_model 12 | except: 13 | pass 14 | 15 | import mmseg 16 | from autocomplete.utils import queryset_iterator 17 | 18 | class Autocomplete (object): 19 | """ 20 | autocomplete. 21 | """ 22 | 23 | def __init__ (self, scope, redisaddr="localhost", limits=5, cached=True): 24 | self.r = redis.Redis (redisaddr) 25 | self.scope = scope 26 | self.cached=cached 27 | self.limits = limits 28 | self.database = "database:%s" % scope 29 | self.indexbase = "indexbase:%s" % scope 30 | mmseg.Dictionary.load_dictionaries () 31 | 32 | def _get_index_key (self, key): 33 | return "%s:%s" % (self.indexbase, key) 34 | 35 | def del_index (self): 36 | prefixs = self.r.smembers (self.indexbase) 37 | for prefix in prefixs: 38 | self.r.delete(self._get_index_key(prefix)) 39 | self.r.delete(self.indexbase) 40 | self.r.delete(self.database) 41 | 42 | def sanity_check (self, item): 43 | """ 44 | Make sure item has key that's needed. 45 | """ 46 | for key in ("uid","term"): 47 | if not item.has_key (key): 48 | raise Exception ("Item should have key %s"%key ) 49 | 50 | def add_item (self,item): 51 | """ 52 | Create index for ITEM. 53 | """ 54 | self.sanity_check (item) 55 | self.r.hset (self.database, item.get('uid'), simplejson.dumps(item)) 56 | for prefix in self.prefixs_for_term (item['term']): 57 | self.r.sadd (self.indexbase, prefix) 58 | self.r.zadd (self._get_index_key(prefix),item.get('uid'), item.get('score',0)) 59 | 60 | def del_item (self,item): 61 | """ 62 | Delete ITEM from the index 63 | """ 64 | for prefix in self.prefixs_for_term (item['term']): 65 | self.r.zrem (self._get_index_key(prefix), item.get('uid')) 66 | if not self.r.zcard (self._get_index_key(prefix)): 67 | self.r.delete (self._get_index_key(prefix)) 68 | self.r.srem (self.indexbase, prefix) 69 | 70 | def update_item (self, item): 71 | self.del_item (item) 72 | self.add_item (item) 73 | 74 | def prefixs_for_term (self,term): 75 | """ 76 | Get prefixs for TERM. 77 | """ 78 | # Normalization 79 | term=term.lower() 80 | 81 | # Prefixs for term 82 | prefixs=[] 83 | tokens=mmseg.Algorithm(term) 84 | for token in tokens: 85 | word = token.text 86 | for i in xrange (1,len(word)+1): 87 | prefixs.append(word[:i]) 88 | 89 | return prefixs 90 | 91 | def normalize (self,prefix): 92 | """ 93 | Normalize the search string. 94 | """ 95 | tokens = mmseg.Algorithm(prefix.lower()) 96 | return [token.text for token in tokens] 97 | 98 | def search_query (self,prefix): 99 | search_strings = self.normalize (prefix) 100 | 101 | if not search_strings: return [] 102 | 103 | cache_key = self._get_index_key (('|').join(search_strings)) 104 | 105 | if not self.cached or not self.r.exists (cache_key): 106 | self.r.zinterstore (cache_key, map (lambda x: self._get_index_key(x), search_strings)) 107 | self.r.expire (cache_key, 10 * 60) 108 | 109 | ids=self.r.zrevrange (cache_key, 0, self.limits) 110 | if not ids: return ids 111 | return map(lambda x:simplejson.loads(x), 112 | self.r.hmget(self.database, *ids)) 113 | -------------------------------------------------------------------------------- /autocomplete/utils.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | def queryset_iterator(queryset, chunksize=1000): 4 | ''''' 5 | Iterate over a Django Queryset ordered by the primary key 6 | 7 | This method loads a maximum of chunksize (default: 1000) rows in it's 8 | memory at the same time while django normally would load all rows in it's 9 | memory. Using the iterator() method only causes it to not preload all the 10 | classes. 11 | 12 | Note that the implementation of the iterator does not support ordered query sets. 13 | ''' 14 | pk = 0 15 | last_pk = queryset.order_by('-pk')[0].pk 16 | queryset = queryset.order_by('pk') 17 | while pk < last_pk: 18 | for row in queryset.filter(pk__gt=pk)[:chunksize]: 19 | pk = row.pk 20 | yield row 21 | gc.collect() 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | long_description = open('README.md').read() 3 | VERSION = '1.0' 4 | 5 | setup( 6 | name='autocomplete', 7 | version=VERSION, 8 | packages=['autocomplete', 9 | ], 10 | description='Redis based autocompletion (build index and search query).', 11 | long_description=long_description, 12 | author='Feng Li', 13 | author_email='okidogii@gmail.com', 14 | license='MIT License', 15 | url='https://github.com/fengli/autocomplete-redis.git', 16 | platforms=["any"], 17 | classifiers=[ 18 | 'Development Status :: 1 - Beta', 19 | 'Intended Audience :: Developers', 20 | 'License :: OSI Approved :: MIT License', 21 | 'Natural Language :: English', 22 | 'Operating System :: OS Independent', 23 | 'Programming Language :: Python', 24 | ], 25 | ) 26 | -------------------------------------------------------------------------------- /test/input.json: -------------------------------------------------------------------------------- 1 | {"score": "9", "id": "1", "term": "轻轻地你走了"} 2 | {"score": "8", "id": "2", "term": "正如你轻轻地来"} 3 | {"score": "8.5", "id": "3", "term": "你挥一挥衣袖,不带走一片云彩"} 4 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | #-*-coding:utf-8-*- 2 | from autocomplete import Autocomplete 3 | import os 4 | import unittest 5 | 6 | class testAutocomplete (unittest.TestCase): 7 | def setUp (self): 8 | self.items=[{"uid":'1', "score":9, "term": u"轻轻地你走了"}, 9 | {"uid":'2', "score":10, "term": u"正如你轻轻地来"}, 10 | {"uid":'3', "score":8.5, "term":u"你挥一挥衣袖,不带走一片云彩"}, 11 | ] 12 | 13 | self.a=Autocomplete("scope") 14 | self.a.del_index() 15 | for item in self.items: 16 | self.a.add_item (item) 17 | 18 | def test_search_query2 (self): 19 | results=self.a.search_query (u'轻轻') 20 | self.assertEqual(len(results),2) 21 | self.assertEqual(results[0]['uid'],'2') 22 | self.assertEqual(results[1]['uid'],'1') 23 | 24 | def test_search_query3 (self): 25 | results=self.a.search_query (u'你 带走') 26 | self.assertEqual(len(results),1) 27 | self.assertEqual(results[0]['uid'],'3') 28 | 29 | def test_search_query4 (self): 30 | results=self.a.search_query (u'你挥一挥衣袖,不带走一片云彩') 31 | self.assertEqual(len(results),1) 32 | self.assertEqual(results[0]['uid'],'3') 33 | 34 | def test_update_item (self): 35 | item = {"uid":'1', "score":13, "term": u"轻轻地你走了"} 36 | self.a.update_item (item) 37 | results=self.a.search_query (u'轻轻') 38 | self.assertEqual(len(results),2) 39 | self.assertEqual(results[0]['uid'],'1') 40 | self.assertEqual(results[1]['uid'],'2') 41 | 42 | def test_del_item (self): 43 | item = {"uid":'1', "score":9, "term": u"轻轻地你走了"} 44 | self.a.del_item (item) 45 | results=self.a.search_query (u'轻轻') 46 | self.assertEqual(len(results),1) 47 | self.assertEqual(results[0]['uid'],'2') 48 | 49 | if __name__=='__main__': 50 | unittest.main () 51 | --------------------------------------------------------------------------------