├── .gitignore ├── README.md ├── __init__.py ├── redis_search ├── __init__.py ├── chinese_pinyin.py ├── data │ └── Mandarin.dat ├── index.py ├── query.py └── util.py ├── setup.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.log 3 | *.idea 4 | *.DS_Store 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Redis-Search-Python 2 | 3 | High performance real-time search (Support Chinese), index in Redis for Python, Rewrite from [https://github.com/huacnlee/redis-search](https://github.com/huacnlee/redis-search) 4 | 5 | ## Features 6 | 7 | * Real-time search 8 | * High performance 9 | * Segment words search and prefix match search 10 | * Sort results by one field 11 | * Homophone search, pinyin search 12 | * Conditions support 13 | 14 | ## Requirements 15 | 16 | * redis 2.4+ 17 | * python 2.7+ 18 | * redis-py 2.8+ 19 | * mmseg [http://pypi.python.org/pypi/mmseg](http://pypi.python.org/pypi/mmseg) -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fireball2018/redis-search-py/525aeed49458caf1760d1c556b3a5e3094be1f49/__init__.py -------------------------------------------------------------------------------- /redis_search/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | -------------------------------------------------------------------------------- /redis_search/chinese_pinyin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #encoding: utf-8 3 | 4 | import os 5 | 6 | class Pinyin(object): 7 | """docstring for Pinyin""" 8 | 9 | p = None 10 | table = {} 11 | 12 | def to_unicode(self, value): 13 | """Converts a string argument to a unicode string. 14 | 15 | If the argument is already a unicode string or None, it is returned 16 | unchanged. Otherwise it must be a byte string and is decoded as utf8. 17 | """ 18 | if isinstance(value, (unicode, type(None))): 19 | return value 20 | assert isinstance(value, bytes) 21 | try: 22 | return unicode(value, "utf8") 23 | except: 24 | return value.encode("utf8").decode("utf8") 25 | 26 | def init_table(self): 27 | """docstring for init_table""" 28 | 29 | if self.table: 30 | return 31 | 32 | self.table = {} 33 | 34 | fp = open(os.path.join(os.path.dirname(__file__), "data", "Mandarin.dat")) 35 | lines = fp.read().strip().split("\n") 36 | for line in lines: 37 | key, value = line.split(' ', 1) 38 | self.table[key] = value 39 | 40 | def translate(self, chars, splitter = ' '): 41 | """docstring for translate""" 42 | 43 | self.init_table() 44 | results = [] 45 | is_english = False 46 | 47 | chars = self.to_unicode(chars) 48 | for char in chars: 49 | key = repr(char)[4:-1].upper() 50 | if key in self.table: 51 | if is_english: 52 | results.append(splitter) 53 | 54 | results.append(self.table[key].strip().split(" ", 1)[0][0:-1].lower()) 55 | results.append(splitter) 56 | 57 | is_english = False 58 | else: 59 | results.append(char) 60 | is_english = True 61 | 62 | return "".join(results).strip(splitter) 63 | 64 | @classmethod 65 | def t(self, chars, splitter = ' '): 66 | """docstring for t""" 67 | 68 | if not self.p: 69 | self.p = Pinyin() 70 | 71 | return self.p.translate(chars, splitter) 72 | 73 | if __name__ == "__main__": 74 | print Pinyin.t("hi梁小波") 75 | 76 | -------------------------------------------------------------------------------- /redis_search/index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import json 5 | import logging 6 | 7 | from chinese_pinyin import Pinyin 8 | 9 | import util 10 | from util import split_words, split_pinyin, utf8, mk_sets_key, mk_score_key, mk_condition_key, mk_complete_key 11 | 12 | class index(object): 13 | """docstring for Index""" 14 | 15 | def __init__(self, name, id, title, score="id", condition_fields=None, 16 | prefix_index_enable=True, exts=None, **kwargs): 17 | 18 | if isinstance(exts, dict): 19 | kwargs.update(exts) 20 | 21 | self.name = name 22 | self.title = utf8(title) 23 | self.id = id 24 | self.score = score 25 | self.exts = kwargs 26 | self.condition_fields = condition_fields if condition_fields and isinstance(condition_fields, list) else [] 27 | self.prefix_index_enable = prefix_index_enable 28 | 29 | def save(self): 30 | """docstring for save""" 31 | 32 | if not self.title: 33 | return False 34 | 35 | data = { 36 | 'name': self.name, 37 | 'id': self.id, 38 | 'title': self.title 39 | } 40 | 41 | if self.exts: 42 | data.update(self.exts) 43 | 44 | pipe = util.redis.pipeline() 45 | 46 | # 将原始数据存入 hashes 47 | res = pipe.hset(self.name, self.id, json.dumps(data)) 48 | 49 | # 保存 sets 索引,以分词的单词为key,用于后面搜索,里面存储 ids 50 | words = self.split_words_for_index(self.title) 51 | 52 | if not words: 53 | logging.info("no words") 54 | return False 55 | 56 | for word in words: 57 | key = mk_sets_key(self.name, word) 58 | 59 | # word index for item id 60 | pipe.sadd(key, self.id) 61 | 62 | if self.score == 'id': 63 | self.score = self.id 64 | 65 | # score for search sort 66 | pipe.set(mk_score_key(self.name, self.id), self.score) 67 | 68 | # 将目前的编号保存到条件(conditions)字段所创立的索引上面 69 | for field in self.condition_fields: 70 | pipe.sadd(mk_condition_key(self.name, field, utf8(data[field])), self.id) 71 | 72 | # commit 73 | pipe.execute() 74 | 75 | if self.prefix_index_enable: 76 | self.save_prefix_index() 77 | 78 | def remove(self, name, id, title): 79 | """docstring for remove""" 80 | 81 | pipe = util.redis.pipeline() 82 | 83 | pipe.hdel(name, id) 84 | words = self.split_words_for_index(title) 85 | 86 | for word in words: 87 | key = mk_sets_key(name, word) 88 | 89 | pipe.srem(key, id) 90 | pipe.delete(mk_score_key(name, id)) 91 | 92 | # remove set for prefix index key 93 | pipe.srem(mk_sets_key(name, title, id)) 94 | 95 | # commit 96 | pipe.execute() 97 | 98 | def split_words_for_index(self, title): 99 | """docstring for split_words_for_index""" 100 | 101 | words = split_words(title) 102 | if util.pinyin_match: 103 | words += split_pinyin(title) 104 | 105 | return words 106 | 107 | def save_fulltext_index(self): 108 | pass 109 | 110 | def save_prefix_index(self): 111 | """docstring for save_prefix_index""" 112 | 113 | words = [] 114 | words.append(self.title.lower()) 115 | 116 | pipe = util.redis.pipeline() 117 | 118 | pipe.sadd(mk_sets_key(self.name, self.title), self.id) 119 | 120 | if util.pinyin_match: 121 | pinyin = Pinyin.t(self.title.lower(), "") 122 | words += pinyin 123 | 124 | pipe.sadd(mk_sets_key(self.name, pinyin), self.id) 125 | 126 | key = mk_complete_key(self.name) 127 | for word in words: 128 | for i in range(0, len(word)): 129 | prefix = word[0:i] 130 | pipe.zadd(key, prefix, 0) 131 | 132 | pipe.zadd(key, word + "*", 0) 133 | 134 | # commit 135 | pipe.execute() 136 | 137 | -------------------------------------------------------------------------------- /redis_search/query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import time 5 | import json 6 | import logging 7 | 8 | from chinese_pinyin import Pinyin 9 | 10 | import util 11 | from util import split_words, split_pinyin, utf8, mk_sets_key, mk_score_key, mk_condition_key, mk_complete_key 12 | 13 | def query(name, text, offset=0, limit=10, sort_field='id', conditions=None): 14 | """docstring for query""" 15 | 16 | conditions = conditions if isinstance(conditions, dict) and conditions else {} 17 | 18 | tm = time.time() 19 | result = [] 20 | 21 | # 如果搜索文本和查询条件均没有,那就直接返回 [] 22 | if not text.strip() and not conditions: 23 | return result 24 | 25 | text = utf8(text.strip()) 26 | splited_words = split_words(text) 27 | 28 | words = [] 29 | for word in splited_words: 30 | words.append(mk_sets_key(name, word)) 31 | 32 | condition_keys = [] 33 | if conditions: 34 | for c in conditions: 35 | condition_keys.append(mk_condition_key(name, c, utf8(conditions[c]))) 36 | 37 | # 将条件的 key 放入关键词搜索集合内,用于 sinterstore 搜索 38 | words += condition_keys 39 | 40 | if not words: 41 | return result 42 | 43 | temp_store_key = "tmpinterstore:%s" % "+".join(words) 44 | 45 | if len(words) > 1: 46 | if not util.redis.exists(temp_store_key): 47 | # 将多个词语组合对比,得到交集,并存入临时区域 48 | util.redis.sinterstore(temp_store_key, words) 49 | 50 | # 将临时搜索设为1天后自动清除 51 | util.redis.expire(temp_store_key, 86400) 52 | 53 | # 拼音搜索 54 | if util.pinyin_match: 55 | splited_pinyin_words = split_pinyin(text) 56 | 57 | pinyin_words = [] 58 | for w in splited_pinyin_words: 59 | pinyin_words.append(mk_sets_key(name, w)) 60 | 61 | pinyin_words += condition_keys 62 | 63 | temp_sunion_key = "tmpsunionstore:%s" % "+".join(words) 64 | temp_pinyin_store_key = "tmpinterstore:%s" % "+".join(pinyin_words) 65 | 66 | # 找出拼音的 67 | util.redis.sinterstore(temp_pinyin_store_key, pinyin_words) 68 | 69 | # 合并中文和拼音的搜索结果 70 | util.redis.sunionstore(temp_sunion_key, [temp_store_key, temp_pinyin_store_key]) 71 | 72 | # 将临时搜索设为1天后自动清除 73 | util.redis.expire(temp_pinyin_store_key, 86400) 74 | util.redis.expire(temp_sunion_key, 86400) 75 | 76 | temp_store_key = temp_sunion_key 77 | else: 78 | temp_store_key = words[0] 79 | 80 | # 根据需要的数量取出 ids 81 | ids = util.redis.sort(temp_store_key, 82 | start = offset, 83 | num = limit, 84 | by = mk_score_key(name, "*"), 85 | desc = True) 86 | 87 | result = util.hmget(name, ids, sort_field=sort_field) 88 | logging.debug("%s:\"%s\" | Time spend:%ss" % (name, text, time.time()-tm)) 89 | return result 90 | 91 | def complete(name, keyword, limit=10, conditions=None): 92 | """docstring for complete""" 93 | 94 | conditions = conditions if isinstance(conditions, dict) and conditions else {} 95 | 96 | if not keyword and not conditions: 97 | logging.debug("no word and conditions") 98 | return [] 99 | 100 | keyword = utf8(keyword.strip()) 101 | prefix_matchs = [] 102 | 103 | # This is not random, try to get replies < MTU size 104 | rangelen = util.complete_max_length 105 | prefix = keyword.lower() 106 | key = mk_complete_key(name) 107 | 108 | start = util.redis.zrank(key, prefix) 109 | 110 | if start: 111 | count = limit 112 | max_range = start+(rangelen*limit)-1 113 | entries = util.redis.zrange(key, start, max_range) 114 | 115 | while len(prefix_matchs) <= count: 116 | 117 | start += rangelen 118 | if not entries or len(entries) == 0: 119 | break 120 | 121 | for entry in entries: 122 | minlen = min(len(entry), len(prefix)) 123 | 124 | if entry[0:minlen] != prefix[0:minlen]: 125 | count = len(prefix_matchs) 126 | break 127 | 128 | if entry[-1] == "*" and len(prefix_matchs) != count: 129 | 130 | match = entry[:-1] 131 | if match not in prefix_matchs: 132 | prefix_matchs.append(match) 133 | 134 | entries = entries[start:max_range] 135 | 136 | # 组合 words 的特别 key 名 137 | words = [] 138 | for word in prefix_matchs: 139 | words.append(mk_sets_key(name, word)) 140 | 141 | # 组合特别 key ,但这里不会像 query 那样放入 words, 因为在 complete 里面 words 是用 union 取的,condition_keys 和 words 应该取交集 142 | condition_keys = [] 143 | if conditions: 144 | for c in conditions: 145 | condition_keys.append(mk_condition_key(name, c, utf8(conditions[c]))) 146 | 147 | # 按词语搜索 148 | temp_store_key = "tmpsunionstore:%s" % "+".join(words) 149 | if len(words) == 0: 150 | logging.info("no words") 151 | elif len(words) > 1: 152 | if not util.redis.exists(temp_store_key): 153 | 154 | # 将多个词语组合对比,得到并集,并存入临时区域 155 | util.redis.sunionstore(temp_store_key, words) 156 | 157 | # 将临时搜索设为1天后自动清除 158 | util.redis.expire(temp_store_key, 86400) 159 | # 根据需要的数量取出 ids 160 | else: 161 | temp_store_key = words[0] 162 | 163 | # 如果有条件,这里再次组合一下 164 | if condition_keys: 165 | if not words: 166 | condition_keys += temp_store_key 167 | 168 | temp_store_key = "tmpsinterstore:%s" % "+".join(condition_keys) 169 | if not util.redis.exists(temp_store_key): 170 | util.redis.sinterstore(temp_store_key, condition_keys) 171 | util.redis.expire(temp_store_key, 86400) 172 | 173 | ids = util.redis.sort(temp_store_key, 174 | start = 0, 175 | num = limit, 176 | by = mk_score_key(name, "*"), 177 | desc = True) 178 | if not ids: 179 | return [] 180 | 181 | return util.hmget(name, ids) 182 | 183 | -------------------------------------------------------------------------------- /redis_search/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import json 5 | 6 | from chinese_pinyin import Pinyin 7 | 8 | from mmseg import seg_txt 9 | from mmseg.search import seg_txt_search, seg_txt_2_dict 10 | 11 | complete_max_length = 10 12 | pinyin_match = True 13 | debug = True 14 | 15 | redis = None 16 | 17 | def hmget(name, ids, sort_field='id'): 18 | """docstring for hmget""" 19 | 20 | result = [] 21 | if not ids: 22 | return result 23 | 24 | for r in redis.hmget(name, ids): 25 | if r: 26 | result.append(json.loads(r)) 27 | 28 | return result 29 | 30 | def mk_sets_key(name, word): 31 | """docstring for mk_sets_key""" 32 | 33 | return "%s:%s" % (name, word.lower()) 34 | 35 | def mk_score_key(name, id): 36 | """docstring for mk_score_key""" 37 | 38 | return "%s:_score_:%s" % (name, id) 39 | 40 | def mk_condition_key(name, field, id): 41 | """docstring for mk_condition_key""" 42 | 43 | return "%s:_by:_%s:%s" % (name, field, id) 44 | 45 | def mk_complete_key(name): 46 | """docstring for mk_complete_key""" 47 | return "Compl%s" % name 48 | 49 | def split_pinyin(text): 50 | """docstring for split_pinyin""" 51 | 52 | return split_words(Pinyin.t(text)) 53 | 54 | def split_words(text): 55 | """docstring for split_words""" 56 | 57 | words = [] 58 | for i in seg_txt_search(text): 59 | words.append(i) 60 | 61 | return words 62 | 63 | def utf8(value): 64 | if isinstance(value, (bytes, type(None))): 65 | return value 66 | assert isinstance(value, unicode) 67 | return value.encode("utf-8") -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup 5 | 6 | setup( 7 | name='redis_search', 8 | version='0.2', 9 | packages=['redis_search'], 10 | include_package_data = True, 11 | package_data = { 12 | '': ['data/*.dat'], 13 | }, 14 | author='jiedan', 15 | author_email='lxb429@gmail.com', 16 | license='MIT License', 17 | description="High performance real-time search (Support Chinese), indexes store in Redis for Python", 18 | keywords ='redis search', 19 | url='https://github.com/jiedan/redis-search-py.git', 20 | ) -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import logging 5 | import redis 6 | import time 7 | 8 | import redis_search 9 | 10 | from redis_search.util import split_words 11 | from redis_search.index import index 12 | from redis_search.query import query, complete 13 | 14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s:%(msecs)03d %(levelname)-8s %(message)s', 15 | datefmt='%m-%d %H:%M') 16 | 17 | words = split_words("最主要的更动是:张无忌最后没有选定自己的配偶。:,.") 18 | for w in words: 19 | print w 20 | 21 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0) 22 | redis_search.util.redis = redis.Redis(connection_pool=pool) 23 | 24 | i = index("test", 1, "Redis") 25 | i.save() 26 | 27 | i = index("test", 2, "Redhat") 28 | i.save() 29 | 30 | i = index("test", 3, "张无忌最后没有选定自己的配偶", "id", exts= { 31 | 'username':"jiedan", 'email':'lxb429@gmail.com' 32 | }, password="123456") 33 | i.save() 34 | 35 | i = index("test", 4, "Redis 是一个高性能的key-value数据库", "id", exts= { 36 | 'username':"jiedan", 'email':'lxb429@gmail.com' 37 | }) 38 | i.save() 39 | 40 | i = index("test", 6, "回明朝当皇帝", "id", exts={"title":"回明朝当皇帝"}) 41 | i.save() 42 | 43 | i = index("test", 7, "回明朝做皇帝", "id", exts={"title":"回明朝做皇帝"}) 44 | i.save() 45 | 46 | print "自动完成: r" 47 | users = complete('test', "r") 48 | 49 | for user in users: 50 | print user['id'], user['title'] 51 | 52 | print "-"*10 53 | print "自动完成: redi" 54 | users = complete('test', "redi") 55 | 56 | for user in users: 57 | print user['id'], user['title'] 58 | 59 | print "-"*10 60 | print "自动完成: 张" 61 | users = complete('test', "张") 62 | 63 | for user in users: 64 | print user['id'], user['title'] 65 | 66 | print "-"*10 67 | print "搜索: Redis" 68 | users = query('test', "Redis") 69 | 70 | for user in users: 71 | print user['id'], user['title'] 72 | 73 | print "-"*10 74 | print "搜索: 张无忌" 75 | users = query('test', "张无忌") 76 | 77 | for user in users: 78 | print user['id'], user['title'] 79 | 80 | print "-"*10 81 | print "搜索: 回明朝做皇帝" 82 | users = query('test', "回明朝做皇帝") 83 | 84 | for user in users: 85 | print user['id'], user['title'] 86 | 87 | print "-"*10 88 | print "搜索: 皇帝" 89 | users = query('test', "当皇帝") 90 | 91 | for user in users: 92 | print user['id'], user['title'] 93 | 94 | print "-"*10 95 | print "拼音搜索: zhang" 96 | users = query('test', "zhang") 97 | 98 | for user in users: 99 | print user['id'], user['title'] 100 | --------------------------------------------------------------------------------