├── .gitignore
├── README.md
├── __init__.py
├── redis_search
    ├── __init__.py
    ├── chinese_pinyin.py
    ├── data
    │   └── Mandarin.dat
    ├── index.py
    ├── query.py
    └── util.py
├── setup.py
└── test.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.log
3 | *.idea
4 | *.DS_Store
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Redis-Search-Python
 2 | 
 3 | High-performance real-time search (with Chinese support) for Python, with indexes stored in Redis. A rewrite of [https://github.com/huacnlee/redis-search](https://github.com/huacnlee/redis-search).
 4 | 
 5 | ## Features
 6 | 
 7 | * Real-time search
 8 | * High performance
 9 | * Word-segmentation search and prefix-match search
10 | * Sort results by one field
11 | * Homophone search, pinyin search
12 | * Conditions support
13 | 
14 | ## Requirements
15 | 
16 | * redis 2.4+
17 | * python 2.7+
18 | * redis-py 2.8+
19 | * mmseg [http://pypi.python.org/pypi/mmseg](http://pypi.python.org/pypi/mmseg)


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fireball2018/redis-search-py/525aeed49458caf1760d1c556b3a5e3094be1f49/__init__.py


--------------------------------------------------------------------------------
/redis_search/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | 


--------------------------------------------------------------------------------
/redis_search/chinese_pinyin.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #encoding: utf-8
 3 | 
 4 | import os
 5 | 
 6 | class Pinyin(object):
 7 |     """docstring for Pinyin"""
 8 | 
 9 |     p = None
10 |     table = {}
11 | 
12 |     def to_unicode(self, value):
13 |         """Converts a string argument to a unicode string.
14 | 
15 |         If the argument is already a unicode string or None, it is returned
16 |         unchanged.  Otherwise it must be a byte string and is decoded as utf8.
17 |         """
18 |         if isinstance(value, (unicode, type(None))):
19 |             return value
20 |         assert isinstance(value, bytes)
21 |         try:
22 |             return unicode(value, "utf8")
23 |         except:
24 |             return value.encode("utf8").decode("utf8")
25 | 
26 |     def init_table(self):
27 |         """docstring for init_table"""
28 | 
29 |         if self.table:
30 |             return
31 |         
32 |         self.table = {}
33 |         
34 |         fp = open(os.path.join(os.path.dirname(__file__), "data", "Mandarin.dat"))
35 |         lines = fp.read().strip().split("\n")
36 |         for line in lines:
37 |             key, value = line.split('	', 1)
38 |             self.table[key] = value
39 |     
40 |     def translate(self, chars, splitter = ' '):
41 |         """docstring for translate"""
42 |         
43 |         self.init_table()
44 |         results = []
45 |         is_english = False
46 | 
47 |         chars = self.to_unicode(chars)
48 |         for char in chars:
49 |             key = repr(char)[4:-1].upper()
50 |             if key in self.table:
51 |                 if is_english:
52 |                     results.append(splitter)
53 |                     
54 |                 results.append(self.table[key].strip().split(" ", 1)[0][0:-1].lower())
55 |                 results.append(splitter)
56 |                 
57 |                 is_english = False
58 |             else:
59 |                 results.append(char)
60 |                 is_english = True
61 |         
62 |         return "".join(results).strip(splitter)
63 |     
64 |     @classmethod
65 |     def t(self, chars, splitter = ' '):
66 |         """docstring for t"""
67 | 
68 |         if not self.p:
69 |             self.p = Pinyin()
70 | 
71 |         return self.p.translate(chars, splitter)
72 |     
if __name__ == "__main__":
    # Manual smoke check: mixed ASCII and Chinese input.
    sample = "hi梁小波"
    print(Pinyin.t(sample))
75 | 
76 | 


--------------------------------------------------------------------------------
/redis_search/index.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | import json
  5 | import logging
  6 | 
  7 | from chinese_pinyin import Pinyin
  8 | 
  9 | import util
 10 | from util import split_words, split_pinyin, utf8, mk_sets_key, mk_score_key, mk_condition_key, mk_complete_key
 11 | 
 12 | class index(object):
 13 |     """docstring for Index"""
 14 |     
 15 |     def __init__(self, name, id, title, score="id", condition_fields=None, 
 16 |                     prefix_index_enable=True, exts=None, **kwargs):
 17 | 
 18 |         if isinstance(exts, dict):
 19 |             kwargs.update(exts)
 20 |         
 21 |         self.name  = name
 22 |         self.title = utf8(title)
 23 |         self.id    = id
 24 |         self.score = score
 25 |         self.exts  = kwargs
 26 |         self.condition_fields = condition_fields if condition_fields and isinstance(condition_fields, list) else []
 27 |         self.prefix_index_enable = prefix_index_enable
 28 |     
 29 |     def save(self):
 30 |         """docstring for save"""
 31 | 
 32 |         if not self.title:
 33 |             return False
 34 | 
 35 |         data = {
 36 |             'name': self.name,
 37 |             'id': self.id,
 38 |             'title': self.title
 39 |         }
 40 | 
 41 |         if self.exts:
 42 |             data.update(self.exts)
 43 | 
 44 |         pipe = util.redis.pipeline()
 45 | 
 46 |         # 将原始数据存入 hashes
 47 |         res = pipe.hset(self.name, self.id, json.dumps(data))
 48 | 
 49 |         # 保存 sets 索引，以分词的单词为key，用于后面搜索，里面存储 ids
 50 |         words = self.split_words_for_index(self.title)
 51 | 
 52 |         if not words:
 53 |             logging.info("no words")
 54 |             return False
 55 | 
 56 |         for word in words:
 57 |             key = mk_sets_key(self.name, word)
 58 | 
 59 |             # word index for item id
 60 |             pipe.sadd(key, self.id)
 61 | 
 62 |         if self.score == 'id':
 63 |             self.score = self.id
 64 |             
 65 |         # score for search sort
 66 |         pipe.set(mk_score_key(self.name, self.id), self.score)
 67 | 
 68 |         # 将目前的编号保存到条件(conditions)字段所创立的索引上面
 69 |         for field in self.condition_fields:
 70 |             pipe.sadd(mk_condition_key(self.name, field, utf8(data[field])), self.id)
 71 | 
 72 |         # commit
 73 |         pipe.execute()
 74 | 
 75 |         if self.prefix_index_enable:
 76 |             self.save_prefix_index()
 77 | 
 78 |     def remove(self, name, id, title):
 79 |         """docstring for remove"""
 80 |         
 81 |         pipe = util.redis.pipeline()
 82 | 
 83 |         pipe.hdel(name, id)
 84 |         words = self.split_words_for_index(title)
 85 | 
 86 |         for word in words:
 87 |             key = mk_sets_key(name, word)
 88 | 
 89 |             pipe.srem(key, id)
 90 |             pipe.delete(mk_score_key(name, id))
 91 |             
 92 |         # remove set for prefix index key
 93 |         pipe.srem(mk_sets_key(name, title, id))
 94 | 
 95 |         # commit
 96 |         pipe.execute()
 97 |     
 98 |     def split_words_for_index(self, title):
 99 |         """docstring for split_words_for_index"""
100 | 
101 |         words = split_words(title)
102 |         if util.pinyin_match:
103 |             words += split_pinyin(title)
104 |         
105 |         return words
106 | 
107 |     def save_fulltext_index(self):
108 |         pass
109 |     
110 |     def save_prefix_index(self):
111 |         """docstring for save_prefix_index"""
112 | 
113 |         words = []
114 |         words.append(self.title.lower())
115 | 
116 |         pipe = util.redis.pipeline()
117 |         
118 |         pipe.sadd(mk_sets_key(self.name, self.title), self.id)
119 | 
120 |         if util.pinyin_match:
121 |             pinyin = Pinyin.t(self.title.lower(), "")
122 |             words += pinyin
123 | 
124 |             pipe.sadd(mk_sets_key(self.name, pinyin), self.id)
125 | 
126 |         key = mk_complete_key(self.name)
127 |         for word in words:
128 |             for i in range(0, len(word)):
129 |                 prefix = word[0:i]
130 |                 pipe.zadd(key, prefix, 0)
131 |             
132 |             pipe.zadd(key, word + "*", 0)
133 | 
134 |         # commit
135 |         pipe.execute()
136 | 
137 | 


--------------------------------------------------------------------------------
/redis_search/query.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | import time
  5 | import json
  6 | import logging
  7 | 
  8 | from chinese_pinyin import Pinyin
  9 | 
 10 | import util
 11 | from util import split_words, split_pinyin, utf8, mk_sets_key, mk_score_key, mk_condition_key, mk_complete_key
 12 | 
 13 | def query(name, text, offset=0, limit=10, sort_field='id', conditions=None):
 14 |     """docstring for query"""
 15 | 
 16 |     conditions = conditions if isinstance(conditions, dict) and conditions else {}
 17 | 
 18 |     tm = time.time()
 19 |     result = []
 20 | 
 21 |     # 如果搜索文本和查询条件均没有，那就直接返回 []
 22 |     if not text.strip() and not conditions:
 23 |         return result
 24 | 
 25 |     text = utf8(text.strip())
 26 |     splited_words = split_words(text)
 27 | 
 28 |     words = []
 29 |     for word in splited_words:
 30 |         words.append(mk_sets_key(name, word))
 31 | 
 32 |     condition_keys = []
 33 |     if conditions:
 34 |         for c in conditions:
 35 |             condition_keys.append(mk_condition_key(name, c, utf8(conditions[c])))
 36 |             
 37 |         # 将条件的 key 放入关键词搜索集合内，用于 sinterstore 搜索
 38 |         words += condition_keys
 39 |     
 40 |     if not words:
 41 |         return result
 42 | 
 43 |     temp_store_key = "tmpinterstore:%s" % "+".join(words)
 44 |     
 45 |     if len(words) > 1:
 46 |         if not util.redis.exists(temp_store_key):
 47 |             # 将多个词语组合对比，得到交集，并存入临时区域
 48 |             util.redis.sinterstore(temp_store_key, words)
 49 |             
 50 |             # 将临时搜索设为1天后自动清除
 51 |             util.redis.expire(temp_store_key, 86400)
 52 |         
 53 |         # 拼音搜索
 54 |         if util.pinyin_match:
 55 |             splited_pinyin_words = split_pinyin(text)
 56 | 
 57 |             pinyin_words = []
 58 |             for w in splited_pinyin_words:
 59 |                 pinyin_words.append(mk_sets_key(name, w))
 60 |                 
 61 |             pinyin_words += condition_keys
 62 |             
 63 |             temp_sunion_key = "tmpsunionstore:%s" % "+".join(words)
 64 |             temp_pinyin_store_key = "tmpinterstore:%s" % "+".join(pinyin_words)
 65 |             
 66 |             # 找出拼音的
 67 |             util.redis.sinterstore(temp_pinyin_store_key, pinyin_words)
 68 |             
 69 |             # 合并中文和拼音的搜索结果
 70 |             util.redis.sunionstore(temp_sunion_key, [temp_store_key, temp_pinyin_store_key])
 71 |             
 72 |             # 将临时搜索设为1天后自动清除
 73 |             util.redis.expire(temp_pinyin_store_key, 86400)
 74 |             util.redis.expire(temp_sunion_key, 86400)
 75 |             
 76 |             temp_store_key = temp_sunion_key
 77 |     else:
 78 |         temp_store_key = words[0]
 79 | 
 80 |     # 根据需要的数量取出 ids
 81 |     ids = util.redis.sort(temp_store_key,
 82 |                     start = offset,
 83 |                     num = limit,
 84 |                     by = mk_score_key(name, "*"),
 85 |                     desc = True)
 86 | 
 87 |     result = util.hmget(name, ids, sort_field=sort_field)
 88 |     logging.debug("%s:\"%s\" | Time spend:%ss" % (name, text, time.time()-tm))
 89 |     return result
 90 | 
 91 | def complete(name, keyword, limit=10, conditions=None):
 92 |     """docstring for complete"""
 93 | 
 94 |     conditions = conditions if isinstance(conditions, dict) and conditions else {}
 95 | 
 96 |     if not keyword and not conditions:
 97 |         logging.debug("no word and conditions")
 98 |         return []
 99 | 
100 |     keyword = utf8(keyword.strip())
101 |     prefix_matchs = []
102 |     
103 |     # This is not random, try to get replies < MTU size
104 |     rangelen = util.complete_max_length
105 |     prefix = keyword.lower()
106 |     key = mk_complete_key(name)
107 | 
108 |     start = util.redis.zrank(key, prefix)
109 | 
110 |     if start:
111 |         count = limit
112 |         max_range = start+(rangelen*limit)-1
113 |         entries = util.redis.zrange(key, start, max_range)
114 |         
115 |         while len(prefix_matchs) <= count:
116 |             
117 |             start += rangelen
118 |             if not entries or len(entries) == 0:
119 |                 break
120 |             
121 |             for entry in entries:
122 |                 minlen = min(len(entry), len(prefix))
123 | 
124 |                 if entry[0:minlen] != prefix[0:minlen]:
125 |                     count = len(prefix_matchs)
126 |                     break
127 | 
128 |                 if entry[-1] == "*" and len(prefix_matchs) != count:
129 | 
130 |                     match = entry[:-1]
131 |                     if match not in prefix_matchs:
132 |                         prefix_matchs.append(match)
133 |           
134 |             entries = entries[start:max_range]
135 | 
136 |     # 组合 words 的特别 key 名
137 |     words = []
138 |     for word in prefix_matchs:
139 |         words.append(mk_sets_key(name, word))
140 | 
141 |     # 组合特别 key ,但这里不会像 query 那样放入 words， 因为在 complete 里面 words 是用 union 取的，condition_keys 和 words 应该取交集
142 |     condition_keys = []
143 |     if conditions:
144 |         for c in conditions:
145 |             condition_keys.append(mk_condition_key(name, c, utf8(conditions[c])))
146 |     
147 |     # 按词语搜索
148 |     temp_store_key = "tmpsunionstore:%s" % "+".join(words)
149 |     if len(words) == 0:
150 |         logging.info("no words")
151 |     elif len(words) > 1:
152 |         if not util.redis.exists(temp_store_key):
153 |             
154 |             # 将多个词语组合对比，得到并集，并存入临时区域   
155 |             util.redis.sunionstore(temp_store_key, words)
156 |             
157 |             # 将临时搜索设为1天后自动清除
158 |             util.redis.expire(temp_store_key, 86400)
159 |         # 根据需要的数量取出 ids
160 |     else:
161 |         temp_store_key = words[0]
162 | 
163 |     # 如果有条件，这里再次组合一下
164 |     if condition_keys:
165 |         if not words:
166 |             condition_keys += temp_store_key
167 |             
168 |         temp_store_key = "tmpsinterstore:%s" % "+".join(condition_keys)
169 |         if not util.redis.exists(temp_store_key):
170 |             util.redis.sinterstore(temp_store_key, condition_keys)
171 |             util.redis.expire(temp_store_key, 86400)
172 |      
173 |     ids = util.redis.sort(temp_store_key,
174 |                     start = 0,
175 |                     num = limit,
176 |                     by = mk_score_key(name, "*"),
177 |                     desc = True)
178 |     if not ids:
179 |         return []
180 |         
181 |     return util.hmget(name, ids)
182 | 
183 | 


--------------------------------------------------------------------------------
/redis_search/util.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | import json
 5 | 
 6 | from chinese_pinyin import Pinyin
 7 | 
 8 | from mmseg import seg_txt
 9 | from mmseg.search import seg_txt_search, seg_txt_2_dict
10 | 
11 | complete_max_length = 10
12 | pinyin_match        = True
13 | debug               = True
14 | 
15 | redis = None
16 | 
def hmget(name, ids, sort_field='id'):
    """Fetch and JSON-decode the items ``ids`` from hash ``name``.

    Empty/missing hash values are skipped. ``sort_field`` is accepted but
    not used by this function. Returns ``[]`` when ``ids`` is empty.
    """
    if not ids:
        return []

    return [json.loads(raw) for raw in redis.hmget(name, ids) if raw]
29 | 
def mk_sets_key(name, word):
    """Return the key of the id set for ``word`` in index ``name``.

    The word is lower-cased, so keys are case-insensitive.
    """
    lowered = word.lower()
    return "%s:%s" % (name, lowered)
34 | 
def mk_score_key(name, id):
    """Return the key holding the sort score of item ``id`` in index ``name``."""
    parts = (name, id)
    return "%s:_score_:%s" % parts
39 | 
def mk_condition_key(name, field, id):
    """Return the key of the condition set for ``field`` in index ``name``.

    ``id`` is the last key segment; callers in this package pass the
    field's value here.
    """
    parts = (name, field, id)
    return "%s:_by:_%s:%s" % parts
44 | 
def mk_complete_key(name):
    """Return the key of the completion (prefix) sorted set of index ``name``."""
    template = "Compl%s"
    return template % name
48 | 
def split_pinyin(text):
    """Convert ``text`` to pinyin and return the segmented words of the result."""
    pinyin_text = Pinyin.t(text)
    return split_words(pinyin_text)
53 | 
def split_words(text):
    """Return, as a list, the items produced by ``seg_txt_search(text)``."""
    return [segment for segment in seg_txt_search(text)]
62 | 
def utf8(value):
    """Convert a string argument to a UTF-8 byte string.

    Byte strings and ``None`` are returned unchanged; unicode strings are
    encoded as UTF-8.

    Raises:
        TypeError: if ``value`` is neither bytes, unicode nor ``None``.
    """
    if isinstance(value, (bytes, type(None))):
        return value
    # ``type(u"")`` is the text type on both Python 2 and Python 3.
    # An explicit raise (rather than ``assert``) keeps the check active
    # when Python runs with -O.
    if not isinstance(value, type(u"")):
        raise TypeError(
            "Expected bytes, unicode, or None; got %r" % type(value))
    return value.encode("utf-8")


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from setuptools import setup
 5 | 
 6 | setup(
 7 |     name='redis_search',
 8 |     version='0.2',
 9 |     packages=['redis_search'],
10 |     include_package_data = True,
11 |     package_data = {
12 |         '': ['data/*.dat'],
13 |     },
14 |     author='jiedan',
15 |     author_email='lxb429@gmail.com',
16 |     license='MIT License',
17 |     description="High performance real-time search (Support Chinese), indexes store in Redis for Python",
18 |     keywords ='redis search',
19 |     url='https://github.com/jiedan/redis-search-py.git',
20 | )


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # encoding: utf-8
  3 | 
  4 | import logging
  5 | import redis
  6 | import time
  7 | 
  8 | import redis_search
  9 | 
 10 | from redis_search.util import split_words
 11 | from redis_search.index import index
 12 | from redis_search.query import query, complete
 13 | 
 14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s:%(msecs)03d %(levelname)-8s %(message)s',
 15 |         datefmt='%m-%d %H:%M')
 16 | 
 17 | words = split_words("最主要的更动是：张无忌最后没有选定自己的配偶。:,.")
 18 | for w in words:
 19 |     print w
 20 | 
 21 | pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
 22 | redis_search.util.redis = redis.Redis(connection_pool=pool)
 23 | 
 24 | i = index("test", 1, "Redis")
 25 | i.save()
 26 | 
 27 | i = index("test", 2, "Redhat")
 28 | i.save()
 29 | 
 30 | i = index("test", 3, "张无忌最后没有选定自己的配偶", "id", exts= {
 31 |      'username':"jiedan", 'email':'lxb429@gmail.com'
 32 | }, password="123456")
 33 | i.save()
 34 | 
 35 | i = index("test", 4, "Redis 是一个高性能的key-value数据库", "id", exts= {
 36 |     'username':"jiedan", 'email':'lxb429@gmail.com'
 37 | })
 38 | i.save()
 39 | 
 40 | i = index("test", 6, "回明朝当皇帝", "id", exts={"title":"回明朝当皇帝"})
 41 | i.save()
 42 | 
 43 | i = index("test", 7, "回明朝做皇帝", "id", exts={"title":"回明朝做皇帝"})
 44 | i.save()
 45 | 
 46 | print "自动完成: r"
 47 | users = complete('test', "r")
 48 | 
 49 | for user in users:
 50 |     print user['id'], user['title']
 51 | 
 52 | print "-"*10
 53 | print "自动完成: redi"
 54 | users = complete('test', "redi")
 55 | 
 56 | for user in users:
 57 |     print user['id'], user['title']
 58 | 
 59 | print "-"*10
 60 | print "自动完成: 张"
 61 | users = complete('test', "张")
 62 | 
 63 | for user in users:
 64 |     print user['id'], user['title']
 65 | 
 66 | print "-"*10
 67 | print "搜索: Redis"
 68 | users = query('test', "Redis")
 69 |  
 70 | for user in users:
 71 |     print user['id'], user['title']
 72 | 
 73 | print "-"*10
 74 | print "搜索: 张无忌"
 75 | users = query('test', "张无忌")
 76 |  
 77 | for user in users:
 78 |     print user['id'], user['title']
 79 | 
 80 | print "-"*10
 81 | print "搜索: 回明朝做皇帝"
 82 | users = query('test', "回明朝做皇帝")
 83 |  
 84 | for user in users:
 85 |     print user['id'], user['title']
 86 | 
 87 | print "-"*10
 88 | print "搜索: 皇帝"
 89 | users = query('test', "当皇帝")
 90 |  
 91 | for user in users:
 92 |     print user['id'], user['title']
 93 | 
 94 | print "-"*10
 95 | print "拼音搜索: zhang"
 96 | users = query('test', "zhang")
 97 |  
 98 | for user in users:
 99 |     print user['id'], user['title']
100 | 


--------------------------------------------------------------------------------