├── README.md
└── modules
└── plugin_haystack.py
/README.md:
--------------------------------------------------------------------------------
1 | # plugin_haystack.py
2 |
3 | This is an experimental plugin to provide full text-search for web2py.
4 |
5 | ## How to use it?
6 |
7 | Assume the following model:
8 |
9 | db = DAL()
10 | db.define_table('thing',Field('name'),Field('description'))
11 |
12 | You want to be able to perform full text search on the two fields of the above table. You do:
13 |
14 | from plugin_haystack import Haystack
15 | index = Haystack(db.thing)
16 | index.indexes('name','description')
17 |
18 | This will keep the index up to date for all records inserted into, modified in, and deleted from the above table. For example:
19 |
20 | db.thing.insert(name='Char',description='A char') # automatically indexed
21 | db(db.thing.id).update(description='The chair') # automatically re-indexed
22 | db(db.thing).delete() # automatically re-indexed
23 | You can now use Haystack to build queries:
24 |
25 | query = index.search(name='chair',description='chair')
26 |
27 | and use it in combination with other DAL queries:
28 |
29 | print db(query)(db.thing.name.endswith('r')).select()
30 |
31 | ## Supported backends:
32 |
33 | *Simple* (mostly for testing):
34 |
35 | index = Haystack(db.thing)
36 |
37 | *Whoosh* (you must `pip install whoosh`):
38 |
39 | index = Haystack(db.thing,backend=WhooshBackend,indexdir='/path/to/index')
40 |
41 | *Solr* (you must install and run Solr, and `pip install sunburnt`)
42 |
43 | index = Haystack(db.thing,backend=SolrBackend,url='https://localhost:8983')
44 |
45 | ## How does it work?
46 |
47 | web2py Haystack uses a third party backend - built on the local database or on file (Whoosh) or as a web service (Solr) - to create an index of keywords to records in the table (db.thing in the example). When you call:
48 |
49 | index.search(name='chair')
50 |
51 | It performs a query to the backend to get a list of records ids which match the query. It returns the query:
52 |
53 | db.thing.id.belongs([3, 5, 9, ...])
54 |
55 | Therefore:
56 |
57 | rows = db(index.search(name='chair')).select()
58 |
59 | is the same as
60 |
61 | # find the ids of all records with name matching "chair", then
62 | rows = db(db.thing.id.belongs(ids)).select()
63 |
64 | ## Caveats
65 |
66 | - How the record matching is done depends on the backend. In the Simple case it converts the text to lower case and breaks the text into tokens (words of at least 3 alphanumeric characters), then looks for all indexed records which contain all tokens in the query text. It returns the first 20 entries found. In the Whoosh case and the Solr case it returns the closest matches, as defined by other, more complex algorithms.
67 |
68 | - Web2py does automatic migrations. Haystack does not. This means you cannot simply change the list of indexed fields and expect the index to work. Eventually there should be a way to rebuild the indexes but this has not been implemented yet.
69 |
--------------------------------------------------------------------------------
/modules/plugin_haystack.py:
--------------------------------------------------------------------------------
1 | """
2 | plugin_haystack.py
3 |
4 | This file is an experimental part of web2py.
5 | It allows full text search using database, Whoosh, or Solr.
6 | Author: Massimo Di Pierro
7 | License: LGPL
8 |
9 | Usage:
10 | db = DAL()
11 | db.define_table('thing',Field('name'),Field('description'))
12 | index = Haystack(db.thing) # table to be indexed
13 | index.indexes('name','description') # fields to be indexed
14 | db.thing.insert(name='Char',description='A char') # automatically indexed
15 | db(db.thing.id).update(description='The chair') # automatically re-indexed
16 | db(db.thing).delete() # automatically re-indexed
17 | query = index.search(name='chair',description='the')
18 | print db(query).select()
19 | """
20 |
21 | import re
22 | import os
23 | from gluon import Field
24 |
# When true, the backend hooks print every insert/update/delete they receive
# and (for the simple backend) the whole index table after each change.
DEBUG = True
26 |
class SimpleBackend(object):
    """Keyword index kept in a companion table of the indexed table's database.

    Every row of the companion table ``haystack_<tablename>`` links one
    keyword found in one indexed field to the id of the record that holds it.
    """

    # A keyword is a run of at least three word characters or hyphens.
    regex = re.compile(r'[\w\-]{2}[\w\-]+')
    # Words that are never stored in the index.
    ignore = set(['and', 'or', 'in', 'of', 'for', 'to', 'from'])

    def __init__(self, table, db=None):
        """Define the companion index table.

        table: the DAL table to be indexed.
        db: database in which the index table is defined; defaults to the
            database of ``table``.
        """
        self.table = table
        self.db = db or table._db
        self.idx = self.db.define_table(
            'haystack_%s' % table._tablename,
            Field('fieldname'),
            Field('keyword'),
            Field('record_id', 'integer'))

    def indexes(self, *fieldnames):
        """Remember which fields are indexed and return ``self``."""
        self.fieldnames = fieldnames
        return self

    def _tokenize(self, text):
        """Return the set of lower-cased keywords found in ``text``.

        An empty or missing (``None``) value yields an empty set.
        """
        if not text:
            return set()
        return set(self.regex.findall(text.lower()))

    def after_insert(self, fields, id):
        """Index the keywords of a newly inserted record.

        fields: mapping of the inserted field names to their values.
        id: id of the new record.
        Returns True.
        """
        if DEBUG:
            print('after insert %s %s' % (fields, id))
        for fieldname in self.fieldnames:
            # An insert does not have to provide every indexed field.
            if fieldname not in fields:
                continue
            for word in self._tokenize(fields[fieldname]) - self.ignore:
                self.idx.insert(
                    fieldname=fieldname,
                    keyword=word,
                    record_id=id)
        if DEBUG:
            print(self.db(self.idx).select())
        return True

    def after_update(self, queryset, fields):
        """Bring the index in line with the new values of updated records.

        queryset: the DAL set that was updated.
        fields: mapping of the updated field names to their new values.
        Returns True.
        """
        if DEBUG:
            print('after update %s %s' % (queryset, fields))
        db = self.db
        idx = self.idx
        # The new keywords only depend on the field, not on the record.
        new_words = dict(
            (fieldname, self._tokenize(fields[fieldname]) - self.ignore)
            for fieldname in self.fieldnames if fieldname in fields)
        for record_id in self.get_ids(queryset):
            for fieldname, words in new_words.items():
                scope = ((idx.fieldname == fieldname) &
                         (idx.record_id == record_id))
                existing_words = set(
                    r.keyword for r in db(scope).select(idx.keyword))
                stale_words = existing_words - words
                if stale_words:
                    db(scope & idx.keyword.belongs(list(stale_words))).delete()
                for new_word in words - existing_words:
                    idx.insert(
                        fieldname=fieldname,
                        keyword=new_word,
                        record_id=record_id)
        if DEBUG:
            print(self.db(self.idx).select())
        return True

    def get_ids(self, queryset):
        """Return the ids of the records currently selected by ``queryset``."""
        return [r.id for r in queryset.select(self.table._id)]

    def after_delete(self, queryset):
        """Drop the index rows of the records selected by ``queryset``.

        Returns True.
        """
        if DEBUG:
            print('after delete %s' % (queryset,))
        ids = self.get_ids(queryset)
        self.db(self.idx.record_id.belongs(ids)).delete()
        if DEBUG:
            print(self.db(self.idx).select())
        return True

    def meta_search(self, limit, mode, **fieldkeys):
        """Return the ids of the records matching the searched keywords.

        limit: maximum number of index rows read for each searched field.
        mode: 'and' keeps the records matched in every searched field,
            'or' keeps the records matched in at least one of them.
        fieldkeys: text to look for, keyed by field name; fields that are
            not indexed are skipped. Within one field a record matches when
            it holds any of the keywords of the text.

        Returns a list of record ids, empty when no indexed field is
        searched. Raises ValueError for an unknown mode.
        """
        if mode not in ('and', 'or'):
            raise ValueError("mode must be 'and' or 'or', not %r" % (mode,))
        db = self.db
        ids = None
        for fieldname in fieldkeys:
            if fieldname not in self.fieldnames:
                continue
            words = self._tokenize(fieldkeys[fieldname])
            meta_query = ((self.idx.fieldname == fieldname) &
                          (self.idx.keyword.belongs(list(words))))
            new_ids = set(r.record_id for r in db(meta_query).select(
                self.idx.record_id, limitby=(0, limit)))
            if ids is None:
                ids = new_ids
            elif mode == 'and':
                ids = ids & new_ids
            else:
                ids = ids | new_ids
        # ids is still None when no searched field is indexed.
        return list(ids) if ids is not None else []
97 |
98 |
class WhooshBackend(SimpleBackend):
    """Full text index stored on disk by Whoosh (``pip install whoosh``)."""

    def __init__(self, table, indexdir):
        """Remember the table and make sure the index directory exists.

        table: the DAL table to be indexed.
        indexdir: directory holding the Whoosh index; created, together
            with any missing parent directory, when it does not exist.
        """
        self.table = table
        self.indexdir = indexdir
        if not os.path.exists(indexdir):
            os.makedirs(indexdir)

    def indexes(self, *fieldnames):
        """Open the index in ``indexdir``, creating it when there is none.

        An existing index is reused as it is: its schema is not adapted to
        a changed list of field names.

        Returns ``self``. Raises ImportError when Whoosh is not installed.
        """
        try:
            from whoosh.index import create_in, exists_in, open_dir
            from whoosh.fields import Schema, TEXT, ID
        except ImportError:
            raise ImportError("Cannot find Whoosh")
        self.fieldnames = fieldnames
        if exists_in(self.indexdir):
            self.ix = open_dir(self.indexdir)
        else:
            schema = Schema(id=ID(unique=True, stored=True),
                            **dict((k, TEXT) for k in fieldnames))
            self.ix = create_in(self.indexdir, schema)
        return self

    def _document(self, fields):
        """Return the indexed fields present in ``fields`` as unicode text."""
        return dict((name, unicode(fields[name]))
                    for name in self.fieldnames if name in fields)

    def after_insert(self, fields, id):
        """Add a newly inserted record to the index. Returns True."""
        if DEBUG:
            print('after insert %s %s' % (fields, id))
        writer = self.ix.writer()
        writer.add_document(id=unicode(id), **self._document(fields))
        writer.commit()
        return True

    def after_update(self, queryset, fields):
        """Re-index the records selected by ``queryset``. Returns True.

        NOTE(review): each document is rebuilt from the updated fields only,
        so indexed fields left out of the update are presumably no longer
        searchable for these records - confirm against Whoosh's
        ``update_document`` semantics.
        """
        if DEBUG:
            print('after update %s %s' % (queryset, fields))
        ids = self.get_ids(queryset)
        if ids:
            document = self._document(fields)
            writer = self.ix.writer()
            for record_id in ids:
                writer.update_document(id=unicode(record_id), **document)
            writer.commit()
        return True

    def after_delete(self, queryset):
        """Remove the records selected by ``queryset`` from the index.

        Returns True.
        """
        if DEBUG:
            print('after delete %s' % (queryset,))
        ids = self.get_ids(queryset)
        if ids:
            writer = self.ix.writer()
            for record_id in ids:
                writer.delete_by_term('id', unicode(record_id))
            writer.commit()
        return True

    def meta_search(self, limit, mode, **fieldkeys):
        """Return the ids of the records matching the searched text.

        limit: maximum number of hits requested from Whoosh for each field.
        mode: 'and' keeps the records matched in every searched field,
            'or' keeps the records matched in at least one of them.
        fieldkeys: query text keyed by field name.

        Returns a list of record ids, empty when nothing is searched.
        Raises ValueError for an unknown mode.
        """
        if mode not in ('and', 'or'):
            raise ValueError("mode must be 'and' or 'or', not %r" % (mode,))
        if not fieldkeys:
            return []
        from whoosh.qparser import QueryParser
        ids = None
        with self.ix.searcher() as searcher:
            for fieldname in fieldkeys:
                parser = QueryParser(fieldname, schema=self.ix.schema)
                query = parser.parse(unicode(fieldkeys[fieldname]))
                results = searcher.search(query, limit=limit)
                new_ids = set(long(result['id']) for result in results)
                if ids is None:
                    ids = new_ids
                elif mode == 'and':
                    ids = ids & new_ids
                else:
                    ids = ids | new_ids
        return list(ids)
160 |
161 |
class SolrBackend(SimpleBackend):
    """Full text index kept by a Solr server, accessed through sunburnt."""

    def __init__(self, table, url="http://localhost:8983", schema_filename="schema.xml"):
        """Remember the connection parameters; nothing is contacted yet.

        table: the DAL table to be indexed.
        url: address of the Solr server.
        schema_filename: local schema file handed to sunburnt; it is
            generated by ``indexes`` when it does not exist.
        """
        self.table = table
        self.url = url
        self.schema_filename = schema_filename

    @staticmethod
    def _schema_xml(fieldnames):
        """Return a minimal schema declaring ``id`` and the indexed fields.

        NOTE(review): the original schema template was lost from this file;
        the layout and field types below are an assumption - confirm them
        against the schema of the Solr core actually in use.
        """
        indexed = ''.join(
            '<field name="%s" type="text" indexed="true" stored="false"/>' % name
            for name in fieldnames)
        return (
            '<schema name="haystack" version="1.1">'
            '<types>'
            '<fieldType name="int" class="solr.IntField"/>'
            '<fieldType name="text" class="solr.TextField"/>'
            '</types>'
            '<fields>'
            '<field name="id" type="int" indexed="true" stored="true"'
            ' required="true"/>'
            '%s'
            '</fields>'
            '<uniqueKey>id</uniqueKey>'
            '</schema>') % indexed

    def indexes(self, *fieldnames):
        """Write the schema file if it is missing and connect to Solr.

        Returns ``self``. Raises ImportError when sunburnt is not installed
        and RuntimeError when the Solr interface cannot be created.
        """
        try:
            import sunburnt
        except ImportError:
            raise ImportError("Cannot find sunburnt, it is necessary to access Solr")
        self.fieldnames = fieldnames
        if not os.path.exists(self.schema_filename):
            with open(self.schema_filename, 'w') as schema_file:
                schema_file.write(self._schema_xml(fieldnames))
        try:
            self.interface = sunburnt.SolrInterface(self.url, self.schema_filename)
        except Exception:
            raise RuntimeError("Cannot connect to Solr: %s" % self.url)
        return self

    def _document(self, id, fields):
        """Return the Solr document of record ``id`` built from ``fields``."""
        document = {'id': id}
        for name in self.fieldnames:
            if name in fields:
                document[name] = unicode(fields[name])
        return document

    def after_insert(self, fields, id):
        """Add a newly inserted record to the index. Returns True."""
        if DEBUG:
            print('after insert %s %s' % (fields, id))
        self.interface.add([self._document(id, fields)])
        self.interface.commit()
        return True

    def after_update(self, queryset, fields):
        """Replace the documents of the records selected by ``queryset``.

        Caveat: this should work, but only if ALL indexed fields are updated
        at once, because each document is rebuilt from ``fields`` alone.
        Returns True.
        """
        if DEBUG:
            print('after update %s %s' % (queryset, fields))
        ids = self.get_ids(queryset)
        # Nothing selected: skip the round trip to the server.
        if ids:
            self.interface.delete(ids)
            self.interface.add(
                [self._document(record_id, fields) for record_id in ids])
            self.interface.commit()
        return True

    def after_delete(self, queryset):
        """Remove the records selected by ``queryset`` from the index.

        Returns True.
        """
        if DEBUG:
            print('after delete %s' % (queryset,))
        ids = self.get_ids(queryset)
        # Nothing selected: skip the round trip to the server.
        if ids:
            self.interface.delete(ids)
            self.interface.commit()
        return True

    def meta_search(self, limit, mode, **fieldkeys):
        """Return the ids of at most ``limit`` records matching ``fieldkeys``.

        mode is ignored here; how the fields are combined is left to Solr.
        """
        results = self.interface.query(**fieldkeys).paginate(0, limit)
        return [r['id'] for r in results]
216 |
217 |
class Haystack(object):
    """Full text search on a DAL table, delegated to a pluggable backend."""

    def __init__(self, table, backend=SimpleBackend, **attr):
        """Create the backend for ``table``.

        table: the DAL table to be indexed.
        backend: backend class, instantiated as ``backend(table, **attr)``.
        attr: backend specific options (e.g. ``indexdir`` or ``url``).
        """
        self.table = table
        self.backend = backend(table, **attr)
        # Set once the table callbacks have been registered.
        self._hooks_installed = False

    def indexes(self, *fieldnames):
        """Declare the fields to index and keep them indexed from now on.

        Only fields of the table of type 'string' or 'text' are accepted;
        RuntimeError is raised naming the others. The table callbacks are
        registered on the first call only, so calling this method again
        does not make every record get indexed more than once.

        Returns ``self``.
        """
        known = self.table.fields()
        invalid = [f for f in fieldnames
                   if f not in known or
                   self.table[f].type not in ('string', 'text')]
        if invalid:
            raise RuntimeError("Unable to index fields: %s" % ', '.join(invalid))
        self.backend.indexes(*fieldnames)
        if not getattr(self, '_hooks_installed', False):
            self.table._after_insert.append(
                lambda fields, id: self.backend.after_insert(fields, id))
            self.table._after_update.append(
                lambda queryset, fields: self.backend.after_update(queryset, fields))
            # NOTE(review): the backends re-select the ids from the queryset
            # inside this hook - confirm the DAL still returns the deleted
            # rows at that point, otherwise their index entries are kept.
            self.table._after_delete.append(
                lambda queryset: self.backend.after_delete(queryset))
            self._hooks_installed = True
        return self

    def search(self, limit=20, mode='and', **fieldkeys):
        """Return a DAL query selecting the records that match the search.

        limit and mode are passed on to the backend's ``meta_search``;
        fieldkeys maps field names to the text to look for.
        """
        ids = self.backend.meta_search(limit, mode, **fieldkeys)
        return self.table._id.belongs(ids)
237 |
def test(mode='simple'):
    """Exercise indexing and searching against one backend.

    mode: 'simple', 'whoosh' or 'solr'; ValueError is raised otherwise.
    The whoosh mode writes its index in the ``test-whoosh`` directory and
    the solr mode needs a Solr server at https://localhost:8983.
    """
    # DAL is only needed here; the module itself imports Field alone.
    from gluon import DAL

    backend_options = {
        'simple': {},
        'whoosh': dict(backend=WhooshBackend, indexdir='test-whoosh'),
        'solr': dict(backend=SolrBackend, url='https://localhost:8983'),
    }
    if mode not in backend_options:
        raise ValueError("unknown test mode: %r" % (mode,))

    db = DAL()
    try:
        db.define_table('thing', Field('name'), Field('description', 'text'))
        index = Haystack(db.thing, **backend_options[mode])
        index.indexes('name', 'description')
        db.thing.insert(name="table", description="one table")
        last_id = db.thing.insert(name="table", description="another table")
        assert db(index.search(description='one')).count() == 1
        assert db(index.search(description='table')).count() == 2
        assert db(index.search(name='table')).count() == 2
        assert db(index.search(name='table', description='table')).count() == 2
        db(db.thing.id == last_id).update(name='table', description='four legs')
        assert db(index.search(description='another')).count() == 0
        assert db(index.search(description='four')).count() == 1
        assert db(index.search(description='legs')).count() == 1
        assert db(index.search(description='legs four')).count() == 1
        assert db(index.search(name='table')).count() == 2
        assert db(index.search(name='table', description='table')).count() == 1
        assert db(index.search(name='table') |
                  index.search(description='table')).count() == 2
        db(db.thing.id == last_id).delete()
        assert db(index.search(name='table')).count() == 1
        db(db.thing).delete()
        assert db(index.search(name='table')).count() == 0
        db.commit()
    finally:
        # Release the connection even when an assertion fails.
        db.close()
269 |
# Run the self-test against the backends that need no external server.
if __name__ == '__main__':
    for backend_mode in ('simple', 'whoosh'):
        test(backend_mode)
273 |
--------------------------------------------------------------------------------