├── README.md └── modules └── plugin_haystack.py /README.md: -------------------------------------------------------------------------------- 1 | # plugin_haystack.py 2 | 3 | This is an experimental plugin to provide full-text search for web2py. 4 | 5 | ## How to use it? 6 | 7 | Assume the following model: 8 | 9 | db = DAL() 10 | db.define_table('thing',Field('name'),Field('description')) 11 | 12 | You want to be able to perform full-text search on the two fields of the above table. You do: 13 | 14 | from plugin_haystack import Haystack 15 | index = Haystack(db.thing) 16 | index.indexes('name','description') 17 | 18 | This will create indexes for all new records inserted, modified, and deleted from the above table. For example: 19 | 20 | db.thing.insert(name='Char',description='A char') # automatically indexed 21 | db(db.thing.id).update(description='The chair') # automatically re-indexed 22 | db(db.thing).delete() # automatically re-indexed 23 | You can now use Haystack to build queries: 24 | 25 | query = index.search(name='chair',description='chair') 26 | 27 | and use it in combination with other DAL queries: 28 | 29 | print db(query)(db.thing.name.endswith('r')).select() 30 | 31 | ## Supported backends: 32 | 33 | *Simple* (mostly for testing): 34 | 35 | index = Haystack(db.thing) 36 | 37 | *Whoosh* (you must `pip install whoosh`): 38 | 39 | index = Haystack(db.thing,backend=WhooshBackend,indexdir='/path/to/index') 40 | 41 | *Solr* (you must install and run Solr, and `pip install sunburnt`) 42 | 43 | index = Haystack(db.thing,backend=SolrBackend,url='https://localhost:8983') 44 | 45 | ## How does it work? 46 | 47 | web2py Haystack uses a third-party backend - built on the local database or on file (Whoosh) or as a web service (Solr) - to create an index of keywords to records in the table (db.thing in the example). When you call: 48 | 49 | index.search(name='chair') 50 | 51 | It performs a query to the backend to get a list of record ids which match the query.
It returns the query: 52 | 53 | db.thing.id.belongs([3, 5, 9, ...]) 54 | 55 | Therefore: 56 | 57 | rows = db(index.search(name='chair')).select() 58 | 59 | is the same as 60 | 61 | // find the ids of all records with name matching "chair" and 62 | rows = db(db.thing.id.belongs(ids)).select() 63 | 64 | ## Caveats 65 | 66 | - How the record matching is done depends on the backend. In the Simple case it converts the text to lower case and breaks the text into tokens (words of at least 3 alphanumeric chars), then looks for all indexed records which contain all tokens in the query text. It returns the first 20 entries found. In the Whoosh case and the Solr case it returns the closest matches, defined by other, more complex algorithms. 67 | 68 | - Web2py does automatic migrations. Haystack does not. This means you cannot simply change the list of indexed fields and expect the index to work. Eventually there should be a way to rebuild the indexes but this has not been implemented yet. 69 | -------------------------------------------------------------------------------- /modules/plugin_haystack.py: -------------------------------------------------------------------------------- 1 | """ 2 | plugin_haystack.py 3 | 4 | This file is an experimental part of web2py. 5 | It allows full-text search using database, Whoosh, or Solr.
"""
plugin_haystack.py

Experimental full-text search for web2py tables, backed by the database
itself (SimpleBackend), by Whoosh (WhooshBackend) or by Solr (SolrBackend).

Author: Massimo Di Pierro
License: LGPL

Usage:
    db = DAL()
    db.define_table('thing',Field('name'),Field('description'))
    index = Haystack(db.thing)                        # table to be indexed
    index.indexes('name','description')               # fields to be indexed
    db.thing.insert(name='Char',description='A char') # automatically indexed
    db(db.thing.id).update(description='The chair')   # automatically re-indexed
    db(db.thing).delete()                             # automatically re-indexed
    query = index.search(name='chair',description='the')
    print(db(query).select())
"""

import re
import os

try:
    from gluon import DAL, Field
except ImportError:
    # Outside a web2py environment, try the standalone DAL package.
    try:
        from pydal import DAL, Field
    except ImportError:
        # Keep the module importable; SimpleBackend and test() raise a
        # clear ImportError when they actually need these names.
        DAL = Field = None

__all__ = ['Haystack', 'SimpleBackend', 'WhooshBackend', 'SolrBackend']

DEBUG = True

# Text/integer constructors that work on both Python 2 and Python 3.
try:
    _text = unicode
    _integer = long
except NameError:
    _text = str
    _integer = int


def _log(*args):
    """Print the arguments separated by spaces (valid on Python 2 and 3).

    Callers check DEBUG themselves so that expensive arguments (such as a
    full select of the index table) are only evaluated when debugging.
    """
    print(' '.join(str(arg) for arg in args))


def _check_mode(mode):
    """Raise ValueError unless mode is one of the supported 'and' / 'or'."""
    if mode not in ('and', 'or'):
        raise ValueError("Unsupported search mode: %r" % (mode,))


def _combine(ids, new_ids, mode):
    """Merge the ids found for one field into the running result set."""
    if ids is None:
        return new_ids
    return ids & new_ids if mode == 'and' else ids | new_ids


class SimpleBackend(object):
    """Keyword index kept in a 'haystack_<tablename>' table of the database.

    One row is stored per (fieldname, keyword, record_id) triple.
    """

    # A token is a run of at least three word characters or hyphens.
    regex = re.compile(r'[\w\-]{2}[\w\-]+')
    # Words that are never stored in the index.
    ignore = set(['and', 'or', 'in', 'of', 'for', 'to', 'from'])

    def __init__(self, table, db=None):
        """Create (or attach to) the index table for `table`.

        table: the DAL table to be indexed.
        db:    the DAL connection holding the index; defaults to table._db.
        Raises ImportError when neither gluon nor pydal could be imported.
        """
        if Field is None:
            raise ImportError(
                "plugin_haystack requires web2py (gluon) or pydal")
        self.table = table
        self.db = db or table._db
        self.fieldnames = ()
        self.idx = self.db.define_table(
            'haystack_%s' % table._tablename,
            Field('fieldname'),
            Field('keyword'),
            Field('record_id', 'integer'))

    def _tokenize(self, text, drop_ignored=True):
        """Return the set of lower-cased tokens found in `text`.

        Empty or None text yields an empty set. With drop_ignored the words
        listed in `ignore` are removed.
        """
        if not text:
            return set()
        words = set(self.regex.findall(text.lower()))
        return words - self.ignore if drop_ignored else words

    def indexes(self, *fieldnames):
        """Record which fields are indexed; returns self."""
        self.fieldnames = fieldnames
        return self

    def after_insert(self, fields, id):
        """Insert callback: index the keywords of the new record `id`.

        Indexed fields that are missing from `fields` or are None are
        skipped instead of raising.
        """
        if DEBUG:
            _log('after insert', fields, id)
        for fieldname in self.fieldnames:
            if fieldname not in fields:
                continue
            for word in self._tokenize(fields[fieldname]):
                self.idx.insert(
                    fieldname=fieldname,
                    keyword=word,
                    record_id=id)
        if DEBUG:
            _log(self.db(self.idx).select())
        return True

    def after_update(self, queryset, fields):
        """Update callback: re-index the updated fields of matching records.

        Only the difference between the stored keywords and the new ones is
        written: stale keywords are deleted and new ones inserted.
        """
        if DEBUG:
            _log('after update', queryset, fields)
        db = self.db
        # Tokenize each updated field once; the words are the same for
        # every record in the queryset.
        new_words = dict((name, self._tokenize(fields[name]))
                         for name in self.fieldnames if name in fields)
        if new_words:
            for id in self.get_ids(queryset):
                for fieldname, words in new_words.items():
                    scope = ((self.idx.fieldname == fieldname) &
                             (self.idx.record_id == id))
                    existing_words = set(
                        r.keyword for r in db(scope).select(self.idx.keyword))
                    stale_words = existing_words - words
                    if stale_words:
                        db(scope &
                           self.idx.keyword.belongs(list(stale_words))
                           ).delete()
                    for new_word in words - existing_words:
                        self.idx.insert(
                            fieldname=fieldname,
                            keyword=new_word,
                            record_id=id)
        if DEBUG:
            _log(self.db(self.idx).select())
        return True

    def get_ids(self, queryset):
        """Return the ids of the records of self.table selected by queryset."""
        return [r.id for r in queryset.select(self.table._id)]

    def after_delete(self, queryset):
        """Delete callback: drop the index rows of the matching records."""
        if DEBUG:
            _log('after delete', queryset)
        ids = self.get_ids(queryset)
        self.db(self.idx.record_id.belongs(ids)).delete()
        if DEBUG:
            _log(self.db(self.idx).select())
        return True

    def meta_search(self, limit, mode, **fieldkeys):
        """Return the list of record ids matching the per-field search text.

        limit:     maximum number of index rows read per field.
        mode:      'and' intersects the per-field results, 'or' unites them.
        fieldkeys: fieldname=text pairs; non-indexed fields are ignored.
        Returns [] when no indexed field is queried.
        Raises ValueError for an unsupported mode.
        """
        _check_mode(mode)
        db = self.db
        ids = None
        for fieldname in fieldkeys:
            if fieldname not in self.fieldnames:
                continue
            # Query words are not filtered through `ignore`.
            words = self._tokenize(fieldkeys[fieldname], drop_ignored=False)
            meta_query = ((self.idx.fieldname == fieldname) &
                          (self.idx.keyword.belongs(list(words))))
            new_ids = set(r.record_id for r in db(meta_query).select(
                limitby=(0, limit)))
            ids = _combine(ids, new_ids, mode)
        return [] if ids is None else list(ids)


class WhooshBackend(SimpleBackend):
    """Index stored on disk in a Whoosh index directory."""

    def __init__(self, table, indexdir):
        """Remember the table and make sure the index directory exists."""
        self.table = table
        self.indexdir = indexdir
        self.fieldnames = ()
        if not os.path.exists(indexdir):
            os.makedirs(indexdir)

    def indexes(self, *fieldnames):
        """Open the existing index in indexdir, or create one; returns self.

        Raises ImportError when Whoosh is not installed.
        """
        try:
            from whoosh.index import create_in, exists_in, open_dir
            from whoosh.fields import Schema, TEXT, ID
        except ImportError:
            raise ImportError("Cannot find Whoosh")
        self.fieldnames = fieldnames
        # Reopen an index that is already there instead of overwriting it.
        if exists_in(self.indexdir):
            self.ix = open_dir(self.indexdir)
        else:
            schema = Schema(id=ID(unique=True, stored=True),
                            **dict((k, TEXT) for k in fieldnames))
            self.ix = create_in(self.indexdir, schema)
        return self

    def _document(self, id, fields):
        """Build the keyword arguments describing record `id` for Whoosh."""
        document = dict(
            (name, _text(fields[name])) for name in self.fieldnames
            if name in fields and fields[name] is not None)
        document['id'] = _text(id)
        return document

    def after_insert(self, fields, id):
        """Insert callback: add the new record to the Whoosh index."""
        if DEBUG:
            _log('after insert', fields, id)
        writer = self.ix.writer()
        writer.add_document(**self._document(id, fields))
        writer.commit()
        return True

    def after_update(self, queryset, fields):
        """Update callback: rewrite the documents of the matching records.

        Each document is rebuilt from `fields` only, i.e. from the indexed
        fields that are part of this update.
        """
        if DEBUG:
            _log('after update', queryset, fields)
        ids = self.get_ids(queryset)
        if ids:
            writer = self.ix.writer()
            for id in ids:
                writer.update_document(**self._document(id, fields))
            writer.commit()
        return True

    def after_delete(self, queryset):
        """Delete callback: remove the matching records from the index."""
        if DEBUG:
            _log('after delete', queryset)
        ids = self.get_ids(queryset)
        if ids:
            writer = self.ix.writer()
            for id in ids:
                writer.delete_by_term('id', _text(id))
            writer.commit()
        return True

    def meta_search(self, limit, mode, **fieldkeys):
        """Return the list of record ids matching the per-field queries.

        Same contract as SimpleBackend.meta_search; each text is parsed by
        Whoosh's QueryParser for its field.
        """
        from whoosh.qparser import QueryParser
        _check_mode(mode)
        ids = None
        with self.ix.searcher() as searcher:
            for fieldname in fieldkeys:
                if fieldname not in self.fieldnames:
                    continue
                parser = QueryParser(fieldname, schema=self.ix.schema)
                query = parser.parse(_text(fieldkeys[fieldname]))
                results = searcher.search(query, limit=limit)
                new_ids = set(_integer(result['id']) for result in results)
                ids = _combine(ids, new_ids, mode)
        return [] if ids is None else list(ids)


class SolrBackend(SimpleBackend):
    """Index stored in a Solr server accessed through sunburnt."""

    def __init__(self, table, url="http://localhost:8983",
                 schema_filename="schema.xml"):
        """Remember the table, the Solr url and the schema file name."""
        self.table = table
        self.url = url
        self.schema_filename = schema_filename
        self.fieldnames = ()

    def indexes(self, *fieldnames):
        """Write the schema file if missing and connect to Solr; returns self.

        Raises ImportError when sunburnt is not installed and RuntimeError
        when the Solr interface cannot be created.
        """
        try:
            import sunburnt
        except ImportError:
            raise ImportError(
                "Cannot find sunburnt, it is necessary to access Solr")
        self.fieldnames = fieldnames
        if not os.path.exists(self.schema_filename):
            # NOTE(review): the XML markup of these two literals was missing
            # from the source under review (only '%s' and '' were left), so
            # the element names below are a reconstruction - confirm against
            # the schema your Solr/sunburnt version expects.
            field_xml = ''.join('<field name="%s" type="string" />' % name
                                for name in fieldnames)
            schema = ('<fields><field name="id" type="int" indexed="true" '
                      'stored="true" required="true" />%s</fields>'
                      % field_xml)
            with open(self.schema_filename, 'w') as schema_file:
                schema_file.write(schema)
        try:
            self.interface = sunburnt.SolrInterface(
                self.url, self.schema_filename)
        except Exception:
            raise RuntimeError("Cannot connect to Solr: %s" % self.url)
        return self

    def _document(self, id, fields):
        """Build the Solr document describing record `id`."""
        document = {'id': id}
        for name in self.fieldnames:
            if name in fields and fields[name] is not None:
                document[name] = _text(fields[name])
        return document

    def after_insert(self, fields, id):
        """Insert callback: add the new record to Solr."""
        if DEBUG:
            _log('after insert', fields, id)
        self.interface.add([self._document(id, fields)])
        self.interface.commit()
        return True

    def after_update(self, queryset, fields):
        """Update callback: replace the documents of the matching records.

        Caveat: this should work but only if ALL indexed fields are updated
        at once, because each document is rebuilt from `fields` alone.
        """
        if DEBUG:
            _log('after update', queryset, fields)
        ids = self.get_ids(queryset)
        self.interface.delete(ids)
        documents = [self._document(id, fields) for id in ids]
        self.interface.add(documents)
        self.interface.commit()
        return True

    def after_delete(self, queryset):
        """Delete callback: remove the matching records from Solr."""
        if DEBUG:
            _log('after delete', queryset)
        ids = self.get_ids(queryset)
        self.interface.delete(ids)
        self.interface.commit()
        return True

    def meta_search(self, limit, mode, **fieldkeys):
        """Return the ids of the first `limit` results of the Solr query.

        mode is ignored here: all fieldkeys are passed to a single query.
        """
        results = self.interface.query(**fieldkeys).paginate(0, limit)
        return [r['id'] for r in results]


class Haystack(object):
    """Front end tying a DAL table to a search backend."""

    def __init__(self, table, backend=SimpleBackend, **attr):
        """Instantiate `backend` for `table`, passing it the extra keywords."""
        self.table = table
        self.backend = backend(table, **attr)
        self._hooked = False

    def indexes(self, *fieldnames):
        """Declare the indexed fields and hook the table callbacks.

        Only fields of type 'string' or 'text' belonging to the table can be
        indexed; anything else raises RuntimeError. Returns self.
        """
        invalid = [f for f in fieldnames
                   if f not in self.table.fields() or
                   self.table[f].type not in ('string', 'text')]
        if invalid:
            raise RuntimeError(
                "Unable to index fields: %s" % ', '.join(invalid))
        self.backend.indexes(*fieldnames)
        # Register the callbacks once, so calling indexes() again does not
        # index every change twice.
        if not self._hooked:
            self.table._after_insert.append(
                lambda fields, id: self.backend.after_insert(fields, id))
            self.table._after_update.append(
                lambda queryset, fields:
                    self.backend.after_update(queryset, fields))
            self.table._after_delete.append(
                lambda queryset: self.backend.after_delete(queryset))
            self._hooked = True
        return self

    def search(self, limit=20, mode='and', **fieldkeys):
        """Return a DAL query selecting the records found by the backend."""
        ids = self.backend.meta_search(limit, mode, **fieldkeys)
        return self.table._id.belongs(ids)


def test(mode='simple'):
    """Exercise insert/update/delete indexing with the chosen backend.

    mode is 'simple', 'whoosh' or 'solr'; anything else raises ValueError.
    """
    if DAL is None:
        raise ImportError("plugin_haystack requires web2py (gluon) or pydal")
    db = DAL()
    db.define_table('thing', Field('name'), Field('description', 'text'))
    if mode == 'simple':
        index = Haystack(db.thing)
    elif mode == 'whoosh':
        index = Haystack(db.thing, backend=WhooshBackend,
                         indexdir='test-whoosh')
    elif mode == 'solr':
        index = Haystack(db.thing, backend=SolrBackend,
                         url='https://localhost:8983')
    else:
        raise ValueError("Unknown test mode: %r" % (mode,))
    index.indexes('name', 'description')
    record_id = db.thing.insert(name="table", description="one table")
    record_id = db.thing.insert(name="table", description="another table")
    assert db(index.search(description='one')).count() == 1
    assert db(index.search(description='table')).count() == 2
    assert db(index.search(name='table')).count() == 2
    assert db(index.search(name='table', description='table')).count() == 2
    db(db.thing.id == record_id).update(name='table', description='four legs')
    assert db(index.search(description='another')).count() == 0
    assert db(index.search(description='four')).count() == 1
    assert db(index.search(description='legs')).count() == 1
    assert db(index.search(description='legs four')).count() == 1
    assert db(index.search(name='table')).count() == 2
    assert db(index.search(name='table', description='table')).count() == 1
    assert db(index.search(name='table') |
              index.search(description='table')).count() == 2
    db(db.thing.id == record_id).delete()
    assert db(index.search(name='table')).count() == 1
    db(db.thing).delete()
    assert db(index.search(name='table')).count() == 0
    db.commit()
    db.close()


if __name__ == '__main__':
    test('simple')
    test('whoosh')