├── .gitignore ├── .gitmodules ├── app.yaml ├── index.yaml ├── main.py ├── search └── __init__.py ├── static ├── favicon.ico └── robots.txt └── tests ├── HOW-TO-TEST ├── __init__.py ├── roget.txt ├── test_app.py └── test_search.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "search/pyporter2"] 2 | path = search/pyporter2 3 | url = git://github.com/mdirolf/pyporter2.git 4 | -------------------------------------------------------------------------------- /app.yaml: -------------------------------------------------------------------------------- 1 | application: billkatz-test 2 | version: 7-16-2009-1 3 | runtime: python 4 | api_version: 1 5 | 6 | handlers: 7 | - url: .*/favicon\.ico 8 | static_files: static/favicon.ico 9 | upload: static/favicon.ico 10 | 11 | - url: /robots\.txt 12 | static_files: static/robots.txt 13 | upload: static/robots.txt 14 | 15 | - url: /static 16 | static_dir: static 17 | 18 | - url: .* 19 | script: main.py 20 | 21 | skip_files: | 22 | ^(.*/)?( 23 | (app\.yaml)| 24 | (app\.yml)| 25 | (index\.yaml)| 26 | (index\.yml)| 27 | (#.*#)| 28 | (.*~)| 29 | (.*\.py[co])| 30 | (.*/RCS/.*)| 31 | (\..*)| 32 | (tests/.*) 33 | )$ 34 | -------------------------------------------------------------------------------- /index.yaml: -------------------------------------------------------------------------------- 1 | indexes: 2 | 3 | - kind: StemmedIndex 4 | ancestor: yes 5 | 6 | - kind: StemmedIndex 7 | properties: 8 | - name: parent_kind 9 | - name: phrases 10 | 11 | - kind: LiteralIndex 12 | ancestor: yes 13 | 14 | - kind: LiteralIndex 15 | properties: 16 | - name: parent_kind 17 | - name: phrases 18 | 19 | # AUTOGENERATED 20 | 21 | # This index.yaml is automatically updated whenever the 
dev_appserver 22 | # detects that a new type of query is run. If you want to manage the 23 | # index.yaml file manually, remove the above marker line (the line 24 | # saying "# AUTOGENERATED"). If you want to manage some indexes 25 | # manually, move them above the marker line. The index.yaml file is 26 | # automatically uploaded to the admin console when you next deploy 27 | # your application using appcfg.py. 28 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # The MIT License 4 | # 5 | # Copyright (c) 2009 William T. Katz 6 | # Website/Contact: http://www.billkatz.com 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to 10 | # deal in the Software without restriction, including without limitation 11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 | # and/or sell copies of the Software, and to permit persons to whom the 13 | # Software is furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE. 25 | 26 | """A super simple Google App Engine text posting app. 
27 | 28 | Logged in visitors can add some test and search for keywords across all 29 | added pages. It demos a simple full text search module. 30 | """ 31 | __author__ = 'William T. Katz' 32 | 33 | import cgi 34 | import logging 35 | 36 | from google.appengine.api import users 37 | from google.appengine.ext import db 38 | from google.appengine.ext import webapp 39 | from google.appengine.ext.webapp.util import run_wsgi_app 40 | 41 | # The following are necessary for full-text search demo 42 | import search 43 | INDEXING_URL = '/tasks/searchindexing' 44 | 45 | class Page(search.Searchable, db.Model): 46 | user = db.UserProperty() 47 | title = db.StringProperty() 48 | content = db.TextProperty() 49 | created = db.DateTimeProperty(auto_now=True) 50 | INDEX_TITLE_FROM_PROP = 'title' 51 | # INDEX_USES_MULTI_ENTITIES = False 52 | 53 | class SimplePage(webapp.RequestHandler): 54 | def render(self, html): 55 | user = users.get_current_user() 56 | page = '
Add Page | ' 57 | if user: 58 | page += 'Logged in as %s ' % (user.nickname()) 59 | logout_url = users.create_logout_url(self.request.uri) 60 | page += '| Logout' % (logout_url) 61 | else: 62 | login_url = users.create_login_url(self.request.uri) 63 | page += 'Google login' % (login_url) 64 | page += """
65 |
66 |

Full Text Search Test

67 |

This app tests a full text search module for Google App Engine. 68 | Once you are logged in, you can add text pages that will be indexed via 69 | Task Queue API tasks. The search indices are efficiently stored using 70 | "Relation Index" entities as described in 71 | 72 | this Google I/O talk.

73 |

My blog has an 74 | 75 | article on this appengine-search module. You can download the code from the 76 | appengine-search 77 | github repository under a liberal open source (MIT) license.

78 |
79 | Search for phrase (e.g., 'lorem ipsum'): 80 | """ 81 | page += 'Return Pages retrieves the entire Page entities.
89 | Return Keys Only retrieves just the keys but uses 90 | intelligent key naming to transmit "Title" data via the key names.

91 | """ 92 | page += '
' 93 | page += html 94 | page += '' 95 | self.response.out.write(page) 96 | 97 | class MainPage(SimplePage): 98 | def get(self): 99 | user = users.get_current_user() 100 | if not user: 101 | html = '

Please login to add a page.

' 102 | else: 103 | import time 104 | time_string = time.strftime('Page submitted %X on %x') 105 | html = """ 106 |

Add a text page below:

107 |
108 |
Title: ' 111 | html += """ 112 | This data will be encoded in the key names of index entities.
113 |
114 |
115 |
116 | """ 117 | self.render(html) 118 | 119 | def post(self): 120 | user = users.get_current_user() 121 | content = self.request.get('content') 122 | title = self.request.get('title') 123 | if not user: 124 | self.redirect('/?msg=You+must+be+logged+in') 125 | elif not content: 126 | self.redirect('/') 127 | else: 128 | page = Page(content=content, title=title, user=user) 129 | page.put() 130 | page.enqueue_indexing(url=INDEXING_URL) 131 | html = "
Thanks for entering the following text:
" 132 | html += "
%s
" % (cgi.escape(content)) 133 | self.render(html) 134 | 135 | class SearchPage(SimplePage): 136 | def get(self): 137 | submitbtn = self.request.get('submitbtn') 138 | phrase = self.request.get('phrase') 139 | html = "

'" + phrase + "' was found on these pages:

" 140 | if submitbtn == 'Return Keys Only': 141 | key_list = Page.search(phrase, keys_only=True) 142 | for key_and_title in key_list: 143 | html += "

Title: %s

" % key_and_title[1] 144 | else: 145 | pages = Page.search(phrase) 146 | for page in pages: 147 | html += "

Title: %s

User: %s, Created: %s

%s
" \ 148 | % (page.title, str(page.user), str(page.created), cgi.escape(page.content)) 149 | self.render(html) 150 | 151 | application = webapp.WSGIApplication([ 152 | ('/', MainPage), 153 | ('/search', SearchPage), 154 | (INDEXING_URL, search.SearchIndexing)], debug=True) 155 | 156 | def main(): 157 | run_wsgi_app(application) 158 | 159 | if __name__ == '__main__': 160 | main() 161 | -------------------------------------------------------------------------------- /search/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # The MIT License 4 | # 5 | # Copyright (c) 2009 William T. Katz 6 | # Website/Contact: http://www.billkatz.com 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to 10 | # deal in the Software without restriction, including without limitation 11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 | # and/or sell copies of the Software, and to permit persons to whom the 13 | # Software is furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE. 
25 | 26 | """A simple full-text search system 27 | 28 | This module lets you designate particular entities for full text search 29 | indexing. It uses the Task Queue API to schedule search indexing and 30 | relation index entities (as described in Brett Slatkin's 'Building Scalable, 31 | Complex Apps on App Engine' talk at Google I/O, 2009). 32 | 33 | The keyword extraction code was slightly modified from Ryan Barrett's 34 | SearchableModel implementation. 35 | """ 36 | __author__ = 'William T. Katz' 37 | 38 | import logging 39 | import re 40 | import string 41 | import sys 42 | 43 | from google.appengine.api import datastore 44 | from google.appengine.api import datastore_types 45 | from google.appengine.ext import db 46 | from google.appengine.ext import webapp 47 | 48 | # TODO -- This will eventually be moved out of labs namespace 49 | from google.appengine.api.labs import taskqueue 50 | 51 | # Use python port of Porter2 stemmer. 52 | from search.pyporter2 import Stemmer 53 | 54 | class Error(Exception): 55 | """Base search module error type.""" 56 | 57 | class IndexTitleError(Error): 58 | """Raised when INDEX_TITLE_FROM_PROP or title alterations are incorrect.""" 59 | 60 | # Following module-level constants are cached in instance 61 | 62 | KEY_NAME_DELIMITER = '||' # Used to hold arbitrary strings in key names. 63 | # Should not be contained in derived class key names. 
64 | 65 | MAX_ENTITY_SEARCH_PHRASES = datastore._MAX_INDEXED_PROPERTIES - 1 66 | 67 | SEARCH_PHRASE_MIN_LENGTH = 4 68 | 69 | STOP_WORDS = frozenset([ 70 | 'a', 'about', 'according', 'accordingly', 'affected', 'affecting', 'after', 71 | 'again', 'against', 'all', 'almost', 'already', 'also', 'although', 72 | 'always', 'am', 'among', 'an', 'and', 'any', 'anyone', 'apparently', 'are', 73 | 'arise', 'as', 'aside', 'at', 'away', 'be', 'became', 'because', 'become', 74 | 'becomes', 'been', 'before', 'being', 'between', 'both', 'briefly', 'but', 75 | 'by', 'came', 'can', 'cannot', 'certain', 'certainly', 'could', 'did', 'do', 76 | 'does', 'done', 'during', 'each', 'either', 'else', 'etc', 'ever', 'every', 77 | 'following', 'for', 'found', 'from', 'further', 'gave', 'gets', 'give', 78 | 'given', 'giving', 'gone', 'got', 'had', 'hardly', 'has', 'have', 'having', 79 | 'here', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'itself', 80 | 'just', 'keep', 'kept', 'knowledge', 'largely', 'like', 'made', 'mainly', 81 | 'make', 'many', 'might', 'more', 'most', 'mostly', 'much', 'must', 'nearly', 82 | 'necessarily', 'neither', 'next', 'no', 'none', 'nor', 'normally', 'not', 83 | 'noted', 'now', 'obtain', 'obtained', 'of', 'often', 'on', 'only', 'or', 84 | 'other', 'our', 'out', 'owing', 'particularly', 'past', 'perhaps', 'please', 85 | 'poorly', 'possible', 'possibly', 'potentially', 'predominantly', 'present', 86 | 'previously', 'primarily', 'probably', 'prompt', 'promptly', 'put', 87 | 'quickly', 'quite', 'rather', 'readily', 'really', 'recently', 'regarding', 88 | 'regardless', 'relatively', 'respectively', 'resulted', 'resulting', 89 | 'results', 'said', 'same', 'seem', 'seen', 'several', 'shall', 'should', 90 | 'show', 'showed', 'shown', 'shows', 'significantly', 'similar', 'similarly', 91 | 'since', 'slightly', 'so', 'some', 'sometime', 'somewhat', 'soon', 92 | 'specifically', 'state', 'states', 'strongly', 'substantially', 93 | 'successfully', 'such', 'sufficiently', 
class SearchIndex(db.Model):
    """Base model for the full text index entities of a parent entity.

    Index entities are written as children of the entity they index (see
    put_index).  The key name of an index entity packs three fields
    separated by KEY_NAME_DELIMITER:

        <parent kind> <parent id or name> || <index number> || <title>

    so that a keys-only query can recover a display title without fetching
    any entity.
    """

    @staticmethod
    def get_index_key_name(parent, index_num=1):
        """Builds the key name for one index entity of `parent`.

        Args:
          parent: Saved model instance being indexed; its key() is read.
          index_num: Ordinal of this index entity among the parent's
            index entities.

        Returns:
          String '<kind> <id_or_name>||<index_num>||<title>'.  The title is
          the value of the property named by parent.INDEX_TITLE_FROM_PROP
          when that is declared and non-empty; otherwise it falls back to
          the '<kind> <id_or_name>' label.
        """
        key = parent.key()
        title = key.kind() + ' ' + str(key.id_or_name())
        uniq_key = title + KEY_NAME_DELIMITER + str(index_num)
        if hasattr(parent, 'INDEX_TITLE_FROM_PROP'):
            logging.debug("Getting key name from property '%s'", parent.INDEX_TITLE_FROM_PROP)
            if hasattr(parent, parent.INDEX_TITLE_FROM_PROP):
                title = getattr(parent, parent.INDEX_TITLE_FROM_PROP) or title
        return uniq_key + KEY_NAME_DELIMITER + title

    @staticmethod
    def get_title(key_name=''):
        """Extracts the title field from an index entity key name.

        Args:
          key_name: Key name produced by get_index_key_name().

        Returns:
          The title string, or 'Unknown Title' if the key name does not
          contain all three fields.
        """
        # Split at most twice: the title is the last field and may itself
        # contain the delimiter, in which case an unbounded split would
        # truncate it at the first embedded '||'.
        frags = key_name.split(KEY_NAME_DELIMITER, 2)
        if len(frags) < 3:
            return 'Unknown Title'
        return frags[2]

    @staticmethod
    def get_index_num(key_name=''):
        """Extracts the index number field from an index entity key name.

        Args:
          key_name: Key name produced by get_index_key_name().

        Returns:
          The index number as a string; '1' if the field is missing.
        """
        frags = key_name.split(KEY_NAME_DELIMITER, 2)
        if len(frags) < 2:
            return '1'
        return frags[1]

    @classmethod
    def put_index(cls, parent, phrases, index_num=1):
        """Writes (or overwrites) one index entity as a child of `parent`.

        Args:
          parent: Saved model instance being indexed.
          phrases: List of search phrases to store on the index entity.
          index_num: Ordinal of this index entity for the parent.

        Returns:
          The datastore key returned by put() for the index entity.
        """
        parent_key = parent.key()
        args = {'key_name': cls.get_index_key_name(parent, index_num),
                'parent': parent_key, 'parent_kind': parent_key.kind(),
                'phrases': phrases}
        return cls(**args).put()
191 | 192 | You can set a class variable INDEX_ONLY to a list of property names 193 | for indexing. If INDEX_ONLY is not None, only those properties named 194 | in the list will be indexed. 195 | 196 | Because most search phrase lists generated from an entity will be under 197 | the approximately 5000 indexed property limit, you can make indexing 198 | more efficient by setting INDEX_USES_MULTI_ENTITIES to False if you know 199 | your indexed content will be relatively small (or you don't care about 200 | some false negatives). When INDEX_USES_MULTI_ENTITIES is True (default), 201 | there is slight overhead on every indexing operation because 202 | we must query for all index entities and delete unused ones. In the 203 | case of a single index entity, it can be simply overwritten. 204 | 205 | The enqueue_indexing() method should be called after your model is created or 206 | edited: 207 | 208 | myPage = Page(author_name='John Doe', content='My amazing content!') 209 | myPage.put() 210 | myPage.enqueue_indexing(url='/tasks/searchindexing') 211 | 212 | Note that a url must be included that corresponds with the url mapped 213 | to the search.SearchIndexing handler. 214 | 215 | You can limit the properties indexed by passing in a list of 216 | property names: 217 | 218 | myPage.enqueue_indexing(url='/foo', only_index=['content']) 219 | 220 | If you want to risk getting a timeout during indexing, you could 221 | index immediately after putting your model and forego task queueing: 222 | 223 | myPage.put() 224 | myPage.index() 225 | 226 | After your model has been indexed, you may use the search() method: 227 | 228 | Page.search('search phrase') # -> Returns Page entities 229 | Page.search('stuff', keys_only=True) # -> Returns (key, title) tuples 230 | 231 | In the case of multi-word search phrases like the first example above, 232 | the search will first list keys that match the full phrase and then 233 | list keys that match the AND of individual keywords. 
@staticmethod
def full_text_search(phrase, limit=10,
                     kind=None,
                     stemming=INDEX_STEMMING,
                     multi_word_literal=INDEX_MULTI_WORD):
    """Queries search indices for phrases using a merge-join.

    The search runs in up to two passes.  First, if the phrase has more
    than one keyword and multi_word_literal is set, index entities holding
    the literal multi-word phrase(s) are fetched.  Then, if fewer than
    `limit` keys were found, index entities holding ALL of the individual
    keywords (those of at least SEARCH_PHRASE_MIN_LENGTH characters) are
    appended.

    Args:
      phrase: String.  Search phrase.
      limit: Maximum number of results to return.
      kind: String.  Returned keys/entities are restricted to this kind.
      stemming: If True, query StemmedIndex with stemmed terms; otherwise
        query LiteralIndex with the terms as typed.
      multi_word_literal: If True, try literal two/three-word phrase
        matches before falling back to the keyword AND.

    Returns:
      A list of (key, title) tuples corresponding to the indexed entities.
      Multi-word literal matches are returned first.  An empty list is
      returned when the phrase yields no usable search terms.

    TODO -- Should provide feedback if input search phrase has stop words, etc.
    """
    index_keys = []
    keywords = PUNCTUATION_REGEX.sub(' ', phrase).lower().split()
    if stemming:
        stemmer = Stemmer.Stemmer('english')
        klass = StemmedIndex
    else:
        stemmer = None
        klass = LiteralIndex

    if len(keywords) > 1 and multi_word_literal:
        # Try to match literal multi-word phrases first
        if len(keywords) == 2:
            search_phrases = [' '.join(keywords)]
        else:
            # Three-word windows may have a stop word only in the middle,
            # mirroring how phrases are generated at indexing time.
            search_phrases = [
                ' '.join(keywords[pos:pos + 3])
                for pos in range(len(keywords) - 2)
                if keywords[pos] not in STOP_WORDS
                and keywords[pos + 2] not in STOP_WORDS]
        # Only query when there is at least one phrase filter; an
        # unfiltered query would return arbitrary index entities.
        if search_phrases:
            query = klass.all(keys_only=True)
            for search_phrase in search_phrases:
                if stemming:
                    search_phrase = stemmer.stemWord(search_phrase)
                query = query.filter('phrases =', search_phrase)
            if kind:
                query = query.filter('parent_kind =', kind)
            index_keys = query.fetch(limit=limit)

    if len(index_keys) < limit:
        new_limit = limit - len(index_keys)
        keywords = [word for word in keywords
                    if len(word) >= SEARCH_PHRASE_MIN_LENGTH]
        # Same guard as above: with no keyword left (e.g. the query was a
        # single three-letter word) there is nothing to match against.
        if keywords:
            if stemming:
                keywords = stemmer.stemWords(keywords)
            query = klass.all(keys_only=True)
            for keyword in keywords:
                query = query.filter('phrases =', keyword)
            if kind:
                query = query.filter('parent_kind =', kind)
            single_word_matches = [key for key in query.fetch(limit=new_limit)
                                   if key not in index_keys]
            index_keys.extend(single_word_matches)

    return [(key.parent(), SearchIndex.get_title(key.name())) for key in index_keys]
@classmethod
def get_search_phraseset(cls, text):
    """Returns set of phrases, including two and three adjacent word phrases
    not spanning stop words or punctuation that precedes/is inside a word.

    Args:
      text: String with punctuation.

    Returns:
      A set of search terms that aren't stop words and meet length
      requirement.  Set includes phrases of adjacent words that
      aren't stop words.  (Stop words are allowed in middle of three-word
      phrases like "Statue of Liberty".)  Returns an empty set for empty
      or None text.

    >>> Searchable.get_search_phraseset('You look through rosy-colored glasses.')
    set(['look through rosy', 'rosy colored', 'colored', 'colored glasses', 'rosy', 'rosy colored glasses', 'glasses', 'look'])
    >>> Searchable.get_search_phraseset('I saw the Statue of Liberty.')
    set(['saw the statue', 'statue of liberty', 'liberty', 'statue'])
    >>> Searchable.get_search_phraseset('Recalling friends, past and present.')
    set(['recalling', 'recalling friends', 'friends'])
    """
    if not text:
        return set()

    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    text = text.lower()
    phrases = []
    two_words = []
    # Sliding three-word window, pre-padded so it always has length 3
    # after an append.  three_words_no_stop holds one flag per window
    # slot saying whether that slot is a real, non-stop word.
    three_words = ['', '']
    three_words_no_stop = [False, False]
    text = text.replace('-', ' ')   # Hyphenated words are indexed separately.
    fragments = text.split()
    for frag in fragments:
        word, replaced = PUNCTUATION_REGEX.subn('', frag)
        # A single trailing punctuation mark (e.g. "friends,") does not
        # break the running phrase; any other punctuation does.
        not_end_punctuation = (replaced > 1 or frag[-1] not in string.punctuation)
        if not word or (replaced and not_end_punctuation):
            two_words = []
            three_words = ['', '']
            # Reset the flags together with the window; otherwise stale
            # True flags pair with the '' padding and emit junk phrases
            # such as ' bad ugly'.
            three_words_no_stop = [False, False]
            if not word:
                # Fragment was punctuation only: nothing to index.
                continue
        three_words.append(word)    # We allow stop words in middle
        if word in STOP_WORDS:
            two_words = []
            three_words_no_stop.append(False)
        else:
            two_words.append(word)
            three_words_no_stop.append(True)
            if len(word) >= SEARCH_PHRASE_MIN_LENGTH:
                phrases.append(word)
            if len(two_words) == 2:
                phrases.append(' '.join(two_words))
                del two_words[0]
            # Emit the window only if its first slot is a real non-stop
            # word (its last slot is, since we are in this branch).
            if len(three_words) == 3 and three_words_no_stop[0]:
                phrases.append(' '.join(three_words))
        del three_words[0]
        del three_words_no_stop[0]
    return set(phrases)
def get_search_phrases(self, indexing_func=None):
    """Returns search phrases from properties in a given Model instance.

    Args (optional):
      indexing_func: A function that takes a string and returns a set of
        keywords or phrases.  It can be passed in to allow more customized
        search phrase generation.  Defaults to get_search_phraseset when
        INDEX_MULTI_WORD is true, else get_simple_search_phraseset.

    Returns:
      A list of unique phrases (stemmed if INDEX_STEMMING) gathered from
      every indexable string or string-list property.

    Three model variables influence the output of this method:
      INDEX_ONLY: If None, all indexable properties are indexed.
        If a list of property names, only those properties are indexed.
      INDEX_MULTI_WORD: Class variable that allows multi-word search
        phrases like "statue of liberty."
      INDEX_STEMMING: Returns stemmed phrases.
    """
    if not indexing_func:
        klass = self.__class__
        if klass.INDEX_MULTI_WORD:
            indexing_func = klass.get_search_phraseset
        else:
            indexing_func = klass.get_simple_search_phraseset
    if self.INDEX_STEMMING:
        stemmer = Stemmer.Stemmer('english')
    phrases = set()
    for prop_name, prop_value in self.properties().items():
        if self.INDEX_ONLY and prop_name not in self.INDEX_ONLY:
            continue
        values = prop_value.get_value_for_datastore(self)
        if not isinstance(values, list):
            values = [values]
        if not values:
            # Empty list property: nothing to index, and values[0] below
            # would raise IndexError.
            continue
        # The first element decides whether the property holds text.
        if (isinstance(values[0], basestring) and
                not isinstance(values[0], datastore_types.Blob)):
            for value in values:
                words = indexing_func(value)
                if self.INDEX_STEMMING:
                    phrases.update(stemmer.stemWords(words))
                else:
                    phrases.update(words)
    return list(phrases)
493 | """ 494 | search_phrases = self.get_search_phrases(indexing_func=indexing_func) 495 | 496 | key = self.key() 497 | klass = StemmedIndex if self.INDEX_STEMMING else LiteralIndex 498 | 499 | if self.__class__.INDEX_USES_MULTI_ENTITIES: 500 | query = klass.all(keys_only=True).ancestor(key) 501 | previous_index_keys = query.fetch(1000) 502 | num_phrases = len(search_phrases) 503 | 504 | start_index = 0 505 | entity_num = 1 # Appended to key name of index entity 506 | index_keys = [] 507 | while (num_phrases > 0): 508 | cur_num_phrases = min(num_phrases, MAX_ENTITY_SEARCH_PHRASES) 509 | end_index = start_index + cur_num_phrases 510 | num_indices = (num_phrases - 1) / MAX_ENTITY_SEARCH_PHRASES + 1 511 | index_key = klass.put_index(parent=self, index_num=entity_num, 512 | phrases=search_phrases[start_index:end_index]) 513 | index_keys.append(index_key) 514 | if self.__class__.INDEX_USES_MULTI_ENTITIES: 515 | start_index = end_index 516 | num_phrases -= cur_num_phrases 517 | entity_num += 1 518 | else: 519 | num_phrases = 0 # Only write one index entity 520 | if self.__class__.INDEX_USES_MULTI_ENTITIES: 521 | delete_keys = [] 522 | for key in previous_index_keys: 523 | if key not in index_keys: 524 | delete_keys.append(key) 525 | db.delete(delete_keys) 526 | 527 | def enqueue_indexing(self, url, only_index=None): 528 | """Adds an indexing task to the default task queue. 529 | 530 | Args: 531 | url: String. The url associated with LiteralIndexing handler. 532 | only_index: List of strings. Restricts indexing to these prop names. 
class SearchIndexing(webapp.RequestHandler):
    """Handler for full text indexing task.

    Expects the POST parameters written by Searchable.enqueue_indexing():
      key: str() form of the datastore key of the entity to index.
      only_index: Optional space-separated property names restricting
        which properties are indexed.
    """
    def post(self):
        key_str = self.request.get('key')
        only_index_str = self.request.get('only_index')
        if not key_str:
            return
        entity = db.get(db.Key(key_str))
        if not entity:
            self.response.set_status(200)   # Clear task because it's a bad key
            return
        if only_index_str:
            # enqueue_indexing() joins the names with spaces; commas are
            # also accepted since that is what this handler used to split on.
            only_index = only_index_str.replace(',', ' ').split()
            if only_index:
                # Shadow the class-level INDEX_ONLY on this instance only;
                # get_search_phrases() reads it through self.
                entity.INDEX_ONLY = only_index
        entity.index()
def setup():
    """Seed the environment variables needed before the tests run.

    This is necessary to prevent an exception from being thrown by the
    users API. Eventually, it should be incorporated into the NoseGAE plugin.
    """
    os.environ.update({
        'AUTH_DOMAIN': 'example.org',
        'USER_EMAIL': '',
    })
Katz 6 | # Website/Contact: http://www.billkatz.com 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to 10 | # deal in the Software without restriction, including without limitation 11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, 12 | # and/or sell copies of the Software, and to permit persons to whom the 13 | # Software is furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | # DEALINGS IN THE SOFTWARE. 25 | 26 | import re 27 | import os 28 | 29 | LOREM_IPSUM = """ 30 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, 31 | sed do eiusmod tempor incididunt ut labore et dolore magna 32 | aliqua. Ut enim ad minim veniam, quis nostrud exercitation 33 | ullamco laboris nisi ut aliquip ex ea commodo consequat. 34 | Duis aute irure dolor in reprehenderit in voluptate velit 35 | esse cillum dolore eu fugiat nulla pariatur. Excepteur sint 36 | occaecat cupidatat non proident, sunt in culpa qui officia 37 | deserunt mollit anim id est laborum. Encrusted. 38 | """ 39 | 40 | INFLECTION_TEST = """ 41 | Guido ran up slippery ruby-encrusted monoliths in search of 42 | the serpentine mascot. The pythonic creatures skulked away. 43 | How quickly did they forget their master? 
def clear_datastore(app_id='billkatz-test'):
    """Clear datastore.  Can be used between tests to ensure empty datastore.

    See code.google.com/p/nose-gae/issues/detail?id=16

    Args:
      app_id: String.  The app id passed to DatastoreFileStub; it should
        match the app id in your app.yaml.  Defaults to this project's id.
    """
    # Start from a brand new stub map, then register a new datastore stub.
    apiproxy_stub_map.apiproxy = apiproxy_stub_map.APIProxyStubMap()
    # NOTE(review): the literal '/dev/null' is kept on purpose; the SDK stub
    # appears to special-case that exact string -- confirm before replacing
    # it with something like os.devnull.
    stub = datastore_file_stub.DatastoreFileStub(app_id, '/dev/null', '/dev/null')
    apiproxy_stub_map.apiproxy.RegisterStub('datastore_v3', stub)
class TestLoremIpsum:
    """Literal (non-stemmed) search over two NoninflectedPage entities."""

    def setup(self):
        # Index each fixture in turn and check that every index() call adds
        # exactly one LiteralIndex entity.
        clear_datastore()
        fixtures = (
            ('John Doe', LOREM_IPSUM),
            ('Jon Favreau', 'A director that works well with writers.'),
        )
        indexed = 0
        for author, text in fixtures:
            doc = NoninflectedPage(author_name=author, content=text)
            doc.put()
            doc.index()
            indexed += 1
            assert search.LiteralIndex.all().count() == indexed

    def teardown(self):
        pass

    def test_only_index(self):
        # Only 'content' is indexed, so the author name must not match.
        assert not NoninflectedPage.search('John')
        assert NoninflectedPage.search('lorem ipsum')

    def test_two_word_search(self):
        hits = NoninflectedPage.search('LoReM IpSuM')
        assert hits
        assert len(hits) == 1
        text = hits[0].content
        assert re.search(r'lorem', text, re.IGNORECASE)
        assert re.search(r'ipsum', text, re.IGNORECASE)

    def test_key_only_search(self):
        # keys_only results are checked as a list of (key, title) pairs.
        key_list = NoninflectedPage.search('LoReM ipsum', keys_only=True)
        assert isinstance(key_list, list)
        assert len(key_list) == 1
        found_key, found_title = key_list[0][0], key_list[0][1]
        assert isinstance(found_key, db.Key)
        assert isinstance(found_title, basestring)

    def test_search_miss(self):
        assert not NoninflectedPage.search('NowhereInDoc')
        hits = NoninflectedPage.search('director')
        assert hits
        # The first hit must be the director page, not the lorem ipsum one.
        text = hits[0].content
        has_lorem = re.search(r'lorem', text, re.IGNORECASE)
        has_ipsum = re.search(r'ipsum', text, re.IGNORECASE)
        assert not (has_lorem or has_ipsum)

    def test_not_inflected(self):
        # Without stemming the stem 'encrust' must miss; the exact word hits.
        assert not NoninflectedPage.search('encrust')
        assert NoninflectedPage.search('encrusted')
class TestBigIndex:
    """Checks indexing of a document too large for one index entity."""

    def setup(self):
        clear_datastore()

    def test_multientity_index(self):
        """A big page needs several index entities; re-indexing the same key
        with a short text must leave exactly one.
        """
        import codecs
        curdir = os.path.abspath(os.path.dirname(__file__))
        bigtextfile = os.path.join(curdir, 'roget.txt')
        bigfile = codecs.open(bigtextfile, 'r', 'utf-8')
        try:
            bigtext = bigfile.read()
        finally:
            bigfile.close()     # Do not leak the handle if read() fails

        # Four times the per-entity phrase limit, taken from the big file.
        words_to_use = 4 * search.MAX_ENTITY_SEARCH_PHRASES
        words = bigtext.split()

        # The flag lives on the class, so put the old value back afterwards;
        # otherwise every later test would see the changed setting.
        previous_setting = Page.INDEX_USES_MULTI_ENTITIES
        Page.INDEX_USES_MULTI_ENTITIES = True
        try:
            page = Page(key_name="Foo", content=' '.join(words[0:words_to_use]))
            page.put()
            page.index()
            assert search.StemmedIndex.all().count() > 1

            # Same key name, much shorter content: the index entities that
            # are no longer needed have to be removed.
            page = Page(key_name="Foo", content=INFLECTION_TEST)
            page.put()
            page.index()
            assert search.StemmedIndex.all().count() == 1
        finally:
            Page.INDEX_USES_MULTI_ENTITIES = previous_setting
class TestMultiWordSearch:
    """Multi-word queries over three stemmed Page entities."""

    def setup(self):
        # Index the fixtures one at a time and check that each index() call
        # adds exactly one StemmedIndex entity.
        clear_datastore()
        fixtures = [
            dict(key_name='doetext', author_name='John Doe',
                 content=INFLECTION_TEST),
            dict(key_name="statuetext",
                 author_name='Other Guy', content="""
    This is the time for all good python programmers to check,
    to test, to go forward and throw junk at the code, and in
    so doing, try to find errors.
        -- Unheralded inscription at base of Statue of Liberty
    """),
            dict(key_name="statuetext2",
                 author_name='Another Guy', content="""
    I have seen a statue and it declares there should be
    liberty in the world.
    """),
        ]
        for position, fields in enumerate(fixtures):
            page = Page(**fields)
            page.put()
            page.index()
            assert search.StemmedIndex.all().count() == position + 1

    def test_multiword_search_order(self):
        hits = Page.search('statue of liberty')
        assert len(hits) == 2
        names = [hit.key().name() for hit in hits]
        print("Returned pages: %s" % names)
        assert names[0] == u'statuetext'
        assert names[1] == u'statuetext2'

    def test_multiword_search_fail(self):
        assert not Page.search('statue of liberty biggy word')

    def test_multiword_search_and(self):
        # Only one of the statue pages also mentions python.
        hits = Page.search('statue of liberty python')
        assert len(hits) == 1
        assert hits[0].key().name() == u'statuetext'

    def test_two_word_search(self):
        hits = Page.search('ornately narrated')
        assert len(hits) == 1
        assert hits[0].key().name() == u'doetext'