├── .gitignore
├── .gitmodules
├── app.yaml
├── index.yaml
├── main.py
├── search
└── __init__.py
├── static
├── favicon.ico
└── robots.txt
└── tests
├── HOW-TO-TEST
├── __init__.py
├── roget.txt
├── test_app.py
└── test_search.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .DS_Store
3 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "search/pyporter2"]
2 | path = search/pyporter2
3 | url = git://github.com/mdirolf/pyporter2.git
4 |
--------------------------------------------------------------------------------
/app.yaml:
--------------------------------------------------------------------------------
1 | application: billkatz-test
2 | version: 7-16-2009-1
3 | runtime: python
4 | api_version: 1
5 |
6 | handlers:
7 | - url: .*/favicon\.ico
8 | static_files: static/favicon.ico
9 | upload: static/favicon.ico
10 |
11 | - url: /robots\.txt
12 | static_files: static/robots.txt
13 | upload: static/robots.txt
14 |
15 | - url: /static
16 | static_dir: static
17 |
18 | - url: .*
19 | script: main.py
20 |
21 | skip_files: |
22 | ^(.*/)?(
23 | (app\.yaml)|
24 | (app\.yml)|
25 | (index\.yaml)|
26 | (index\.yml)|
27 | (#.*#)|
28 | (.*~)|
29 | (.*\.py[co])|
30 | (.*/RCS/.*)|
31 | (\..*)|
32 | (tests/.*)
33 | )$
34 |
--------------------------------------------------------------------------------
/index.yaml:
--------------------------------------------------------------------------------
1 | indexes:
2 |
3 | - kind: StemmedIndex
4 | ancestor: yes
5 |
6 | - kind: StemmedIndex
7 | properties:
8 | - name: parent_kind
9 | - name: phrases
10 |
11 | - kind: LiteralIndex
12 | ancestor: yes
13 |
14 | - kind: LiteralIndex
15 | properties:
16 | - name: parent_kind
17 | - name: phrases
18 |
19 | # AUTOGENERATED
20 |
21 | # This index.yaml is automatically updated whenever the dev_appserver
22 | # detects that a new type of query is run. If you want to manage the
23 | # index.yaml file manually, remove the above marker line (the line
24 | # saying "# AUTOGENERATED"). If you want to manage some indexes
25 | # manually, move them above the marker line. The index.yaml file is
26 | # automatically uploaded to the admin console when you next deploy
27 | # your application using appcfg.py.
28 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # The MIT License
4 | #
5 | # Copyright (c) 2009 William T. Katz
6 | # Website/Contact: http://www.billkatz.com
7 | #
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to
10 | # deal in the Software without restriction, including without limitation
11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 | # and/or sell copies of the Software, and to permit persons to whom the
13 | # Software is furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in
16 | # all copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | # DEALINGS IN THE SOFTWARE.
25 |
26 | """A super simple Google App Engine text posting app.
27 |
28 | Logged in visitors can add some text and search for keywords across all
29 | added pages. It demos a simple full text search module.
30 | """
31 | __author__ = 'William T. Katz'
32 |
33 | import cgi
34 | import logging
35 |
36 | from google.appengine.api import users
37 | from google.appengine.ext import db
38 | from google.appengine.ext import webapp
39 | from google.appengine.ext.webapp.util import run_wsgi_app
40 |
41 | # The following are necessary for full-text search demo
42 | import search
43 | INDEXING_URL = '/tasks/searchindexing'
44 |
class Page(search.Searchable, db.Model):
    """A user-submitted text page that can be full-text searched.

    Searchable is mixed in so instances gain index()/enqueue_indexing()
    and the class gains search().
    """
    user = db.UserProperty()
    title = db.StringProperty()
    content = db.TextProperty()
    # auto_now_add stamps the entity once, when it is first stored.  The
    # previous auto_now=True re-stamped the value on every put(), so any
    # later save silently overwrote the creation time.
    created = db.DateTimeProperty(auto_now_add=True)
    # Name of the property whose value is stowed in index key names so that
    # key-only searches can display a title.
    INDEX_TITLE_FROM_PROP = 'title'
    # INDEX_USES_MULTI_ENTITIES = False
52 |
53 | class SimplePage(webapp.RequestHandler):
54 | def render(self, html):
55 | user = users.get_current_user()
56 | page = '
Add Page | '
57 | if user:
58 | page += 'Logged in as %s ' % (user.nickname())
59 | logout_url = users.create_logout_url(self.request.uri)
60 | page += '|
Logout' % (logout_url)
61 | else:
62 | login_url = users.create_login_url(self.request.uri)
63 | page += '
Google login' % (login_url)
64 | page += """
65 |
66 | Full Text Search Test
67 | This app tests a full text search module for Google App Engine.
68 | Once you are logged in, you can add text pages that will be indexed via
69 | Task Queue API tasks. The search indices are efficiently stored using
70 | "Relation Index" entities as described in
71 |
72 | this Google I/O talk.
73 | My blog has an
74 |
75 | article on this appengine-search module. You can download the code from the
76 | appengine-search
77 | github repository under a liberal open source (MIT) license.
78 | '
93 | page += html
94 | page += ''
95 | self.response.out.write(page)
96 |
97 | class MainPage(SimplePage):
98 | def get(self):
99 | user = users.get_current_user()
100 | if not user:
101 | html = 'Please login to add a page.
'
102 | else:
103 | import time
104 | time_string = time.strftime('Page submitted %X on %x')
105 | html = """
106 | Add a text page below:
107 |
116 | """
117 | self.render(html)
118 |
119 | def post(self):
120 | user = users.get_current_user()
121 | content = self.request.get('content')
122 | title = self.request.get('title')
123 | if not user:
124 | self.redirect('/?msg=You+must+be+logged+in')
125 | elif not content:
126 | self.redirect('/')
127 | else:
128 | page = Page(content=content, title=title, user=user)
129 | page.put()
130 | page.enqueue_indexing(url=INDEXING_URL)
131 | html = "Thanks for entering the following text:
"
132 | html += "%s
" % (cgi.escape(content))
133 | self.render(html)
134 |
135 | class SearchPage(SimplePage):
136 | def get(self):
137 | submitbtn = self.request.get('submitbtn')
138 | phrase = self.request.get('phrase')
139 | html = "'" + phrase + "' was found on these pages:
"
140 | if submitbtn == 'Return Keys Only':
141 | key_list = Page.search(phrase, keys_only=True)
142 | for key_and_title in key_list:
143 | html += "" % key_and_title[1]
144 | else:
145 | pages = Page.search(phrase)
146 | for page in pages:
147 | html += "Title: %s
User: %s, Created: %s
%s
" \
148 | % (page.title, str(page.user), str(page.created), cgi.escape(page.content))
149 | self.render(html)
150 |
# URL routing table: the two user-facing pages plus the task queue endpoint
# that performs search indexing (INDEXING_URL must match the url passed to
# Page.enqueue_indexing()).
# NOTE(review): debug=True is passed to webapp here -- confirm whether that
# is wanted outside of development.
application = webapp.WSGIApplication([
    ('/', MainPage),
    ('/search', SearchPage),
    (INDEXING_URL, search.SearchIndexing)], debug=True)
155 |
def main():
    """Hands the module-level WSGI application to run_wsgi_app()."""
    run_wsgi_app(application)

if __name__ == '__main__':
    main()
161 |
--------------------------------------------------------------------------------
/search/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # The MIT License
4 | #
5 | # Copyright (c) 2009 William T. Katz
6 | # Website/Contact: http://www.billkatz.com
7 | #
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to
10 | # deal in the Software without restriction, including without limitation
11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 | # and/or sell copies of the Software, and to permit persons to whom the
13 | # Software is furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in
16 | # all copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | # DEALINGS IN THE SOFTWARE.
25 |
26 | """A simple full-text search system
27 |
28 | This module lets you designate particular entities for full text search
29 | indexing. It uses the Task Queue API to schedule search indexing and
30 | relation index entities (as described in Brett Slatkin's 'Building Scalable,
31 | Complex Apps on App Engine' talk at Google I/O, 2009).
32 |
33 | The keyword extraction code was slightly modified from Ryan Barrett's
34 | SearchableModel implementation.
35 | """
36 | __author__ = 'William T. Katz'
37 |
38 | import logging
39 | import re
40 | import string
41 | import sys
42 |
43 | from google.appengine.api import datastore
44 | from google.appengine.api import datastore_types
45 | from google.appengine.ext import db
46 | from google.appengine.ext import webapp
47 |
48 | # TODO -- This will eventually be moved out of labs namespace
49 | from google.appengine.api.labs import taskqueue
50 |
51 | # Use python port of Porter2 stemmer.
52 | from search.pyporter2 import Stemmer
53 |
class Error(Exception):
    """Base search module error type.

    Other exceptions defined by this module derive from it, so callers can
    catch search-specific failures with a single except clause.
    """
56 |
class IndexTitleError(Error):
    """Raised when INDEX_TITLE_FROM_PROP or title alterations are incorrect.

    Searchable.indexed_title_changed() raises it when the model does not
    declare INDEX_TITLE_FROM_PROP.
    """
59 |
60 | # Following module-level constants are cached in instance
61 |
62 | KEY_NAME_DELIMITER = '||' # Used to hold arbitrary strings in key names.
63 | # Should not be contained in derived class key names.
64 |
65 | MAX_ENTITY_SEARCH_PHRASES = datastore._MAX_INDEXED_PROPERTIES - 1
66 |
67 | SEARCH_PHRASE_MIN_LENGTH = 4
68 |
69 | STOP_WORDS = frozenset([
70 | 'a', 'about', 'according', 'accordingly', 'affected', 'affecting', 'after',
71 | 'again', 'against', 'all', 'almost', 'already', 'also', 'although',
72 | 'always', 'am', 'among', 'an', 'and', 'any', 'anyone', 'apparently', 'are',
73 | 'arise', 'as', 'aside', 'at', 'away', 'be', 'became', 'because', 'become',
74 | 'becomes', 'been', 'before', 'being', 'between', 'both', 'briefly', 'but',
75 | 'by', 'came', 'can', 'cannot', 'certain', 'certainly', 'could', 'did', 'do',
76 | 'does', 'done', 'during', 'each', 'either', 'else', 'etc', 'ever', 'every',
77 | 'following', 'for', 'found', 'from', 'further', 'gave', 'gets', 'give',
78 | 'given', 'giving', 'gone', 'got', 'had', 'hardly', 'has', 'have', 'having',
79 | 'here', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'itself',
80 | 'just', 'keep', 'kept', 'knowledge', 'largely', 'like', 'made', 'mainly',
81 | 'make', 'many', 'might', 'more', 'most', 'mostly', 'much', 'must', 'nearly',
82 | 'necessarily', 'neither', 'next', 'no', 'none', 'nor', 'normally', 'not',
83 | 'noted', 'now', 'obtain', 'obtained', 'of', 'often', 'on', 'only', 'or',
84 | 'other', 'our', 'out', 'owing', 'particularly', 'past', 'perhaps', 'please',
85 | 'poorly', 'possible', 'possibly', 'potentially', 'predominantly', 'present',
86 | 'previously', 'primarily', 'probably', 'prompt', 'promptly', 'put',
87 | 'quickly', 'quite', 'rather', 'readily', 'really', 'recently', 'regarding',
88 | 'regardless', 'relatively', 'respectively', 'resulted', 'resulting',
89 | 'results', 'said', 'same', 'seem', 'seen', 'several', 'shall', 'should',
90 | 'show', 'showed', 'shown', 'shows', 'significantly', 'similar', 'similarly',
91 | 'since', 'slightly', 'so', 'some', 'sometime', 'somewhat', 'soon',
92 | 'specifically', 'state', 'states', 'strongly', 'substantially',
93 | 'successfully', 'such', 'sufficiently', 'than', 'that', 'the', 'their',
94 | 'theirs', 'them', 'then', 'there', 'therefore', 'these', 'they', 'this',
95 | 'those', 'though', 'through', 'throughout', 'to', 'too', 'toward', 'under',
96 | 'unless', 'until', 'up', 'upon', 'use', 'used', 'usefully', 'usefulness',
97 | 'using', 'usually', 'various', 'very', 'was', 'we', 'were', 'what', 'when',
98 | 'where', 'whether', 'which', 'while', 'who', 'whose', 'why', 'widely',
99 | 'will', 'with', 'within', 'without', 'would', 'yet', 'you'])
100 |
101 | PUNCTUATION_REGEX = re.compile('[' + re.escape(string.punctuation) + ']')
102 |
103 | # Rather than have an extra property name to distinguish stemmed from
104 | # non-stemmed index entities, we use different Models that are
105 | # identical to a base index entity.
class SearchIndex(db.Model):
    """Holds full text indexing on an entity.

    This model is used by the Searchable mix-in to hold full text
    indexes of a parent entity.

    Index entity key names have the form

        <kind> <id or name>||<index number>||<title>

    so a key-only query can recover the index number and a display title
    without fetching any entity.
    """

    @staticmethod
    def get_index_key_name(parent, index_num=1):
        """Builds the key name for one index entity of ``parent``.

        Args:
          parent: Model instance being indexed; parent.key() must work.
          index_num: Ordinal of this index entity (int or string).

        Returns:
          '<kind> <id_or_name>||<index_num>||<title>'.  The title is the
          value of the property named by parent.INDEX_TITLE_FROM_PROP when
          that is declared and the value is truthy; otherwise it falls back
          to '<kind> <id_or_name>'.
        """
        key = parent.key()
        title = key.kind() + ' ' + str(key.id_or_name())
        uniq_key = title + KEY_NAME_DELIMITER + str(index_num)
        if hasattr(parent, 'INDEX_TITLE_FROM_PROP'):
            logging.debug("Getting key name from property '%s'",
                          parent.INDEX_TITLE_FROM_PROP)
            if hasattr(parent, parent.INDEX_TITLE_FROM_PROP):
                title = getattr(parent, parent.INDEX_TITLE_FROM_PROP) or title
        return uniq_key + KEY_NAME_DELIMITER + title

    @staticmethod
    def get_title(key_name=''):
        """Extracts the title stored in an index entity key name.

        Args:
          key_name: Key name as produced by get_index_key_name().

        Returns:
          The title portion, or 'Unknown Title' when the key name does not
          have all three delimited parts.
        """
        # Split on the first two delimiters only: the title is free text and
        # may itself contain the delimiter, which previously truncated it.
        frags = key_name.split(KEY_NAME_DELIMITER, 2)
        if len(frags) < 3:
            return 'Unknown Title'
        return frags[2]

    @staticmethod
    def get_index_num(key_name=''):
        """Extracts the index number stored in an index entity key name.

        Args:
          key_name: Key name as produced by get_index_key_name().

        Returns:
          The index number as a string; '1' when the key name has no
          delimiter.
        """
        frags = key_name.split(KEY_NAME_DELIMITER, 2)
        if len(frags) < 2:
            return '1'
        return frags[1]

    @classmethod
    def put_index(cls, parent, phrases, index_num=1):
        """Stores (or overwrites) one index entity as a child of ``parent``.

        Args:
          parent: Model instance being indexed.
          phrases: List of search phrases to store.
          index_num: Ordinal of this index entity for the parent.

        Returns:
          The value returned by put() for the stored index entity.
        """
        parent_key = parent.key()
        args = {'key_name': cls.get_index_key_name(parent, index_num),
                'parent': parent_key,
                'parent_kind': parent_key.kind(),
                'phrases': phrases}
        return cls(**args).put()
146 |
147 |
class LiteralIndex(SearchIndex):
    """Index model for non-inflected search phrases."""
    # Kind of the indexed (parent) entity; searches filter on it to restrict
    # results to one kind.
    parent_kind = db.StringProperty(required=True)
    # Search phrases extracted from the parent entity, stored unstemmed.
    phrases = db.StringListProperty(required=True)
152 |
153 |
class StemmedIndex(SearchIndex):
    """Index model for stemmed (inflected) search phrases."""
    # Kind of the indexed (parent) entity; searches filter on it to restrict
    # results to one kind.
    parent_kind = db.StringProperty(required=True)
    # Search phrases extracted from the parent entity after stemming.
    phrases = db.StringListProperty(required=True)
158 |
159 |
160 | class Searchable(object):
161 | """A class that supports full text indexing and search on entities.
162 |
163 | Add this class to your model's inheritance declaration like this:
164 |
165 | class Page(Searchable, db.Model):
166 | title = db.StringProperty()
167 | author_name = db.StringProperty()
168 | content = db.TextProperty()
169 | INDEX_TITLE_FROM_PROP = 'title'
170 | # INDEX_STEMMING = False
171 | # INDEX_USES_MULTI_ENTITIES = False
172 | # INDEX_MULTI_WORD = False
173 | # INDEX_ONLY = ['content']
174 |
175 | There are a few class variables that can be overridden by your Model.
176 | The settings were made class variables because their use should be
177 | declared at Model definition.
178 |
179 | You can declare a string property to be stowed in index key names by
180 | using the INDEX_TITLE_FROM_PROP variable. This allows you to retrieve
181 | useful labels on key-only searches without doing a get() on the whole
182 | entity.
183 |
184 | Defaults are for searches to use stemming, multiple index entities,
185 | and index all basestring-derived properties. Also, two and three-word
186 | phrases are inserted into the index, which can be disabled by setting
187 | INDEX_MULTI_WORD to False.
188 |
189 | Stemming is on by default but can be toggled off by setting INDEX_STEMMING
190 | to False in your class declaration.
191 |
192 | You can set a class variable INDEX_ONLY to a list of property names
193 | for indexing. If INDEX_ONLY is not None, only those properties named
194 | in the list will be indexed.
195 |
196 | Because most search phrase lists generated from an entity will be under
197 | the approximately 5000 indexed property limit, you can make indexing
198 | more efficient by setting INDEX_USES_MULTI_ENTITIES to False if you know
199 | your indexed content will be relatively small (or you don't care about
200 | some false negatives). When INDEX_USES_MULTI_ENTITIES is True (default),
201 | there is slight overhead on every indexing operation because
202 | we must query for all index entities and delete unused ones. In the
203 | case of a single index entity, it can be simply overwritten.
204 |
205 | The enqueue_indexing() method should be called after your model is created or
206 | edited:
207 |
208 | myPage = Page(author_name='John Doe', content='My amazing content!')
209 | myPage.put()
210 | myPage.enqueue_indexing(url='/tasks/searchindexing')
211 |
212 | Note that a url must be included that corresponds with the url mapped
213 | to the search.SearchIndexing handler.
214 |
215 | You can limit the properties indexed by passing in a list of
216 | property names:
217 |
218 | myPage.enqueue_indexing(url='/foo', only_index=['content'])
219 |
220 | If you want to risk getting a timeout during indexing, you could
221 | index immediately after putting your model and forego task queueing:
222 |
223 | myPage.put()
224 | myPage.index()
225 |
226 | After your model has been indexed, you may use the search() method:
227 |
228 | Page.search('search phrase') # -> Returns Page entities
229 | Page.search('stuff', keys_only=True) # -> Returns Page keys
230 |
231 | In the case of multi-word search phrases like the first example above,
232 | the search will first list keys that match the full phrase and then
233 | list keys that match the AND of individual keywords. Note that when
234 | INDEX_USES_MULTI_ENTITIES is True (default), if a Page's index is spread
235 | over multiple index entities, the keyword AND portion of the search may
236 | fail, i.e., there will be false negative search results.
237 |
238 | You can use the full_text_search() static method to return all entities,
239 | not just a particular kind, that have been indexed:
240 |
241 | Searchable.full_text_search('stuff') # -> Returns any entities
242 | Searchable.full_text_search('stuff', stemming=False)
243 |
244 | Because stemming can be toggled for any particular Model, only entities will
245 | be returned that match indexing style (i.e., stemming on or off).
246 | """
247 |
248 | INDEX_ONLY = None # Can set to list of property names to index.
249 | INDEX_STEMMING = True # Allow stemming to be turned off per subclass.
250 | INDEX_MULTI_WORD = True # Add two and three-word phrases to index.
251 |
252 | # If True, incurs additional query/delete overhead on indexing but works
253 | # around the indexed properties limit (MAX_ENTITY_SEARCH_PHRASES).
254 | INDEX_USES_MULTI_ENTITIES = True
255 |
@staticmethod
def full_text_search(phrase, limit=10,
                     kind=None,
                     stemming=INDEX_STEMMING,
                     multi_word_literal=INDEX_MULTI_WORD):
    """Queries search indices for phrases using a merge-join.

    The search runs in up to two stages.  First, when the input has more
    than one word and multi_word_literal is set, index entities holding the
    literal multi-word phrase(s) are looked up.  Second, if fewer than
    ``limit`` matches were found, index entities containing ALL of the
    individual keywords are appended.

    Args:
      phrase: String.  Search phrase.
      limit: Maximum number of results to return.
      kind: String.  Returned keys/entities are restricted to this kind.
      stemming: If True, query StemmedIndex with stemmed terms; otherwise
          query LiteralIndex.
      multi_word_literal: If True, try multi-word phrase matches first.

    Returns:
      A list of (key, title) tuples corresponding to the indexed entities.
      Multi-word literal matches are returned first.

    TODO -- Should provide feedback if input search phrase has stop words, etc.
    """
    keywords = PUNCTUATION_REGEX.sub(' ', phrase).lower().split()
    if stemming:
        stemmer = Stemmer.Stemmer('english')
        klass = StemmedIndex
    else:
        stemmer = None
        klass = LiteralIndex

    index_keys = []
    if multi_word_literal and len(keywords) > 1:
        # Try to match literal multi-word phrases first.
        if len(keywords) == 2:
            search_phrases = [' '.join(keywords)]
        else:
            # Every three-word window whose first and last words are not
            # stop words (a stop word is allowed in the middle).
            search_phrases = [
                ' '.join(keywords[pos:pos + 3])
                for pos in range(len(keywords) - 2)
                if keywords[pos] not in STOP_WORDS
                and keywords[pos + 2] not in STOP_WORDS]
        # Without at least one phrase filter the query would match every
        # index entity, so skip this stage when nothing is left to match.
        if search_phrases:
            query = klass.all(keys_only=True)
            for search_phrase in search_phrases:
                if stemming:
                    search_phrase = stemmer.stemWord(search_phrase)
                query = query.filter('phrases =', search_phrase)
            if kind:
                query = query.filter('parent_kind =', kind)
            index_keys = query.fetch(limit=limit)

    if len(index_keys) < limit:
        # Stop words and short words are never written to the index, so
        # requiring them here could only produce false negatives.
        keywords = [word for word in keywords
                    if len(word) >= SEARCH_PHRASE_MIN_LENGTH
                    and word not in STOP_WORDS]
        # As above: never run the query with no phrase filter at all.
        if keywords:
            if stemming:
                keywords = stemmer.stemWords(keywords)
            query = klass.all(keys_only=True)
            for keyword in keywords:
                query = query.filter('phrases =', keyword)
            if kind:
                query = query.filter('parent_kind =', kind)
            # Fetch up to ``limit`` so that keys already found by the
            # phrase stage cannot crowd out new matches, then top up.
            for key in query.fetch(limit=limit):
                if len(index_keys) >= limit:
                    break
                if key not in index_keys:
                    index_keys.append(key)

    return [(key.parent(), SearchIndex.get_title(key.name()))
            for key in index_keys]
316 |
@classmethod
def get_simple_search_phraseset(cls, text):
    """Extracts single-word search terms from text.

    Punctuation is replaced by spaces and the text is lower-cased before
    it is split into words.

    Args:
      text: String.

    Returns:
      A set of keywords that aren't stop words and meet length requirement.
      An empty set is returned for empty or None text.

    >>> Searchable.get_simple_search_phraseset('I shall return.')
    set(['return'])
    """
    if not text:
        return set()
    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    tokens = PUNCTUATION_REGEX.sub(' ', text).lower().split()
    return set(token for token in tokens
               if token not in STOP_WORDS
               and len(token) >= SEARCH_PHRASE_MIN_LENGTH)
342 |
@classmethod
def get_search_phraseset(cls, text):
    """Returns set of phrases, including two and three adjacent word phrases
    not spanning punctuation or stop words.

    The text is lower-cased, hyphens become spaces, and the result is
    scanned word by word while sliding two- and three-word windows are
    maintained.

    Args:
      text: String with punctuation.

    Returns:
      A set of search terms that aren't stop words and meet length
      requirement.  Set includes phrases of adjacent words that
      aren't stop words.  (Stop words are allowed in middle of three-word
      phrases like "Statue of Liberty".)  An empty set is returned for
      empty or None text.

    >>> Searchable.get_search_phraseset('You look through rosy-colored glasses.')
    set(['look through rosy', 'rosy colored', 'colored', 'colored glasses', 'rosy', 'rosy colored glasses', 'glasses', 'look'])
    >>> Searchable.get_search_phraseset('I saw the Statue of Liberty.')
    set(['saw the statue', 'statue of liberty', 'liberty', 'statue'])
    >>> Searchable.get_search_phraseset('Recalling friends, past and present.')
    set(['recalling', 'recalling friends', 'friends'])
    """
    if not text:
        return set()
    datastore_types.ValidateString(text, 'text', max_len=sys.maxint)
    text = text.lower().replace('-', ' ')

    phrases = []
    two_words = []
    # The three-word window and its parallel "is not a stop word" flags are
    # always two items long at the top of the loop; they start padded.
    three_words = ['', '']
    three_words_no_stop = [False, False]
    for frag in text.split():
        word, replaced = PUNCTUATION_REGEX.subn('', frag)
        # True when punctuation was stripped from anywhere other than a
        # single trailing character, e.g. "(gamma" or "a.b".
        # NOTE(review): a single trailing mark does not break a phrase, so
        # windows can continue across e.g. a comma -- confirm intended.
        not_end_punctuation = (replaced > 1 or
                               frag[-1] not in string.punctuation)
        if (replaced and not_end_punctuation) or not word:
            # Phrase break.  The flags are reset together with the window;
            # leaving stale True flags behind used to emit junk phrases
            # built from the '' padding (e.g. '  gamma').
            two_words = []
            three_words = ['', '']
            three_words_no_stop = [False, False]
        if not word:
            # The fragment was only punctuation (e.g. '&').  It acts as a
            # break and is never made part of a phrase; previously it
            # produced phrases such as 'salt ' and ' pepper'.
            continue
        three_words.append(word)  # We allow stop words in middle
        if word in STOP_WORDS:
            two_words = []
            three_words_no_stop.append(False)
        else:
            two_words.append(word)
            three_words_no_stop.append(True)
            if len(word) >= SEARCH_PHRASE_MIN_LENGTH:
                phrases.append(word)
            if len(two_words) == 2:
                phrases.append(' '.join(two_words))
                del two_words[0]
            # A three-word phrase needs non-stop words at both ends; the
            # current (last) word is already known not to be a stop word.
            if len(three_words) == 3 and three_words_no_stop[0]:
                phrases.append(' '.join(three_words))
        # Slide both windows forward by one word.
        del three_words[0]
        del three_words_no_stop[0]
    return set(phrases)
399 |
@classmethod
def search(cls, phrase, limit=10, keys_only=False):
    """Searches the indices for entities of this model's kind.

    Delegates to Searchable.full_text_search(), passing this class's kind
    and its INDEX_STEMMING / INDEX_MULTI_WORD settings.

    Args:
      phrase: Search phrase (string)
      limit: Number of entities or keys to return.
      keys_only: If True, return only keys with title of parent entity.

    Returns:
      A list.  If keys_only is True, the list holds (key, title) tuples.
      If keys_only is False, the list holds Model instances.
    """
    matches = Searchable.full_text_search(
        phrase,
        limit=limit,
        kind=cls.kind(),
        stemming=cls.INDEX_STEMMING,
        multi_word_literal=cls.INDEX_MULTI_WORD)
    if not keys_only:
        # Resolve each matching parent key into its entity.
        return [cls.get(parent_key) for parent_key, _title in matches]
    logging.debug("key_list: %s", matches)
    return matches
425 |
def indexed_title_changed(self):
    """Renames index entities for this model to match new title.

    The title is part of each index entity's key name, so renaming means
    writing every index entity again under its new key name and deleting
    the entities left behind under the old names.

    Raises:
      IndexTitleError: The model does not declare INDEX_TITLE_FROM_PROP.
    """
    # Validate first so a misconfigured model fails before any datastore
    # call is made.
    if not hasattr(self, 'INDEX_TITLE_FROM_PROP'):
        raise IndexTitleError('Must declare a property name via INDEX_TITLE_FROM_PROP')
    if self.INDEX_STEMMING:
        klass = StemmedIndex
    else:
        klass = LiteralIndex
    query = klass.all(keys_only=True).ancestor(self.key())
    old_index_keys = query.fetch(1000)
    if not old_index_keys:
        return

    new_keys = []
    # One batch get instead of one datastore round trip per index entity.
    old_indexes = db.get(old_index_keys)
    for old_key, old_index in zip(old_index_keys, old_indexes):
        if old_index is None:
            # Deleted since the key query ran; nothing to carry over.
            continue
        index_num = SearchIndex.get_index_num(old_key.name())
        new_keys.append(klass.put_index(parent=self, index_num=index_num,
                                        phrases=old_index.phrases))
    # When the title did not actually change, put_index() overwrote the
    # entity under the same key, which therefore must not be deleted.
    delete_keys = [key for key in old_index_keys if key not in new_keys]
    if delete_keys:
        db.delete(delete_keys)
442 |
def get_search_phrases(self, indexing_func=None):
    """Returns search phrases from properties in a given Model instance.

    Args (optional):
      indexing_func: A function that takes a string and returns a set of
          keywords or phrases.  Defaults to get_search_phraseset when
          INDEX_MULTI_WORD is set, else get_simple_search_phraseset.

    Note that the indexing_func can be passed in to allow more customized
    search phrase generation.

    Three model variables influence the output of this method:
      INDEX_ONLY: If None, all indexable properties are indexed.
          If a list of property names, only those properties are indexed.
      INDEX_MULTI_WORD: Class variable that allows multi-word search
          phrases like "statue of liberty."
      INDEX_STEMMING: Returns stemmed phrases.

    Returns:
      A list of unique search phrases.
    """
    if not indexing_func:
        klass = self.__class__
        if klass.INDEX_MULTI_WORD:
            indexing_func = klass.get_search_phraseset
        else:
            indexing_func = klass.get_simple_search_phraseset
    stemmer = None
    if self.INDEX_STEMMING:
        stemmer = Stemmer.Stemmer('english')

    phrases = set()
    for prop_name, prop in self.properties().iteritems():
        if self.INDEX_ONLY and prop_name not in self.INDEX_ONLY:
            continue
        values = prop.get_value_for_datastore(self)
        if not isinstance(values, list):
            values = [values]
        if not values:
            # Empty list property: nothing to index.  Inspecting values[0]
            # below would otherwise raise IndexError.
            continue
        # The first element decides whether the property holds text; Blob
        # values are excluded even though they pass the basestring check.
        first = values[0]
        if (not isinstance(first, basestring) or
                isinstance(first, datastore_types.Blob)):
            continue
        for value in values:
            words = indexing_func(value)
            if stemmer is not None:
                words = stemmer.stemWords(words)
            phrases.update(words)
    return list(phrases)
484 |
def index(self, indexing_func=None):
    """Generates or replaces the search index entities for this instance.

    The phrases are written in chunks of at most MAX_ENTITY_SEARCH_PHRASES
    per index entity.  When INDEX_USES_MULTI_ENTITIES is False only the
    first chunk is written.  When it is True, index entities that existed
    before and were not rewritten are deleted.

    Args (optional):
      indexing_func: A function that returns a set of keywords or phrases.

    Note that the indexing_func can be passed in to allow more customized
    search phrase generation.
    """
    search_phrases = self.get_search_phrases(indexing_func=indexing_func)
    if self.INDEX_STEMMING:
        klass = StemmedIndex
    else:
        klass = LiteralIndex
    multi_entity = self.__class__.INDEX_USES_MULTI_ENTITIES

    if multi_entity:
        # Remember what exists now so leftovers can be removed afterwards.
        existing_query = klass.all(keys_only=True).ancestor(self.key())
        previous_index_keys = existing_query.fetch(1000)
        chunk_starts = range(0, len(search_phrases),
                             MAX_ENTITY_SEARCH_PHRASES)
    elif search_phrases:
        chunk_starts = [0]  # Only write one index entity
    else:
        chunk_starts = []

    index_keys = []
    for position, start in enumerate(chunk_starts):
        chunk = search_phrases[start:start + MAX_ENTITY_SEARCH_PHRASES]
        # Index entities are numbered from 1 in their key names.
        written_key = klass.put_index(parent=self, index_num=position + 1,
                                      phrases=chunk)
        index_keys.append(written_key)

    if multi_entity:
        stale_keys = [old_key for old_key in previous_index_keys
                      if old_key not in index_keys]
        db.delete(stale_keys)
526 |
def enqueue_indexing(self, url, only_index=None):
    """Adds an indexing task to the default task queue.

    The task POSTs this entity's key (and, optionally, the property names
    to index) to ``url``.  Nothing is queued when ``url`` is empty.

    Args:
      url: String. The url mapped to the SearchIndexing handler.
      only_index: List of strings. Restricts indexing to these prop names.
    """
    if not url:
        return
    params = {'key': str(self.key())}
    if only_index:
        # Comma-separated, because SearchIndexing.post() splits the value
        # on commas; the previous space-separated form arrived there as a
        # single bogus property name.
        params['only_index'] = ','.join(only_index)
    taskqueue.add(url=url, params=params)
539 |
class SearchIndexing(webapp.RequestHandler):
    """Handler for full text indexing task.

    Expects the POST parameters written by Searchable.enqueue_indexing():
      key: String form of the key of the entity to index.
      only_index: Optional property names that restrict indexing.
    """

    def post(self):
        """Indexes the entity named by the 'key' request parameter."""
        key_str = self.request.get('key')
        if not key_str:
            return
        try:
            key = db.Key(key_str)
        except db.BadKeyError:
            # A malformed key can never succeed; answer 200 so the task is
            # cleared instead of being retried.
            logging.error("Search indexing got a malformed key: '%s'",
                          key_str)
            self.response.set_status(200)
            return
        entity = db.get(key)
        if not entity:
            self.response.set_status(200)  # Clear task because it's a bad key
            return

        only_index_str = self.request.get('only_index')
        if only_index_str:
            # Accept comma- or whitespace-separated names so the value is
            # parsed correctly whichever separator the sender used.
            only_index = [name for name
                          in re.split(r'[,\s]+', only_index_str) if name]
            if only_index:
                # index() has no only_index argument; it reads
                # self.INDEX_ONLY, so the restriction is applied by
                # overriding that attribute on this instance only.
                # The parsed value used to be discarded.
                entity.INDEX_ONLY = only_index
        entity.index()
553 |
554 |
--------------------------------------------------------------------------------
/static/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DocSavage/appengine-search/daf2d12bbbb30d1ee5871c3d3c791625ac2c77c8/static/favicon.ico
--------------------------------------------------------------------------------
/static/robots.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow:
--------------------------------------------------------------------------------
/tests/HOW-TO-TEST:
--------------------------------------------------------------------------------
1 | Testing a Google App Engine app
2 | ===============================
3 |
4 | This app uses the nose testing framework. To get started, make sure
5 | you've installed the necessary components:
6 |
7 | sudo easy_install nose
8 | sudo easy_install NoseGAE
9 |
10 | Download WebTest (http://pythonpaste.org/webtest).
11 | cd into webtest directory then:
12 | sudo python setup.py build
13 | sudo python setup.py install
14 |
15 | If all the components have been downloaded and installed correctly,
16 | you'll be able to run tests from the command line:
17 |
18 | nosetests -v --with-gae --with-doctest
19 |
20 | The above will run doctests embedded in source files as well as
21 | test scripts in the /tests directory.
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | # This setup is necessary to prevent exception from being thrown by users API.
4 | # Eventually, it should be incorporated into NoseGAE plugin.
def setup():
    """Populate the environment variables needed before the tests run.

    Sets AUTH_DOMAIN to 'example.org' and USER_EMAIL to the empty string.
    """
    os.environ.update({
        'AUTH_DOMAIN': 'example.org',
        'USER_EMAIL': '',
    })
8 |
def teardown():
    """Package-level teardown hook; there is nothing to clean up."""
    return None
11 |
12 |
--------------------------------------------------------------------------------
/tests/test_app.py:
--------------------------------------------------------------------------------
1 | from webtest import TestApp
2 | from main import application
3 |
# Wrap the WSGI application imported from main in a WebTest TestApp so the
# tests in this module can issue requests against it.
app = TestApp(application)
5 |
def test_index():
    """The root URL responds with the 'Full Text Search Test' page."""
    rendered = str(app.get('/'))
    assert 'Full Text Search Test' in rendered
9 |
--------------------------------------------------------------------------------
/tests/test_search.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # The MIT License
4 | #
5 | # Copyright (c) 2009 William T. Katz
6 | # Website/Contact: http://www.billkatz.com
7 | #
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to
10 | # deal in the Software without restriction, including without limitation
11 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 | # and/or sell copies of the Software, and to permit persons to whom the
13 | # Software is furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in
16 | # all copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | # DEALINGS IN THE SOFTWARE.
25 |
26 | import re
27 | import os
28 |
# Filler text used as the content of the NoninflectedPage fixture in
# TestLoremIpsum. The trailing "Encrusted." sentence is what
# test_not_inflected searches for ('encrusted' matches, 'encrust' must not).
LOREM_IPSUM = """
Lorem ipsum dolor sit amet, consectetur adipisicing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna
aliqua. Ut enim ad minim veniam, quis nostrud exercitation
ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit
esse cillum dolore eu fugiat nulla pariatur. Excepteur sint
occaecat cupidatat non proident, sunt in culpa qui officia
deserunt mollit anim id est laborum. Encrusted.
"""
39 |
# English passage used as Page content by the stemming tests. It contains the
# inflected forms ('algorithms', 'pythonic', 'ruby', 'encrusted') that
# TestInflection expects to find when searching for related word forms.
INFLECTION_TEST = """
Guido ran up slippery ruby-encrusted monoliths in search of
the serpentine mascot. The pythonic creatures skulked away.
How quickly did they forget their master? Guido was
challenged by the excessively poor storyline in this fictional
tale, but alas, what could he do? He was one of many fixtures
in ornately narrated prose doomed to be read only by
computerized algorithms implementing text processing!
"""
49 |
50 | from google.appengine.ext import db
51 | import search
52 |
53 | from google.appengine.api import apiproxy_stub_map
54 | from google.appengine.api import datastore_file_stub
55 |
def clear_datastore():
    """Replace the API proxy with a new one backed by a fresh datastore stub.

    Can be called between tests to ensure an empty datastore.

    See code.google.com/p/nose-gae/issues/detail?id=16
    Note: the app id passed to DatastoreFileStub should match the app id in
    your app.yaml.
    """
    fresh_proxy = apiproxy_stub_map.APIProxyStubMap()
    apiproxy_stub_map.apiproxy = fresh_proxy
    datastore_stub = datastore_file_stub.DatastoreFileStub(
        'billkatz-test', '/dev/null', '/dev/null')
    fresh_proxy.RegisterStub('datastore_v3', datastore_stub)
65 |
class Page(search.Searchable, db.Model):
    """Searchable test model with author, title and content properties."""
    author_name = db.StringProperty()
    title = db.StringProperty()
    content = db.TextProperty()
    # Presumably tells search.Searchable which property supplies the title
    # stored with the index; TestKeyOnlySearch expects keys_only results to
    # carry the page's title (e.g. 'Third Post') -- confirm against
    # search/__init__.py.
    INDEX_TITLE_FROM_PROP = 'title'
71 |
class NoninflectedPage(search.Searchable, db.Model):
    """Used to test search without stemming, e.g. for precise, non-inflected words"""
    author_name = db.StringProperty()
    content = db.TextProperty()
    # Stemming off: TestLoremIpsum expects 'encrust' NOT to match 'Encrusted'
    # and counts search.LiteralIndex entities for this model.
    INDEX_STEMMING = False
    # Only 'content' is indexed: TestLoremIpsum.test_only_index expects a
    # search for the author's name to return nothing.
    INDEX_ONLY = ['content']
78 |
class TestMisc:
    """Miscellaneous datastore checks."""

    def setup(self):
        """Start every test from an empty datastore."""
        clear_datastore()

    def test_appostrophed_key(self):
        """A key name containing an apostrophe comes back unchanged from put()."""
        wanted_name = "Show Don't Tell"
        page = Page(
            key_name=wanted_name,
            author_name="Pro Author",
            content="You should always show and not tell through dialogue or narration.")
        stored_key = page.put()
        assert str(stored_key.name()) == wanted_name
88 |
class TestLoremIpsum:
    """Searches over two NoninflectedPage entities (no stemming)."""

    def setup(self):
        """Store and index two pages, checking the index count after each."""
        clear_datastore()
        fixtures = [
            ('John Doe', LOREM_IPSUM),
            ('Jon Favreau', 'A director that works well with writers.'),
        ]
        for position, (author, text) in enumerate(fixtures):
            page = NoninflectedPage(author_name=author, content=text)
            page.put()
            page.index()
            # One LiteralIndex entity is expected per indexed page so far.
            assert search.LiteralIndex.all().count() == position + 1

    def teardown(self):
        """Nothing to clean up; setup() resets the datastore."""
        pass

    def test_only_index(self):
        """Author names are not searchable; content words are."""
        # Only 'content' is indexed.
        assert not NoninflectedPage.search('John')
        assert NoninflectedPage.search('lorem ipsum')

    def test_two_word_search(self):
        """A mixed-case two-word query finds exactly the lorem ipsum page."""
        hits = NoninflectedPage.search('LoReM IpSuM')
        assert hits and len(hits) == 1
        text = hits[0].content
        assert (re.search(r'lorem', text, re.IGNORECASE)
                and re.search(r'ipsum', text, re.IGNORECASE))

    def test_key_only_search(self):
        """keys_only=True yields a list of (db.Key, string) pairs."""
        results = NoninflectedPage.search('LoReM ipsum', keys_only=True)
        assert isinstance(results, list) and len(results) == 1
        first = results[0]
        assert isinstance(first[0], db.Key)
        assert isinstance(first[1], basestring)

    def test_search_miss(self):
        """Unknown words find nothing; 'director' finds the other page."""
        assert not NoninflectedPage.search('NowhereInDoc')
        hits = NoninflectedPage.search('director')
        assert hits
        text = hits[0].content
        for word in (r'lorem', r'ipsum'):
            assert not re.search(word, text, re.IGNORECASE)

    def test_not_inflected(self):
        """Without stemming, 'encrust' must not match 'Encrusted'."""
        assert not NoninflectedPage.search('encrust')
        assert NoninflectedPage.search('encrusted')
138 |
class TestInflection:
    """Searches over stemmed Page entities using different word forms."""

    def setup(self):
        """Store and index two pages, checking the index count after each."""
        clear_datastore()
        fixtures = [
            ('John Doe', INFLECTION_TEST),
            ('Jon Favreau', 'A director that works well with writers.'),
        ]
        for position, (author, text) in enumerate(fixtures):
            page = Page(author_name=author, content=text)
            page.put()
            page.index()
            assert search.StemmedIndex.all().count() == position + 1

    def test_inflections(self):
        """Each query word finds a page whose content holds the related form."""
        word_pairs = [
            ('algorithm', 'algorithms'),
            ('python', 'pythonic'),
            ('rubies', 'ruby'),
            ('encrust', 'encrusted'),
        ]
        for query, expected_form in word_pairs:
            hits = Page.search(query)
            assert hits
            assert re.search(expected_form, hits[0].content, re.IGNORECASE)
160 |
class TestBigIndex:
    """Indexing of content large enough to need more than one index entity."""

    def setup(self):
        """Start every test from an empty datastore."""
        clear_datastore()

    def test_multientity_index(self):
        """Large content spreads over several StemmedIndex entities.

        Indexes the first 4 * search.MAX_ENTITY_SEARCH_PHRASES words of
        roget.txt (read from this test's directory) and expects more than one
        StemmedIndex entity. It then re-indexes the same key with the short
        INFLECTION_TEST text and expects the count to drop back to one.
        """
        import codecs

        curdir = os.path.abspath(os.path.dirname(__file__))
        bigtextfile = os.path.join(curdir, 'roget.txt')
        # Close the file even if reading fails. try/finally is used instead
        # of a with-statement because no __future__ import is in view and
        # the project appears to target an older Python 2 runtime.
        bigfile = codecs.open(bigtextfile, 'r', 'utf-8')
        try:
            bigtext = bigfile.read()
        finally:
            bigfile.close()

        words_to_use = 4 * search.MAX_ENTITY_SEARCH_PHRASES
        words = bigtext.split()
        # NOTE: this is set on the Page class and never reset in this test,
        # so it stays in effect for tests that run afterwards.
        Page.INDEX_USES_MULTI_ENTITIES = True
        page = Page(key_name="Foo", content=' '.join(words[0:words_to_use]))
        page.put()
        page.index()
        assert search.StemmedIndex.all().count() > 1

        # Same key, much smaller content: the extra index entities must go.
        page = Page(key_name="Foo", content=INFLECTION_TEST)
        page.put()
        page.index()
        assert search.StemmedIndex.all().count() == 1
182 |
class TestKeyOnlySearch:
    """keys_only searches and the titles they return."""

    def setup(self):
        """Store and index three pages; the first one has no title."""
        clear_datastore()
        untitled = {
            'key_name': 'test1',
            'content': 'This post has no title at all.'
        }
        second = {
            'key_name': 'test2',
            'title': 'Second Post',
            'content': 'This is some text for the second post.'
        }
        third = {
            'key_name': 'test3',
            'title': 'Third Post',
            'content': 'This is some text for the third post. The last post.'
        }
        self.pages = [untitled, second, third]
        for attributes in self.pages:
            page = Page(**attributes)
            page.put()
            page.index()
        assert search.StemmedIndex.all().count() == 3

    def test_default_titling(self):
        """A page without a title is reported as 'Page <key name>'."""
        results = Page.search('no title', keys_only=True)
        assert len(results) == 1
        first = results[0]
        assert first[0].name() == 'test1'
        assert first[1] == 'Page test1'  # Default titling

    def test_title_from_parent(self):
        """The returned title is the page's own title property."""
        results = Page.search('last', keys_only=True)
        assert len(results) == 1
        first = results[0]
        assert first[0].name() == 'test3'
        assert first[1] == 'Third Post'

    def test_title_change(self):
        """After indexed_title_changed(), searches report the new title."""
        matches = Page.search('second post')
        assert len(matches) == 1
        page = matches[0]
        page.title = 'My Great New Title'
        original_key = page.put()
        page.indexed_title_changed()
        # Changing the title must not add or remove index entities.
        assert search.StemmedIndex.all().count() == 3
        results = Page.search('second post', keys_only=True)
        assert len(results) == 1
        found_key, found_title = results[0][0], results[0][1]
        assert found_title == 'My Great New Title'
        assert found_key.id_or_name() == original_key.id_or_name()
228 |
class TestMultiWordSearch:
    """Multi-word queries over three stemmed Page entities."""

    def setup(self):
        """Store and index three pages, checking the index count after each."""
        clear_datastore()
        fixtures = [
            {'key_name': 'doetext', 'author_name': 'John Doe',
             'content': INFLECTION_TEST},
            {'key_name': "statuetext", 'author_name': 'Other Guy',
             'content': """
This is the time for all good python programmers to check,
to test, to go forward and throw junk at the code, and in
so doing, try to find errors.
-- Unheralded inscription at base of Statue of Liberty
"""},
            {'key_name': "statuetext2", 'author_name': 'Another Guy',
             'content': """
I have seen a statue and it declares there should be
liberty in the world.
"""},
        ]
        for position, attributes in enumerate(fixtures):
            page = Page(**attributes)
            page.put()
            page.index()
            assert search.StemmedIndex.all().count() == position + 1

    def test_multiword_search_order(self):
        """Both statue pages match, 'statuetext' first."""
        hits = Page.search('statue of liberty')
        assert len(hits) == 2
        names = [page.key().name() for page in hits]
        print("Returned pages: %s" % names)
        assert names[0] == u'statuetext'
        assert names[1] == u'statuetext2'

    def test_multiword_search_fail(self):
        """A query with words found in no page returns nothing."""
        assert not Page.search('statue of liberty biggy word')

    def test_multiword_search_and(self):
        """Adding 'python' narrows the statue query to a single page."""
        hits = Page.search('statue of liberty python')
        assert len(hits) == 1
        assert hits[0].key().name() == u'statuetext'

    def test_two_word_search(self):
        """'ornately narrated' matches only the INFLECTION_TEST page."""
        hits = Page.search('ornately narrated')
        assert len(hits) == 1
        assert hits[0].key().name() == u'doetext'
276 |
277 |
--------------------------------------------------------------------------------