├── History.md ├── Readme.md ├── benchmark ├── app.js ├── data │ └── example.db └── lib │ └── nstore.js ├── examples ├── app.js ├── data │ └── example.db └── lib │ └── nstore.js ├── lib ├── node-search.js └── node-search │ ├── double-metaphone │ └── index.js │ ├── math │ ├── vector-utils.js │ └── vector.js │ ├── porter-stemmer │ └── index.js │ └── tokenizer │ └── index.js ├── package.json └── spec ├── commands └── example_command.rb ├── node.js └── unit ├── spec.helper.js └── spec.js /History.md: -------------------------------------------------------------------------------- 1 | 2 | 0.0.1 / YYYY-MM-DD 3 | ------------------ 4 | 5 | * Initial release 6 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # A JavaScript full text search engine 2 | 3 | This is an implementation of a "vector space model" with "Porter stemming", "double-metaphones", false boolean searches, field searching and field weighting. Basically it is a full text search engine that has most of the fancy features of the well known search engine libraries like Sphinx, Solr, Lucene, etc. The big difference is that it is written in JavaScript originally for use with Node.js. 
4 | 5 | 6 | ## Example 7 | 8 | var NodeSearch = require('./../lib/node-search').NodeSearch; 9 | var nStore = require('./lib/nstore'); 10 | 11 | // A simple data set to search over, feel free to use any data source, nstore uses JavaScript objects so it's simple 12 | var db = nStore('data/example.db'); 13 | 14 | var search = new NodeSearch(); 15 | search.fieldWeights.title = 2; // Make one/or many of the document fields more important 16 | var stream = db.stream(); 17 | stream.addListener('data', function (doc, meta) { 18 | search.index(meta.key,doc); 19 | }); 20 | 21 | stream.addListener('end', function () { // when the indexing is finished 22 | search.query("meet !poultry", null, function (results) { // search and wait for the results 23 | results.forEach(function(result){ 24 | db.get(result.key, function (err, doc, meta) { 25 | if(err) throw err; 26 | console.log(result.key+" "+doc.title +" "+doc.body +" "+ result.rank); 27 | }); 28 | }); 29 | }); 30 | }); 31 | 32 | 33 | ## This code is based on idea and code from 34 | 35 | http://blog.josephwilk.net/projects/building-a-vector-space-search-engine-in-python.html 36 | http://github.com/maritz/js-double-metaphone/raw/master/double-metaphone.js 37 | http://yeti-witch.googlecode.com/svn/trunk/lib/porter-stemmer.js 38 | http://www.koders.com/javascript/fidACD9DF0C1463CFC127D8C8B767B77122F3FC7331.aspx 39 | http://playnice.ly/blog/2010/05/05/a-fast-fuzzy-full-text-index-using-redis/ 40 | http://users.telenet.be/paul.larmuseau/SVD.htm 41 | http://gist.github.com/389875 42 | http://sylvester.jcoglan.com/api/matrix 43 | http://www.uni-bonn.de/~manfear/matrixcalc.php 44 | http://www.sphinxsearch.com/docs/manual-1.10.html#boolean-syntax 45 | http://stackoverflow.com/questions/90580/word-frequency-algorithm-for-natural-language-processing 46 | http://stackoverflow.com/questions/2699646/how-to-get-logical-parts-of-a-sentence-with-java 47 | 48 | 49 | ## Things that it doesn't currently do but would like to look into: 50 | 51 | 
Phrase based searches, everything is word based, combinations of words are not currently supported 52 | Exact matches, all words are converted to stemmed metaphones so "ponies" is indexed as the sound of "pony". 53 | Date based searches or other meta data with additional logic are not currently supported 54 | tf-idf based ranking, currently using a term count 55 | http://blog.josephwilk.net/projects/latent-semantic-analysis-in-python.html 56 | http://en.wikipedia.org/wiki/Lanczos_method 57 | http://en.wikipedia.org/wiki/Latent_semantic_indexing 58 | http://en.wikipedia.org/wiki/Probabilistic_latent_semantic_analysis 59 | http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation 60 | http://en.wikipedia.org/wiki/Part-of-speech_tagging 61 | 62 | 63 | ## License 64 | 65 | (The MIT License) 66 | 67 | Copyright (c) 2009 Motion & Color <Tyler Larson> 68 | 69 | Permission is hereby granted, free of charge, to any person obtaining 70 | a copy of this software and associated documentation files (the 71 | 'Software'), to deal in the Software without restriction, including 72 | without limitation the rights to use, copy, modify, merge, publish, 73 | distribute, sublicense, and/or sell copies of the Software, and to 74 | permit persons to whom the Software is furnished to do so, subject to 75 | the following conditions: 76 | 77 | The above copyright notice and this permission notice shall be 78 | included in all copies or substantial portions of the Software. 79 | 80 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 81 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 82 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 83 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 84 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 85 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 86 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /benchmark/app.js: -------------------------------------------------------------------------------- 1 | var NodeSearch = require('./../lib/node-search').NodeSearch; 2 | var nStore = require('./lib/nstore'); 3 | 4 | // A simple data set to search over, feel free to use any data source 5 | var db = nStore('data/example.db'); 6 | 7 | // create a sample database to test from 8 | for(var i=10,l=1000;i 2 | // 3 | // MIT licensed 4 | 5 | 6 | var sys = require('sys'), 7 | fs = require('fs'), 8 | Path = require('path'), 9 | Buffer = require('buffer').Buffer; 10 | 11 | // This size only affects performance, it's not a constraint on data sizes 12 | var CHUNK_SIZE = 1024; 13 | // This is the max size of a single serialized document 14 | var MAX_SIZE = 1024 * 1024; 15 | 16 | // Reads from a given file descriptor at a specified position and length 17 | // Handles all OS level chunking for you. 18 | // Callback gets (err, buffer) 19 | function fsRead(fd, position, length, callback) { 20 | var buffer = new Buffer(length), 21 | offset = 0; 22 | 23 | function readChunk() { 24 | fs.read(fd, buffer, offset, length - offset, position, function (err, bytesRead) { 25 | if (err) { callback(err); return; } 26 | 27 | offset += bytesRead; 28 | 29 | if (offset < length) { 30 | readChunk(); 31 | return; 32 | } 33 | callback(null, buffer); 34 | }); 35 | } 36 | readChunk(); 37 | } 38 | 39 | // Writes a buffer to a specified file descriptor at the given offset 40 | // handles chunking for you. 
41 | // Callback gets (err) 42 | function fsWrite(fd, buffer, position, callback) { 43 | var offset = 0, 44 | length = buffer.length; 45 | 46 | function writeChunk() { 47 | fs.write(fd, buffer, offset, length - offset, position, function (err, bytesWritten) { 48 | if (err) { callback(err); return; } 49 | offset += bytesWritten; 50 | if (offset < length) { 51 | writeChunk(); 52 | return; 53 | } 54 | callback(); 55 | }); 56 | } 57 | writeChunk(); 58 | } 59 | 60 | 61 | function nStore(filename, filterFn, isTemp) { 62 | var fd, // FD for reading and writing to the file 63 | index = {}, // Index of file positions of all documents by key 64 | writeQueue = [], // Queue of new docs to write to the hd 65 | stale = 0, 66 | dbLength = 0, // The size of the current db file in bytes 67 | compacting = false, 68 | lastCompact = Date.now(); 69 | 70 | // Open a single handle for reading and writing 71 | fd = fs.openSync(filename, "a+"); 72 | 73 | // Generates a random unique 16 char base 36 string 74 | // (about 2^83 possible keys) 75 | function makeUUID() { 76 | var key = ""; 77 | while (key.length < 16) { 78 | key += Math.floor(Math.random() * 0x290d7410000).toString(36); 79 | } 80 | key = key.substr(0, 16); 81 | if (key in index) { 82 | return makeUUID(); 83 | } 84 | return key; 85 | } 86 | 87 | // Load a single record from the disk 88 | function getByKey(key, callback) { 89 | try { 90 | var info = index[key]; 91 | if (!info) { 92 | var error = new Error("Document does not exist for " + key); 93 | error.errno = process.ENOENT; 94 | callback(error); 95 | return; 96 | } 97 | 98 | fsRead(fd, info.position, info.length, function (err, buffer) { 99 | if (err) { callback(err); return; } 100 | try { 101 | var data = JSON.parse(buffer.toString()); 102 | callback(null, data, info.meta); 103 | } catch (err) { 104 | callback(err); 105 | } 106 | }); 107 | } catch (err) { 108 | callback(err); 109 | } 110 | } 111 | 112 | 113 | 114 | function compact() { 115 | // Don't run if already clean or 
already compacting 116 | if (isTemp || compacting || stale === 0) { return; } 117 | compacting = true; 118 | var tmpFile = Path.join(Path.dirname(filename), makeUUID() + ".tmpdb"), 119 | tmpDb = nStore(tmpFile, null, true), 120 | keys = Object.keys(index), 121 | counter = keys.length; 122 | 123 | keys.forEach(function (key) { 124 | getByKey(key, function (err, doc, meta) { 125 | if (err) { throw err; } 126 | 127 | function check() { 128 | counter--; 129 | if (counter === 0) { 130 | done(); 131 | } 132 | } 133 | 134 | // Hook to allow filtering when compacting 135 | // Great for things like session pruning 136 | if (filterFn && !filterFn(doc, meta)) { 137 | check(); 138 | return; 139 | } 140 | 141 | tmpDb.save(key, doc, function (err, meta) { 142 | if (err) { 143 | throw err; 144 | } 145 | check(); 146 | }); 147 | }); 148 | }); 149 | stale = 0; 150 | 151 | function done() { 152 | 153 | // Swap out stores 154 | var oldfd = fd; 155 | fd = tmpDb.fd; 156 | dbLength = tmpDb.dbLength; 157 | index = tmpDb.index; 158 | 159 | // And clean up the files 160 | fs.close(oldfd, function (err) { 161 | if (err) throw err; 162 | fs.unlink(filename, function (err) { 163 | if (err) throw err; 164 | fs.rename(tmpFile, filename, function (err) { 165 | if (err) throw err; 166 | compacting = false; 167 | lastCompact = Date.now(); 168 | checkQueue(); 169 | }); 170 | }); 171 | }); 172 | } 173 | 174 | } 175 | 176 | // Loads the database from disk using blocking I/O 177 | // TODO: see if non-blocking is faster, this takes a long time 178 | function loadDatabase() { 179 | 180 | // Create a buffer for reading chunks from the disk 181 | var chunk = new Buffer(CHUNK_SIZE); 182 | 183 | // Create an empty stream buffer 184 | var input = new Buffer(MAX_SIZE); 185 | input.length = 0; 186 | 187 | // These are positions in the database file 188 | var offset = 0; 189 | var base = 0; 190 | 191 | // This is a position within the input stream 192 | var pos = 0; 193 | var mid = 0; 194 | 195 | // Read a 
chunk from the file into `chunk` 196 | while ((chunk.length = fs.readSync(fd, chunk, 0, CHUNK_SIZE, offset)) > 0) { 197 | 198 | // Move the offset so the outer loop stays in sync 199 | offset += chunk.length; 200 | 201 | // Copy the chunk onto the input stream 202 | chunk.copy(input, input.length, 0, chunk.length); 203 | input.length += chunk.length; 204 | 205 | // See if there is input to consume 206 | for (var i = pos, l = input.length; i < l; i++) { 207 | if (input[i] === 9) { 208 | mid = i + 1; 209 | } 210 | if (mid && input[i] === 10) { 211 | // var doc = input.slice(pos, mid - 1).toString(); 212 | var meta = JSON.parse(input.slice(mid, i).toString()); 213 | var info = { 214 | meta: meta, 215 | position: base + pos, 216 | length: mid - pos - 1 217 | }; 218 | if (index[meta.key]) { 219 | stale++; 220 | } 221 | if (info.length > 0) { 222 | index[meta.key] = info; 223 | } else { 224 | delete index[meta.key]; 225 | } 226 | mid = 0; 227 | pos = i + 1; 228 | } 229 | } 230 | 231 | // Shift the input back down 232 | if (pos > 0) { 233 | input.copy(input, 0, pos, input.length); 234 | input.length -= pos; 235 | base += pos; 236 | pos = 0; 237 | } 238 | } 239 | 240 | dbLength = offset; 241 | 242 | } 243 | loadDatabase(); 244 | compact(); 245 | 246 | var lock = false; 247 | function checkQueue() { 248 | if (compacting || lock || writeQueue.length === 0) { return; } 249 | lock = true; 250 | 251 | // Pull some jobs off the writeQueue 252 | var length = writeQueue.length; 253 | var i = 0; 254 | var size = 0; 255 | var toWrite = []; 256 | var newIndex = {}; 257 | var position = dbLength; 258 | while (i < length && size < 50000) { 259 | var item = writeQueue[i]; 260 | var data = item.doc ? 
JSON.stringify(item.doc) : ""; 261 | var key = item.key; 262 | var meta = {key: key}; 263 | var line = new Buffer(data + "\t" + JSON.stringify(meta) + "\n"); 264 | var dataLength = Buffer.byteLength(data); 265 | // Generate a callback closure 266 | toWrite[toWrite.length] = { 267 | line: line, 268 | key: key, 269 | callback: item.callback 270 | }; 271 | newIndex[meta.key] = { 272 | position: dbLength, 273 | length: dataLength, 274 | meta: meta 275 | }; 276 | 277 | dbLength += line.length; 278 | size += line.length; 279 | i++; 280 | } 281 | length = i; 282 | writeQueue.splice(0, length); 283 | 284 | // Merge the buffers into one large one 285 | var offset = 0; 286 | var buffer = new Buffer(size); 287 | for (var i = 0; i < length; i++) { 288 | var line = toWrite[i].line; 289 | line.copy(buffer, offset); 290 | offset += line.length; 291 | } 292 | 293 | fsWrite(fd, buffer, position, function (err) { 294 | if (err) { 295 | throw err; 296 | } 297 | 298 | // Mix in the updated indexes 299 | var willCompact = false; 300 | var threshold = Object.keys(index).length; 301 | Object.keys(newIndex).forEach(function (key) { 302 | if (index[key]) { 303 | stale++; 304 | if (stale > threshold) { 305 | willCompact = true; 306 | } 307 | } 308 | 309 | if (newIndex[key].length === 0) { 310 | delete index[key]; 311 | } else { 312 | index[key] = newIndex[key]; 313 | } 314 | }); 315 | 316 | // Call all the individual callbacks for the write 317 | for (var i = 0; i < length; i++) { 318 | var item = toWrite[i]; 319 | var callback = item.callback; 320 | if (callback) { 321 | callback(err, {key: item.key}); 322 | } 323 | 324 | } 325 | 326 | // Unlock and try the loop again 327 | lock = false; 328 | if (willCompact && (Date.now() - lastCompact > 2000)) { 329 | compact(); 330 | } else { 331 | process.nextTick(checkQueue); 332 | } 333 | }); 334 | 335 | } 336 | 337 | function getStream(filter) { 338 | var counter = 0; 339 | var stream = new process.EventEmitter(); 340 | var queue = []; 341 | var 
paused = false; 342 | 343 | // Checks to see if we should emit the "end" event yet. 344 | function checkDone() { 345 | if (!paused && counter === 0) { 346 | counter--; 347 | stream.emit("end"); 348 | } 349 | } 350 | 351 | // Tries to push events through 352 | function flush() { 353 | if (paused) { return; } 354 | for (var i = 0, l = queue.length; i < l; i++) { 355 | var item = queue[i]; 356 | stream.emit("data", item.doc, item.meta); 357 | counter--; 358 | } 359 | queue.length = 0; 360 | process.nextTick(checkDone); 361 | } 362 | 363 | 364 | stream.pause = function () { 365 | paused = true; 366 | }; 367 | 368 | // Resumes emitting of events 369 | stream.resume = function () { 370 | paused = false; 371 | process.nextTick(function () { 372 | flush(); 373 | checkDone(); 374 | }); 375 | }; 376 | 377 | Object.keys(index).forEach(function (key) { 378 | counter++; 379 | getByKey(key, function (err, doc, meta) { 380 | if (err) { 381 | stream.emit("error", err); 382 | return; 383 | } 384 | if (!filter || filter(doc, meta)) { 385 | queue.push({ 386 | doc: doc, 387 | meta: meta 388 | }); 389 | flush(); 390 | } else { 391 | counter--; 392 | process.nextTick(checkDone); 393 | } 394 | }); 395 | }); 396 | 397 | process.nextTick(checkDone); 398 | 399 | return stream; 400 | } 401 | 402 | 403 | return { 404 | get length() { 405 | return Object.keys(index).length; 406 | }, 407 | 408 | // Saves a document with optional key. The effect if immediate to the 409 | // running program, but not persistent till after the callback. 410 | // Pass null as the key to get a generated key. 411 | save: function (key, doc, callback) { 412 | if (!key) { 413 | key = makeUUID(); 414 | } 415 | writeQueue[writeQueue.length] = { 416 | key: key, 417 | doc: doc, 418 | callback: callback 419 | }; 420 | checkQueue(); 421 | }, 422 | 423 | // Removes a document from the collection by key 424 | // The effect is immediate to the running program, but not permanent 425 | // till the callback returns. 
426 | remove: function (key, callback) { 427 | if (key in index) { 428 | delete index[key]; 429 | var line = new Buffer("\t" + JSON.stringify({key: key}) + "\n"); 430 | 431 | writeQueue[writeQueue.length] = { 432 | meta: {key: key}, 433 | position: dbLength, 434 | length: 0, 435 | line: line, 436 | callback: callback 437 | }; 438 | dbLength += line.length; 439 | checkQueue(); 440 | } else { 441 | var err = new Error("Cannot delete a document that does not exist"); 442 | err.errno = process.ENOENT; 443 | callback(err); 444 | } 445 | }, 446 | 447 | all: function (filter, callback) { 448 | if (typeof filter === 'function' && callback === undefined) { 449 | callback = filter; 450 | filter = null; 451 | } 452 | var docs = []; 453 | var metas = []; 454 | var stream = getStream(filter); 455 | stream.addListener('data', function (doc, meta) { 456 | docs.push(doc); 457 | metas.push(meta); 458 | }); 459 | stream.addListener('end', function () { 460 | callback(null, docs, metas); 461 | }); 462 | stream.addListener('error', callback); 463 | }, 464 | 465 | // Returns a readable stream of the whole collection. 466 | // Supports pause and resume so that you can delay events for layer. 
467 | // This queues "data" and "end" events in memory./ 468 | // Also you can provide a filter to pre-filter results before they 469 | // go to the queue 470 | stream: getStream, 471 | 472 | // Loads a single document by id, accepts key and callback 473 | // the callback will be called with (err, doc, meta) 474 | get: getByKey, 475 | 476 | 477 | // Removes all documents from a database 478 | clear: function () { 479 | index = {}; 480 | compact(); 481 | }, 482 | 483 | compact: compact, 484 | 485 | // Expose some private variables 486 | get index() { return index; }, 487 | get fd() { return fd; }, 488 | get dbLength() { return dbLength; }, 489 | 490 | // Expose the UUID maker 491 | makeUUID: makeUUID 492 | }; 493 | } 494 | 495 | module.exports = nStore; 496 | 497 | -------------------------------------------------------------------------------- /examples/app.js: -------------------------------------------------------------------------------- 1 | var NodeSearch = require('./../lib/node-search').NodeSearch; 2 | var nStore = require('./lib/nstore'); 3 | 4 | // A simple data set to search over, feel free to use any data source, nstore uses JavaScript objects so it's simple 5 | var db = nStore('data/example.db'); 6 | 7 | var search = new NodeSearch(); 8 | search.fieldWeights.title = 2; // Make one/or many of the document fields more important 9 | var stream = db.stream(); 10 | stream.addListener('data', function (doc, meta) { 11 | search.index(meta.key,doc); 12 | }); 13 | 14 | stream.addListener('end', function () { 15 | search.query("meet !poultry", null, function (results) { 16 | results.forEach(function(result){ 17 | db.get(result.key, function (err, doc, meta) { 18 | if(err) throw err; 19 | console.log(result.key+" "+doc.title +" "+doc.body +" "+ result.rank); 20 | }); 21 | }); 22 | }); 23 | }); -------------------------------------------------------------------------------- /examples/data/example.db: 
-------------------------------------------------------------------------------- 1 | {"title":"Full Text Search for Node.js","body":"Tyler Larson has created a full text search engine for Node.js"} {"key":"1"} 2 | {"title":"Daily Bulletin","body":"Humane society challenges meat industry over new law"} {"key":"2"} 3 | {"title":"Wilson County News","body":"Unprecedented meeting on COOL held in Kansas City"} {"key":"3"} 4 | {"title":"Meat & Poultry","body":"Industry reflects on USDA under Bush"} {"key":"4"} 5 | {"title":"Other County News","body":"The kids in the city are going crazy"} {"key":"5"} 6 | {"title":"Drovers","body":"Beefing up Safety"} {"key":"6"} 7 | {"title":"Supermarket News","body":"Humane Society opposes meat industry challenge"} {"key":"7"} 8 | {"title":"Daily Bulletin","body":"motion against non-ambulatory ban"} {"key":"8"} 9 | {"title":"Press-Enterprise","body":"California downer law lawsuit"} {"key":"9"} 10 | {"title":"Meeting place","body":"COOL funding"} {"key":"10"} 11 | -------------------------------------------------------------------------------- /examples/lib/nstore.js: -------------------------------------------------------------------------------- 1 | // Copyright 2010 Tim Caswell 2 | // 3 | // MIT licensed 4 | 5 | 6 | var sys = require('sys'), 7 | fs = require('fs'), 8 | Path = require('path'), 9 | Buffer = require('buffer').Buffer; 10 | 11 | // This size only affects performance, it's not a constraint on data sizes 12 | var CHUNK_SIZE = 1024; 13 | // This is the max size of a single serialized document 14 | var MAX_SIZE = 1024 * 1024; 15 | 16 | // Reads from a given file descriptor at a specified position and length 17 | // Handles all OS level chunking for you. 
18 | // Callback gets (err, buffer) 19 | function fsRead(fd, position, length, callback) { 20 | var buffer = new Buffer(length), 21 | offset = 0; 22 | 23 | function readChunk() { 24 | fs.read(fd, buffer, offset, length - offset, position, function (err, bytesRead) { 25 | if (err) { callback(err); return; } 26 | 27 | offset += bytesRead; 28 | 29 | if (offset < length) { 30 | readChunk(); 31 | return; 32 | } 33 | callback(null, buffer); 34 | }); 35 | } 36 | readChunk(); 37 | } 38 | 39 | // Writes a buffer to a specified file descriptor at the given offset 40 | // handles chunking for you. 41 | // Callback gets (err) 42 | function fsWrite(fd, buffer, position, callback) { 43 | var offset = 0, 44 | length = buffer.length; 45 | 46 | function writeChunk() { 47 | fs.write(fd, buffer, offset, length - offset, position, function (err, bytesWritten) { 48 | if (err) { callback(err); return; } 49 | offset += bytesWritten; 50 | if (offset < length) { 51 | writeChunk(); 52 | return; 53 | } 54 | callback(); 55 | }); 56 | } 57 | writeChunk(); 58 | } 59 | 60 | 61 | function nStore(filename, filterFn, isTemp) { 62 | var fd, // FD for reading and writing to the file 63 | index = {}, // Index of file positions of all documents by key 64 | writeQueue = [], // Queue of new docs to write to the hd 65 | stale = 0, 66 | dbLength = 0, // The size of the current db file in bytes 67 | compacting = false, 68 | lastCompact = Date.now(); 69 | 70 | // Open a single handle for reading and writing 71 | fd = fs.openSync(filename, "a+"); 72 | 73 | // Generates a random unique 16 char base 36 string 74 | // (about 2^83 possible keys) 75 | function makeUUID() { 76 | var key = ""; 77 | while (key.length < 16) { 78 | key += Math.floor(Math.random() * 0x290d7410000).toString(36); 79 | } 80 | key = key.substr(0, 16); 81 | if (key in index) { 82 | return makeUUID(); 83 | } 84 | return key; 85 | } 86 | 87 | // Load a single record from the disk 88 | function getByKey(key, callback) { 89 | try { 90 | var info 
= index[key]; 91 | if (!info) { 92 | var error = new Error("Document does not exist for " + key); 93 | error.errno = process.ENOENT; 94 | callback(error); 95 | return; 96 | } 97 | 98 | fsRead(fd, info.position, info.length, function (err, buffer) { 99 | if (err) { callback(err); return; } 100 | try { 101 | var data = JSON.parse(buffer.toString()); 102 | callback(null, data, info.meta); 103 | } catch (err) { 104 | callback(err); 105 | } 106 | }); 107 | } catch (err) { 108 | callback(err); 109 | } 110 | } 111 | 112 | 113 | 114 | function compact() { 115 | // Don't run if already clean or already compacting 116 | if (isTemp || compacting || stale === 0) { return; } 117 | compacting = true; 118 | var tmpFile = Path.join(Path.dirname(filename), makeUUID() + ".tmpdb"), 119 | tmpDb = nStore(tmpFile, null, true), 120 | keys = Object.keys(index), 121 | counter = keys.length; 122 | 123 | keys.forEach(function (key) { 124 | getByKey(key, function (err, doc, meta) { 125 | if (err) { throw err; } 126 | 127 | function check() { 128 | counter--; 129 | if (counter === 0) { 130 | done(); 131 | } 132 | } 133 | 134 | // Hook to allow filtering when compacting 135 | // Great for things like session pruning 136 | if (filterFn && !filterFn(doc, meta)) { 137 | check(); 138 | return; 139 | } 140 | 141 | tmpDb.save(key, doc, function (err, meta) { 142 | if (err) { 143 | throw err; 144 | } 145 | check(); 146 | }); 147 | }); 148 | }); 149 | stale = 0; 150 | 151 | function done() { 152 | 153 | // Swap out stores 154 | var oldfd = fd; 155 | fd = tmpDb.fd; 156 | dbLength = tmpDb.dbLength; 157 | index = tmpDb.index; 158 | 159 | // And clean up the files 160 | fs.close(oldfd, function (err) { 161 | if (err) throw err; 162 | fs.unlink(filename, function (err) { 163 | if (err) throw err; 164 | fs.rename(tmpFile, filename, function (err) { 165 | if (err) throw err; 166 | compacting = false; 167 | lastCompact = Date.now(); 168 | checkQueue(); 169 | }); 170 | }); 171 | }); 172 | } 173 | 174 | } 175 | 
176 | // Loads the database from disk using blocking I/O 177 | // TODO: see if non-blocking is faster, this takes a long time 178 | function loadDatabase() { 179 | 180 | // Create a buffer for reading chunks from the disk 181 | var chunk = new Buffer(CHUNK_SIZE); 182 | 183 | // Create an empty stream buffer 184 | var input = new Buffer(MAX_SIZE); 185 | input.length = 0; 186 | 187 | // These are positions in the database file 188 | var offset = 0; 189 | var base = 0; 190 | 191 | // This is a position within the input stream 192 | var pos = 0; 193 | var mid = 0; 194 | 195 | // Read a chunk from the file into `chunk` 196 | while ((chunk.length = fs.readSync(fd, chunk, 0, CHUNK_SIZE, offset)) > 0) { 197 | 198 | // Move the offset so the outer loop stays in sync 199 | offset += chunk.length; 200 | 201 | // Copy the chunk onto the input stream 202 | chunk.copy(input, input.length, 0, chunk.length); 203 | input.length += chunk.length; 204 | 205 | // See if there is input to consume 206 | for (var i = pos, l = input.length; i < l; i++) { 207 | if (input[i] === 9) { 208 | mid = i + 1; 209 | } 210 | if (mid && input[i] === 10) { 211 | // var doc = input.slice(pos, mid - 1).toString(); 212 | var meta = JSON.parse(input.slice(mid, i).toString()); 213 | var info = { 214 | meta: meta, 215 | position: base + pos, 216 | length: mid - pos - 1 217 | }; 218 | if (index[meta.key]) { 219 | stale++; 220 | } 221 | if (info.length > 0) { 222 | index[meta.key] = info; 223 | } else { 224 | delete index[meta.key]; 225 | } 226 | mid = 0; 227 | pos = i + 1; 228 | } 229 | } 230 | 231 | // Shift the input back down 232 | if (pos > 0) { 233 | input.copy(input, 0, pos, input.length); 234 | input.length -= pos; 235 | base += pos; 236 | pos = 0; 237 | } 238 | } 239 | 240 | dbLength = offset; 241 | 242 | } 243 | loadDatabase(); 244 | compact(); 245 | 246 | var lock = false; 247 | function checkQueue() { 248 | if (compacting || lock || writeQueue.length === 0) { return; } 249 | lock = true; 250 | 251 
| // Pull some jobs off the writeQueue 252 | var length = writeQueue.length; 253 | var i = 0; 254 | var size = 0; 255 | var toWrite = []; 256 | var newIndex = {}; 257 | var position = dbLength; 258 | while (i < length && size < 50000) { 259 | var item = writeQueue[i]; 260 | var data = item.doc ? JSON.stringify(item.doc) : ""; 261 | var key = item.key; 262 | var meta = {key: key}; 263 | var line = new Buffer(data + "\t" + JSON.stringify(meta) + "\n"); 264 | var dataLength = Buffer.byteLength(data); 265 | // Generate a callback closure 266 | toWrite[toWrite.length] = { 267 | line: line, 268 | key: key, 269 | callback: item.callback 270 | }; 271 | newIndex[meta.key] = { 272 | position: dbLength, 273 | length: dataLength, 274 | meta: meta 275 | }; 276 | 277 | dbLength += line.length; 278 | size += line.length; 279 | i++; 280 | } 281 | length = i; 282 | writeQueue.splice(0, length); 283 | 284 | // Merge the buffers into one large one 285 | var offset = 0; 286 | var buffer = new Buffer(size); 287 | for (var i = 0; i < length; i++) { 288 | var line = toWrite[i].line; 289 | line.copy(buffer, offset); 290 | offset += line.length; 291 | } 292 | 293 | fsWrite(fd, buffer, position, function (err) { 294 | if (err) { 295 | throw err; 296 | } 297 | 298 | // Mix in the updated indexes 299 | var willCompact = false; 300 | var threshold = Object.keys(index).length; 301 | Object.keys(newIndex).forEach(function (key) { 302 | if (index[key]) { 303 | stale++; 304 | if (stale > threshold) { 305 | willCompact = true; 306 | } 307 | } 308 | 309 | if (newIndex[key].length === 0) { 310 | delete index[key]; 311 | } else { 312 | index[key] = newIndex[key]; 313 | } 314 | }); 315 | 316 | // Call all the individual callbacks for the write 317 | for (var i = 0; i < length; i++) { 318 | var item = toWrite[i]; 319 | var callback = item.callback; 320 | if (callback) { 321 | callback(err, {key: item.key}); 322 | } 323 | 324 | } 325 | 326 | // Unlock and try the loop again 327 | lock = false; 328 | if 
(willCompact && (Date.now() - lastCompact > 2000)) { 329 | compact(); 330 | } else { 331 | process.nextTick(checkQueue); 332 | } 333 | }); 334 | 335 | } 336 | 337 | function getStream(filter) { 338 | var counter = 0; 339 | var stream = new process.EventEmitter(); 340 | var queue = []; 341 | var paused = false; 342 | 343 | // Checks to see if we should emit the "end" event yet. 344 | function checkDone() { 345 | if (!paused && counter === 0) { 346 | counter--; 347 | stream.emit("end"); 348 | } 349 | } 350 | 351 | // Tries to push events through 352 | function flush() { 353 | if (paused) { return; } 354 | for (var i = 0, l = queue.length; i < l; i++) { 355 | var item = queue[i]; 356 | stream.emit("data", item.doc, item.meta); 357 | counter--; 358 | } 359 | queue.length = 0; 360 | process.nextTick(checkDone); 361 | } 362 | 363 | 364 | stream.pause = function () { 365 | paused = true; 366 | }; 367 | 368 | // Resumes emitting of events 369 | stream.resume = function () { 370 | paused = false; 371 | process.nextTick(function () { 372 | flush(); 373 | checkDone(); 374 | }); 375 | }; 376 | 377 | Object.keys(index).forEach(function (key) { 378 | counter++; 379 | getByKey(key, function (err, doc, meta) { 380 | if (err) { 381 | stream.emit("error", err); 382 | return; 383 | } 384 | if (!filter || filter(doc, meta)) { 385 | queue.push({ 386 | doc: doc, 387 | meta: meta 388 | }); 389 | flush(); 390 | } else { 391 | counter--; 392 | process.nextTick(checkDone); 393 | } 394 | }); 395 | }); 396 | 397 | process.nextTick(checkDone); 398 | 399 | return stream; 400 | } 401 | 402 | 403 | return { 404 | get length() { 405 | return Object.keys(index).length; 406 | }, 407 | 408 | // Saves a document with optional key. The effect if immediate to the 409 | // running program, but not persistent till after the callback. 410 | // Pass null as the key to get a generated key. 
411 | save: function (key, doc, callback) { 412 | if (!key) { 413 | key = makeUUID(); 414 | } 415 | writeQueue[writeQueue.length] = { 416 | key: key, 417 | doc: doc, 418 | callback: callback 419 | }; 420 | checkQueue(); 421 | }, 422 | 423 | // Removes a document from the collection by key 424 | // The effect is immediate to the running program, but not permanent 425 | // till the callback returns. 426 | remove: function (key, callback) { 427 | if (key in index) { 428 | delete index[key]; 429 | var line = new Buffer("\t" + JSON.stringify({key: key}) + "\n"); 430 | 431 | writeQueue[writeQueue.length] = { 432 | meta: {key: key}, 433 | position: dbLength, 434 | length: 0, 435 | line: line, 436 | callback: callback 437 | }; 438 | dbLength += line.length; 439 | checkQueue(); 440 | } else { 441 | var err = new Error("Cannot delete a document that does not exist"); 442 | err.errno = process.ENOENT; 443 | callback(err); 444 | } 445 | }, 446 | 447 | all: function (filter, callback) { 448 | if (typeof filter === 'function' && callback === undefined) { 449 | callback = filter; 450 | filter = null; 451 | } 452 | var docs = []; 453 | var metas = []; 454 | var stream = getStream(filter); 455 | stream.addListener('data', function (doc, meta) { 456 | docs.push(doc); 457 | metas.push(meta); 458 | }); 459 | stream.addListener('end', function () { 460 | callback(null, docs, metas); 461 | }); 462 | stream.addListener('error', callback); 463 | }, 464 | 465 | // Returns a readable stream of the whole collection. 466 | // Supports pause and resume so that you can delay events for layer. 
467 | // This queues "data" and "end" events in memory./ 468 | // Also you can provide a filter to pre-filter results before they 469 | // go to the queue 470 | stream: getStream, 471 | 472 | // Loads a single document by id, accepts key and callback 473 | // the callback will be called with (err, doc, meta) 474 | get: getByKey, 475 | 476 | 477 | // Removes all documents from a database 478 | clear: function () { 479 | index = {}; 480 | compact(); 481 | }, 482 | 483 | compact: compact, 484 | 485 | // Expose some private variables 486 | get index() { return index; }, 487 | get fd() { return fd; }, 488 | get dbLength() { return dbLength; }, 489 | 490 | // Expose the UUID maker 491 | makeUUID: makeUUID 492 | }; 493 | } 494 | 495 | module.exports = nStore; 496 | 497 | -------------------------------------------------------------------------------- /lib/node-search.js: -------------------------------------------------------------------------------- 1 | var PorterStemmer = require('./node-search/porter-stemmer').PorterStemmer, 2 | Tokenizer = require('./node-search/tokenizer').Tokenizer, 3 | DoubleMetaphone = require('./node-search/double-metaphone').DoubleMetaphone, 4 | Vector = require('./node-search/math/vector').Vector; 5 | VectorUtils = require('./node-search/math/vector-utils').VectorUtils; 6 | 7 | 8 | exports.NodeSearch = function() { 9 | return { 10 | docs:{all:[]}, 11 | fields:[], 12 | fieldWeights:{}, 13 | vectorKeywordIndex:{}, 14 | vectorKeywordIndexLength:0, 15 | index: function(key,doc,callback){ 16 | var self = this; 17 | var addWords = []; 18 | var uniques = findUniques(doc); 19 | uniques.forEach(function(word){ 20 | if( !self.vectorKeywordIndex.hasOwnProperty(word) ){ 21 | self.vectorKeywordIndex[word] = self.vectorKeywordIndexLength; 22 | self.vectorKeywordIndexLength++; 23 | addWords.push(word); 24 | } 25 | }); 26 | 27 | // add zeros to end of other vectors, maybe this should be done another way 28 | if( addWords.length != 0 ) { 29 | 
Object.keys(self.docs).forEach(function(column){ 30 | self.docs[column].forEach(function(item){ 31 | addWords.forEach(function(word){ 32 | item.data.push(0); 33 | }); 34 | }); 35 | }); 36 | } 37 | 38 | if( typeof(doc) == "object" ) { 39 | var fieldsData = ""; 40 | for( var field in doc ){ 41 | if(self.fields.length==0){ 42 | self.fields.push(field); 43 | } 44 | fieldsData += doc[field] + " "; 45 | var vector = makeVector(key, doc[field],self.fieldWeights[field]||1,self.vectorKeywordIndex,self.vectorKeywordIndexLength); 46 | if( this.docs.hasOwnProperty(field) ){ 47 | this.docs[field].push(vector); 48 | }else{ 49 | this.docs[field] = [vector]; 50 | } 51 | } 52 | this.docs.all.push( makeVector(key,fieldsData,1,self.vectorKeywordIndex,self.vectorKeywordIndexLength)); 53 | }else{ 54 | this.docs.all.push( makeVector(key,doc,1,self.vectorKeywordIndex,self.vectorKeywordIndexLength)); 55 | } 56 | }, 57 | 58 | // Query the index, returns an array of documents with id and rank 59 | query: function(string,fields,callback){ 60 | 61 | var docs, vector; 62 | var words = string.split(" "); 63 | var count = 0; 64 | var total = 0; 65 | var completed = 0; 66 | var falseMatches = []; 67 | var results=[]; 68 | 69 | for( var word in words){ 70 | if(words[word].charAt(0)=="!"||words[word].charAt(0)=="-"){ 71 | var stemmed = stemmer.process(words[word].split("!").join("").split("-").join("")); 72 | var index = this.vectorKeywordIndex[DoubleMetaphone(stemmed).primary]; 73 | falseMatches.push(index); 74 | words.splice(count,1); 75 | } 76 | count++ 77 | } 78 | vector = makeVector("",words.join(" "),1,this.vectorKeywordIndex,this.vectorKeywordIndexLength); 79 | 80 | for(var i = 0; i < vector.data.length; i++){ 81 | total += vector.data[i]; 82 | } 83 | if( total == 0 ){ 84 | return []; 85 | } 86 | 87 | if( fields != null){ 88 | var fieldsName = fields.sort().join("-"); 89 | if( !this.docs.hasOwnProperty(fieldsName) ) { 90 | // This will be really slow the first time. 
91 | // this.docs[fieldsName] = indexFields(fields,this.docs,function(docs){ 92 | // TODO: need to work on this, can't currently search more than one field at a time 93 | //}); 94 | return 95 | }else{ 96 | docs = this.docs[fieldsName]; 97 | } 98 | }else{ 99 | docs = this.docs.all; 100 | } 101 | 102 | return asyncForEach( docs, 103 | function(doc,i,list){ 104 | for( var falseMatch in falseMatches ){ // strip documents that have falsematches 105 | if(doc.data[falseMatches[falseMatch]]!=0){ 106 | return 107 | } 108 | } 109 | var result = cosine(vector, doc); // figure out how close your querie vector is to the other docs 110 | if( result != 0 ){ // filter out items that dont match at all 111 | results.push({key:doc.key, rank:result}); 112 | } 113 | }, 114 | function(){ // TODO: what if there are millions of results, this sort will be slow. 115 | callback(results.sort(function (a, b) { return ((b.rank - a.rank)) })); 116 | } 117 | ); 118 | }, 119 | 120 | related: function(key,callback){ 121 | var docs; 122 | var results=[]; 123 | 124 | return asyncForEach( this.docs.all, 125 | function(item,i,list){ 126 | if(item.key == key ){ 127 | asyncForEach( this.docs.all, 128 | function(doc,i,list){ 129 | var result = cosine(item, doc); // figure out how close your querie vector is to the other docs 130 | if( result != 0 ){ // filter out items that dont match at all 131 | results.push({key:doc.key, rank:result}); 132 | } 133 | }, 134 | function(){ // TODO: what if there are millions of results, this sort will be slow. 
135 | callback(results.sort(function (a, b) { return ((b.rank - a.rank)) })); 136 | } 137 | ); 138 | return true; 139 | } 140 | }, 141 | function(){} 142 | ); 143 | } 144 | } 145 | } 146 | 147 | // Private /////////////////////////////////////////////////////////////////////////////////////////// 148 | 149 | // Stemming is a way to convert words like speeder and speeds to speed 150 | var stemmer = exports.stemmer = PorterStemmer(); 151 | 152 | // break string up into tokens and stem words 153 | var tokenizer = exports.tokenizer = Tokenizer(); 154 | 155 | // Words that will not be indexed 156 | var stopWords = exports.stopWords = ["","a","about","above","above","across","after","afterwards","again","against","all","almost","alone","along","already","also","although","always","am","among","amongst","amoungst","amount","an","and","another","any","anyhow","anyone","anything","anyway","anywhere","are","around","as","at","back","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","below","beside","besides","between","beyond","bill","both","bottom","but","by","call","can","cannot","cant","co","con","could","couldnt","cry","de","describe","detail","do","done","down","due","during","each","eg","eight","either","eleven","else","elsewhere","empty","enough","etc","even","ever","every","everyone","everything","everywhere","except","few","fifteen","fify","fill","find","fire","first","five","for","former","formerly","forty","found","four","from","front","full","further","get","give","go","had","has","hasnt","have","he","hence","her","here","hereafter","hereby","herein","hereupon","hers","herself","him","himself","his","how","however","hundred","ie","if","in","inc","indeed","interest","into","is","it","its","itself","keep","last","latter","latterly","least","less","ltd","made","many","may","me","meanwhile","might","mill","mine","more","moreover","most","mostly","move","much","must","my","myself","name","namely","neither","never","never
theless","next","nine","no","nobody","none","noone","nor","not","nothing","now","nowhere","of","off","often","on","once","one","only","onto","or","other","others","otherwise","our","ours","ourselves","out","over","own","part","per","perhaps","please","put","rather","re","same","see","seem","seemed","seeming","seems","serious","several","she","should","show","side","since","sincere","six","sixty","so","some","somehow","someone","something","sometime","sometimes","somewhere","still","such","system","take","ten","than","that","the","their","them","themselves","then","thence","there","thereafter","thereby","therefore","therein","thereupon","these","they","thickv","thin","third","this","those","though","three","through","throughout","thru","thus","to","together","too","top","toward","towards","twelve","twenty","two","un","under","until","up","upon","us","very","via","was","we","well","were","what","whatever","when","whence","whenever","where","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","whoever","whole","whom","whose","why","will","with","within","without","would","yet","you","your","yours","yourself","yourselves","the"]; 157 | 158 | var vectorKeywordIndexLength = 0; 159 | 160 | function indexFields(fields,docs){ 161 | var result = []; 162 | for( var field in fields){ 163 | var data = docs[fields[field]]; 164 | for( i=0; i 17 | // All rights reserved. 18 | // 19 | // http://swoodbridge.com/DoubleMetaPhone/ 20 | // 21 | // This PHP translation is based heavily on the C implementation 22 | // by Maurice Aubrey , which in turn 23 | // is based heavily on the C++ implementation by 24 | // Lawrence Philips and incorporates several bug fixes courtesy 25 | // of Kevin Atkinson . 26 | // 27 | // This module is free software; you may redistribute it and/or 28 | // modify it under the same terms as Perl itself. 
29 | // 30 | // CONTRIBUTIONS 31 | // 32 | // 17-May-2002 Geoff Caplan http://www.advantae.com 33 | // Bug fix: added code to return class object which I forgot to do 34 | // Created a functional callable version instead of the class version 35 | // which is faster if you are calling this a lot. 36 | // 37 | // ------------------------------------------------------------------ 38 | 39 | // TODO: this file defines a bunch of globals in it, should be converted 40 | exports.DoubleMetaphone = function(string) { 41 | primary = ""; 42 | secondary = ""; 43 | current = 0; 44 | 45 | current = 0; 46 | length = string.length; 47 | last = length - 1; 48 | original = string + " "; 49 | 50 | original = original.toUpperCase(); 51 | 52 | // skip this at beginning of word 53 | 54 | if (string_at(original, 0, 2, 55 | ['GN', 'KN', 'PN', 'WR', 'PS'])) 56 | current++; 57 | 58 | // Initial 'X' is pronounced 'Z' e.g. 'Xavier' 59 | 60 | if (original.substr(0, 1) == 'X') { 61 | primary += "S"; // 'Z' maps to 'S' 62 | secondary += "S"; 63 | current++; 64 | } 65 | 66 | // main loop 67 | 68 | while (primary.length < 4 || secondary.length < 4) { 69 | if (current >= length) 70 | break; 71 | 72 | switch (original.substr(current, 1)) { 73 | case 'A': 74 | case 'E': 75 | case 'I': 76 | case 'O': 77 | case 'U': 78 | case 'Y': 79 | if (current == 0) { 80 | // all init vowels now map to 'A' 81 | primary += 'A'; 82 | secondary += 'A'; 83 | } 84 | current += 1; 85 | break; 86 | 87 | case 'B': 88 | // '-mb', e.g. "dumb", already skipped over ... 
89 | primary += 'P'; 90 | secondary += 'P'; 91 | 92 | if (original.substr(current + 1, 1) == 'B') 93 | current += 2; 94 | else 95 | current += 1; 96 | break; 97 | 98 | case 'Ç': 99 | primary += 'S'; 100 | secondary += 'S'; 101 | current += 1; 102 | break; 103 | 104 | case 'C': 105 | // various gremanic 106 | if ((current > 1) 107 | && !is_vowel(original, current - 2) 108 | && string_at(original, current - 1, 3, 109 | ["ACH"]) 110 | && ((original.substr(current + 2, 1) != 'I') 111 | && ((original.substr(current + 2, 1) != 'E') 112 | || string_at(original, current - 2, 6, 113 | ["BACHER", "MACHER"])))) { 114 | 115 | primary += 'K'; 116 | secondary += 'K'; 117 | current += 2; 118 | break; 119 | } 120 | 121 | // special case 'caesar' 122 | if ((current == 0) 123 | && string_at(original, current, 6, 124 | ["CAESAR"])) { 125 | primary += 'S'; 126 | secondary += 'S'; 127 | current += 2; 128 | break; 129 | } 130 | 131 | // italian 'chianti' 132 | if (string_at(original, current, 4, 133 | ["CHIA"])) { 134 | primary += 'K'; 135 | secondary += 'K'; 136 | current += 2; 137 | break; 138 | } 139 | 140 | if (string_at(original, current, 2, 141 | ["CH"])) { 142 | 143 | // find 'michael' 144 | if ((current > 0) 145 | && string_at(original, current, 4, 146 | ["CHAE"])) { 147 | primary += 'K'; 148 | secondary += 'X'; 149 | current += 2; 150 | break; 151 | } 152 | 153 | // greek roots e.g. 
'chemistry', 'chorus' 154 | if ((current == 0) 155 | && (string_at(original, current + 1, 5, 156 | ["HARAC", "HARIS"]) 157 | || string_at(original, current + 1, 3, 158 | ["HOR", "HYM", "HIA", "HEM"])) 159 | && !string_at(original, 0, 5, ["CHORE"])) { 160 | primary += 'K'; 161 | secondary += 'K'; 162 | current += 2; 163 | break; 164 | } 165 | 166 | // germanic, greek, or otherwise 'ch' for 'kh' sound 167 | if ((string_at(original, 0, 4, ["VAN ", "VON "]) 168 | || string_at(original, 0, 3, ["SCH"])) 169 | // 'architect' but not 'arch', orchestra', 'orchid' 170 | || string_at(original, current - 2, 6, 171 | ["ORCHES", "ARCHIT", "ORCHID"]) 172 | || string_at(original, current + 2, 1, 173 | ["T", "S"]) 174 | || ((string_at(original, current - 1, 1, 175 | ["A","O","U","E"]) 176 | || (current == 0)) 177 | // e.g. 'wachtler', 'weschsler', but not 'tichner' 178 | && string_at(original, current + 2, 1, 179 | ["L","R","N","M","B","H","F","V","W"," "]))) { 180 | primary += 'K'; 181 | secondary += 'K'; 182 | } else { 183 | if (current > 0) { 184 | if (string_at(original, 0, 2, ["MC"])) { 185 | // e.g. 'McHugh' 186 | primary += 'K'; 187 | secondary += 'K'; 188 | } else { 189 | primary += 'X'; 190 | secondary += 'K'; 191 | } 192 | } else { 193 | primary += 'X'; 194 | secondary += 'X'; 195 | } 196 | } 197 | current += 2; 198 | break; 199 | } 200 | 201 | // e.g. 'czerny' 202 | if (string_at(original, current, 2, ["CZ"]) 203 | && !string_at(original, current -2, 4, 204 | ["WICZ"])) { 205 | primary += 'S'; 206 | secondary += 'X'; 207 | current += 2; 208 | break; 209 | } 210 | 211 | // e.g. 
'focaccia' 212 | if (string_at(original, current + 1, 3, 213 | ["CIA"])) { 214 | primary += 'X'; 215 | secondary += 'X'; 216 | current += 3; 217 | break; 218 | } 219 | 220 | // double 'C', but not McClellan' 221 | if (string_at(original, current, 2, ["CC"]) 222 | && !((current == 1) 223 | && (original.substr(0, 1) == 'M'))) { 224 | // 'bellocchio' but not 'bacchus' 225 | if (string_at(original, current + 2, 1, 226 | ["I","E","H"]) 227 | && !string_at(original, current + 2, 2, 228 | ["HU"])) { 229 | // 'accident', 'accede', 'succeed' 230 | if (((current == 1) 231 | && (original.substr(current - 1, 1) == 'A')) 232 | || string_at(original, current - 1, 5, 233 | ["UCCEE", "UCCES"])) { 234 | primary += "KS"; 235 | secondary += "KS"; 236 | // 'bacci', 'bertucci', other italian 237 | } else { 238 | primary += "X"; 239 | secondary += "X"; 240 | } 241 | current += 3; 242 | break; 243 | } else { 244 | // Pierce's rule 245 | primary += "K"; 246 | secondary += "K"; 247 | current += 2; 248 | break; 249 | } 250 | } 251 | 252 | if (string_at(original, current, 2, 253 | ["CK","CG","CQ"])) { 254 | primary += "K"; 255 | secondary += "K"; 256 | current += 2; 257 | break; 258 | } 259 | 260 | if (string_at(original, current, 2, 261 | ["CI","CE","CY"])) { 262 | // italian vs. 
english 263 | if (string_at(original, current, 3, 264 | ["CIO","CIE","CIA"])) { 265 | primary += "S"; 266 | secondary += "X"; 267 | } else { 268 | primary += "S"; 269 | secondary += "S"; 270 | } 271 | current += 2; 272 | break; 273 | } 274 | 275 | // else 276 | primary += "K"; 277 | secondary += "K"; 278 | 279 | // name sent in 'mac caffrey', 'mac gregor' 280 | if (string_at(original, current + 1, 2, 281 | [" C"," Q"," G"])) { 282 | current += 3; 283 | } else { 284 | if (string_at(original, current + 1, 1, 285 | ["C","K","Q"]) 286 | && !string_at(original, current + 1, 2, 287 | ["CE","CI"])) { 288 | current += 2; 289 | } else { 290 | current += 1; 291 | } 292 | } 293 | break; 294 | 295 | case 'D': 296 | if (string_at(original, current, 2, 297 | ["DG"])) { 298 | if (string_at(original, current + 2, 1, 299 | ["I","E","Y"])) { 300 | // e.g. 'edge' 301 | primary += "J"; 302 | secondary += "J"; 303 | current += 3; 304 | break; 305 | } else { 306 | // e.g. 'edgar' 307 | primary += "TK"; 308 | secondary += "TK"; 309 | current += 2; 310 | break; 311 | } 312 | } 313 | 314 | if (string_at(original, current, 2, 315 | ["DT","DD"])) { 316 | primary += "T"; 317 | secondary += "T"; 318 | current += 2; 319 | break; 320 | } 321 | 322 | // else 323 | primary += "T"; 324 | secondary += "T"; 325 | current += 1; 326 | break; 327 | 328 | case 'F': 329 | if (original.substr(current + 1, 1) == 'F') 330 | current += 2; 331 | else 332 | current += 1; 333 | primary += "F"; 334 | secondary += "F"; 335 | break; 336 | 337 | case 'G': 338 | if (original.substr(current + 1, 1) == 'H') { 339 | if ((current > 0) 340 | && !is_vowel(original, current - 1)) { 341 | primary += "K"; 342 | secondary += "K"; 343 | current += 2; 344 | break; 345 | } 346 | 347 | if (current < 3) { 348 | // 'ghislane', 'ghiradelli' 349 | if (current == 0) { 350 | if (original.substr(current + 2, 1) == 'I') { 351 | primary += "J"; 352 | secondary += "J"; 353 | } else { 354 | primary += "K"; 355 | secondary += "K"; 356 | } 357 
| current += 2; 358 | break; 359 | } 360 | } 361 | 362 | // Parker's rule (with some further refinements) - e.g. 'hugh' 363 | if (((current > 1) 364 | && string_at(original, current - 2, 1, 365 | ["B","H","D"])) 366 | // e.g. 'bough' 367 | || ((current > 2) 368 | && string_at(original, current - 3, 1, 369 | ["B","H","D"])) 370 | // e.g. 'broughton' 371 | || ((current > 3) 372 | && string_at(original, current - 4, 1, 373 | ["B","H"]))) { 374 | current += 2; 375 | break; 376 | } else { 377 | // e.g. 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' 378 | if ((current > 2) 379 | && (original.substr(current - 1, 1) == 'U') 380 | && string_at(original, current - 3, 1, 381 | ["C","G","L","R","T"])) { 382 | primary += "F"; 383 | secondary += "F"; 384 | } else if ( (current > 0) && original.substr(current - 1, 1) != 'I') { 385 | primary += "K"; 386 | secondary += "K"; 387 | } 388 | current += 2; 389 | break; 390 | } 391 | } 392 | 393 | if (original.substr(current + 1, 1) == 'N') { 394 | if ((current == 1) && is_vowel(original, 0) 395 | && !Slavo_Germanic(original)) { 396 | primary += "KN"; 397 | secondary += "N"; 398 | } else { 399 | // not e.g. 
'cagney' 400 | if (!string_at(original, current + 2, 2, 401 | ["EY"]) 402 | && (original.substr(current + 1) != "Y") 403 | && !Slavo_Germanic(original)) { 404 | primary += "N"; 405 | secondary += "KN"; 406 | } else { 407 | primary += "KN"; 408 | secondary += "KN"; 409 | } 410 | } 411 | current += 2; 412 | break; 413 | } 414 | 415 | // 'tagliaro' 416 | if (string_at(original, current + 1, 2, 417 | ["LI"]) 418 | && !Slavo_Germanic(original)) { 419 | primary += "KL"; 420 | secondary += "L"; 421 | current += 2; 422 | break; 423 | } 424 | 425 | // -ges-, -gep-, -gel- at beginning 426 | if ((current == 0) 427 | && ((original.substr(current + 1, 1) == 'Y') 428 | || string_at(original, current + 1, 2, 429 | ["ES","EP","EB","EL","EY","IB","IL","IN","IE", 430 | "EI","ER"]))) { 431 | primary += "K"; 432 | secondary += "J"; 433 | current += 2; 434 | break; 435 | } 436 | 437 | // -ger-, -gy- 438 | if ((string_at(original, current + 1, 2, 439 | ["ER"]) 440 | || (original.substr(current + 1, 1) == 'Y')) 441 | && !string_at(original, 0, 6, 442 | ["DANGER","RANGER","MANGER"]) 443 | && !string_at(original, current -1, 1, 444 | ["E", "I"]) 445 | && !string_at(original, current -1, 3, 446 | ["RGY","OGY"])) { 447 | primary += "K"; 448 | secondary += "J"; 449 | current += 2; 450 | break; 451 | } 452 | 453 | // italian e.g. 
'biaggi' 454 | if (string_at(original, current + 1, 1, 455 | ["E","I","Y"]) 456 | || string_at(original, current -1, 4, 457 | ["AGGI","OGGI"])) { 458 | // obvious germanic 459 | if ((string_at(original, 0, 4, ["VAN ", "VON "]) 460 | || string_at(original, 0, 3, ["SCH"])) 461 | || string_at(original, current + 1, 2, 462 | ["ET"])) { 463 | primary += "K"; 464 | secondary += "K"; 465 | } else { 466 | // always soft if french ending 467 | if (string_at(original, current + 1, 4, 468 | ["IER "])) { 469 | primary += "J"; 470 | secondary += "J"; 471 | } else { 472 | primary += "J"; 473 | secondary += "K"; 474 | } 475 | } 476 | current += 2; 477 | break; 478 | } 479 | 480 | if (original.substr(current +1, 1) == 'G') 481 | current += 2; 482 | else 483 | current += 1; 484 | 485 | primary += 'K'; 486 | secondary += 'K'; 487 | break; 488 | 489 | case 'H': 490 | // only keep if first & before vowel or btw. 2 vowels 491 | if (((current == 0) || 492 | is_vowel(original, current - 1)) 493 | && is_vowel(original, current + 1)) { 494 | primary += 'H'; 495 | secondary += 'H'; 496 | current += 2; 497 | } else 498 | current += 1; 499 | break; 500 | 501 | case 'J': 502 | // obvious spanish, 'jose', 'san jacinto' 503 | if (string_at(original, current, 4, 504 | ["JOSE"]) 505 | || string_at(original, 0, 4, ["SAN "])) { 506 | if (((current == 0) 507 | && (original.substr(current + 4, 1) == ' ')) 508 | || string_at(original, 0, 4, ["SAN "])) { 509 | primary += 'H'; 510 | secondary += 'H'; 511 | } else { 512 | primary += "J"; 513 | secondary += 'H'; 514 | } 515 | current += 1; 516 | break; 517 | } 518 | 519 | if ((current == 0) 520 | && !string_at(original, current, 4, 521 | ["JOSE"])) { 522 | primary += 'J'; // Yankelovich/Jankelowicz 523 | secondary += 'A'; 524 | } else { 525 | // spanish pron. of .e.g. 
'bajador' 526 | if (is_vowel(original, current - 1) 527 | && !Slavo_Germanic(original) 528 | && ((original.substr(current + 1, 1) == 'A') 529 | || (original.substr(current + 1, 1) == 'O'))) { 530 | primary += "J"; 531 | secondary += "H"; 532 | } else { 533 | if (current == last) { 534 | primary += "J"; 535 | secondary += ""; 536 | } else { 537 | if (!string_at(original, current + 1, 1, 538 | ["L","T","K","S","N","M","B","Z"]) 539 | && !string_at(original, current - 1, 1, 540 | ["S","K","L"])) { 541 | primary += "J"; 542 | secondary += "J"; 543 | } 544 | } 545 | } 546 | } 547 | 548 | if (original.substr(current + 1, 1) == 'J') // it could happen 549 | current += 2; 550 | else 551 | current += 1; 552 | break; 553 | 554 | case 'K': 555 | if (original.substr(current + 1, 1) == 'K') 556 | current += 2; 557 | else 558 | current += 1; 559 | primary += "K"; 560 | secondary += "K"; 561 | break; 562 | 563 | case 'L': 564 | if (original.substr(current + 1, 1) == 'L') { 565 | // spanish e.g. 'cabrillo', 'gallegos' 566 | if (((current == (length - 3)) 567 | && string_at(original, current - 1, 4, 568 | ["ILLO","ILLA","ALLE"])) 569 | || ((string_at(original, last-1, 2, 570 | ["AS","OS"]) 571 | || string_at(original, last, 1, 572 | ["A","O"])) 573 | && string_at(original, current - 1, 4, 574 | ["ALLE"]))) { 575 | primary += "L"; 576 | secondary += ""; 577 | current += 2; 578 | break; 579 | } 580 | current += 2; 581 | } else 582 | current += 1; 583 | primary += "L"; 584 | secondary += "L"; 585 | break; 586 | 587 | case 'M': 588 | if ((string_at(original, current - 1, 3, 589 | ["UMB"]) 590 | && (((current + 1) == last) 591 | || string_at(original, current + 2, 2, 592 | ["ER"]))) 593 | // 'dumb', 'thumb' 594 | || (original.substr(current + 1, 1) == 'M')) { 595 | current += 2; 596 | } else { 597 | current += 1; 598 | } 599 | primary += "M"; 600 | secondary += "M"; 601 | break; 602 | 603 | case 'N': 604 | if (original.substr(current + 1, 1) == 'N') 605 | current += 2; 606 | else 607 | 
current += 1; 608 | primary += "N"; 609 | secondary += "N"; 610 | break; 611 | 612 | case 'Ñ': 613 | current += 1; 614 | primary += "N"; 615 | secondary += "N"; 616 | break; 617 | 618 | case 'P': 619 | if (original.substr(current + 1, 1) == 'H') { 620 | current += 2; 621 | primary += "F"; 622 | secondary += "F"; 623 | break; 624 | } 625 | 626 | // also account for "campbell" and "raspberry" 627 | if (string_at(original, current + 1, 1, 628 | ["P","B"])) 629 | current += 2; 630 | else 631 | current += 1; 632 | primary += "P"; 633 | secondary += "P"; 634 | break; 635 | 636 | case 'Q': 637 | if (original.substr(current + 1, 1) == 'Q') 638 | current += 2; 639 | else 640 | current += 1; 641 | primary += "K"; 642 | secondary += "K"; 643 | break; 644 | 645 | case 'R': 646 | // french e.g. 'rogier', but exclude 'hochmeier' 647 | if ((current == last) 648 | && !Slavo_Germanic(original) 649 | && string_at(original, current - 2, 2, 650 | ["IE"]) 651 | && !string_at(original, current - 4, 2, 652 | ["ME","MA"])) { 653 | primary += ""; 654 | secondary += "R"; 655 | } else { 656 | primary += "R"; 657 | secondary += "R"; 658 | } 659 | if (original.substr(current + 1, 1) == 'R') 660 | current += 2; 661 | else 662 | current += 1; 663 | break; 664 | 665 | case 'S': 666 | // special cases 'island', 'isle', 'carlisle', 'carlysle' 667 | if (string_at(original, current - 1, 3, 668 | ["ISL","YSL"])) { 669 | current += 1; 670 | break; 671 | } 672 | 673 | // special case 'sugar-' 674 | if ((current == 0) 675 | && string_at(original, current, 5, 676 | ["SUGAR"])) { 677 | primary += "X"; 678 | secondary += "S"; 679 | current += 1; 680 | break; 681 | } 682 | 683 | if (string_at(original, current, 2, 684 | ["SH"])) { 685 | // germanic 686 | if (string_at(original, current + 1, 4, 687 | ["HEIM","HOEK","HOLM","HOLZ"])) { 688 | primary += "S"; 689 | secondary += "S"; 690 | } else { 691 | primary += "X"; 692 | secondary += "X"; 693 | } 694 | current += 2; 695 | break; 696 | } 697 | 698 | // italian 
& armenian 699 | if (string_at(original, current, 3, 700 | ["SIO","SIA"]) 701 | || string_at(original, current, 4, 702 | ["SIAN"])) { 703 | if (!Slavo_Germanic(original)) { 704 | primary += "S"; 705 | secondary += "X"; 706 | } else { 707 | primary += "S"; 708 | secondary += "S"; 709 | } 710 | current += 3; 711 | break; 712 | } 713 | 714 | // german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' 715 | // also, -sz- in slavic language altho in hungarian it is pronounced 's' 716 | if (((current == 0) 717 | && string_at(original, current + 1, 1, 718 | ["M","N","L","W"])) 719 | || string_at(original, current + 1, 1, 720 | ["Z"])) { 721 | primary += "S"; 722 | secondary += "X"; 723 | if (string_at(original, current + 1, 1, 724 | ["Z"])) 725 | current += 2; 726 | else 727 | current += 1; 728 | break; 729 | } 730 | 731 | if (string_at(original, current, 2, 732 | ["SC"])) { 733 | // Schlesinger's rule 734 | if (original.substr(current + 2, 1) == 'H') 735 | // dutch origin, e.g. 'school', 'schooner' 736 | if (string_at(original, current + 3, 2, 737 | ["OO","ER","EN","UY","ED","EM"])) { 738 | // 'schermerhorn', 'schenker' 739 | if (string_at(original, current + 3, 2, 740 | ["ER","EN"])) { 741 | primary += "X"; 742 | secondary += "SK"; 743 | } else { 744 | primary += "SK"; 745 | secondary += "SK"; 746 | } 747 | current += 3; 748 | break; 749 | } else { 750 | if ((current == 0) 751 | && !is_vowel(original, 3) 752 | && (original.substr(current + 3, 1) != 'W')) { 753 | primary += "X"; 754 | secondary += "S"; 755 | } else { 756 | primary += "X"; 757 | secondary += "X"; 758 | } 759 | current += 3; 760 | break; 761 | } 762 | 763 | if (string_at(original, current + 2, 1, 764 | ["I","E","Y"])) { 765 | primary += "S"; 766 | secondary += "S"; 767 | current += 3; 768 | break; 769 | } 770 | 771 | // else 772 | primary += "SK"; 773 | secondary += "SK"; 774 | current += 3; 775 | break; 776 | } 777 | 778 | // french e.g. 
'resnais', 'artois' 779 | if ((current == last) 780 | && string_at(original, current - 2, 2, 781 | ["AI","OI"])) { 782 | primary += ""; 783 | secondary += "S"; 784 | } else { 785 | primary += "S"; 786 | secondary += "S"; 787 | } 788 | 789 | if (string_at(original, current + 1, 1, 790 | ["S","Z"])) 791 | current += 2; 792 | else 793 | current += 1; 794 | break; 795 | 796 | case 'T': 797 | if (string_at(original, current, 4, 798 | ["TION"])) { 799 | primary += "X"; 800 | secondary += "X"; 801 | current += 3; 802 | break; 803 | } 804 | 805 | if (string_at(original, current, 3, 806 | ["TIA","TCH"])) { 807 | primary += "X"; 808 | secondary += "X"; 809 | current += 3; 810 | break; 811 | } 812 | 813 | if (string_at(original, current, 2, 814 | ["TH"]) 815 | || string_at(original, current, 3, 816 | ["TTH"])) { 817 | // special case 'thomas', 'thames' or germanic 818 | if (string_at(original, current + 2, 2, 819 | ["OM","AM"]) 820 | || string_at(original, 0, 4, ["VAN ","VON "]) 821 | || string_at(original, 0, 3, ["SCH"])) { 822 | primary += "T"; 823 | secondary += "T"; 824 | } else { 825 | primary += "0"; 826 | secondary += "T"; 827 | } 828 | current += 2; 829 | break; 830 | } 831 | 832 | if (string_at(original, current + 1, 1, 833 | ["T","D"])) 834 | current += 2; 835 | else 836 | current += 1; 837 | primary += "T"; 838 | secondary += "T"; 839 | break; 840 | 841 | case 'V': 842 | if (original.substr(current + 1, 1) == 'V') 843 | current += 2; 844 | else 845 | current += 1; 846 | primary += "F"; 847 | secondary += "F"; 848 | break; 849 | 850 | case 'W': 851 | // can also be in middle of word 852 | if (string_at(original, current, 2, ["WR"])) { 853 | primary += "R"; 854 | secondary += "R"; 855 | current += 2; 856 | break; 857 | } 858 | 859 | if ((current == 0) 860 | && (is_vowel(original, current + 1) 861 | || string_at(original, current, 2, 862 | ["WH"]))) { 863 | // Wasserman should match Vasserman 864 | if (is_vowel(original, current + 1)) { 865 | primary += "A"; 866 | 
secondary += "F"; 867 | } else { 868 | // need Uomo to match Womo 869 | primary += "A"; 870 | secondary += "A"; 871 | } 872 | } 873 | 874 | // Arnow should match Arnoff 875 | if (((current == last) 876 | && is_vowel(original, current - 1)) 877 | || string_at(original, current - 1, 5, 878 | ["EWSKI","EWSKY","OWSKI","OWSKY"]) 879 | || string_at(original, 0, 3, ["SCH"])) { 880 | primary += ""; 881 | secondary += "F"; 882 | current += 1; 883 | break; 884 | } 885 | 886 | // polish e.g. 'filipowicz' 887 | if (string_at(original, current, 4, 888 | ["WICZ","WITZ"])) { 889 | primary += "TS"; 890 | secondary += "FX"; 891 | current += 4; 892 | break; 893 | } 894 | 895 | // else skip it 896 | current += 1; 897 | break; 898 | 899 | case 'X': 900 | // french e.g. breaux 901 | if (!((current == last) 902 | && (string_at(original, current - 3, 3, 903 | ["IAU", "EAU"]) 904 | || string_at(original, current - 2, 2, 905 | ["AU", "OU"])))) { 906 | primary += "KS"; 907 | secondary += "KS"; 908 | } 909 | 910 | if (string_at(original, current + 1, 1, 911 | ["C","X"])) 912 | current += 2; 913 | else 914 | current += 1; 915 | break; 916 | 917 | case 'Z': 918 | // chinese pinyin e.g. 
'zhao' 919 | if (original.substr(current + 1, 1) == "H") { 920 | primary += "J"; 921 | secondary += "J"; 922 | current += 2; 923 | break; 924 | } else if (string_at(original, current + 1, 2, 925 | ["ZO", "ZI", "ZA"]) 926 | || (Slavo_Germanic(original) 927 | && ((current > 0) 928 | && original.substr(current - 1, 1) != 'T'))) { 929 | primary += "S"; 930 | secondary += "TS"; 931 | } else { 932 | primary += "S"; 933 | secondary += "S"; 934 | } 935 | 936 | if (original.substr(current + 1, 1) == 'Z') 937 | current += 2; 938 | else 939 | current += 1; 940 | break; 941 | 942 | default: 943 | current += 1; 944 | 945 | } // end switch 946 | 947 | } // end while 948 | 949 | primary = primary.substr( 0, 4); 950 | secondary = secondary.substr(0, 4); 951 | 952 | if( primary == secondary ) 953 | { 954 | secondary = null ; 955 | } 956 | 957 | return { 958 | primary: primary, 959 | secondary: secondary 960 | } 961 | } 962 | 963 | /*=================================================================*\ 964 | # Name: string_at(string, start, length, list) 965 | # Purpose: Helper function for DoubleMetaphone( ) 966 | # Return: Bool 967 | \*=================================================================*/ 968 | function string_at(string, start, length, list) 969 | { 970 | if ((start <0) || (start >= string.length)) 971 | return 0; 972 | 973 | for (var i=0, len=list.length; i0 54 | meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1 55 | mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1 56 | s_v = "^(" + C + ")?" 
+ v; // vowel in stem 57 | 58 | return { 59 | process: function(w) { 60 | var stem, 61 | suffix, 62 | firstch, 63 | re, 64 | re2, 65 | re3, 66 | re4, 67 | origword = w; 68 | 69 | if (w.length < 3) { return w; } 70 | 71 | firstch = w.substr(0,1); 72 | if (firstch == "y") { 73 | w = firstch.toUpperCase() + w.substr(1); 74 | } 75 | 76 | // Step 1a 77 | re = /^(.+?)(ss|i)es$/; 78 | re2 = /^(.+?)([^s])s$/; 79 | 80 | if (re.test(w)) { w = w.replace(re,"$1$2"); } 81 | else if (re2.test(w)) { w = w.replace(re2,"$1$2"); } 82 | 83 | // Step 1b 84 | re = /^(.+?)eed$/; 85 | re2 = /^(.+?)(ed|ing)$/; 86 | if (re.test(w)) { 87 | var fp = re.exec(w); 88 | re = new RegExp(mgr0); 89 | if (re.test(fp[1])) { 90 | re = /.$/; 91 | w = w.replace(re,""); 92 | } 93 | } else if (re2.test(w)) { 94 | var fp = re2.exec(w); 95 | stem = fp[1]; 96 | re2 = new RegExp(s_v); 97 | if (re2.test(stem)) { 98 | w = stem; 99 | re2 = /(at|bl|iz)$/; 100 | re3 = new RegExp("([^aeiouylsz])\\1$"); 101 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 102 | if (re2.test(w)) { w = w + "e"; } 103 | else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); } 104 | else if (re4.test(w)) { w = w + "e"; } 105 | } 106 | } 107 | 108 | // Step 1c 109 | re = /^(.+?)y$/; 110 | if (re.test(w)) { 111 | var fp = re.exec(w); 112 | stem = fp[1]; 113 | re = new RegExp(s_v); 114 | if (re.test(stem)) { w = stem + "i"; } 115 | } 116 | 117 | // Step 2 118 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 119 | if (re.test(w)) { 120 | var fp = re.exec(w); 121 | stem = fp[1]; 122 | suffix = fp[2]; 123 | re = new RegExp(mgr0); 124 | if (re.test(stem)) { 125 | w = stem + step2list[suffix]; 126 | } 127 | } 128 | 129 | // Step 3 130 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 131 | if (re.test(w)) { 132 | var fp = re.exec(w); 133 | stem = fp[1]; 134 | suffix = fp[2]; 135 | re = new RegExp(mgr0); 136 | if (re.test(stem)) { 137 | w 
= stem + step3list[suffix]; 138 | } 139 | } 140 | 141 | // Step 4 142 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 143 | re2 = /^(.+?)(s|t)(ion)$/; 144 | if (re.test(w)) { 145 | var fp = re.exec(w); 146 | stem = fp[1]; 147 | re = new RegExp(mgr1); 148 | if (re.test(stem)) { 149 | w = stem; 150 | } 151 | } else if (re2.test(w)) { 152 | var fp = re2.exec(w); 153 | stem = fp[1] + fp[2]; 154 | re2 = new RegExp(mgr1); 155 | if (re2.test(stem)) { 156 | w = stem; 157 | } 158 | } 159 | 160 | // Step 5 161 | re = /^(.+?)e$/; 162 | if (re.test(w)) { 163 | var fp = re.exec(w); 164 | stem = fp[1]; 165 | re = new RegExp(mgr1); 166 | re2 = new RegExp(meq1); 167 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 168 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) { 169 | w = stem; 170 | } 171 | } 172 | 173 | re = /ll$/; 174 | re2 = new RegExp(mgr1); 175 | if (re.test(w) && re2.test(w)) { 176 | re = /.$/; 177 | w = w.replace(re,""); 178 | } 179 | 180 | // and turn initial Y back to y 181 | 182 | if (firstch == "y") { 183 | w = firstch.toLowerCase() + w.substr(1); 184 | } 185 | 186 | return w; 187 | } 188 | } 189 | } -------------------------------------------------------------------------------- /lib/node-search/tokenizer/index.js: -------------------------------------------------------------------------------- 1 | var DoubleMetaphone = require("../double-metaphone").DoubleMetaphone; 2 | var PorterStemmer = require("../porter-stemmer").PorterStemmer; 3 | 4 | exports.Tokenizer = function() { 5 | return { 6 | stemmer:PorterStemmer(), 7 | process: function(words){ 8 | var result = []; 9 | for( var i=0; i' 8 | # c.option '-b', '--bar [string]', 'Does some bar with [string]' 9 | # c.example 'Do some foo', 'jspec example --foo bar' 10 | # c.example 'Do some bar', 'jspec example --bar' 11 | # c.when_called do |args, options| 12 | # p args 13 | # p options.__hash__ 14 | # # options.foo 15 | # # options.bar 16 | # # 
options.__hash__[:foo] 17 | # # options.__hash__[:bar] 18 | # end 19 | # end -------------------------------------------------------------------------------- /spec/node.js: -------------------------------------------------------------------------------- 1 | 2 | require.paths.unshift('spec', '/usr/local/lib/ruby/gems/1.8/gems/jspec-4.3.3/lib', 'lib') 3 | require('jspec') 4 | require('unit/spec.helper') 5 | require('yourlib') 6 | 7 | JSpec 8 | .exec('spec/unit/spec.js') 9 | .run({ reporter: JSpec.reporters.Terminal, fixturePath: 'spec/fixtures', failuresOnly: true }) 10 | .report() 11 | -------------------------------------------------------------------------------- /spec/unit/spec.helper.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talltyler/node-search/caeb8759faf1c08b43da91dc9c34ec45d94da779/spec/unit/spec.helper.js -------------------------------------------------------------------------------- /spec/unit/spec.js: -------------------------------------------------------------------------------- 1 | JSpec.describe('Search', function(){ 2 | before_each(function{ 3 | //TODO 4 | //search = new Search 5 | }) 6 | 7 | describe('addProducts', function(){ 8 | it ('should add several products', function(){ 9 | cart.addProducts('cookie') 10 | cart.addProducts('icecream') 11 | expect(cart).to(have, 2, 'products') 12 | }) 13 | }) 14 | 15 | describe('checkout', function(){ 16 | it ('should throw an error when checking out with no products', function(){ 17 | expect(function(){ cart.clear().checkout() }).to(throw_error, EmptyCart) 18 | }) 19 | }) 20 | }) 21 | 22 | 23 | --------------------------------------------------------------------------------