├── History.md ├── Readme.md ├── benchmark ├── app.js ├── data │ └── example.db └── lib │ └── nstore.js ├── examples ├── app.js ├── data │ └── example.db └── lib │ └── nstore.js ├── lib ├── node-search.js └── node-search │ ├── double-metaphone │ └── index.js │ ├── math │ ├── vector-utils.js │ └── vector.js │ ├── porter-stemmer │ └── index.js │ └── tokenizer │ └── index.js ├── package.json └── spec ├── commands └── example_command.rb ├── node.js └── unit ├── spec.helper.js └── spec.js /History.md: -------------------------------------------------------------------------------- 1 | 2 | 0.0.1 / YYYY-MM-DD 3 | ------------------ 4 | 5 | * Initial release 6 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # A JavaScript full text search engine 2 | 3 | This is an implementation of a "vector space model" with "Porter stemming", "double-metaphones", false boolean searches, field searching and field weighting. Basically it is a full text search engine that has most of the fancy features of the well known search engine libraries like Sphinx, Solr, Lucene, etc. The big difference is that it is written in JavaScript originally for use with Node.js. 
4 | 5 | 6 | ## Example 7 | 8 | var NodeSearch = require('./../lib/node-search').NodeSearch; 9 | var nStore = require('./lib/nstore'); 10 | 11 | // A simple data set to search over, feel free to use any data source, nstore uses JavaScript objects so it's simple 12 | var db = nStore('data/example.db'); 13 | 14 | var search = new NodeSearch(); 15 | search.fieldWeights.title = 2; // Make one/or many of the document fields more important 16 | var stream = db.stream(); 17 | stream.addListener('data', function (doc, meta) { 18 | search.index(meta.key,doc); 19 | }); 20 | 21 | stream.addListener('end', function () { // when the indexing is finished 22 | search.query("meet !poultry", null, function (results) { // search and wait for the results 23 | results.forEach(function(result){ 24 | db.get(result.key, function (err, doc, meta) { 25 | if(err) throw err; 26 | console.log(result.key+" "+doc.title +" "+doc.body +" "+ result.rank); 27 | }); 28 | }); 29 | }); 30 | }); 31 | 32 | 33 | ## This code is based on idea and code from 34 | 35 | http://blog.josephwilk.net/projects/building-a-vector-space-search-engine-in-python.html 36 | http://github.com/maritz/js-double-metaphone/raw/master/double-metaphone.js 37 | http://yeti-witch.googlecode.com/svn/trunk/lib/porter-stemmer.js 38 | http://www.koders.com/javascript/fidACD9DF0C1463CFC127D8C8B767B77122F3FC7331.aspx 39 | http://playnice.ly/blog/2010/05/05/a-fast-fuzzy-full-text-index-using-redis/ 40 | http://users.telenet.be/paul.larmuseau/SVD.htm 41 | http://gist.github.com/389875 42 | http://sylvester.jcoglan.com/api/matrix 43 | http://www.uni-bonn.de/~manfear/matrixcalc.php 44 | http://www.sphinxsearch.com/docs/manual-1.10.html#boolean-syntax 45 | http://stackoverflow.com/questions/90580/word-frequency-algorithm-for-natural-language-processing 46 | http://stackoverflow.com/questions/2699646/how-to-get-logical-parts-of-a-sentence-with-java 47 | 48 | 49 | ## Things that it doesn't currently do but would like to look into: 50 | 51 | 
Phrase based searches, everything is word based, combinations of words are not currently supported 52 | Exact matches, all words are converted to stemmed metaphones so "ponies" is indexed as the sound of "pony". 53 | Date based searches or other meta data with additional logic are not currently supported 54 | tf-idf based ranking, currently using a term count 55 | http://blog.josephwilk.net/projects/latent-semantic-analysis-in-python.html 56 | http://en.wikipedia.org/wiki/Lanczos_method 57 | http://en.wikipedia.org/wiki/Latent_semantic_indexing 58 | http://en.wikipedia.org/wiki/Probabilistic_latent_semantic_analysis 59 | http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation 60 | http://en.wikipedia.org/wiki/Part-of-speech_tagging 61 | 62 | 63 | ## License 64 | 65 | (The MIT License) 66 | 67 | Copyright (c) 2009 Motion & Color <Tyler Larson> 68 | 69 | Permission is hereby granted, free of charge, to any person obtaining 70 | a copy of this software and associated documentation files (the 71 | 'Software'), to deal in the Software without restriction, including 72 | without limitation the rights to use, copy, modify, merge, publish, 73 | distribute, sublicense, and/or sell copies of the Software, and to 74 | permit persons to whom the Software is furnished to do so, subject to 75 | the following conditions: 76 | 77 | The above copyright notice and this permission notice shall be 78 | included in all copies or substantial portions of the Software. 79 | 80 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 81 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 82 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 83 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 84 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 85 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 86 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /benchmark/app.js: -------------------------------------------------------------------------------- 1 | var NodeSearch = require('./../lib/node-search').NodeSearch; 2 | var nStore = require('./lib/nstore'); 3 | 4 | // A simple data set to search over, feel free to use any data source 5 | var db = nStore('data/example.db'); 6 | 7 | // create a sample database to test from 8 | for(var i=10,l=1000;i 2 | // 3 | // MIT licensed 4 | 5 | 6 | var sys = require('sys'), 7 | fs = require('fs'), 8 | Path = require('path'), 9 | Buffer = require('buffer').Buffer; 10 | 11 | // This size only affects performance, it's not a constraint on data sizes 12 | var CHUNK_SIZE = 1024; 13 | // This is the max size of a single serialized document 14 | var MAX_SIZE = 1024 * 1024; 15 | 16 | // Reads from a given file descriptor at a specified position and length 17 | // Handles all OS level chunking for you. 18 | // Callback gets (err, buffer) 19 | function fsRead(fd, position, length, callback) { 20 | var buffer = new Buffer(length), 21 | offset = 0; 22 | 23 | function readChunk() { 24 | fs.read(fd, buffer, offset, length - offset, position, function (err, bytesRead) { 25 | if (err) { callback(err); return; } 26 | 27 | offset += bytesRead; 28 | 29 | if (offset < length) { 30 | readChunk(); 31 | return; 32 | } 33 | callback(null, buffer); 34 | }); 35 | } 36 | readChunk(); 37 | } 38 | 39 | // Writes a buffer to a specified file descriptor at the given offset 40 | // handles chunking for you. 
41 | // Callback gets (err) 42 | function fsWrite(fd, buffer, position, callback) { 43 | var offset = 0, 44 | length = buffer.length; 45 | 46 | function writeChunk() { 47 | fs.write(fd, buffer, offset, length - offset, position, function (err, bytesWritten) { 48 | if (err) { callback(err); return; } 49 | offset += bytesWritten; 50 | if (offset < length) { 51 | writeChunk(); 52 | return; 53 | } 54 | callback(); 55 | }); 56 | } 57 | writeChunk(); 58 | } 59 | 60 | 61 | function nStore(filename, filterFn, isTemp) { 62 | var fd, // FD for reading and writing to the file 63 | index = {}, // Index of file positions of all documents by key 64 | writeQueue = [], // Queue of new docs to write to the hd 65 | stale = 0, 66 | dbLength = 0, // The size of the current db file in bytes 67 | compacting = false, 68 | lastCompact = Date.now(); 69 | 70 | // Open a single handle for reading and writing 71 | fd = fs.openSync(filename, "a+"); 72 | 73 | // Generates a random unique 16 char base 36 string 74 | // (about 2^83 possible keys) 75 | function makeUUID() { 76 | var key = ""; 77 | while (key.length < 16) { 78 | key += Math.floor(Math.random() * 0x290d7410000).toString(36); 79 | } 80 | key = key.substr(0, 16); 81 | if (key in index) { 82 | return makeUUID(); 83 | } 84 | return key; 85 | } 86 | 87 | // Load a single record from the disk 88 | function getByKey(key, callback) { 89 | try { 90 | var info = index[key]; 91 | if (!info) { 92 | var error = new Error("Document does not exist for " + key); 93 | error.errno = process.ENOENT; 94 | callback(error); 95 | return; 96 | } 97 | 98 | fsRead(fd, info.position, info.length, function (err, buffer) { 99 | if (err) { callback(err); return; } 100 | try { 101 | var data = JSON.parse(buffer.toString()); 102 | callback(null, data, info.meta); 103 | } catch (err) { 104 | callback(err); 105 | } 106 | }); 107 | } catch (err) { 108 | callback(err); 109 | } 110 | } 111 | 112 | 113 | 114 | function compact() { 115 | // Don't run if already clean or 
already compacting 116 | if (isTemp || compacting || stale === 0) { return; } 117 | compacting = true; 118 | var tmpFile = Path.join(Path.dirname(filename), makeUUID() + ".tmpdb"), 119 | tmpDb = nStore(tmpFile, null, true), 120 | keys = Object.keys(index), 121 | counter = keys.length; 122 | 123 | keys.forEach(function (key) { 124 | getByKey(key, function (err, doc, meta) { 125 | if (err) { throw err; } 126 | 127 | function check() { 128 | counter--; 129 | if (counter === 0) { 130 | done(); 131 | } 132 | } 133 | 134 | // Hook to allow filtering when compacting 135 | // Great for things like session pruning 136 | if (filterFn && !filterFn(doc, meta)) { 137 | check(); 138 | return; 139 | } 140 | 141 | tmpDb.save(key, doc, function (err, meta) { 142 | if (err) { 143 | throw err; 144 | } 145 | check(); 146 | }); 147 | }); 148 | }); 149 | stale = 0; 150 | 151 | function done() { 152 | 153 | // Swap out stores 154 | var oldfd = fd; 155 | fd = tmpDb.fd; 156 | dbLength = tmpDb.dbLength; 157 | index = tmpDb.index; 158 | 159 | // And clean up the files 160 | fs.close(oldfd, function (err) { 161 | if (err) throw err; 162 | fs.unlink(filename, function (err) { 163 | if (err) throw err; 164 | fs.rename(tmpFile, filename, function (err) { 165 | if (err) throw err; 166 | compacting = false; 167 | lastCompact = Date.now(); 168 | checkQueue(); 169 | }); 170 | }); 171 | }); 172 | } 173 | 174 | } 175 | 176 | // Loads the database from disk using blocking I/O 177 | // TODO: see if non-blocking is faster, this takes a long time 178 | function loadDatabase() { 179 | 180 | // Create a buffer for reading chunks from the disk 181 | var chunk = new Buffer(CHUNK_SIZE); 182 | 183 | // Create an empty stream buffer 184 | var input = new Buffer(MAX_SIZE); 185 | input.length = 0; 186 | 187 | // These are positions in the database file 188 | var offset = 0; 189 | var base = 0; 190 | 191 | // This is a position within the input stream 192 | var pos = 0; 193 | var mid = 0; 194 | 195 | // Read a 
chunk from the file into `chunk` 196 | while ((chunk.length = fs.readSync(fd, chunk, 0, CHUNK_SIZE, offset)) > 0) { 197 | 198 | // Move the offset so the outer loop stays in sync 199 | offset += chunk.length; 200 | 201 | // Copy the chunk onto the input stream 202 | chunk.copy(input, input.length, 0, chunk.length); 203 | input.length += chunk.length; 204 | 205 | // See if there is input to consume 206 | for (var i = pos, l = input.length; i < l; i++) { 207 | if (input[i] === 9) { 208 | mid = i + 1; 209 | } 210 | if (mid && input[i] === 10) { 211 | // var doc = input.slice(pos, mid - 1).toString(); 212 | var meta = JSON.parse(input.slice(mid, i).toString()); 213 | var info = { 214 | meta: meta, 215 | position: base + pos, 216 | length: mid - pos - 1 217 | }; 218 | if (index[meta.key]) { 219 | stale++; 220 | } 221 | if (info.length > 0) { 222 | index[meta.key] = info; 223 | } else { 224 | delete index[meta.key]; 225 | } 226 | mid = 0; 227 | pos = i + 1; 228 | } 229 | } 230 | 231 | // Shift the input back down 232 | if (pos > 0) { 233 | input.copy(input, 0, pos, input.length); 234 | input.length -= pos; 235 | base += pos; 236 | pos = 0; 237 | } 238 | } 239 | 240 | dbLength = offset; 241 | 242 | } 243 | loadDatabase(); 244 | compact(); 245 | 246 | var lock = false; 247 | function checkQueue() { 248 | if (compacting || lock || writeQueue.length === 0) { return; } 249 | lock = true; 250 | 251 | // Pull some jobs off the writeQueue 252 | var length = writeQueue.length; 253 | var i = 0; 254 | var size = 0; 255 | var toWrite = []; 256 | var newIndex = {}; 257 | var position = dbLength; 258 | while (i < length && size < 50000) { 259 | var item = writeQueue[i]; 260 | var data = item.doc ? 
JSON.stringify(item.doc) : ""; 261 | var key = item.key; 262 | var meta = {key: key}; 263 | var line = new Buffer(data + "\t" + JSON.stringify(meta) + "\n"); 264 | var dataLength = Buffer.byteLength(data); 265 | // Generate a callback closure 266 | toWrite[toWrite.length] = { 267 | line: line, 268 | key: key, 269 | callback: item.callback 270 | }; 271 | newIndex[meta.key] = { 272 | position: dbLength, 273 | length: dataLength, 274 | meta: meta 275 | }; 276 | 277 | dbLength += line.length; 278 | size += line.length; 279 | i++; 280 | } 281 | length = i; 282 | writeQueue.splice(0, length); 283 | 284 | // Merge the buffers into one large one 285 | var offset = 0; 286 | var buffer = new Buffer(size); 287 | for (var i = 0; i < length; i++) { 288 | var line = toWrite[i].line; 289 | line.copy(buffer, offset); 290 | offset += line.length; 291 | } 292 | 293 | fsWrite(fd, buffer, position, function (err) { 294 | if (err) { 295 | throw err; 296 | } 297 | 298 | // Mix in the updated indexes 299 | var willCompact = false; 300 | var threshold = Object.keys(index).length; 301 | Object.keys(newIndex).forEach(function (key) { 302 | if (index[key]) { 303 | stale++; 304 | if (stale > threshold) { 305 | willCompact = true; 306 | } 307 | } 308 | 309 | if (newIndex[key].length === 0) { 310 | delete index[key]; 311 | } else { 312 | index[key] = newIndex[key]; 313 | } 314 | }); 315 | 316 | // Call all the individual callbacks for the write 317 | for (var i = 0; i < length; i++) { 318 | var item = toWrite[i]; 319 | var callback = item.callback; 320 | if (callback) { 321 | callback(err, {key: item.key}); 322 | } 323 | 324 | } 325 | 326 | // Unlock and try the loop again 327 | lock = false; 328 | if (willCompact && (Date.now() - lastCompact > 2000)) { 329 | compact(); 330 | } else { 331 | process.nextTick(checkQueue); 332 | } 333 | }); 334 | 335 | } 336 | 337 | function getStream(filter) { 338 | var counter = 0; 339 | var stream = new process.EventEmitter(); 340 | var queue = []; 341 | var 
paused = false; 342 | 343 | // Checks to see if we should emit the "end" event yet. 344 | function checkDone() { 345 | if (!paused && counter === 0) { 346 | counter--; 347 | stream.emit("end"); 348 | } 349 | } 350 | 351 | // Tries to push events through 352 | function flush() { 353 | if (paused) { return; } 354 | for (var i = 0, l = queue.length; i < l; i++) { 355 | var item = queue[i]; 356 | stream.emit("data", item.doc, item.meta); 357 | counter--; 358 | } 359 | queue.length = 0; 360 | process.nextTick(checkDone); 361 | } 362 | 363 | 364 | stream.pause = function () { 365 | paused = true; 366 | }; 367 | 368 | // Resumes emitting of events 369 | stream.resume = function () { 370 | paused = false; 371 | process.nextTick(function () { 372 | flush(); 373 | checkDone(); 374 | }); 375 | }; 376 | 377 | Object.keys(index).forEach(function (key) { 378 | counter++; 379 | getByKey(key, function (err, doc, meta) { 380 | if (err) { 381 | stream.emit("error", err); 382 | return; 383 | } 384 | if (!filter || filter(doc, meta)) { 385 | queue.push({ 386 | doc: doc, 387 | meta: meta 388 | }); 389 | flush(); 390 | } else { 391 | counter--; 392 | process.nextTick(checkDone); 393 | } 394 | }); 395 | }); 396 | 397 | process.nextTick(checkDone); 398 | 399 | return stream; 400 | } 401 | 402 | 403 | return { 404 | get length() { 405 | return Object.keys(index).length; 406 | }, 407 | 408 | // Saves a document with optional key. The effect if immediate to the 409 | // running program, but not persistent till after the callback. 410 | // Pass null as the key to get a generated key. 411 | save: function (key, doc, callback) { 412 | if (!key) { 413 | key = makeUUID(); 414 | } 415 | writeQueue[writeQueue.length] = { 416 | key: key, 417 | doc: doc, 418 | callback: callback 419 | }; 420 | checkQueue(); 421 | }, 422 | 423 | // Removes a document from the collection by key 424 | // The effect is immediate to the running program, but not permanent 425 | // till the callback returns. 
426 | remove: function (key, callback) { 427 | if (key in index) { 428 | delete index[key]; 429 | var line = new Buffer("\t" + JSON.stringify({key: key}) + "\n"); 430 | 431 | writeQueue[writeQueue.length] = { 432 | meta: {key: key}, 433 | position: dbLength, 434 | length: 0, 435 | line: line, 436 | callback: callback 437 | }; 438 | dbLength += line.length; 439 | checkQueue(); 440 | } else { 441 | var err = new Error("Cannot delete a document that does not exist"); 442 | err.errno = process.ENOENT; 443 | callback(err); 444 | } 445 | }, 446 | 447 | all: function (filter, callback) { 448 | if (typeof filter === 'function' && callback === undefined) { 449 | callback = filter; 450 | filter = null; 451 | } 452 | var docs = []; 453 | var metas = []; 454 | var stream = getStream(filter); 455 | stream.addListener('data', function (doc, meta) { 456 | docs.push(doc); 457 | metas.push(meta); 458 | }); 459 | stream.addListener('end', function () { 460 | callback(null, docs, metas); 461 | }); 462 | stream.addListener('error', callback); 463 | }, 464 | 465 | // Returns a readable stream of the whole collection. 466 | // Supports pause and resume so that you can delay events for layer. 
467 | // This queues "data" and "end" events in memory./ 468 | // Also you can provide a filter to pre-filter results before they 469 | // go to the queue 470 | stream: getStream, 471 | 472 | // Loads a single document by id, accepts key and callback 473 | // the callback will be called with (err, doc, meta) 474 | get: getByKey, 475 | 476 | 477 | // Removes all documents from a database 478 | clear: function () { 479 | index = {}; 480 | compact(); 481 | }, 482 | 483 | compact: compact, 484 | 485 | // Expose some private variables 486 | get index() { return index; }, 487 | get fd() { return fd; }, 488 | get dbLength() { return dbLength; }, 489 | 490 | // Expose the UUID maker 491 | makeUUID: makeUUID 492 | }; 493 | } 494 | 495 | module.exports = nStore; 496 | 497 | -------------------------------------------------------------------------------- /examples/app.js: -------------------------------------------------------------------------------- 1 | var NodeSearch = require('./../lib/node-search').NodeSearch; 2 | var nStore = require('./lib/nstore'); 3 | 4 | // A simple data set to search over, feel free to use any data source, nstore uses JavaScript objects so it's simple 5 | var db = nStore('data/example.db'); 6 | 7 | var search = new NodeSearch(); 8 | search.fieldWeights.title = 2; // Make one/or many of the document fields more important 9 | var stream = db.stream(); 10 | stream.addListener('data', function (doc, meta) { 11 | search.index(meta.key,doc); 12 | }); 13 | 14 | stream.addListener('end', function () { 15 | search.query("meet !poultry", null, function (results) { 16 | results.forEach(function(result){ 17 | db.get(result.key, function (err, doc, meta) { 18 | if(err) throw err; 19 | console.log(result.key+" "+doc.title +" "+doc.body +" "+ result.rank); 20 | }); 21 | }); 22 | }); 23 | }); -------------------------------------------------------------------------------- /examples/data/example.db: 
-------------------------------------------------------------------------------- 1 | {"title":"Full Text Search for Node.js","body":"Tyler Larson has created a full text search engine for Node.js"} {"key":"1"} 2 | {"title":"Daily Bulletin","body":"Humane society challenges meat industry over new law"} {"key":"2"} 3 | {"title":"Wilson County News","body":"Unprecedented meeting on COOL held in Kansas City"} {"key":"3"} 4 | {"title":"Meat & Poultry","body":"Industry reflects on USDA under Bush"} {"key":"4"} 5 | {"title":"Other County News","body":"The kids in the city are going crazy"} {"key":"5"} 6 | {"title":"Drovers","body":"Beefing up Safety"} {"key":"6"} 7 | {"title":"Supermarket News","body":"Humane Society opposes meat industry challenge"} {"key":"7"} 8 | {"title":"Daily Bulletin","body":"motion against non-ambulatory ban"} {"key":"8"} 9 | {"title":"Press-Enterprise","body":"California downer law lawsuit"} {"key":"9"} 10 | {"title":"Meeting place","body":"COOL funding"} {"key":"10"} 11 | -------------------------------------------------------------------------------- /examples/lib/nstore.js: -------------------------------------------------------------------------------- 1 | // Copyright 2010 Tim Caswell 2 | // 3 | // MIT licensed 4 | 5 | 6 | var sys = require('sys'), 7 | fs = require('fs'), 8 | Path = require('path'), 9 | Buffer = require('buffer').Buffer; 10 | 11 | // This size only affects performance, it's not a constraint on data sizes 12 | var CHUNK_SIZE = 1024; 13 | // This is the max size of a single serialized document 14 | var MAX_SIZE = 1024 * 1024; 15 | 16 | // Reads from a given file descriptor at a specified position and length 17 | // Handles all OS level chunking for you. 
18 | // Callback gets (err, buffer) 19 | function fsRead(fd, position, length, callback) { 20 | var buffer = new Buffer(length), 21 | offset = 0; 22 | 23 | function readChunk() { 24 | fs.read(fd, buffer, offset, length - offset, position, function (err, bytesRead) { 25 | if (err) { callback(err); return; } 26 | 27 | offset += bytesRead; 28 | 29 | if (offset < length) { 30 | readChunk(); 31 | return; 32 | } 33 | callback(null, buffer); 34 | }); 35 | } 36 | readChunk(); 37 | } 38 | 39 | // Writes a buffer to a specified file descriptor at the given offset 40 | // handles chunking for you. 41 | // Callback gets (err) 42 | function fsWrite(fd, buffer, position, callback) { 43 | var offset = 0, 44 | length = buffer.length; 45 | 46 | function writeChunk() { 47 | fs.write(fd, buffer, offset, length - offset, position, function (err, bytesWritten) { 48 | if (err) { callback(err); return; } 49 | offset += bytesWritten; 50 | if (offset < length) { 51 | writeChunk(); 52 | return; 53 | } 54 | callback(); 55 | }); 56 | } 57 | writeChunk(); 58 | } 59 | 60 | 61 | function nStore(filename, filterFn, isTemp) { 62 | var fd, // FD for reading and writing to the file 63 | index = {}, // Index of file positions of all documents by key 64 | writeQueue = [], // Queue of new docs to write to the hd 65 | stale = 0, 66 | dbLength = 0, // The size of the current db file in bytes 67 | compacting = false, 68 | lastCompact = Date.now(); 69 | 70 | // Open a single handle for reading and writing 71 | fd = fs.openSync(filename, "a+"); 72 | 73 | // Generates a random unique 16 char base 36 string 74 | // (about 2^83 possible keys) 75 | function makeUUID() { 76 | var key = ""; 77 | while (key.length < 16) { 78 | key += Math.floor(Math.random() * 0x290d7410000).toString(36); 79 | } 80 | key = key.substr(0, 16); 81 | if (key in index) { 82 | return makeUUID(); 83 | } 84 | return key; 85 | } 86 | 87 | // Load a single record from the disk 88 | function getByKey(key, callback) { 89 | try { 90 | var info 
= index[key]; 91 | if (!info) { 92 | var error = new Error("Document does not exist for " + key); 93 | error.errno = process.ENOENT; 94 | callback(error); 95 | return; 96 | } 97 | 98 | fsRead(fd, info.position, info.length, function (err, buffer) { 99 | if (err) { callback(err); return; } 100 | try { 101 | var data = JSON.parse(buffer.toString()); 102 | callback(null, data, info.meta); 103 | } catch (err) { 104 | callback(err); 105 | } 106 | }); 107 | } catch (err) { 108 | callback(err); 109 | } 110 | } 111 | 112 | 113 | 114 | function compact() { 115 | // Don't run if already clean or already compacting 116 | if (isTemp || compacting || stale === 0) { return; } 117 | compacting = true; 118 | var tmpFile = Path.join(Path.dirname(filename), makeUUID() + ".tmpdb"), 119 | tmpDb = nStore(tmpFile, null, true), 120 | keys = Object.keys(index), 121 | counter = keys.length; 122 | 123 | keys.forEach(function (key) { 124 | getByKey(key, function (err, doc, meta) { 125 | if (err) { throw err; } 126 | 127 | function check() { 128 | counter--; 129 | if (counter === 0) { 130 | done(); 131 | } 132 | } 133 | 134 | // Hook to allow filtering when compacting 135 | // Great for things like session pruning 136 | if (filterFn && !filterFn(doc, meta)) { 137 | check(); 138 | return; 139 | } 140 | 141 | tmpDb.save(key, doc, function (err, meta) { 142 | if (err) { 143 | throw err; 144 | } 145 | check(); 146 | }); 147 | }); 148 | }); 149 | stale = 0; 150 | 151 | function done() { 152 | 153 | // Swap out stores 154 | var oldfd = fd; 155 | fd = tmpDb.fd; 156 | dbLength = tmpDb.dbLength; 157 | index = tmpDb.index; 158 | 159 | // And clean up the files 160 | fs.close(oldfd, function (err) { 161 | if (err) throw err; 162 | fs.unlink(filename, function (err) { 163 | if (err) throw err; 164 | fs.rename(tmpFile, filename, function (err) { 165 | if (err) throw err; 166 | compacting = false; 167 | lastCompact = Date.now(); 168 | checkQueue(); 169 | }); 170 | }); 171 | }); 172 | } 173 | 174 | } 175 | 
176 | // Loads the database from disk using blocking I/O 177 | // TODO: see if non-blocking is faster, this takes a long time 178 | function loadDatabase() { 179 | 180 | // Create a buffer for reading chunks from the disk 181 | var chunk = new Buffer(CHUNK_SIZE); 182 | 183 | // Create an empty stream buffer 184 | var input = new Buffer(MAX_SIZE); 185 | input.length = 0; 186 | 187 | // These are positions in the database file 188 | var offset = 0; 189 | var base = 0; 190 | 191 | // This is a position within the input stream 192 | var pos = 0; 193 | var mid = 0; 194 | 195 | // Read a chunk from the file into `chunk` 196 | while ((chunk.length = fs.readSync(fd, chunk, 0, CHUNK_SIZE, offset)) > 0) { 197 | 198 | // Move the offset so the outer loop stays in sync 199 | offset += chunk.length; 200 | 201 | // Copy the chunk onto the input stream 202 | chunk.copy(input, input.length, 0, chunk.length); 203 | input.length += chunk.length; 204 | 205 | // See if there is input to consume 206 | for (var i = pos, l = input.length; i < l; i++) { 207 | if (input[i] === 9) { 208 | mid = i + 1; 209 | } 210 | if (mid && input[i] === 10) { 211 | // var doc = input.slice(pos, mid - 1).toString(); 212 | var meta = JSON.parse(input.slice(mid, i).toString()); 213 | var info = { 214 | meta: meta, 215 | position: base + pos, 216 | length: mid - pos - 1 217 | }; 218 | if (index[meta.key]) { 219 | stale++; 220 | } 221 | if (info.length > 0) { 222 | index[meta.key] = info; 223 | } else { 224 | delete index[meta.key]; 225 | } 226 | mid = 0; 227 | pos = i + 1; 228 | } 229 | } 230 | 231 | // Shift the input back down 232 | if (pos > 0) { 233 | input.copy(input, 0, pos, input.length); 234 | input.length -= pos; 235 | base += pos; 236 | pos = 0; 237 | } 238 | } 239 | 240 | dbLength = offset; 241 | 242 | } 243 | loadDatabase(); 244 | compact(); 245 | 246 | var lock = false; 247 | function checkQueue() { 248 | if (compacting || lock || writeQueue.length === 0) { return; } 249 | lock = true; 250 | 251 
| // Pull some jobs off the writeQueue 252 | var length = writeQueue.length; 253 | var i = 0; 254 | var size = 0; 255 | var toWrite = []; 256 | var newIndex = {}; 257 | var position = dbLength; 258 | while (i < length && size < 50000) { 259 | var item = writeQueue[i]; 260 | var data = item.doc ? JSON.stringify(item.doc) : ""; 261 | var key = item.key; 262 | var meta = {key: key}; 263 | var line = new Buffer(data + "\t" + JSON.stringify(meta) + "\n"); 264 | var dataLength = Buffer.byteLength(data); 265 | // Generate a callback closure 266 | toWrite[toWrite.length] = { 267 | line: line, 268 | key: key, 269 | callback: item.callback 270 | }; 271 | newIndex[meta.key] = { 272 | position: dbLength, 273 | length: dataLength, 274 | meta: meta 275 | }; 276 | 277 | dbLength += line.length; 278 | size += line.length; 279 | i++; 280 | } 281 | length = i; 282 | writeQueue.splice(0, length); 283 | 284 | // Merge the buffers into one large one 285 | var offset = 0; 286 | var buffer = new Buffer(size); 287 | for (var i = 0; i < length; i++) { 288 | var line = toWrite[i].line; 289 | line.copy(buffer, offset); 290 | offset += line.length; 291 | } 292 | 293 | fsWrite(fd, buffer, position, function (err) { 294 | if (err) { 295 | throw err; 296 | } 297 | 298 | // Mix in the updated indexes 299 | var willCompact = false; 300 | var threshold = Object.keys(index).length; 301 | Object.keys(newIndex).forEach(function (key) { 302 | if (index[key]) { 303 | stale++; 304 | if (stale > threshold) { 305 | willCompact = true; 306 | } 307 | } 308 | 309 | if (newIndex[key].length === 0) { 310 | delete index[key]; 311 | } else { 312 | index[key] = newIndex[key]; 313 | } 314 | }); 315 | 316 | // Call all the individual callbacks for the write 317 | for (var i = 0; i < length; i++) { 318 | var item = toWrite[i]; 319 | var callback = item.callback; 320 | if (callback) { 321 | callback(err, {key: item.key}); 322 | } 323 | 324 | } 325 | 326 | // Unlock and try the loop again 327 | lock = false; 328 | if 
(willCompact && (Date.now() - lastCompact > 2000)) { 329 | compact(); 330 | } else { 331 | process.nextTick(checkQueue); 332 | } 333 | }); 334 | 335 | } 336 | 337 | function getStream(filter) { 338 | var counter = 0; 339 | var stream = new process.EventEmitter(); 340 | var queue = []; 341 | var paused = false; 342 | 343 | // Checks to see if we should emit the "end" event yet. 344 | function checkDone() { 345 | if (!paused && counter === 0) { 346 | counter--; 347 | stream.emit("end"); 348 | } 349 | } 350 | 351 | // Tries to push events through 352 | function flush() { 353 | if (paused) { return; } 354 | for (var i = 0, l = queue.length; i < l; i++) { 355 | var item = queue[i]; 356 | stream.emit("data", item.doc, item.meta); 357 | counter--; 358 | } 359 | queue.length = 0; 360 | process.nextTick(checkDone); 361 | } 362 | 363 | 364 | stream.pause = function () { 365 | paused = true; 366 | }; 367 | 368 | // Resumes emitting of events 369 | stream.resume = function () { 370 | paused = false; 371 | process.nextTick(function () { 372 | flush(); 373 | checkDone(); 374 | }); 375 | }; 376 | 377 | Object.keys(index).forEach(function (key) { 378 | counter++; 379 | getByKey(key, function (err, doc, meta) { 380 | if (err) { 381 | stream.emit("error", err); 382 | return; 383 | } 384 | if (!filter || filter(doc, meta)) { 385 | queue.push({ 386 | doc: doc, 387 | meta: meta 388 | }); 389 | flush(); 390 | } else { 391 | counter--; 392 | process.nextTick(checkDone); 393 | } 394 | }); 395 | }); 396 | 397 | process.nextTick(checkDone); 398 | 399 | return stream; 400 | } 401 | 402 | 403 | return { 404 | get length() { 405 | return Object.keys(index).length; 406 | }, 407 | 408 | // Saves a document with optional key. The effect if immediate to the 409 | // running program, but not persistent till after the callback. 410 | // Pass null as the key to get a generated key. 
411 | save: function (key, doc, callback) { 412 | if (!key) { 413 | key = makeUUID(); 414 | } 415 | writeQueue[writeQueue.length] = { 416 | key: key, 417 | doc: doc, 418 | callback: callback 419 | }; 420 | checkQueue(); 421 | }, 422 | 423 | // Removes a document from the collection by key 424 | // The effect is immediate to the running program, but not permanent 425 | // till the callback returns. 426 | remove: function (key, callback) { 427 | if (key in index) { 428 | delete index[key]; 429 | var line = new Buffer("\t" + JSON.stringify({key: key}) + "\n"); 430 | 431 | writeQueue[writeQueue.length] = { 432 | meta: {key: key}, 433 | position: dbLength, 434 | length: 0, 435 | line: line, 436 | callback: callback 437 | }; 438 | dbLength += line.length; 439 | checkQueue(); 440 | } else { 441 | var err = new Error("Cannot delete a document that does not exist"); 442 | err.errno = process.ENOENT; 443 | callback(err); 444 | } 445 | }, 446 | 447 | all: function (filter, callback) { 448 | if (typeof filter === 'function' && callback === undefined) { 449 | callback = filter; 450 | filter = null; 451 | } 452 | var docs = []; 453 | var metas = []; 454 | var stream = getStream(filter); 455 | stream.addListener('data', function (doc, meta) { 456 | docs.push(doc); 457 | metas.push(meta); 458 | }); 459 | stream.addListener('end', function () { 460 | callback(null, docs, metas); 461 | }); 462 | stream.addListener('error', callback); 463 | }, 464 | 465 | // Returns a readable stream of the whole collection. 466 | // Supports pause and resume so that you can delay events for layer. 
467 | // This queues "data" and "end" events in memory./ 468 | // Also you can provide a filter to pre-filter results before they 469 | // go to the queue 470 | stream: getStream, 471 | 472 | // Loads a single document by id, accepts key and callback 473 | // the callback will be called with (err, doc, meta) 474 | get: getByKey, 475 | 476 | 477 | // Removes all documents from a database 478 | clear: function () { 479 | index = {}; 480 | compact(); 481 | }, 482 | 483 | compact: compact, 484 | 485 | // Expose some private variables 486 | get index() { return index; }, 487 | get fd() { return fd; }, 488 | get dbLength() { return dbLength; }, 489 | 490 | // Expose the UUID maker 491 | makeUUID: makeUUID 492 | }; 493 | } 494 | 495 | module.exports = nStore; 496 | 497 | -------------------------------------------------------------------------------- /lib/node-search.js: -------------------------------------------------------------------------------- 1 | var PorterStemmer = require('./node-search/porter-stemmer').PorterStemmer, 2 | Tokenizer = require('./node-search/tokenizer').Tokenizer, 3 | DoubleMetaphone = require('./node-search/double-metaphone').DoubleMetaphone, 4 | Vector = require('./node-search/math/vector').Vector; 5 | VectorUtils = require('./node-search/math/vector-utils').VectorUtils; 6 | 7 | 8 | exports.NodeSearch = function() { 9 | return { 10 | docs:{all:[]}, 11 | fields:[], 12 | fieldWeights:{}, 13 | vectorKeywordIndex:{}, 14 | vectorKeywordIndexLength:0, 15 | index: function(key,doc,callback){ 16 | var self = this; 17 | var addWords = []; 18 | var uniques = findUniques(doc); 19 | uniques.forEach(function(word){ 20 | if( !self.vectorKeywordIndex.hasOwnProperty(word) ){ 21 | self.vectorKeywordIndex[word] = self.vectorKeywordIndexLength; 22 | self.vectorKeywordIndexLength++; 23 | addWords.push(word); 24 | } 25 | }); 26 | 27 | // add zeros to end of other vectors, maybe this should be done another way 28 | if( addWords.length != 0 ) { 29 | 
Object.keys(self.docs).forEach(function(column){ 30 | self.docs[column].forEach(function(item){ 31 | addWords.forEach(function(word){ 32 | item.data.push(0); 33 | }); 34 | }); 35 | }); 36 | } 37 | 38 | if( typeof(doc) == "object" ) { 39 | var fieldsData = ""; 40 | for( var field in doc ){ 41 | if(self.fields.length==0){ 42 | self.fields.push(field); 43 | } 44 | fieldsData += doc[field] + " "; 45 | var vector = makeVector(key, doc[field],self.fieldWeights[field]||1,self.vectorKeywordIndex,self.vectorKeywordIndexLength); 46 | if( this.docs.hasOwnProperty(field) ){ 47 | this.docs[field].push(vector); 48 | }else{ 49 | this.docs[field] = [vector]; 50 | } 51 | } 52 | this.docs.all.push( makeVector(key,fieldsData,1,self.vectorKeywordIndex,self.vectorKeywordIndexLength)); 53 | }else{ 54 | this.docs.all.push( makeVector(key,doc,1,self.vectorKeywordIndex,self.vectorKeywordIndexLength)); 55 | } 56 | }, 57 | 58 | // Query the index, returns an array of documents with id and rank 59 | query: function(string,fields,callback){ 60 | 61 | var docs, vector; 62 | var words = string.split(" "); 63 | var count = 0; 64 | var total = 0; 65 | var completed = 0; 66 | var falseMatches = []; 67 | var results=[]; 68 | 69 | for( var word in words){ 70 | if(words[word].charAt(0)=="!"||words[word].charAt(0)=="-"){ 71 | var stemmed = stemmer.process(words[word].split("!").join("").split("-").join("")); 72 | var index = this.vectorKeywordIndex[DoubleMetaphone(stemmed).primary]; 73 | falseMatches.push(index); 74 | words.splice(count,1); 75 | } 76 | count++ 77 | } 78 | vector = makeVector("",words.join(" "),1,this.vectorKeywordIndex,this.vectorKeywordIndexLength); 79 | 80 | for(var i = 0; i < vector.data.length; i++){ 81 | total += vector.data[i]; 82 | } 83 | if( total == 0 ){ 84 | return []; 85 | } 86 | 87 | if( fields != null){ 88 | var fieldsName = fields.sort().join("-"); 89 | if( !this.docs.hasOwnProperty(fieldsName) ) { 90 | // This will be really slow the first time. 
91 | // this.docs[fieldsName] = indexFields(fields,this.docs,function(docs){ 92 | // TODO: need to work on this, can't currently search more than one field at a time 93 | //}); 94 | return 95 | }else{ 96 | docs = this.docs[fieldsName]; 97 | } 98 | }else{ 99 | docs = this.docs.all; 100 | } 101 | 102 | return asyncForEach( docs, 103 | function(doc,i,list){ 104 | for( var falseMatch in falseMatches ){ // strip documents that have falsematches 105 | if(doc.data[falseMatches[falseMatch]]!=0){ 106 | return 107 | } 108 | } 109 | var result = cosine(vector, doc); // figure out how close your querie vector is to the other docs 110 | if( result != 0 ){ // filter out items that dont match at all 111 | results.push({key:doc.key, rank:result}); 112 | } 113 | }, 114 | function(){ // TODO: what if there are millions of results, this sort will be slow. 115 | callback(results.sort(function (a, b) { return ((b.rank - a.rank)) })); 116 | } 117 | ); 118 | }, 119 | 120 | related: function(key,callback){ 121 | var docs; 122 | var results=[]; 123 | 124 | return asyncForEach( this.docs.all, 125 | function(item,i,list){ 126 | if(item.key == key ){ 127 | asyncForEach( this.docs.all, 128 | function(doc,i,list){ 129 | var result = cosine(item, doc); // figure out how close your querie vector is to the other docs 130 | if( result != 0 ){ // filter out items that dont match at all 131 | results.push({key:doc.key, rank:result}); 132 | } 133 | }, 134 | function(){ // TODO: what if there are millions of results, this sort will be slow. 
135 | callback(results.sort(function (a, b) { return ((b.rank - a.rank)) })); 136 | } 137 | ); 138 | return true; 139 | } 140 | }, 141 | function(){} 142 | ); 143 | } 144 | } 145 | } 146 | 147 | // Private /////////////////////////////////////////////////////////////////////////////////////////// 148 | 149 | // Stemming is a way to convert words like speeder and speeds to speed 150 | var stemmer = exports.stemmer = PorterStemmer(); 151 | 152 | // break string up into tokens and stem words 153 | var tokenizer = exports.tokenizer = Tokenizer(); 154 | 155 | // Words that will not be indexed 156 | var stopWords = exports.stopWords = ["","a","about","above","above","across","after","afterwards","again","against","all","almost","alone","along","already","also","although","always","am","among","amongst","amoungst","amount","an","and","another","any","anyhow","anyone","anything","anyway","anywhere","are","around","as","at","back","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","below","beside","besides","between","beyond","bill","both","bottom","but","by","call","can","cannot","cant","co","con","could","couldnt","cry","de","describe","detail","do","done","down","due","during","each","eg","eight","either","eleven","else","elsewhere","empty","enough","etc","even","ever","every","everyone","everything","everywhere","except","few","fifteen","fify","fill","find","fire","first","five","for","former","formerly","forty","found","four","from","front","full","further","get","give","go","had","has","hasnt","have","he","hence","her","here","hereafter","hereby","herein","hereupon","hers","herself","him","himself","his","how","however","hundred","ie","if","in","inc","indeed","interest","into","is","it","its","itself","keep","last","latter","latterly","least","less","ltd","made","many","may","me","meanwhile","might","mill","mine","more","moreover","most","mostly","move","much","must","my","myself","name","namely","neither","never","never
theless","next","nine","no","nobody","none","noone","nor","not","nothing","now","nowhere","of","off","often","on","once","one","only","onto","or","other","others","otherwise","our","ours","ourselves","out","over","own","part","per","perhaps","please","put","rather","re","same","see","seem","seemed","seeming","seems","serious","several","she","should","show","side","since","sincere","six","sixty","so","some","somehow","someone","something","sometime","sometimes","somewhere","still","such","system","take","ten","than","that","the","their","them","themselves","then","thence","there","thereafter","thereby","therefore","therein","thereupon","these","they","thickv","thin","third","this","those","though","three","through","throughout","thru","thus","to","together","too","top","toward","towards","twelve","twenty","two","un","under","until","up","upon","us","very","via","was","we","well","were","what","whatever","when","whence","whenever","where","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","whoever","whole","whom","whose","why","will","with","within","without","would","yet","you","your","yours","yourself","yourselves","the"]; 157 | 158 | var vectorKeywordIndexLength = 0; 159 | 160 | function indexFields(fields,docs){ 161 | var result = []; 162 | for( var field in fields){ 163 | var data = docs[fields[field]]; 164 | for( i=0; i 17 | // All rights reserved. 18 | // 19 | // http://swoodbridge.com/DoubleMetaPhone/ 20 | // 21 | // This PHP translation is based heavily on the C implementation 22 | // by Maurice Aubrey , which in turn 23 | // is based heavily on the C++ implementation by 24 | // Lawrence Philips and incorporates several bug fixes courtesy 25 | // of Kevin Atkinson . 26 | // 27 | // This module is free software; you may redistribute it and/or 28 | // modify it under the same terms as Perl itself. 
29 | // 30 | // CONTRIBUTIONS 31 | // 32 | // 17-May-2002 Geoff Caplan http://www.advantae.com 33 | // Bug fix: added code to return class object which I forgot to do 34 | // Created a functional callable version instead of the class version 35 | // which is faster if you are calling this a lot. 36 | // 37 | // ------------------------------------------------------------------ 38 | 39 | // TODO: this file defines a bunch of globals in it, should be converted 40 | exports.DoubleMetaphone = function(string) { 41 | primary = ""; 42 | secondary = ""; 43 | current = 0; 44 | 45 | current = 0; 46 | length = string.length; 47 | last = length - 1; 48 | original = string + " "; 49 | 50 | original = original.toUpperCase(); 51 | 52 | // skip this at beginning of word 53 | 54 | if (string_at(original, 0, 2, 55 | ['GN', 'KN', 'PN', 'WR', 'PS'])) 56 | current++; 57 | 58 | // Initial 'X' is pronounced 'Z' e.g. 'Xavier' 59 | 60 | if (original.substr(0, 1) == 'X') { 61 | primary += "S"; // 'Z' maps to 'S' 62 | secondary += "S"; 63 | current++; 64 | } 65 | 66 | // main loop 67 | 68 | while (primary.length < 4 || secondary.length < 4) { 69 | if (current >= length) 70 | break; 71 | 72 | switch (original.substr(current, 1)) { 73 | case 'A': 74 | case 'E': 75 | case 'I': 76 | case 'O': 77 | case 'U': 78 | case 'Y': 79 | if (current == 0) { 80 | // all init vowels now map to 'A' 81 | primary += 'A'; 82 | secondary += 'A'; 83 | } 84 | current += 1; 85 | break; 86 | 87 | case 'B': 88 | // '-mb', e.g. "dumb", already skipped over ... 
89 | primary += 'P'; 90 | secondary += 'P'; 91 | 92 | if (original.substr(current + 1, 1) == 'B') 93 | current += 2; 94 | else 95 | current += 1; 96 | break; 97 | 98 | case 'Ç': 99 | primary += 'S'; 100 | secondary += 'S'; 101 | current += 1; 102 | break; 103 | 104 | case 'C': 105 | // various gremanic 106 | if ((current > 1) 107 | && !is_vowel(original, current - 2) 108 | && string_at(original, current - 1, 3, 109 | ["ACH"]) 110 | && ((original.substr(current + 2, 1) != 'I') 111 | && ((original.substr(current + 2, 1) != 'E') 112 | || string_at(original, current - 2, 6, 113 | ["BACHER", "MACHER"])))) { 114 | 115 | primary += 'K'; 116 | secondary += 'K'; 117 | current += 2; 118 | break; 119 | } 120 | 121 | // special case 'caesar' 122 | if ((current == 0) 123 | && string_at(original, current, 6, 124 | ["CAESAR"])) { 125 | primary += 'S'; 126 | secondary += 'S'; 127 | current += 2; 128 | break; 129 | } 130 | 131 | // italian 'chianti' 132 | if (string_at(original, current, 4, 133 | ["CHIA"])) { 134 | primary += 'K'; 135 | secondary += 'K'; 136 | current += 2; 137 | break; 138 | } 139 | 140 | if (string_at(original, current, 2, 141 | ["CH"])) { 142 | 143 | // find 'michael' 144 | if ((current > 0) 145 | && string_at(original, current, 4, 146 | ["CHAE"])) { 147 | primary += 'K'; 148 | secondary += 'X'; 149 | current += 2; 150 | break; 151 | } 152 | 153 | // greek roots e.g. 
'chemistry', 'chorus' 154 | if ((current == 0) 155 | && (string_at(original, current + 1, 5, 156 | ["HARAC", "HARIS"]) 157 | || string_at(original, current + 1, 3, 158 | ["HOR", "HYM", "HIA", "HEM"])) 159 | && !string_at(original, 0, 5, ["CHORE"])) { 160 | primary += 'K'; 161 | secondary += 'K'; 162 | current += 2; 163 | break; 164 | } 165 | 166 | // germanic, greek, or otherwise 'ch' for 'kh' sound 167 | if ((string_at(original, 0, 4, ["VAN ", "VON "]) 168 | || string_at(original, 0, 3, ["SCH"])) 169 | // 'architect' but not 'arch', orchestra', 'orchid' 170 | || string_at(original, current - 2, 6, 171 | ["ORCHES", "ARCHIT", "ORCHID"]) 172 | || string_at(original, current + 2, 1, 173 | ["T", "S"]) 174 | || ((string_at(original, current - 1, 1, 175 | ["A","O","U","E"]) 176 | || (current == 0)) 177 | // e.g. 'wachtler', 'weschsler', but not 'tichner' 178 | && string_at(original, current + 2, 1, 179 | ["L","R","N","M","B","H","F","V","W"," "]))) { 180 | primary += 'K'; 181 | secondary += 'K'; 182 | } else { 183 | if (current > 0) { 184 | if (string_at(original, 0, 2, ["MC"])) { 185 | // e.g. 'McHugh' 186 | primary += 'K'; 187 | secondary += 'K'; 188 | } else { 189 | primary += 'X'; 190 | secondary += 'K'; 191 | } 192 | } else { 193 | primary += 'X'; 194 | secondary += 'X'; 195 | } 196 | } 197 | current += 2; 198 | break; 199 | } 200 | 201 | // e.g. 'czerny' 202 | if (string_at(original, current, 2, ["CZ"]) 203 | && !string_at(original, current -2, 4, 204 | ["WICZ"])) { 205 | primary += 'S'; 206 | secondary += 'X'; 207 | current += 2; 208 | break; 209 | } 210 | 211 | // e.g. 
'focaccia' 212 | if (string_at(original, current + 1, 3, 213 | ["CIA"])) { 214 | primary += 'X'; 215 | secondary += 'X'; 216 | current += 3; 217 | break; 218 | } 219 | 220 | // double 'C', but not McClellan' 221 | if (string_at(original, current, 2, ["CC"]) 222 | && !((current == 1) 223 | && (original.substr(0, 1) == 'M'))) { 224 | // 'bellocchio' but not 'bacchus' 225 | if (string_at(original, current + 2, 1, 226 | ["I","E","H"]) 227 | && !string_at(original, current + 2, 2, 228 | ["HU"])) { 229 | // 'accident', 'accede', 'succeed' 230 | if (((current == 1) 231 | && (original.substr(current - 1, 1) == 'A')) 232 | || string_at(original, current - 1, 5, 233 | ["UCCEE", "UCCES"])) { 234 | primary += "KS"; 235 | secondary += "KS"; 236 | // 'bacci', 'bertucci', other italian 237 | } else { 238 | primary += "X"; 239 | secondary += "X"; 240 | } 241 | current += 3; 242 | break; 243 | } else { 244 | // Pierce's rule 245 | primary += "K"; 246 | secondary += "K"; 247 | current += 2; 248 | break; 249 | } 250 | } 251 | 252 | if (string_at(original, current, 2, 253 | ["CK","CG","CQ"])) { 254 | primary += "K"; 255 | secondary += "K"; 256 | current += 2; 257 | break; 258 | } 259 | 260 | if (string_at(original, current, 2, 261 | ["CI","CE","CY"])) { 262 | // italian vs. 
english 263 | if (string_at(original, current, 3, 264 | ["CIO","CIE","CIA"])) { 265 | primary += "S"; 266 | secondary += "X"; 267 | } else { 268 | primary += "S"; 269 | secondary += "S"; 270 | } 271 | current += 2; 272 | break; 273 | } 274 | 275 | // else 276 | primary += "K"; 277 | secondary += "K"; 278 | 279 | // name sent in 'mac caffrey', 'mac gregor' 280 | if (string_at(original, current + 1, 2, 281 | [" C"," Q"," G"])) { 282 | current += 3; 283 | } else { 284 | if (string_at(original, current + 1, 1, 285 | ["C","K","Q"]) 286 | && !string_at(original, current + 1, 2, 287 | ["CE","CI"])) { 288 | current += 2; 289 | } else { 290 | current += 1; 291 | } 292 | } 293 | break; 294 | 295 | case 'D': 296 | if (string_at(original, current, 2, 297 | ["DG"])) { 298 | if (string_at(original, current + 2, 1, 299 | ["I","E","Y"])) { 300 | // e.g. 'edge' 301 | primary += "J"; 302 | secondary += "J"; 303 | current += 3; 304 | break; 305 | } else { 306 | // e.g. 'edgar' 307 | primary += "TK"; 308 | secondary += "TK"; 309 | current += 2; 310 | break; 311 | } 312 | } 313 | 314 | if (string_at(original, current, 2, 315 | ["DT","DD"])) { 316 | primary += "T"; 317 | secondary += "T"; 318 | current += 2; 319 | break; 320 | } 321 | 322 | // else 323 | primary += "T"; 324 | secondary += "T"; 325 | current += 1; 326 | break; 327 | 328 | case 'F': 329 | if (original.substr(current + 1, 1) == 'F') 330 | current += 2; 331 | else 332 | current += 1; 333 | primary += "F"; 334 | secondary += "F"; 335 | break; 336 | 337 | case 'G': 338 | if (original.substr(current + 1, 1) == 'H') { 339 | if ((current > 0) 340 | && !is_vowel(original, current - 1)) { 341 | primary += "K"; 342 | secondary += "K"; 343 | current += 2; 344 | break; 345 | } 346 | 347 | if (current < 3) { 348 | // 'ghislane', 'ghiradelli' 349 | if (current == 0) { 350 | if (original.substr(current + 2, 1) == 'I') { 351 | primary += "J"; 352 | secondary += "J"; 353 | } else { 354 | primary += "K"; 355 | secondary += "K"; 356 | } 357 
| current += 2; 358 | break; 359 | } 360 | } 361 | 362 | // Parker's rule (with some further refinements) - e.g. 'hugh' 363 | if (((current > 1) 364 | && string_at(original, current - 2, 1, 365 | ["B","H","D"])) 366 | // e.g. 'bough' 367 | || ((current > 2) 368 | && string_at(original, current - 3, 1, 369 | ["B","H","D"])) 370 | // e.g. 'broughton' 371 | || ((current > 3) 372 | && string_at(original, current - 4, 1, 373 | ["B","H"]))) { 374 | current += 2; 375 | break; 376 | } else { 377 | // e.g. 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' 378 | if ((current > 2) 379 | && (original.substr(current - 1, 1) == 'U') 380 | && string_at(original, current - 3, 1, 381 | ["C","G","L","R","T"])) { 382 | primary += "F"; 383 | secondary += "F"; 384 | } else if ( (current > 0) && original.substr(current - 1, 1) != 'I') { 385 | primary += "K"; 386 | secondary += "K"; 387 | } 388 | current += 2; 389 | break; 390 | } 391 | } 392 | 393 | if (original.substr(current + 1, 1) == 'N') { 394 | if ((current == 1) && is_vowel(original, 0) 395 | && !Slavo_Germanic(original)) { 396 | primary += "KN"; 397 | secondary += "N"; 398 | } else { 399 | // not e.g. 
'cagney' 400 | if (!string_at(original, current + 2, 2, 401 | ["EY"]) 402 | && (original.substr(current + 1) != "Y") 403 | && !Slavo_Germanic(original)) { 404 | primary += "N"; 405 | secondary += "KN"; 406 | } else { 407 | primary += "KN"; 408 | secondary += "KN"; 409 | } 410 | } 411 | current += 2; 412 | break; 413 | } 414 | 415 | // 'tagliaro' 416 | if (string_at(original, current + 1, 2, 417 | ["LI"]) 418 | && !Slavo_Germanic(original)) { 419 | primary += "KL"; 420 | secondary += "L"; 421 | current += 2; 422 | break; 423 | } 424 | 425 | // -ges-, -gep-, -gel- at beginning 426 | if ((current == 0) 427 | && ((original.substr(current + 1, 1) == 'Y') 428 | || string_at(original, current + 1, 2, 429 | ["ES","EP","EB","EL","EY","IB","IL","IN","IE", 430 | "EI","ER"]))) { 431 | primary += "K"; 432 | secondary += "J"; 433 | current += 2; 434 | break; 435 | } 436 | 437 | // -ger-, -gy- 438 | if ((string_at(original, current + 1, 2, 439 | ["ER"]) 440 | || (original.substr(current + 1, 1) == 'Y')) 441 | && !string_at(original, 0, 6, 442 | ["DANGER","RANGER","MANGER"]) 443 | && !string_at(original, current -1, 1, 444 | ["E", "I"]) 445 | && !string_at(original, current -1, 3, 446 | ["RGY","OGY"])) { 447 | primary += "K"; 448 | secondary += "J"; 449 | current += 2; 450 | break; 451 | } 452 | 453 | // italian e.g. 
'biaggi' 454 | if (string_at(original, current + 1, 1, 455 | ["E","I","Y"]) 456 | || string_at(original, current -1, 4, 457 | ["AGGI","OGGI"])) { 458 | // obvious germanic 459 | if ((string_at(original, 0, 4, ["VAN ", "VON "]) 460 | || string_at(original, 0, 3, ["SCH"])) 461 | || string_at(original, current + 1, 2, 462 | ["ET"])) { 463 | primary += "K"; 464 | secondary += "K"; 465 | } else { 466 | // always soft if french ending 467 | if (string_at(original, current + 1, 4, 468 | ["IER "])) { 469 | primary += "J"; 470 | secondary += "J"; 471 | } else { 472 | primary += "J"; 473 | secondary += "K"; 474 | } 475 | } 476 | current += 2; 477 | break; 478 | } 479 | 480 | if (original.substr(current +1, 1) == 'G') 481 | current += 2; 482 | else 483 | current += 1; 484 | 485 | primary += 'K'; 486 | secondary += 'K'; 487 | break; 488 | 489 | case 'H': 490 | // only keep if first & before vowel or btw. 2 vowels 491 | if (((current == 0) || 492 | is_vowel(original, current - 1)) 493 | && is_vowel(original, current + 1)) { 494 | primary += 'H'; 495 | secondary += 'H'; 496 | current += 2; 497 | } else 498 | current += 1; 499 | break; 500 | 501 | case 'J': 502 | // obvious spanish, 'jose', 'san jacinto' 503 | if (string_at(original, current, 4, 504 | ["JOSE"]) 505 | || string_at(original, 0, 4, ["SAN "])) { 506 | if (((current == 0) 507 | && (original.substr(current + 4, 1) == ' ')) 508 | || string_at(original, 0, 4, ["SAN "])) { 509 | primary += 'H'; 510 | secondary += 'H'; 511 | } else { 512 | primary += "J"; 513 | secondary += 'H'; 514 | } 515 | current += 1; 516 | break; 517 | } 518 | 519 | if ((current == 0) 520 | && !string_at(original, current, 4, 521 | ["JOSE"])) { 522 | primary += 'J'; // Yankelovich/Jankelowicz 523 | secondary += 'A'; 524 | } else { 525 | // spanish pron. of .e.g. 
'bajador' 526 | if (is_vowel(original, current - 1) 527 | && !Slavo_Germanic(original) 528 | && ((original.substr(current + 1, 1) == 'A') 529 | || (original.substr(current + 1, 1) == 'O'))) { 530 | primary += "J"; 531 | secondary += "H"; 532 | } else { 533 | if (current == last) { 534 | primary += "J"; 535 | secondary += ""; 536 | } else { 537 | if (!string_at(original, current + 1, 1, 538 | ["L","T","K","S","N","M","B","Z"]) 539 | && !string_at(original, current - 1, 1, 540 | ["S","K","L"])) { 541 | primary += "J"; 542 | secondary += "J"; 543 | } 544 | } 545 | } 546 | } 547 | 548 | if (original.substr(current + 1, 1) == 'J') // it could happen 549 | current += 2; 550 | else 551 | current += 1; 552 | break; 553 | 554 | case 'K': 555 | if (original.substr(current + 1, 1) == 'K') 556 | current += 2; 557 | else 558 | current += 1; 559 | primary += "K"; 560 | secondary += "K"; 561 | break; 562 | 563 | case 'L': 564 | if (original.substr(current + 1, 1) == 'L') { 565 | // spanish e.g. 'cabrillo', 'gallegos' 566 | if (((current == (length - 3)) 567 | && string_at(original, current - 1, 4, 568 | ["ILLO","ILLA","ALLE"])) 569 | || ((string_at(original, last-1, 2, 570 | ["AS","OS"]) 571 | || string_at(original, last, 1, 572 | ["A","O"])) 573 | && string_at(original, current - 1, 4, 574 | ["ALLE"]))) { 575 | primary += "L"; 576 | secondary += ""; 577 | current += 2; 578 | break; 579 | } 580 | current += 2; 581 | } else 582 | current += 1; 583 | primary += "L"; 584 | secondary += "L"; 585 | break; 586 | 587 | case 'M': 588 | if ((string_at(original, current - 1, 3, 589 | ["UMB"]) 590 | && (((current + 1) == last) 591 | || string_at(original, current + 2, 2, 592 | ["ER"]))) 593 | // 'dumb', 'thumb' 594 | || (original.substr(current + 1, 1) == 'M')) { 595 | current += 2; 596 | } else { 597 | current += 1; 598 | } 599 | primary += "M"; 600 | secondary += "M"; 601 | break; 602 | 603 | case 'N': 604 | if (original.substr(current + 1, 1) == 'N') 605 | current += 2; 606 | else 607 | 
current += 1; 608 | primary += "N"; 609 | secondary += "N"; 610 | break; 611 | 612 | case 'Ñ': 613 | current += 1; 614 | primary += "N"; 615 | secondary += "N"; 616 | break; 617 | 618 | case 'P': 619 | if (original.substr(current + 1, 1) == 'H') { 620 | current += 2; 621 | primary += "F"; 622 | secondary += "F"; 623 | break; 624 | } 625 | 626 | // also account for "campbell" and "raspberry" 627 | if (string_at(original, current + 1, 1, 628 | ["P","B"])) 629 | current += 2; 630 | else 631 | current += 1; 632 | primary += "P"; 633 | secondary += "P"; 634 | break; 635 | 636 | case 'Q': 637 | if (original.substr(current + 1, 1) == 'Q') 638 | current += 2; 639 | else 640 | current += 1; 641 | primary += "K"; 642 | secondary += "K"; 643 | break; 644 | 645 | case 'R': 646 | // french e.g. 'rogier', but exclude 'hochmeier' 647 | if ((current == last) 648 | && !Slavo_Germanic(original) 649 | && string_at(original, current - 2, 2, 650 | ["IE"]) 651 | && !string_at(original, current - 4, 2, 652 | ["ME","MA"])) { 653 | primary += ""; 654 | secondary += "R"; 655 | } else { 656 | primary += "R"; 657 | secondary += "R"; 658 | } 659 | if (original.substr(current + 1, 1) == 'R') 660 | current += 2; 661 | else 662 | current += 1; 663 | break; 664 | 665 | case 'S': 666 | // special cases 'island', 'isle', 'carlisle', 'carlysle' 667 | if (string_at(original, current - 1, 3, 668 | ["ISL","YSL"])) { 669 | current += 1; 670 | break; 671 | } 672 | 673 | // special case 'sugar-' 674 | if ((current == 0) 675 | && string_at(original, current, 5, 676 | ["SUGAR"])) { 677 | primary += "X"; 678 | secondary += "S"; 679 | current += 1; 680 | break; 681 | } 682 | 683 | if (string_at(original, current, 2, 684 | ["SH"])) { 685 | // germanic 686 | if (string_at(original, current + 1, 4, 687 | ["HEIM","HOEK","HOLM","HOLZ"])) { 688 | primary += "S"; 689 | secondary += "S"; 690 | } else { 691 | primary += "X"; 692 | secondary += "X"; 693 | } 694 | current += 2; 695 | break; 696 | } 697 | 698 | // italian 
& armenian 699 | if (string_at(original, current, 3, 700 | ["SIO","SIA"]) 701 | || string_at(original, current, 4, 702 | ["SIAN"])) { 703 | if (!Slavo_Germanic(original)) { 704 | primary += "S"; 705 | secondary += "X"; 706 | } else { 707 | primary += "S"; 708 | secondary += "S"; 709 | } 710 | current += 3; 711 | break; 712 | } 713 | 714 | // german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' 715 | // also, -sz- in slavic language altho in hungarian it is pronounced 's' 716 | if (((current == 0) 717 | && string_at(original, current + 1, 1, 718 | ["M","N","L","W"])) 719 | || string_at(original, current + 1, 1, 720 | ["Z"])) { 721 | primary += "S"; 722 | secondary += "X"; 723 | if (string_at(original, current + 1, 1, 724 | ["Z"])) 725 | current += 2; 726 | else 727 | current += 1; 728 | break; 729 | } 730 | 731 | if (string_at(original, current, 2, 732 | ["SC"])) { 733 | // Schlesinger's rule 734 | if (original.substr(current + 2, 1) == 'H') 735 | // dutch origin, e.g. 'school', 'schooner' 736 | if (string_at(original, current + 3, 2, 737 | ["OO","ER","EN","UY","ED","EM"])) { 738 | // 'schermerhorn', 'schenker' 739 | if (string_at(original, current + 3, 2, 740 | ["ER","EN"])) { 741 | primary += "X"; 742 | secondary += "SK"; 743 | } else { 744 | primary += "SK"; 745 | secondary += "SK"; 746 | } 747 | current += 3; 748 | break; 749 | } else { 750 | if ((current == 0) 751 | && !is_vowel(original, 3) 752 | && (original.substr(current + 3, 1) != 'W')) { 753 | primary += "X"; 754 | secondary += "S"; 755 | } else { 756 | primary += "X"; 757 | secondary += "X"; 758 | } 759 | current += 3; 760 | break; 761 | } 762 | 763 | if (string_at(original, current + 2, 1, 764 | ["I","E","Y"])) { 765 | primary += "S"; 766 | secondary += "S"; 767 | current += 3; 768 | break; 769 | } 770 | 771 | // else 772 | primary += "SK"; 773 | secondary += "SK"; 774 | current += 3; 775 | break; 776 | } 777 | 778 | // french e.g. 
'resnais', 'artois' 779 | if ((current == last) 780 | && string_at(original, current - 2, 2, 781 | ["AI","OI"])) { 782 | primary += ""; 783 | secondary += "S"; 784 | } else { 785 | primary += "S"; 786 | secondary += "S"; 787 | } 788 | 789 | if (string_at(original, current + 1, 1, 790 | ["S","Z"])) 791 | current += 2; 792 | else 793 | current += 1; 794 | break; 795 | 796 | case 'T': 797 | if (string_at(original, current, 4, 798 | ["TION"])) { 799 | primary += "X"; 800 | secondary += "X"; 801 | current += 3; 802 | break; 803 | } 804 | 805 | if (string_at(original, current, 3, 806 | ["TIA","TCH"])) { 807 | primary += "X"; 808 | secondary += "X"; 809 | current += 3; 810 | break; 811 | } 812 | 813 | if (string_at(original, current, 2, 814 | ["TH"]) 815 | || string_at(original, current, 3, 816 | ["TTH"])) { 817 | // special case 'thomas', 'thames' or germanic 818 | if (string_at(original, current + 2, 2, 819 | ["OM","AM"]) 820 | || string_at(original, 0, 4, ["VAN ","VON "]) 821 | || string_at(original, 0, 3, ["SCH"])) { 822 | primary += "T"; 823 | secondary += "T"; 824 | } else { 825 | primary += "0"; 826 | secondary += "T"; 827 | } 828 | current += 2; 829 | break; 830 | } 831 | 832 | if (string_at(original, current + 1, 1, 833 | ["T","D"])) 834 | current += 2; 835 | else 836 | current += 1; 837 | primary += "T"; 838 | secondary += "T"; 839 | break; 840 | 841 | case 'V': 842 | if (original.substr(current + 1, 1) == 'V') 843 | current += 2; 844 | else 845 | current += 1; 846 | primary += "F"; 847 | secondary += "F"; 848 | break; 849 | 850 | case 'W': 851 | // can also be in middle of word 852 | if (string_at(original, current, 2, ["WR"])) { 853 | primary += "R"; 854 | secondary += "R"; 855 | current += 2; 856 | break; 857 | } 858 | 859 | if ((current == 0) 860 | && (is_vowel(original, current + 1) 861 | || string_at(original, current, 2, 862 | ["WH"]))) { 863 | // Wasserman should match Vasserman 864 | if (is_vowel(original, current + 1)) { 865 | primary += "A"; 866 | 
secondary += "F"; 867 | } else { 868 | // need Uomo to match Womo 869 | primary += "A"; 870 | secondary += "A"; 871 | } 872 | } 873 | 874 | // Arnow should match Arnoff 875 | if (((current == last) 876 | && is_vowel(original, current - 1)) 877 | || string_at(original, current - 1, 5, 878 | ["EWSKI","EWSKY","OWSKI","OWSKY"]) 879 | || string_at(original, 0, 3, ["SCH"])) { 880 | primary += ""; 881 | secondary += "F"; 882 | current += 1; 883 | break; 884 | } 885 | 886 | // polish e.g. 'filipowicz' 887 | if (string_at(original, current, 4, 888 | ["WICZ","WITZ"])) { 889 | primary += "TS"; 890 | secondary += "FX"; 891 | current += 4; 892 | break; 893 | } 894 | 895 | // else skip it 896 | current += 1; 897 | break; 898 | 899 | case 'X': 900 | // french e.g. breaux 901 | if (!((current == last) 902 | && (string_at(original, current - 3, 3, 903 | ["IAU", "EAU"]) 904 | || string_at(original, current - 2, 2, 905 | ["AU", "OU"])))) { 906 | primary += "KS"; 907 | secondary += "KS"; 908 | } 909 | 910 | if (string_at(original, current + 1, 1, 911 | ["C","X"])) 912 | current += 2; 913 | else 914 | current += 1; 915 | break; 916 | 917 | case 'Z': 918 | // chinese pinyin e.g. 
'zhao' 919 | if (original.substr(current + 1, 1) == "H") { 920 | primary += "J"; 921 | secondary += "J"; 922 | current += 2; 923 | break; 924 | } else if (string_at(original, current + 1, 2, 925 | ["ZO", "ZI", "ZA"]) 926 | || (Slavo_Germanic(original) 927 | && ((current > 0) 928 | && original.substr(current - 1, 1) != 'T'))) { 929 | primary += "S"; 930 | secondary += "TS"; 931 | } else { 932 | primary += "S"; 933 | secondary += "S"; 934 | } 935 | 936 | if (original.substr(current + 1, 1) == 'Z') 937 | current += 2; 938 | else 939 | current += 1; 940 | break; 941 | 942 | default: 943 | current += 1; 944 | 945 | } // end switch 946 | 947 | } // end while 948 | 949 | primary = primary.substr( 0, 4); 950 | secondary = secondary.substr(0, 4); 951 | 952 | if( primary == secondary ) 953 | { 954 | secondary = null ; 955 | } 956 | 957 | return { 958 | primary: primary, 959 | secondary: secondary 960 | } 961 | } 962 | 963 | /*=================================================================*\ 964 | # Name: string_at(string, start, length, list) 965 | # Purpose: Helper function for DoubleMetaphone( ) 966 | # Return: Bool 967 | \*=================================================================*/ 968 | function string_at(string, start, length, list) 969 | { 970 | if ((start <0) || (start >= string.length)) 971 | return 0; 972 | 973 | for (var i=0, len=list.length; i0 54 | meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1 55 | mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1 56 | s_v = "^(" + C + ")?" 
+ v; // vowel in stem 57 | 58 | return { 59 | process: function(w) { 60 | var stem, 61 | suffix, 62 | firstch, 63 | re, 64 | re2, 65 | re3, 66 | re4, 67 | origword = w; 68 | 69 | if (w.length < 3) { return w; } 70 | 71 | firstch = w.substr(0,1); 72 | if (firstch == "y") { 73 | w = firstch.toUpperCase() + w.substr(1); 74 | } 75 | 76 | // Step 1a 77 | re = /^(.+?)(ss|i)es$/; 78 | re2 = /^(.+?)([^s])s$/; 79 | 80 | if (re.test(w)) { w = w.replace(re,"$1$2"); } 81 | else if (re2.test(w)) { w = w.replace(re2,"$1$2"); } 82 | 83 | // Step 1b 84 | re = /^(.+?)eed$/; 85 | re2 = /^(.+?)(ed|ing)$/; 86 | if (re.test(w)) { 87 | var fp = re.exec(w); 88 | re = new RegExp(mgr0); 89 | if (re.test(fp[1])) { 90 | re = /.$/; 91 | w = w.replace(re,""); 92 | } 93 | } else if (re2.test(w)) { 94 | var fp = re2.exec(w); 95 | stem = fp[1]; 96 | re2 = new RegExp(s_v); 97 | if (re2.test(stem)) { 98 | w = stem; 99 | re2 = /(at|bl|iz)$/; 100 | re3 = new RegExp("([^aeiouylsz])\\1$"); 101 | re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 102 | if (re2.test(w)) { w = w + "e"; } 103 | else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); } 104 | else if (re4.test(w)) { w = w + "e"; } 105 | } 106 | } 107 | 108 | // Step 1c 109 | re = /^(.+?)y$/; 110 | if (re.test(w)) { 111 | var fp = re.exec(w); 112 | stem = fp[1]; 113 | re = new RegExp(s_v); 114 | if (re.test(stem)) { w = stem + "i"; } 115 | } 116 | 117 | // Step 2 118 | re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; 119 | if (re.test(w)) { 120 | var fp = re.exec(w); 121 | stem = fp[1]; 122 | suffix = fp[2]; 123 | re = new RegExp(mgr0); 124 | if (re.test(stem)) { 125 | w = stem + step2list[suffix]; 126 | } 127 | } 128 | 129 | // Step 3 130 | re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; 131 | if (re.test(w)) { 132 | var fp = re.exec(w); 133 | stem = fp[1]; 134 | suffix = fp[2]; 135 | re = new RegExp(mgr0); 136 | if (re.test(stem)) { 137 | w 
= stem + step3list[suffix]; 138 | } 139 | } 140 | 141 | // Step 4 142 | re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; 143 | re2 = /^(.+?)(s|t)(ion)$/; 144 | if (re.test(w)) { 145 | var fp = re.exec(w); 146 | stem = fp[1]; 147 | re = new RegExp(mgr1); 148 | if (re.test(stem)) { 149 | w = stem; 150 | } 151 | } else if (re2.test(w)) { 152 | var fp = re2.exec(w); 153 | stem = fp[1] + fp[2]; 154 | re2 = new RegExp(mgr1); 155 | if (re2.test(stem)) { 156 | w = stem; 157 | } 158 | } 159 | 160 | // Step 5 161 | re = /^(.+?)e$/; 162 | if (re.test(w)) { 163 | var fp = re.exec(w); 164 | stem = fp[1]; 165 | re = new RegExp(mgr1); 166 | re2 = new RegExp(meq1); 167 | re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); 168 | if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) { 169 | w = stem; 170 | } 171 | } 172 | 173 | re = /ll$/; 174 | re2 = new RegExp(mgr1); 175 | if (re.test(w) && re2.test(w)) { 176 | re = /.$/; 177 | w = w.replace(re,""); 178 | } 179 | 180 | // and turn initial Y back to y 181 | 182 | if (firstch == "y") { 183 | w = firstch.toLowerCase() + w.substr(1); 184 | } 185 | 186 | return w; 187 | } 188 | } 189 | } -------------------------------------------------------------------------------- /lib/node-search/tokenizer/index.js: -------------------------------------------------------------------------------- 1 | var DoubleMetaphone = require("../double-metaphone").DoubleMetaphone; 2 | var PorterStemmer = require("../porter-stemmer").PorterStemmer; 3 | 4 | exports.Tokenizer = function() { 5 | return { 6 | stemmer:PorterStemmer(), 7 | process: function(words){ 8 | var result = []; 9 | for( var i=0; i' 8 | # c.option '-b', '--bar [string]', 'Does some bar with [string]' 9 | # c.example 'Do some foo', 'jspec example --foo bar' 10 | # c.example 'Do some bar', 'jspec example --bar' 11 | # c.when_called do |args, options| 12 | # p args 13 | # p options.__hash__ 14 | # # options.foo 15 | # # options.bar 16 | # # 
options.__hash__[:foo] 17 | # # options.__hash__[:bar] 18 | # end 19 | # end -------------------------------------------------------------------------------- /spec/node.js: -------------------------------------------------------------------------------- 1 | 2 | require.paths.unshift('spec', '/usr/local/lib/ruby/gems/1.8/gems/jspec-4.3.3/lib', 'lib') 3 | require('jspec') 4 | require('unit/spec.helper') 5 | require('yourlib') 6 | 7 | JSpec 8 | .exec('spec/unit/spec.js') 9 | .run({ reporter: JSpec.reporters.Terminal, fixturePath: 'spec/fixtures', failuresOnly: true }) 10 | .report() 11 | -------------------------------------------------------------------------------- /spec/unit/spec.helper.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talltyler/node-search/caeb8759faf1c08b43da91dc9c34ec45d94da779/spec/unit/spec.helper.js -------------------------------------------------------------------------------- /spec/unit/spec.js: -------------------------------------------------------------------------------- 1 | JSpec.describe('Search', function(){ 2 | before_each(function{ 3 | //TODO 4 | //search = new Search 5 | }) 6 | 7 | describe('addProducts', function(){ 8 | it ('should add several products', function(){ 9 | cart.addProducts('cookie') 10 | cart.addProducts('icecream') 11 | expect(cart).to(have, 2, 'products') 12 | }) 13 | }) 14 | 15 | describe('checkout', function(){ 16 | it ('should throw an error when checking out with no products', function(){ 17 | expect(function(){ cart.clear().checkout() }).to(throw_error, EmptyCart) 18 | }) 19 | }) 20 | }) 21 | 22 | 23 | --------------------------------------------------------------------------------