├── lib ├── nlpjs.js └── levenshtein.js ├── .gitignore ├── package.json ├── README.md └── LICENSE /lib/nlpjs.js: -------------------------------------------------------------------------------- 1 | exports.levenshtein = require("./levenshtein").levenshtein; -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac. 2 | .DS_STORE 3 | 4 | # Node. 5 | node_modules 6 | npm-debug.log -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nlpjs", 3 | "description": "NLP library for javascript", 4 | "version": "0.0.2", 5 | "author": "nicktesla", 6 | "repository": { 7 | "type": "git", 8 | "url": "http://github.com/nicktesla/nlpjs.git" 9 | }, 10 | "main": "./lib/nlpjs", 11 | "dependencies": { 12 | 13 | }, 14 | "devDependencies" : { 15 | 16 | }, 17 | "keywords": ["nlp", "levenshtein", "machine learning", "natural language processing", "nltk"] 18 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nlpjs 2 | `nlpjs` is a JavaScript [natural language processing](http://en.wikipedia.org/wiki/Natural_language_processing) library. 
3 | 4 | 5 | ## Using in node 6 | If you have [node](http://nodejs.org/) you can install with [npm](http://github.com/isaacs/npm): 7 | 8 | npm install nlpjs 9 | 10 | ## Done 11 | 12 | String Similarity/Distance: Recursive and iterative implementations of Levenshtein distance 13 | 14 | 15 | ## TO DO 16 | 17 | * Other Distance measures 18 | * Sentence splitting 19 | * Stemming 20 | * Word frequency analysis 21 | * Co-occurrence analysis 22 | * Part-of-speech tagging 23 | * trigram statistics 24 | * named entity recognition 25 | * word sense disambiguation 26 | * Library for clustering 27 | * Bayesian classifier 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 nicktesla 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/**
 * Recursive Levenshtein (edit) distance between two strings.
 *
 * The distance is the minimum number of single-character insertions,
 * deletions and substitutions needed to turn `firstword` into `secword`.
 * Characters are compared exactly, so the comparison is case-sensitive.
 *
 * The recurrence is the textbook one, evaluated top-down on prefix lengths
 * and memoized, so every (prefix, prefix) pair is solved at most once
 * instead of being recomputed along every recursive path.
 *
 * @param {string} firstword - Source string.
 * @param {string} secword - Target string.
 * @returns {number} The edit distance between the two strings.
 */
// Declared with `function` (not assigned to an undeclared identifier) so the
// name does not leak onto the global object and the code is valid in strict
// mode / ES modules.
function levenshteinDistR(firstword, secword) {
    // memo[i][j] caches the distance between the first i characters of
    // firstword and the first j characters of secword.
    var memo = [];

    function distance(flength, slength) {
        // An empty prefix is transformed by inserting/deleting every
        // character of the other prefix.
        if (flength === 0) {
            return slength;
        }
        if (slength === 0) {
            return flength;
        }

        var row = memo[flength] || (memo[flength] = []);
        if (row[slength] !== undefined) {
            return row[slength];
        }

        // Substitution is free when the last characters already match.
        var cost = firstword.charAt(flength - 1) === secword.charAt(slength - 1) ? 0 : 1;

        var best = Math.min(
            distance(flength - 1, slength) + 1,        // delete from firstword
            distance(flength, slength - 1) + 1,        // insert into firstword
            distance(flength - 1, slength - 1) + cost  // substitute (or keep)
        );

        row[slength] = best;
        return best;
    }

    return distance(firstword.length, secword.length);
}