├── README.md ├── LICENSE └── bow.js /README.md: -------------------------------------------------------------------------------- 1 | bow.js 2 | ====== 3 | 4 | Bag of words models (i.e. term-document matrix creation) in Javascript. 5 | 6 | Requires Numeric.js for Matrix creation. 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 aneesha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
"use strict";

// JavaScript implementation to create Bag of Words (BOW) models
// (i.e. document-term matrices).

var bow = (typeof exports === "undefined") ? (function bow() {}) : (exports);
if (typeof global !== "undefined") { global.bow = bow; }

bow.version = "0.5";

// The helpers below are kept inside a function scope so that loading this
// file as a plain browser script adds no globals besides `bow`.
(function () {
  // English stop words removed by bow.removestopwords.
  // "amoungst" and "fify" are misspellings inherited from the original list;
  // they are kept so previously filtered tokens stay filtered, and the
  // intended "fifty" is listed as well.
  var STOPWORDS = [
    "a", "about", "above", "across", "after", "afterwards", "again",
    "against", "all", "almost", "alone", "along", "already", "also",
    "although", "always", "am", "among", "amongst", "amoungst", "amount",
    "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway",
    "anywhere", "are", "around", "as", "at", "back", "be", "became",
    "because", "become", "becomes", "becoming", "been", "before",
    "beforehand", "behind", "being", "below", "beside", "besides", "between",
    "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot",
    "cant", "co", "con", "could", "couldnt", "cry", "de", "describe",
    "detail", "do", "does", "done", "down", "due", "during", "each", "eg",
    "eight", "either", "eleven", "else", "elsewhere", "empty", "enough",
    "etc", "even", "ever", "every", "everyone", "everything", "everywhere",
    "except", "few", "fifteen", "fifty", "fify", "fill", "find", "fire",
    "first", "five", "for", "former", "formerly", "forty", "found", "four",
    "from", "front", "full", "further", "get", "give", "go", "had", "has",
    "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby",
    "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how",
    "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest",
    "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most",
    "mostly", "move", "much", "must", "my", "myself", "name", "namely",
    "neither", "never", "nevertheless", "next", "nine", "no", "nobody",
    "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off",
    "often", "on", "once", "one", "only", "onto", "or", "other", "others",
    "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part",
    "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem",
    "seemed", "seeming", "seems", "serious", "several", "she", "should",
    "show", "side", "since", "sincere", "six", "sixty", "so", "some",
    "somehow", "someone", "something", "sometime", "sometimes", "somewhere",
    "still", "such", "system", "take", "ten", "than", "that", "the", "their",
    "them", "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "way", "we", "well", "were", "what", "whatever",
    "when", "whence", "whenever", "where", "whereafter", "whereas",
    "whereby", "wherein", "whereupon", "wherever", "whether", "which",
    "while", "whither", "who", "whoever", "whole", "whom", "whose", "why",
    "will", "with", "within", "without", "would", "yet", "you", "your",
    "yours", "yourself", "yourselves",
    // contractions?
    "didnt", "doesnt", "dont", "isnt", "wasnt", "youre", "hes", "ive",
    "theyll", "whos", "wheres", "whens", "whys", "hows", "whats", "shes",
    "im", "thats"
  ];

  // Prototype-less lookup table built once, so membership tests do not scan
  // the list and tokens such as "constructor" never match inherited keys.
  var stopwordLookup = Object.create(null);
  STOPWORDS.forEach(function (word) {
    stopwordLookup[word] = true;
  });

  // Calls visit(value) for every element of an array, or for every own
  // enumerable property value of any other input (strings are visited
  // character by character). Inherited properties are never visited.
  function eachValue(collection, visit) {
    if (Array.isArray(collection)) {
      // forEach skips holes and ignores non-index properties.
      collection.forEach(function (value) {
        visit(value);
      });
      return;
    }

    var boxed = Object(collection); // null/undefined become an empty object
    Object.keys(boxed).forEach(function (key) {
      visit(boxed[key]);
    });
  }

  /**
   * Splits text into lower-cased word tokens.
   *
   * The input is converted to a string, trimmed, split on runs of
   * whitespace, and each piece has leading/trailing non-word characters
   * removed. Pieces that end up empty (e.g. "--") are dropped, so empty or
   * whitespace-only input yields [].
   *
   * NOTE: `\W` only treats ASCII letters, digits and "_" as word characters,
   * so non-ASCII characters at the edges of a token are stripped as well.
   *
   * @param {*} obj - Text to tokenize; null/undefined/missing yields [].
   * @returns {string[]} The tokens in their original order.
   */
  bow.tokenizer = function tokenizer(obj) {
    if (obj == null) {
      return [];
    }

    return String(obj)
      .trim()
      .split(/\s+/)
      .map(function (token) {
        return token.replace(/^\W+/, '').replace(/\W+$/, '').toLowerCase();
      })
      .filter(function (token) {
        return token.length > 0;
      });
  };

  /**
   * Returns the tokens that are not stop words, preserving order.
   *
   * Matching is exact (case-sensitive) and only string tokens can match;
   * every other value is passed through unchanged.
   *
   * @param {Array|Object} obj - Tokens (typically bow.tokenizer output).
   *   A non-array object has its own enumerable property values filtered.
   * @returns {Array} A new array without stop words; [] for null/undefined.
   */
  bow.removestopwords = function removestopwords(obj) {
    var kept = [];
    if (obj == null) {
      return kept;
    }

    eachValue(obj, function (token) {
      var isStopword = typeof token === "string" &&
        stopwordLookup[token] === true;
      if (!isStopword) {
        kept.push(token);
      }
    });

    return kept;
  };

  /**
   * Builds the list of distinct terms across all documents, in order of
   * first appearance.
   *
   * @param {Array|Object} obj - Collection of documents, each a collection
   *   of terms (arrays, or objects whose own enumerable values are used).
   * @returns {Array} The vocabulary; [] for null/undefined.
   */
  bow.makevocabulary = function makevocabulary(obj) {
    var vocabulary = [];
    if (obj == null) {
      return vocabulary;
    }

    // String terms are de-duplicated through a prototype-less table instead
    // of rescanning the vocabulary for every term.
    var seen = Object.create(null);

    eachValue(obj, function (doc) {
      eachValue(doc, function (term) {
        if (typeof term === "string") {
          if (seen[term] === true) {
            return;
          }
          seen[term] = true;
        } else if (vocabulary.indexOf(term) !== -1) {
          // Non-string terms keep strict-equality de-duplication.
          return;
        }
        vocabulary.push(term);
      });
    });

    return vocabulary;
  };
}());

// NOTE(review): the original file continues here with `bow.makematrix`, but
// its definition is cut off in the reviewed excerpt. The visible fragment is
// preserved verbatim below (commented out so this file parses) and must be
// restored together with the remainder of the function.
//
// bow.makematrix = function makematrix(vocab_obj,obj) {
//   if (!arguments.length || obj == null || obj == undefined) return []
//
//   var vocab_size = vocab_obj.length;
//   var document_size = obj.length;
//   var matrix = numeric.rep([document_size,vocab_size],0);
//
//   for (var doc=0; doc