├── .gitignore ├── package.json ├── Source ├── textAnalysis.js ├── TFIDF.js └── nGrams.js ├── tryMeOut.js ├── Specs ├── TFIDF.spec.js ├── nGrams.spec.js └── textAnalysis.spec.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "LanguageAnalysis", 3 | "version": "0.0.0", 4 | "description": "* Text analysis module: * countWords * countSentences * countParagraphs * estimateReadingTime * analyzeText * addTags * nGram analysis module: * buildNGrams * getNGramsByFrequency * getMostCommonNGrams", 5 | "main": "textAnalysis.js", 6 | "scripts": { 7 | "test": "mocha Specs/*.spec.js -R nyan --bail", 8 | "start": "node tryMeOut.js" 9 | }, 10 | "author": "Drew Cuthbertson", 11 | "license": "ISC", 12 | "dependencies": { 13 | "chai": "^1.9.2", 14 | "mocha": "^2.0.1" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Source/textAnalysis.js: -------------------------------------------------------------------------------- 1 | var countWords = function(text){ 2 | var words = text.match(/\w+['-']*\w*/g); 3 | return words ? words.length : 0; 4 | } 5 | 6 | var countSentences = function(text){ 7 | var sentences = text.match(/[^.!?]+/g); 8 | return sentences ? sentences.length : 0; 9 | } 10 | 11 | var countParagraphs = function(text){ 12 | var paragraphs = text.match(/[^\n]+\s*/g); 13 | return paragraphs ? 
paragraphs.length : 0; 14 | } 15 | 16 | var estimateReadingTime = function(text, readingSpeed){ 17 | readingSpeed = readingSpeed || 250; 18 | var wordCount = countWords(text); 19 | return Math.ceil(wordCount / readingSpeed); 20 | } 21 | 22 | var analyzeText = function(text, options){ 23 | options = options || {}; 24 | var analysis = {}; 25 | analysis.wordCount = countWords(text); 26 | analysis.sentenceCount = countSentences(text); 27 | analysis.paragraphCount = countParagraphs(text); 28 | analysis.readingSpeed = options.readingSpeed || 250; 29 | analysis.estimatedReadingTime = estimateReadingTime(text, options.readingSpeed); 30 | analysis.tags = options.tags || []; 31 | return analysis; 32 | } 33 | 34 | var addTags = function(analysis, newTags){ 35 | var tags = {}; 36 | for(var i = 0; i < analysis.tags.length; i++){ 37 | tags[analysis.tags[i]] = true; 38 | } 39 | for(var i = 0; i < newTags.length; i++){ 40 | tags[newTags[i]] = true; 41 | } 42 | analysis.tags = Object.keys(tags); 43 | return tags; 44 | } 45 | 46 | module.exports = { 47 | countWords: countWords, 48 | countSentences: countSentences, 49 | countParagraphs: countParagraphs, 50 | estimateReadingTime: estimateReadingTime, 51 | analyzeText: analyzeText, 52 | addTags: addTags, 53 | } 54 | -------------------------------------------------------------------------------- /tryMeOut.js: -------------------------------------------------------------------------------- 1 | var textAnalysis = require('./Source/textAnalysis.js'); 2 | var nGrams = require('./Source/nGrams.js'); 3 | var TFIDF = require('./Source/TFIDF.js'); 4 | 5 | var TFStorage = {}; 6 | 7 | process.stdin.setEncoding('utf8'); 8 | 9 | process.stdin.on('data', function(data){ 10 | var simpleAnalysis = textAnalysis.analyzeText(data); 11 | var bigramsSet = nGrams.buildNGrams(data, 2); 12 | var bigramList = nGrams.listNGramsByCount(bigramsSet); 13 | var commonBigrams = nGrams.getMostCommonNGrams(bigramsSet); 14 | var printableBigrams = ''; 15 | for(var i = 
0; i < commonBigrams.length; i++){ 16 | printableBigrams += '"' + commonBigrams[i] + '"'; 17 | if(i < commonBigrams.length - 1) printableBigrams += ', '; 18 | } 19 | 20 | var TFIDFAnalysis = TFIDF.fullTFIDFAnalysis(data, {TFStorage: TFStorage}); 21 | var printableUniques = ''; 22 | for(var i = 0; i < TFIDFAnalysis.mostUniqueTerms.length; i++){ 23 | printableUniques += '"' + TFIDFAnalysis.mostUniqueTerms[i] + '"'; 24 | if(i < TFIDFAnalysis.mostUniqueTerms.length - 1) printableUniques += ', '; 25 | } 26 | 27 | process.stdout.write('\n\n================RAW DATA================\n'); 28 | process.stdout.write('Simple analysis\n' + JSON.stringify(simpleAnalysis) + '\n\n'); 29 | process.stdout.write('Bigrams\n' + JSON.stringify(bigramList) + '\n\n'); 30 | process.stdout.write('TFIDF\n' + JSON.stringify(TFIDFAnalysis) + '\n\n'); 31 | process.stdout.write('\n\n================INTERPRETED DATA================\n'); 32 | process.stdout.write('Word Count: ' + simpleAnalysis.wordCount + '\n'); 33 | process.stdout.write('Sentence Count: ' + simpleAnalysis.sentenceCount + '\n'); 34 | process.stdout.write('Paragraph Count: ' + simpleAnalysis.paragraphCount + '\n'); 35 | process.stdout.write('Estimated Reading Time: ' + simpleAnalysis.estimatedReadingTime + ' minute\n'); 36 | process.stdout.write('Most Common Bigrams: ' + printableBigrams + '\n'); 37 | process.stdout.write('Most Unique Terms: ' + printableUniques + '\n'); 38 | process.stdout.write('\n\n================END================\n\n\n\n'); 39 | }); 40 | -------------------------------------------------------------------------------- /Source/TFIDF.js: -------------------------------------------------------------------------------- 1 | var nGrams = require('./nGrams.js'); 2 | 3 | var countTermFrequencies = function(text, options){ 4 | var tokenLength = options ? 
options.tokenLength || 1 : 1; 5 | var nGramList = nGrams.buildNGrams(text, tokenLength); 6 | return nGrams.listNGramsByCount(nGramList); 7 | } 8 | 9 | var storeTermFrequencies = function(TF, TFStorage){ 10 | TFStorage = TFStorage || {}; 11 | for(var count in TF){ 12 | for(var i = 0; i < TF[count].length; i++){ 13 | var word = TF[count][i]; 14 | if(word in TFStorage) TFStorage[word] += +count; 15 | else TFStorage[word] = +count; 16 | } 17 | } 18 | return TFStorage; 19 | } 20 | 21 | var normalizeTermFrequencies = function(TF, TFStorage){ 22 | var IDF = {}; 23 | for(var count in TF){ 24 | for(var i = 0; i < TF[count].length; i++){ 25 | var word = TF[count][i]; 26 | IDF[word] = +(count / TFStorage[word]).toFixed(4); 27 | } 28 | } 29 | 30 | return IDF; 31 | } 32 | 33 | var identifyUniqueTerms = function(IDF, options){ 34 | if(options && options.uniqueThreshold >= 0){ 35 | var score = options.uniqueThreshold; 36 | var uniqueSet = {}; 37 | for(var word in IDF){ 38 | if(IDF[word] >= score){ 39 | uniqueSet[word] = IDF[word]; 40 | } 41 | } 42 | } else { 43 | var uniqueSet = []; 44 | var score = 0; 45 | for(var word in IDF){ 46 | if(IDF[word] > score){ 47 | uniqueSet = [word]; 48 | score = IDF[word]; 49 | } else if(IDF[word] === score){ 50 | uniqueSet.push(word); 51 | } 52 | } 53 | } 54 | return uniqueSet 55 | } 56 | 57 | var fullTFIDFAnalysis = function(text, options){ 58 | options = options || {}; 59 | var analysis = {}; 60 | analysis.frequencyCount = countTermFrequencies(text, options.tokenLength); 61 | analysis.TFStorage = storeTermFrequencies(analysis.frequencyCount, options.TFStorage); 62 | analysis.IDF = normalizeTermFrequencies(analysis.frequencyCount, analysis.TFStorage); 63 | analysis.mostUniqueTerms = identifyUniqueTerms(analysis.IDF); 64 | return analysis; 65 | } 66 | 67 | module.exports = { 68 | countTermFrequencies: countTermFrequencies, 69 | storeTermFrequencies: storeTermFrequencies, 70 | normalizeTermFrequencies: normalizeTermFrequencies, 71 | 
identifyUniqueTerms: identifyUniqueTerms, 72 | fullTFIDFAnalysis: fullTFIDFAnalysis, 73 | } 74 | -------------------------------------------------------------------------------- /Source/nGrams.js: -------------------------------------------------------------------------------- 1 | var buildNGrams = function(text, unit, options){ 2 | unit = unit || 1; 3 | options = options || {}; 4 | var nGrams = {}; 5 | if(!text.length) return nGrams; 6 | if(!options.caseSensitive) text = text.toLowerCase(); 7 | if(options.includePunctuation){ 8 | var sentenceSplitter = new RegExp('[^\.\?!;]+[\.\?!;]', 'g') 9 | } else { 10 | var sentenceSplitter = new RegExp('[^.?!;]+', 'g'); 11 | } 12 | // split text into a list of sentenceList 13 | var sentenceList = text.match(sentenceSplitter); 14 | // strip punctuation from the sentenceList and separate by word 15 | for(var i = 0; i < sentenceList.length; i++){ 16 | if(options.includePunctuation) { 17 | sentenceList[i] = sentenceList[i].replace(/([\?\.!;])/, ' $1'); 18 | } 19 | sentenceList[i] = sentenceList[i].replace(/([^\w]*-[^\w])+|[\s,:]+/g, ' ') 20 | .replace(/^\s/, '') 21 | .split(/\s+/g); 22 | } 23 | 24 | for(var sentence = 0; sentence < sentenceList.length; sentence++){ 25 | for(var word = 0; word < sentenceList[sentence].length - unit + 1; word++){ 26 | var start = ''; 27 | for(var gramLength = 0; gramLength < unit - 1; gramLength++){ 28 | start += sentenceList[sentence][word + gramLength] + ' '; 29 | } 30 | // remove trailing space 31 | start = start.slice(0, start.length - 1); 32 | var end = sentenceList[sentence][word + unit - 1]; 33 | if(unit === 1){ 34 | var bucket = nGrams; 35 | } else { 36 | if( !(start in nGrams) ){ 37 | nGrams[start] = {}; 38 | } 39 | var bucket = nGrams[start]; 40 | } 41 | if(end in bucket){ 42 | bucket[end]++; 43 | } else { 44 | bucket[end] = 1; 45 | } 46 | } 47 | } 48 | return nGrams; 49 | } 50 | 51 | var listAllNGrams = function(nGrams){ 52 | var nGramList = []; 53 | for(var i in nGrams){ 54 | if(typeof 
nGrams[i] === 'number'){ 55 | nGramList = Object.keys(nGrams); 56 | break; 57 | } 58 | for(var j in nGrams[i]){ 59 | nGramList.push(i + ' ' + j); 60 | } 61 | } 62 | return nGramList; 63 | } 64 | 65 | var getNGramsByFrequency = function(nGrams, frequency){ 66 | var nGramList = []; 67 | for(var i in nGrams){ 68 | if(typeof nGrams[i] === 'number'){ 69 | if(nGrams[i] === frequency) nGramList.push(i); 70 | } else { 71 | for(var j in nGrams[i]){ 72 | if(nGrams[i][j] === frequency) nGramList.push(i + ' ' + j); 73 | } 74 | } 75 | } 76 | return nGramList; 77 | } 78 | 79 | var getMostCommonNGrams = function(nGrams){ 80 | var nGramList = []; 81 | var maximumFrequency = 1; 82 | for(var i in nGrams){ 83 | if(typeof nGrams[i] === 'number'){ 84 | if(nGrams[i] > maximumFrequency){ 85 | nGramList = [i]; 86 | maximumFrequency = nGrams[i]; 87 | } else if(nGrams[i] === maximumFrequency){ 88 | nGramList.push(i); 89 | } 90 | } else { 91 | for(var j in nGrams[i]){ 92 | if(nGrams[i][j] > maximumFrequency){ 93 | nGramList = [i + ' ' + j]; 94 | maximumFrequency = nGrams[i][j]; 95 | } else if(nGrams[i][j] === maximumFrequency){ 96 | nGramList.push(i + ' ' + j); 97 | } 98 | } 99 | } 100 | } 101 | return nGramList; 102 | } 103 | 104 | var listNGramsByCount = function(nGrams){ 105 | var countList = {}; 106 | for(var i in nGrams){ 107 | if(typeof nGrams[i] === 'number'){ 108 | if(nGrams[i] in countList){ 109 | countList[nGrams[i]].push(i) 110 | } else { 111 | countList[nGrams[i]] = [i]; 112 | } 113 | } else { 114 | for(var j in nGrams[i]){ 115 | if(nGrams[i][j] in countList){ 116 | countList[nGrams[i][j]].push(i + ' ' + j); 117 | } else { 118 | countList[nGrams[i][j]] = [i + ' ' + j]; 119 | } 120 | } 121 | } 122 | } 123 | return countList; 124 | } 125 | 126 | module.exports = { 127 | buildNGrams: buildNGrams, 128 | listAllNGrams: listAllNGrams, 129 | getNGramsByFrequency: getNGramsByFrequency, 130 | getMostCommonNGrams: getMostCommonNGrams, 131 | listNGramsByCount: listNGramsByCount, 132 | } 133 
| -------------------------------------------------------------------------------- /Specs/TFIDF.spec.js: -------------------------------------------------------------------------------- 1 | var expect = require('chai').expect; 2 | var TFIDF = require('../Source/TFIDF.js'); 3 | 4 | describe('The countTermFrequencies method', function(){ 5 | it('should generate a list of terms sorted by their frequency for unigrams', function(){ 6 | var list = TFIDF.countTermFrequencies("Apple orange pizza apple"); 7 | expect(list).to.deep.equal({ 1: ['orange', 'pizza'], 2: ['apple']}); 8 | }); 9 | 10 | it('should generate a list of terms sorted by their frequency for bigrams', function(){ 11 | var list = TFIDF.countTermFrequencies("Apple orange pizza apple", {tokenLength: 2}); 12 | expect(list).to.deep.equal({1: ['apple orange', 'orange pizza', 'pizza apple']}); 13 | }); 14 | }); 15 | 16 | describe('The storeTermFrequencies method', function(){ 17 | it('should add the termFrequencies to the term frequency storage', function(){ 18 | var TFStorage = {}; 19 | var list = TFIDF.countTermFrequencies("Apple orange pizza apple"); 20 | TFIDF.storeTermFrequencies(list, TFStorage); 21 | expect(TFStorage).to.deep.equal({ apple: 2, orange: 1, pizza: 1}); 22 | TFIDF.storeTermFrequencies(list, TFStorage); 23 | expect(TFStorage).to.deep.equal({ apple: 4, orange: 2, pizza: 2}); 24 | }); 25 | 26 | it('should return the new storage object', function(){ 27 | var list = TFIDF.countTermFrequencies("Apple orange pizza apple"); 28 | var storage = TFIDF.storeTermFrequencies(list); 29 | expect(storage).to.deep.equal({ apple: 2, orange: 1, pizza: 1}); 30 | }); 31 | }); 32 | 33 | describe('The normalizeTermFrequencies method', function(){ 34 | it('should normalize term frequency counts based on the current term frequency storage counts', function(){ 35 | var TFStorage = {}; 36 | var list = TFIDF.countTermFrequencies("Apple orange pizza apple"); 37 | TFIDF.storeTermFrequencies(list, TFStorage); 38 | var IDF = 
TFIDF.normalizeTermFrequencies(list, TFStorage); 39 | expect(IDF).to.deep.equal({apple: 1, orange: 1, pizza: 1}); 40 | 41 | var secondList = TFIDF.countTermFrequencies("Apple orange pizza peach"); 42 | TFIDF.storeTermFrequencies(secondList, TFStorage); 43 | var secondIDF = TFIDF.normalizeTermFrequencies(secondList, TFStorage); 44 | expect(secondIDF).to.deep.equal({ apple: 0.3333 , orange: 0.5000, pizza: 0.5000, peach: 1.0000}); 45 | }); 46 | }); 47 | 48 | describe('The identifyUniqueTerms method', function(){ 49 | it('should identify the most unique term from the normalized set of terms', function(){ 50 | var TFStorage = {}; 51 | 52 | var list = TFIDF.countTermFrequencies("Apple orange pizza apple"); 53 | TFIDF.storeTermFrequencies(list, TFStorage); 54 | var IDF = TFIDF.normalizeTermFrequencies(list, TFStorage); 55 | 56 | var secondList = TFIDF.countTermFrequencies("Apple orange pizza peach"); 57 | TFIDF.storeTermFrequencies(secondList, TFStorage); 58 | var secondIDF = TFIDF.normalizeTermFrequencies(secondList, TFStorage); 59 | 60 | var mostUnique = TFIDF.identifyUniqueTerms(secondIDF); 61 | expect(mostUnique).to.deep.equal(['peach']); 62 | }); 63 | 64 | it('should return a list of most unique terms when there is a tie', function(){ 65 | var TFStorage = {}; 66 | 67 | var list = TFIDF.countTermFrequencies("Apple orange pizza apple"); 68 | TFIDF.storeTermFrequencies(list, TFStorage); 69 | var IDF = TFIDF.normalizeTermFrequencies(list, TFStorage); 70 | 71 | var secondList = TFIDF.countTermFrequencies("Apple orange pizza"); 72 | TFIDF.storeTermFrequencies(secondList, TFStorage); 73 | var secondIDF = TFIDF.normalizeTermFrequencies(secondList, TFStorage); 74 | 75 | var mostUnique = TFIDF.identifyUniqueTerms(secondIDF); 76 | expect(mostUnique).to.deep.equal(['orange', 'pizza']); 77 | }); 78 | 79 | it('should identify all unique terms based on the optional uniqueThreshold input', function(){ 80 | var TFStorage = {}; 81 | 82 | var list = TFIDF.countTermFrequencies("Apple 
orange pizza apple"); 83 | TFIDF.storeTermFrequencies(list, TFStorage); 84 | var IDF = TFIDF.normalizeTermFrequencies(list, TFStorage); 85 | 86 | var secondList = TFIDF.countTermFrequencies("Apple orange pizza peach"); 87 | TFIDF.storeTermFrequencies(secondList, TFStorage); 88 | var secondIDF = TFIDF.normalizeTermFrequencies(secondList, TFStorage); 89 | 90 | var mostUniqueTerms = TFIDF.identifyUniqueTerms(secondIDF, {uniqueThreshold: 0.5}); 91 | expect(mostUniqueTerms).to.deep.equal({orange: 0.5000, pizza: 0.5000, peach: 1.0000}); 92 | }); 93 | }); 94 | 95 | describe('The fullTFIDFAnalysis', function(){ 96 | var analysis; 97 | var TFStorage; 98 | beforeEach(function(){ 99 | TFStorage = {}; 100 | analysis = TFIDF.fullTFIDFAnalysis("Apple orange pizza pizza", {TFStorage: TFStorage}); 101 | }); 102 | 103 | it('should have a frequencyCount', function(){ 104 | expect(analysis.frequencyCount).to.deep.equal({1: ['apple', 'orange'], 2: ['pizza']}); 105 | }); 106 | 107 | it('should have a TFStorage', function(){ 108 | expect(analysis.TFStorage).to.deep.equal({apple: 1, orange: 1, pizza: 2}); 109 | }); 110 | 111 | it('should add the frequencyCount to the TFStorage if supplied in options', function(){ 112 | expect(TFStorage).to.deep.equal({apple: 1, orange: 1, pizza: 2}); 113 | }) 114 | 115 | it('should have an IDF', function(){ 116 | expect(analysis.IDF).to.deep.equal({apple: 1.0000, orange: 1.0000, pizza: 1.0000}); 117 | }); 118 | 119 | it('should have a mostUniqueTerms list', function(){ 120 | expect(analysis.mostUniqueTerms).to.deep.equal(['apple', 'orange', 'pizza']); 121 | }); 122 | }); 123 | -------------------------------------------------------------------------------- /Specs/nGrams.spec.js: -------------------------------------------------------------------------------- 1 | var expect = require('chai').expect; 2 | var nGrams = require('../Source/nGrams.js'); 3 | 4 | describe('The buildNGrams method', function(){ 5 | it('should work on an empty string', function(){ 6 
| var emptyGrams = nGrams.buildNGrams('', 1); 7 | expect(emptyGrams).to.deep.equal({}); 8 | }); 9 | 10 | it('should be able to build unigrams', function(){ 11 | var unigrams = nGrams.buildNGrams('Hello world!', 1); 12 | expect(unigrams).to.deep.equal({ hello: 1, world: 1 }); 13 | }); 14 | 15 | it('should track a compound word as a single word', function(){ 16 | var unigrams = nGrams.buildNGrams('This is a top-notch test.', 1); 17 | expect(unigrams).to.deep.equal({"this": 1, is: 1, a: 1, "top-notch": 1, test: 1}); 18 | }); 19 | 20 | it('should allow for apostrophes in words', function(){ 21 | var unigrams = nGrams.buildNGrams("I'm hyphenated!", 1); 22 | expect(unigrams).to.deep.equal({"i'm": 1, hyphenated: 1}); 23 | }); 24 | 25 | it('should track numbers as words', function(){ 26 | var unigrams = nGrams.buildNGrams("Here's 1 more test", 1); 27 | expect(unigrams).to.deep.equal({"here's": 1, "1": 1, more: 1, test: 1}); 28 | }); 29 | 30 | it('should be able to build nGrams with punctuation', function(){ 31 | var unigrams = nGrams.buildNGrams('Hello, world. How are you?', 1, {includePunctuation: true}); 32 | expect(unigrams).to.deep.equal({ hello: 1, world: 1, '.': 1, how: 1, are: 1, you: 1, '?': 1}); 33 | }); 34 | 35 | it('should be able to build case sensitive nGrams', function(){ 36 | var unigrams = nGrams.buildNGrams('Hello World! 
Hello world!', 1, {caseSensitive: true}); 37 | expect(unigrams).to.deep.equal({ Hello: 2, World: 1, world: 1}); 38 | }); 39 | 40 | it('should be able to build nGrams of arbitrary length', function(){ 41 | var bigrams = nGrams.buildNGrams("How are you doing today?", 2); 42 | expect(bigrams).to.deep.equal({ how: { are: 1 }, are: { you: 1 }, you: { doing: 1 }, doing: { today: 1 }}); 43 | var trigrams = nGrams.buildNGrams("How are you doing today?", 3); 44 | expect(trigrams).to.deep.equal({ "how are": { you: 1 }, "are you": { doing: 1 }, "you doing": { today: 1 }}); 45 | var quadrigrams = nGrams.buildNGrams("How are you doing today?", 4); 46 | expect(quadrigrams).to.deep.equal({ "how are you": { doing: 1 }, "are you doing": { today: 1 }}); 47 | var quintigrams = nGrams.buildNGrams("How are you doing today", 5) 48 | expect(quintigrams).to.deep.equal({"how are you doing": { today: 1 }}); 49 | }); 50 | 51 | it('should not build nGrams greater than the length of the input text', function(){ 52 | var trigrams = nGrams.buildNGrams("Hello, world", 3); 53 | expect(trigrams).to.deep.equal({}); 54 | }); 55 | }); 56 | 57 | describe('The listAllNGrams method', function(){ 58 | it('should work for an empty set of nGrams', function(){ 59 | var emptyGrams = nGrams.buildNGrams("", 2); 60 | var allNGrams = nGrams.listAllNGrams(emptyGrams); 61 | expect(allNGrams).to.deep.equal([]); 62 | }); 63 | 64 | it('should return a list of all nGrams, given an input set of nGrams', function(){ 65 | var bigrams = nGrams.buildNGrams("Hello, world! Goodbye, world!", 2); 66 | var allNGrams = nGrams.listAllNGrams(bigrams); 67 | expect(allNGrams).to.deep.equal(["hello world", "goodbye world"]); 68 | }); 69 | 70 | it('should return a list of words, when the input is a set of unigrams', function(){ 71 | var unigrams = nGrams.buildNGrams("Hello, world! 
Goodbye, world!", 1, {includePunctuation: true}); 72 | var allNGrams = nGrams.listAllNGrams(unigrams); 73 | expect(allNGrams).to.deep.equal(["hello", "world", "!", "goodbye"]); 74 | }) 75 | }); 76 | 77 | describe('The getNGramsByFrequency method', function(){ 78 | it('should work for an empty set of nGrams', function(){ 79 | var emptyGrams = nGrams.buildNGrams("", 2); 80 | var twiceGrams = nGrams.getNGramsByFrequency(emptyGrams, 2); 81 | expect(twiceGrams).to.deep.equal([]); 82 | }); 83 | 84 | it('should return all nGrams that appear the input number of times', function(){ 85 | var bigrams = nGrams.buildNGrams("Hello world! How are you? Hello world!", 2); 86 | var twiceGrams = nGrams.getNGramsByFrequency(bigrams, 2); 87 | expect(twiceGrams).to.deep.equal(["hello world"]); 88 | }); 89 | 90 | it('should work when there are no matching nGrams', function(){ 91 | var bigrams = nGrams.buildNGrams("Hello world! Goodbye world!", 2); 92 | var noGrams = nGrams.getNGramsByFrequency(bigrams, 5); 93 | expect(noGrams).to.deep.equal([]); 94 | }); 95 | 96 | it('should work on a set of unigrams', function(){ 97 | var unigrams = nGrams.buildNGrams("Hello world! How are you? Hello world!"); 98 | var twicegrams = nGrams.getNGramsByFrequency(unigrams, 2); 99 | expect(twicegrams).to.deep.equal(["hello", "world"]); 100 | }); 101 | }); 102 | 103 | describe('The getMostCommonNGrams method', function(){ 104 | it('should work for an empty set of nGrams', function(){ 105 | var emptyGrams = nGrams.buildNGrams("", 2); 106 | var commonGrams = nGrams.getMostCommonNGrams(emptyGrams, 2); 107 | expect(commonGrams).to.deep.equal([]); 108 | }); 109 | 110 | it('should return the most common nGram when there is only one most frequent nGram.', function(){ 111 | var bigrams = nGrams.buildNGrams("Hello world! How are you? 
Hello world!", 2); 112 | var commonNGrams = nGrams.getMostCommonNGrams(bigrams); 113 | expect(commonNGrams).to.deep.equal(["hello world"]); 114 | }); 115 | 116 | it('should return a list of the most common nGrams when there are multiple.', function(){ 117 | var bigrams = nGrams.buildNGrams("Hello world! Goodbye world!", 2, {includePunctuation: false}); 118 | var commonNGrams = nGrams.getMostCommonNGrams(bigrams); 119 | expect(commonNGrams).to.deep.equal(["hello world", "goodbye world"]); 120 | }); 121 | }); 122 | 123 | describe('The listNGramsByCount method', function(){ 124 | it('should sort a set of unigrams by count', function(){ 125 | var unigrams = nGrams.buildNGrams("Hello, world! How's the weather? Goodbye, world!", 1); 126 | var listOfGrams = nGrams.listNGramsByCount(unigrams); 127 | expect(listOfGrams).to.deep.equal({ 1: ['hello', "how's", 'the', 'weather', 'goodbye'], 2: ['world']}); 128 | }); 129 | it('should sort a set of bigrams by count', function(){ 130 | var bigrams = nGrams.buildNGrams("Hello, world! 
Hello, world!", 2); 131 | var listOfGrams = nGrams.listNGramsByCount(bigrams); 132 | expect(listOfGrams).to.deep.equal({ 2: ["hello world"]}); 133 | }) 134 | }); 135 | -------------------------------------------------------------------------------- /Specs/textAnalysis.spec.js: -------------------------------------------------------------------------------- 1 | var expect = require('chai').expect; 2 | var textAnalysis = require('../Source/textAnalysis.js'); 3 | 4 | describe('The countWords method', function(){ 5 | it('should work on an empty string', function(){ 6 | var wordCount = textAnalysis.countWords(''); 7 | expect(wordCount).to.equal(0); 8 | }); 9 | it('should count the number of words in a string without punctuation', function(){ 10 | var wordCount = textAnalysis.countWords("This is test number one") 11 | expect(wordCount).to.equal(5); 12 | var biggerWordCount = textAnalysis.countWords("Lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor incididunt ut labore et dolore magna aliqua Ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur Excepteur sint occaecat cupidatat non proident sunt in culpa qui officia deserunt mollit anim id est laborum") 13 | expect(biggerWordCount).to.equal(69); 14 | }); 15 | 16 | it('should not count punctuation or whitespace in the word count', function(){ 17 | var wordCount = textAnalysis.countWords("This is another text. "); 18 | expect(wordCount).to.equal(4); 19 | var nextWordCount = textAnalysis.countWords("And now, for another test! 
With more puncuation - you know, the kind that can 'get in the way'!") 20 | expect(nextWordCount).to.equal(18); 21 | }); 22 | 23 | it('should count numbers as words', function(){ 24 | var wordCount = textAnalysis.countWords("There are 7 words in this string."); 25 | expect(wordCount).to.equal(7); 26 | }); 27 | }); 28 | 29 | describe('The countSentences method', function(){ 30 | it('should work on an empty string', function(){ 31 | var sentenceCount = textAnalysis.countSentences(''); 32 | expect(sentenceCount).to.equal(0); 33 | }); 34 | 35 | it('should count the number of sentences in a string', function(){ 36 | var sentenceCount = textAnalysis.countSentences("This is a test. Just a test? Yes! Just a test; and semicolons can't get in my way!!!") 37 | expect(sentenceCount).to.equal(4) 38 | }); 39 | }); 40 | 41 | describe('The countParagraphs method', function(){ 42 | it('should work on an empty string', function(){ 43 | var paragraphCount = textAnalysis.countParagraphs(''); 44 | expect(paragraphCount).to.equal(0); 45 | }); 46 | 47 | it('should count the number of paragraphs in a string', function(){ 48 | var paragraphCount = textAnalysis.countParagraphs('Hello, World. \n\n Goodbye, World.'); 49 | expect(paragraphCount).to.equal(2); 50 | }); 51 | 52 | it('should not include blocks of whitespace in the paragraph count', function(){ 53 | var paragraphCount = textAnalysis.countParagraphs('Hello, World. \n \n \n Goodbye, World'); 54 | expect(paragraphCount).to.equal(2); 55 | }) 56 | }); 57 | 58 | describe('The estimateReadingTime method', function(){ 59 | it('should calculate the estimated reading time of a string', function(){ 60 | var text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'; 61 | var time = textAnalysis.estimateReadingTime(text); 62 | expect(time).to.equal(2); 63 | var timeWithInputSpeed = textAnalysis.estimateReadingTime(text, 450); 64 | expect(timeWithInputSpeed).to.equal(1); 65 | }); 66 | }); 67 | 68 | describe('The analyzeText method', function(){ 69 | it('should generate a report on the input string, including the metrics tested above.', function(){ 70 | var text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.' 
71 | var analysis = textAnalysis.analyzeText(text); 72 | expect(analysis).to.have.property('wordCount'); 73 | expect(analysis).to.have.property('sentenceCount'); 74 | expect(analysis).to.have.property('paragraphCount'); 75 | expect(analysis).to.have.property('estimatedReadingTime'); 76 | expect(analysis).to.have.property('readingSpeed'); 77 | expect(analysis).to.have.property('tags'); 78 | }); 79 | }); 80 | 81 | describe('The addTags method', function(){ 82 | it('should add tags to an analysis object', function(){ 83 | var text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.' 84 | var analysis = textAnalysis.analyzeText(text); 85 | textAnalysis.addTags(analysis, ['test']); 86 | expect(analysis.tags).to.deep.equal(['test']); 87 | }); 88 | }); 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ####Getting Started 2 | Clone the repo: 3 | ``` 4 | git clone https://github.com/Syeoryn/textAnalysisSuite.git 5 | ``` 6 | Install the dependencies (none currently, unless you want to run the specs): 7 | ``` 8 | npm install 9 | ``` 10 | Try it out! 11 | ``` 12 | npm start 13 | ``` 14 | This will start a streaming analysis of text through stdin. Go ahead and put some sample text in the terminal window where this is running and test it out for yourself! 15 | The tryMeOut.js script is just a sample of what can be done with the packages in this suite. 
It defaults to building bigrams and does not save the TFIDF analysis to a persistent datastore, but feel free to tweak it and see what you can do. 16 | 17 | To run the tests, try: 18 | ``` 19 | npm test 20 | ``` 21 | 22 | ####This just in! 23 | We’re now live on npm! You can install the [nGrams](https://www.npmjs.org/package/word-ngrams) and [TFIDF](https://www.npmjs.org/package/document-tfidf) modules like this: 24 | ``` 25 | npm install word-ngrams 26 | npm install document-tfidf 27 | ``` 28 | Then all you need to do to use them is add a few lines of code to the beginning of your project! 29 | ``` 30 | var nGrams = require(‘word-ngrams’); 31 | var TFIDF = require(‘document-tfidf’); 32 | ``` 33 | 34 | 35 | ####Features: 36 | * Text analysis module: 37 | * countWords 38 | * countSentences 39 | * countParagraphs 40 | * estimateReadingTime 41 | * analyzeText 42 | * addTags 43 | * nGram analysis module: 44 | * buildNGrams 45 | * listAllNGrams 46 | * getNGramsByFrequency 47 | * getMostCommonNGrams 48 | * listNGramsByCount 49 | * TFIDF module: 50 | * countTermFrequencies 51 | * storeTermFrequencies 52 | * normalizeTermFrequencies 53 | * identifyUniqueTerms 54 | * fullTFIDFAnalysis 55 | 56 | ####Feature Descriptions: 57 | * Text Analysis 58 | * countWords: function(text) 59 | * Counts the total number of words in the input text. Assumes words are separated by any whitespace, commas (,), colons (:), semicolons (;), periods (.), question marks (?), exclamation marks (!), or ellipses (...), but not by apostrophes (‘) or hyphens (-). 60 | * Example: 61 | ``` 62 | countWords(“Hello, world! How’s the weather?”) // returns 5 63 | ``` 64 | * countSentences: function(text) 65 | * Counts the total number of sentences in the input text. Assumes sentences are separated by periods (.), question marks (?), exclamation marks(!), or ellipses (...). 66 | * Example: 67 | ``` 68 | countSentences(“Hello, world! 
How’s the weather?”) // returns 2 69 | ``` 70 | * countParagraphs: function(text) 71 | * Counts the total number of paragraphs in the input text. Assumes paragraphs are separated by new lines (\n). 72 | * Example: 73 | ``` 74 | countParagraphs(“Hello, world! How’s the weather?”) // returns 1 75 | ``` 76 | * estimateReadingTime: function(text [, readingSpeed]) 77 | * Estimates the time required to read the input text, returning the number of minutes required to read the text, rounding up to the nearest minute. Uses either the input readingSpeed (a number in words per minute) or the US average reading speed of 250 wpm. 78 | * Example: 79 | ``` 80 | estimateReadingTime(“Hello, world! How’s the weather?”, 300) // returns 1 81 | ``` 82 | * analyzeText: function(text [, options]) 83 | * Performs all the above text analyses, returning an object with the results of each analysis. 84 | * Options currently include readingSpeed and tags. 85 | * Example: 86 | ``` 87 | analyzeText(“Hello, world! How’s the weather?”, {readingSpeed: 300, tags: [‘test’, ‘HelloWorld’]}) 88 | // returns 89 | { 90 | wordCount: 5, 91 | sentenceCount: 2, 92 | paragraphCount: 1, 93 | estimatedReadingTime: 1, 94 | readingSpeed: 300, 95 | tags: [‘test’, ‘HelloWorld’] 96 | } 97 | ``` 98 | * addTags: function(analysis, tags) 99 | * Adds tags to the input analysis object 100 | 101 | * nGram Analysis 102 | * buildNGrams: function(text, unit [, options]) 103 | * Maps all nGrams within input text with input unit length (1=unigram, 2=bigram, 3=trigram, ...) 104 | * In constructing the nGram, terminal sentence punctuation (such as periods, question marks, and exclamation marks) and semicolons are considered words, as they also carry meaning. Apostrophes and compound word hyphens are ignored. To signify the end of a paragraph or body of text, null will be used. 105 | * Options include caseSensitive and includePunctuation. 
106 | * If includePunctuation is set to false, then terminal sentence punctuation and the end of the body of text are not included in the nGram. 107 | * Both caseSensitive and includePunctuation default to false. 108 | * Example: 109 | ``` 110 | buildNGrams(“Hello, World! How’s the world weather today? Hello, World!”, 2, {caseSensitive: true, includePunctuation: true}) 111 | // returns { Hello: { ,: 2 }, 112 | ,: { World: 2 }, 113 | World: { !: 2 }, 114 | !: { How’s: 1, null: 1}, 115 | How’s: { the: 1 }, 116 | the: { world: 1 }, 117 | world: { weather: 1 }, 118 | weather: { today: 1 }, 119 | today: { ?: 1 }, 120 | ?: { Hello: 1 } 121 | } 122 | ``` 123 | * listAllNGrams: function(nGrams) 124 | * Given an input set of nGrams (of the same format as the buildNGrams output), listAllNGrams will return a list of unique nGrams found in the text. 125 | * Example: 126 | ``` 127 | // Example input nGram for “Hello World. Goodbye World!”, without punctuation 128 | listAllNGrams({ Hello: { World: 1 }, Goodbye: { world: 1 }}) 129 | // returns [“hello world”, “goodbye world”] 130 | ``` 131 | * getNGramsByFrequency: function(nGrams, frequency) 132 | * Given an input set of nGrams (of the same format as the buildNGrams output), getNGramsByFrequency will return a list of all nGrams that occur that many times. 133 | * Example: 134 | ``` 135 | // Example input nGram for “Hello World”, without punctuation 136 | getNGramsByFrequency({ hello: { world: 1 } }, 1) 137 | // returns [“hello world”] 138 | ``` 139 | * getMostCommonNGrams: function(nGrams) 140 | * Given an input set of nGrams (of the same format as the buildNGrams output), getMostCommonNGrams will return a list of the most common nGrams. 141 | * Example: 142 | ``` 143 | // Example input nGram for “Hello World! 
Goodbye World!”, with punctuation 144 | getMostCommonNGrams({ Hello: { World: 1 }, World: { !: 2 }, !: { Goodbye: 1, null: 1 }, Goodbye: { world: 1 }}) 145 | // returns [“World!”] 146 | ``` 147 | * listNGramsByCount: function(nGrams) 148 | * Given an input set of nGrams (of the same format as the buildNGrams output), listNGramsByCount will return all nGrams sorted into buckets by count. 149 | * Example: 150 | ``` 151 | // Example input for “Hello, World! How’s the weather? Goodbye, World!” 152 | listNGramsByCount({ hello: 1, world: 2, “how’s”: 1, the: 1, weather: 1, goodbye: 1}) 153 | // returns { 1: [“hello”, “how’s”, “the”, “weather”, “goodbye”], 2: [“world”]} 154 | ``` 155 | 156 | * Term Frequency - Inverse Document Frequency (TFIDF) Module: 157 | * countTermFrequencies: function(text [, options]) 158 | * Counts the number of times each token appears in the input text. 159 | * Current options include tokenLength, which dictates the number of words that comprise each token. tokenLength defaults to 1. 160 | * Depends on nGrams module, which can get all tokens with arbitrary length. 161 | * storeTermFrequencies: function(tokenSet, TFStorage) 162 | * Adds the tokenSet to the TFStorage for improved analysis over time. 163 | * It’s recommended to save this collection in a persistent data store, although this is unnecessary. 164 | * If TFStorage is not provided, it will create it as an object and return that object. 165 | * normalizeTermFrequencies: function(tokenSet, TFStorage) 166 | * For each token in tokenSet, normalizeTermFrequencies will divide its count by the total number found in TFStorage and return the token set with normalized counts. 167 | * identifyUniqueTerms: function(normalizedTokenSet [, options]) 168 | * From the input normalizedTokenSet, identifyUniqueTerms will return the most unique tokens, as defined by the highest TFIDF. 169 | * Current options include uniqueThreshold. 
If specified, identifyUniqueTerms will return all terms with a TFIDF equal to or greater than the uniqueThreshold. 170 | * fullTFIDFAnalysis: function(text [, options]) 171 | * Completes all of the above TFIDF calculations. 172 | * Options correspond with the options for each piece of the analysis. 173 | --------------------------------------------------------------------------------