├── package.json ├── LICENSE ├── test └── test.js ├── README.md └── glossary.js /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "glossary", 3 | "description": "Term extraction module", 4 | "version": "0.1.1", 5 | "author": "Heather Arthur ", 6 | "repository": { 7 | "type": "git", 8 | "url": "http://github.com/harthur/glossary.git" 9 | }, 10 | "main": "./glossary", 11 | "dependencies": { 12 | "natural": ">=0.0.28", 13 | "pos": "0.1.x", 14 | "underscore": "1.1.x" 15 | }, 16 | "keywords": ["term extraction", "keyword", "tag", "auto tag"] 17 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 Heather Arthur 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
var glossary = require("../glossary"),
    assert = require("assert");

// Sample passage shared by every assertion below. The trailing backslashes
// are line continuations, so the literal contains no newline characters.
var passage = "Patsy Cline \
born in Gore, Virginia, was an American country music singer \
who enjoyed pop music crossover success during the era of the Nashville sound in the early 1960s. \
Since her death in 1963 at age 30 in a private airplane crash, \
she has been considered one of the most influential, successful, \
and acclaimed female singers.";

/**
 * Run `extractor.extract` over the sample passage and compare the result
 * with `expected` using assert.deepEqual.
 *
 * @param {Object|Function} extractor - anything exposing an `extract` method
 * @param {Array} expected - the terms the extraction must produce, in order
 */
function expectTerms(extractor, expected) {
  assert.deepEqual(extractor.extract(passage), expected);
}

// Default extractor exposed directly on the module export.
expectTerms(glossary, ["Patsy","Cline","Patsy Cline","Gore","Virginia","American","country","music","singer","American country music singer","pop","crossover","success","pop music crossover success","era","Nashville","sound","Nashville sound","death","age","airplane","crash","airplane crash","one"]);

// minFreq: only terms counted at least twice.
expectTerms(glossary({ minFreq: 2 }), ["music","singer"]);

// collapse: terms contained in a longer term are dropped.
expectTerms(glossary({ collapse: true }), ["Patsy Cline","Gore","Virginia","American country music singer","pop music crossover success","era","Nashville sound","death","age","airplane crash","one"]);

// blacklist: any term containing a blacklisted word is dropped.
expectTerms(glossary({ blacklist: ["singer", "one", "gore", "sound"]}), ["Patsy","Cline","Patsy Cline","Virginia","American","country","music","pop","crossover","success","pop music crossover success","era","Nashville","death","age","airplane","crash","airplane crash"]);

// verbose: full term records instead of bare words.
expectTerms(glossary({ minFreq: 2, verbose: true }), [{"count":2,"norm":"music","word":"music"},{"count":2,"norm":"singer","word":"singer"}]);
JavaScript module that extracts keywords from text (aka "term extraction" or "auto tagging"). It takes a string of text and returns an array of terms that are relevant to the content: 4 | 5 | ```javascript 6 | var glossary = require("glossary"); 7 | 8 | var keywords = glossary.extract("Her cake shop is the best in the business"); 9 | 10 | console.log(keywords) // ["cake", "shop", "cake shop", "business"] 11 | ``` 12 | 13 | `glossary` is standalone and uses part-of-speech analysis to extract the relevant terms. 14 | 15 | # install 16 | 17 | For [node](http://nodejs.org) with [npm](http://npmjs.org): 18 | 19 | ```bash 20 | npm install glossary 21 | ``` 22 | 23 | # API 24 | 25 | #### blacklisting 26 | 27 | Use `blacklist` to remove unwanted terms from any extraction: 28 | 29 | ```javascript 30 | var glossary = require("glossary")({ 31 | blacklist: ["library", "script", "api", "function"] 32 | }); 33 | 34 | var keywords = glossary.extract("JavaScript color conversion library"); 35 | 36 | console.log(keywords); // ["color", "conversion"] 37 | ``` 38 | 39 | #### minimum frequency 40 | 41 | Use `minFreq` to limit the terms to only those that occur with a certain frequency: 42 | 43 | ```javascript 44 | var glossary = require("glossary")({ minFreq: 2 }); 45 | 46 | var keywords = glossary.extract("Kasey's pears are the best pears in Canada"); 47 | 48 | console.log(keywords); // ["pears"] 49 | ``` 50 | 51 | #### sub-terms 52 | 53 | Use `collapse` to remove terms that are sub-terms of other terms: 54 | 55 | ```javascript 56 | var glossary = require("glossary")({ collapse: true }); 57 | 58 | var keywords = glossary.extract("The Middle East crisis is getting worse"); 59 | 60 | console.log(keywords); // ["Middle East crisis"] 61 | ``` 62 | 63 | #### verbose output 64 | 65 | Use `verbose` to also get the count of each term: 66 | 67 | ```javascript 68 | var glossary = require("glossary")({ verbose: true }); 69 | 70 | var keywords = glossary.extract("The pears from the farm are 
var _ = require("underscore"),
    pos = require("pos"),
    natural = require("natural"),
    inflector = new natural.NounInflector();

/**
 * Map a word to the key under which its occurrences are counted.
 * Delegates to the noun inflector's `singularize`, so singular and plural
 * spellings of a noun are expected to share one key.
 *
 * @param {string} word
 * @returns {string} the normalized form of `word`
 */
function normalize(word) {
  return inflector.singularize(word);
}

/**
 * Term extractor.
 *
 * @param {Object} [opts]
 * @param {number} [opts.minFreq=1] - minimum count a term needs to be kept
 * @param {boolean} [opts.collapse=false] - drop terms contained in another term
 * @param {Array} [opts.blacklist=[]] - words whose presence removes a term
 * @param {boolean} [opts.verbose=false] - return term records instead of words
 */
function Glossary(opts) {
  // _.defaults writes into its first argument, so work on a copy to leave
  // the caller's options object untouched.
  this.opts = _.defaults(_.extend({}, opts || {}), {
    minFreq: 1,
    collapse: false,
    blacklist: [],
    verbose: false
  });
}

/**
 * Extract terms from `text`.
 *
 * Tokens are tagged with `pos`; each run of consecutive noun tokens (tags
 * starting with "N"), optionally opened by a capitalized "JJ" token, yields
 * every single word in the run plus, when the run has more than one word,
 * the whole run joined by spaces.
 *
 * @param {string} text
 * @returns {Array} words in order of first appearance, or
 *   `{count, norm, word}` records when the `verbose` option is set;
 *   an empty array for null, undefined or empty input
 */
Glossary.prototype.extract = function(text) {
  if (text == null || text === "") {
    return [];
  }

  var words = new pos.Lexer().lex(text);
  var tags = new pos.Tagger().tag(words);

  // Prototype-less map: a normalized word such as "constructor" must not be
  // mistaken for an already-recorded term via Object.prototype.
  var terms = Object.create(null);
  var multiterm = [];

  // Count one occurrence of `word` and remember it as part of the current run.
  function add(word) {
    var norm = normalize(word);
    multiterm.push(word);

    if (!terms[norm]) {
      terms[norm] = {
        count: 0,
        norm: norm,
        word: word
      };
    }
    terms[norm].count++;
  }

  // Close the current run, recording it as a term of its own if it spans
  // more than one word.
  function flush() {
    if (multiterm.length > 1) {
      add(multiterm.join(" "));
    }
    multiterm = [];
  }

  var searching = true;

  for (var i = 0; i < tags.length; i++) {
    var word = tags[i][0],
        tag = tags[i][1];

    var isNoun = tag.indexOf('N') === 0,
        isCapitalizedAdj = tag === "JJ" && /^[A-Z]/.test(word);

    // A capitalized adjective may open a run but never extends one.
    if (isNoun || (searching && isCapitalizedAdj)) {
      searching = false;
      add(word);
    }
    else if (!searching) {
      searching = true;
      flush();
    }
  }

  // A run that reaches the end of the input has no following token to close
  // it, so close it here; otherwise its multi-word term would be lost.
  flush();

  var opts = this.opts;
  var selected = _(terms).select(function(term) {
    return term.count >= opts.minFreq;
  });

  if (opts.collapse) {
    // Compare against the full pre-collapse list, not the shrinking result.
    var candidates = selected;
    selected = _(candidates).reject(function(term) {
      return _(candidates).any(function(other) {
        return term.word !== other.word
          && other.norm.indexOf(term.norm) >= 0;
      });
    });
  }

  if (opts.blacklist) {
    // Normalize each blacklist entry once instead of once per term.
    var banned = _(opts.blacklist).map(function(black) {
      return normalize(black).toLowerCase();
    });
    selected = _(selected).reject(function(term) {
      var norm = term.norm.toLowerCase();
      return _(banned).any(function(black) {
        return norm.indexOf(black) >= 0;
      });
    });
  }

  if (!opts.verbose) {
    selected = _(selected).pluck("word");
  }

  return selected;
};

/**
 * Build an extractor configured with `opts`.
 *
 * @param {Object} [opts] - see Glossary
 * @returns {Glossary}
 */
var createGlossary = function(opts) {
  return new Glossary(opts);
};

// Default-configured instance backing the `extract` shortcut on the export.
var glossary = createGlossary();

createGlossary.extract = _(glossary.extract).bind(glossary);

module.exports = createGlossary;