├── package.json
├── LICENSE
├── test
    └── test.js
├── README.md
└── glossary.js


/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "glossary",
 3 |     "description": "Term extraction module",
 4 |     "version": "0.1.1",
 5 |     "author": "Heather Arthur <fayearthur@gmail.com>",
 6 |     "repository": {
 7 |         "type": "git",
 8 |         "url": "http://github.com/harthur/glossary.git"
 9 |     },
10 |     "main": "./glossary",
11 |     "dependencies": {
12 |         "natural": ">=0.0.28",
13 |         "pos": "0.1.x",
14 |         "underscore": "1.1.x"
15 |     },
16 |     "keywords": ["term extraction", "keyword", "tag", "auto tag"]
17 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2011 Heather Arthur
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 
22 | 


--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
 1 | var glossary = require("../glossary"),
 2 |     assert = require("assert");
 3 | 
 4 | var string = "Patsy Cline \
 5 | born in Gore, Virginia, was an American country music singer \
 6 | who enjoyed pop music crossover success during the era of the Nashville sound in the early 1960s. \
 7 | Since her death in 1963 at age 30 in a private airplane crash, \
 8 | she has been considered one of the most influential, successful, \
 9 | and acclaimed female singers.";
10 | 
11 | assert.deepEqual(glossary.extract(string),["Patsy","Cline","Patsy Cline","Gore","Virginia","American","country","music","singer","American country music singer","pop","crossover","success","pop music crossover success","era","Nashville","sound","Nashville sound","death","age","airplane","crash","airplane crash","one"])
12 | 
13 | assert.deepEqual(glossary({ minFreq: 2 }).extract(string), ["music","singer"]);
14 | 
15 | assert.deepEqual(glossary({ collapse: true }).extract(string), ["Patsy Cline","Gore","Virginia","American country music singer","pop music crossover success","era","Nashville sound","death","age","airplane crash","one"]);
16 | 
17 | assert.deepEqual(glossary({ blacklist: ["singer", "one", "gore", "sound"]}).extract(string), ["Patsy","Cline","Patsy Cline","Virginia","American","country","music","pop","crossover","success","pop music crossover success","era","Nashville","death","age","airplane","crash","airplane crash"])
18 | 
19 | assert.deepEqual(glossary({ minFreq: 2, verbose: true }).extract(string), [{"count":2,"norm":"music","word":"music"},{"count":2,"norm":"singer","word":"singer"}]);
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # glossary
 2 | 
 3 | glossary is a JavaScript module that extracts keywords from text (aka "term extraction" or "auto tagging"). It takes a string of text and returns an array of terms that are relevant to the content:
 4 | 
 5 | ```javascript
 6 | var glossary = require("glossary");
 7 | 
 8 | var keywords = glossary.extract("Her cake shop is the best in the business");
 9 | 
10 | console.log(keywords)  // ["cake", "shop", "cake shop", "business"]
11 | ```
12 | 
13 | `glossary` is standalone and uses part-of-speech analysis to extract the relevant terms.
14 | 
15 | # install
16 | 
17 | For [node](http://nodejs.org) with [npm](http://npmjs.org):
18 | 
19 | ```bash
20 | npm install glossary
21 | ```
22 | 
23 | # API
24 | 
25 | #### blacklisting
26 | 
27 | Use `blacklist` to remove unwanted terms from any extraction:
28 | 
29 | ```javascript
30 | var glossary = require("glossary")({
31 |    blacklist: ["library", "script", "api", "function"]
32 | });
33 | 
34 | var keywords = glossary.extract("JavaScript color conversion library");
35 | 
36 | console.log(keywords); // ["color", "conversion"]
37 | ```
38 | 
39 | #### minimum frequency
40 | 
41 | Use `minFreq` to limit the terms to only those that occur with a certain frequency:
42 | 
43 | ```javascript
44 | var glossary = require("glossary")({ minFreq: 2 });
45 | 
46 | var keywords = glossary.extract("Kasey's pears are the best pears in Canada");
47 | 
48 | console.log(keywords); // ["pears"]
49 | ```
50 | 
51 | #### sub-terms
52 | 
53 | Use `collapse` to remove terms that are sub-terms of other terms:
54 | 
55 | ```javascript
56 | var glossary = require("glossary")({ collapse: true });
57 | 
58 | var keywords = glossary.extract("The Middle East crisis is getting worse");
59 | 
60 | console.log(keywords); // ["Middle East crisis"]
61 | ```
62 | 
63 | #### verbose output
64 | 
65 | Use `verbose` to also get the count of each term:
66 | 
67 | ```javascript
68 | var glossary = require("glossary")({ verbose: true });
69 | 
70 | var keywords = glossary.extract("The pears from the farm are good");
71 | 
72 | console.log(keywords); // [ { count: 1, norm: 'pear', word: 'pears' }, { count: 1, norm: 'farm', word: 'farm' } ]
73 | ```
74 | 
75 | # propers
76 | 
77 | `glossary` uses [jspos](http://code.google.com/p/jspos/) for POS tagging. It's inspired by the Python module [topia.termextract](http://pypi.python.org/pypi/topia.termextract/).
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/glossary.js:
--------------------------------------------------------------------------------
  1 | var _ = require("underscore"),
  2 |     pos = require("pos"),
  3 |     natural = require("natural"),
  4 |     inflector = new natural.NounInflector();
  5 | 
  6 | function normalize(word) {
  7 |    return inflector.singularize(word);
  8 | }
  9 | 
 10 | function Glossary(opts) {
 11 |    this.opts = _(opts || {}).defaults({
 12 |       minFreq: 1,
 13 |       collapse: false,
 14 |       blacklist: [],
 15 |       verbose: false
 16 |    });
 17 | }
 18 | 
 19 | Glossary.prototype.extract = function(text) {
 20 |    var words = new pos.Lexer().lex(text);
 21 |    var tags = new pos.Tagger().tag(words);
 22 | 
 23 |    var terms = {};
 24 |    var multiterm = [];
 25 | 
 26 |    function add(word) {
 27 |      var norm = normalize(word);
 28 |      multiterm.push(word);
 29 | 
 30 |      terms[norm] = terms[norm] || {
 31 |         count: 0,
 32 |         norm: norm,
 33 |         word: word
 34 |      };
 35 |      terms[norm].count++;
 36 |    }
 37 | 
 38 |    var searching = true;
 39 | 
 40 |    for (var i = 0; i < tags.length; i++) {
 41 |       var word = tags[i][0],
 42 |           tag = tags[i][1];
 43 | 
 44 |       var isNoun = tag.indexOf('N') == 0,
 45 |           isAdj = tag == "JJ";
 46 | 
 47 |       if (searching && (isNoun || (isAdj
 48 |             && word[0].match(/[A-Z]/)))) {
 49 |          searching = false;
 50 |          add(word);
 51 |       }
 52 |       else if (!searching && isNoun) {
 53 |          add(word);
 54 |       }
 55 |       else if (!searching && !isNoun) {
 56 |          searching = true;
 57 |          if (multiterm.length > 1) {
 58 |             add(multiterm.join(" "));
 59 |          }
 60 |          multiterm = [];
 61 |       }
 62 |    }
 63 | 
 64 |    var opts = this.opts;
 65 |    var terms =  _(terms).select(function(term) {
 66 |       return term.count >= opts.minFreq;
 67 |    });
 68 | 
 69 |   if (opts.collapse) {
 70 |      terms = _(terms).reject(function(term) {
 71 |         return _(terms).any(function(term2) {
 72 |            return term.word != term2.word
 73 |               && term2.norm.indexOf(term.norm) >= 0;
 74 |         })
 75 |      });
 76 |   }
 77 |   
 78 |   if (opts.blacklist) {
 79 |      terms = _(terms).reject(function(term) {
 80 |         return _(opts.blacklist).any(function(black) {
 81 |            return term.norm.toLowerCase().indexOf(normalize(black).toLowerCase()) >= 0;
 82 |         }) 
 83 |      })
 84 |   }
 85 | 
 86 |   if (!this.opts.verbose) {
 87 |      terms = _(terms).pluck("word");
 88 |   }
 89 | 
 90 |   return terms;
 91 | }
 92 | 
 93 | var createGlossary = function(opts) {
 94 |    return new Glossary(opts);
 95 | }
 96 | 
 97 | var glossary = createGlossary();
 98 | 
 99 | createGlossary.extract = _(glossary.extract).bind(glossary);
100 | 
101 | module.exports = createGlossary;
102 | 


--------------------------------------------------------------------------------