├── .gitignore ├── LICENSE ├── test ├── fixtures │ ├── classifier-with-limit.json │ ├── classifier-without-limit.json │ └── classifier-limit.json ├── base.js └── limit.js ├── generate.js ├── package.json ├── README.md └── src └── naivebayes.js /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | 6 | # Runtime data 7 | pids 8 | *.pid 9 | *.seed 10 | 11 | # Directory for instrumented libs generated by jscoverage/JSCover 12 | lib-cov 13 | 14 | # Coverage directory used by tools like istanbul 15 | coverage 16 | 17 | # nyc test coverage 18 | .nyc_output 19 | 20 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 21 | .grunt 22 | 23 | # node-waf configuration 24 | .lock-wscript 25 | 26 | # Compiled binary addons (http://nodejs.org/api/addons.html) 27 | build/Release 28 | 29 | # Dependency directories 30 | node_modules 31 | jspm_packages 32 | 33 | # Optional npm cache directory 34 | .npm 35 | 36 | # Optional REPL history 37 | .node_repl_history 38 | 39 | test/fixtures/ham 40 | test/fixtures/spam 41 | test.js 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Surmon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /test/fixtures/classifier-with-limit.json: -------------------------------------------------------------------------------- 1 | {"categories":["positive","negative","foul"],"docCount":{"positive":5,"negative":5,"foul":6},"totalDocuments":16,"vocabulary":["","","This","","amazing","is","Get","Go","to","hell","awesome","movie","Yeah","Oh","boy","Sweet","this","is","incredibly","perfect","great","Do","one","thing","at","a","time","and","do","well","Never","forget","to","say","thanks","Believe","in","yourself","terrible","crappy","thing","Dang","Stinks","ugh","bad","annoying","No","why","dumb","Are","you","serious","sucks","I","don","t","want","to","be","here","out","Beat","it","lost","You","SOB","(son","of","a)","","SOG","(son","of","Gun)","","Damn","you",""],"wordCount":{"positive":35,"negative":30,"foul":29},"wordFrequencyCount":{"positive":{"":5,"amazing":2,"awesome":1,"movie":1,"Yeah":1,"Oh":1,"boy":1,"Sweet":1,"this":1,"is":1,"incredibly":1,"perfect":1,"great":1,"Do":1,"one":1,"thing":1,"at":1,"a":1,"time":1,"and":1,"do":1,"well":1,"Never":1,"forget":1,"to":1,"say":1,"thanks":1,"Believe":1,"in":1,"yourself":1},"negative":{"":3,"This":3,"is":2,"terrible":1,"crappy":1,"thing":1,"Dang":1,"Stinks":1,"ugh":1,"bad":1,"annoying":1,"No":1,"why":1,"dumb":1,"Are":1,"you":1,"serious":1,"sucks":1,"I":1,"don":1,"t":1,"want":1,"to":1,"be":1,"here":1},"foul":{"":6,"Get":2,"Go":2,"to":2,"hell":2,"out":1,
"Beat":1,"it":1,"lost":1,"You":1,"SOB":1,"(son":2,"of":2,"a)":1,"SOG":1,"Gun)":1,"Damn":1,"you":1}},"options":{"vocabularyLimit":80}} 2 | -------------------------------------------------------------------------------- /test/fixtures/classifier-without-limit.json: -------------------------------------------------------------------------------- 1 | {"categories":["positive","negative","foul"],"docCount":{"positive":5,"negative":5,"foul":6},"totalDocuments":16,"vocabulary":["amazing","awesome","movie","Yeah","Oh","boy","","Sweet","this","is","incredibly","amazing","perfect","great","","Do","one","thing","at","a","time","and","do","well","","Never","forget","to","say","thanks","","Believe","in","yourself","","terrible","crappy","thing","Dang","Stinks","","ugh","bad","This","is","annoying","","No","why","This","is","dumb","Are","you","serious","This","sucks","","I","don","t","want","to","be","here","Get","out","Beat","it","lost","","Go","to","hell","the","devil","","Oh","hell","s","bells","","You","SOB","(son","of","a)","","SOG","(son","of","Gun)","","Damn","you",""],"wordCount":{"positive":35,"negative":30,"foul":34},"wordFrequencyCount":{"positive":{"amazing":2,"awesome":1,"movie":1,"Yeah":1,"Oh":1,"boy":1,"":5,"Sweet":1,"this":1,"is":1,"incredibly":1,"perfect":1,"great":1,"Do":1,"one":1,"thing":1,"at":1,"a":1,"time":1,"and":1,"do":1,"well":1,"Never":1,"forget":1,"to":1,"say":1,"thanks":1,"Believe":1,"in":1,"yourself":1},"negative":{"terrible":1,"crappy":1,"thing":1,"Dang":1,"Stinks":1,"":3,"ugh":1,"bad":1,"This":3,"is":2,"annoying":1,"No":1,"why":1,"dumb":1,"Are":1,"you":1,"serious":1,"sucks":1,"I":1,"don":1,"t":1,"want":1,"to":1,"be":1,"here":1},"foul":{"Get":2,"out":1,"Beat":1,"it":1,"lost":1,"":6,"Go":2,"to":2,"hell":2,"the":1,"devil":1,"Oh":1,"s":1,"bells":1,"You":1,"SOB":1,"(son":2,"of":2,"a)":1,"SOG":1,"Gun)":1,"Damn":1,"you":1}}} 2 | -------------------------------------------------------------------------------- /generate.js: 
-------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const path = require('path'); 3 | const { promisify } = require('util'); 4 | const { readDirDeep } = require('read-dir-deep'); 5 | const { simpleParser } = require('mailparser'); 6 | 7 | const NaiveBayes = require('.'); 8 | const classifier = new NaiveBayes({ vocabularyLimit: 300, stopwords: true }); 9 | 10 | const readFile = promisify(fs.readFile); 11 | const writeFile = promisify(fs.writeFile); 12 | 13 | const FIXTURES_PATH = path.join(process.cwd(), 'test', 'fixtures'); 14 | const HAM_PATH = path.join(FIXTURES_PATH, 'ham'); 15 | const SPAM_PATH = path.join(FIXTURES_PATH, 'spam'); 16 | 17 | async function getEmailFromSource(filepath) { 18 | const sourceFiles = await readDirDeep(filepath); 19 | const files = await Promise.all(sourceFiles.map((file) => readFile(file))); 20 | const emails = await Promise.all(files.map((file) => simpleParser(file))); 21 | const texts = emails.map((email) => email.text); 22 | return texts; 23 | } 24 | 25 | (async () => { 26 | const hamEmails = await getEmailFromSource(HAM_PATH); 27 | console.log('got ham emails'); 28 | const spamEmails = await getEmailFromSource(SPAM_PATH); 29 | console.log('got spam emails'); 30 | 31 | for (const text of hamEmails) { 32 | if (text) { 33 | classifier.learn(text.trim(), 'ham'); 34 | } 35 | } 36 | 37 | for (const text of spamEmails) { 38 | if (text) { 39 | classifier.learn(text.trim(), 'spam'); 40 | } 41 | } 42 | 43 | await writeFile( 44 | path.join(FIXTURES_PATH, 'classifier-limit.json'), 45 | classifier.toJson() 46 | ); 47 | })(); 48 | -------------------------------------------------------------------------------- /test/base.js: -------------------------------------------------------------------------------- 1 | const test = require('ava'); 2 | const NaiveBayes = require('../src/naivebayes.js'); 3 | const classifier = new NaiveBayes(); 4 | 5 | function decode(text) { 6 | return 
Buffer.from(text, 'base64').toString(); 7 | } 8 | 9 | test('naivebayes', (t) => { 10 | classifier.learn('amazing, awesome movie!! Yeah!! Oh boy.', 'positive'); 11 | classifier.learn( 12 | 'Sweet, this is incredibly, amazing, perfect, great!!', 13 | 'positive' 14 | ); 15 | classifier.learn('Do one thing at a time, and do well.', 'positive'); 16 | classifier.learn('Never forget to say “thanks”.', 'positive'); 17 | classifier.learn('Believe in yourself.', 'positive'); 18 | 19 | classifier.learn('terrible, crappy thing. Dang. Stinks!!', 'negative'); 20 | classifier.learn('ugh, bad. This is annoying.', 'negative'); 21 | classifier.learn('crud, this sucks', 'negative'); 22 | classifier.learn('awful, no way', 'negative'); 23 | 24 | classifier.learn(decode('R2V0IG91dCAhQmVhdCBpdCEgR2V0IGxvc3Qh'), 'foul'); 25 | classifier.learn(decode('R28gdG8gaGVsbCEgR28gdG8gdGhlIGRldmlsIQ=='), 'foul'); 26 | classifier.learn(decode('T2gsIGhlbGwncyBiZWxscyE='), 'foul'); 27 | classifier.learn(decode('WW91IFNPQiAoc29uIG9mIGEpIQ=='), 'foul'); 28 | classifier.learn(decode('U09HIChzb24gb2YgR3VuKSE='), 'foul'); 29 | classifier.learn(decode('RGFtbiB5b3Uh'), 'foul'); 30 | 31 | const classifierJson = classifier.toJson(); 32 | const classifierJsonObject = classifier.toJsonObject(); 33 | t.is(typeof classifierJson, 'string'); 34 | t.is(typeof classifierJsonObject, 'object'); 35 | t.deepEqual(classifierJsonObject.categories, [ 36 | 'positive', 37 | 'negative', 38 | 'foul' 39 | ]); 40 | }); 41 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@ladjs/naivebayes", 3 | "description": "Naive Bayes Classifier for JavaScript.", 4 | "version": "0.0.4", 5 | "author": { 6 | "name": "Shaun Warman", 7 | "url": "https://github.com/ladjs" 8 | }, 9 | "bugs": { 10 | "url": "https://github.com/ladjs/naivebayes/issues", 11 | "email": "shaunwarman1@gmail.com" 12 | }, 13 | 
"commitlint": { 14 | "extends": [ 15 | "@commitlint/config-conventional" 16 | ] 17 | }, 18 | "contributors": [ 19 | "Surmon (http://surmon.me/)", 20 | "Shaun Warman (https://shaunwarman.com/)" 21 | ], 22 | "dependencies": { 23 | "debug": "^4.1.1", 24 | "stopword": "^1.0.1" 25 | }, 26 | "devDependencies": { 27 | "@commitlint/cli": "latest", 28 | "@commitlint/config-conventional": "latest", 29 | "ava": "latest", 30 | "codecov": "latest", 31 | "cross-env": "latest", 32 | "eslint": "latest", 33 | "eslint-config-xo-lass": "latest", 34 | "fixpack": "latest", 35 | "husky": "latest", 36 | "lint-staged": "latest", 37 | "mailparser": "^2.7.7", 38 | "nyc": "latest", 39 | "read-dir-deep": "^7.0.1", 40 | "remark-cli": "latest", 41 | "remark-preset-github": "latest", 42 | "xo": "latest" 43 | }, 44 | "files": [ 45 | "src" 46 | ], 47 | "husky": { 48 | "hooks": { 49 | "pre-commit": "lint-staged && npm test", 50 | "commit-msg": "commitlint -E HUSKY_GIT_PARAMS" 51 | } 52 | }, 53 | "license": "MIT", 54 | "lint-staged": { 55 | "*.js": [ 56 | "xo --fix", 57 | "git add" 58 | ], 59 | "*.md": [ 60 | "remark . -qfo", 61 | "git add" 62 | ], 63 | "package.json": [ 64 | "fixpack", 65 | "git add" 66 | ] 67 | }, 68 | "main": "src/naivebayes.js", 69 | "prettier": { 70 | "singleQuote": true, 71 | "bracketSpacing": true, 72 | "trailingComma": "none" 73 | }, 74 | "private": false, 75 | "publishConfig": { 76 | "access": "public" 77 | }, 78 | "remarkConfig": { 79 | "plugins": [ 80 | "preset-github" 81 | ] 82 | }, 83 | "repository": { 84 | "type": "git", 85 | "url": "https://github.com/ladjs/naivebayes" 86 | }, 87 | "scripts": { 88 | "ava": "cross-env NODE_ENV=test ava", 89 | "coverage": "nyc report --reporter=text-lcov > coverage.lcov && codecov", 90 | "lint": "xo && remark . 
-qfo", 91 | "lint:fix": "xo --fix", 92 | "nyc": "cross-env NODE_ENV=test nyc ava", 93 | "test": "yarn run lint && yarn run ava", 94 | "test-coverage": "yarn run lint && yarn run nyc" 95 | }, 96 | "xo": { 97 | "prettier": true, 98 | "space": true, 99 | "extends": [ 100 | "xo-lass" 101 | ] 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /test/limit.js: -------------------------------------------------------------------------------- 1 | const test = require('ava'); 2 | const NaiveBayes = require('../src/naivebayes.js'); 3 | 4 | function decode(text) { 5 | return Buffer.from(text, 'base64').toString(); 6 | } 7 | 8 | test('naivebayes with limit', (t) => { 9 | const classifier = new NaiveBayes({ vocabularyLimit: 10 }); 10 | 11 | classifier.learn('amazing, awesome movie!! Yeah!! Oh boy.', 'positive'); 12 | classifier.learn( 13 | 'Sweet, this is incredibly, amazing, perfect, great!!', 14 | 'positive' 15 | ); 16 | classifier.learn('Do one thing at a time, and do well.', 'positive'); 17 | classifier.learn('Never forget to say “thanks”.', 'positive'); 18 | classifier.learn('Believe in yourself.', 'positive'); 19 | 20 | classifier.learn('terrible, crappy thing. Dang. Stinks!!', 'negative'); 21 | classifier.learn('ugh, bad. This is annoying.', 'negative'); 22 | classifier.learn('No, why. This is dumb', 'negative'); 23 | classifier.learn('Are you serious? 
This sucks!', 'negative'); 24 | classifier.learn("I don't want to be here", 'negative'); 25 | 26 | classifier.learn(decode('R2V0IG91dCAhQmVhdCBpdCEgR2V0IGxvc3Qh'), 'foul'); 27 | classifier.learn(decode('R28gdG8gaGVsbCEgR28gdG8gdGhlIGRldmlsIQ=='), 'foul'); 28 | classifier.learn(decode('T2gsIGhlbGwncyBiZWxscyE='), 'foul'); 29 | classifier.learn(decode('WW91IFNPQiAoc29uIG9mIGEpIQ=='), 'foul'); 30 | classifier.learn(decode('U09HIChzb24gb2YgR3VuKSE='), 'foul'); 31 | classifier.learn(decode('RGFtbiB5b3Uh'), 'foul'); 32 | 33 | const pFoul = classifier.categorize(decode('R2V0IGxvc3QgeW91IFNPQg==')); 34 | t.is(pFoul, 'foul'); 35 | 36 | const pNegative = classifier.categorize('Oh no that is crappy'); 37 | t.is(pNegative, 'negative'); 38 | 39 | const pPositive = classifier.categorize('Sweet that was awesome'); 40 | t.is(pPositive, 'positive'); 41 | 42 | const classifierJson = classifier.toJson(); 43 | const classifierJsonObject = classifier.toJsonObject(); 44 | t.is(typeof classifierJson, 'string'); 45 | t.is(typeof classifierJsonObject, 'object'); 46 | t.deepEqual(classifierJsonObject.categories, [ 47 | 'positive', 48 | 'negative', 49 | 'foul' 50 | ]); 51 | }); 52 | 53 | test('naivebayes from json with implicit limit', (t) => { 54 | const json = require('./fixtures/classifier-with-limit'); 55 | const classifier = NaiveBayes.fromJson(json); 56 | t.is(classifier.vocabularyLimit, 80); 57 | 58 | const pFoul = classifier.categorize(decode('WW91IGdldCBvdXQh')); 59 | t.is(pFoul, 'foul'); 60 | 61 | const pNegative = classifier.categorize('Oh no that is crappy'); 62 | t.is(pNegative, 'negative'); 63 | 64 | const pPositive = classifier.categorize('Sweet that was awesome'); 65 | t.is(pPositive, 'positive'); 66 | 67 | const state = classifier.toJsonObject(); 68 | 69 | t.true(state.vocabulary.length <= 80); 70 | }); 71 | 72 | test('naivebayes from json with explicit limit', (t) => { 73 | const json = require('./fixtures/classifier-with-limit'); 74 | const classifier = 
NaiveBayes.fromJson(json, 80); 75 | t.is(classifier.vocabularyLimit, 80); 76 | 77 | classifier.learn(decode('WW91IGdldCBvdXQh'), 'foul'); 78 | 79 | const pFoul = classifier.categorize(decode('WW91IGdldCBvdXQh')); 80 | t.is(pFoul, 'foul'); 81 | 82 | const pNegative = classifier.categorize('Oh no that is crappy'); 83 | t.is(pNegative, 'negative'); 84 | 85 | const pPositive = classifier.categorize('Sweet that was awesome'); 86 | t.is(pPositive, 'positive'); 87 | 88 | const state = classifier.toJsonObject(); 89 | 90 | t.true(state.vocabulary.length <= 80); 91 | }); 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [**@ladjs/naivebayes**](https://github.com/ladjs/naivebayes) 2 | 3 | [![build status](https://img.shields.io/travis/com/shaunwarman/naivebayes.svg)](https://travis-ci.com/shaunwarman/naivebayes) 4 | [![code coverage](https://img.shields.io/codecov/c/github/shaunwarman/naivebayes.svg)](https://codecov.io/gh/shaunwarman/naivebayes) 5 | [![code style](https://img.shields.io/badge/code_style-XO-5ed9c7.svg)](https://github.com/sindresorhus/xo) 6 | [![styled with prettier](https://img.shields.io/badge/styled_with-prettier-ff69b4.svg)](https://github.com/prettier/prettier) 7 | [![made with lass](https://img.shields.io/badge/made_with-lass-95CC28.svg)](https://lass.js.org) 8 | [![npm downloads](https://img.shields.io/npm/dt/@ladjs/naivebayes.svg)](https://npm.im/@ladjs/naivebayes) 9 | 10 | > A ladjs naivebayes package forked from surmon-china/naivebayes 11 | 12 | 13 | ## Table of Contents 14 | 15 | * [What can I use this for](#what-can-i-use-this-for) 16 | * [Install](#install) 17 | * [npm](#npm) 18 | * [yarn](#yarn) 19 | * [Usage](#usage) 20 | * [API](#api) 21 | * [Class](#class) 22 | * [Learn](#learn) 23 | * [Probabilities](#probabilities) 24 | * [Categorize](#categorize) 25 | * [ToJson](#tojson) 26 | * [ToJsonObject](#tojsonobject) 
27 | * [FromJson](#fromjson) 28 | * [Debug](#debug) 29 | * [Contributors](#contributors) 30 | 31 | 32 | ## What can I use this for 33 | 34 | Naive-Bayes classifier for JavaScript. 35 | 36 | `naivebayes` takes a document (piece of text), and tells you what category that document belongs to. 37 | 38 | You can use this for categorizing any text content into any arbitrary set of **categories**. For example: 39 | 40 | * Is an email **spam**, or **not spam** ? 41 | * Is a news article about **technology**, **politics**, or **sports** ? 42 | * Is a piece of text expressing **positive** emotions, or **negative** emotions? 43 | 44 | 45 | ## Install 46 | 47 | ### npm 48 | 49 | ```sh 50 | npm install @ladjs/naivebayes 51 | ``` 52 | 53 | ### yarn 54 | 55 | ```sh 56 | yarn add @ladjs/naivebayes 57 | ``` 58 | 59 | 60 | ## Usage 61 | 62 | ```javascript 63 | const NaiveBayes = require('@ladjs/naivebayes') 64 | 65 | const classifier = new NaiveBayes() 66 | 67 | // teach it positive phrases 68 | classifier.learn('amazing, awesome movie!! Yeah!! Oh boy.', 'positive') 69 | classifier.learn('Sweet, this is incredibly, amazing, perfect, great!!', 'positive') 70 | 71 | // teach it a negative phrase 72 | classifier.learn('terrible, cruddy thing. Damn. Sucks!!', 'negative') 73 | 74 | // now ask it to categorize a document it has never seen before 75 | classifier.categorize('awesome, cool, amazing!! Yay.') 76 | // => 'positive' 77 | 78 | // serialize the classifier's state as a JSON string. 79 | const stateJson = classifier.toJson() 80 | 81 | // load the classifier back from its JSON representation. 
82 | const revivedClassifier = NaiveBayes.fromJson(stateJson) 83 | 84 | ``` 85 | 86 | ```javascript 87 | const NaiveBayes = require('@ladjs/naivebayes') 88 | 89 | const Segment = require('segment') 90 | const segment = new Segment() 91 | 92 | segment.useDefault() 93 | 94 | const classifier = new NaiveBayes({ 95 | 96 | tokenizer(sentence) { 97 | 98 | const sanitized = sentence.replace(/[^(a-zA-Z\u4e00-\u9fa50-9_)+\s]/g, ' ') 99 | 100 | return segment.doSegment(sanitized, { simple: true }) 101 | } 102 | }) 103 | ``` 104 | 105 | 106 | ## API 107 | 108 | ### Class 109 | 110 | ```javascript 111 | const classifier = new NaiveBayes([options]) 112 | ``` 113 | 114 | Returns an instance of a Naive-Bayes Classifier. 115 | 116 | #### Options 117 | 118 | * `tokenizer(text)` - (type: `function`) - Configure your own tokenizer. 119 | * `vocabularyLimit` - (type: `number` default: 0) - Reference a max word count where `0` is the default, meaning no limit. 120 | * `stopwords` - (type: `boolean` default: false) - To remove [stopwords](https://en.wikipedia.org/wiki/Stop_words) from text 121 | 122 | Eg. 123 | 124 | ```javascript 125 | const classifier = new NaiveBayes({ 126 | tokenizer(text) { 127 | return text.split(' ') 128 | } 129 | }) 130 | ``` 131 | 132 | ### Learn 133 | 134 | ```javascript 135 | classifier.learn(text, category) 136 | ``` 137 | 138 | Teach your classifier what `category` the `text` belongs to. The more you teach your classifier, the more reliable it becomes. It will use what it has learned to identify new documents that it hasn't seen before. 139 | 140 | ### Probabilities 141 | 142 | ```javascript 143 | classifier.probabilities(text) 144 | ``` 145 | 146 | Returns an array of `{ category, probability }` objects with probability calculated for each category. Its judgement is based on what you have taught it with `.learn()`. 
147 | 148 | ### Categorize 149 | 150 | ```javascript 151 | classifier.categorize(text, [probability]) 152 | ``` 153 | 154 | Returns the `category` it thinks `text` belongs to. Its judgement is based on what you have taught it with `.learn()`. 155 | 156 | ### ToJson 157 | 158 | ```javascript 159 | classifier.toJson() 160 | ``` 161 | 162 | Returns the JSON representation of a classifier. This is the same as `JSON.stringify(classifier.toJsonObject())`. 163 | 164 | ### ToJsonObject 165 | 166 | ```javascript 167 | classifier.toJsonObject() 168 | ``` 169 | 170 | Returns a JSON-friendly representation of the classifier as an `object`. 171 | 172 | ### FromJson 173 | 174 | ```javascript 175 | const classifier = NaiveBayes.fromJson(jsonObject) 176 | ``` 177 | 178 | Returns a classifier instance from the JSON representation. Use this with the JSON representation obtained from `classifier.toJson()`. 179 | 180 | ### Debug 181 | 182 | To run `naivebayes` in debug mode simply set `DEBUG=naivebayes` when running your script. 
183 | 184 | 185 | ## Contributors 186 | 187 | | Name | Website | 188 | | ---------------- | -------------------------- | 189 | | **Surmon** | | 190 | | **Shaun Warman** | | 191 | -------------------------------------------------------------------------------- /test/fixtures/classifier-limit.json: -------------------------------------------------------------------------------- 1 | {"categories":["ham","spam"],"docCount":{"ham":2551,"spam":468},"totalDocuments":3019,"vocabulary":["0","1","2","3","4","5","7","10","12","18","20","21","22","25","26","27","28","29","30","51","99","2002","the","to","of","and","a","I","in","is","that","it","for","s","http","com","on","you","with","be","t","have","this","are","as","The","not","net","from","www","at","was","or","by","but","can","an","has","they","all","list","will","lists","my","if","use","we","about","","there","more","so","one","would","just","their","do","which","like","get","your","out","up","some","This","listinfo","what","time","who","It","people","he","|","m","been","me","freshrpms","than","no","mailing","said","Date","them","users","_______________________________________________","new","any","List","had","In","mailman","into","rpm","way","If","html","ve","could","RPM","email","You","because","https","were","his","first","its","our","spam","d","how","To","even","talk","mail","But","using","message","where","most","redhat","ll","line","click","see","linux","years","From","XML","exmh","many","Linux","still","C","problem","+","And","very","ie","We","being","That","apt","find","data","us","after","Exmh","fork","information","own","files","these","world","might","better","without","different","technology","ALB","last","while","DataPower","i","long","news","kernel","code","server","software","Bush","01","Aug","He","United","States","companies","e","ilug","Mr","yahoo","World","high","company","both","unseen","Yahoo","must","version","send","devel","workers","~","doing","Now","unsubscribe","source","against","Red","group","wi
thin","business","\\","Hat","root","her","Groups","network","Chris","works","him","web","i386","info","political","]","ever","F","msgs","l","subject","global","Wed","industry","August","seen","Java","says","terms","market","security","k","Capital","man","One","Your","America","processing","application","size","war","hardware","p","Powell","co","docs","Sun","Kelly","President","freedom","million","president","sent","trade","forteana","ximian","ftp","egroups","Sponsor","TM","Networks","7gSolB","Free","uk","CD","French","configure","Dave","Venrock","DVDs","Join","af6_decore","cpp","George","echo","British","Pacific","pub","+s","pt6YBB","0","1","2","3","4","5","7","8","10","12","14","18","23","24","25","30","31","50","66","71","72","80","81","100","500","the","to","of","and","you","a","in","for","your","com","is","I","this","www","[http","that","or","with","are","on","be","from","will","have","it","as","http","s","our","not","000","The","by","We","can","This","You","an","email","my","money","If","FREE","mail","e","out","more","net","all","A","do","people","was","we","To","t","gif]","TO","one","information","THE","list","at","has","YOU","get","only","receive","up","","make","if","images","me","please","send","OF","AND","just","E","address","business","name","over","de","any","asp","so","00","been","free","how","YOUR","Your","wish","ie","new","No","S","no","linux","they","who","jpg]","below","made","Click","day","their","use","his","each","received","removed","FOR","For","what","than","THIS","Free","program","NOW","c","but","government","which","work","mails","IN","server","milf","EnenKio","IS","It","link","ad_key","htm]","REPORT","subject","future","Report","days","orders","life","New","Kingdom","had","ext","index","report","theadmanager","first","Get","looking","COM","very","ve","jeweldive","emails","In","1]","total","sites","html]","domain","ilug","U","et","insurance","fromyou2","response","weeks","cgi","addresses","nasty","Guide","en","html","admanmail","United","Isla
nds","How","p","php","JM","NETNOTEINC","And","o","trading","under","des","fund","cfm","family","Q","IT","internet","Marshall","States","l","adclick","Email","Phone","bindex","world","Group","ws","its","Life","Hermios","last","THAT","»","les","big","CD","state","T","State","MY","King","la","sending","Atoll","WILL","As","REMOVE","à","ebonylust4free","Legal","sle","le","interested","Eneen","Kio","lose","B","freak","Membership","Web","_","prizeinthebag","Mr","ad","Wake","Contains","friendfinder","rights","pk007","nsi","e89","banners","Drive","mailer","Computer","hanmail","Have","Warranty","Island","amber","Offer","natural","Marshallese","islands","His","GUIDE","India","Start","1)","aff","_0","gif][http","black","Guaranteed","AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"],"wordCount":{"ham":300,"spam":300},"wordFrequencyCount":{"ham":{"0":2547,"1":1460,"2":1201,"3":536,"4":579,"5":592,"7":433,"10":802,"12":196,"18":346,"20":255,"21":152,"22":204,"25":186,"26":165,"27":153,"28":194,"29":126,"30":175,"51":177,"99":231,"2002":1912,"the":20973,"to":13281,"of":10721,"and":10550,"a":10175,"I":7428,"in":6859,"is":6395,"that":6101,"it":5116,"for":4619,"s":3964,"http":3861,"com":3723,"on":3460,"you":3251,"with":3099,"be":2872,"t":2807,"have":2676,"this":2629,"are":2497,"as":2345,"The":2326,"not":2282,"net":2221,"from":2193,"www":2156,"at":2113,"was":1992,"or":1972,"by":1971,"but":1880,"can":1805,"an":1719,"has":1515,"they":1468,"all":1456,"list":1418,"will":1372,"lists":1367,"my":1365,"if":1289,"use":1272,"we":1245,"about":1242,"":1207,"there":1181,"more":1173,"so":1164,"one":1094,"would":1093,"just":1090,"their":1060,"do":1059,"which":1052,"like":1033,"get":1028,"your":1023,"out":1013,"up":990,"some":948,"This":944,"listinfo":931,"what":931,"time":921,"who":893,"It":875,"people":841,"he":839,"|":836,"m":835,"been":831,"me":816,"freshrpms":803,"than":794,"no":793,"mailing":792,"said":788,"Date":770,"them":764,"users":741,"__________________________
_____________________":730,"new":725,"any":683,"List":665,"had":664,"In":653,"mailman":649,"into":644,"rpm":635,"way":635,"If":625,"html":617,"ve":615,"could":608,"RPM":600,"email":597,"You":596,"because":586,"https":571,"were":565,"his":565,"first":551,"its":536,"our":525,"spam":523,"d":518,"how":512,"To":502,"even":498,"talk":491,"mail":491,"But":490,"using":490,"message":484,"where":470,"most":463,"redhat":462,"ll":453,"line":450,"click":439,"see":435,"linux":422,"years":415,"From":412,"XML":407,"exmh":406,"many":404,"Linux":403,"still":401,"C":399,"problem":396,"+":394,"And":393,"very":390,"ie":379,"We":373,"being":372,"That":369,"apt":366,"find":366,"data":365,"us":361,"after":353,"Exmh":335,"fork":333,"information":322,"own":316,"files":315,"these":313,"world":306,"might":301,"better":291,"without":288,"different":283,"technology":281,"ALB":270,"last":270,"while":267,"DataPower":266,"i":264,"long":264,"news":262,"kernel":261,"code":261,"server":260,"software":253,"Bush":252,"01":252,"Aug":251,"He":245,"United":243,"States":242,"companies":242,"e":234,"ilug":227,"Mr":225,"yahoo":225,"World":222,"high":221,"company":218,"both":216,"unseen":211,"Yahoo":211,"must":211,"version":210,"send":209,"devel":208,"workers":206,"~":206,"doing":205,"Now":203,"unsubscribe":198,"source":197,"against":197,"Red":196,"group":196,"within":193,"business":191,"\\":189,"Hat":186,"root":184,"her":184,"Groups":183,"network":178,"Chris":177,"works":174,"him":174,"web":174,"i386":173,"info":167,"political":164,"]":163,"ever":161,"F":159,"msgs":158,"l":158,"subject":158,"global":156,"Wed":152,"industry":148,"August":148,"seen":146,"Java":142,"says":142,"terms":137,"market":136,"security":135,"k":132,"Capital":130,"man":130,"One":125,"Your":121,"America":121,"processing":119,"application":119,"size":117,"war":117,"hardware":114,"p":114,"Powell":112,"co":112,"docs":110,"Sun":107,"Kelly":106,"President":105,"freedom":102,"million":100,"president":100,"sent":100,"trade":98,"forteana":97,"ximi
an":96,"ftp":93,"egroups":92,"Sponsor":89,"TM":89,"Networks":89,"7gSolB":87,"Free":87,"uk":87,"CD":85,"French":85,"configure":84,"Dave":84,"Venrock":78,"DVDs":73,"Join":67,"af6_decore":66,"cpp":66,"George":63,"echo":61,"British":60,"Pacific":58,"pub":57,"+s":51,"pt6YBB":51},"spam":{"0":92,"1":437,"2":346,"3":222,"4":180,"5":379,"7":59,"8":89,"10":176,"12":66,"14":54,"18":53,"23":35,"24":65,"25":89,"30":281,"31":270,"50":117,"66":52,"71":69,"72":36,"80":64,"81":106,"100":184,"500":102,"the":4705,"to":4043,"of":3209,"and":3208,"you":2573,"a":2020,"in":1825,"for":1653,"your":1443,"com":1402,"is":1361,"I":1184,"this":1108,"www":1067,"[http":1050,"that":967,"or":840,"with":830,"are":810,"on":791,"be":777,"from":762,"will":737,"have":723,"it":715,"as":686,"http":669,"s":602,"our":593,"not":561,"000":559,"The":550,"by":545,"We":450,"can":446,"This":433,"You":432,"an":424,"email":413,"my":410,"money":394,"If":376,"FREE":375,"mail":375,"e":370,"out":363,"more":349,"net":344,"all":342,"A":340,"do":323,"people":321,"was":317,"we":315,"To":313,"t":311,"gif]":291,"TO":290,"one":285,"information":284,"THE":282,"list":280,"at":279,"has":270,"YOU":269,"get":260,"only":245,"receive":235,"up":232,"":231,"make":229,"if":228,"images":227,"me":222,"please":221,"send":218,"OF":217,"AND":216,"just":211,"E":206,"address":202,"business":199,"name":198,"over":196,"de":194,"any":193,"asp":192,"so":189,"00":187,"been":178,"free":178,"how":176,"YOUR":169,"Your":167,"wish":163,"ie":163,"new":163,"No":161,"S":160,"no":159,"linux":157,"they":155,"who":154,"jpg]":153,"below":153,"made":148,"Click":147,"day":141,"their":140,"use":138,"his":137,"each":134,"received":132,"removed":132,"FOR":131,"For":129,"what":129,"than":129,"THIS":128,"Free":128,"program":128,"NOW":125,"c":125,"but":125,"government":124,"which":122,"work":119,"mails":117,"IN":117,"server":111,"milf":110,"EnenKio":110,"IS":109,"It":109,"link":108,"ad_key":104,"htm]":104,"REPORT":103,"subject":103,"future":102,"Report":100,"days":100,
"orders":99,"life":96,"New":94,"Kingdom":93,"had":93,"ext":93,"index":93,"report":91,"theadmanager":89,"first":87,"Get":87,"looking":87,"COM":86,"very":84,"ve":84,"jeweldive":83,"emails":82,"In":82,"1]":81,"total":80,"sites":79,"html]":79,"domain":78,"ilug":78,"U":77,"et":76,"insurance":76,"fromyou2":74,"response":74,"weeks":73,"cgi":73,"addresses":73,"nasty":72,"Guide":72,"en":70,"html":70,"admanmail":70,"United":70,"Islands":68,"How":68,"p":67,"php":66,"JM":66,"NETNOTEINC":66,"And":66,"o":65,"trading":64,"under":63,"des":62,"fund":62,"cfm":61,"family":61,"Q":61,"IT":61,"internet":61,"Marshall":60,"States":60,"l":59,"adclick":58,"Email":58,"Phone":58,"bindex":57,"world":57,"Group":57,"ws":56,"its":55,"Life":54,"Hermios":53,"last":53,"THAT":53,"»":52,"les":52,"big":52,"CD":52,"state":52,"T":50,"State":49,"MY":49,"King":48,"la":48,"sending":48,"Atoll":46,"WILL":46,"As":46,"REMOVE":45,"à":44,"ebonylust4free":43,"Legal":43,"sle":42,"le":42,"interested":42,"Eneen":41,"Kio":41,"lose":41,"B":41,"freak":40,"Membership":40,"Web":40,"_":40,"prizeinthebag":39,"Mr":39,"ad":38,"Wake":37,"Contains":37,"friendfinder":37,"rights":37,"pk007":36,"nsi":36,"e89":36,"banners":36,"Drive":36,"mailer":35,"Computer":35,"hanmail":34,"Have":34,"Warranty":33,"Island":33,"amber":32,"Offer":32,"natural":32,"Marshallese":31,"islands":31,"His":31,"GUIDE":31,"India":30,"Start":29,"1)":29,"aff":29,"_0":29,"gif][http":29,"black":28,"Guaranteed":28,"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA":28}},"options":{"vocabularyLimit":300}} -------------------------------------------------------------------------------- /src/naivebayes.js: -------------------------------------------------------------------------------- 1 | const debug = require('debug')('naivebayes'); 2 | const stopword = require('stopword'); 3 | 4 | /** 5 | * keys we use to serialize a classifier's state 6 | */ 7 | const STATE_KEYS = [ 8 | 'categories', 9 | 'docCount', 10 | 'totalDocuments', 11 | 
'vocabulary', 12 | 'wordCount', 13 | 'wordFrequencyCount', 14 | 'options' 15 | ]; 16 | 17 | /** 18 | * Given an input string, tokenize it into an array of word tokens. 19 | * This is the default tokenization function used if user does not provide one in `options`. 20 | * 21 | * @param {String} text 22 | * @return {Array} 23 | */ 24 | const defaultTokenizer = (text) => { 25 | const rgxPunctuation = /[^(A-Яa-я\u4E00-\u9FA5\w)+\s]/g; 26 | 27 | return text 28 | .replace(rgxPunctuation, ' ') 29 | .replace(/[\u4E00-\u9FA5]/g, (word) => `${word} `) 30 | .split(/\s+/); 31 | }; 32 | 33 | /** 34 | * Naive-Bayes Classifier 35 | * 36 | * This is a naive-bayes classifier that uses Laplace Smoothing. 37 | * 38 | */ 39 | class NaiveBayes { 40 | constructor(options = {}) { 41 | // set options object 42 | this.options = {}; 43 | if (typeof options !== 'undefined') { 44 | if (!options || typeof options !== 'object' || Array.isArray(options)) { 45 | throw new TypeError( 46 | 'NaiveBayes got invalid `options`: `' + 47 | options + 48 | '`. Pass in an object.' 
49 | ); 50 | } 51 | 52 | this.options = options; 53 | } 54 | 55 | this.tokenizer = this.options.tokenizer || defaultTokenizer; 56 | 57 | this.vocabulary = []; 58 | 59 | // max vocabulary size based on word frequency, default is no limit 60 | this.vocabularyLimit = this.options.vocabularyLimit || 0; 61 | 62 | // number of documents we have learned from 63 | this.totalDocuments = 0; 64 | 65 | // document frequency table for each of our categories 66 | this.docCount = {}; 67 | 68 | // filter stopwords from vocabulary 69 | this.stopwords = options.stopwords || false; 70 | 71 | // for each category, how many words total were mapped to it 72 | this.wordCount = {}; 73 | 74 | // word frequency table for each category 75 | this.wordFrequencyCount = {}; 76 | 77 | // hashmap of our category names 78 | this.categories = []; 79 | 80 | debug('init %O', this); 81 | } 82 | 83 | /** 84 | * Initialize each of our data structure entries for this new category 85 | * 86 | * @param {String} categoryName 87 | */ 88 | initializeCategory(categoryName) { 89 | if (!this.categories.includes(categoryName)) { 90 | this.docCount[categoryName] = 0; 91 | this.wordCount[categoryName] = 0; 92 | this.wordFrequencyCount[categoryName] = {}; 93 | this.categories.push(categoryName); 94 | } 95 | 96 | return this; 97 | } 98 | 99 | /** 100 | * train our naive-bayes classifier by telling it what `category` 101 | * the `text` corresponds to. 
102 | * 103 | * @param {String} text 104 | * @param {String} class 105 | */ 106 | learn(text, category) { 107 | debug({ text, category }); 108 | // initialize category data structures if we've never seen this category 109 | this.initializeCategory(category); 110 | 111 | // update our count of how many documents mapped to this category 112 | this.docCount[category]++; 113 | 114 | // update the total number of documents we have learned from 115 | this.totalDocuments++; 116 | 117 | // normalize the text into a word array 118 | let tokens = this.tokenizer(text); 119 | 120 | if (this.stopwords) { 121 | tokens = stopword.removeStopwords(tokens); 122 | } 123 | 124 | // get a frequency count for each token in the text 125 | const frequencyTable = this.frequencyTable(tokens); 126 | 127 | /* 128 | * Update our vocabulary and our word frequency count for this category 129 | */ 130 | Object.keys(frequencyTable).forEach((token) => { 131 | // add this word to our vocabulary if not already existing 132 | if (!this.vocabulary[token]) { 133 | this.vocabulary.push(token); 134 | } 135 | 136 | const frequencyInText = frequencyTable[token]; 137 | 138 | if (!this.wordFrequencyCount[category]) 139 | this.wordFrequencyCount[category] = {}; 140 | 141 | // update the frequency information for this word in this category 142 | if (this.wordFrequencyCount[category][token]) { 143 | this.wordFrequencyCount[category][token] += frequencyInText; 144 | } else { 145 | this.wordFrequencyCount[category][token] = frequencyInText; 146 | } 147 | 148 | // update the count of all words we have seen mapped to this category 149 | this.wordCount[category] += frequencyInText; 150 | }); 151 | 152 | if (!this.vocabularyLimit || this.vocabulary.length <= this.vocabularyLimit) 153 | return this; 154 | 155 | const newFrequencyCount = {}; 156 | for (const category in this.wordFrequencyCount) { 157 | if (Object.hasOwnProperty.call(this.wordFrequencyCount, category)) { 158 | const frequencyTable = 
this.wordFrequencyCount[category]; 159 | const words = Object.keys(frequencyTable); 160 | if (words.length <= this.vocabularyLimit) { 161 | newFrequencyCount[category] = this.wordFrequencyCount[category]; 162 | continue; 163 | } 164 | 165 | // sort words by highest frequency 166 | const frequentWords = words.sort( 167 | (a, b) => frequencyTable[b] - frequencyTable[a] 168 | ); 169 | 170 | debug({ frequentWords }); 171 | 172 | // build up new structure until vocab limit reached 173 | for (let count = 0; count < this.vocabularyLimit; count++) { 174 | const word = frequentWords[count]; 175 | if (!newFrequencyCount[category]) { 176 | newFrequencyCount[category] = {}; 177 | } 178 | 179 | newFrequencyCount[category][word] = this.wordFrequencyCount[category][ 180 | word 181 | ]; 182 | } 183 | } 184 | } 185 | 186 | this.wordFrequencyCount = newFrequencyCount; 187 | 188 | this.vocabulary = []; 189 | this.wordCount = {}; 190 | const categories = Object.keys(this.wordFrequencyCount); 191 | for (const category of categories) { 192 | const words = Object.keys(this.wordFrequencyCount[category]); 193 | this.wordCount[category] = words.length; 194 | this.vocabulary = [...this.vocabulary, ...words]; 195 | } 196 | 197 | return this; 198 | } 199 | 200 | /** 201 | * Determine what category `text` belongs to. 202 | * 203 | * @param {String} text 204 | * @param {Boolean} probability 205 | * @return {String} category 206 | */ 207 | categorize(text, probability) { 208 | const category = probability 209 | ? this.probabilities(text)[0] 210 | : this.probabilities(text)[0].category; 211 | debug('categorize: %O', { text, category, probability }); 212 | return category; 213 | } 214 | 215 | /** 216 | * Determine category probabilities for `text`. 217 | * 218 | * @param {String} text 219 | * @return {Array} probabilities 220 | */ 221 | probabilities(text) { 222 | // [W1,W2,W3,W4,Wn...] 
223 | const tokens = this.tokenizer(text); 224 | const frequencyTable = this.frequencyTable(tokens); 225 | 226 | // P(W1|C) * P(W2|C) ... P(Wn|C) * P(C) 227 | // iterate thru our categories to calculate the probability for this text 228 | return this.categories 229 | .map((category) => { 230 | // start by calculating the overall probability of this category 231 | // => out of all documents we've ever looked at, how many were 232 | // mapped to this category 233 | const categoryProbability = 234 | this.docCount[category] / this.vocabularyLimit 235 | ? this.wordCount[category] 236 | : this.totalDocuments; 237 | 238 | // take the log to avoid underflow 239 | let logProbability = Math.log(categoryProbability); 240 | 241 | // now determine P( w | c ) for each word `w` in the text 242 | Object.keys(frequencyTable).forEach((token) => { 243 | const frequencyInText = frequencyTable[token]; 244 | const tokenProbability = this.tokenProbability(token, category); 245 | 246 | // determine the log of the P( w | c ) for this word 247 | logProbability += frequencyInText * Math.log(tokenProbability); 248 | }); 249 | 250 | debug('probabilities: %O', { category, logProbability }); 251 | 252 | return { 253 | category, 254 | probability: logProbability 255 | }; 256 | }) 257 | .sort((previous, next) => next.probability - previous.probability); 258 | } 259 | 260 | /** 261 | * Calculate probability that a `token` belongs to a `category` 262 | * 263 | * @param {String} token 264 | * @param {String} category 265 | * @return {Number} probability 266 | */ 267 | tokenProbability(token, category) { 268 | if (!this.wordFrequencyCount[category]) 269 | this.wordFrequencyCount[category] = {}; 270 | 271 | const wordFrequencyCount = this.wordFrequencyCount[category][token] || 0; 272 | 273 | const wordCount = this.wordCount[category]; 274 | 275 | // P(W|C) 276 | return (wordFrequencyCount + 1) / (wordCount + this.vocabulary.length); 277 | } 278 | 279 | /** 280 | * Build a frequency hashmap where 281 | 
* - the keys are the entries in `tokens` 282 | * - the values are the frequency of each entry in `tokens` 283 | * 284 | * @param {Array} tokens Normalized word array 285 | * @return {Object} 286 | */ 287 | frequencyTable(tokens) { 288 | const frequencyTable = Object.create(null); 289 | for (const token of tokens) { 290 | if (frequencyTable[token]) { 291 | frequencyTable[token]++; 292 | } else { 293 | frequencyTable[token] = 1; 294 | } 295 | } 296 | 297 | return frequencyTable; 298 | } 299 | 300 | /** 301 | * Dump the classifier's state as a JSON string. 302 | * @param {Boolean} Optionally format the serialized JSON output for easier human consumption 303 | * @return {String} Representation of the classifier. 304 | */ 305 | toJson(prettyPrint) { 306 | const prettyPrintSpaces = prettyPrint ? 2 : 0; 307 | return JSON.stringify(this.toJsonObject(), null, prettyPrintSpaces); 308 | } 309 | 310 | toJsonObject() { 311 | const state = {}; 312 | for (const key of STATE_KEYS) { 313 | state[key] = this[key]; 314 | } 315 | 316 | return state; 317 | } 318 | 319 | /** 320 | * Initializes a NaiveBayes instance from a JSON state representation. 321 | * Use this with classifier.toJson(). 
322 | * 323 | * @param {String} jsonStr state representation obtained by classifier.toJson() 324 | * @return {NaiveBayes} Classifier 325 | */ 326 | static fromJson(json, limit) { 327 | if (typeof json === 'string') { 328 | try { 329 | json = JSON.parse(json); 330 | } catch { 331 | throw new Error('Naivebayes.fromJson expects a valid JSON string.'); 332 | } 333 | } 334 | 335 | if (json.options && limit) { 336 | json.options.vocabularyLimit = limit || 0; 337 | } 338 | 339 | // init a new classifier 340 | const classifier = new NaiveBayes(json.options); 341 | 342 | // override the classifier's state 343 | STATE_KEYS.forEach((key) => { 344 | if (json[key] === undefined) { 345 | throw new Error( 346 | `NaiveBayes.fromJson: JSON string is missing an expected property: '${key}'.` 347 | ); 348 | } else { 349 | classifier[key] = json[key]; 350 | } 351 | }); 352 | 353 | return classifier; 354 | } 355 | 356 | static getStateKeys() { 357 | return STATE_KEYS; 358 | } 359 | } 360 | 361 | module.exports = NaiveBayes; 362 | --------------------------------------------------------------------------------