├── .babelrc
├── .editorconfig
├── .eslintrc.js
├── .gitignore
├── .npmignore
├── .prettierrc
├── .vscode
    ├── extensions.json
    └── settings.json
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── docs
    ├── Classifier.md
    ├── Model.md
    ├── Prediction.md
    ├── README.md
    └── Vocabulary.md
├── package.json
├── src
    ├── Classifier.js
    ├── Model.js
    ├── Prediction.js
    ├── Vocabulary.js
    └── index.js
├── test
    ├── Classifier.test.js
    ├── Model.test.js
    ├── Prediction.test.js
    └── Vocabulary.test.js
└── webpack.config.js


/.babelrc:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"env": {
 3 | 		"test": {
 4 | 			"plugins": ["@babel/plugin-transform-modules-commonjs"]
 5 | 		},
 6 | 		"development": {
 7 | 			"presets": [["@babel/env"]],
 8 | 			"plugins": ["add-module-exports"]
 9 | 		},
10 | 		"production": {
11 | 			"presets": [["@babel/env"], "minify"],
12 | 			"plugins": ["add-module-exports"]
13 | 		}
14 | 	}
15 | }
16 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # EditorConfig helps developers define and maintain
 2 | # consistent coding styles between different editors and IDEs.
 3 | 
 4 | root = true
 5 | 
 6 | [*]
 7 | end_of_line = lf
 8 | charset = utf-8
 9 | trim_trailing_whitespace = true
10 | insert_final_newline = true
11 | indent_style = tab
12 | 
13 | [*.md]
14 | trim_trailing_whitespace = false
15 | 


--------------------------------------------------------------------------------
/.eslintrc.js:
--------------------------------------------------------------------------------
 1 | module.exports = { // ESLint configuration
 2 | 	env: { // Environments whose predefined globals are made available
 3 | 		browser: true,
 4 | 		node: true,
 5 | 		es2021: true,
 6 | 		jest: true
 7 | 	},
 8 | 	extends: ['eslint:recommended'],
 9 | 	parserOptions: {
10 | 		ecmaVersion: 12, // 12 corresponds to ES2021
11 | 		sourceType: 'module' // Parse files as ES modules
12 | 	},
13 | 	rules: {
14 | 		quotes: ['error', 'single', { avoidEscape: true }], // Single quotes, unless that would require escaping
15 | 		semi: ['error', 'never'], // No semicolons at the end of statements
16 | 		indent: 'off', // Indentation is not checked by ESLint
17 | 		'no-mixed-spaces-and-tabs': ['warn', 'smart-tabs'], // Spaces after tabs are allowed when used for alignment
18 | 		'linebreak-style': ['error', 'unix'], // LF line endings only
19 | 		'no-unused-vars': 'warn' // Report unused variables as warnings instead of errors
20 | 	}
21 | }
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | logs
 2 | *.log
 3 | npm-debug.log*
 4 | pids
 5 | *.pid
 6 | *.seed
 7 | lib-cov
 8 | coverage
 9 | .nyc_output
10 | node_modules
11 | jspm_packages
12 | .npm
13 | .node_repl_history
14 | .idea
15 | lib
16 | package-lock.json
17 | .DS_Store
18 | Thumbs.db
19 | 


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
 1 | *.log
 2 | npm-debug.log*
 3 | coverage
 4 | docs
 5 | .vscode
 6 | .nyc_output
 7 | node_modules
 8 | package-lock.json
 9 | yarn.lock
10 | src
11 | test
12 | CONTRIBUTING.md
13 | .editorconfig
14 | .eslintrc.js
15 | .vscode
16 | .babelrc
17 | webpack.config.js
18 | .gitignore
19 | .DS_Store
20 | Thumbs.db
21 | 


--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | 	"trailingComma": "none",
3 | 	"tabWidth": 4,
4 | 	"useTabs": true,
5 | 	"semi": false,
6 | 	"singleQuote": true
7 | }
8 | 


--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"recommendations": ["esbenp.prettier-vscode"]
3 | }
4 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "editor.formatOnSave": true,
3 |     "files.insertFinalNewline": true,
4 |     "editor.defaultFormatter": "esbenp.prettier-vscode",
5 |     "prettier.useTabs": true
6 | }
7 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | All notable changes to this project will be documented in this file.
 4 | 
 5 | ## [2.0.1] - 2023-02-05
 6 | 
 7 | ### Changed
 8 | 
 9 | -   Fixed all instances of improper object literal type checks
10 | -   Fixed bug where terms were added to the model vocabulary when making predictions
11 | -   Migrated tests from Mocha/Chai to Jest
12 | 
13 | ## [2.0.0] - 2020-08-28
14 | 
15 | ### Breaking changes
16 | 
17 | -   Removed `minimumConfidence` from `Model`
18 | 
19 | ## [1.0.0] - 2020-08-26
20 | 
21 | Initial release
22 | 
23 | [2.0.1]: https://github.com/andreekeberg/ml-classify-text-js/releases/tag/2.0.1
24 | [2.0.0]: https://github.com/andreekeberg/ml-classify-text-js/releases/tag/2.0.0
25 | [1.0.0]: https://github.com/andreekeberg/ml-classify-text-js/releases/tag/1.0.0
26 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to ClassifyText
 2 | 
 3 | This document contains basic guidelines to make contributing to this project as easy and transparent as possible, whether it's:
 4 | 
 5 | -   Reporting a bug
 6 | -   Discussing the current state of the code
 7 | -   Submitting a fix
 8 | -   Proposing new features
 9 | -   Becoming a maintainer
10 | 
11 | ## Pull requests are actively welcomed
12 | 
13 | 1. Fork the repo and create your branch from `master`.
14 | 2. If you've added code that should be tested, add tests.
15 | 3. If you've changed APIs, update the documentation.
16 | 4. Make sure your code lints.
17 | 5. Issue your pull request.
18 | 
19 | ## Any contributions you make will be under the MIT Software License
20 | 
21 | In short, when you submit code changes, your submissions are understood to be under the same [MIT License](http://choosealicense.com/licenses/mit/) that covers the project.
22 | 
23 | ## Report bugs using [issues](https://github.com/andreekeberg/ml-classify-text-js/issues)
24 | 
 25 | All bugs are tracked using GitHub issues. Report a bug by [opening a new issue](https://github.com/andreekeberg/ml-classify-text-js/issues/new); it's that easy!
26 | 
27 | ## Write bug reports with detail, background, and sample code
28 | 
29 | **Great bug reports** tend to have:
30 | 
31 | -   A quick summary and/or background
32 | -   Steps to reproduce
33 |     -   Be specific!
34 |     -   Give sample code if you can.
35 |     -   What you expected would happen
36 |     -   What actually happens
37 | -   Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
38 | 
39 | ## License
40 | 
41 | By contributing, you agree that your contributions will be licensed under its MIT License.
42 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2020-2023 André Ekeberg <hello@andreekeberg.se>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 📄 ClassifyText (JS)
  2 | 
  3 | [![Version](https://img.shields.io/npm/v/ml-classify-text)](https://www.npmjs.com/package/ml-classify-text) [![Total Downloads](https://img.shields.io/npm/dt/ml-classify-text)](https://www.npmjs.com/package/ml-classify-text) [![License](https://img.shields.io/npm/l/ml-classify-text)](https://www.npmjs.com/package/ml-classify-text)
  4 | 
  5 | Use machine learning to classify text using [n-grams](https://en.wikipedia.org/wiki/N-gram) and [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity).
  6 | 
  7 | Minimal library that can be used both in the **browser** and in **Node.js**, that allows you to train a model with a large amount of text samples (and corresponding labels), and then use this model to quickly predict one or more appropriate labels for new text samples.
  8 | 
  9 | ## Installation
 10 | 
 11 | **Using npm**
 12 | 
 13 | ```
 14 | npm install ml-classify-text
 15 | ```
 16 | 
 17 | **Using yarn**
 18 | 
 19 | ```
 20 | yarn add ml-classify-text
 21 | ```
 22 | 
 23 | ## Getting started
 24 | 
 25 | **Import as an ES6 module**
 26 | 
 27 | ```javascript
 28 | import Classifier from 'ml-classify-text'
 29 | ```
 30 | 
 31 | **Import as a CommonJS module**
 32 | 
 33 | ```javascript
 34 | const { Classifier } = require('ml-classify-text')
 35 | ```
 36 | 
 37 | ## Basic usage
 38 | 
 39 | ### Setting up a new Classifier instance
 40 | 
 41 | ```javascript
 42 | const classifier = new Classifier()
 43 | ```
 44 | 
 45 | ### Training a model
 46 | 
 47 | ```javascript
 48 | const positive = [
 49 | 	'This is great, so cool!',
 50 | 	'Wow, I love it!',
 51 | 	'It really is amazing'
 52 | ]
 53 | 
 54 | const negative = [
 55 | 	'This is really bad',
 56 | 	'I hate it with a passion',
 57 | 	'Just terrible!'
 58 | ]
 59 | 
 60 | classifier.train(positive, 'positive')
 61 | classifier.train(negative, 'negative')
 62 | ```
 63 | 
 64 | ### Getting a prediction
 65 | 
 66 | ```javascript
 67 | const predictions = classifier.predict('It sure is pretty great!')
 68 | 
 69 | if (predictions.length) {
 70 | 	predictions.forEach((prediction) => {
 71 | 		console.log(`${prediction.label} (${prediction.confidence})`)
 72 | 	})
 73 | } else {
 74 | 	console.log('No predictions returned')
 75 | }
 76 | ```
 77 | 
 78 | Returning:
 79 | 
 80 | ```
 81 | positive (0.5423261445466404)
 82 | ```
 83 | 
 84 | ## Advanced usage
 85 | 
 86 | ### Configuration
 87 | 
 88 | The following configuration options can be passed either directly to a new [Model](docs/Model.md), or indirectly by passing them to the [Classifier](docs/Classifier.md) constructor.
 89 | 
 90 | #### Options
 91 | 
 92 | | Property       | Type                        | Default | Description                                                                                           |
 93 | | -------------- | --------------------------- | ------- | ----------------------------------------------------------------------------------------------------- |
 94 | | **nGramMin**   | `int`                       | `1`     | Minimum n-gram size                                                                                   |
 95 | | **nGramMax**   | `int`                       | `1`     | Maximum n-gram size                                                                                   |
 96 | | **vocabulary** | `Array` \| `Set` \| `false` | `[]`    | Terms mapped to indexes in the model data, set to `false` to store terms directly in the data entries |
 97 | | **data**       | `Object`                    | `{}`    | Key-value store of labels and training data vectors                                                   |
 98 | 
 99 | ### Using n-grams
100 | 
101 | The default behavior is to split up texts by single words (known as a [bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model), or unigrams).
102 | 
103 | This has a few limitations, since by ignoring the order of words, it's impossible to correctly match phrases and expressions.
104 | 
105 | In comes [n-grams](https://en.wikipedia.org/wiki/N-gram), which, when set to use more than one word per term, act like a sliding window that moves across the text — a continuous sequence of words of the specified amount, which can greatly improve the accuracy of predictions.
106 | 
107 | #### Example of using n-grams with a size of 2 (bigrams)
108 | 
109 | ```javascript
110 | const classifier = new Classifier({
111 | 	nGramMin: 2,
112 | 	nGramMax: 2
113 | })
114 | 
115 | const tokens = classifier.tokenize('I really dont like it')
116 | 
117 | console.log(tokens)
118 | ```
119 | 
120 | Returning:
121 | 
122 | ```javascript
123 | {
124 |     'i really': 1,
125 |     'really dont': 1,
126 |     'dont like': 1,
127 |     'like it': 1
128 | }
129 | ```
130 | 
131 | ### Serializing a model
132 | 
133 | After training a model with large sets of data, you'll want to store all this data, to allow you to simply set up a new model using this training data at another time, and quickly make predictions.
134 | 
135 | To do this, simply use the `serialize` method on your [Model](docs/Model.md), and either save the data structure to a file, send it to a server, or store it in any other way you want.
136 | 
137 | ```javascript
138 | const model = classifier.model
139 | 
140 | console.log(model.serialize())
141 | ```
142 | 
143 | Returning:
144 | 
145 | ```
146 | {
147 |     nGramMin: 1,
148 |     nGramMax: 1,
149 |     vocabulary: [
150 |     	'this',    'is',      'great',
151 |     	'so',      'cool',    'wow',
152 |     	'i',       'love',    'it',
153 |     	'really',  'amazing', 'bad',
154 |     	'hate',    'with',    'a',
155 |     	'passion', 'just',    'terrible'
156 |     ],
157 |     data: {
158 |         positive: {
159 |             '0': 1, '1': 2, '2': 1,
160 |             '3': 1, '4': 1, '5': 1,
161 |             '6': 1, '7': 1, '8': 2,
162 |             '9': 1, '10': 1
163 |         },
164 |         negative: {
165 |             '0': 1, '1': 1, '6': 1,
166 |             '8': 1, '9': 1, '11': 1,
167 |             '12': 1, '13': 1, '14': 1,
168 |             '15': 1, '16': 1, '17': 1
169 |         }
170 |     }
171 | }
172 | ```
173 | 
174 | ## Documentation
175 | 
176 | -   [Classifier](docs/Classifier.md)
177 | -   [Model](docs/Model.md)
178 | -   [Vocabulary](docs/Vocabulary.md)
179 | -   [Prediction](docs/Prediction.md)
180 | 
181 | ## Contributing
182 | 
183 | Read the [contribution guidelines](CONTRIBUTING.md).
184 | 
185 | ## Changelog
186 | 
187 | Refer to the [changelog](CHANGELOG.md) for a full history of the project.
188 | 
189 | ## License
190 | 
191 | ClassifyText is licensed under the [MIT license](LICENSE).
192 | 


--------------------------------------------------------------------------------
/docs/Classifier.md:
--------------------------------------------------------------------------------
 1 | <a name="Classifier"></a>
 2 | 
 3 | ## Classifier
 4 | 
 5 | -   [Classifier](#Classifier)
 6 |     -   [new Classifier([model])](#new_Classifier)
 7 |     -   [.model](#Classifier+model) : <code>Model</code>
 8 |     -   [.train(input, label)](#Classifier+train) ⇒ <code>this</code>
 9 |     -   [.predict(input, [maxMatches], [minimumConfidence])](#Classifier+predict) ⇒ <code>Array</code>
10 |     -   [.splitWords(input)](#Classifier+splitWords) ⇒ <code>Array</code>
11 |     -   [.tokenize(input)](#Classifier+tokenize) ⇒ <code>Object</code>
12 |     -   [.vectorize(tokens)](#Classifier+vectorize) ⇒ <code>Object</code>
13 |     -   [.cosineSimilarity(v1, v2)](#Classifier+cosineSimilarity) ⇒ <code>float</code>
14 | 
15 | <a name="new_Classifier"></a>
16 | 
17 | ### new Classifier([model])
18 | 
19 | | Param              | Type                        | Default | Description                                                                                           |
20 | | ------------------ | --------------------------- | ------- | ----------------------------------------------------------------------------------------------------- |
21 | | [model]            | `Model` \| `Object`         |         |                                                                                                       |
22 | | [model.nGramMin]   | `int`                       | `1`     | Minimum n-gram size                                                                                   |
23 | | [model.nGramMax]   | `int`                       | `1`     | Maximum n-gram size                                                                                   |
24 | | [model.vocabulary] | `Array` \| `Set` \| `false` | `[]`    | Terms mapped to indexes in the model data, set to `false` to store terms directly in the data entries |
25 | | [model.data]       | `Object`                    | `{}`    | Key-value store of labels and training data vectors                                                   |
26 | 
27 | <a name="Classifier+model"></a>
28 | 
29 | ### classifier.model : `Model`
30 | 
31 | Model instance
32 | 
33 | <a name="Classifier+train"></a>
34 | 
35 | ### classifier.train(input, label) ⇒ `this`
36 | 
37 | Train the current model using an input string (or array of strings) and a corresponding label
38 | 
39 | | Param | Type                | Description                    |
40 | | ----- | ------------------- | ------------------------------ |
41 | | input | `string` \| `Array` | String, or an array of strings |
42 | | label | `string`            | Corresponding label            |
43 | 
44 | <a name="Classifier+predict"></a>
45 | 
46 | ### classifier.predict(input, [maxMatches], [minimumConfidence]) ⇒ `Array`
47 | 
48 | Return an array of one or more Prediction instances
49 | 
50 | | Param               | Type     | Default | Description                                         |
51 | | ------------------- | -------- | ------- | --------------------------------------------------- |
52 | | input               | `string` |         | Input string to make a prediction from              |
53 | | [maxMatches]        | `int`    | `1`     | Maximum number of predictions to return             |
54 | | [minimumConfidence] | `float`  | `0.2`   | Minimum confidence required to include a prediction |
55 | 
56 | <a name="Classifier+splitWords"></a>
57 | 
58 | ### classifier.splitWords(input) ⇒ `Array`
59 | 
60 | Split a string into an array of lowercase words, with all non-letter characters removed
61 | 
62 | | Param | Type     |
63 | | ----- | -------- |
64 | | input | `string` |
65 | 
66 | <a name="Classifier+tokenize"></a>
67 | 
68 | ### classifier.tokenize(input) ⇒ `Object`
69 | 
70 | Create an object literal of unique tokens (n-grams) as keys, and their
71 | respective occurrences as values based on an input string, or array of words
72 | 
73 | | Param | Type                |
74 | | ----- | ------------------- |
75 | | input | `string` \| `Array` |
76 | 
77 | <a name="Classifier+vectorize"></a>
78 | 
79 | ### classifier.vectorize(tokens) ⇒ `Object`
80 | 
81 | Convert a tokenized object into a new object with all keys (terms)
82 | translated to their index in the returned vocabulary (which is also
83 | returned along with the object, with any new terms added to the end)
84 | 
85 | | Param  | Type     |
86 | | ------ | -------- |
87 | | tokens | `Object` |
88 | 
89 | <a name="Classifier+cosineSimilarity"></a>
90 | 
91 | ### classifier.cosineSimilarity(v1, v2) ⇒ `float`
92 | 
93 | Return the cosine similarity between two vectors
94 | 
95 | | Param | Type     |
96 | | ----- | -------- |
97 | | v1    | `Object` |
98 | | v2    | `Object` |
99 | 


--------------------------------------------------------------------------------
/docs/Model.md:
--------------------------------------------------------------------------------
 1 | <a name="Model"></a>
 2 | 
 3 | ## Model
 4 | 
 5 | -   [Model](#Model)
 6 |     -   [new Model([config])](#new_Model)
 7 |     -   [.nGramMin](#Model+nGramMin) : `int`
 8 |     -   [.nGramMax](#Model+nGramMax) : `int`
 9 |     -   [.vocabulary](#Model+vocabulary) : `Vocabulary` \| `false`
10 |     -   [.data](#Model+data) : `Object`
11 |     -   [.serialize()](#Model+serialize) ⇒ `Object`
12 | 
13 | <a name="new_Model"></a>
14 | 
15 | ### new Model([config])
16 | 
17 | | Param               | Type                        | Default | Description                                                                                                 |
18 | | ------------------- | --------------------------- | ------- | ----------------------------------------------------------------------------------------------------------- |
19 | | [config]            | `Object`                    |         |                                                                                                             |
20 | | [config.nGramMin]   | `int`                       | `1`     | Minimum n-gram size                                                                                         |
21 | | [config.nGramMax]   | `int`                       | `1`     | Maximum n-gram size                                                                                         |
22 | | [config.vocabulary] | `Array` \| `Set` \| `false` | `[]`    | Terms mapped to indexes in the model data entries, set to false to store terms directly in the data entries |
23 | | [config.data]       | `Object`                    | `{}`    | Key-value store containing all training data                                                                |
24 | 
25 | <a name="Model+nGramMin"></a>
26 | 
27 | ### model.nGramMin : `int`
28 | 
29 | Minimum n-gram size
30 | 
31 | <a name="Model+nGramMax"></a>
32 | 
33 | ### model.nGramMax : `int`
34 | 
35 | Maximum n-gram size
36 | 
37 | <a name="Model+vocabulary"></a>
38 | 
39 | ### model.vocabulary : `Vocabulary` \| `false`
40 | 
41 | Vocabulary instance
42 | 
43 | <a name="Model+data"></a>
44 | 
45 | ### model.data : `Object`
46 | 
47 | Model data
48 | 
49 | <a name="Model+serialize"></a>
50 | 
51 | ### model.serialize() ⇒ `Object`
52 | 
53 | Return the model in its current state as an object literal, including the
54 | configured n-gram min/max values, the vocabulary as an array (if any,
55 | otherwise false), and an object literal with all the training data
56 | 


--------------------------------------------------------------------------------
/docs/Prediction.md:
--------------------------------------------------------------------------------
 1 | <a name="Prediction"></a>
 2 | 
 3 | ## Prediction
 4 | 
 5 | -   [Prediction](#Prediction)
 6 |     -   [.label](#Prediction+label) : `string`
 7 |     -   [.confidence](#Prediction+confidence) : `number`
 8 | 
 9 | <a name="Prediction+label"></a>
10 | 
11 | ### prediction.label : `string`
12 | 
13 | Label of the prediction
14 | 
15 | <a name="Prediction+confidence"></a>
16 | 
17 | ### prediction.confidence : `number`
18 | 
19 | Confidence of the prediction
20 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 | 
3 | Full documentation of all the available classes, properties and methods.
4 | 
5 | -   [Classifier](Classifier.md)
6 | -   [Model](Model.md)
7 | -   [Vocabulary](Vocabulary.md)
8 | -   [Prediction](Prediction.md)
9 | 


--------------------------------------------------------------------------------
/docs/Vocabulary.md:
--------------------------------------------------------------------------------
 1 | <a name="Vocabulary"></a>
 2 | 
 3 | ## Vocabulary
 4 | 
 5 | -   [Vocabulary](#Vocabulary)
 6 |     -   [new Vocabulary(terms)](#new_Vocabulary)
 7 |     -   [.size](#Vocabulary+size) : `number`
 8 |     -   [.terms](#Vocabulary+terms) : `Array` \| `Set`
 9 |     -   [.add(terms)](#Vocabulary+add) ⇒ `this`
10 |     -   [.remove(terms)](#Vocabulary+remove) ⇒ `this`
11 |     -   [.has(term)](#Vocabulary+has) ⇒ `bool`
12 |     -   [.indexOf(term)](#Vocabulary+indexOf) ⇒ `number`
13 | 
14 | <a name="new_Vocabulary"></a>
15 | 
16 | ### new Vocabulary(terms)
17 | 
18 | | Param | Type             |
19 | | ----- | ---------------- |
20 | | terms | `Array` \| `Set` |
21 | 
22 | <a name="Vocabulary+size"></a>
23 | 
24 | ### vocabulary.size : `number`
25 | 
26 | Vocabulary size
27 | 
28 | <a name="Vocabulary+terms"></a>
29 | 
30 | ### vocabulary.terms : `Array` \| `Set`
31 | 
32 | Vocabulary terms
33 | 
34 | <a name="Vocabulary+add"></a>
35 | 
36 | ### vocabulary.add(terms) ⇒ `this`
37 | 
38 | Add one or more terms to the vocabulary
39 | 
40 | | Param | Type                         |
41 | | ----- | ---------------------------- |
42 | | terms | `string` \| `Array` \| `Set` |
43 | 
44 | <a name="Vocabulary+remove"></a>
45 | 
46 | ### vocabulary.remove(terms) ⇒ `this`
47 | 
48 | Remove one or more terms from the vocabulary
49 | 
50 | | Param | Type                         |
51 | | ----- | ---------------------------- |
52 | | terms | `string` \| `Array` \| `Set` |
53 | 
54 | <a name="Vocabulary+has"></a>
55 | 
56 | ### vocabulary.has(term) ⇒ `bool`
57 | 
58 | Return whether the vocabulary contains a certain term
59 | 
60 | | Param | Type     |
61 | | ----- | -------- |
62 | | term  | `string` |
63 | 
64 | <a name="Vocabulary+indexOf"></a>
65 | 
66 | ### vocabulary.indexOf(term) ⇒ `number`
67 | 
68 | Return the index of a term in the vocabulary (returns -1 if not found)
69 | 
70 | | Param | Type     |
71 | | ----- | -------- |
72 | | term  | `string` |
73 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"name": "ml-classify-text",
 3 | 	"version": "2.0.1",
 4 | 	"description": "Text classification using n-grams and cosine similarity",
 5 | 	"module": "./lib",
 6 | 	"main": "./lib",
 7 | 	"scripts": {
 8 | 		"clean": "rimraf lib",
 9 | 		"test": "jest --coverage",
10 | 		"test:watch": "jest --watchAll",
11 | 		"test:prod": "cross-env BABEL_ENV=production npm run test",
12 | 		"lint": "eslint src test",
13 | 		"build": "webpack --mode=production --config=webpack.config.js",
14 | 		"prepublish": "npm run clean && npm run lint && npm run test && npm run build"
15 | 	},
16 | 	"files": [
17 | 		"lib"
18 | 	],
19 | 	"repository": {
20 | 		"type": "git",
21 | 		"url": "git+https://github.com/andreekeberg/ml-classify-text-js.git"
22 | 	},
23 | 	"keywords": [
24 | 		"text classification",
25 | 		"classification",
26 | 		"classify",
27 | 		"classifier",
28 | 		"machine learning",
29 | 		"machine",
30 | 		"learning",
31 | 		"ai",
32 | 		"artificial intelligence",
33 | 		"artificial",
34 | 		"intelligence",
35 | 		"n-gram",
36 | 		"n-grams",
37 | 		"cosine similarity",
38 | 		"cosine",
39 | 		"similarity",
40 | 		"confidence",
41 | 		"predict",
42 | 		"prediction",
43 | 		"model",
44 | 		"train"
45 | 	],
46 | 	"author": "André Ekeberg <hello@andreekeberg.se> (https://andreekeberg.se/en/)",
47 | 	"license": "MIT",
48 | 	"bugs": {
49 | 		"url": "https://github.com/andreekeberg/ml-classify-text-js/issues"
50 | 	},
51 | 	"homepage": "https://github.com/andreekeberg/ml-classify-text-js",
52 | 	"devDependencies": {
53 | 		"@babel/core": "^7.20.12",
54 | 		"@babel/plugin-transform-modules-amd": "^7.20.11",
55 | 		"@babel/plugin-transform-modules-commonjs": "^7.20.11",
56 | 		"@babel/plugin-transform-runtime": "^7.19.6",
57 | 		"@babel/polyfill": "^7.12.1",
58 | 		"@babel/preset-env": "^7.20.2",
59 | 		"@babel/register": "^7.18.9",
60 | 		"@babel/runtime": "^7.20.13",
61 | 		"@babel/runtime-corejs3": "^7.20.13",
62 | 		"babel-cli": "^6.26.0",
63 | 		"babel-eslint": "^10.1.0",
64 | 		"babel-loader": "^9.1.2",
65 | 		"babel-plugin-add-module-exports": "^1.0.4",
66 | 		"babel-polyfill": "^6.26.0",
67 | 		"babel-preset-env": "^1.7.0",
68 | 		"babel-preset-minify": "^0.5.2",
69 | 		"babel-runtime": "^6.26.0",
70 | 		"core-js": "^3.27.2",
71 | 		"cross-env": "^7.0.3",
72 | 		"eslint": "^8.33.0",
73 | 		"eslint-config-standard": "^17.0.0",
74 | 		"eslint-plugin-node": "^11.1.0",
75 | 		"jest": "^29.4.1",
76 | 		"jsdoc": "^4.0.0",
77 | 		"jsdoc-to-markdown": "^8.0.0",
78 | 		"rimraf": "^4.1.2",
79 | 		"webpack": "^5.75.0",
80 | 		"webpack-cli": "^5.0.1"
81 | 	},
82 | 	"dependencies": {
83 | 		"xregexp": "^5.1.1"
84 | 	}
85 | }
86 | 


--------------------------------------------------------------------------------
/src/Classifier.js:
--------------------------------------------------------------------------------
  1 | import XRegExp from 'xregexp'
  2 | import Model from './Model'
  3 | import Prediction from './Prediction'
  4 | import Vocabulary from './Vocabulary'
  5 | 
  6 | /**
  7 |  * @param {(Model|Object)} [model]
  8 |  * @param {int} [model.nGramMin=1] - Minimum n-gram size
  9 |  * @param {int} [model.nGramMax=1] - Maximum n-gram size
 10 |  * @param {(Array|Set|false)} [model.vocabulary=[]] - Terms mapped to indexes in the model data entries, set to false to store terms directly in the data entries
 11 |  * @param {Object} [model.data={}] - Key-value store containing all training data
 12 |  * @constructor
 13 |  */
 14 | class Classifier {
 15 | 	constructor(model = {}) {
 16 | 		if (!(model instanceof Model)) {
 17 | 			model = new Model(model)
 18 | 		}
 19 | 
 20 | 		this._model = model
 21 | 	}
 22 | 
 23 | 	/**
 24 | 	 * Model instance
 25 | 	 *
 26 | 	 * @type {Model}
 27 | 	 */
 28 | 	get model() {
 29 | 		return this._model
 30 | 	}
 31 | 
 32 | 	set model(model) {
 33 | 		if (!(model instanceof Model)) {
 34 | 			model = new Model(model)
 35 | 		}
 36 | 
 37 | 		this._model = model
 38 | 	}
 39 | 
 40 | 	/**
 41 | 	 * Train the current model using an input string (or array of strings) and a corresponding label
 42 | 	 *
 43 | 	 * @param {(string|string[])} input - String, or an array of strings
 44 | 	 * @param {string} label - Corresponding label
 45 | 	 * @return {this}
 46 | 	 */
 47 | 	train(input, label) {
 48 | 		if (typeof input !== 'string' && !(input instanceof Array)) {
 49 | 			throw new Error('input must be either a string or Array')
 50 | 		}
 51 | 
 52 | 		if (typeof label !== 'string') {
 53 | 			throw new Error('label must be a string')
 54 | 		}
 55 | 
 56 | 		// If input isn't an array, convert to a single item array
 57 | 		if (!(input instanceof Array)) {
 58 | 			input = [input]
 59 | 		}
 60 | 
 61 | 		input.forEach((string) => {
 62 | 			// Convert the string to a tokenized object
 63 | 			let tokens = this.tokenize(string)
 64 | 
 65 | 			if (this._model.vocabulary !== false) {
 66 | 				// If we're using a vocabulary, convert the tokens to a vector where all
 67 | 				// indexes reference vocabulary terms
 68 | 				const { vector, vocabulary } = this.vectorize(tokens)
 69 | 
 70 | 				// Overwrite the tokens object with our new vectorized object
 71 | 				tokens = vector
 72 | 
 73 | 				// Update the model vocabulary
 74 | 				this._model.vocabulary = vocabulary
 75 | 			}
 76 | 
 77 | 			// Set up an empty entry for the label if it does not exist
 78 | 			if (
 79 | 				!Object.prototype.hasOwnProperty.call(this._model.data, label)
 80 | 			) {
 81 | 				this._model.data[label] = {}
 82 | 			}
 83 | 
 84 | 			// Add all occurrences to our model entry
 85 | 			Object.keys(tokens).forEach((index) => {
 86 | 				let occurrences = tokens[index]
 87 | 
 88 | 				if (
 89 | 					!Object.prototype.hasOwnProperty.call(
 90 | 						this._model.data[label],
 91 | 						index
 92 | 					)
 93 | 				) {
 94 | 					this._model.data[label][index] = 0
 95 | 				}
 96 | 
 97 | 				this._model.data[label][index] += occurrences
 98 | 			})
 99 | 		})
100 | 
101 | 		return this
102 | 	}
103 | 
104 | 	/**
105 | 	 * Return an array of one or more Prediction instances
106 | 	 *
107 | 	 * @param {string} input - Input string to make a prediction from
108 | 	 * @param {int} [maxMatches=1] Maximum number of predictions to return
109 | 	 * @param {float} [minimumConfidence=0.2] Minimum confidence required to include a prediction
110 | 	 * @return {Array}
111 | 	 */
112 | 	predict(input, maxMatches = 1, minimumConfidence = 0.2) {
113 | 		if (typeof input !== 'string') {
114 | 			throw new Error('input must be a string')
115 | 		}
116 | 
117 | 		if (!['number', 'undefined'].includes(typeof maxMatches)) {
118 | 			throw new Error('maxMatches must be either a number or undefined')
119 | 		}
120 | 
121 | 		if (!['number', 'undefined'].includes(typeof minimumConfidence)) {
122 | 			throw new Error(
123 | 				'minimumConfidence must be either a number or undefined'
124 | 			)
125 | 		}
126 | 
127 | 		if (minimumConfidence < 0) {
128 | 			throw new Error('minimumConfidence can not be lower than 0')
129 | 		}
130 | 
131 | 		if (minimumConfidence > 1) {
132 | 			throw new Error('minimumConfidence can not be higher than 1')
133 | 		}
134 | 
135 | 		// Convert the string to a tokenized object
136 | 		let tokens = this.tokenize(input)
137 | 
138 | 		if (this.vocabulary !== false) {
139 | 			// If we're using a vocabulary, convert the tokens to a vector where all
140 | 			// indexes reference vocabulary terms
141 | 			const { vector } = this.vectorize(tokens)
142 | 
143 | 			// Overwrite the tokens object with our new vectorized object
144 | 			tokens = vector
145 | 		}
146 | 
147 | 		const predictions = []
148 | 
149 | 		Object.keys(this._model.data).forEach((label) => {
150 | 			let entry = this._model.data[label]
151 | 
152 | 			let confidence = this.cosineSimilarity(tokens, entry)
153 | 
154 | 			if (confidence >= minimumConfidence) {
155 | 				predictions.push(
156 | 					new Prediction({
157 | 						label,
158 | 						confidence
159 | 					})
160 | 				)
161 | 			}
162 | 		})
163 | 
164 | 		/* istanbul ignore next */
165 | 		predictions.sort((a, b) => {
166 | 			if (a.confidence === b.confidence) {
167 | 				return 0
168 | 			}
169 | 
170 | 			return a.confidence > b.confidence ? -1 : 1
171 | 		})
172 | 
173 | 		return predictions.slice(0, Math.min(predictions.length, maxMatches))
174 | 	}
175 | 
176 | 	/**
177 | 	 * Split a string into an array of lowercase words, with all non-letter characters removed
178 | 	 *
179 | 	 * @param {string} input
180 | 	 * @return {Array}
181 | 	 */
182 | 	splitWords(input) {
183 | 		if (typeof input !== 'string') {
184 | 			throw new Error('input must be a string')
185 | 		}
186 | 
187 | 		// Remove all apostrophes and dashes to keep words intact
188 | 		input = input.replace(/'|´|’|-/g, '')
189 | 
190 | 		// Lowercase all letters and replace all non-letter characters with a space
191 | 		input = XRegExp.replace(
192 | 			input.toLocaleLowerCase(),
193 | 			XRegExp('\\P{L}+', 'g'),
194 | 			' '
195 | 		).trim()
196 | 
197 | 		return input.split(' ')
198 | 	}
199 | 
200 | 	/**
201 | 	 * Create an object literal of unique tokens (n-grams) as keys, and their
202 | 	 * respective occurrences as values based on an input string, or array of words
203 | 	 *
204 | 	 * @param {(string|string[])} input
205 | 	 * @return {Object}
206 | 	 */
207 | 	tokenize(input) {
208 | 		let words = typeof input === 'string' ? this.splitWords(input) : input
209 | 
210 | 		if (!(words instanceof Array)) {
211 | 			throw new Error('input must be either a string or Array')
212 | 		}
213 | 
214 | 		if (this._model.nGramMax < this._model.nGramMin) {
215 | 			throw new Error(
216 | 				'Invalid nGramMin/nGramMax combination in model config'
217 | 			)
218 | 		}
219 | 
220 | 		let tokens = {}
221 | 
222 | 		// Generate a list of n-grams along with their respective occurrences
223 | 		// based on the models configured min/max values
224 | 		words.forEach((word, index) => {
225 | 			let sequence = ''
226 | 
227 | 			words.slice(index).forEach((nextWord) => {
228 | 				sequence += sequence ? ' ' + nextWord : nextWord
229 | 				let tokenCount = sequence.split(' ').length
230 | 
231 | 				if (
232 | 					tokenCount < this._model.nGramMin ||
233 | 					tokenCount > this._model.nGramMax
234 | 				) {
235 | 					return
236 | 				}
237 | 
238 | 				if (!Object.prototype.hasOwnProperty.call(tokens, sequence)) {
239 | 					tokens[sequence] = 0
240 | 				}
241 | 
242 | 				++tokens[sequence]
243 | 			})
244 | 		})
245 | 
246 | 		return tokens
247 | 	}
248 | 
249 | 	/**
250 | 	 * Convert a tokenized object into a new object with all keys (terms)
251 | 	 * translated to their index in the returned vocabulary (which is also
252 | 	 * returned along with the object, with any new terms added to the end)
253 | 	 *
254 | 	 * @param {Object} tokens
255 | 	 * @return {Object}
256 | 	 */
257 | 	vectorize(tokens) {
258 | 		if (Object.getPrototypeOf(tokens) !== Object.prototype) {
259 | 			throw new Error('tokens must be an object literal')
260 | 		}
261 | 
262 | 		/* istanbul ignore next */
263 | 		if (this._model.vocabulary === false) {
264 | 			throw new Error('Cannot vectorize tokens when vocabulary is false')
265 | 		}
266 | 
267 | 		const vector = {}
268 | 		const vocabulary = new Vocabulary(this._model.vocabulary.terms)
269 | 
270 | 		Object.keys(tokens).forEach((token) => {
271 | 			let vocabularyIndex = vocabulary.indexOf(token)
272 | 
273 | 			if (vocabularyIndex === -1) {
274 | 				vocabulary.add(token)
275 | 
276 | 				vocabularyIndex = vocabulary.size - 1
277 | 			}
278 | 
279 | 			vector[vocabularyIndex] = tokens[token]
280 | 		})
281 | 
282 | 		return {
283 | 			vector,
284 | 			vocabulary
285 | 		}
286 | 	}
287 | 
288 | 	/**
289 | 	 * Return the cosine similarity between two vectors
290 | 	 *
291 | 	 * @param {Object} v1
292 | 	 * @param {Object} v2
293 | 	 * @return {float}
294 | 	 */
295 | 	cosineSimilarity(v1, v2) {
296 | 		if (Object.getPrototypeOf(v1) !== Object.prototype) {
297 | 			throw new Error('v1 must be an object literal')
298 | 		}
299 | 		if (Object.getPrototypeOf(v2) !== Object.prototype) {
300 | 			throw new Error('v2 must be an object literal')
301 | 		}
302 | 
303 | 		let prod = 0.0
304 | 		let v1Norm = 0.0
305 | 
306 | 		Object.keys(v1).forEach((i) => {
307 | 			let xi = v1[i]
308 | 
309 | 			if (Object.prototype.hasOwnProperty.call(v2, i)) {
310 | 				prod += xi * v2[i]
311 | 			}
312 | 
313 | 			v1Norm += xi * xi
314 | 		})
315 | 
316 | 		v1Norm = Math.sqrt(v1Norm)
317 | 
318 | 		if (v1Norm === 0) {
319 | 			return 0
320 | 		}
321 | 
322 | 		let v2Norm = 0.0
323 | 
324 | 		Object.keys(v2).forEach((i) => {
325 | 			let xi = v2[i]
326 | 
327 | 			v2Norm += xi * xi
328 | 		})
329 | 
330 | 		v2Norm = Math.sqrt(v2Norm)
331 | 
332 | 		if (v2Norm === 0) {
333 | 			return 0
334 | 		}
335 | 
336 | 		return prod / (v1Norm * v2Norm)
337 | 	}
338 | }
339 | 
340 | export default Classifier
341 | 


--------------------------------------------------------------------------------
/src/Model.js:
--------------------------------------------------------------------------------
  1 | import Vocabulary from './Vocabulary'
  2 | 
  3 | /**
  4 |  * @param {Object} [config]
  5 |  * @param {int} [config.nGramMin=1] - Minimum n-gram size
  6 |  * @param {int} [config.nGramMax=1] - Maximum n-gram size
  7 |  * @param {(Array|Set|false)} [config.vocabulary=[]] - Terms mapped to indexes in the model data entries, set to false to store terms directly in the data entries
  8 |  * @param {Object} [config.data={}] - Key-value store containing all training data
  9 |  * @constructor
 10 |  */
 11 | class Model {
 12 | 	constructor(config = {}) {
 13 | 		if (Object.getPrototypeOf(config) !== Object.prototype) {
 14 | 			throw new Error('config must be an object literal')
 15 | 		}
 16 | 
 17 | 		config = {
 18 | 			nGramMin: 1,
 19 | 			nGramMax: 1,
 20 | 			vocabulary: [],
 21 | 			data: {},
 22 | 			...config
 23 | 		}
 24 | 
 25 | 		if (config.nGramMin !== parseInt(config.nGramMin, 10)) {
 26 | 			throw new Error('Config value nGramMin must be an integer')
 27 | 		}
 28 | 
 29 | 		if (config.nGramMax !== parseInt(config.nGramMax, 10)) {
 30 | 			throw new Error('Config value nGramMax must be an integer')
 31 | 		}
 32 | 
 33 | 		if (config.nGramMin < 1) {
 34 | 			throw new Error('Config value nGramMin must be at least 1')
 35 | 		}
 36 | 
 37 | 		if (config.nGramMax < 1) {
 38 | 			throw new Error('Config value nGramMax must be at least 1')
 39 | 		}
 40 | 
 41 | 		if (config.nGramMax < config.nGramMin) {
 42 | 			throw new Error('Invalid nGramMin/nGramMax combination in config')
 43 | 		}
 44 | 
 45 | 		if (
 46 | 			config.vocabulary !== false &&
 47 | 			!(config.vocabulary instanceof Vocabulary)
 48 | 		) {
 49 | 			config.vocabulary = new Vocabulary(config.vocabulary)
 50 | 		}
 51 | 
 52 | 		if (Object.getPrototypeOf(config.data) !== Object.prototype) {
 53 | 			throw new Error('Config value data must be an object literal')
 54 | 		}
 55 | 
 56 | 		this._nGramMin = config.nGramMin
 57 | 		this._nGramMax = config.nGramMax
 58 | 		this._vocabulary = config.vocabulary
 59 | 		this._data = { ...config.data }
 60 | 	}
 61 | 
 62 | 	/**
 63 | 	 * Minimum n-gram size
 64 | 	 *
 65 | 	 * @type {int}
 66 | 	 */
 67 | 	get nGramMin() {
 68 | 		return this._nGramMin
 69 | 	}
 70 | 
 71 | 	set nGramMin(size) {
 72 | 		if (size !== parseInt(size, 10)) {
 73 | 			throw new Error('nGramMin must be an integer')
 74 | 		}
 75 | 
 76 | 		this._nGramMin = size
 77 | 	}
 78 | 
 79 | 	/**
 80 | 	 * Maximum n-gram size
 81 | 	 *
 82 | 	 * @type {int}
 83 | 	 */
 84 | 	get nGramMax() {
 85 | 		return this._nGramMax
 86 | 	}
 87 | 
 88 | 	set nGramMax(size) {
 89 | 		if (size !== parseInt(size, 10)) {
 90 | 			throw new Error('nGramMax must be an integer')
 91 | 		}
 92 | 
 93 | 		this._nGramMax = size
 94 | 	}
 95 | 
 96 | 	/**
 97 | 	 * Vocabulary instance
 98 | 	 *
 99 | 	 * @type {(Vocabulary|false)}
100 | 	 */
101 | 	get vocabulary() {
102 | 		return this._vocabulary
103 | 	}
104 | 
105 | 	set vocabulary(vocabulary) {
106 | 		if (vocabulary !== false && !(vocabulary instanceof Vocabulary)) {
107 | 			vocabulary = new Vocabulary(vocabulary)
108 | 		}
109 | 
110 | 		this._vocabulary = vocabulary
111 | 	}
112 | 
113 | 	/**
114 | 	 * Model data
115 | 	 *
116 | 	 * @type {Object}
117 | 	 */
118 | 	get data() {
119 | 		return this._data
120 | 	}
121 | 
122 | 	set data(data) {
123 | 		if (!(data instanceof Object) || data.constructor !== Object) {
124 | 			throw new Error('data must be an object literal')
125 | 		}
126 | 
127 | 		this._data = { ...data }
128 | 	}
129 | 
130 | 	/**
131 | 	 * Return the model in its current state an an object literal, including the
132 | 	 * configured n-gram min/max values, the vocabulary as an array (if any,
133 | 	 * otherwise false), and an object literal with all the training data
134 | 	 *
135 | 	 * @return {Object}
136 | 	 */
137 | 	serialize() {
138 | 		return {
139 | 			nGramMin: this._nGramMin,
140 | 			nGramMax: this._nGramMax,
141 | 			vocabulary: Array.from(this._vocabulary.terms),
142 | 			data: this._data
143 | 		}
144 | 	}
145 | }
146 | 
147 | export default Model
148 | 


--------------------------------------------------------------------------------
/src/Prediction.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @param {Object} prediction
 3 |  * @constructor
 4 |  * @hideconstructor
 5 |  */
 6 | class Prediction {
 7 | 	constructor(prediction = {}) {
 8 | 		if (Object.getPrototypeOf(prediction) !== Object.prototype) {
 9 | 			throw new Error('prediction must be an object literal')
10 | 		}
11 | 
12 | 		prediction = {
13 | 			label: '',
14 | 			confidence: 0,
15 | 			...prediction
16 | 		}
17 | 
18 | 		this._label = prediction.label
19 | 		this._confidence = prediction.confidence
20 | 	}
21 | 
22 | 	/**
23 | 	 * Label of the prediction
24 | 	 *
25 | 	 * @type {string}
26 | 	 */
27 | 	get label() {
28 | 		return this._label
29 | 	}
30 | 
31 | 	set label(label) {
32 | 		if (typeof label !== 'string') {
33 | 			throw new Error('label must be a string')
34 | 		}
35 | 
36 | 		this._label = label
37 | 	}
38 | 
39 | 	/**
40 | 	 * Confidence of the prediction
41 | 	 *
42 | 	 * @type {number}
43 | 	 */
44 | 	get confidence() {
45 | 		return this._confidence
46 | 	}
47 | 
48 | 	set confidence(confidence) {
49 | 		if (typeof confidence !== 'number') {
50 | 			throw new Error('confidence must be a number')
51 | 		}
52 | 
53 | 		this._confidence = confidence
54 | 	}
55 | }
56 | 
57 | export default Prediction
58 | 


--------------------------------------------------------------------------------
/src/Vocabulary.js:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * @param {Array|Set} terms
  3 |  * @constructor
  4 |  */
  5 | class Vocabulary {
  6 | 	constructor(terms = []) {
  7 | 		if (!(terms instanceof Array) && !(terms instanceof Set)) {
  8 | 			throw new Error('terms must be either an Array or a Set')
  9 | 		}
 10 | 
 11 | 		this._terms = new Set(terms)
 12 | 	}
 13 | 
 14 | 	/**
 15 | 	 * Vocabulary size
 16 | 	 *
 17 | 	 * @type {number}
 18 | 	 */
 19 | 	get size() {
 20 | 		return this._terms.size
 21 | 	}
 22 | 
 23 | 	/**
 24 | 	 * Vocabulary terms
 25 | 	 *
 26 | 	 * @type {(Array|Set)}
 27 | 	 */
 28 | 	get terms() {
 29 | 		return this._terms
 30 | 	}
 31 | 
 32 | 	set terms(terms) {
 33 | 		if (!(terms instanceof Array) && !(terms instanceof Set)) {
 34 | 			throw new Error('terms must be either an Array or a Set')
 35 | 		}
 36 | 
 37 | 		this._terms = new Set(terms)
 38 | 	}
 39 | 
 40 | 	/**
 41 | 	 * Add one or more terms to the vocabulary
 42 | 	 *
 43 | 	 * @param {(string|Array|Set)} terms
 44 | 	 * @return {this}
 45 | 	 */
 46 | 	add(terms) {
 47 | 		if (
 48 | 			typeof terms !== 'string' &&
 49 | 			!(terms instanceof Array) &&
 50 | 			!(terms instanceof Set)
 51 | 		) {
 52 | 			throw new Error('terms must be either a string, Array or Set')
 53 | 		}
 54 | 
 55 | 		if (typeof terms === 'string') {
 56 | 			terms = [terms]
 57 | 		} else if (terms instanceof Set) {
 58 | 			terms = Array.from(terms)
 59 | 		}
 60 | 
 61 | 		terms.forEach((term) => {
 62 | 			this._terms.add(term)
 63 | 		})
 64 | 
 65 | 		return this
 66 | 	}
 67 | 
 68 | 	/**
 69 | 	 * Remove one or more terms from the vocabulary
 70 | 	 *
 71 | 	 * @param {(string|Array|Set)} terms
 72 | 	 * @return {this}
 73 | 	 */
 74 | 	remove(terms) {
 75 | 		if (
 76 | 			typeof terms !== 'string' &&
 77 | 			!(terms instanceof Array) &&
 78 | 			!(terms instanceof Set)
 79 | 		) {
 80 | 			throw new Error('terms must be either a string, Array or Set')
 81 | 		}
 82 | 
 83 | 		if (typeof terms === 'string') {
 84 | 			terms = [terms]
 85 | 		} else if (terms instanceof Set) {
 86 | 			terms = Array.from(terms)
 87 | 		}
 88 | 
 89 | 		terms.forEach((term) => {
 90 | 			this._terms.delete(term)
 91 | 		})
 92 | 
 93 | 		return this
 94 | 	}
 95 | 
 96 | 	/**
 97 | 	 * Return whether the vocabulary contains a certain term
 98 | 	 *
 99 | 	 * @param {string} term
100 | 	 * @return {bool}
101 | 	 */
102 | 	has(term) {
103 | 		return this._terms.has(term)
104 | 	}
105 | 
106 | 	/**
107 | 	 * Return the index of a term in the vocabulary (returns -1 if not found)
108 | 	 *
109 | 	 * @param {string} term
110 | 	 * @return {number}
111 | 	 */
112 | 	indexOf(term) {
113 | 		if (!this._terms.has(term)) {
114 | 			return -1
115 | 		}
116 | 
117 | 		return Array.from(this._terms).indexOf(term)
118 | 	}
119 | }
120 | 
121 | export default Vocabulary
122 | 


--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
1 | import Classifier from './Classifier'
2 | 
3 | export { default as Model } from './Model'
4 | export { default as Vocabulary } from './Vocabulary'
5 | export { default as Prediction } from './Prediction'
6 | export { Classifier as Classifier }
7 | 
8 | export default Classifier
9 | 


--------------------------------------------------------------------------------
/test/Classifier.test.js:
--------------------------------------------------------------------------------
  1 | import Classifier from '../src/Classifier'
  2 | import Model from '../src/Model'
  3 | 
  4 | describe('Classifier', () => {
  5 | 	describe('constructor', () => {
  6 | 		test('should set the model when passed a model instance', () => {
  7 | 			const classifier = new Classifier(
  8 | 				new Model({
  9 | 					nGramMax: 4
 10 | 				})
 11 | 			)
 12 | 
 13 | 			expect(classifier.model.nGramMax).toStrictEqual(4)
 14 | 		})
 15 | 
 16 | 		test('should set the model when passed an object literal', () => {
 17 | 			const classifier = new Classifier({
 18 | 				nGramMax: 5
 19 | 			})
 20 | 
 21 | 			expect(classifier.model.nGramMax).toStrictEqual(5)
 22 | 		})
 23 | 	})
 24 | 
 25 | 	describe('model', () => {
 26 | 		test('should return a model instance', () => {
 27 | 			let classifier = new Classifier()
 28 | 
 29 | 			expect(classifier.model).toBeInstanceOf(Model)
 30 | 		})
 31 | 
 32 | 		test('should set the current model when passed a model instance', () => {
 33 | 			let classifier = new Classifier()
 34 | 
 35 | 			classifier.model = new Model({
 36 | 				nGramMax: 3
 37 | 			})
 38 | 
 39 | 			expect(classifier.model.nGramMax).toStrictEqual(3)
 40 | 		})
 41 | 
 42 | 		test('should set the current model to a new model instance when passed an object literal', () => {
 43 | 			let classifier = new Classifier()
 44 | 
 45 | 			classifier.model = {}
 46 | 
 47 | 			expect(classifier.model).toBeInstanceOf(Model)
 48 | 		})
 49 | 	})
 50 | 
 51 | 	describe('splitWords', () => {
 52 | 		test('should throw an error if input is not a string', () => {
 53 | 			const classifier = new Classifier()
 54 | 
 55 | 			expect(() => classifier.splitWords(1)).toThrow(Error)
 56 | 		})
 57 | 
 58 | 		test('should split a string into an array of words', () => {
 59 | 			const classifier = new Classifier()
 60 | 
 61 | 			expect(classifier.splitWords('Hello world!')).toStrictEqual([
 62 | 				'hello',
 63 | 				'world'
 64 | 			])
 65 | 		})
 66 | 	})
 67 | 
 68 | 	describe('tokenize', () => {
 69 | 		test('should throw an error if input is neither a string or array', () => {
 70 | 			const classifier = new Classifier()
 71 | 
 72 | 			expect(() => classifier.tokenize({})).toThrow(Error)
 73 | 		})
 74 | 
 75 | 		test('should throw an error if nGramMax is less than nGramMin in model config', () => {
 76 | 			const classifier = new Classifier()
 77 | 
 78 | 			classifier.model.nGramMin = 2
 79 | 
 80 | 			expect(() => classifier.tokenize('Hello world!')).toThrow(Error)
 81 | 		})
 82 | 
 83 | 		test('should return an object literal of tokens and their occurrences from a string', () => {
 84 | 			const classifier = new Classifier()
 85 | 
 86 | 			expect(classifier.tokenize('Hello world!')).toStrictEqual({
 87 | 				hello: 1,
 88 | 				world: 1
 89 | 			})
 90 | 		})
 91 | 
 92 | 		test('should return an object literal of tokens and their occurrences from a string', () => {
 93 | 			const classifier = new Classifier()
 94 | 
 95 | 			expect(classifier.tokenize('Hello world!')).toStrictEqual({
 96 | 				hello: 1,
 97 | 				world: 1
 98 | 			})
 99 | 		})
100 | 
101 | 		test('should return an object literal of tokens and their occurrences from a array', () => {
102 | 			const classifier = new Classifier()
103 | 
104 | 			expect(classifier.tokenize(['hello', 'world'])).toStrictEqual({
105 | 				hello: 1,
106 | 				world: 1
107 | 			})
108 | 		})
109 | 
110 | 		test('should return an object literal of bigrams when nGramMin/nGramMax is 2', () => {
111 | 			const classifier = new Classifier({
112 | 				nGramMin: 2,
113 | 				nGramMax: 2
114 | 			})
115 | 
116 | 			expect(classifier.tokenize('Hello world!')).toStrictEqual({
117 | 				'hello world': 1
118 | 			})
119 | 		})
120 | 
121 | 		test('should return an object literal of unigrams and bigrams when nGramMin/nGramMax is 1/2', () => {
122 | 			const classifier = new Classifier({
123 | 				nGramMin: 1,
124 | 				nGramMax: 2
125 | 			})
126 | 
127 | 			expect(classifier.tokenize('Hello world!')).toStrictEqual({
128 | 				hello: 1,
129 | 				'hello world': 1,
130 | 				world: 1
131 | 			})
132 | 		})
133 | 
134 | 		test('should increment the occurrence of the duplicate tokens', () => {
135 | 			const classifier = new Classifier()
136 | 
137 | 			expect(classifier.tokenize('Hello hello!')).toStrictEqual({
138 | 				hello: 2
139 | 			})
140 | 		})
141 | 	})
142 | 
143 | 	describe('vectorize', () => {
144 | 		test('should throw an error if input is not an object literal', () => {
145 | 			const classifier = new Classifier()
146 | 
147 | 			expect(() => classifier.vectorize([])).toThrow(Error)
148 | 		})
149 | 
150 | 		test('should throw an error if vocabulary config option is set to false', () => {
151 | 			const classifier = new Classifier({
152 | 				vocabulary: false
153 | 			})
154 | 
155 | 			expect(() => classifier.vectorize({ hello: 1 })).toThrow(Error)
156 | 		})
157 | 
158 | 		test('should convert key to its corresponding vocabulary term index', () => {
159 | 			const classifier = new Classifier()
160 | 			const tokens = classifier.tokenize('Hello')
161 | 
162 | 			const { vector } = classifier.vectorize(tokens)
163 | 
164 | 			expect(vector).toStrictEqual({ 0: 1 })
165 | 		})
166 | 
167 | 		test('should use existing term index when token is already in vocabulary', () => {
168 | 			const classifier = new Classifier({
169 | 				vocabulary: ['hello', 'world']
170 | 			})
171 | 
172 | 			const tokens = classifier.tokenize('world')
173 | 
174 | 			const { vector } = classifier.vectorize(tokens)
175 | 
176 | 			expect(vector).toStrictEqual({ 1: 1 })
177 | 		})
178 | 
179 | 		test('should return an updated copy of the vocabulary', () => {
180 | 			const classifier = new Classifier()
181 | 
182 | 			const tokens = classifier.tokenize('Hello world')
183 | 
184 | 			const { vocabulary } = classifier.vectorize(tokens)
185 | 
186 | 			const terms = vocabulary.terms
187 | 
188 | 			expect(Array.from(terms)).toStrictEqual(['hello', 'world'])
189 | 		})
190 | 	})
191 | 
192 | 	describe('train', () => {
193 | 		test('should throw an error if input is not a string or array', () => {
194 | 			const classifier = new Classifier()
195 | 
196 | 			expect(() => classifier.train({}, 'test')).toThrow(Error)
197 | 		})
198 | 
199 | 		test('should throw an error if label is not a string', () => {
200 | 			const classifier = new Classifier()
201 | 
202 | 			expect(() => classifier.train('test', [])).toThrow(Error)
203 | 		})
204 | 
205 | 		test('should add tokens to the vocabulary (if not configured to false)', () => {
206 | 			const classifier = new Classifier()
207 | 
208 | 			classifier.train('hello world', 'test')
209 | 
210 | 			const vocabulary = classifier.model.vocabulary
211 | 
212 | 			expect(vocabulary.size).toStrictEqual(2)
213 | 		})
214 | 
215 | 		test('should add tokens (and their occurrences) to the model from a string', () => {
216 | 			const classifier = new Classifier()
217 | 
218 | 			classifier.train('hello world', 'test')
219 | 
220 | 			const model = classifier.model
221 | 
222 | 			expect(model.data).toStrictEqual({
223 | 				test: { 0: 1, 1: 1 }
224 | 			})
225 | 		})
226 | 
227 | 		test('should add tokens (and their occurrences) to the model from an array of strings', () => {
228 | 			const classifier = new Classifier()
229 | 
230 | 			classifier.train(['hello world', 'foo', 'bar'], 'test')
231 | 
232 | 			const model = classifier.model
233 | 
234 | 			expect(model.data).toStrictEqual({
235 | 				test: { 0: 1, 1: 1, 2: 1, 3: 1 }
236 | 			})
237 | 		})
238 | 
239 | 		test('should increment the occurrence of an existing vocabulary term', () => {
240 | 			const classifier = new Classifier()
241 | 
242 | 			classifier.train(['hello world', 'foo', 'hello'], 'test')
243 | 
244 | 			const model = classifier.model
245 | 
246 | 			expect(model.data).toStrictEqual({
247 | 				test: { 0: 2, 1: 1, 2: 1 }
248 | 			})
249 | 		})
250 | 
251 | 		test('should return classifier instance', () => {
252 | 			const classifier = new Classifier()
253 | 
254 | 			expect(classifier.train('hello world', 'test')).toStrictEqual(
255 | 				classifier
256 | 			)
257 | 		})
258 | 	})
259 | 
260 | 	describe('cosineSimilarity', () => {
261 | 		test('should throw an error if v1 is not an object literal', () => {
262 | 			const classifier = new Classifier()
263 | 
264 | 			expect(() => classifier.cosineSimilarity(false, {})).toThrow(Error)
265 | 		})
266 | 
267 | 		test('should throw an error if v2 is not an object literal', () => {
268 | 			const classifier = new Classifier()
269 | 
270 | 			expect(() => classifier.cosineSimilarity({}, false)).toThrow(Error)
271 | 		})
272 | 
273 | 		test('should return 1 on identical object literals', () => {
274 | 			const classifier = new Classifier()
275 | 
276 | 			expect(
277 | 				classifier.cosineSimilarity(
278 | 					{
279 | 						0: 1
280 | 					},
281 | 					{
282 | 						0: 1
283 | 					}
284 | 				)
285 | 			).toStrictEqual(1)
286 | 		})
287 | 
288 | 		test('should return 0 on object literals with no similarity', () => {
289 | 			const classifier = new Classifier()
290 | 
291 | 			expect(
292 | 				classifier.cosineSimilarity(
293 | 					{
294 | 						0: 1
295 | 					},
296 | 					{
297 | 						1: 1
298 | 					}
299 | 				)
300 | 			).toStrictEqual(0)
301 | 		})
302 | 
303 | 		test('should return > 0 on similar object literals', () => {
304 | 			const classifier = new Classifier()
305 | 
306 | 			expect(
307 | 				classifier.cosineSimilarity(
308 | 					{
309 | 						0: 1,
310 | 						1: 1
311 | 					},
312 | 					{
313 | 						0: 1,
314 | 						2: 1
315 | 					}
316 | 				)
317 | 			).toBeGreaterThan(0)
318 | 		})
319 | 
320 | 		test('should return 0 when sum of v1 is 0', () => {
321 | 			const classifier = new Classifier()
322 | 
323 | 			expect(
324 | 				classifier.cosineSimilarity(
325 | 					{
326 | 						0: 0
327 | 					},
328 | 					{
329 | 						0: 1
330 | 					}
331 | 				)
332 | 			).toStrictEqual(0)
333 | 		})
334 | 
335 | 		test('should return 0 when sum of v2 is 0', () => {
336 | 			const classifier = new Classifier()
337 | 
338 | 			expect(
339 | 				classifier.cosineSimilarity(
340 | 					{
341 | 						0: 1
342 | 					},
343 | 					{
344 | 						0: 0
345 | 					}
346 | 				)
347 | 			).toStrictEqual(0)
348 | 		})
349 | 	})
350 | 
351 | 	describe('predict', () => {
352 | 		test('should throw an error if input is not a string', () => {
353 | 			const classifier = new Classifier()
354 | 
355 | 			expect(() => classifier.predict([])).toThrow(Error)
356 | 		})
357 | 
358 | 		test('should throw an error if maxMatches is not a number', () => {
359 | 			const classifier = new Classifier()
360 | 
361 | 			expect(() => classifier.predict('', 'test')).toThrow(Error)
362 | 		})
363 | 
364 | 		test('should throw an error if minimumConfidence is not a number', () => {
365 | 			const classifier = new Classifier()
366 | 
367 | 			expect(() => classifier.predict('', undefined, 'test')).toThrow(
368 | 				Error
369 | 			)
370 | 		})
371 | 
372 | 		test('should throw an error if minimumConfidence is lower than 0', () => {
373 | 			const classifier = new Classifier()
374 | 
375 | 			expect(() => classifier.predict('', undefined, -1)).toThrow(Error)
376 | 		})
377 | 
378 | 		test('should throw an error if minimumConfidence is higher than 1', () => {
379 | 			const classifier = new Classifier()
380 | 
381 | 			expect(() => classifier.predict('', undefined, 2)).toThrow(Error)
382 | 		})
383 | 
384 | 		test('should return an array', () => {
385 | 			const classifier = new Classifier()
386 | 
387 | 			expect(classifier.predict('test')).toBeInstanceOf(Array)
388 | 		})
389 | 
390 | 		test('should return one prediction when trained with a sample', () => {
391 | 			const classifier = new Classifier()
392 | 
393 | 			classifier.train('hello world', 'test')
394 | 
395 | 			expect(classifier.predict('hello world').length).toStrictEqual(1)
396 | 		})
397 | 
398 | 		test('should not include predictions with a confidence below the configured minimumConfidence', () => {
399 | 			const classifier = new Classifier()
400 | 
401 | 			classifier.train('hello world', 'test')
402 | 
403 | 			const minimumConfidence = 0.8
404 | 
405 | 			const predictions = classifier.predict(
406 | 				'hello',
407 | 				undefined,
408 | 				minimumConfidence
409 | 			)
410 | 
411 | 			expect(
412 | 				predictions.filter((prediction) => {
413 | 					return prediction.confidence < minimumConfidence
414 | 				}).length
415 | 			).toStrictEqual(0)
416 | 		})
417 | 
418 | 		test('should not update the model vocabulary', () => {
419 | 			const classifier = new Classifier()
420 | 
421 | 			classifier.train('hello world', 'test')
422 | 			classifier.predict('hello foo world')
423 | 
424 | 			expect(classifier.model.vocabulary.has('foo')).toStrictEqual(false)
425 | 		})
426 | 	})
427 | })
428 | 


--------------------------------------------------------------------------------
/test/Model.test.js:
--------------------------------------------------------------------------------
  1 | import Model from '../src/Model'
  2 | import Vocabulary from '../src/Vocabulary'
  3 | 
  4 | describe('Model', () => {
  5 | 	describe('constructor', () => {
  6 | 		test('should throw an error if config is not an object literal', () => {
  7 | 			expect(() => new Model([])).toThrow(Error)
  8 | 		})
  9 | 
 10 | 		test('should throw an error if config option nGramMin is not a number', () => {
 11 | 			expect(
 12 | 				() =>
 13 | 					new Model({
 14 | 						nGramMin: ''
 15 | 					})
 16 | 			).toThrow(Error)
 17 | 		})
 18 | 
 19 | 		test('should throw an error if config option nGramMax is not a number', () => {
 20 | 			expect(
 21 | 				() =>
 22 | 					new Model({
 23 | 						nGramMax: ''
 24 | 					})
 25 | 			).toThrow(Error)
 26 | 		})
 27 | 
 28 | 		test('should throw an error if config option nGramMin is less than 1', () => {
 29 | 			expect(
 30 | 				() =>
 31 | 					new Model({
 32 | 						nGramMin: 0
 33 | 					})
 34 | 			).toThrow(Error)
 35 | 		})
 36 | 
 37 | 		test('should throw an error if config option nGramMax is less than 1', () => {
 38 | 			expect(
 39 | 				() =>
 40 | 					new Model({
 41 | 						nGramMax: 0
 42 | 					})
 43 | 			).toThrow(Error)
 44 | 		})
 45 | 
 46 | 		test('should throw an error if config option nGramMax is less than nGramMin', () => {
 47 | 			expect(
 48 | 				() =>
 49 | 					new Model({
 50 | 						nGramMin: 2,
 51 | 						nGramMax: 1
 52 | 					})
 53 | 			).toThrow(Error)
 54 | 		})
 55 | 
 56 | 		test('should throw an error if data is not an object literal', () => {
 57 | 			expect(
 58 | 				() =>
 59 | 					new Model({
 60 | 						data: []
 61 | 					})
 62 | 			).toThrow(Error)
 63 | 		})
 64 | 	})
 65 | 
 66 | 	describe('nGramMin', () => {
 67 | 		test('should return a number', () => {
 68 | 			const model = new Model()
 69 | 
 70 | 			expect(typeof model.nGramMin).toStrictEqual('number')
 71 | 		})
 72 | 
 73 | 		test('should return the current nGramMin value', () => {
 74 | 			const model = new Model({
 75 | 				nGramMin: 3,
 76 | 				nGramMax: 4
 77 | 			})
 78 | 
 79 | 			expect(model.nGramMin).toStrictEqual(3)
 80 | 		})
 81 | 
 82 | 		test('should set the nGramMin value', () => {
 83 | 			const model = new Model()
 84 | 
 85 | 			model.nGramMin = 2
 86 | 
 87 | 			expect(model.nGramMin).toStrictEqual(2)
 88 | 		})
 89 | 
 90 | 		test('should throw an error if size is not an integer', () => {
 91 | 			const model = new Model()
 92 | 
 93 | 			expect(() => {
 94 | 				model.nGramMin = 1.1
 95 | 			}).toThrow(Error)
 96 | 		})
 97 | 	})
 98 | 
 99 | 	describe('nGramMax', () => {
100 | 		test('should return a number', () => {
101 | 			const model = new Model()
102 | 
103 | 			expect(typeof model.nGramMax).toStrictEqual('number')
104 | 		})
105 | 
106 | 		test('should return the current nGramMax value', () => {
107 | 			const model = new Model({
108 | 				nGramMax: 2
109 | 			})
110 | 
111 | 			expect(model.nGramMax).toStrictEqual(2)
112 | 		})
113 | 
114 | 		test('should set the nGramMax value', () => {
115 | 			const model = new Model()
116 | 
117 | 			model.nGramMax = 3
118 | 
119 | 			expect(model.nGramMax).toStrictEqual(3)
120 | 		})
121 | 
122 | 		test('should throw an error if size is not an integer', () => {
123 | 			const model = new Model()
124 | 
125 | 			expect(() => {
126 | 				model.nGramMax = 1.1
127 | 			}).toThrow(Error)
128 | 		})
129 | 	})
130 | 
131 | 	describe('vocabulary', () => {
132 | 		test('should return a vocabulary instance', () => {
133 | 			const model = new Model()
134 | 
135 | 			expect(model.vocabulary).toBeInstanceOf(Vocabulary)
136 | 		})
137 | 
138 | 		test('should return false when vocabulary is configured to false', () => {
139 | 			const model = new Model({
140 | 				vocabulary: false
141 | 			})
142 | 
143 | 			expect(model.vocabulary).toStrictEqual(false)
144 | 		})
145 | 
146 | 		test('should set the vocabulary value when passing an array', () => {
147 | 			const model = new Model()
148 | 
149 | 			model.vocabulary = ['hello', 'world']
150 | 
151 | 			expect(Array.from(model.vocabulary.terms)).toStrictEqual([
152 | 				'hello',
153 | 				'world'
154 | 			])
155 | 		})
156 | 
157 | 		test('should set the vocabulary value when passing false', () => {
158 | 			const model = new Model()
159 | 
160 | 			model.vocabulary = false
161 | 
162 | 			expect(model.vocabulary).toStrictEqual(false)
163 | 		})
164 | 	})
165 | 
166 | 	describe('data', () => {
167 | 		test('should return an object literal', () => {
168 | 			const model = new Model()
169 | 
170 | 			expect(model.data).toStrictEqual({})
171 | 		})
172 | 
173 | 		test('should set the model data', () => {
174 | 			const model = new Model()
175 | 
176 | 			model.data = {
177 | 				test: { 0: 1 }
178 | 			}
179 | 
180 | 			expect(model.data).toStrictEqual({
181 | 				test: { 0: 1 }
182 | 			})
183 | 		})
184 | 
185 | 		test('should throw an error if data is not an object literal', () => {
186 | 			const model = new Model()
187 | 
188 | 			expect(() => {
189 | 				model.data = []
190 | 			}).toThrow(Error)
191 | 		})
192 | 	})
193 | 
194 | 	describe('serialize', () => {
195 | 		test('should return an object literal created from the current model', () => {
196 | 			const model = new Model()
197 | 
198 | 			expect(model.serialize()).toStrictEqual({
199 | 				nGramMin: 1,
200 | 				nGramMax: 1,
201 | 				vocabulary: [],
202 | 				data: {}
203 | 			})
204 | 		})
205 | 	})
206 | })
207 | 


--------------------------------------------------------------------------------
/test/Prediction.test.js:
--------------------------------------------------------------------------------
 1 | import Prediction from '../src/Prediction'
 2 | 
 3 | describe('Prediction', () => {
 4 | 	describe('constructor', () => {
 5 | 		test('should throw an error if prediction is not an object literal', () => {
 6 | 			expect(() => new Prediction([])).toThrow(Error)
 7 | 		})
 8 | 	})
 9 | 
10 | 	describe('label', () => {
11 | 		test('should throw an error if label is not a string', () => {
12 | 			const prediction = new Prediction()
13 | 
14 | 			expect(() => {
15 | 				prediction.label = []
16 | 			}).toThrow(Error)
17 | 		})
18 | 
19 | 		test('should return a string', () => {
20 | 			const prediction = new Prediction()
21 | 
22 | 			expect(typeof prediction.label).toStrictEqual('string')
23 | 		})
24 | 
25 | 		test('should return the defined prediction label', () => {
26 | 			const prediction = new Prediction({
27 | 				label: 'test'
28 | 			})
29 | 
30 | 			expect(prediction.label).toStrictEqual('test')
31 | 		})
32 | 
33 | 		test('should set the prediction label', () => {
34 | 			const prediction = new Prediction()
35 | 
36 | 			prediction.label = 'test'
37 | 
38 | 			expect(prediction.label).toStrictEqual('test')
39 | 		})
40 | 	})
41 | 
42 | 	describe('confidence', () => {
43 | 		test('should throw an error if confidence is not a number', () => {
44 | 			const prediction = new Prediction()
45 | 
46 | 			expect(() => {
47 | 				prediction.confidence = 'test'
48 | 			}).toThrow(Error)
49 | 		})
50 | 
51 | 		test('should return a number', () => {
52 | 			const prediction = new Prediction()
53 | 
54 | 			expect(typeof prediction.confidence).toStrictEqual('number')
55 | 		})
56 | 
57 | 		test('should return the defined prediction confidence', () => {
58 | 			const prediction = new Prediction({
59 | 				confidence: 0.5
60 | 			})
61 | 
62 | 			expect(prediction.confidence).toBeCloseTo(0.5)
63 | 		})
64 | 
65 | 		test('should set the prediction confidence', () => {
66 | 			const prediction = new Prediction()
67 | 
68 | 			prediction.confidence = 1
69 | 
70 | 			expect(prediction.confidence).toStrictEqual(1)
71 | 		})
72 | 	})
73 | })
74 | 


--------------------------------------------------------------------------------
/test/Vocabulary.test.js:
--------------------------------------------------------------------------------
  1 | import Vocabulary from '../src/Vocabulary'
  2 | 
  3 | describe('Vocabulary', () => {
  4 | 	describe('constructor', () => {
  5 | 		test('should throw an error if terms is not an array or set', () => {
  6 | 			expect(() => new Vocabulary({})).toThrow(Error)
  7 | 		})
  8 | 	})
  9 | 
 10 | 	describe('size', () => {
 11 | 		test('should return a number', () => {
 12 | 			const vocabulary = new Vocabulary()
 13 | 
 14 | 			expect(typeof vocabulary.size).toStrictEqual('number')
 15 | 		})
 16 | 
 17 | 		test('should return the vocabulary size', () => {
 18 | 			const vocabulary = new Vocabulary(['hello'])
 19 | 
 20 | 			expect(vocabulary.size).toStrictEqual(1)
 21 | 		})
 22 | 	})
 23 | 
 24 | 	describe('terms', () => {
 25 | 		test('should return a set instance', () => {
 26 | 			const vocabulary = new Vocabulary()
 27 | 
 28 | 			expect(vocabulary.terms).toBeInstanceOf(Set)
 29 | 		})
 30 | 
 31 | 		test('should return the vocabulary terms', () => {
 32 | 			const vocabulary = new Vocabulary(['hello'])
 33 | 
 34 | 			expect(Array.from(vocabulary.terms)).toStrictEqual(['hello'])
 35 | 		})
 36 | 
 37 | 		test('should set the vocabulary terms from an array', () => {
 38 | 			const vocabulary = new Vocabulary()
 39 | 
 40 | 			vocabulary.terms = ['hello', 'world']
 41 | 
 42 | 			expect(Array.from(vocabulary.terms)).toStrictEqual([
 43 | 				'hello',
 44 | 				'world'
 45 | 			])
 46 | 		})
 47 | 
 48 | 		test('should set the vocabulary terms from a set', () => {
 49 | 			const vocabulary = new Vocabulary()
 50 | 
 51 | 			vocabulary.terms = new Set(['hello', 'world'])
 52 | 
 53 | 			expect(Array.from(vocabulary.terms)).toStrictEqual([
 54 | 				'hello',
 55 | 				'world'
 56 | 			])
 57 | 		})
 58 | 
 59 | 		test('should throw an error if terms is not an array or set', () => {
 60 | 			const vocabulary = new Vocabulary()
 61 | 
 62 | 			expect(() => {
 63 | 				vocabulary.terms = {}
 64 | 			}).toThrow(Error)
 65 | 		})
 66 | 	})
 67 | 
 68 | 	describe('add', () => {
 69 | 		test('should throw an error if terms is not a string, array or set', () => {
 70 | 			const vocabulary = new Vocabulary()
 71 | 
 72 | 			expect(() => vocabulary.add({})).toThrow(Error)
 73 | 		})
 74 | 
 75 | 		test('should add a term to the vocabulary from a string', () => {
 76 | 			const vocabulary = new Vocabulary()
 77 | 
 78 | 			vocabulary.add('test')
 79 | 
 80 | 			expect(Array.from(vocabulary.terms)).toStrictEqual(['test'])
 81 | 		})
 82 | 
 83 | 		test('should add terms to the vocabulary from an array', () => {
 84 | 			const vocabulary = new Vocabulary()
 85 | 
 86 | 			vocabulary.add(['hello', 'world'])
 87 | 
 88 | 			expect(Array.from(vocabulary.terms)).toStrictEqual([
 89 | 				'hello',
 90 | 				'world'
 91 | 			])
 92 | 		})
 93 | 
 94 | 		test('should add terms to the vocabulary from a set', () => {
 95 | 			const vocabulary = new Vocabulary()
 96 | 
 97 | 			vocabulary.add(new Set(['hello', 'world']))
 98 | 
 99 | 			expect(Array.from(vocabulary.terms)).toStrictEqual([
100 | 				'hello',
101 | 				'world'
102 | 			])
103 | 		})
104 | 
105 | 		test('should return vocabulary instance', () => {
106 | 			const vocabulary = new Vocabulary()
107 | 
108 | 			expect(vocabulary.add('test')).toBeInstanceOf(Vocabulary)
109 | 		})
110 | 	})
111 | 
112 | 	describe('remove', () => {
113 | 		test('should throw an error if terms is not a string, array or set', () => {
114 | 			const vocabulary = new Vocabulary()
115 | 
116 | 			expect(() => vocabulary.remove({})).toThrow(Error)
117 | 		})
118 | 
119 | 		test('should remove a term to the vocabulary when called with a string', () => {
120 | 			const vocabulary = new Vocabulary(['test'])
121 | 
122 | 			vocabulary.remove('test')
123 | 
124 | 			expect(Array.from(vocabulary.terms)).toStrictEqual([])
125 | 		})
126 | 
127 | 		test('should remove terms from the vocabulary when called with an array', () => {
128 | 			const vocabulary = new Vocabulary(['hello', 'world'])
129 | 
130 | 			vocabulary.remove(['world'])
131 | 
132 | 			expect(Array.from(vocabulary.terms)).toStrictEqual(['hello'])
133 | 		})
134 | 
135 | 		test('should remove terms from the vocabulary when called with a set', () => {
136 | 			const vocabulary = new Vocabulary(['hello', 'world'])
137 | 
138 | 			vocabulary.remove(new Set(['world']))
139 | 
140 | 			expect(Array.from(vocabulary.terms)).toStrictEqual(['hello'])
141 | 		})
142 | 
143 | 		test('should return a vocabulary instance', () => {
144 | 			const vocabulary = new Vocabulary(['test'])
145 | 
146 | 			expect(vocabulary.remove('test')).toBeInstanceOf(Vocabulary)
147 | 		})
148 | 	})
149 | 
150 | 	describe('has', () => {
151 | 		test('should return a boolean', () => {
152 | 			const vocabulary = new Vocabulary()
153 | 
154 | 			expect(typeof vocabulary.has('test')).toStrictEqual('boolean')
155 | 		})
156 | 
157 | 		test('should return whether a term exists in the vocabulary', () => {
158 | 			const vocabulary = new Vocabulary(['test'])
159 | 
160 | 			expect(vocabulary.has('test')).toStrictEqual(true)
161 | 		})
162 | 	})
163 | 
164 | 	describe('indexOf', () => {
165 | 		test('should return the index of an existing vocabulary term', () => {
166 | 			const vocabulary = new Vocabulary(['test'])
167 | 
168 | 			expect(vocabulary.indexOf('test')).toStrictEqual(0)
169 | 		})
170 | 
171 | 		test('should return -1 for non-existing vocabulary terms', () => {
172 | 			const vocabulary = new Vocabulary()
173 | 
174 | 			expect(vocabulary.indexOf('test')).toStrictEqual(-1)
175 | 		})
176 | 	})
177 | })
178 | 


--------------------------------------------------------------------------------
/webpack.config.js:
--------------------------------------------------------------------------------
 1 | require('core-js/stable')
 2 | require('regenerator-runtime/runtime')
 3 | 
 4 | const path = require('path')
 5 | 
 6 | module.exports = {
 7 | 	entry: {
 8 | 		'index': './src/index.js'
 9 | 	},
10 | 	output: {
11 | 		path: path.resolve(__dirname, 'lib'),
12 | 		filename: 'index.js',
13 | 		libraryTarget: 'umd',
14 | 		globalObject: 'this'
15 | 	},
16 | 	module: {
17 | 		rules: [
18 | 			{
19 | 				test: /\.js$/,
20 | 				exclude: /node_modules/,
21 | 				use: {
22 | 					loader: 'babel-loader',
23 | 					options: {
24 | 						plugins: [
25 | 							['@babel/plugin-transform-runtime', {
26 | 								corejs: 3,
27 | 							}]
28 | 						],
29 | 						presets: ['@babel/preset-env']
30 | 					}
31 | 				}
32 | 			}
33 | 		]
34 | 	}
35 | }
36 | 


--------------------------------------------------------------------------------