├── .babelrc ├── .editorconfig ├── .eslintrc.js ├── .gitignore ├── .npmignore ├── .prettierrc ├── .vscode ├── extensions.json └── settings.json ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── Classifier.md ├── Model.md ├── Prediction.md ├── README.md └── Vocabulary.md ├── package.json ├── src ├── Classifier.js ├── Model.js ├── Prediction.js ├── Vocabulary.js └── index.js ├── test ├── Classifier.test.js ├── Model.test.js ├── Prediction.test.js └── Vocabulary.test.js └── webpack.config.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "test": { 4 | "plugins": ["@babel/plugin-transform-modules-commonjs"] 5 | }, 6 | "development": { 7 | "presets": [["@babel/env"]], 8 | "plugins": ["add-module-exports"] 9 | }, 10 | "production": { 11 | "presets": [["@babel/env"], "minify"], 12 | "plugins": ["add-module-exports"] 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig helps developers define and maintain 2 | # consistent coding styles between different editors and IDEs. 
3 | 4 | root = true 5 | 6 | [*] 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | indent_style = tab 12 | 13 | [*.md] 14 | trim_trailing_whitespace = false 15 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | env: { 3 | browser: true, 4 | node: true, 5 | es2021: true, 6 | jest: true 7 | }, 8 | extends: ['eslint:recommended'], 9 | parserOptions: { 10 | ecmaVersion: 12, 11 | sourceType: 'module' 12 | }, 13 | rules: { 14 | quotes: ['error', 'single', { avoidEscape: true }], 15 | semi: ['error', 'never'], 16 | indent: 'off', 17 | 'no-mixed-spaces-and-tabs': ['warn', 'smart-tabs'], 18 | 'linebreak-style': ['error', 'unix'], 19 | 'no-unused-vars': 'warn' 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | *.log 3 | npm-debug.log* 4 | pids 5 | *.pid 6 | *.seed 7 | lib-cov 8 | coverage 9 | .nyc_output 10 | node_modules 11 | jspm_packages 12 | .npm 13 | .node_repl_history 14 | .idea 15 | lib 16 | package-lock.json 17 | .DS_Store 18 | Thumbs.db 19 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | *.log 2 | npm-debug.log* 3 | coverage 4 | docs 5 | .vscode 6 | .nyc_output 7 | node_modules 8 | package-lock.json 9 | yarn.lock 10 | src 11 | test 12 | CONTRIBUTING.md 13 | .editorconfig 14 | .eslintrc.js 15 | .vscode 16 | .babelrc 17 | webpack.config.js 18 | .gitignore 19 | .DS_Store 20 | Thumb.db 21 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | 
"trailingComma": "none", 3 | "tabWidth": 4, 4 | "useTabs": true, 5 | "semi": false, 6 | "singleQuote": true 7 | } 8 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": ["esbenp.prettier-vscode"] 3 | } 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "files.insertFinalNewline": true, 4 | "editor.defaultFormatter": "esbenp.prettier-vscode", 5 | "prettier.useTabs": true 6 | } 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | ## [2.0.1] - 2023-02-05 6 | 7 | ### Changed 8 | 9 | - Fixed all instances of improper object literal type checks 10 | - Fixed bug where terms were added to the model vocabulary when making predictions 11 | - Migrated tests from Mocha/Chai to Jest 12 | 13 | ## [2.0.0] - 2020-08-28 14 | 15 | ### Breaking changes 16 | 17 | - Removed `minimumConfidence` from `Model` 18 | 19 | ## [1.0.0] - 2020-08-26 20 | 21 | Initial release 22 | 23 | [2.0.1]: https://github.com/andreekeberg/ml-classify-text-js/releases/tag/2.0.1 24 | [2.0.0]: https://github.com/andreekeberg/ml-classify-text-js/releases/tag/2.0.0 25 | [1.0.0]: https://github.com/andreekeberg/ml-classify-text-js/releases/tag/1.0.0 26 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to ClassifyText 2 | 3 | This document contains basic guidelines to make contributing to this project 
as easy and transparent as possible, whether it's: 4 | 5 | - Reporting a bug 6 | - Discussing the current state of the code 7 | - Submitting a fix 8 | - Proposing new features 9 | - Becoming a maintainer 10 | 11 | ## Pull requests are actively welcomed 12 | 13 | 1. Fork the repo and create your branch from `master`. 14 | 2. If you've added code that should be tested, add tests. 15 | 3. If you've changed APIs, update the documentation. 16 | 4. Make sure your code lints. 17 | 5. Issue your pull request. 18 | 19 | ## Any contributions you make will be under the MIT Software License 20 | 21 | In short, when you submit code changes, your submissions are understood to be under the same [MIT License](http://choosealicense.com/licenses/mit/) that covers the project. 22 | 23 | ## Report bugs using [issues](https://github.com/andreekeberg/ml-classify-text-js/issues) 24 | 25 | All bugs are tracked using GitHub issues. Report a bug by [opening a new issue](https://github.com/andreekeberg/ml-classify-text-js/issues/new); it's that easy! 26 | 27 | ## Write bug reports with detail, background, and sample code 28 | 29 | **Great bug reports** tend to have: 30 | 31 | - A quick summary and/or background 32 | - Steps to reproduce 33 | - Be specific! 34 | - Give sample code if you can. 35 | - What you expected would happen 36 | - What actually happens 37 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 38 | 39 | ## License 40 | 41 | By contributing, you agree that your contributions will be licensed under the project's MIT License. 
42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020-2023 André Ekeberg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 📄 ClassifyText (JS) 2 | 3 | [![Version](https://img.shields.io/npm/v/ml-classify-text)](https://www.npmjs.com/package/ml-classify-text) [![Total Downloads](https://img.shields.io/npm/dt/ml-classify-text)](https://www.npmjs.com/package/ml-classify-text) [![License](https://img.shields.io/npm/l/ml-classify-text)](https://www.npmjs.com/package/ml-classify-text) 4 | 5 | Use machine learning to classify text using [n-grams](https://en.wikipedia.org/wiki/N-gram) and [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). 6 | 7 | Minimal library that can be used both in the **browser** and in **Node.js**, that allows you to train a model with a large amount of text samples (and corresponding labels), and then use this model to quickly predict one or more appropriate labels for new text samples. 8 | 9 | ## Installation 10 | 11 | **Using npm** 12 | 13 | ``` 14 | npm install ml-classify-text 15 | ``` 16 | 17 | **Using yarn** 18 | 19 | ``` 20 | yarn add ml-classify-text 21 | ``` 22 | 23 | ## Getting started 24 | 25 | **Import as an ES6 module** 26 | 27 | ```javascript 28 | import Classifier from 'ml-classify-text' 29 | ``` 30 | 31 | **Import as a CommonJS module** 32 | 33 | ```javascript 34 | const { Classifier } = require('ml-classify-text') 35 | ``` 36 | 37 | ## Basic usage 38 | 39 | ### Setting up a new Classifier instance 40 | 41 | ```javascript 42 | const classifier = new Classifier() 43 | ``` 44 | 45 | ### Training a model 46 | 47 | ```javascript 48 | const positive = [ 49 | 'This is great, so cool!', 50 | 'Wow, I love it!', 51 | 'It really is amazing' 52 | ] 53 | 54 | const negative = [ 55 | 'This is really bad', 56 | 'I hate it with a passion', 57 | 'Just terrible!' 
58 | ] 59 | 60 | classifier.train(positive, 'positive') 61 | classifier.train(negative, 'negative') 62 | ``` 63 | 64 | ### Getting a prediction 65 | 66 | ```javascript 67 | const predictions = classifier.predict('It sure is pretty great!') 68 | 69 | if (predictions.length) { 70 | predictions.forEach((prediction) => { 71 | console.log(`${prediction.label} (${prediction.confidence})`) 72 | }) 73 | } else { 74 | console.log('No predictions returned') 75 | } 76 | ``` 77 | 78 | Returning: 79 | 80 | ``` 81 | positive (0.5423261445466404) 82 | ``` 83 | 84 | ## Advanced usage 85 | 86 | ### Configuration 87 | 88 | The following configuration options can be passed both directly to a new [Model](docs/Model.md), or indirectly by passing them to the [Classifier](docs/Classifier.md) constructor. 89 | 90 | #### Options 91 | 92 | | Property | Type | Default | Description | 93 | | -------------- | --------------------------- | ------- | ----------------------------------------------------------------------------------------------------- | 94 | | **nGramMin** | `int` | `1` | Minimum n-gram size | 95 | | **nGramMax** | `int` | `1` | Maximum n-gram size | 96 | | **vocabulary** | `Array` \| `Set` \| `false` | `[]` | Terms mapped to indexes in the model data, set to `false` to store terms directly in the data entries | 97 | | **data** | `Object` | `{}` | Key-value store of labels and training data vectors | 98 | 99 | ### Using n-grams 100 | 101 | The default behavior is to split up texts by single words (known as a [bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model), or unigrams). 102 | 103 | This has a few limitations, since by ignoring the order of words, it's impossible to correctly match phrases and expressions. 
104 | 105 | In comes [n-grams](https://en.wikipedia.org/wiki/N-gram), which, when set to use more than one word per term, act like a sliding window that moves across the text — a continuous sequence of words of the specified amount, which can greatly improve the accuracy of predictions. 106 | 107 | #### Example of using n-grams with a size of 2 (bigrams) 108 | 109 | ```javascript 110 | const classifier = new Classifier({ 111 | nGramMin: 2, 112 | nGramMax: 2 113 | }) 114 | 115 | const tokens = classifier.tokenize('I really dont like it') 116 | 117 | console.log(tokens) 118 | ``` 119 | 120 | Returning: 121 | 122 | ```javascript 123 | { 124 | 'i really': 1, 125 | 'really dont': 1, 126 | 'dont like': 1, 127 | 'like it': 1 128 | } 129 | ``` 130 | 131 | ### Serializing a model 132 | 133 | After training a model with large sets of data, you'll want to store all this data, to allow you to simply set up a new model using this training data at another time, and quickly make predictions. 134 | 135 | To do this, simply use the `serialize` method on your [Model](docs/Model.md), and either save the data structure to a file, send it to a server, or store it in any other way you want. 
136 | 137 | ```javascript 138 | const model = classifier.model 139 | 140 | console.log(model.serialize()) 141 | ``` 142 | 143 | Returning: 144 | 145 | ``` 146 | { 147 | nGramMin: 1, 148 | nGramMax: 1, 149 | vocabulary: [ 150 | 'this', 'is', 'great', 151 | 'so', 'cool', 'wow', 152 | 'i', 'love', 'it', 153 | 'really', 'amazing', 'bad', 154 | 'hate', 'with', 'a', 155 | 'passion', 'just', 'terrible' 156 | ], 157 | data: { 158 | positive: { 159 | '0': 1, '1': 2, '2': 1, 160 | '3': 1, '4': 1, '5': 1, 161 | '6': 1, '7': 1, '8': 2, 162 | '9': 1, '10': 1 163 | }, 164 | negative: { 165 | '0': 1, '1': 1, '6': 1, 166 | '8': 1, '9': 1, '11': 1, 167 | '12': 1, '13': 1, '14': 1, 168 | '15': 1, '16': 1, '17': 1 169 | } 170 | } 171 | } 172 | ``` 173 | 174 | ## Documentation 175 | 176 | - [Classifier](docs/Classifier.md) 177 | - [Model](docs/Model.md) 178 | - [Vocabulary](docs/Vocabulary.md) 179 | - [Prediction](docs/Prediction.md) 180 | 181 | ## Contributing 182 | 183 | Read the [contribution guidelines](CONTRIBUTING.md). 184 | 185 | ## Changelog 186 | 187 | Refer to the [changelog](CHANGELOG.md) for a full history of the project. 188 | 189 | ## License 190 | 191 | ClassifyText is licensed under the [MIT license](LICENSE). 
192 | -------------------------------------------------------------------------------- /docs/Classifier.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Classifier 4 | 5 | - [Classifier](#Classifier) 6 | - [new Classifier([model])](#new_Classifier) 7 | - [.model](#Classifier+model) : Model 8 | - [.train(input, label)](#Classifier+train) ⇒ this 9 | - [.predict(input, [maxMatches], [minimumConfidence])](#Classifier+predict) ⇒ Array 10 | - [.splitWords(input)](#Classifier+splitWords) ⇒ Array 11 | - [.tokenize(input)](#Classifier+tokenize) ⇒ Object 12 | - [.vectorize(tokens)](#Classifier+vectorize) ⇒ Object 13 | - [.cosineSimilarity(v1, v2)](#Classifier+cosineSimilarity) ⇒ float 14 | 15 | 16 | 17 | ### new Classifier([model]) 18 | 19 | | Param | Type | Default | Description | 20 | | ------------------ | --------------------------- | ------- | ----------------------------------------------------------------------------------------------------- | 21 | | [model] | `Model` \| `Object` | | | 22 | | [model.nGramMin] | `int` | `1` | Minimum n-gram size | 23 | | [model.nGramMax] | `int` | `1` | Maximum n-gram size | 24 | | [model.vocabulary] | `Array` \| `Set` \| `false` | `[]` | Terms mapped to indexes in the model data, set to `false` to store terms directly in the data entries | 25 | | [model.data] | `Object` | `{}` | Key-value store of labels and training data vectors | 26 | 27 | 28 | 29 | ### classifier.model : `Model` 30 | 31 | Model instance 32 | 33 | 34 | 35 | ### classifier.train(input, label) ⇒ `this` 36 | 37 | Train the current model using an input string (or array of strings) and a corresponding label 38 | 39 | | Param | Type | Description | 40 | | ----- | ------------------- | ------------------------------ | 41 | | input | `string` \| `Array` | String, or an array of strings | 42 | | label | `string` | Corresponding label | 43 | 44 | 45 | 46 | ### classifier.predict(input, [maxMatches], [minimumConfidence]) ⇒ 
`Array` 47 | 48 | Return an array of one or more Prediction instances 49 | 50 | | Param | Type | Default | Description | 51 | | ------------------- | -------- | ------- | --------------------------------------------------- | 52 | | input | `string` | | Input string to make a prediction from | 53 | | [maxMatches] | `int` | `1` | Maximum number of predictions to return | 54 | | [minimumConfidence] | `float` | `0.2` | Minimum confidence required to include a prediction | 55 | 56 | 57 | 58 | ### classifier.splitWords(input) ⇒ `Array` 59 | 60 | Split a string into an array of lowercase words, with all non-letter characters removed 61 | 62 | | Param | Type | 63 | | ----- | -------- | 64 | | input | `string` | 65 | 66 | 67 | 68 | ### classifier.tokenize(input) ⇒ `Object` 69 | 70 | Create an object literal of unique tokens (n-grams) as keys, and their 71 | respective occurrences as values based on an input string, or array of words 72 | 73 | | Param | Type | 74 | | ----- | ------------------- | 75 | | input | `string` \| `Array` | 76 | 77 | 78 | 79 | ### classifier.vectorize(tokens) ⇒ `Object` 80 | 81 | Convert a tokenized object into a new object with all keys (terms) 82 | translated to their index in the returned vocabulary (which is also 83 | returned along with the object, with any new terms added to the end) 84 | 85 | | Param | Type | 86 | | ------ | -------- | 87 | | tokens | `Object` | 88 | 89 | 90 | 91 | ### classifier.cosineSimilarity(v1, v2) ⇒ `float` 92 | 93 | Return the cosine similarity between two vectors 94 | 95 | | Param | Type | 96 | | ----- | -------- | 97 | | v1 | `Object` | 98 | | v2 | `Object` | 99 | -------------------------------------------------------------------------------- /docs/Model.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Model 4 | 5 | - [Model](#Model) 6 | - [new Model([config])](#new_Model) 7 | - [.nGramMin](#Model+nGramMin) : `int` 8 | - [.nGramMax](#Model+nGramMax) : `int` 9 | - 
[.vocabulary](#Model+vocabulary) : `Vocabulary` \| `false` 10 | - [.data](#Model+data) : `Object` 11 | - [.serialize()](#Model+serialize) ⇒ `Object` 12 | 13 | 14 | 15 | ### new Model([config]) 16 | 17 | | Param | Type | Default | Description | 18 | | ------------------- | --------------------------- | ------- | ----------------------------------------------------------------------------------------------------------- | 19 | | [config] | `Object` | | | 20 | | [config.nGramMin] | `int` | `1` | Minimum n-gram size | 21 | | [config.nGramMax] | `int` | `1` | Maximum n-gram size | 22 | | [config.vocabulary] | `Array` \| `Set` \| `false` | `[]` | Terms mapped to indexes in the model data entries, set to false to store terms directly in the data entries | 23 | | [config.data] | `Object` | `{}` | Key-value store containing all training data | 24 | 25 | 26 | 27 | ### model.nGramMin : `int` 28 | 29 | Minimum n-gram size 30 | 31 | 32 | 33 | ### model.nGramMax : `int` 34 | 35 | Maximum n-gram size 36 | 37 | 38 | 39 | ### model.vocabulary : `Vocabulary` \| `false` 40 | 41 | Vocabulary instance 42 | 43 | 44 | 45 | ### model.data : `Object` 46 | 47 | Model data 48 | 49 | 50 | 51 | ### model.serialize() ⇒ `Object` 52 | 53 | Return the model in its current state as an object literal, including the 54 | configured n-gram min/max values, the vocabulary as an array (if any, 55 | otherwise false), and an object literal with all the training data 56 | -------------------------------------------------------------------------------- /docs/Prediction.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Prediction 4 | 5 | - [Prediction](#Prediction) 6 | - [.label](#Prediction+label) : `string` 7 | - [.confidence](#Prediction+confidence) : `number` 8 | 9 | 10 | 11 | ### prediction.label : `string` 12 | 13 | Label of the prediction 14 | 15 | 16 | 17 | ### prediction.confidence : `number` 18 | 19 | Confidence of the prediction 20 | 
-------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | Full documentation of all the available classes, properties and methods. 4 | 5 | - [Classifier](Classifier.md) 6 | - [Model](Model.md) 7 | - [Vocabulary](Vocabulary.md) 8 | - [Prediction](Prediction.md) 9 | -------------------------------------------------------------------------------- /docs/Vocabulary.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Vocabulary 4 | 5 | - [Vocabulary](#Vocabulary) 6 | - [new Vocabulary(terms)](#new_Vocabulary) 7 | - [.size](#Vocabulary+size) : `number` 8 | - [.terms](#Vocabulary+terms) : `Array` \| `Set` 9 | - [.add(terms)](#Vocabulary+add) ⇒ `this` 10 | - [.remove(terms)](#Vocabulary+remove) ⇒ `this` 11 | - [.has(term)](#Vocabulary+has) ⇒ `bool` 12 | - [.indexOf(term)](#Vocabulary+indexOf) ⇒ `number` 13 | 14 | 15 | 16 | ### new Vocabulary(terms) 17 | 18 | | Param | Type | 19 | | ----- | ---------------- | 20 | | terms | `Array` \| `Set` | 21 | 22 | 23 | 24 | ### vocabulary.size : `number` 25 | 26 | Vocabulary size 27 | 28 | 29 | 30 | ### vocabulary.terms : `Array` \| `Set` 31 | 32 | Vocabulary terms 33 | 34 | 35 | 36 | ### vocabulary.add(terms) ⇒ `this` 37 | 38 | Add one or more terms to the vocabulary 39 | 40 | | Param | Type | 41 | | ----- | ---------------------------- | 42 | | terms | `string` \| `Array` \| `Set` | 43 | 44 | 45 | 46 | ### vocabulary.remove(terms) ⇒ `this` 47 | 48 | Remove one or more terms from the vocabulary 49 | 50 | | Param | Type | 51 | | ----- | ---------------------------- | 52 | | terms | `string` \| `Array` \| `Set` | 53 | 54 | 55 | 56 | ### vocabulary.has(term) ⇒ `bool` 57 | 58 | Return whether the vocabulary contains a certain term 59 | 60 | | Param | Type | 61 | | ----- | -------- | 62 | | term | `string` | 63 | 64 | 65 | 66 | ### 
vocabulary.indexOf(term) ⇒ `number` 67 | 68 | Return the index of a term in the vocabulary (returns -1 if not found) 69 | 70 | | Param | Type | 71 | | ----- | -------- | 72 | | term | `string` | 73 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ml-classify-text", 3 | "version": "2.0.1", 4 | "description": "Text classification using n-grams and cosine similarity", 5 | "module": "./lib", 6 | "main": "./lib", 7 | "scripts": { 8 | "clean": "rimraf lib", 9 | "test": "jest --coverage", 10 | "test:watch": "jest --watchAll", 11 | "test:prod": "cross-env BABEL_ENV=production npm run test", 12 | "lint": "eslint src test", 13 | "build": "webpack --mode=production --config=webpack.config.js", 14 | "prepublish": "npm run clean && npm run lint && npm run test && npm run build" 15 | }, 16 | "files": [ 17 | "lib" 18 | ], 19 | "repository": { 20 | "type": "git", 21 | "url": "git+https://github.com/andreekeberg/ml-classify-text-js.git" 22 | }, 23 | "keywords": [ 24 | "text classification", 25 | "classification", 26 | "classify", 27 | "classifier", 28 | "machine learning", 29 | "machine", 30 | "learning", 31 | "ai", 32 | "artificial intelligence", 33 | "artificial", 34 | "intelligence", 35 | "n-gram", 36 | "n-grams", 37 | "cosine similarity", 38 | "cosine", 39 | "similarity", 40 | "confidence", 41 | "predict", 42 | "prediction", 43 | "model", 44 | "train" 45 | ], 46 | "author": "André Ekeberg (https://andreekeberg.se/en/)", 47 | "license": "MIT", 48 | "bugs": { 49 | "url": "https://github.com/andreekeberg/ml-classify-text-js/issues" 50 | }, 51 | "homepage": "https://github.com/andreekeberg/ml-classify-text-js", 52 | "devDependencies": { 53 | "@babel/core": "^7.20.12", 54 | "@babel/plugin-transform-modules-amd": "^7.20.11", 55 | "@babel/plugin-transform-modules-commonjs": "^7.20.11", 56 | "@babel/plugin-transform-runtime": "^7.19.6", 57 
| "@babel/polyfill": "^7.12.1", 58 | "@babel/preset-env": "^7.20.2", 59 | "@babel/register": "^7.18.9", 60 | "@babel/runtime": "^7.20.13", 61 | "@babel/runtime-corejs3": "^7.20.13", 62 | "babel-cli": "^6.26.0", 63 | "babel-eslint": "^10.1.0", 64 | "babel-loader": "^9.1.2", 65 | "babel-plugin-add-module-exports": "^1.0.4", 66 | "babel-polyfill": "^6.26.0", 67 | "babel-preset-env": "^1.7.0", 68 | "babel-preset-minify": "^0.5.2", 69 | "babel-runtime": "^6.26.0", 70 | "core-js": "^3.27.2", 71 | "cross-env": "^7.0.3", 72 | "eslint": "^8.33.0", 73 | "eslint-config-standard": "^17.0.0", 74 | "eslint-plugin-node": "^11.1.0", 75 | "jest": "^29.4.1", 76 | "jsdoc": "^4.0.0", 77 | "jsdoc-to-markdown": "^8.0.0", 78 | "rimraf": "^4.1.2", 79 | "webpack": "^5.75.0", 80 | "webpack-cli": "^5.0.1" 81 | }, 82 | "dependencies": { 83 | "xregexp": "^5.1.1" 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/Classifier.js: -------------------------------------------------------------------------------- 1 | import XRegExp from 'xregexp' 2 | import Model from './Model' 3 | import Prediction from './Prediction' 4 | import Vocabulary from './Vocabulary' 5 | 6 | /** 7 | * @param {(Model|Object)} [model] 8 | * @param {int} [model.nGramMin=1] - Minimum n-gram size 9 | * @param {int} [model.nGramMax=1] - Maximum n-gram size 10 | * @param {(Array|Set|false)} [model.vocabulary=[]] - Terms mapped to indexes in the model data entries, set to false to store terms directly in the data entries 11 | * @param {Object} [model.data={}] - Key-value store containing all training data 12 | * @constructor 13 | */ 14 | class Classifier { 15 | constructor(model = {}) { 16 | if (!(model instanceof Model)) { 17 | model = new Model(model) 18 | } 19 | 20 | this._model = model 21 | } 22 | 23 | /** 24 | * Model instance 25 | * 26 | * @type {Model} 27 | */ 28 | get model() { 29 | return this._model 30 | } 31 | 32 | set model(model) { 33 | if (!(model instanceof 
Model)) { 34 | model = new Model(model) 35 | } 36 | 37 | this._model = model 38 | } 39 | 40 | /** 41 | * Train the current model using an input string (or array of strings) and a corresponding label 42 | * 43 | * @param {(string|string[])} input - String, or an array of strings 44 | * @param {string} label - Corresponding label 45 | * @return {this} 46 | */ 47 | train(input, label) { 48 | if (typeof input !== 'string' && !(input instanceof Array)) { 49 | throw new Error('input must be either a string or Array') 50 | } 51 | 52 | if (typeof label !== 'string') { 53 | throw new Error('label must be a string') 54 | } 55 | 56 | // If input isn't an array, convert to a single item array 57 | if (!(input instanceof Array)) { 58 | input = [input] 59 | } 60 | 61 | input.forEach((string) => { 62 | // Convert the string to a tokenized object 63 | let tokens = this.tokenize(string) 64 | 65 | if (this._model.vocabulary !== false) { 66 | // If we're using a vocabulary, convert the tokens to a vector where all 67 | // indexes reference vocabulary terms 68 | const { vector, vocabulary } = this.vectorize(tokens) 69 | 70 | // Overwrite the tokens object with our new vectorized object 71 | tokens = vector 72 | 73 | // Update the model vocabulary 74 | this._model.vocabulary = vocabulary 75 | } 76 | 77 | // Set up an empty entry for the label if it does not exist 78 | if ( 79 | !Object.prototype.hasOwnProperty.call(this._model.data, label) 80 | ) { 81 | this._model.data[label] = {} 82 | } 83 | 84 | // Add all occurrences to our model entry 85 | Object.keys(tokens).forEach((index) => { 86 | let occurrences = tokens[index] 87 | 88 | if ( 89 | !Object.prototype.hasOwnProperty.call( 90 | this._model.data[label], 91 | index 92 | ) 93 | ) { 94 | this._model.data[label][index] = 0 95 | } 96 | 97 | this._model.data[label][index] += occurrences 98 | }) 99 | }) 100 | 101 | return this 102 | } 103 | 104 | /** 105 | * Return an array of one or more Prediction instances 106 | * 107 | * @param 
{string} input - Input string to make a prediction from 108 | * @param {int} [maxMatches=1] Maximum number of predictions to return 109 | * @param {float} [minimumConfidence=0.2] Minimum confidence required to include a prediction 110 | * @return {Array} 111 | */ 112 | predict(input, maxMatches = 1, minimumConfidence = 0.2) { 113 | if (typeof input !== 'string') { 114 | throw new Error('input must be a string') 115 | } 116 | 117 | if (!['number', 'undefined'].includes(typeof maxMatches)) { 118 | throw new Error('maxMatches must be either a number or undefined') 119 | } 120 | 121 | if (!['number', 'undefined'].includes(typeof minimumConfidence)) { 122 | throw new Error( 123 | 'minimumConfidence must be either a number or undefined' 124 | ) 125 | } 126 | 127 | if (minimumConfidence < 0) { 128 | throw new Error('minimumConfidence can not be lower than 0') 129 | } 130 | 131 | if (minimumConfidence > 1) { 132 | throw new Error('minimumConfidence can not be higher than 1') 133 | } 134 | 135 | // Convert the string to a tokenized object 136 | let tokens = this.tokenize(input) 137 | 138 | if (this.vocabulary !== false) { 139 | // If we're using a vocabulary, convert the tokens to a vector where all 140 | // indexes reference vocabulary terms 141 | const { vector } = this.vectorize(tokens) 142 | 143 | // Overwrite the tokens object with our new vectorized object 144 | tokens = vector 145 | } 146 | 147 | const predictions = [] 148 | 149 | Object.keys(this._model.data).forEach((label) => { 150 | let entry = this._model.data[label] 151 | 152 | let confidence = this.cosineSimilarity(tokens, entry) 153 | 154 | if (confidence >= minimumConfidence) { 155 | predictions.push( 156 | new Prediction({ 157 | label, 158 | confidence 159 | }) 160 | ) 161 | } 162 | }) 163 | 164 | /* istanbul ignore next */ 165 | predictions.sort((a, b) => { 166 | if (a.confidence === b.confidence) { 167 | return 0 168 | } 169 | 170 | return a.confidence > b.confidence ? 
-1 : 1 171 | }) 172 | 173 | return predictions.slice(0, Math.min(predictions.length, maxMatches)) 174 | } 175 | 176 | /** 177 | * Split a string into an array of lowercase words, with all non-letter characters removed 178 | * 179 | * @param {string} input 180 | * @return {Array} 181 | */ 182 | splitWords(input) { 183 | if (typeof input !== 'string') { 184 | throw new Error('input must be a string') 185 | } 186 | 187 | // Remove all apostrophes and dashes to keep words intact 188 | input = input.replace(/'|´|’|-/g, '') 189 | 190 | // Lowercase all letters and replace all non-letter characters with a space 191 | input = XRegExp.replace( 192 | input.toLocaleLowerCase(), 193 | XRegExp('\\P{L}+', 'g'), 194 | ' ' 195 | ).trim() 196 | 197 | return input.split(' ') 198 | } 199 | 200 | /** 201 | * Create an object literal of unique tokens (n-grams) as keys, and their 202 | * respective occurrences as values based on an input string, or array of words 203 | * 204 | * @param {(string|string[])} input 205 | * @return {Object} 206 | */ 207 | tokenize(input) { 208 | let words = typeof input === 'string' ? this.splitWords(input) : input 209 | 210 | if (!(words instanceof Array)) { 211 | throw new Error('input must be either a string or Array') 212 | } 213 | 214 | if (this._model.nGramMax < this._model.nGramMin) { 215 | throw new Error( 216 | 'Invalid nGramMin/nGramMax combination in model config' 217 | ) 218 | } 219 | 220 | let tokens = {} 221 | 222 | // Generate a list of n-grams along with their respective occurrences 223 | // based on the models configured min/max values 224 | words.forEach((word, index) => { 225 | let sequence = '' 226 | 227 | words.slice(index).forEach((nextWord) => { 228 | sequence += sequence ? 
' ' + nextWord : nextWord 229 | let tokenCount = sequence.split(' ').length 230 | 231 | if ( 232 | tokenCount < this._model.nGramMin || 233 | tokenCount > this._model.nGramMax 234 | ) { 235 | return 236 | } 237 | 238 | if (!Object.prototype.hasOwnProperty.call(tokens, sequence)) { 239 | tokens[sequence] = 0 240 | } 241 | 242 | ++tokens[sequence] 243 | }) 244 | }) 245 | 246 | return tokens 247 | } 248 | 249 | /** 250 | * Convert a tokenized object into a new object with all keys (terms) 251 | * translated to their index in the returned vocabulary (which is also 252 | * returned along with the object, with any new terms added to the end) 253 | * 254 | * @param {Object} tokens 255 | * @return {Object} 256 | */ 257 | vectorize(tokens) { 258 | if (Object.getPrototypeOf(tokens) !== Object.prototype) { 259 | throw new Error('tokens must be an object literal') 260 | } 261 | 262 | /* istanbul ignore next */ 263 | if (this._model.vocabulary === false) { 264 | throw new Error('Cannot vectorize tokens when vocabulary is false') 265 | } 266 | 267 | const vector = {} 268 | const vocabulary = new Vocabulary(this._model.vocabulary.terms) 269 | 270 | Object.keys(tokens).forEach((token) => { 271 | let vocabularyIndex = vocabulary.indexOf(token) 272 | 273 | if (vocabularyIndex === -1) { 274 | vocabulary.add(token) 275 | 276 | vocabularyIndex = vocabulary.size - 1 277 | } 278 | 279 | vector[vocabularyIndex] = tokens[token] 280 | }) 281 | 282 | return { 283 | vector, 284 | vocabulary 285 | } 286 | } 287 | 288 | /** 289 | * Return the cosine similarity between two vectors 290 | * 291 | * @param {Object} v1 292 | * @param {Object} v2 293 | * @return {float} 294 | */ 295 | cosineSimilarity(v1, v2) { 296 | if (Object.getPrototypeOf(v1) !== Object.prototype) { 297 | throw new Error('v1 must be an object literal') 298 | } 299 | if (Object.getPrototypeOf(v2) !== Object.prototype) { 300 | throw new Error('v2 must be an object literal') 301 | } 302 | 303 | let prod = 0.0 304 | let v1Norm = 0.0 
/**
 * Holds the configuration and training data of a classifier: the n-gram size
 * range, the vocabulary (or false when terms are stored directly in the data
 * entries) and the key-value store with all training data
 *
 * @param {Object} [config]
 * @param {int} [config.nGramMin=1] - Minimum n-gram size
 * @param {int} [config.nGramMax=1] - Maximum n-gram size
 * @param {(Array|Set|false)} [config.vocabulary=[]] - Terms mapped to indexes in the model data entries, set to false to store terms directly in the data entries
 * @param {Object} [config.data={}] - Key-value store containing all training data
 * @constructor
 */
class Model {
	constructor(config = {}) {
		if (Object.getPrototypeOf(config) !== Object.prototype) {
			throw new Error('config must be an object literal')
		}

		// Defaults first, so any option passed by the caller overrides them
		config = {
			nGramMin: 1,
			nGramMax: 1,
			vocabulary: [],
			data: {},
			...config
		}

		if (!Number.isInteger(config.nGramMin)) {
			throw new Error('Config value nGramMin must be an integer')
		}

		if (!Number.isInteger(config.nGramMax)) {
			throw new Error('Config value nGramMax must be an integer')
		}

		if (config.nGramMin < 1) {
			throw new Error('Config value nGramMin must be at least 1')
		}

		if (config.nGramMax < 1) {
			throw new Error('Config value nGramMax must be at least 1')
		}

		if (config.nGramMax < config.nGramMin) {
			throw new Error('Invalid nGramMin/nGramMax combination in config')
		}

		// false disables the vocabulary; anything else that is not already a
		// Vocabulary instance is handed to the Vocabulary constructor
		if (
			config.vocabulary !== false &&
			!(config.vocabulary instanceof Vocabulary)
		) {
			config.vocabulary = new Vocabulary(config.vocabulary)
		}

		if (Object.getPrototypeOf(config.data) !== Object.prototype) {
			throw new Error('Config value data must be an object literal')
		}

		this._nGramMin = config.nGramMin
		this._nGramMax = config.nGramMax
		this._vocabulary = config.vocabulary
		// Shallow copy: the top-level store is detached from the caller's object
		this._data = { ...config.data }
	}

	/**
	 * Minimum n-gram size
	 *
	 * @type {int}
	 */
	get nGramMin() {
		return this._nGramMin
	}

	set nGramMin(size) {
		if (!Number.isInteger(size)) {
			throw new Error('nGramMin must be an integer')
		}

		this._nGramMin = size
	}

	/**
	 * Maximum n-gram size
	 *
	 * @type {int}
	 */
	get nGramMax() {
		return this._nGramMax
	}

	set nGramMax(size) {
		if (!Number.isInteger(size)) {
			throw new Error('nGramMax must be an integer')
		}

		this._nGramMax = size
	}

	/**
	 * Vocabulary instance
	 *
	 * @type {(Vocabulary|false)}
	 */
	get vocabulary() {
		return this._vocabulary
	}

	set vocabulary(vocabulary) {
		if (vocabulary !== false && !(vocabulary instanceof Vocabulary)) {
			vocabulary = new Vocabulary(vocabulary)
		}

		this._vocabulary = vocabulary
	}

	/**
	 * Model data
	 *
	 * @type {Object}
	 */
	get data() {
		return this._data
	}

	set data(data) {
		if (!(data instanceof Object) || data.constructor !== Object) {
			throw new Error('data must be an object literal')
		}

		this._data = { ...data }
	}

	/**
	 * Return the model in its current state as an object literal, including the
	 * configured n-gram min/max values, the vocabulary as an array (if any,
	 * otherwise false), and an object literal with all the training data
	 *
	 * @return {Object}
	 */
	serialize() {
		return {
			nGramMin: this._nGramMin,
			nGramMax: this._nGramMax,
			// A disabled vocabulary has no terms to list, so false is kept as-is
			vocabulary:
				this._vocabulary === false
					? false
					: Array.from(this._vocabulary.terms),
			data: this._data
		}
	}
}
/**
 * Return whether a value is a collection accepted as vocabulary terms
 *
 * Array.isArray is used instead of `instanceof Array` so that arrays created
 * in another realm (for example a vm context or an iframe) are accepted too.
 *
 * @param {*} value
 * @return {bool}
 */
function isTermCollection(value) {
	return Array.isArray(value) || value instanceof Set
}

/**
 * Turn the argument of add()/remove() into an array of terms
 *
 * @param {(string|Array|Set)} terms
 * @return {Array}
 * @throws {Error} If terms is neither a string, an Array nor a Set
 */
function toTermList(terms) {
	if (typeof terms === 'string') {
		return [terms]
	}

	if (!isTermCollection(terms)) {
		throw new Error('terms must be either a string, Array or Set')
	}

	// A Set is copied first, so the terms being walked are a fixed snapshot
	// even when the vocabulary's own Set is passed in
	return terms instanceof Set ? Array.from(terms) : terms
}

/**
 * Ordered collection of unique terms, backed by a Set
 *
 * @param {Array|Set} terms
 * @constructor
 */
class Vocabulary {
	constructor(terms = []) {
		if (!isTermCollection(terms)) {
			throw new Error('terms must be either an Array or a Set')
		}

		this._terms = new Set(terms)
	}

	/**
	 * Vocabulary size
	 *
	 * @type {number}
	 */
	get size() {
		return this._terms.size
	}

	/**
	 * Vocabulary terms (an Array or a Set can be assigned, a Set is returned)
	 *
	 * @type {(Array|Set)}
	 */
	get terms() {
		return this._terms
	}

	set terms(terms) {
		if (!isTermCollection(terms)) {
			throw new Error('terms must be either an Array or a Set')
		}

		this._terms = new Set(terms)
	}

	/**
	 * Add one or more terms to the vocabulary
	 *
	 * @param {(string|Array|Set)} terms
	 * @return {this}
	 */
	add(terms) {
		toTermList(terms).forEach((term) => {
			this._terms.add(term)
		})

		return this
	}

	/**
	 * Remove one or more terms from the vocabulary
	 *
	 * @param {(string|Array|Set)} terms
	 * @return {this}
	 */
	remove(terms) {
		toTermList(terms).forEach((term) => {
			this._terms.delete(term)
		})

		return this
	}

	/**
	 * Return whether the vocabulary contains a certain term
	 *
	 * @param {string} term
	 * @return {bool}
	 */
	has(term) {
		return this._terms.has(term)
	}

	/**
	 * Return the index of a term in the vocabulary (returns -1 if not found)
	 *
	 * The index is the position of the term in the insertion order of the
	 * underlying Set.
	 *
	 * @param {string} term
	 * @return {number}
	 */
	indexOf(term) {
		if (!this._terms.has(term)) {
			return -1
		}

		// Walk the Set directly instead of copying every term into an array
		let index = 0

		for (const candidate of this._terms) {
			if (candidate === term) {
				return index
			}

			++index
		}

		return -1
	}
}
expect(classifier.model).toBeInstanceOf(Model) 30 | }) 31 | 32 | test('should set the current model when passed a model instance', () => { 33 | let classifier = new Classifier() 34 | 35 | classifier.model = new Model({ 36 | nGramMax: 3 37 | }) 38 | 39 | expect(classifier.model.nGramMax).toStrictEqual(3) 40 | }) 41 | 42 | test('should set the current model to a new model instance when passed an object literal', () => { 43 | let classifier = new Classifier() 44 | 45 | classifier.model = {} 46 | 47 | expect(classifier.model).toBeInstanceOf(Model) 48 | }) 49 | }) 50 | 51 | describe('splitWords', () => { 52 | test('should throw an error if input is not a string', () => { 53 | const classifier = new Classifier() 54 | 55 | expect(() => classifier.splitWords(1)).toThrow(Error) 56 | }) 57 | 58 | test('should split a string into an array of words', () => { 59 | const classifier = new Classifier() 60 | 61 | expect(classifier.splitWords('Hello world!')).toStrictEqual([ 62 | 'hello', 63 | 'world' 64 | ]) 65 | }) 66 | }) 67 | 68 | describe('tokenize', () => { 69 | test('should throw an error if input is neither a string or array', () => { 70 | const classifier = new Classifier() 71 | 72 | expect(() => classifier.tokenize({})).toThrow(Error) 73 | }) 74 | 75 | test('should throw an error if nGramMax is less than nGramMin in model config', () => { 76 | const classifier = new Classifier() 77 | 78 | classifier.model.nGramMin = 2 79 | 80 | expect(() => classifier.tokenize('Hello world!')).toThrow(Error) 81 | }) 82 | 83 | test('should return an object literal of tokens and their occurrences from a string', () => { 84 | const classifier = new Classifier() 85 | 86 | expect(classifier.tokenize('Hello world!')).toStrictEqual({ 87 | hello: 1, 88 | world: 1 89 | }) 90 | }) 91 | 92 | test('should return an object literal of tokens and their occurrences from a string', () => { 93 | const classifier = new Classifier() 94 | 95 | expect(classifier.tokenize('Hello world!')).toStrictEqual({ 96 | 
hello: 1, 97 | world: 1 98 | }) 99 | }) 100 | 101 | test('should return an object literal of tokens and their occurrences from a array', () => { 102 | const classifier = new Classifier() 103 | 104 | expect(classifier.tokenize(['hello', 'world'])).toStrictEqual({ 105 | hello: 1, 106 | world: 1 107 | }) 108 | }) 109 | 110 | test('should return an object literal of bigrams when nGramMin/nGramMax is 2', () => { 111 | const classifier = new Classifier({ 112 | nGramMin: 2, 113 | nGramMax: 2 114 | }) 115 | 116 | expect(classifier.tokenize('Hello world!')).toStrictEqual({ 117 | 'hello world': 1 118 | }) 119 | }) 120 | 121 | test('should return an object literal of unigrams and bigrams when nGramMin/nGramMax is 1/2', () => { 122 | const classifier = new Classifier({ 123 | nGramMin: 1, 124 | nGramMax: 2 125 | }) 126 | 127 | expect(classifier.tokenize('Hello world!')).toStrictEqual({ 128 | hello: 1, 129 | 'hello world': 1, 130 | world: 1 131 | }) 132 | }) 133 | 134 | test('should increment the occurrence of the duplicate tokens', () => { 135 | const classifier = new Classifier() 136 | 137 | expect(classifier.tokenize('Hello hello!')).toStrictEqual({ 138 | hello: 2 139 | }) 140 | }) 141 | }) 142 | 143 | describe('vectorize', () => { 144 | test('should throw an error if input is not an object literal', () => { 145 | const classifier = new Classifier() 146 | 147 | expect(() => classifier.vectorize([])).toThrow(Error) 148 | }) 149 | 150 | test('should throw an error if vocabulary config option is set to false', () => { 151 | const classifier = new Classifier({ 152 | vocabulary: false 153 | }) 154 | 155 | expect(() => classifier.vectorize({ hello: 1 })).toThrow(Error) 156 | }) 157 | 158 | test('should convert key to its corresponding vocabulary term index', () => { 159 | const classifier = new Classifier() 160 | const tokens = classifier.tokenize('Hello') 161 | 162 | const { vector } = classifier.vectorize(tokens) 163 | 164 | expect(vector).toStrictEqual({ 0: 1 }) 165 | }) 166 | 
167 | test('should use existing term index when token is already in vocabulary', () => { 168 | const classifier = new Classifier({ 169 | vocabulary: ['hello', 'world'] 170 | }) 171 | 172 | const tokens = classifier.tokenize('world') 173 | 174 | const { vector } = classifier.vectorize(tokens) 175 | 176 | expect(vector).toStrictEqual({ 1: 1 }) 177 | }) 178 | 179 | test('should return an updated copy of the vocabulary', () => { 180 | const classifier = new Classifier() 181 | 182 | const tokens = classifier.tokenize('Hello world') 183 | 184 | const { vocabulary } = classifier.vectorize(tokens) 185 | 186 | const terms = vocabulary.terms 187 | 188 | expect(Array.from(terms)).toStrictEqual(['hello', 'world']) 189 | }) 190 | }) 191 | 192 | describe('train', () => { 193 | test('should throw an error if input is not a string or array', () => { 194 | const classifier = new Classifier() 195 | 196 | expect(() => classifier.train({}, 'test')).toThrow(Error) 197 | }) 198 | 199 | test('should throw an error if label is not a string', () => { 200 | const classifier = new Classifier() 201 | 202 | expect(() => classifier.train('test', [])).toThrow(Error) 203 | }) 204 | 205 | test('should add tokens to the vocabulary (if not configured to false)', () => { 206 | const classifier = new Classifier() 207 | 208 | classifier.train('hello world', 'test') 209 | 210 | const vocabulary = classifier.model.vocabulary 211 | 212 | expect(vocabulary.size).toStrictEqual(2) 213 | }) 214 | 215 | test('should add tokens (and their occurrences) to the model from a string', () => { 216 | const classifier = new Classifier() 217 | 218 | classifier.train('hello world', 'test') 219 | 220 | const model = classifier.model 221 | 222 | expect(model.data).toStrictEqual({ 223 | test: { 0: 1, 1: 1 } 224 | }) 225 | }) 226 | 227 | test('should add tokens (and their occurrences) to the model from an array of strings', () => { 228 | const classifier = new Classifier() 229 | 230 | classifier.train(['hello world', 'foo', 
'bar'], 'test') 231 | 232 | const model = classifier.model 233 | 234 | expect(model.data).toStrictEqual({ 235 | test: { 0: 1, 1: 1, 2: 1, 3: 1 } 236 | }) 237 | }) 238 | 239 | test('should increment the occurrence of an existing vocabulary term', () => { 240 | const classifier = new Classifier() 241 | 242 | classifier.train(['hello world', 'foo', 'hello'], 'test') 243 | 244 | const model = classifier.model 245 | 246 | expect(model.data).toStrictEqual({ 247 | test: { 0: 2, 1: 1, 2: 1 } 248 | }) 249 | }) 250 | 251 | test('should return classifier instance', () => { 252 | const classifier = new Classifier() 253 | 254 | expect(classifier.train('hello world', 'test')).toStrictEqual( 255 | classifier 256 | ) 257 | }) 258 | }) 259 | 260 | describe('cosineSimilarity', () => { 261 | test('should throw an error if v1 is not an object literal', () => { 262 | const classifier = new Classifier() 263 | 264 | expect(() => classifier.cosineSimilarity(false, {})).toThrow(Error) 265 | }) 266 | 267 | test('should throw an error if v2 is not an object literal', () => { 268 | const classifier = new Classifier() 269 | 270 | expect(() => classifier.cosineSimilarity({}, false)).toThrow(Error) 271 | }) 272 | 273 | test('should return 1 on identical object literals', () => { 274 | const classifier = new Classifier() 275 | 276 | expect( 277 | classifier.cosineSimilarity( 278 | { 279 | 0: 1 280 | }, 281 | { 282 | 0: 1 283 | } 284 | ) 285 | ).toStrictEqual(1) 286 | }) 287 | 288 | test('should return 0 on object literals with no similarity', () => { 289 | const classifier = new Classifier() 290 | 291 | expect( 292 | classifier.cosineSimilarity( 293 | { 294 | 0: 1 295 | }, 296 | { 297 | 1: 1 298 | } 299 | ) 300 | ).toStrictEqual(0) 301 | }) 302 | 303 | test('should return > 0 on similar object literals', () => { 304 | const classifier = new Classifier() 305 | 306 | expect( 307 | classifier.cosineSimilarity( 308 | { 309 | 0: 1, 310 | 1: 1 311 | }, 312 | { 313 | 0: 1, 314 | 2: 1 315 | } 316 | ) 317 
| ).toBeGreaterThan(0) 318 | }) 319 | 320 | test('should return 0 when sum of v1 is 0', () => { 321 | const classifier = new Classifier() 322 | 323 | expect( 324 | classifier.cosineSimilarity( 325 | { 326 | 0: 0 327 | }, 328 | { 329 | 0: 1 330 | } 331 | ) 332 | ).toStrictEqual(0) 333 | }) 334 | 335 | test('should return 0 when sum of v2 is 0', () => { 336 | const classifier = new Classifier() 337 | 338 | expect( 339 | classifier.cosineSimilarity( 340 | { 341 | 0: 1 342 | }, 343 | { 344 | 0: 0 345 | } 346 | ) 347 | ).toStrictEqual(0) 348 | }) 349 | }) 350 | 351 | describe('predict', () => { 352 | test('should throw an error if input is not a string', () => { 353 | const classifier = new Classifier() 354 | 355 | expect(() => classifier.predict([])).toThrow(Error) 356 | }) 357 | 358 | test('should throw an error if maxMatches is not a number', () => { 359 | const classifier = new Classifier() 360 | 361 | expect(() => classifier.predict('', 'test')).toThrow(Error) 362 | }) 363 | 364 | test('should throw an error if minimumConfidence is not a number', () => { 365 | const classifier = new Classifier() 366 | 367 | expect(() => classifier.predict('', undefined, 'test')).toThrow( 368 | Error 369 | ) 370 | }) 371 | 372 | test('should throw an error if minimumConfidence is lower than 0', () => { 373 | const classifier = new Classifier() 374 | 375 | expect(() => classifier.predict('', undefined, -1)).toThrow(Error) 376 | }) 377 | 378 | test('should throw an error if minimumConfidence is higher than 1', () => { 379 | const classifier = new Classifier() 380 | 381 | expect(() => classifier.predict('', undefined, 2)).toThrow(Error) 382 | }) 383 | 384 | test('should return an array', () => { 385 | const classifier = new Classifier() 386 | 387 | expect(classifier.predict('test')).toBeInstanceOf(Array) 388 | }) 389 | 390 | test('should return one prediction when trained with a sample', () => { 391 | const classifier = new Classifier() 392 | 393 | classifier.train('hello world', 
import Model from '../src/Model'
import Vocabulary from '../src/Vocabulary'

describe('Model', () => {
	describe('constructor', () => {
		// Every entry pairs a test title with a config the constructor must reject
		const rejectedConfigs = [
			['should throw an error if config is not an object literal', []],
			[
				'should throw an error if config option nGramMin is not a number',
				{ nGramMin: '' }
			],
			[
				'should throw an error if config option nGramMax is not a number',
				{ nGramMax: '' }
			],
			[
				'should throw an error if config option nGramMin is less than 1',
				{ nGramMin: 0 }
			],
			[
				'should throw an error if config option nGramMax is less than 1',
				{ nGramMax: 0 }
			],
			[
				'should throw an error if config option nGramMax is less than nGramMin',
				{ nGramMin: 2, nGramMax: 1 }
			],
			[
				'should throw an error if data is not an object literal',
				{ data: [] }
			]
		]

		rejectedConfigs.forEach(([title, config]) => {
			test(title, () => {
				const construct = () => new Model(config)

				expect(construct).toThrow(Error)
			})
		})
	})

	describe('nGramMin', () => {
		test('should return a number', () => {
			const { nGramMin } = new Model()

			expect(typeof nGramMin).toStrictEqual('number')
		})

		test('should return the current nGramMin value', () => {
			const subject = new Model({ nGramMin: 3, nGramMax: 4 })

			expect(subject.nGramMin).toStrictEqual(3)
		})

		test('should set the nGramMin value', () => {
			const subject = new Model()

			subject.nGramMin = 2

			expect(subject.nGramMin).toStrictEqual(2)
		})

		test('should throw an error if size is not an integer', () => {
			const subject = new Model()
			const assignFraction = () => {
				subject.nGramMin = 1.1
			}

			expect(assignFraction).toThrow(Error)
		})
	})

	describe('nGramMax', () => {
		test('should return a number', () => {
			const { nGramMax } = new Model()

			expect(typeof nGramMax).toStrictEqual('number')
		})

		test('should return the current nGramMax value', () => {
			const subject = new Model({ nGramMax: 2 })

			expect(subject.nGramMax).toStrictEqual(2)
		})

		test('should set the nGramMax value', () => {
			const subject = new Model()

			subject.nGramMax = 3

			expect(subject.nGramMax).toStrictEqual(3)
		})

		test('should throw an error if size is not an integer', () => {
			const subject = new Model()
			const assignFraction = () => {
				subject.nGramMax = 1.1
			}

			expect(assignFraction).toThrow(Error)
		})
	})

	describe('vocabulary', () => {
		test('should return a vocabulary instance', () => {
			const subject = new Model()

			expect(subject.vocabulary).toBeInstanceOf(Vocabulary)
		})

		test('should return false when vocabulary is configured to false', () => {
			const subject = new Model({ vocabulary: false })

			expect(subject.vocabulary).toStrictEqual(false)
		})

		test('should set the vocabulary value when passing an array', () => {
			const subject = new Model()
			const expectedTerms = ['hello', 'world']

			subject.vocabulary = ['hello', 'world']

			expect(Array.from(subject.vocabulary.terms)).toStrictEqual(
				expectedTerms
			)
		})

		test('should set the vocabulary value when passing false', () => {
			const subject = new Model()

			subject.vocabulary = false

			expect(subject.vocabulary).toStrictEqual(false)
		})
	})

	describe('data', () => {
		test('should return an object literal', () => {
			const subject = new Model()

			expect(subject.data).toStrictEqual({})
		})

		test('should set the model data', () => {
			const subject = new Model()

			subject.data = { test: { 0: 1 } }

			expect(subject.data).toStrictEqual({ test: { 0: 1 } })
		})

		test('should throw an error if data is not an object literal', () => {
			const subject = new Model()
			const assignArray = () => {
				subject.data = []
			}

			expect(assignArray).toThrow(Error)
		})
	})

	describe('serialize', () => {
		test('should return an object literal created from the current model', () => {
			const serialized = new Model().serialize()

			expect(serialized).toStrictEqual({
				nGramMin: 1,
				nGramMax: 1,
				vocabulary: [],
				data: {}
			})
		})
	})
})
import Prediction from '../src/Prediction'

describe('Prediction', () => {
	describe('constructor', () => {
		test('should throw an error if prediction is not an object literal', () => {
			const construct = () => new Prediction([])

			expect(construct).toThrow(Error)
		})
	})

	describe('label', () => {
		test('should throw an error if label is not a string', () => {
			const subject = new Prediction()
			const assignArray = () => {
				subject.label = []
			}

			expect(assignArray).toThrow(Error)
		})

		test('should return a string', () => {
			const { label } = new Prediction()

			expect(typeof label).toStrictEqual('string')
		})

		test('should return the defined prediction label', () => {
			const subject = new Prediction({ label: 'test' })

			expect(subject.label).toStrictEqual('test')
		})

		test('should set the prediction label', () => {
			const subject = new Prediction()

			subject.label = 'test'

			expect(subject.label).toStrictEqual('test')
		})
	})

	describe('confidence', () => {
		test('should throw an error if confidence is not a number', () => {
			const subject = new Prediction()
			const assignString = () => {
				subject.confidence = 'test'
			}

			expect(assignString).toThrow(Error)
		})

		test('should return a number', () => {
			const { confidence } = new Prediction()

			expect(typeof confidence).toStrictEqual('number')
		})

		test('should return the defined prediction confidence', () => {
			const subject = new Prediction({ confidence: 0.5 })

			expect(subject.confidence).toBeCloseTo(0.5)
		})

		test('should set the prediction confidence', () => {
			const subject = new Prediction()

			subject.confidence = 1

			expect(subject.confidence).toStrictEqual(1)
		})
	})
})
63 | vocabulary.terms = {} 64 | }).toThrow(Error) 65 | }) 66 | }) 67 | 68 | describe('add', () => { 69 | test('should throw an error if terms is not a string, array or set', () => { 70 | const vocabulary = new Vocabulary() 71 | 72 | expect(() => vocabulary.add({})).toThrow(Error) 73 | }) 74 | 75 | test('should add a term to the vocabulary from a string', () => { 76 | const vocabulary = new Vocabulary() 77 | 78 | vocabulary.add('test') 79 | 80 | expect(Array.from(vocabulary.terms)).toStrictEqual(['test']) 81 | }) 82 | 83 | test('should add terms to the vocabulary from an array', () => { 84 | const vocabulary = new Vocabulary() 85 | 86 | vocabulary.add(['hello', 'world']) 87 | 88 | expect(Array.from(vocabulary.terms)).toStrictEqual([ 89 | 'hello', 90 | 'world' 91 | ]) 92 | }) 93 | 94 | test('should add terms to the vocabulary from a set', () => { 95 | const vocabulary = new Vocabulary() 96 | 97 | vocabulary.add(new Set(['hello', 'world'])) 98 | 99 | expect(Array.from(vocabulary.terms)).toStrictEqual([ 100 | 'hello', 101 | 'world' 102 | ]) 103 | }) 104 | 105 | test('should return vocabulary instance', () => { 106 | const vocabulary = new Vocabulary() 107 | 108 | expect(vocabulary.add('test')).toBeInstanceOf(Vocabulary) 109 | }) 110 | }) 111 | 112 | describe('remove', () => { 113 | test('should throw an error if terms is not a string, array or set', () => { 114 | const vocabulary = new Vocabulary() 115 | 116 | expect(() => vocabulary.remove({})).toThrow(Error) 117 | }) 118 | 119 | test('should remove a term to the vocabulary when called with a string', () => { 120 | const vocabulary = new Vocabulary(['test']) 121 | 122 | vocabulary.remove('test') 123 | 124 | expect(Array.from(vocabulary.terms)).toStrictEqual([]) 125 | }) 126 | 127 | test('should remove terms from the vocabulary when called with an array', () => { 128 | const vocabulary = new Vocabulary(['hello', 'world']) 129 | 130 | vocabulary.remove(['world']) 131 | 132 | 
expect(Array.from(vocabulary.terms)).toStrictEqual(['hello']) 133 | }) 134 | 135 | test('should remove terms from the vocabulary when called with a set', () => { 136 | const vocabulary = new Vocabulary(['hello', 'world']) 137 | 138 | vocabulary.remove(new Set(['world'])) 139 | 140 | expect(Array.from(vocabulary.terms)).toStrictEqual(['hello']) 141 | }) 142 | 143 | test('should return a vocabulary instance', () => { 144 | const vocabulary = new Vocabulary(['test']) 145 | 146 | expect(vocabulary.remove('test')).toBeInstanceOf(Vocabulary) 147 | }) 148 | }) 149 | 150 | describe('has', () => { 151 | test('should return a boolean', () => { 152 | const vocabulary = new Vocabulary() 153 | 154 | expect(typeof vocabulary.has('test')).toStrictEqual('boolean') 155 | }) 156 | 157 | test('should return whether a term exists in the vocabulary', () => { 158 | const vocabulary = new Vocabulary(['test']) 159 | 160 | expect(vocabulary.has('test')).toStrictEqual(true) 161 | }) 162 | }) 163 | 164 | describe('indexOf', () => { 165 | test('should return the index of an existing vocabulary term', () => { 166 | const vocabulary = new Vocabulary(['test']) 167 | 168 | expect(vocabulary.indexOf('test')).toStrictEqual(0) 169 | }) 170 | 171 | test('should return -1 for non-existing vocabulary terms', () => { 172 | const vocabulary = new Vocabulary() 173 | 174 | expect(vocabulary.indexOf('test')).toStrictEqual(-1) 175 | }) 176 | }) 177 | }) 178 | -------------------------------------------------------------------------------- /webpack.config.js: -------------------------------------------------------------------------------- 1 | require('core-js/stable') 2 | require('regenerator-runtime/runtime') 3 | 4 | const path = require('path') 5 | 6 | module.exports = { 7 | entry: { 8 | 'index': './src/index.js' 9 | }, 10 | output: { 11 | path: path.resolve(__dirname, 'lib'), 12 | filename: 'index.js', 13 | libraryTarget: 'umd', 14 | globalObject: 'this' 15 | }, 16 | module: { 17 | rules: [ 18 | { 19 | 
test: /\.js$/, 20 | exclude: /node_modules/, 21 | use: { 22 | loader: 'babel-loader', 23 | options: { 24 | plugins: [ 25 | ['@babel/plugin-transform-runtime', { 26 | corejs: 3, 27 | }] 28 | ], 29 | presets: ['@babel/preset-env'] 30 | } 31 | } 32 | } 33 | ] 34 | } 35 | } 36 | --------------------------------------------------------------------------------