├── src ├── index.js ├── encoder.js └── neural.js ├── package.json ├── benchmark ├── bench.js └── index.js ├── LICENSE ├── README.md └── .gitignore /src/index.js: -------------------------------------------------------------------------------- 1 | const { Neural } = require('./neural'); 2 | const { normalize, tokenize, Encoder } = require('./encoder'); 3 | 4 | module.exports = { 5 | normalize, 6 | tokenize, 7 | Encoder, 8 | Neural, 9 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "fastest_nlu", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "./benchmark/index.js", 6 | "scripts": { 7 | "start": "node ." 8 | }, 9 | "author": "", 10 | "license": "MIT" 11 | } 12 | -------------------------------------------------------------------------------- /benchmark/bench.js: -------------------------------------------------------------------------------- 1 | class Bench { 2 | constructor(settings = {}) { 3 | this.duration = settings.duration || 10000; 4 | this.transactionsPerRun = settings.transactionsPerRun || 1; 5 | } 6 | 7 | measure(fn, initfn) { 8 | const initValue = initfn(); 9 | const hrstart = process.hrtime(); 10 | let runs = 0; 11 | let elapsed = 0; 12 | let resultIteration; 13 | while (elapsed < this.duration) { 14 | resultIteration = fn(initValue); 15 | runs += 1; 16 | const hrend = process.hrtime(hrstart); 17 | elapsed = hrend[0] * 1000 + hrend[1] / 1000000; 18 | } 19 | const timePerTransaction = elapsed / (runs * this.transactionsPerRun); 20 | return 1000 / timePerTransaction; 21 | } 22 | } 23 | 24 | module.exports = { 25 | Bench, 26 | }; 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Jesús Seijas 4 | 5 | Permission is hereby granted, 
free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fastest_nlu 2 | 3 | ## Introduction 4 | The idea is to have an NLU (Natural Language Understanding) engine for Conversational AI that is fast to train, fast to run, but with good accuracy. 5 | 6 | To do the comparison we use the English and Spanish corpus from the Amazon MASSIVE dataset. 7 | https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding 8 | 9 | We will compare the speed and accuracy with RASA. 10 | 11 | ## Installation 12 | 13 | Download this repository. 14 | No need of ```npm install``` as there are no dependencies. 
15 | 16 | ## Run 17 | 18 | ```sh 19 | npm start 20 | ``` 21 | 22 | ## Results 23 | 24 | For the English corpus, these are the results: 25 | - Time for training: 3s 213.6769ms 26 | - Accuracy: 83.66% 27 | - Transactions per second: 181836.99495205944 28 | 29 | RASA Accuracy is 81.4%, time to train in RASA is 4517 seconds, Transactions per second in RASA are 84 30 | 31 | For the Spanish corpus, these are the results: 32 | - Time for training: 2s 488.0854ms 33 | - Accuracy: 81.91% 34 | - Transactions per second: 141800.23397571384 35 | 36 | RASA Accuracy is 80.4%, time to train in RASA is 4712 seconds, Transactions per second in RASA are 82 37 | -------------------------------------------------------------------------------- /benchmark/index.js: -------------------------------------------------------------------------------- 1 | const { Neural } = require('../src'); 2 | const { Bench } = require('./bench'); 3 | const corpusEn = require('./corpus-massive-en.json'); 4 | const corpusEs = require('./corpus-massive-es.json'); 5 | 6 | function execFn({ net, data }) { 7 | let good = 0; 8 | data.forEach((item) => { 9 | const classifications = net.run(item.utterance); 10 | if (classifications[0].intent === item.intent) { 11 | good += 1; 12 | } 13 | }); 14 | return { good, total: data.length} 15 | } 16 | 17 | function measureCorpus(corpus) { 18 | const testData = []; 19 | corpus.data.forEach((item) => { 20 | item.tests.forEach((test) => { 21 | testData.push({ utterance: test, intent: item.intent }); 22 | }); 23 | }); 24 | const net = new Neural(); 25 | const hrstart = process.hrtime(); 26 | net.train(corpus); 27 | const hrend = process.hrtime(hrstart); 28 | console.info('Time for training: %ds %dms', hrend[0], hrend[1] / 1000000); 29 | const result = execFn({ net, data: testData }); 30 | console.log(`Accuracy: ${(result.good * 100) / result.total}`); 31 | const bench = new Bench({ transactionsPerRun: testData.length }); 32 | const benchResult = bench.measure(execFn, () => ({ 
net, data: testData })); 33 | console.log(`Transactions per second: ${benchResult}`); 34 | } 35 | 36 | console.log('English corpus'); 37 | measureCorpus(corpusEn); 38 | console.log('\nSpanish corpus'); 39 | measureCorpus(corpusEs); 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | 78 | # Next.js build output 79 | .next 
80 | 81 | # Nuxt.js build / generate output 82 | .nuxt 83 | dist 84 | 85 | # Gatsby files 86 | .cache/ 87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js 88 | # https://nextjs.org/blog/next-9-1#public-directory-support 89 | # public 90 | 91 | # vuepress build output 92 | .vuepress/dist 93 | 94 | # Serverless directories 95 | .serverless/ 96 | 97 | # FuseBox cache 98 | .fusebox/ 99 | 100 | # DynamoDB Local files 101 | .dynamodb/ 102 | 103 | # TernJS port file 104 | .tern-port 105 | -------------------------------------------------------------------------------- /src/encoder.js: -------------------------------------------------------------------------------- 1 | const normalize = (str) => 2 | str 3 | .normalize('NFD') 4 | .replace(/[\u0300-\u036f]/g, '') 5 | .toLowerCase(); 6 | 7 | const tokenize = (str) => str.split(/[\s,.!?;:([\]'"¡¿)/]+/).filter((x) => x); 8 | 9 | class Encoder { 10 | constructor(processor) { 11 | this.processor = processor || ((str) => tokenize(normalize(str))); 12 | this.featureMap = new Map(); 13 | this.numFeature = 0; 14 | this.intentMap = new Map(); 15 | this.intents = []; 16 | } 17 | 18 | learnIntent(intent) { 19 | if (!this.intentMap.has(intent)) { 20 | this.intentMap.set(intent, this.intents.length); 21 | this.intents.push(intent); 22 | } 23 | } 24 | 25 | learnFeature(feature) { 26 | if (!this.featureMap.has(feature)) { 27 | this.featureMap.set(feature, this.numFeature); 28 | this.numFeature += 1; 29 | } 30 | } 31 | 32 | encodeText(text, learn = false) { 33 | const dict = {}; 34 | const keys = []; 35 | const features = this.processor(text); 36 | features.forEach((feature) => { 37 | if (learn) { 38 | this.learnFeature(feature); 39 | } 40 | const index = this.featureMap.get(feature); 41 | if (index !== undefined && dict[index] === undefined) { 42 | dict[index] = 1; 43 | keys.push(index); 44 | } 45 | }); 46 | return keys; 47 | } 48 | 49 | encode(text, intent, learn = false) { 50 | if (learn) { 51 | 
this.learnIntent(intent); 52 | } 53 | return { 54 | input: this.encodeText(text, learn), 55 | output: this.intentMap.get(intent), 56 | }; 57 | } 58 | 59 | encodeCorpus(corpus) { 60 | const result = { train: [], validation: [] }; 61 | corpus.forEach(({ utterances, intent }) => { 62 | if (utterances) { 63 | utterances.forEach((utterance) => { 64 | result.train.push(this.encode(utterance, intent, true)); 65 | }); 66 | } 67 | }); 68 | corpus.forEach(({ tests, intent }) => { 69 | if (tests) { 70 | tests.forEach((test) => { 71 | result.validation.push(this.encode(test, intent)); 72 | }); 73 | } 74 | }); 75 | return result; 76 | } 77 | } 78 | 79 | module.exports = { normalize, tokenize, Encoder }; 80 | -------------------------------------------------------------------------------- /src/neural.js: -------------------------------------------------------------------------------- 1 | const { Encoder } = require('./encoder'); 2 | 3 | const defaultLogFn = (status, time) => 4 | console.log(`Epoch ${status.iterations} loss ${status.error} time ${time}ms`); 5 | 6 | const runInputPerceptron = (weights, input) => { 7 | const sum = input.reduce((acc, key) => acc + weights[key], 0); 8 | return sum <= 0 ? 0 : sum; 9 | }; 10 | 11 | class Neural { 12 | constructor(settings = {}) { 13 | this.settings = settings; 14 | this.settings.maxIterations ??= 150; 15 | this.settings.learningRate ??= 0.002; 16 | this.logFn = this.settings.log === true ? 
defaultLogFn : this.settings.log; 17 | } 18 | 19 | prepareCorpus(corpus) { 20 | this.encoder = this.settings.encoder || new Encoder(this.settings.processor); 21 | this.encoded = this.encoder.encodeCorpus(corpus); 22 | } 23 | 24 | initialize(corpus) { 25 | this.prepareCorpus(corpus); 26 | this.status = { error: Infinity, iterations: 0 }; 27 | this.perceptrons = this.encoder.intents.map((intent) => ({ 28 | intent, 29 | id: this.encoder.intentMap.get(intent), 30 | weights: new Float32Array(this.encoder.numFeature), 31 | })); 32 | } 33 | 34 | trainPerceptron(perceptron, data) { 35 | const { learningRate } = this.settings; 36 | const { weights } = perceptron; 37 | let error = 0; 38 | data.forEach(({ input, output }) => { 39 | const actualOutput = runInputPerceptron(weights, input, true); 40 | const expectedOutput = output === perceptron.id ? 1 : 0; 41 | const currentError = expectedOutput - actualOutput; 42 | if (currentError) { 43 | error += currentError ** 2; 44 | const change = currentError * learningRate; 45 | input.forEach((key) => { 46 | weights[key] += change; 47 | }); 48 | } 49 | }); 50 | return error; 51 | } 52 | 53 | train(corpus) { 54 | this.initialize(Array.isArray(corpus) ? 
corpus : corpus.data); 55 | const data = this.encoded.train; 56 | const { maxIterations } = this.settings; 57 | while (this.status.iterations < maxIterations) { 58 | const hrstart = new Date(); 59 | this.status.iterations += 1; 60 | this.status.error = 61 | this.perceptrons.reduce( 62 | (acc, perceptron) => acc + this.trainPerceptron(perceptron, data), 63 | 0 64 | ) / 65 | (data.length * this.perceptrons.length); 66 | if (this.logFn) { 67 | const hrend = new Date(); 68 | this.logFn(this.status, hrend.getTime() - hrstart.getTime()); 69 | } 70 | } 71 | return this.status; 72 | } 73 | 74 | run(text) { 75 | const input = this.encoder.encodeText(text); 76 | const result = []; 77 | this.perceptrons.forEach(({ weights, intent }) => { 78 | const score = runInputPerceptron(weights, input); 79 | if (score) { 80 | result.push({ intent, score }); 81 | } 82 | }); 83 | if (!result.length) { 84 | return [{ intent: 'None', score: 0 }]; 85 | } 86 | return result.sort((a, b) => b.score - a.score); 87 | } 88 | } 89 | 90 | module.exports = { Neural }; 91 | --------------------------------------------------------------------------------