├── .nvmrc ├── .npmignore ├── screenshot.jpg ├── public ├── favicon.ico ├── favicon-16x16.png ├── favicon-32x32.png ├── apple-touch-icon.png ├── browserconfig.xml ├── site.webmanifest └── safari-pinned-tab.svg ├── examples ├── importing │ ├── main.chatito │ ├── dir │ │ └── import.chatito │ └── nestedimport.chatito ├── citySearch_medium.chatito └── dateBooking_large.chatito ├── .prettierrc ├── web ├── lib │ ├── utils.ts │ └── editorConfig.ts ├── pages │ ├── 404.tsx │ └── index.tsx └── components │ ├── globalStyles.tsx │ ├── Logo.tsx │ └── Editor │ ├── editorStyles.tsx │ └── Editor.tsx ├── tslint.json ├── tsconfig.json ├── .gitignore ├── gatsby-config.js ├── .circleci └── config.yml ├── gatsby-node.js ├── .vscode └── launch.json ├── src ├── adapters │ ├── web.ts │ ├── luis.ts │ ├── flair.ts │ ├── rasa.ts │ └── snips.ts ├── utils.ts ├── types.ts ├── bin.ts ├── tests │ ├── parser.spec.ts │ └── bin.spec.ts └── main.ts ├── LICENSE ├── package.json ├── parser └── chatito.pegjs ├── readme.md └── spec.md /.nvmrc: -------------------------------------------------------------------------------- 1 | v10.7.0 -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | src/ 2 | coverage/ 3 | examples/ 4 | web/ 5 | .vscode/ -------------------------------------------------------------------------------- /screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/Chatito/master/screenshot.jpg -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/Chatito/master/public/favicon.ico -------------------------------------------------------------------------------- /public/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/Chatito/master/public/favicon-16x16.png -------------------------------------------------------------------------------- /public/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/Chatito/master/public/favicon-32x32.png -------------------------------------------------------------------------------- /examples/importing/main.chatito: -------------------------------------------------------------------------------- 1 | import ./dir/import.chatito 2 | 3 | %[greet] 4 | ~[hi] ~[how are you?] 5 | -------------------------------------------------------------------------------- /public/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/Chatito/master/public/apple-touch-icon.png -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "tabWidth": 4, 4 | "printWidth": 140, 5 | "singleQuote": true 6 | } -------------------------------------------------------------------------------- /examples/importing/dir/import.chatito: -------------------------------------------------------------------------------- 1 | import ../nestedimport.chatito 2 | 3 | ~[hi] 4 | ~[saludos1] ~[saludos2?] 
5 | ~[saludos2] -------------------------------------------------------------------------------- /examples/importing/nestedimport.chatito: -------------------------------------------------------------------------------- 1 | ~[saludos1] 2 | hola1 3 | hi1 4 | hihi1 5 | 6 | ~[saludos2] 7 | hola2 8 | hhi2 9 | hello2 -------------------------------------------------------------------------------- /web/lib/utils.ts: -------------------------------------------------------------------------------- 1 | export function debounce(func, wait) { 2 | let timeout; 3 | return function(...args) { 4 | const context = this; 5 | clearTimeout(timeout); 6 | timeout = setTimeout(() => func.apply(context, args), wait); 7 | }; 8 | } 9 | -------------------------------------------------------------------------------- /public/browserconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | #da532c 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /web/pages/404.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | 3 | const NotFoundPage = () => ( 4 |
5 | 404: Page not found.
6 | 
7 | You've hit the void. Go back.
8 | 
9 | 
10 | ); 11 | 12 | export default NotFoundPage; 13 | -------------------------------------------------------------------------------- /tslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "defaultSeverity": "error", 3 | "extends": ["tslint:recommended", "tslint-config-prettier"], 4 | "jsRules": {}, 5 | "rules": { 6 | "quotemark": [true, "single", "avoid-escape", "jsx-double"], 7 | "object-literal-sort-keys": false, 8 | "prettier": true 9 | }, 10 | "rulesDirectory": ["tslint-plugin-prettier"] 11 | } -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2015", 4 | "module": "commonjs", 5 | "jsx": "react", 6 | "strict": true, 7 | "declaration": true, 8 | "removeComments": true, 9 | "outDir": "./dist", 10 | "types": ["node", "jest"] 11 | }, 12 | "include": [ 13 | "src/**/*" 14 | ], 15 | "exclude": [ 16 | "dist", 17 | "node_modules", 18 | "src/tests", 19 | "**/*.spec.ts" 20 | ] 21 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /coverage 2 | /dist 3 | node_modules 4 | .DS_Store 5 | *.swp 6 | .tmp 7 | .cache 8 | /parser/chatito.js 9 | /public/**/* 10 | !/public/andorid-chrome-192x192.png 11 | !/public/andorid-chrome-512x512.png 12 | !/public/apple-touch-icon.png 13 | !/public/browserconfig.xml 14 | !/public/favicon-16x16.png 15 | !/public/favicon-32x32.png 16 | !/public/favicon.ico 17 | !/public/mstitle-150x150.png 18 | !/public/safari-pinned-tab.svg 19 | !/public/site.webmanifest 20 | -------------------------------------------------------------------------------- /gatsby-config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | pathPrefix: '/Chatito', 3 | siteMetadata: { 4 | title: 'Chatito' 5 | }, 6 | plugins: [ 7 | 'gatsby-plugin-typescript', 8 | { 9 | resolve: 'gatsby-plugin-page-creator', 10 | options: { 11 | path: `${__dirname}/web/pages` 12 | } 13 | }, 14 | 'gatsby-plugin-react-helmet', 15 | 'gatsby-plugin-styled-components' 16 | ] 17 | }; -------------------------------------------------------------------------------- /public/site.webmanifest: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "short_name": "", 4 | "icons": [ 5 | { 6 | "src": "/android-chrome-192x192.png", 7 | "sizes": "192x192", 8 | "type": "image/png" 9 | }, 10 | { 11 | "src": "/android-chrome-512x512.png", 12 | "sizes": "512x512", 13 | "type": "image/png" 14 | } 15 | ], 16 | "theme_color": "#ffffff", 17 | "background_color": "#ffffff", 18 | "display": "standalone" 19 | } 20 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | branches: 5 | only: 6 | - master 7 | docker: 8 | - image: circleci/node:8.11 9 | working_directory: ~/repo 10 | steps: 11 | - checkout 12 | - restore_cache: 13 | keys: 14 | - v1-dependencies-{{ checksum "package.json" }} 15 | - v1-dependencies- 16 | - run: npm install 17 | - save_cache: 18 | paths: 19 | - node_modules 20 | key: v1-dependencies-{{ checksum "package.json" }} 21 | - run: npm run test 22 | 
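      # note: "npm run test" resolves to "npx jest" (see the "scripts" section of package.json)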
-------------------------------------------------------------------------------- /gatsby-node.js: -------------------------------------------------------------------------------- 1 | exports.onCreateWebpackConfig = ({ actions, stage, loaders }) => { 2 | const jsLoader = loaders.js(); 3 | if (stage === 'build-javascript') { 4 | actions.setWebpackConfig({ 5 | module: { 6 | rules: [ 7 | { 8 | test: /\.tsx?$/, 9 | use: [ 10 | { 11 | loader: jsLoader.loader 12 | } 13 | ] 14 | } 15 | ] 16 | } 17 | }); 18 | } 19 | }; 20 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Debug Current TS File", 6 | "type": "node", 7 | "request": "launch", 8 | "program": "${workspaceRoot}/node_modules/ts-node/dist/bin.js", 9 | "args": ["${relativeFile}"], 10 | "cwd": "${workspaceRoot}", 11 | "protocol": "inspector", 12 | }, 13 | { 14 | "name": "Debug Current TS Test File", 15 | "type": "node", 16 | "request": "launch", 17 | "program": "${workspaceRoot}/node_modules/.bin/jest", 18 | "args": ["-i", "${relativeFile}"], 19 | "cwd": "${workspaceRoot}", 20 | "protocol": "inspector" 21 | } 22 | ] 23 | } 24 | -------------------------------------------------------------------------------- /examples/citySearch_medium.chatito: -------------------------------------------------------------------------------- 1 | %[findByCityAndCategory]('training': '1000', 'testing': '100') 2 | ~[greet?] ~[botName?] ~[please?] ~[find?] ~[restaurants?] ~[nearby] @[city] 3 | 4 | ~[greet] 5 | hey 6 | hi 7 | hello 8 | greetings 9 | 10 | ~[botName] 11 | Pia 12 | 13 | ~[please] 14 | please 15 | pls 16 | 17 | ~[find] 18 | find 19 | search 20 | lookup 21 | 22 | ~[nearby] 23 | close to 24 | in the area of 25 | within 26 | located at 27 | nearby 28 | 29 | ~[restaurants] 30 | restaurants 31 | places to eat 32 | where to eat 33 | 34 | ~[newYork]('synonym': 'true') 35 | new york ~[city?] 36 | ny ~[city?] 37 | 38 | ~[sanFrancisco]('synonym': 'true') 39 | san francisco 40 | san francisco city 41 | 42 | ~[atlanta]('synonym': 'true') 43 | atlanta 44 | atlanta city 45 | 46 | ~[city] 47 | city 48 | 49 | @[city]('entity': 'location') 50 | ~[newYork] 51 | ~[sanFrancisco] 52 | ~[atlanta] -------------------------------------------------------------------------------- /src/adapters/web.ts: -------------------------------------------------------------------------------- 1 | import * as gen from '../main'; 2 | import { ISentenceTokens } from '../types'; 3 | import * as utils from '../utils'; 4 | 5 | export interface IDefaultDataset { 6 | [intent: string]: ISentenceTokens[][]; 7 | } 8 | export async function adapter(dsl: string, formatOptions?: any, importer?: gen.IFileImporter, currentPath?: string) { 9 | const training: IDefaultDataset = {}; 10 | const testing: IDefaultDataset = {}; 11 | if (formatOptions) { 12 | utils.mergeDeep(training, formatOptions); 13 | } 14 | const utteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => { 15 | const dataset = isTrainingExample ? 
training : testing; 16 | if (!dataset[intentKey]) { 17 | dataset[intentKey] = []; 18 | } 19 | dataset[intentKey].push(utterance); 20 | }; 21 | await gen.datasetFromString(dsl, utteranceWriter, importer, currentPath); 22 | return { training, testing }; 23 | } 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Rodrigo Pimentel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/utils.ts: -------------------------------------------------------------------------------- 1 | // Deep merge objects 2 | // https://gist.github.com/Salakar/1d7137de9cb8b704e48a 3 | const isObject = (item: any) => item && typeof item === 'object' && !Array.isArray(item) && item !== null; 4 | const isArray = (item: any) => { 5 | if (typeof Array.isArray === 'undefined') { 6 | return Object.prototype.toString.call(item) === '[object Array]'; 7 | } else { 8 | return Array.isArray(item); 9 | } 10 | }; 11 | export const mergeDeep = (target: any, source: any): T => { 12 | if (isObject(target) && isObject(source)) { 13 | Object.keys(source).forEach(key => { 14 | if (isArray(source[key])) { 15 | if (target[key] === undefined) { 16 | target[key] = []; 17 | } 18 | target[key] = target[key].concat(source[key]); 19 | } else if (isObject(source[key])) { 20 | if (!target[key]) { 21 | Object.assign(target, { [key]: {} }); 22 | } 23 | mergeDeep(target[key], source[key]); 24 | } else { 25 | Object.assign(target, { [key]: source[key] }); 26 | } 27 | }); 28 | } 29 | return target; 30 | }; 31 | -------------------------------------------------------------------------------- /web/components/globalStyles.tsx: -------------------------------------------------------------------------------- 1 | import styled, { createGlobalStyle } from 'styled-components'; 2 | 3 | // tslint:disable-next-line:no-unused-expression 4 | export const Global: any = createGlobalStyle` 5 | *, *::after, *::before { 6 | margin: 0; 7 | padding: 0; 8 | box-sizing: border-box; 9 | } 10 | :focus { outline: none; } 11 | h1, h2 { display: inline; font-size: 20px; } 12 | ::-moz-focus-inner { border: 0; } 13 | html, body, #app { 14 | padding: 0; 15 | margin: 0; 16 | display: flex; 17 | flex-direction: column; 18 | flex: 1; 19 | height: auto !important; 20 | font-family: 'Helvetica Neue', 
Arial, Helvetica, sans-serif; 21 | background-color: #ccc; 22 | } 23 | a { text-decoration: none; } 24 | body { 25 | box-sizing: border-box; 26 | min-height: 100vh; 27 | background: #ececec; 28 | padding: 0; 29 | } 30 | a:focus, a:active, a:any-link { text-decoration: none; } 31 | `; 32 | 33 | export const Header = styled('div')` 34 | display: flex; 35 | align-items: center; 36 | justify-content: center; 37 | h1, 38 | h2 { 39 | display: inline; 40 | } 41 | a { 42 | text-decoration: none; 43 | color: #990adb; 44 | } 45 | a:hover { 46 | color: #b92afb; 47 | } 48 | color: '#444'; 49 | margin: 20px; 50 | `; 51 | -------------------------------------------------------------------------------- /src/adapters/luis.ts: -------------------------------------------------------------------------------- 1 | import * as gen from '../main'; 2 | import { ISentenceTokens } from '../types'; 3 | import * as utils from '../utils'; 4 | 5 | export interface ILuisEntityLabel { 6 | startCharIndex: number; 7 | endCharIndex: number; 8 | entityName: string; 9 | } 10 | export interface ILuisExample { 11 | text: string; 12 | intentName: string; 13 | entityLabels: ILuisEntityLabel[]; 14 | } 15 | export interface ILuisDataset { 16 | data: ILuisExample[]; 17 | } 18 | 19 | export interface ILuisTestingDataset { 20 | [intent: string]: ISentenceTokens[][]; 21 | } 22 | 23 | export async function adapter(dsl: string, formatOptions?: any, importer?: gen.IFileImporter, currentPath?: string) { 24 | const training: ILuisDataset = { data: [] }; 25 | const testing: ILuisDataset = { data: [] }; 26 | if (formatOptions) { 27 | utils.mergeDeep(training, formatOptions); 28 | } 29 | const utteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => { 30 | const example = utterance.reduce( 31 | (acc, next) => { 32 | if (next.type === 'Slot' && next.slot) { 33 | acc.entityLabels.push({ 34 | endCharIndex: acc.text.length + next.value.length, 35 | entityName: next.slot, 36 | startCharIndex: acc.text.length 37 | }); 38 | } 39 | acc.text += next.value; 40 | return acc; 41 | }, 42 | { text: '', intentName: intentKey, entityLabels: [] } as ILuisExample 43 | ); 44 | if (isTrainingExample) { 45 | training.data.push(example); 46 | } else { 47 | testing.data.push(example); 48 | } 49 | }; 50 | await gen.datasetFromString(dsl, utteranceWriter, importer, currentPath); 51 | return { training, testing }; 52 | } 53 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | export type IInnerEntitiesTypes = 'Alias' | 'Slot' | 'Text'; 2 | export interface IASTLocationProperties { 3 | offset: number; 4 | line: number; 5 | column: number; 6 | } 7 | export interface IASTLocation { 8 | start: IASTLocationProperties; 9 | end: IASTLocationProperties; 10 | } 11 | export interface ISentenceTokens { 12 | value: string; 13 | type: IInnerEntitiesTypes; 14 | opt?: boolean; 15 | location?: IASTLocation; 16 | variation?: string | null; 17 | slot?: string; 18 | synonym?: string; 19 | args?: { [key: string]: string }; 20 | } 21 | 22 | export interface ISingleSentence { 23 | sentence: ISentenceTokens[]; 24 | probability: null | string; 25 | cardinality?: number; 26 | } 27 | 28 | export interface IChatitoEntityAST { 29 | type: 'IntentDefinition' | 'AliasDefinition' | 'SlotDefinition' | 'Comment' | 'ImportFile'; 30 | key: string; 31 | inner: ISingleSentence[]; 32 | value?: string; 33 | location?: IASTLocation; 34 | 
variation?: string | null; 35 | args?: { [key: string]: string }; 36 | cardinality?: number; 37 | } 38 | 39 | export interface IChatitoParser { 40 | parse: (input: string) => IChatitoEntityAST[]; 41 | } 42 | export interface IEntityDef { 43 | [key: string]: IChatitoEntityAST; 44 | } 45 | export interface IEntities { 46 | Intent: IEntityDef; 47 | Slot: IEntityDef; 48 | Alias: IEntityDef; 49 | } 50 | 51 | export interface IStatCache { 52 | // optional: boolean; 53 | // optionalCounts: number; 54 | // totalCounts: number[]; 55 | counts: IChatitoCache[]; 56 | // sumOfTotalMax: number; 57 | maxCounts: number[]; 58 | probabilities: number[]; // value defined at probability operator 59 | // realProbabilities: number[]; // actual probability calculateed from the max possible utterances 60 | // utterancesToProvide: number[]; // the actual number of utterances each sentence will provide 61 | // resetedCounts: boolean; 62 | } 63 | export type IChatitoCache = Map; 64 | export type IUtteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => void; 65 | -------------------------------------------------------------------------------- /web/components/Logo.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | 3 | export default function Logo() { 4 | return ( 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | ); 14 | } 15 | -------------------------------------------------------------------------------- /src/adapters/flair.ts: -------------------------------------------------------------------------------- 1 | import { WriteStream } from 'fs'; 2 | import * as Tokenizer from 'wink-tokenizer'; 3 | import * as gen from '../main'; 4 | import { ISentenceTokens } from '../types'; 5 | 6 | const tokenizer = new Tokenizer(); 7 | 8 | export interface IDefaultDataset { 9 | [intent: string]: ISentenceTokens[][]; 10 | } 11 | export interface IFlairWriteStreams { 12 | trainClassification: WriteStream; 13 | testClassification: WriteStream; 14 | trainNER: WriteStream; 15 | testNER: WriteStream; 16 | } 17 | 18 | // NOTE: Flair adapter uses write streams to text files and requires two different formats 19 | // reference https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md 20 | // E.G: 21 | // npm run generate -- ./examples --format=flair --outputPath=./output --trainingFileName=training.txt --testingFileName=testing.txt 22 | export async function streamAdapter(dsl: string, ws: IFlairWriteStreams, imp?: gen.IFileImporter, currPath?: string) { 23 | // NOTE: the utteranceWriter is called with each sentences with aliases already replaced, 24 | // so the sentence toke can only be text or slot types. 25 | const utteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => { 26 | // classification dataset in FastText format 27 | const classificationText = utterance.map(v => v.value).join(''); 28 | const classificationLabel = intentKey.replace(/\s+/g, ''); 29 | const writeStreamClassif = isTrainingExample ? ws.trainClassification : ws.testClassification; 30 | writeStreamClassif.write(`__label__${classificationLabel} ${classificationText}` + '\n'); 31 | // named entity recognition dataset in two column with BIO-annotated NER tags (requires tokenization) 32 | const writeStreamNER = isTrainingExample ? 
ws.trainNER : ws.testNER; 33 | utterance.forEach(v => { 34 | const wordTokens = tokenizer.tokenize(v.value); 35 | if (v.type === 'Slot') { 36 | wordTokens.forEach((wt, idx) => { 37 | const slotBorI = idx === 0 ? 'B' : 'I'; 38 | const slotTag = v.slot!.toLocaleUpperCase().replace(/\s+/g, ''); 39 | writeStreamNER.write(`${wt.value} ${slotBorI}-${slotTag}` + '\n'); 40 | }); 41 | } else { 42 | wordTokens.forEach(wt => writeStreamNER.write(`${wt.value} O` + '\n')); 43 | } 44 | }); 45 | writeStreamNER.write('\n'); // always write an extra EOL at the end of sentences 46 | }; 47 | await gen.datasetFromString(dsl, utteranceWriter, imp, currPath); 48 | } 49 | -------------------------------------------------------------------------------- /web/pages/index.tsx: -------------------------------------------------------------------------------- 1 | import * as React from 'react'; 2 | import Helmet from 'react-helmet'; 3 | import Editor from '../components/Editor/Editor'; 4 | import { Global, Header } from '../components/globalStyles'; 5 | import Logo from '../components/Logo'; 6 | 7 | // NOTE: gatsby global for prefix 8 | declare var __PATH_PREFIX__; 9 | 10 | export default class Index extends React.Component<{}, {}> { 11 | public render() { 12 | return ( 13 |
22 | Chatito DSL - Generate dataset for chatbots
39 | Chatito
42 |  helps you generate datasets for natural language understanding models using a simple DSL 
43 | Read the docs
48 | 
49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/adapters/rasa.ts: -------------------------------------------------------------------------------- 1 | import * as gen from '../main'; 2 | import { ISentenceTokens } from '../types'; 3 | import * as utils from '../utils'; 4 | 5 | export interface IRasaEntity { 6 | start: number; 7 | end: number; 8 | value: string; 9 | entity: string; 10 | } 11 | export interface IRasaExample { 12 | text: string; 13 | intent: string; 14 | entities: IRasaEntity[]; 15 | } 16 | export interface IRasaDataset { 17 | rasa_nlu_data: { 18 | regex_features: any[]; 19 | entity_synonyms: Array<{ value: string; synonyms: string[] }>; 20 | common_examples: IRasaExample[]; 21 | }; 22 | } 23 | 24 | export interface IRasaTestingDataset { 25 | [intent: string]: ISentenceTokens[][]; 26 | } 27 | 28 | export async function adapter(dsl: string, formatOptions?: any, importer?: gen.IFileImporter, currentPath?: string) { 29 | const training: IRasaDataset = { 30 | rasa_nlu_data: { 31 | regex_features: [], 32 | entity_synonyms: [], 33 | common_examples: [] 34 | } 35 | }; 36 | const testing = { rasa_nlu_data: { common_examples: [] as IRasaExample[] } }; 37 | const synonyms: { [key: string]: Set } = {}; 38 | if (formatOptions) { 39 | utils.mergeDeep(training, formatOptions); 40 | } 41 | const utteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => { 42 | const example = utterance.reduce( 43 | (acc, next) => { 44 | if (next.type === 'Slot' && next.slot) { 45 | if (next.synonym) { 46 | if (!synonyms[next.synonym]) { 47 | synonyms[next.synonym] = new Set(); 48 | } 49 | if (next.synonym !== next.value) { 50 | synonyms[next.synonym].add(next.value); 51 | } 52 | } 53 | acc.entities.push({ 54 | end: acc.text.length + next.value.length, 55 | entity: next.slot, 56 | start: acc.text.length, 57 | value: next.synonym ? 
next.synonym : next.value 58 | }); 59 | } 60 | acc.text += next.value; 61 | return acc; 62 | }, 63 | { text: '', intent: intentKey, entities: [] } as IRasaExample 64 | ); 65 | if (isTrainingExample) { 66 | training.rasa_nlu_data.common_examples.push(example); 67 | } else { 68 | testing.rasa_nlu_data.common_examples.push(example); 69 | } 70 | }; 71 | await gen.datasetFromString(dsl, utteranceWriter, importer, currentPath); 72 | Object.keys(synonyms).forEach(k => { 73 | training.rasa_nlu_data.entity_synonyms.push({ 74 | synonyms: Array.from(synonyms[k]), 75 | value: k 76 | }); 77 | }); 78 | return { training, testing }; 79 | } 80 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "chatito", 3 | "version": "2.3.4", 4 | "description": "Generate training datasets for NLU chatbots using a simple DSL", 5 | "bin": { 6 | "chatito": "./dist/bin.js" 7 | }, 8 | "main": "./dist/main.js", 9 | "scripts": { 10 | "cleanup": "rm -rf .cache && rm -rf dist/* && find public -maxdepth 1 -not -name public -not -iname '*.png' -not -iname '*.ico' -not -iname '*.xml' -not -iname '*.svg' -not -iname '*.webmanifest' -exec rm -rv {} \\;", 11 | "prettier": "prettier --write '{web,src}/**/*.{ts,tsx}'", 12 | "prepublish": "npm run parser:build && npm run ts", 13 | "parser:build": "pegjs parser/chatito.pegjs", 14 | "ts": "tsc", 15 | "web:build": "npm run cleanup && gatsby build", 16 | "web:start": "gatsby develop", 17 | "web:deploy": "npm run cleanup && gatsby build --prefix-paths && gh-pages -d public", 18 | "generate": "node -r ts-node/register ./src/bin.ts", 19 | "test": "npx jest", 20 | "test:debug": "npm run test:kill && NODE_ENV=TEST node --inspect-brk ./node_modules/jest/bin/jest.js --no-cache --runInBand --forceExit --detectOpenHandles", 21 | "test:kill": "lsof -n -i4TCP:5858 | sed '1 d' | awk '{print $2}' | xargs kill -9", 22 | "lint": "npx tslint -c tslint.json 'src/**/*.ts' 'web/**/*.ts'" 23 | }, 24 | "engines": { 25 | "node": ">=8.11.2" 26 | }, 27 | "repository": { 28 | "type": "git", 29 | "url": "git+https://github.com/rodrigopivi/Chatito.git" 30 | }, 31 | "keywords": [ 32 | "nlu", 33 | "natural language processing", 34 | "typescript", 35 | "dataset generation", 36 | "named entity recognition", 37 | "nlp", 38 | "natural language understanding", 39 | "chatbot", 40 | "rasa nlu", 41 | "luis ai", 42 | "snips nlu" 43 | ], 44 | "author": { 45 | "name": "Rodrigo Pimentel", 46 | "url": "https://twitter.com/amaru_muru" 47 | }, 48 | "license": "MIT", 49 | "homepage": "https://github.com/rodrigopivi/Chatito", 50 | "dependencies": { 51 | "chance": "1.0.18", 52 | "minimist": "1.2.0", 53 | "wink-tokenizer": "5.2.1" 54 | }, 55 | "jest": { 56 | "transform": { 57 | "^.+\\.tsx?$": "ts-jest" 58 | }, 59 | "testRegex": "(/src/tests/.*|(\\.|/)(test|spec))\\.(tsx?)$", 60 | "moduleFileExtensions": [ 61 | "ts", 62 | "tsx", 63 | "js", 64 | "jsx", 65 | "json", 66 | "node" 67 | ], 68 | "collectCoverage": true, 69 | "coveragePathIgnorePatterns": [ 70 | "/node_modules/", 71 | "/dist/", 72 | "/src/test/", 73 | "/parser/" 74 | ] 75 | }, 76 | "devDependencies": { 77 | "@babel/core": "7.4.5", 78 | "@types/chance": "1.0.5", 79 | "@types/file-saver": "2.0.1", 80 | "@types/jest": "24.0.15", 81 | "@types/node": "12.0.10", 82 | "@types/react": "16.8.22", 83 | "@types/react-dom": "16.8.4", 84 | "@types/react-helmet": "5.0.8", 85 | "@types/react-router-dom": "4.3.4", 86 | "@types/wink-tokenizer": "4.0.0", 
87 | "babel-loader": "8.0.6", 88 | "babel-plugin-import": "1.12.0", 89 | "babel-plugin-styled-components": "1.10.6", 90 | "codeflask": "1.4.1", 91 | "core-js": "3.1.4", 92 | "file-saver": "2.0.2", 93 | "gatsby": "2.12.0", 94 | "gatsby-link": "2.2.0", 95 | "gatsby-plugin-react-helmet": "3.1.0", 96 | "gatsby-plugin-styled-components": "3.1.0", 97 | "gatsby-plugin-typescript": "2.1.0", 98 | "gh-pages": "2.0.1", 99 | "jest": "24.8.0", 100 | "pegjs": "0.10.0", 101 | "prettier": "1.18.2", 102 | "react": "16.8.6", 103 | "react-dom": "16.8.6", 104 | "react-helmet": "5.2.1", 105 | "react-json-view": "1.19.1", 106 | "react-router-dom": "5.0.1", 107 | "regenerator-runtime": "0.13.2", 108 | "styled-components": "4.3.2", 109 | "ts-jest": "24.0.2", 110 | "ts-node": "8.3.0", 111 | "tslint": "5.18.0", 112 | "tslint-config-prettier": "1.18.0", 113 | "tslint-plugin-prettier": "2.0.1", 114 | "typescript": "3.5.2" 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /examples/dateBooking_large.chatito: -------------------------------------------------------------------------------- 1 | %[bookRestaurantsAtDatetime]('training': '1000', 'testing': '100') 2 | ~[find?] ~[some?] ~[restaurants] ~[available?] from @[bookTime] to @[bookTime] 3 | ~[find?] ~[some?] ~[restaurants] ~[available?] from @[bookTime] to @[bookTime] on @[bookDate] 4 | 5 | ~[find] 6 | show 7 | look for 8 | search 9 | show me 10 | find 11 | identify 12 | book 13 | 14 | ~[some] 15 | all 16 | any 17 | some 18 | 19 | ~[available] 20 | available 21 | 22 | ~[restaurants] 23 | restaurants 24 | places where to eat 25 | buffets 26 | sushi 27 | food courts 28 | 29 | @[bookTime] 30 | ~[12hour] ~[timePeriods?] 31 | ~[12hour]~[:]~[minute] ~[timePeriods?] 32 | ~[24hour]~[:]~[minute] 33 | 34 | @[bookDate] 35 | ~[monthNames] ~[monthDays] 36 | ~[monthDays] of ~[monthNames] 37 | ~[monthDayNumbers]/~[monthNumbers] 38 | today 39 | tomorrow 40 | next ~[weekDays] 41 | 42 | ~[:] 43 | : 44 | 45 | ~[timePeriods] 46 | am 47 | pm 48 | 49 | 50 | ~[monthNames] 51 | January 52 | February 53 | March 54 | April 55 | May 56 | June 57 | July 58 | August 59 | September 60 | October 61 | November 62 | December 63 | 64 | ~[monthNumbers] 65 | 1 66 | 2 67 | 3 68 | 4 69 | 5 70 | 6 71 | 7 72 | 8 73 | 9 74 | 10 75 | 11 76 | 12 77 | 78 | 79 | ~[st] 80 | st 81 | 82 | ~[nd] 83 | nd 84 | 85 | ~[rd] 86 | rd 87 | 88 | ~[th] 89 | th 90 | 91 | ~[weekDays] 92 | Monday 93 | Tuesday 94 | Wednesday 95 | Thursday 96 | Friday 97 | Saturday 98 | Sunday 99 | Mon 100 | Tue 101 | Wed 102 | Thu 103 | Fri 104 | Sat 105 | Sun 106 | 107 | ~[monthDayNumbers] 108 | 1 109 | 2 110 | 3 111 | 4 112 | 5 113 | 6 114 | 7 115 | 8 116 | 9 117 | 10 118 | 11 119 | 12 120 | 13 121 | 14 122 | 15 123 | 16 124 | 17 125 | 18 126 | 19 127 | 20 128 | 21 129 | 22 130 | 23 131 | 24 132 | 25 133 | 26 134 | 27 135 | 28 136 | 29 137 | 30 138 | 31 139 | 140 | ~[monthDays] 141 | 1~[st?] 142 | 2~[nd?] 143 | 3~[rd?] 144 | 4~[th?] 145 | 5~[th?] 146 | 6~[th?] 147 | 7~[th?] 148 | 8~[th?] 149 | 9~[th?] 150 | 10~[th?] 151 | 11~[th?] 152 | 12~[th?] 153 | 13~[th?] 154 | 14~[th?] 155 | 15~[th?] 156 | 16~[th?] 157 | 17~[th?] 158 | 18~[th?] 159 | 19~[th?] 160 | 20~[th?] 161 | 21~[th?] 162 | 22~[th?] 163 | 23~[th?] 164 | 24~[th?] 165 | 25~[th?] 166 | 26~[th?] 167 | 27~[th?] 168 | 28~[th?] 169 | 29~[th?] 170 | 30~[th?] 171 | 31~[th?] 
172 | 173 | ~[12hour] 174 | 0 175 | 1 176 | 2 177 | 3 178 | 4 179 | 5 180 | 6 181 | 7 182 | 8 183 | 9 184 | 10 185 | 11 186 | 12 187 | 188 | ~[24hour] 189 | 0 190 | 1 191 | 2 192 | 3 193 | 4 194 | 5 195 | 6 196 | 7 197 | 8 198 | 9 199 | 10 200 | 11 201 | 12 202 | 13 203 | 14 204 | 15 205 | 16 206 | 17 207 | 18 208 | 19 209 | 20 210 | 21 211 | 22 212 | 23 213 | 214 | ~[minute] 215 | 00 216 | 01 217 | 02 218 | 03 219 | 04 220 | 05 221 | 06 222 | 07 223 | 08 224 | 09 225 | 0 226 | 1 227 | 2 228 | 3 229 | 4 230 | 5 231 | 6 232 | 7 233 | 8 234 | 9 235 | 10 236 | 11 237 | 12 238 | 13 239 | 14 240 | 15 241 | 16 242 | 17 243 | 18 244 | 19 245 | 20 246 | 21 247 | 22 248 | 23 249 | 24 250 | 25 251 | 26 252 | 27 253 | 28 254 | 29 255 | 30 256 | 31 257 | 32 258 | 33 259 | 34 260 | 35 261 | 36 262 | 37 263 | 38 264 | 39 265 | 40 266 | 41 267 | 42 268 | 43 269 | 44 270 | 45 271 | 46 272 | 47 273 | 48 274 | 49 275 | 50 276 | 51 277 | 52 278 | 53 279 | 54 280 | 55 281 | 56 282 | 57 283 | 58 284 | 59 -------------------------------------------------------------------------------- /src/adapters/snips.ts: -------------------------------------------------------------------------------- 1 | import * as gen from '../main'; 2 | import { ISentenceTokens } from '../types'; 3 | import * as utils from '../utils'; 4 | 5 | export interface ISnipsUtteranceData { 6 | text: string; 7 | entity?: string; 8 | slot_name?: string; 9 | } 10 | export interface ISnipsUtterance { 11 | data: ISnipsUtteranceData[]; 12 | } 13 | export interface ISnipsIntent { 14 | utterances: ISnipsUtterance[]; 15 | } 16 | export interface ISnipsDataset { 17 | intents: { [intentKey: string]: ISnipsIntent }; 18 | entities: { 19 | [entityKey: string]: { 20 | data?: Array<{ value: string; synonyms: string[] }>; 21 | use_synonyms?: boolean; 22 | automatically_extensible?: boolean; 23 | }; 24 | }; 25 | language: string; 26 | } 27 | 28 | export interface ISnipsTestingDataset { 29 | [intent: string]: ISentenceTokens[][]; 30 | } 31 | 32 | export async function adapter(dsl: string, formatOptions?: any, importer?: gen.IFileImporter, currentPath?: string) { 33 | const training: ISnipsDataset = { language: 'en', entities: {}, intents: {} }; 34 | const testing: ISnipsTestingDataset = {}; 35 | if (formatOptions) { 36 | utils.mergeDeep(training, formatOptions); 37 | } 38 | const synonymsForSlots: { 39 | [slot: string]: { [key: string]: Set }; 40 | } = {}; 41 | // const slots: Set = new Set(); 42 | const entities: Set = new Set(); 43 | const utteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => { 44 | if (isTrainingExample) { 45 | if (!training.intents[intentKey]) { 46 | training.intents[intentKey] = { utterances: [] }; 47 | } 48 | const data = utterance.map(u => { 49 | const ret: ISnipsUtteranceData = { text: u.value }; 50 | if (u.type === 'Slot' && u.slot) { 51 | ret.slot_name = u.slot; 52 | if (u.args) { 53 | Object.keys(u.args).forEach(key => { 54 | if (u.args && key === 'entity') { 55 | entities.add(u.args[key]); 56 | ret.entity = u.args[key]; 57 | } 58 | }); 59 | } 60 | if (!ret.entity) { 61 | ret.entity = u.slot; 62 | entities.add(u.slot); 63 | } 64 | if (u.synonym && ret.entity) { 65 | if (!synonymsForSlots[ret.entity]) { 66 | synonymsForSlots[ret.entity] = {}; 67 | } 68 | const synonyms = synonymsForSlots[ret.entity]; 69 | if (!synonyms[u.synonym]) { 70 | synonyms[u.synonym] = new Set(); 71 | } 72 | if (u.synonym !== u.value) { 73 | synonyms[u.synonym].add(u.value); 74 | } 75 | } 76 | } 77 | return ret; 78 | }); 
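// store the mapped token data as one training utterance under this intent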
79 | training.intents[intentKey].utterances.push({ data }); 80 | } else { 81 | if (!testing[intentKey]) { 82 | testing[intentKey] = []; 83 | } 84 | testing[intentKey].push(utterance); 85 | } 86 | }; 87 | await gen.datasetFromString(dsl, utteranceWriter, importer, currentPath); 88 | entities.forEach(slotKey => { 89 | if (!synonymsForSlots[slotKey]) { 90 | if (!training.entities[slotKey]) { 91 | training.entities[slotKey] = {}; 92 | } 93 | return; 94 | } 95 | Object.keys(synonymsForSlots[slotKey]).forEach(synonymsValue => { 96 | if (!training.entities[slotKey]) { 97 | training.entities[slotKey] = {}; 98 | } 99 | training.entities[slotKey].use_synonyms = true; 100 | training.entities[slotKey].automatically_extensible = true; 101 | if (!training.entities[slotKey].data) { 102 | training.entities[slotKey].data = []; 103 | } 104 | const slotSynonymsSet = synonymsForSlots[slotKey][synonymsValue]; 105 | const synonymsList = slotSynonymsSet.size ? Array.from(slotSynonymsSet) : []; 106 | (training.entities[slotKey].data as any[]).push({ 107 | synonyms: synonymsList, 108 | value: synonymsValue 109 | }); 110 | }); 111 | }); 112 | return { training, testing }; 113 | } 114 | -------------------------------------------------------------------------------- /web/lib/editorConfig.ts: -------------------------------------------------------------------------------- 1 | import * as rasaAdapter from '../../src/adapters/rasa'; 2 | import * as snipsAdapter from '../../src/adapters/snips'; 3 | import * as webAdapter from '../../src/adapters/web'; 4 | 5 | const findRestaurantsByCity = `import ./common.chatito 6 | 7 | # Ways to request a restaurant within a location (using probability operator) 8 | # NOTE: 60% of the examples should come from the first sentence, and 40% from the second 9 | 10 | %[findRestaurantsByCity]('training': '100', 'testing': '100') 11 | *[60%] ~[hi?] ~[please?] ~[find?] ~[restaurants] ~[located at] @[city] ~[city?] ~[thanks?] 12 | *[40%] ~[restaurants] ~[located at] @[city] 13 | 14 | @[city] 15 | ~[new york] 16 | ~[san francisco] 17 | ~[atlanta] 18 | 19 | ~[find] 20 | find 21 | i'm looking for 22 | help me find 23 | 24 | ~[located at] 25 | located at 26 | in the area of 27 | near 28 | 29 | ~[restaurants] 30 | restaurants 31 | places to eat 32 | where to eat 33 | `; 34 | 35 | const affirmative = `// Ways to say yes 36 | 37 | import ./common.chatito 38 | 39 | %[affirmative]('training': '50', 'testing': '50') 40 | *[20%] ~[yes] 41 | ~[yes] ~[please?] 42 | ~[yes] ~[yes?] ~[thanks?] 43 | ~[yes?] ~[that is good] ~[yes?] 44 | 45 | ~[yes] 46 | yes 47 | right 48 | affirmative 49 | agreed 50 | correct 51 | yep 52 | yes sir 53 | sounds good 54 | im ok with that 55 | 56 | ~[that is good] 57 | that is good 58 | i want that 59 | that is fine 60 | that is correct 61 | that is what i want 62 | you understood me 63 | that is right 64 | its fine 65 | good 66 | `; 67 | 68 | const bye = `// Ways to say goodbye 69 | 70 | import ./common.chatito 71 | 72 | %[bye]('training': '50', 'testing': '50') 73 | *[20%] ~[bye] 74 | ~[thanks?] ~[bye] 75 | ~[bye] ~[thanks?] 
76 | ~[leaving] 77 | ~[leaving] ~[bye] 78 | 79 | ~[bye] 80 | bye 81 | goodbye 82 | ttyl 83 | gtg 84 | adios 85 | farewell 86 | adieu 87 | chao 88 | chau 89 | 90 | ~[leaving] 91 | leaving 92 | talk to you soon 93 | have to go 94 | got to go 95 | talk to you later 96 | heading out 97 | im leaving now 98 | going out 99 | `; 100 | 101 | const greet = `// Ways to say hello 102 | 103 | import ./common.chatito 104 | 105 | %[greet]('training': '50', 'testing': '50') 106 | *[20%] ~[hi] 107 | ~[greetings] 108 | ~[hi] ~[greetings?] 109 | ~[hi] ~[whats up] 110 | ~[greetings] ~[whats up] 111 | ~[hi] ~[greetings] ~[whats up] 112 | 113 | ~[greetings] 114 | greetings 115 | good morning 116 | good afternoon 117 | good day 118 | good night 119 | morning 120 | 121 | ~[whats up] 122 | how are you 123 | whats up 124 | how are you doing 125 | how is it going 126 | are you there 127 | how are things going 128 | are you around 129 | whatsup 130 | sup 131 | are you around? 132 | `; 133 | 134 | const negative = `// Ways to say no 135 | 136 | import ./common.chatito 137 | 138 | %[negative]('training': '50', 'testing': '50') 139 | *[20%] ~[no] 140 | ~[no] ~[please?] ~[its not ok?] 141 | ~[please?] ~[no] ~[its not ok?] 142 | *[20%] ~[its not ok] 143 | 144 | ~[no] 145 | no 146 | nope 147 | not really 148 | that's not right 149 | incorrect 150 | don't do that 151 | 152 | ~[its not ok] 153 | i don't want that 154 | didnt meant that 155 | dont mean that 156 | that's not what i want 157 | that's not correct 158 | that's wrong 159 | it's not good 160 | that is wrong 161 | its not ok 162 | its not correct 163 | `; 164 | 165 | const common = `// Common entities to be imported and reused 166 | ~[hi] 167 | hi 168 | hello 169 | hey 170 | 171 | ~[please] 172 | please 173 | plz 174 | pls 175 | 176 | ~[thanks] 177 | thanks 178 | thank you 179 | 180 | `; 181 | 182 | export const tabs = [ 183 | { title: 'findRestaurantsByCity.chatito', value: findRestaurantsByCity }, 184 | { title: 'greet.chatito', value: greet }, 185 | { title: 'bye.chatito', value: bye }, 186 | { title: 'affirmative.chatito', value: affirmative }, 187 | { title: 'negative.chatito', value: negative }, 188 | { title: 'common.chatito', value: common } 189 | ]; 190 | 191 | export const chatitoPrism = { 192 | comments: [{ pattern: /^(\/\/|\#).*/, greedy: true }, { pattern: /((\n|\r\n)+)(\/\/|\#).*/, greedy: true }], 193 | imports: [{ pattern: /(\n|\r\n)import\s/, greedy: true }, { pattern: /^import\s/, greedy: true }], 194 | intentDefinition: [ 195 | { 196 | pattern: /^%\[[^\]]+\]((\(.+\))?)/, 197 | inside: { intentArguments: /((\(.+\))?)$/ } 198 | }, 199 | { 200 | pattern: /((\n|\r\n)+)%\[[^\]]+\]((\(.+\))?)/, 201 | inside: { intentArguments: /((\(.+\))?)$/ } 202 | } 203 | ], 204 | slotDefinition: [ 205 | { 206 | pattern: /^\@\[[^\]]+\]((\(.+\))?)/, 207 | inside: { slotArguments: /((\(.+\))?)$/ } 208 | }, 209 | { 210 | pattern: /((\n|\r\n)+)\@\[[^\]]+\]((\(.+\))?)/, 211 | inside: { slotArguments: /((\(.+\))?)$/ } 212 | } 213 | ], 214 | aliasDefinition: [ 215 | { 216 | pattern: /^~\[[^\]]+\]((\(.+\))?)/, 217 | inside: { aliasArguments: /((\(.+\))?)$/ } 218 | }, 219 | { 220 | pattern: /((\n|\r\n)+)~\[[^\]]+\]((\(.+\))?)/, 221 | inside: { aliasArguments: /((\(.+\))?)$/ } 222 | } 223 | ], 224 | probability: { pattern: /(\n|\r\n)\s\s\s\s\*\[[^\]]+\]/, greedy: true }, 225 | slot: { pattern: /\@\[[^\]]+(\?)?\]/, greedy: true }, 226 | alias: { pattern: /~\[[^\]]+(\?)?\]/, greedy: true }, 227 | default: { pattern: /[^\r\n]/i, greedy: true } 228 | }; 229 | 230 | export const 
webDefaultOptions: webAdapter.IDefaultDataset = {}; 231 | export const rasaDefaultOptions: rasaAdapter.IRasaDataset = { 232 | rasa_nlu_data: { 233 | regex_features: [], 234 | entity_synonyms: [], 235 | common_examples: [] 236 | } 237 | }; 238 | export const snipsDefaultOptions: snipsAdapter.ISnipsDataset = { language: 'en', entities: {}, intents: {} }; 239 | -------------------------------------------------------------------------------- /public/safari-pinned-tab.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 7 | 8 | Created by potrace 1.11, written by Peter Selinger 2001-2013 9 | 10 | 12 | 81 | 85 | 88 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /parser/chatito.pegjs: -------------------------------------------------------------------------------- 1 | { var STEP = 4; var level = 0; var entry = false; } 2 | 3 | Start = (ImportFile/TopLevelStatement/CommentLine)+ 4 | TopLevelStatement = od:(IntentDefinition/SlotDefinition/AliasDefinition) { return od; } 5 | 6 | // ============= Probability operator ============= 7 | ProbabilityOperatorDefinition = "*[" probability:Number percent:"%"? "]" { return `${probability}${percent || ''}`; } 8 | // ============= Entities ============= 9 | EntityOpt = "?" 10 | EntityBody = "[" value:EntityKeywordLiteral "]" { return value } 11 | EntityOptionalBody = "[" value:EntityKeywordLiteral opt:EntityOpt? "]" 12 | { return { value: value, opt: !!opt }; } 13 | 14 | // Intent 15 | EntityIntentDefinition = "%" value:EntityBody args:EntityArguments? 16 | { return { value: value, type: "IntentDefinition", args: args, location: location() } } 17 | // Intents allow any text except end of lines, alias and slot definitions (because they are parsed as another value) 18 | AnyTextWithSlotAndAlias = v:(t:((!"\r\n")(!"\n")(!"~[")(!"@[") .) { return t.join(""); })+ { return v.join(""); } 19 | IntentAndSlotKeywordLiterals = value:AnyTextWithSlotAndAlias { return { value: value, type: "Text" }} 20 | IntentAndSlotValidInner = (OptionalSlot/OptionalAlias/IntentAndSlotKeywordLiterals)+ 21 | IntentAndSlotInnerStatements = IntentAndSlotInnerStatement+ 22 | IntentAndSlotInnerStatement = Samedent p:ProbabilityOperatorDefinition? s:IntentAndSlotValidInner EOS 23 | { return { sentence: s, probability: p }; } 24 | IntentDefinition = EOL? o:EntityIntentDefinition EOL 25 | Indent s:IntentAndSlotInnerStatements Dedent 26 | { return { type: o.type, key: o.value, args: o.args, location: o.location, inner: s } } 27 | 28 | // Slot 29 | SlotVariationStartDefinition = "#" 30 | SlotVariationDefinition = SlotVariationStartDefinition id:SlotKeywordLiteral { return id } 31 | EntitySlotDefinition = "@[" value:SlotKeywordLiteral variation:SlotVariationDefinition? "]" args:EntityArguments? 32 | { return { value: value, type: "SlotDefinition", variation: variation, args: args, location: location() } } 33 | SlotOptionalBody = "[" value:SlotKeywordLiteral variation:SlotVariationDefinition? opt:EntityOpt? "]" 34 | { return { value: value, opt: !!opt, variation: variation }; } 35 | OptionalSlot = "@" op:SlotOptionalBody 36 | { return { value: op.value, type: "Slot", opt: op.opt, location: location(), variation: op.variation } } 37 | // Slots allow any text except end of lines and alias definitions (because they are parsed as another value) 38 | AnyTextWithAlias = v:(t:((!"\r\n")(!"\n")(!"~[") .) 
{ return t.join(""); })+ { return v.join(""); } 39 | SlotKeywordLiterals = value:AnyTextWithAlias { return { value: value, type: "Text" }} 40 | SlotValidInner = (OptionalAlias/SlotKeywordLiterals)+ 41 | SlotInnerStatement = Samedent p:ProbabilityOperatorDefinition? s:SlotValidInner EOS { return { sentence: s, probability: p }; } 42 | SlotInnerStatements = SlotInnerStatement+ 43 | SlotDefinition = EOL? o:EntitySlotDefinition EOL 44 | Indent s:SlotInnerStatements Dedent 45 | { return { type: o.type, key: o.value, args: o.args, location: o.location, inner: s, variation: o.variation } } 46 | 47 | // Alias 48 | EntityAliasDefinition = "~" value:EntityBody args:EntityArguments? 49 | { return { value: value, type: "AliasDefinition", location: location(), args: args } } 50 | OptionalAlias = "~" op:EntityOptionalBody { return { value: op.value, type: "Alias", opt: op.opt } } 51 | AliasDefinition = EOL? o:EntityAliasDefinition EOL 52 | Indent s:IntentAndSlotInnerStatements Dedent 53 | { return { type: o.type, key: o.value, location: o.location, inner: s, args: o.args } } 54 | 55 | // ============= Identation ============= 56 | Samedent "correct indentation" = s:" "* &{ return s.length === level * STEP; } 57 | Indent = &{ level++; return true; } 58 | Dedent = &{ level--; return true; } 59 | 60 | // ============= Primitives ============= 61 | AnyTextWithoutEOL = v:(t:((!"\r\n")(!"\n") .) { return t.join(""); })+ { return v.join(""); } 62 | DoubleSlashCommentLine = EOL? "//" c:AnyTextWithoutEOL EOS? { return { type: "Comment" , value: c.trim() }; } 63 | HashCommentLine = EOL? "#" c:AnyTextWithoutEOL EOS? { return { type: "Comment" , value: c.trim() }; } 64 | CommentLine = (DoubleSlashCommentLine/HashCommentLine) 65 | 66 | ImportFile = EOL? "import " s:AnyTextWithoutEOL EOS? { return { type: "ImportFile", value: s.trim() }; } 67 | 68 | // KeywordLiteral "word" = v:([a-zA-Z0-9_ \:\+]+) { return v.join(""); } 69 | BasicKeywordLiteral "entity name" = v:(t:((!"\r\n")(!"\n")(!"]") .) { return t.join(""); })+ { return v.join(""); } 70 | EntityKeywordLiteral "entity name" = v:(t:((!"\r\n")(!"\n")(!"]")(!"?") .) { return t.join(""); })+ { return v.join(""); } 71 | SlotKeywordLiteral "entity name" = v:(t:((!"\r\n")(!"\n")(!"#")(!"]")(!"?") .) { return t.join(""); })+ { return v.join(""); } 72 | 73 | // Number 74 | Number "number" = int frac? { return parseFloat(text()); } 75 | DecimalPoint = "." 76 | Digit1_9 = [1-9] 77 | Digit0_9 = [0-9] 78 | frac = DecimalPoint Digit0_9+ 79 | int = zero / (Digit1_9 Digit0_9*) 80 | zero = "0" 81 | 82 | EOS "end of sentence" = EOL / EOF 83 | EOL "end of line "= (EOLNonWindows/EOLWindows)+ 84 | EOLNonWindows "non windows end of line" = "\n" 85 | EOLWindows "windows end of line" = "\r\n" 86 | EOF = !. 87 | 88 | // ============= Entity arguments ============= 89 | EntityArguments = "(" args:(EntityArg)+ ")" { 90 | return args.reduce(function (prev, curr) { prev[curr.key] = curr.value; return prev; }, {}); 91 | } 92 | EntityArg = (" "*)? ek:ArgumentKeyValueString (" "*)? ":" (" "*)? ev:ArgumentKeyValueString ((" "*)? ",")? (" "*)? { return { key: ek, value: ev }; } 93 | // EntityValidKeyOrValue = v:(t:((!"\r\n")(!"\n")(!"=")(!",")(!")")(!"(") .) 
{ return t.join(""); })+ { return v.join(""); } 94 | // based from json parser from https://github.com/pegjs/pegjs/blob/master/examples/json.pegjs 95 | ArgumentKeyValueString 96 | = '"' chars:DoubleQuotedString* '"' { return chars.join(''); } 97 | / "'" chars:SingleQuotedString* "'" { return chars.join(''); } 98 | DoubleQuotedString 99 | = !('"' / "\\" / "\n") char:. { return char; } 100 | / "\\" sequence:StringEscapedChars { return sequence; } 101 | SingleQuotedString 102 | = !("'" / "\\" / "\n") char:. { return char; } 103 | / "\\" sequence:StringEscapedChars { return sequence; } 104 | StringEscapedChars 105 | = "'" 106 | / '"' 107 | / "\\" 108 | / "b" { return "\b"; } 109 | / "f" { return "\f"; } 110 | / "n" { return "\n"; } 111 | / "r" { return "\r"; } 112 | / "t" { return "\t"; } 113 | / "v" { return "\x0B"; } 114 | / "u" digits:$(HEXDIG HEXDIG HEXDIG HEXDIG) { return String.fromCharCode(parseInt(digits, 16)); } 115 | HEXDIG = [0-9a-f]i -------------------------------------------------------------------------------- /web/components/Editor/editorStyles.tsx: -------------------------------------------------------------------------------- 1 | import styled from 'styled-components'; 2 | 3 | export const AlertNotification = styled('div')` 4 | width: 100%; 5 | background-color: ${({ state }: { state: 'error' | 'warning' | 'success' }) => 6 | state === 'error' ? '#c80000' : state === 'warning' ? '#7f8000' : '#008800'}; 7 | bottom: 0; 8 | margin: auto; 9 | right: 0; 10 | text-align: center; 11 | padding: 12px; 12 | color: white; 13 | z-index: 99; 14 | font-size: 14px; 15 | `; 16 | 17 | export const CodeStyles = styled('div')` 18 | white-space: pre-wrap; 19 | position: relative; 20 | margin: auto; 21 | width: inherit; 22 | height: calc(100vh - 210px) !important; 23 | min-height: 400px; 24 | background-color: #282a35; 25 | > .codeflask { 26 | background-color: #282a35; 27 | > textarea.codeflask__textarea { 28 | color: #282a35; 29 | caret-color: #fff; 30 | } 31 | &.codeflask--has-line-numbers { 32 | :before { 33 | background-color: #3c3c4c; 34 | } 35 | > pre { 36 | width: auto !important; 37 | } 38 | div.codeflask__lines { 39 | z-index: 3; 40 | height: auto !important; 41 | padding: 10px 4px 0 0; 42 | > .codeflask__lines__line { 43 | color: #6473a0; 44 | background-color: #3c3c4c; 45 | } 46 | } 47 | } 48 | *::-webkit-scrollbar { 49 | width: 10px; 50 | height: 10px; 51 | } 52 | *::-webkit-scrollbar-thumb { 53 | background-color: #7c7c9c; 54 | box-shadow: inset 0 0 2px rgba(0, 0, 0, 0.8); 55 | } 56 | *::-webkit-scrollbar-track { 57 | box-shadow: inset 0 0 2px rgba(0, 0, 0, 0.8); 58 | } 59 | *::-webkit-scrollbar-corner { 60 | background-color: transparent; 61 | } 62 | } 63 | .token.imports { 64 | color: #f7717d; 65 | } 66 | .token.comments { 67 | color: #999; 68 | } 69 | .token.intentDefinition { 70 | color: #ef82c3; 71 | } 72 | .token.slotDefinition { 73 | color: #ffaf56; 74 | } 75 | .token.aliasDefinition { 76 | color: #a0e7fb; 77 | } 78 | .token.probability { 79 | color: #00f0b5; 80 | } 81 | .token.slot { 82 | color: #ffaf56; 83 | } 84 | .token.alias { 85 | color: #a0e7fb; 86 | } 87 | .token.default { 88 | color: #e2e2dd; 89 | } 90 | .token.intentArguments { 91 | color: #b5669e; 92 | } 93 | .token.slotArguments { 94 | color: #7a9d98; 95 | } 96 | .token.aliasArguments { 97 | color: #80c7db; 98 | } 99 | `; 100 | 101 | export const TabButton = styled('div')` 102 | cursor: pointer; 103 | display: inline-block; 104 | background-color: ${({ active }: { active: boolean }) => (active ? 
'#282A35' : '#3c3c4c')}; 105 | font-size: 12px; 106 | color: #ededed; 107 | padding: 13px 3px 13px 13px; 108 | border-right: 1px solid #2c2c3c; 109 | zoom: 1; 110 | -webkit-touch-callout: none; 111 | -webkit-user-select: none; 112 | -moz-user-select: none; 113 | -ms-user-select: none; 114 | user-select: none; 115 | `; 116 | 117 | export const CloseTab = styled('div')` 118 | :after { 119 | content: 'x'; 120 | } 121 | padding: 8px; 122 | margin-left: 8px; 123 | display: inline-block; 124 | color: #ccf; 125 | line-height: 10px; 126 | font-size: 14px; 127 | cursor: pointer; 128 | font-weight: bold; 129 | `; 130 | 131 | export const EditorHeader = styled('div')` 132 | display: flex; 133 | flex-direction: row; 134 | width: 100%; 135 | max-width: 100%; 136 | background-color: #3c3c4c; 137 | padding-left: 40px; 138 | padding-top: 10px; 139 | `; 140 | 141 | export const TabsAreaButton = styled('button')` 142 | cursor: pointer; 143 | background-color: #6c1de2; 144 | font-size: 12px; 145 | color: #efefef; 146 | line-height: 14px; 147 | padding: 8px 24px; 148 | white-space: nowrap; 149 | margin: auto 10px; 150 | border-radius: 4px; 151 | border-color: #333; 152 | -webkit-transition: 0.25s ease; 153 | -moz-transition: 0.25s ease; 154 | -o-transition: 0.25s ease; 155 | transition: 0.25s ease; 156 | &:first-of-type { 157 | margin-left: 20px; 158 | } 159 | :disabled { 160 | border: 1px solid #999999; 161 | background-color: #cccccc; 162 | color: #666666; 163 | } 164 | `; 165 | 166 | export const TabsArea = styled('div')` 167 | width: auto; 168 | max-width: 100%; 169 | white-space: nowrap; 170 | position: relative; 171 | overflow-x: auto; 172 | overflow-y: hidden; 173 | -webkit-overflow-scrolling: touch; 174 | &::-webkit-scrollbar { 175 | height: 6px; 176 | } 177 | &::-webkit-scrollbar-thumb { 178 | background-color: #7c7c9c; 179 | -webkit-box-shadow: inset 0 0 2px rgba(0, 0, 0, 0.8); 180 | } 181 | &::-webkit-scrollbar-track { 182 | -webkit-box-shadow: inset 0 0 2px rgba(0, 0, 0, 0.8); 183 | } 184 | *::-webkit-scrollbar-corner { 185 | background-color: transparent; 186 | } 187 | `; 188 | 189 | export const EditorWrapper = styled('div')` 190 | width: 90vw; 191 | overflow: auto; 192 | margin: auto; 193 | position: relative; 194 | -webkit-box-shadow: 0px 0px 36px 2px rgba(0, 0, 0, 0.63); 195 | -moz-box-shadow: 0px 0px 36px 2px rgba(0, 0, 0, 0.63); 196 | box-shadow: 0px 0px 36px 2px rgba(0, 0, 0, 0.63); 197 | `; 198 | 199 | export const Drawer = styled('div')` 200 | z-index: 99; 201 | position: absolute; 202 | background-color: #352252; 203 | -webkit-box-shadow: -5px 0px 5px -5px rgba(0, 0, 0, 0.55); 204 | -moz-box-shadow: -5px 0px 5px -5px rgba(0, 0, 0, 0.55); 205 | box-shadow: -5px 0px 5px -5px rgba(0, 0, 0, 0.55); 206 | top: 0; 207 | right: 0; 208 | max-width: 700px; 209 | height: 100%; 210 | width: ${({ showDrawer }: { showDrawer: boolean }) => (showDrawer ? `100%` : `0px`)}; 211 | -webkit-transition: 0.65s ease; 212 | -moz-transition: 0.65s ease; 213 | -o-transition: 0.65s ease; 214 | transition: 0.65s ease; 215 | overflow: auto; 216 | `; 217 | 218 | export const EditorOverlay = styled('div')` 219 | z-index: 999; 220 | position: absolute; 221 | top: 0; 222 | left: 0; 223 | width: 100%; 224 | height: 100%; 225 | background: rgba(0, 0, 0, 0.6); 226 | visibility: ${({ showDrawer }: { showDrawer: boolean }) => (showDrawer ? 
'visible' : 'hidden')}; 227 | -webkit-transition: 0.25s ease; 228 | -moz-transition: 0.25s ease; 229 | -o-transition: 0.25s ease; 230 | transition: 0.25s ease; 231 | `; 232 | 233 | export const BlockWrapper = styled('div')` 234 | background-color: #e4e4e4; 235 | margin: 20px; 236 | overflow: auto; 237 | border-radius: 8px; 238 | -webkit-box-shadow: 0px 0px 50px 0px rgba(0, 0, 0, 0.4); 239 | -moz-box-shadow: 0px 0px 50px 0px rgba(0, 0, 0, 0.4); 240 | box-shadow: 0px 0px 50px 0px rgba(0, 0, 0, 0.4); 241 | clear: both; 242 | `; 243 | 244 | export const BlockWrapperTitle = styled('div')` 245 | background-color: #6b5a86; 246 | color: #efefef; 247 | font-size: 13px; 248 | padding: 8px 10px; 249 | border-top-left-radius: 8px; 250 | border-top-right-radius: 8px; 251 | `; 252 | 253 | export const CloseDrawerButton = styled('div')` 254 | cursor: pointer; 255 | color: white; 256 | font-size: 16px; 257 | padding: 8px; 258 | font-weight: bold; 259 | margin: 8px 20px 8px 20px; 260 | float: right; 261 | `; 262 | 263 | export const DrawerFormField = styled('div')` 264 | padding: 10px 20px; 265 | display: flex; 266 | align-items: center; 267 | flex: 1; 268 | > label { 269 | font-size: 12px; 270 | padding-right: 10px; 271 | } 272 | `; 273 | 274 | export const SelectWrapper = styled('div')` 275 | position: relative; 276 | z-index: 0; 277 | display: inline-block; 278 | overflow: hidden; 279 | height: auto; 280 | padding: 0 5px 0 0; 281 | margin: 0 5px 0 0; 282 | border-radius: 5px; 283 | border: solid 1px #ccc; 284 | background-color: #fff; 285 | :before { 286 | position: absolute; 287 | z-index: 1; 288 | content: '\\25BE'; 289 | top: 50%; 290 | right: 10px; 291 | margin-top: -9px; 292 | } 293 | select { 294 | position: relative; 295 | z-index: 2; 296 | outline: none; 297 | width: 120%; 298 | padding: 5px 20px 5px 10px; 299 | background-color: transparent; 300 | background-image: none; 301 | -webkit-appearance: none; 302 | border: none; 303 | box-shadow: none; 304 | } 305 | `; 306 | 307 | export const CheckboxWrapper = styled('div')` 308 | font-size: 12px; 309 | text-decoration: underline; 310 | cursor: pointer; 311 | input { 312 | margin-right: 10px; 313 | cursor: pointer; 314 | } 315 | `; 316 | -------------------------------------------------------------------------------- /src/bin.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import * as fs from 'fs'; 3 | import * as path from 'path'; 4 | import * as flair from './adapters/flair'; 5 | import * as luis from './adapters/luis'; 6 | import * as rasa from './adapters/rasa'; 7 | import * as snips from './adapters/snips'; 8 | import * as web from './adapters/web'; 9 | import { config, VALID_AUTO_ALIASES, VALID_DISTRIBUTIONS } from './main'; 10 | import * as utils from './utils'; 11 | 12 | // tslint:disable-next-line:no-var-requires 13 | const argv = require('minimist')(process.argv.slice(2)); 14 | 15 | const logger = console; 16 | const adapters = { default: web, rasa, snips, luis, flair }; 17 | const workingDirectory = process.cwd(); 18 | const getFileWithPath = (filename: string) => path.resolve(workingDirectory, filename); 19 | 20 | const chatitoFilesFromDir = async (startPath: string, cb: (filename: string) => Promise) => { 21 | if (!fs.existsSync(startPath)) { 22 | logger.error(`Invalid directory: ${startPath}`); 23 | process.exit(1); 24 | } 25 | const files = fs.readdirSync(startPath); 26 | for (const file of files) { 27 | const filename = path.join(startPath, file); 28 | const stat = 
fs.lstatSync(filename); 29 | if (stat.isDirectory()) { 30 | await chatitoFilesFromDir(filename, cb); 31 | } else if (/\.chatito$/.test(filename)) { 32 | await cb(filename); 33 | } 34 | } 35 | }; 36 | 37 | const importer = (fromPath: string, importFile: string) => { 38 | const filePath = path.resolve(path.dirname(fromPath), importFile); 39 | if (path.extname(filePath) !== '.chatito') { 40 | throw new Error('Only files with .chatito extension can be imported'); 41 | } 42 | if (!fs.existsSync(filePath)) { 43 | throw new Error(`Can't import ${filePath}`); 44 | } 45 | const dsl = fs.readFileSync(filePath, 'utf8'); 46 | return { filePath, dsl }; 47 | }; 48 | 49 | const streamedAdapterAccumulator = (format: 'flair', outputPath: string) => { 50 | const adapterHandler = adapters[format]; 51 | if (!adapterHandler) { 52 | throw new Error(`Invalid adapter: ${format}`); 53 | } 54 | if (!fs.existsSync(outputPath)) { 55 | fs.mkdirSync(outputPath); 56 | } 57 | const trainingFileName = argv.trainingFileName || `${format}_dataset_training.txt`; 58 | const testingFileName = argv.testingFileName || `${format}_dataset_testing.txt`; 59 | const trainingClassificationFilePath = path.resolve(outputPath, `classification_${trainingFileName}`); 60 | const testingClassificationFilePath = path.resolve(outputPath, `classification_${testingFileName}`); 61 | const trainingNerFilePath = path.resolve(outputPath, `ner_${trainingFileName}`); 62 | const testingNerFilePath = path.resolve(outputPath, `ner_${testingFileName}`); 63 | // write streams 64 | const trainClassification = fs.createWriteStream(trainingClassificationFilePath, { flags: 'a' }); 65 | const testClassification = fs.createWriteStream(testingClassificationFilePath, { flags: 'a' }); 66 | const trainNER = fs.createWriteStream(trainingNerFilePath, { flags: 'a' }); 67 | const testNER = fs.createWriteStream(testingNerFilePath, { flags: 'a' }); 68 | trainClassification.on('close', () => logger.log('Train classification dataset done!')); 69 | testClassification.on('close', () => logger.log('Test classification dataset done!')); 70 | trainNER.on('close', () => logger.log('Test NER dataset done!')); 71 | testNER.on('close', () => logger.log('Test NER dataset done!')); 72 | return { 73 | write: async (fullFilenamePath: string) => { 74 | logger.log(`Processing file: ${fullFilenamePath}`); 75 | const dsl = fs.readFileSync(fullFilenamePath, 'utf8'); 76 | const streams = { trainClassification, testClassification, trainNER, testNER }; 77 | await adapterHandler.streamAdapter(dsl, streams, importer, fullFilenamePath); 78 | }, 79 | save: () => { 80 | trainClassification.close(); 81 | testClassification.close(); 82 | trainNER.close(); 83 | testNER.close(); 84 | logger.log(`Saved training dataset: ${trainingClassificationFilePath}`); 85 | logger.log(`Saved testing dataset: ${testingClassificationFilePath}`); 86 | logger.log(`Saved training dataset: ${trainingNerFilePath}`); 87 | logger.log(`Saved testing dataset: ${testingNerFilePath}`); 88 | } 89 | }; 90 | }; 91 | 92 | type IValidFormat = 'default' | 'rasa' | 'snips' | 'luis' | 'flair'; 93 | const adapterAccumulator = (format: IValidFormat, outputPath: string, formatOptions?: any) => { 94 | const trainingDataset: snips.ISnipsDataset | rasa.IRasaDataset | luis.ILuisDataset | {} = {}; 95 | const testingDataset: any = {}; 96 | if (format === 'flair') { 97 | return streamedAdapterAccumulator('flair', outputPath); 98 | } 99 | const trainingJsonFileName = argv.trainingFileName || `${format}_dataset_training.json`; 100 | const 
trainingJsonFilePath = path.resolve(outputPath, trainingJsonFileName); 101 | const testingFileName = argv.testingFileName || `${format}_dataset_testing.json`; 102 | const testingJsonFilePath = path.resolve(outputPath, testingFileName); 103 | const adapterHandler = adapters[format]; 104 | if (!adapterHandler) { 105 | throw new Error(`Invalid adapter: ${format}`); 106 | } 107 | return { 108 | write: async (fullFilenamePath: string) => { 109 | logger.log(`Processing file: ${fullFilenamePath}`); 110 | const dsl = fs.readFileSync(fullFilenamePath, 'utf8'); 111 | const { training, testing } = await adapterHandler.adapter(dsl, formatOptions, importer, fullFilenamePath); 112 | utils.mergeDeep(trainingDataset, training); 113 | utils.mergeDeep(testingDataset, testing); 114 | }, 115 | save: () => { 116 | if (!fs.existsSync(outputPath)) { 117 | fs.mkdirSync(outputPath); 118 | } 119 | fs.writeFileSync(trainingJsonFilePath, JSON.stringify(trainingDataset)); 120 | logger.log(`Saved training dataset: ${trainingJsonFilePath}`); 121 | 122 | if (Object.keys(testingDataset).length) { 123 | fs.writeFileSync(testingJsonFilePath, JSON.stringify(testingDataset)); 124 | logger.log(`Saved testing dataset: ${testingJsonFilePath}`); 125 | } 126 | } 127 | }; 128 | }; 129 | 130 | const validateArgs = () => { 131 | if (argv.defaultDistribution) { 132 | if (VALID_DISTRIBUTIONS.includes(argv.defaultDistribution)) { 133 | config.defaultDistribution = argv.defaultDistribution; 134 | } else { 135 | throw new Error( 136 | `Unknow defaultDistribution value: '${argv.defaultDistribution}'. Valid values are: ${VALID_DISTRIBUTIONS.join(', ')}.` 137 | ); 138 | } 139 | } 140 | if (argv.autoAliases) { 141 | if (VALID_AUTO_ALIASES.includes(argv.autoAliases)) { 142 | config.autoAliases = argv.autoAliases; 143 | } else { 144 | throw new Error(`Unknow autoAliases value: '${argv.autoAliases}'. Valid values are: ${VALID_AUTO_ALIASES.join(', ')}.`); 145 | } 146 | } 147 | }; 148 | 149 | (async () => { 150 | if (!argv._ || !argv._.length) { 151 | logger.error('Invalid chatito file.'); 152 | process.exit(1); 153 | } 154 | const dslFile = argv._[0]; 155 | const format = (argv.format || 'default').toLowerCase(); 156 | if (['default', 'rasa', 'snips', 'luis', 'flair'].indexOf(format) === -1) { 157 | logger.error(`Invalid format argument: ${format}`); 158 | process.exit(1); 159 | } 160 | const outputPath = argv.outputPath || process.cwd(); 161 | try { 162 | validateArgs(); 163 | logger.log(`NOTE: Using ${config.defaultDistribution} as default frequency distribution.`); 164 | // parse the formatOptions argument 165 | let formatOptions = null; 166 | if (argv.formatOptions) { 167 | formatOptions = JSON.parse(fs.readFileSync(path.resolve(argv.formatOptions), 'utf8')); 168 | } 169 | const dslFilePath = getFileWithPath(dslFile); 170 | const isDirectory = fs.existsSync(dslFilePath) && fs.lstatSync(dslFilePath).isDirectory(); 171 | const accumulator = adapterAccumulator(format, outputPath, formatOptions); 172 | if (isDirectory) { 173 | await chatitoFilesFromDir(dslFilePath, accumulator.write); 174 | } else { 175 | await accumulator.write(dslFilePath); 176 | } 177 | accumulator.save(); 178 | } catch (e) { 179 | if (e && e.message && e.location) { 180 | logger.log('==== CHATITO SYNTAX ERROR ===='); 181 | logger.log(' ', e.message); 182 | logger.log(` Line: ${e.location.start.line}, Column: ${e.location.start.column}`); 183 | logger.log('=============================='); 184 | } else { 185 | logger.error(e && e.stack ? 
e.stack : e); 186 | } 187 | logger.log('FULL ERROR REPORT:'); 188 | logger.error(e); 189 | process.exit(1); 190 | } 191 | })(); 192 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Chatito 2 | 3 | [![npm version](https://badge.fury.io/js/chatito.svg)](https://www.npmjs.com/package/chatito) 4 | [![CircleCI branch]( 5 | https://img.shields.io/circleci/project/github/RedSparr0w/node-csgo-parser/master.svg 6 | )](https://circleci.com/gh/rodrigopivi/workflows/Chatito) 7 | [![npm](https://img.shields.io/npm/dm/chatito.svg)](https://www.npmjs.com/package/chatito) 8 | [![License](https://img.shields.io/github/license/rodrigopivi/Chatito.svg)](https://www.npmjs.com/package/chatito) 9 | 10 | 11 | [![Alt text](screenshot.jpg?raw=true "Screenshot of online IDE")](https://rodrigopivi.github.io/Chatito/) 12 | 13 | [Try the online IDE!](https://rodrigopivi.github.io/Chatito/) 14 | 15 | 16 | ## Overview 17 | Chatito helps you generate datasets for training and validating chatbot models using a simple DSL. 18 | 19 | If you are building chatbots using commercial models, open source frameworks or writing your own natural language processing model, you need training and testing examples. Chatito is here to help you. 20 | 21 | This project contains the following: 22 | - [Online chatito IDE](https://rodrigopivi.github.io/Chatito/) 23 | - [Chatito DSL specification](https://github.com/rodrigopivi/Chatito/blob/master/spec.md) 24 | - [DSL AST parser in pegjs format](https://github.com/rodrigopivi/Chatito/blob/master/parser/chatito.pegjs) 25 | - [Generator implemented in typescript + npm package](https://github.com/rodrigopivi/Chatito/tree/master/src) 26 | 27 | ### Chatito language 28 | For the full language specification and documentation, please refer to the [DSL spec document](https://github.com/rodrigopivi/Chatito/blob/master/spec.md). 29 | 30 | ### Adapters 31 | The language is independent of the generated output format, and because each model can receive different parameters and settings, these are the currently implemented data formats. If your provider is not listed, the Tools and resources section has more information on how to support additional formats. 32 | 33 | NOTE: Samples are not shuffled between intents for easier review and because some adapters stream samples directly to the file. 34 | 35 | #### Default format 36 | Use the default format if you plan to train a custom model or if you are writing a custom adapter. This is the most flexible format because you can annotate `Slots` and `Intents` with custom entity arguments, and they will all be present at the generated output, so, for example, you could also include dialog/response generation logic with the DSL. E.g.: 37 | 38 | ``` 39 | %[some intent]('context': 'some annotation') 40 | @[some slot] ~[please?] 41 | 42 | @[some slot]('required': 'true', 'type': 'some type') 43 | ~[some alias here] 44 | 45 | ``` 46 | 47 | Custom entities like 'context', 'required' and 'type' will be available at the output so you can handle these custom arguments as you want. 48 | 49 | #### [Rasa NLU](https://rasa.com/docs/nlu/) 50 | [Rasa NLU](https://rasa.com/docs/nlu/) is a great open source framework for training NLU models.
51 | One particular behavior of the Rasa adapter is that when a slot definition sentence only contains one alias, and that alias defines the 'synonym' argument with 'true', the generated Rasa dataset will map the alias as a synonym. e.g.: 52 | 53 | ``` 54 | %[some intent]('training': '1') 55 | @[some slot] 56 | 57 | @[some slot] 58 | ~[some slot synonyms] 59 | 60 | ~[some slot synonyms]('synonym': 'true') 61 | synonym 1 62 | synonym 2 63 | ``` 64 | 65 | In this example, the generated Rasa dataset will contain the `entity_synonyms` of `synonym 1` and `synonym 2` mapping to `some slot synonyms`. 66 | 67 | #### [Flair](https://github.com/zalandoresearch/flair) 68 | [Flair](https://github.com/zalandoresearch/flair) is a very simple framework for state-of-the-art NLP, developed by Zalando Research. It provides state-of-the-art pre-trained models and embeddings (GPT, BERT, ELMo, etc.) for many languages that work out of the box. This adapter supports the `text classification` dataset in FastText format and the `named entity recognition` dataset in the two-column [BIO](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) annotated word format, as documented in the [flair corpus documentation](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md). These two data formats are very common and compatible with many other providers and models. 69 | 70 | The NER dataset requires word tokenization, which is currently done using the [wink-tokenizer](https://github.com/winkjs/wink-tokenizer) npm package. Extending the adapter to add PoS tagging can be explored in the future, but it's not implemented. 71 | 72 | NOTE: The Flair adapter is only available for the NodeJS NPM CLI package, not for the IDE. 73 | 74 | #### [LUIS](https://www.luis.ai/) 75 | [LUIS](https://www.luis.ai/) is part of Microsoft's Cognitive Services. Chatito supports training a LUIS NLU model through its [batch add labeled utterances endpoint](https://westus.dev.cognitive.microsoft.com/docs/services/5890b47c39e2bb17b84a55ff/operations/5890b47c39e2bb052c5b9c09), and its [batch testing api](https://docs.microsoft.com/en-us/azure/cognitive-services/LUIS/luis-how-to-batch-test). 76 | 77 | To train a LUIS model, you will need to post the utterances in batches to the relevant API for training or testing. 78 | 79 | Reference issue: [#61](https://github.com/rodrigopivi/Chatito/issues/61) 80 | 81 | #### [Snips NLU](https://snips-nlu.readthedocs.io/en/latest/) 82 | [Snips NLU](https://snips-nlu.readthedocs.io/en/latest/) is another great open source framework for NLU. One particular behavior of the Snips adapter is that you can define entity types for the slots. e.g.: 83 | 84 | ``` 85 | %[date search]('training':'1') 86 | for @[date] 87 | 88 | @[date]('entity': 'snips/datetime') 89 | ~[today] 90 | ~[tomorrow] 91 | ``` 92 | 93 | In the previous example, all `@[date]` values will be tagged with the `snips/datetime` entity tag. 94 | 95 | ### NPM package 96 | 97 | Chatito supports Node.js `v8.11.2 LTS` or higher. 98 | 99 | Install it with yarn or npm: 100 | ``` 101 | npm i chatito --save 102 | ``` 103 | 104 | Then create a definition file (e.g.: `trainClimateBot.chatito`) with your code. 105 | 106 | Run the npm generator: 107 | 108 | ``` 109 | npx chatito trainClimateBot.chatito 110 | ``` 111 | 112 | The generated dataset should be available next to your definition file.
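For illustration, a minimal definition file could look like the following sketch (the `ask_weather` intent, the `condition` slot and the alias names are only hypothetical examples, following the DSL described in the spec):

```
%[ask_weather]('training': '8', 'testing': '2')
    ~[hi?] will it @[condition] ~[tomorrow?]

@[condition]
    rain
    snow

~[hi]
    hi
    hey
```

With the default adapter, running the generator on a file like this one should write a `default_dataset_training.json` and a `default_dataset_testing.json` file.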
113 | 114 | Here are the full npm generator options: 115 | ``` 116 | npx chatito <pathToFileOrDirectory> --format=<format> --formatOptions=<formatOptions> --outputPath=<outputPath> --trainingFileName=<trainingFileName> --testingFileName=<testingFileName> --defaultDistribution=<defaultDistribution> --autoAliases=<autoAliases> 117 | ``` 118 | 119 | - `<pathToFileOrDirectory>` path to a `.chatito` file or a directory that contains chatito files. If it is a directory, it will search recursively for all `*.chatito` files inside and use them to generate the dataset. e.g.: `lightsChange.chatito` or `./chatitoFilesFolder` 120 | - `<format>` Optional. `default`, `rasa`, `luis`, `flair` or `snips`. 121 | - `<formatOptions>` Optional. Path to a .json file that each adapter can optionally use 122 | - `<outputPath>` Optional. The directory where the generated datasets are saved. Uses the current directory as default. 123 | - `<trainingFileName>` Optional. The name of the generated training dataset file. Do not forget to add a .json extension at the end. Uses `<format>_dataset_training.json` as the default file name. 124 | - `<testingFileName>` Optional. The name of the generated testing dataset file. Do not forget to add a .json extension at the end. Uses `<format>_dataset_testing.json` as the default file name. 125 | - `<defaultDistribution>` Optional. The default frequency distribution if not defined at the entity level. Defaults to `regular` and can be set to `even`. 126 | 127 | - `<autoAliases>` Optional. The generator behavior when finding an undefined alias. Valid options are `allow`, `warn` and `restrict`. Defaults to `allow`. 128 | 129 | ### Notes to prevent overfitting 130 | 131 | [Overfitting](https://en.wikipedia.org/wiki/Overfitting) is a problem that can be prevented if we use Chatito correctly. The idea behind this tool is to find an intersection between data augmentation and a probabilistic description of the possible sentence combinations. It is not intended to generate deterministic datasets; you should avoid generating all possible combinations. 132 | 133 | ### Tools and resources 134 | 135 | - [Visual Studio Code syntax highlighting plugin](https://marketplace.visualstudio.com/items?itemName=nimfin.chatito) Thanks to [Yuri Golobokov](https://github.com/nimf) for his [work on this](https://github.com/nimf/chatito-vscode). 136 | 137 | - [AI Blueprints: How to build and deploy AI business projects](https://books.google.com.pe/books?id=sR2CDwAAQBAJ) implements practical full chatbot examples using Chatito in chapter 7. 138 | 139 | - [3 steps to convert chatbot training data between different NLP Providers](https://medium.com/@benoit.alvarez/3-steps-to-convert-chatbot-training-data-between-different-nlp-providers-fa235f67617c) details a simple way to convert the data format for providers without an implemented adapter. You can use a generated dataset with providers like DialogFlow, Wit.ai and Watson. 140 | 141 | - [Aida-nlp](https://github.com/rodrigopivi/aida) is a tiny experimental NLP deep learning library for text classification and NER. Built with Tensorflow.js, Keras and Chatito. Implemented in JS and Python. 142 | 143 | ### Author and maintainer 144 | Rodrigo Pimentel 145 | -------------------------------------------------------------------------------- /spec.md: -------------------------------------------------------------------------------- 1 | # Chatito Spec 2 | 3 | ## 1 - Overview 4 | 5 | Chatito is a domain specific language designed to simplify the process of creating, extending and maintaining 6 | datasets for training natural language processing (NLP) models for text classification, named entity recognition, slot filling or equivalent tasks.
7 | 8 | Chatito design principles: 9 | 10 | - Simplicity: should be understandable by someone looking at it for the first time 11 | 12 | - Speed: generate samples by pulling them from a cloud of probabilities on demand 13 | 14 | - Practicality: this tool is meant to help the people who use it; the design should be guided by the community's needs 15 | 16 | Following those principles, this is an example of the language and its generated output: 17 | 18 | ``` 19 | %[greet]('training': '2') 20 | ~[hi] @[name?] ~[whatsUp?] 21 | 22 | ~[hi] 23 | hi 24 | hey 25 | 26 | @[name] 27 | Janis 28 | Bob 29 | 30 | ~[whatsUp] 31 | whats up 32 | how is it going 33 | ``` 34 | 35 | This code could produce a maximum of 18 examples. The output format is independent of the DSL language, 36 | although a newline delimited format is recommended so results can just be streamed to a file; a format like ndjson is recommended over plain json, and using the `training` entity argument to limit the dataset size is recommended for large datasets, where there should be no need to generate all variations. 37 | 38 | That said, the earlier DSL code generates two training examples for the `greet` intent. Here are the `Newline Delimited JSON` (ndjson.org) examples generated from the previous code: 39 | 40 | ``` 41 | [{"type":"Text","value":"hi how is it going"}] 42 | [{"type":"Text","value":"hey "},{"type":"Slot","value":"Bob","slot":"name"}] 43 | ``` 44 | 45 | With these principles in mind, this document is the specification of the language. 46 | 47 | ## 2 - Language 48 | 49 | A chatito file is a document containing the grammar definitions. Because of the different encoding formats and range of 50 | non printable characters, these are the requirements of the document source text and some terminology: 51 | 52 | - Format: UTF-8 53 | - Valid characters: Allow international language characters. 54 | - White space: allows the white space character, not horizontal tabs 55 | - Line end: new line, carriage return, carriage return + new line (supporting non windows and windows) 56 | - Indentation: should use 4 space characters to define the scope of entities 57 | - Entities: Special keywords with special behaviors used to declare the sentence combinations 58 | - Sentences: 4 space indented text lines after an entity definition 59 | - Definition order: It does not matter if an entity is defined after it is referenced 60 | - Comments: Lines of text starting with '//' or '#' (no spaces before) 61 | - Imports: Lines of text starting with the 'import' keyword followed by a relative filepath 62 | - Entity arguments: Optional key-values that can be declared at intent and slot definitions 63 | - Probability operator: an optional keyword declared at the start of sentences to control the odds. 64 | 65 | ### 2.1 - Entities 66 | Entities are the way to define keywords that wrap sentence variations and attach some properties to them. 67 | There are three types of entities: `intent`, `slot` and `alias`. 68 | 69 | #### 2.1.1 - Intent 70 | 71 | The intent entity is defined by the `%[` symbols at the start of a line, followed by the entity name and `]`. 72 | 73 | Intent names should be at least 1 character long and can contain any characters except `]`, `line end` and `?`, 74 | e.g.: (%[intentName], %[intent_name], %[intent name]) 75 | 76 | Repeating intent name definitions should not be allowed. 77 | 78 | Each intent defined in a file is an entry point for the generation; the intent is the classification tag that is 79 | added to the sentences defined inside.
e.g.: 80 | 81 | ``` 82 | %[greet] 83 | hello 84 | hi 85 | ``` 86 | 87 | The previous example will generate all possible unique examples for greet (in this case 2 utterances). But there are cases where there is no need to generate all utterances, or where we want to attach some extra properties to the generated utterances; that is where entity arguments can help. 88 | 89 | Entity arguments are comma separated key-values declared with the entity definition inside parentheses. Each entity argument is composed of a key, followed by the `:` symbol and the value. The argument key and value are just strings wrapped in single or double quotes; optional spaces around the parentheses and commas are allowed. The format is similar to ndjson but only for string values. 90 | 91 | By default, intent definitions can expect the `training` and `testing` argument keys, which, when defined, are used to declare the maximum number of unique examples to generate for the given intent and to split them into two datasets: the training dataset is to be used to train the NLU model, and the testing dataset should be used to evaluate the accuracy of the model with examples it never trained with. Creating a testing dataset is not required, but it is important to be aware of the accuracy of your model to detect overfitting and compare against previous accuracies. The generator will first populate the training dataset, then the testing dataset, until reaching the sum of both values; each value must be `>= 1`. e.g.: 92 | 93 | ``` 94 | %[greet]('training': '2', 'testing': '1') 95 | hello 96 | hi 97 | hola 98 | salute 99 | ``` 100 | 101 | In this example, the greet intent could generate a maximum of 4 examples, but the declaration only requests 3. The training dataset will contain 2 utterances for the greet intent and the testing dataset 1. Other entity arguments are ignored by default and their functionality depends on the dataset generator/adapter; this means that each adapter may use the other entity arguments differently in its own context (e.g.: the Rasa/Snips adapters may expect different entity arguments). 102 | 103 | Nesting entities: Sentences defined inside an intent can refer to slot and alias entities. 104 | 105 | #### 2.1.2 - Slot 106 | The slot entity is defined by the `@[` symbols at the start of a line, followed by the name of the slot and `]`. 107 | 108 | Slot names should be at least 1 character long and can contain any characters except `]`, `line end`, `?` and `#` (as # is used for variations), 109 | e.g.: (@[slotName], @[slot_name], @[slot name]) 110 | 111 | Repeating slot name definitions should not be allowed. 112 | 113 | From the output perspective, a slot is the tag that is added to the relevant words in a generated sentence. e.g.: 114 | 115 | ``` 116 | %[greet] 117 | ~[hi] @[name?] 118 | 119 | ~[hi] 120 | hi 121 | hey 122 | 123 | @[name] 124 | Janis 125 | Bob 126 | ``` 127 | 128 | Slot entities referenced within sentences can have a `?` symbol at the end of the reference name (e.g.: @[name?]). 129 | In that context, the `?` symbol means that the slot combination is optional and could be omitted at generation. The probabilities of being omitted are defined by the number of sentence definitions at the entity. If the entity defines only one sentence, then the probability of the empty string will be 50%; if the entity defines 2 sentences, the probability of being omitted is 33.3333%, and so on. 130 | 131 | Slots provide a particular property at their definitions called variations.
132 | 133 | - Variations: There are cases where a slot combination only makes sense in a given context; variations allow mapping one slot to different sentences in different contexts. e.g.: 134 | 135 | ``` 136 | %[ask_for_delivery] 137 | my parcel should be delivered in @[delivery_time#time_in_hours] 138 | my parcel should be delivered @[delivery_time#relative_time] 139 | 140 | @[delivery_time#time_in_hours] 141 | 3 days 142 | 5 hours 143 | 144 | @[delivery_time#relative_time] 145 | as fast as possible 146 | quickly 147 | ``` 148 | 149 | In this example, both combinations map to the `delivery_time` slot, but 150 | the generated sentences only use each variation in the contexts where it makes sense. 151 | 152 | Slot definitions can have entity arguments too, but there are no default argument keys. Entity arguments are ignored by default and their functionality depends on the dataset adapter; this means that each adapter may use the entity arguments differently in its own context (e.g.: the Rasa/Snips adapters may expect different entity arguments, like for pre-built date parsing or text value alias mappings). 153 | 154 | Nesting entities: Sentences defined inside a slot can only reference alias entities. 155 | 156 | #### 2.1.3 - Alias 157 | The alias entity is defined by the `~[` symbols at the start of a line, followed by the name of the alias and `]`. 158 | Aliases are just variations of a word and do not generate any tag. By default, if an alias is referenced but not defined (like in the next example for `how are you`), it just uses the alias key name; this is useful for making a word optional without having to add the extra lines of code defining a new alias. (This 'auto alias' behavior is configurable.) e.g.: 159 | 160 | ``` 161 | %[greet] 162 | ~[hi] ~[how are you?] 163 | 164 | ~[hi] 165 | hi 166 | hey 167 | ``` 168 | 169 | Same as with slots, alias references can be omitted using a `?` symbol at the end of the reference name (e.g.: ~[hi?]). 170 | 171 | When an alias is referenced inside a slot definition, and it is the only token of the slot sentence, by default the generator will tag the generated alias value as a `synonym` of the alias key name. 172 | 173 | Alias definitions are not allowed to declare entity arguments. 174 | 175 | Nesting entities: Sentences defined inside aliases can reference slots and other aliases, but recursive loops are prevented. 176 | 177 | 178 | ### 2.2 - Importing chatito files 179 | 180 | To allow reusing entity declarations, it is possible to import another chatito file using the import keyword. Importing another chatito file only allows using the slots and aliases defined there; if the imported file defines intents, they will be ignored, since intents are generation entry points. 181 | 182 | As an example, given two chatito files: 183 | 184 | ``` 185 | # file slot1.chatito 186 | @[slot1] 187 | s1v1 188 | s1v2 189 | ``` 190 | 191 | and 192 | 193 | ``` 194 | # file main.chatito 195 | import ./slot1.chatito 196 | 197 | %[some intent] 198 | ~[word] @[slot1] 199 | ``` 200 | 201 | The file `main.chatito` will import all alias and slot definitions from `./slot1.chatito`. 202 | The text next to the import statement should be a relative path from the main file to the imported file. Imports can be nested, and the path is always relative to the file that declares the reference. 203 | 204 | Note: Chatito will throw an exception if two imports define the same entity.
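As a minimal illustration of that note (the file names here are only hypothetical examples), assuming both `./a.chatito` and `./b.chatito` define a `@[slot1]` entity, a file importing both should make the generator throw:

```
# both ./a.chatito and ./b.chatito define @[slot1], so this should throw
import ./a.chatito
import ./b.chatito

%[some intent]
    @[slot1]
```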
205 | 206 | 207 | ### 2.3 - Controlling probabilities 208 | 209 | The way Chatito works is like pulling samples from a cloud of possible combinations while avoiding duplicates. Once the sentence definitions gain complexity, the maximum possible combinations grow exponentially, causing a problem where the generator will most likely pick sentences that have more possible combinations and omit some sentences that may be more important for the dataset. To overcome this problem, semantics for controlling the data generation probabilities are provided. 210 | 211 | #### 2.3.1 - Frequency distribution strategies 212 | 213 | When generating samples for an entity, the generator will randomly pick a sentence model using one of the two frequency distribution strategies available: `regular` or `even`. 214 | 215 | For the regular distribution strategy, each sentence's probability is defined by its maximum possible combinations; in other words, a sentence that can produce more combinations will have a higher probability. For the even distribution strategy, all sentence probabilities are the same. 216 | 217 | The distribution strategy can be declared as an argument at the entity level. If not declared, the generator should use the default strategy configured (at the IDE or CLI level); if there is no default definition, then `regular` should be the default. 218 | 219 | Let's look at an example. Here, all the alias entities are defined at `./aliases.chatito` and are named by the maximum possible combinations each provides: 220 | 221 | ``` 222 | import ./aliases.chatito 223 | 224 | %[intent with a maximum of 1k combinations]('distribution': 'regular') 225 | first sentence equals ~[100 maximum combinations] 226 | second sentence equals ~[50 maximum combinations] multiplied by ~[10 maximum combinations] 227 | third sentence equals ~[400 maximum combinations] 228 | ``` 229 | 230 | Since the intent declares a `regular` distribution, these would be the odds: 231 | 232 | | | Max combinations | Weight | Probability % | 233 | |------------|------------------|--------|---------------| 234 | | sentence 1 | 100 | 100 | 10% | 235 | | sentence 2 | 500 | 500 | 50% | 236 | | sentence 3 | 400 | 400 | 40% | 237 | 238 | 239 | Now the code to get an `even` distribution: 240 | 241 | ``` 242 | import ./aliases.chatito 243 | 244 | %[intent with a maximum of 1k combinations]('distribution': 'even') 245 | first sentence equals ~[100 maximum combinations] 246 | second sentence equals ~[50 maximum combinations] multiplied by ~[10 maximum combinations] 247 | third sentence equals ~[400 maximum combinations] 248 | ``` 249 | 250 | For the `even` distribution using the previous example: 251 | 252 | | | Max combinations | Weight | Probability % | 253 | |------------|------------------|--------|---------------| 254 | | sentence 1 | 100 | 1 | 33.3333% | 255 | | sentence 2 | 500 | 1 | 33.3333% | 256 | | sentence 3 | 400 | 1 | 33.3333% | 257 | 258 | 259 | #### 2.3.2 - Sentence probability operator 260 | 261 | The sentence probability operator is defined by the `*[` symbol at the start of a sentence, followed by the probability value and `]`. The probability value may be expressed in two ways, as a plain number (considered a weighted probability, e.g.: `1`) or as a percentage value (a number ending with `%`, e.g.: `33.3333%`), but once an entity defines a probability as either weight or percentage, then all the other sentences for that entity should use the same type.
Inconsistencies declaring entity sentence probability values should be considered an input error, and if the value is not a valid integer, float or percentage value, the input should be considered simple text and not a sentence probability definition. 262 | 263 | NOTE: If the probability value is a percentage type, then the sum of all sentence probability operators declared inside the entity definition should never exceed 100. 264 | 265 | Let's continue with some examples: 266 | 267 | ``` 268 | %[intent with a maximum of 1k combinations] 269 | *[20%] first sentence ~[100 maximum combinations] 270 | second sentence ~[50 maximum combinations] multiplied by ~[10 maximum combinations] 271 | third sentence ~[400 maximum combinations] 272 | ``` 273 | 274 | The previous example declares a `20%` probability for the first sentence. This would be the odds table for the two distribution strategies: 275 | 276 | | | Max combinations | % with even | % with regular | 277 | |------------|------------------|-------------|-----------------------| 278 | | sentence 1 | 100 | 20% | 20% | 279 | | sentence 2 | 500 | 40% | 44.4444% (500*80/900) | 280 | | sentence 3 | 400 | 40% | 35.5556% (400*80/900) | 281 | 282 | 283 | When the probability value is a weight with the regular distribution, that value is multiplied by the maximum combinations for that sentence; if the distribution is even, that value is the actual weighted probability. E.g.: 284 | 285 | ``` 286 | %[intent with a maximum of 1k combinations] 287 | *[2] first sentence ~[100 maximum combinations] 288 | second sentence ~[50 maximum combinations] multiplied by ~[10 maximum combinations] 289 | third sentence ~[400 maximum combinations] 290 | ``` 291 | 292 | And the odds table: 293 | 294 | | | Max combinations | even weight | even % | regular weight | regular % | 295 | |------------|------------------|-------------|--------|----------------|-----------| 296 | | sentence 1 | 100 | 2 | 50% | 200 | 18.1818% | 297 | | sentence 2 | 500 | 1 | 25% | 500 | 45.4545% | 298 | | sentence 3 | 400 | 1 | 25% | 400 | 36.3636% | 299 | 300 | 301 | NOTE: Be careful when using the probability operator, because if a sentence reaches its maximum number of unique generated values, it will start producing duplicates and slowing down the generator, which has to filter the duplicates out. 302 | 303 | ## 3 - Data Generation 304 | 305 | The entry points for the data generation are the intent definitions. For each intent definition available: 306 | - If the intent does not specify the 'training' or 'testing' arguments, generate all possible unique combinations and add them to the training dataset. 307 | 308 | - Respect probability operator declarations and the distribution strategy. 309 | 310 | - Generate unique combinations for the training and testing datasets until the provided sum of both argument numbers is reached. 311 | 312 | - Recursive loop references should be prevented. 313 | -------------------------------------------------------------------------------- /src/tests/parser.spec.ts: -------------------------------------------------------------------------------- 1 | import { IChatitoParser } from '../types'; 2 | 3 | // tslint:disable-next-line:no-var-requires 4 | const chatitoParser = require('../../parser/chatito') as IChatitoParser; 5 | 6 | describe('Simple example', () => { 7 | const firstSpecExample = ` 8 | %[greet] 9 | ~[hi] @[name?] ~[whatsUp?] one two three im @[name?]
10 | 11 | ~[hi] 12 | hi 13 | hey 14 | 15 | @[name] 16 | Janis 17 | Bob 18 | 19 | ~[whatsUp] 20 | whats up 21 | how is it going 22 | `; 23 | test('correct PEGJS output', () => { 24 | let error = null; 25 | let result = null; 26 | try { 27 | result = chatitoParser.parse(firstSpecExample); 28 | } catch (e) { 29 | error = e; 30 | } 31 | expect(error).toBeNull(); 32 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 33 | }); 34 | }); 35 | 36 | describe('Simple examples with max training and testing', () => { 37 | const specExampleWithMaximum = ` 38 | %[greet]('training': '3') 39 | ~[hi] @[name?] ~[whatsUp?] 40 | ~[hi] 41 | hi 42 | hey 43 | @[name] 44 | Janis 45 | Bob 46 | ~[whatsUp] 47 | whats up 48 | how is it going 49 | `; 50 | test('CORRECT parser output specExampleWithMaximum', () => { 51 | let error = null; 52 | let result = null; 53 | try { 54 | result = chatitoParser.parse(specExampleWithMaximum); 55 | } catch (e) { 56 | error = e; 57 | } 58 | expect(error).toBeNull(); 59 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 60 | }); 61 | const specExampleWithTrainingAndTesting = ` 62 | %[greet]('training': '3', 'testing': '3') 63 | ~[hi] @[name?] ~[whatsUp?] 64 | ~[hi] 65 | hi 66 | hey 67 | @[name] 68 | Janis 69 | Bob 70 | ~[whatsUp] 71 | whats up 72 | how is it going 73 | `; 74 | test('CORRECT parser output for specExampleWithTrainingAndTesting', () => { 75 | let error = null; 76 | let result = null; 77 | try { 78 | result = chatitoParser.parse(specExampleWithTrainingAndTesting); 79 | } catch (e) { 80 | error = e; 81 | } 82 | expect(error).toBeNull(); 83 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 84 | }); 85 | const specExampleWithTrainingAndTestingWithSpaces = ` 86 | %[greet]( 'training' : '3' , 'testing': '3' ) 87 | ~[hi] @[name?] ~[whatsUp?] 
88 | ~[hi] 89 | hi 90 | hey 91 | @[name] 92 | Janis 93 | Bob 94 | ~[whatsUp] 95 | whats up 96 | how is it going 97 | `; 98 | test('CORRECT parser output for specExampleWithTrainingAndTestingWithSpaces', () => { 99 | let error = null; 100 | let result = null; 101 | try { 102 | result = chatitoParser.parse(specExampleWithTrainingAndTestingWithSpaces); 103 | } catch (e) { 104 | error = e; 105 | } 106 | expect(error).toBeNull(); 107 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 108 | }); 109 | }); 110 | 111 | describe('Simple example with wrong syntax', () => { 112 | const specExampleWithWrongSyntax = ` 113 | %[greet]('training': '3')wrong 114 | hi 115 | `; 116 | test('ERROR with wrong syntax after maximum', () => { 117 | let error = null; 118 | let result = null; 119 | try { 120 | result = chatitoParser.parse(specExampleWithWrongSyntax); 121 | } catch (e) { 122 | error = e; 123 | } 124 | expect(error).toMatchSnapshot(); 125 | }); 126 | const specExampleWithWrongTestingTrainingSyntax = ` 127 | %[greet]('training': 3, 'testing': 3) 128 | hi 129 | `; 130 | test('ERROR with wrong syntax after training and testing defined', () => { 131 | let error = null; 132 | let result = null; 133 | try { 134 | result = chatitoParser.parse(specExampleWithWrongTestingTrainingSyntax); 135 | } catch (e) { 136 | error = e; 137 | } 138 | expect(error).toMatchSnapshot(); 139 | }); 140 | }); 141 | 142 | describe('Simple example with wrong identation', () => { 143 | const specExampleWithWrongIndentationSyntax = ` 144 | %[greet] 145 | wrong 146 | `; 147 | test('ERROR with wrong indentation syntax', () => { 148 | let error = null; 149 | let result = null; 150 | try { 151 | result = chatitoParser.parse(specExampleWithWrongIndentationSyntax); 152 | } catch (e) { 153 | error = e; 154 | } 155 | expect(error).toMatchSnapshot(); 156 | }); 157 | }); 158 | 159 | describe('Simple example for windows end of line', () => { 160 | // tslint:disable-next-line:max-line-length 161 | const specExampleWindowsEOLSyntax = `%[greet]\r\n hi hi\r\n how are you @[full names] sup\r\n@[full names]\r\n jim raynor`; 162 | test('CORRECT parser output', () => { 163 | let error = null; 164 | let result = null; 165 | try { 166 | result = chatitoParser.parse(specExampleWindowsEOLSyntax); 167 | } catch (e) { 168 | error = e; 169 | } 170 | expect(error).toBeNull(); 171 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 172 | }); 173 | }); 174 | 175 | describe('Example variation spec', () => { 176 | const slotVariationSpecSyntax = ` 177 | %[ask_for_delivery] 178 | my parcel should be delivered in @[delivery_time#time_in_hours] 179 | my parcel should be delivered @[delivery_time#relative_time] 180 | 181 | @[delivery_time#time_in_hours] 182 | 3 days 183 | 5 hours 184 | 185 | @[delivery_time#relative_time] 186 | as fast as possible 187 | quickly 188 | `; 189 | test('CORRECT parser output', () => { 190 | let error: any = null; 191 | let result = null; 192 | try { 193 | result = chatitoParser.parse(slotVariationSpecSyntax); 194 | } catch (e) { 195 | error = { error: e }; 196 | if (e.location) { 197 | error.location = { 198 | line: e.location.start.line, 199 | column: e.location.start.column 200 | }; 201 | } 202 | } 203 | expect(error).toBeNull(); 204 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 205 | }); 206 | }); 207 | 208 | describe('Example for weird variations', () => { 209 | const slotExamplesWithWeirdKeywords = ` 210 | %[intent] 211 | [ adfd] adf ~ @ asdfasdf asdf ~[alias_name ok] @[slot name#variation name?] 
212 | ~ @~[alias_name ok]@[slot name#variation name?] 213 | @@~[alias_name ok]~~@[slot name#variation name?] 214 | 215 | @[slot name#variation name] 216 | 3 ~[daysOrHours] 217 | 5 ~[daysOrHours] 218 | 219 | ~[alias_name ok] 220 | as fast as possible 221 | quickly 222 | 223 | ~[daysOrHours] 224 | days 225 | hours 226 | `; 227 | test('CORRECT parser output', () => { 228 | let error: any = null; 229 | let result = null; 230 | try { 231 | result = chatitoParser.parse(slotExamplesWithWeirdKeywords); 232 | } catch (e) { 233 | error = { error: e }; 234 | if (e.location) { 235 | error.location = { 236 | line: e.location.start.line, 237 | column: e.location.start.column 238 | }; 239 | } 240 | } 241 | expect(error).toBeNull(); 242 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 243 | }); 244 | }); 245 | 246 | describe('Example with multi intent', () => { 247 | const slotExamplesWithWeirdKeywords = ` 248 | %[hi + bye] 249 | hi, i have to go, bye 250 | `; 251 | test('CORRECT parser output', () => { 252 | let error: any = null; 253 | let result = null; 254 | try { 255 | result = chatitoParser.parse(slotExamplesWithWeirdKeywords); 256 | } catch (e) { 257 | error = { error: e }; 258 | if (e.location) { 259 | error.location = { 260 | line: e.location.start.line, 261 | column: e.location.start.column 262 | }; 263 | } 264 | } 265 | expect(error).toBeNull(); 266 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 267 | }); 268 | }); 269 | 270 | describe('Example with comments spec', () => { 271 | const exampleWithCorrectComments = ` 272 | // this is a comment 273 | %[ask_for_delivery] 274 | my parcel should be delivered in @[delivery_time#time_in_hours] 275 | 276 | // this is two 277 | // line comment 278 | @[delivery_time#time_in_hours] 279 | 3 days 280 | 5 hours 281 | // more comments here 282 | `; 283 | test('CORRECT parser output for exampleWithCorrectComments', () => { 284 | let error: any = null; 285 | let result = null; 286 | try { 287 | result = chatitoParser.parse(exampleWithCorrectComments); 288 | } catch (e) { 289 | error = { error: e }; 290 | if (e.location) { 291 | error.location = { 292 | line: e.location.start.line, 293 | column: e.location.start.column 294 | }; 295 | } 296 | } 297 | expect(error).toBeNull(); 298 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 299 | }); 300 | 301 | const exampleWithCorrectHashComments = ` 302 | #this is a comment 303 | %[ask_for_delivery] 304 | my parcel should be delivered in @[delivery_time#time_in_hours] 305 | `; 306 | test('CORRECT parser output for exampleWithCorrectHashComments', () => { 307 | let error: any = null; 308 | let result = null; 309 | try { 310 | result = chatitoParser.parse(exampleWithCorrectHashComments); 311 | } catch (e) { 312 | error = { error: e }; 313 | if (e.location) { 314 | error.location = { 315 | line: e.location.start.line, 316 | column: e.location.start.column 317 | }; 318 | } 319 | } 320 | expect(error).toBeNull(); 321 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 322 | }); 323 | 324 | const exampleWithWrongComments = ` 325 | // this is a comment 326 | %[ask_for_delivery] 327 | my parcel should be delivered in @[delivery_time#time_in_hours] 328 | 329 | @[delivery_time#time_in_hours] 330 | 3 days 331 | 5 hours 332 | `; 333 | test('CORRECT parser output for exampleWithWrongComments', () => { 334 | let error: any = null; 335 | let result = null; 336 | try { 337 | result = chatitoParser.parse(exampleWithWrongComments); 338 | } catch (e) { 339 | error = { error: e }; 340 | if 
(e.location) { 341 | error.location = { 342 | line: e.location.start.line, 343 | column: e.location.start.column 344 | }; 345 | } 346 | } 347 | expect(error).toMatchSnapshot(); 348 | }); 349 | }); 350 | 351 | describe('Example with probability weighted opreator', () => { 352 | const slotExamplesWithWeirdKeywords = ` 353 | %[greet]('training': '10', 'testing': '10') 354 | *[50] ~[phrase1] 355 | *[30] ~[phrase2] ~[phrase3?] 356 | ~[another phrase] ~[something] ~[something else?] 357 | `; 358 | test('CORRECT parser output', () => { 359 | let error: any = null; 360 | let result = null; 361 | try { 362 | result = chatitoParser.parse(slotExamplesWithWeirdKeywords); 363 | } catch (e) { 364 | error = { error: e }; 365 | if (e.location) { 366 | error.location = { 367 | line: e.location.start.line, 368 | column: e.location.start.column 369 | }; 370 | } 371 | } 372 | expect(error).toBeNull(); 373 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 374 | }); 375 | }); 376 | 377 | describe('Example with probability percentual opreator', () => { 378 | const slotExamplesWithWeirdKeywords = ` 379 | %[greet]('training': '10', 'testing': '10') 380 | *[50%] ~[phrase1] 381 | *[30%] ~[phrase2] ~[phrase3?] 382 | ~[another phrase] ~[something] ~[something else?] 383 | `; 384 | test('CORRECT parser output', () => { 385 | let error: any = null; 386 | let result = null; 387 | try { 388 | result = chatitoParser.parse(slotExamplesWithWeirdKeywords); 389 | } catch (e) { 390 | error = { error: e }; 391 | if (e.location) { 392 | error.location = { 393 | line: e.location.start.line, 394 | column: e.location.start.column 395 | }; 396 | } 397 | } 398 | expect(error).toBeNull(); 399 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 400 | }); 401 | }); 402 | 403 | describe('Example with probability opreator but non int or float value parses as text', () => { 404 | const slotExamplesWithWeirdKeywords = ` 405 | %[greet]('training': '10', 'testing': '10') 406 | *[5c0] ~[phrase1] 407 | `; 408 | test('CORRECT parser output', () => { 409 | let error: any = null; 410 | let result = null; 411 | try { 412 | result = chatitoParser.parse(slotExamplesWithWeirdKeywords); 413 | } catch (e) { 414 | error = { error: e }; 415 | if (e.location) { 416 | error.location = { 417 | line: e.location.start.line, 418 | column: e.location.start.column 419 | }; 420 | } 421 | } 422 | expect(error).toBeNull(); 423 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 424 | }); 425 | }); 426 | 427 | describe('Example with probability opreator but no after space parses correctly', () => { 428 | const slotExamplesWithWeirdKeywords = ` 429 | %[greet]('training': '10', 'testing': '10') 430 | *[50]~[phrase1] 431 | `; 432 | test('CORRECT parser output', () => { 433 | let error: any = null; 434 | let result = null; 435 | try { 436 | result = chatitoParser.parse(slotExamplesWithWeirdKeywords); 437 | } catch (e) { 438 | error = { error: e }; 439 | if (e.location) { 440 | error.location = { 441 | line: e.location.start.line, 442 | column: e.location.start.column 443 | }; 444 | } 445 | } 446 | expect(error).toBeNull(); 447 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 448 | }); 449 | }); 450 | 451 | describe('Example with international language characters', () => { 452 | const slotExamplesWithWeirdKeywords = ` 453 | %[中文] 454 | 中文 @[中文] ~[中文] 455 | 456 | @[中文] 457 | 中文 458 | `; 459 | test('CORRECT parser output', () => { 460 | let error: any = null; 461 | let result = null; 462 | try { 463 | result = 
chatitoParser.parse(slotExamplesWithWeirdKeywords); 464 | } catch (e) { 465 | error = { error: e }; 466 | if (e.location) { 467 | error.location = { 468 | line: e.location.start.line, 469 | column: e.location.start.column 470 | }; 471 | } 472 | } 473 | expect(error).toBeNull(); 474 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 475 | }); 476 | }); 477 | 478 | describe('Example with import statement at start', () => { 479 | const slotExamplesWithWeirdKeywords = ` 480 | 481 | import ../some/file.chatito 482 | import ../some/file.chatito 483 | 484 | %[greet] 485 | hey yo! 486 | `; 487 | test('CORRECT parser output', () => { 488 | let error: any = null; 489 | let result = null; 490 | try { 491 | result = chatitoParser.parse(slotExamplesWithWeirdKeywords); 492 | } catch (e) { 493 | error = { error: e }; 494 | if (e.location) { 495 | error.location = { 496 | line: e.location.start.line, 497 | column: e.location.start.column 498 | }; 499 | } 500 | } 501 | expect(error).toBeNull(); 502 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 503 | }); 504 | }); 505 | 506 | describe('Example with alias arguments', () => { 507 | const slotExamplesWithWeirdKeywords = ` 508 | %[g]('training': '2', 'testing': '1') 509 | ~[g] 510 | 511 | ~[g]('arg': 'val') 512 | g1 513 | g2 514 | g3 515 | `; 516 | test('CORRECT parser output', () => { 517 | let error: any = null; 518 | let result = null; 519 | try { 520 | result = chatitoParser.parse(slotExamplesWithWeirdKeywords); 521 | } catch (e) { 522 | error = { error: e }; 523 | if (e.location) { 524 | error.location = { 525 | line: e.location.start.line, 526 | column: e.location.start.column 527 | }; 528 | } 529 | } 530 | expect(error).toBeNull(); 531 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); 532 | }); 533 | }); 534 | -------------------------------------------------------------------------------- /src/tests/bin.spec.ts: -------------------------------------------------------------------------------- 1 | import * as cp from 'child_process'; 2 | import * as fs from 'fs'; 3 | import * as path from 'path'; 4 | 5 | test('test npm command line generator for large example', () => { 6 | const d = __dirname; 7 | const generatedDir = path.resolve(`${d}/../../examples/dateBooking_large`); 8 | const generatedTrainingFile = path.resolve(generatedDir, 'default_dataset_training.json'); 9 | const generatedTestingFile = path.resolve(generatedDir, 'default_dataset_testing.json'); 10 | const npmBin = path.resolve(`${d}/../bin.ts`); 11 | const grammarFile = path.resolve(`${d}/../../examples/dateBooking_large.chatito`); 12 | if (fs.existsSync(generatedTrainingFile)) { 13 | fs.unlinkSync(generatedTrainingFile); 14 | } 15 | if (fs.existsSync(generatedTestingFile)) { 16 | fs.unlinkSync(generatedTestingFile); 17 | } 18 | if (fs.existsSync(generatedDir)) { 19 | fs.rmdirSync(generatedDir); 20 | } 21 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --outputPath=${generatedDir}`); 22 | expect(fs.existsSync(generatedDir)).toBeTruthy(); 23 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 24 | expect(fs.existsSync(generatedTestingFile)).toBeTruthy(); 25 | const trainingDataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 26 | expect(trainingDataset).not.toBeNull(); 27 | expect(trainingDataset.bookRestaurantsAtDatetime).not.toBeNull(); 28 | expect(trainingDataset.bookRestaurantsAtDatetime.length).toEqual(1000); 29 | fs.unlinkSync(generatedTrainingFile); 30 | const testingDataset = 
JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8')); 31 | expect(testingDataset).not.toBeNull(); 32 | expect(testingDataset.bookRestaurantsAtDatetime).not.toBeNull(); 33 | expect(testingDataset.bookRestaurantsAtDatetime.length).toEqual(100); 34 | fs.unlinkSync(generatedTestingFile); 35 | fs.rmdirSync(generatedDir); 36 | expect(fs.existsSync(generatedTrainingFile)).toBeFalsy(); 37 | expect(fs.existsSync(generatedTestingFile)).toBeFalsy(); 38 | expect(fs.existsSync(generatedDir)).toBeFalsy(); 39 | }); 40 | 41 | test('test npm command line generator for medium example', () => { 42 | const d = __dirname; 43 | const generatedDir = path.resolve(`${d}/../../examples/citySearch_medium`); 44 | const generatedTrainingFile = path.resolve(generatedDir, 'default_dataset_training.json'); 45 | const generatedTestingFile = path.resolve(generatedDir, 'default_dataset_testing.json'); 46 | const npmBin = path.resolve(`${d}/../bin.ts`); 47 | const grammarFile = path.resolve(`${d}/../../examples/citySearch_medium.chatito`); 48 | if (fs.existsSync(generatedTrainingFile)) { 49 | fs.unlinkSync(generatedTrainingFile); 50 | } 51 | if (fs.existsSync(generatedTestingFile)) { 52 | fs.unlinkSync(generatedTestingFile); 53 | } 54 | if (fs.existsSync(generatedDir)) { 55 | fs.rmdirSync(generatedDir); 56 | } 57 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --outputPath=${generatedDir}`); 58 | expect(fs.existsSync(generatedDir)).toBeTruthy(); 59 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 60 | expect(fs.existsSync(generatedTestingFile)).toBeTruthy(); 61 | const trainingDataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 62 | expect(trainingDataset).not.toBeNull(); 63 | expect(trainingDataset.findByCityAndCategory).not.toBeNull(); 64 | expect(trainingDataset.findByCityAndCategory.length).toEqual(1000); 65 | const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8')); 66 | expect(testingDataset).not.toBeNull(); 67 | expect(testingDataset.findByCityAndCategory).not.toBeNull(); 68 | expect(testingDataset.findByCityAndCategory.length).toEqual(100); 69 | fs.unlinkSync(generatedTrainingFile); 70 | fs.unlinkSync(generatedTestingFile); 71 | fs.rmdirSync(generatedDir); 72 | expect(fs.existsSync(generatedTrainingFile)).toBeFalsy(); 73 | expect(fs.existsSync(generatedTestingFile)).toBeFalsy(); 74 | expect(fs.existsSync(generatedDir)).toBeFalsy(); 75 | }); 76 | 77 | test('test npm command line generator for process all directory examples', () => { 78 | const d = __dirname; 79 | const generatedDir = path.resolve(`${d}/../../examples/citySearch_medium`); 80 | const generatedTrainingFile = path.resolve(generatedDir, 'default_dataset_training.json'); 81 | const generatedTestingFile = path.resolve(generatedDir, 'default_dataset_testing.json'); 82 | const npmBin = path.resolve(`${d}/../bin.ts`); 83 | const grammarFiles = path.resolve(`${d}/../../examples/`); 84 | if (fs.existsSync(generatedTrainingFile)) { 85 | fs.unlinkSync(generatedTrainingFile); 86 | } 87 | if (fs.existsSync(generatedTestingFile)) { 88 | fs.unlinkSync(generatedTestingFile); 89 | } 90 | if (fs.existsSync(generatedDir)) { 91 | fs.rmdirSync(generatedDir); 92 | } 93 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFiles} --outputPath=${generatedDir}`); 94 | expect(fs.existsSync(generatedDir)).toBeTruthy(); 95 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 96 | expect(fs.existsSync(generatedTestingFile)).toBeTruthy(); 97 | const trainingDataset = 
JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 98 | expect(trainingDataset).not.toBeNull(); 99 | expect(trainingDataset.findByCityAndCategory).not.toBeNull(); 100 | expect(trainingDataset.findByCityAndCategory.length).toEqual(1000); 101 | expect(trainingDataset.bookRestaurantsAtDatetime).not.toBeNull(); 102 | expect(trainingDataset.bookRestaurantsAtDatetime.length).toEqual(1000); 103 | const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8')); 104 | expect(testingDataset).not.toBeNull(); 105 | expect(testingDataset.findByCityAndCategory).not.toBeNull(); 106 | expect(testingDataset.findByCityAndCategory.length).toEqual(100); 107 | expect(testingDataset.bookRestaurantsAtDatetime).not.toBeNull(); 108 | expect(testingDataset.bookRestaurantsAtDatetime.length).toEqual(100); 109 | fs.unlinkSync(generatedTrainingFile); 110 | fs.unlinkSync(generatedTestingFile); 111 | fs.rmdirSync(generatedDir); 112 | expect(fs.existsSync(generatedTrainingFile)).toBeFalsy(); 113 | expect(fs.existsSync(generatedTestingFile)).toBeFalsy(); 114 | expect(fs.existsSync(generatedDir)).toBeFalsy(); 115 | }); 116 | 117 | test('test npm command line generator for rasa medium example', () => { 118 | const d = __dirname; 119 | const generatedTrainingFile = path.resolve(`${d}/../../examples/rasa_dataset_training.json`); 120 | const generatedTestingFile = path.resolve(`${d}/../../examples/rasa_dataset_testing.json`); 121 | const npmBin = path.resolve(`${d}/../bin.ts`); 122 | const grammarFile = path.resolve(`${d}/../../examples/citySearch_medium.chatito`); 123 | if (fs.existsSync(generatedTrainingFile)) { 124 | fs.unlinkSync(generatedTrainingFile); 125 | } 126 | if (fs.existsSync(generatedTestingFile)) { 127 | fs.unlinkSync(generatedTestingFile); 128 | } 129 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --format=rasa --outputPath=${d}/../../examples`); 130 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 131 | const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 132 | expect(dataset).not.toBeNull(); 133 | expect(dataset.rasa_nlu_data).not.toBeNull(); 134 | expect(dataset.rasa_nlu_data.entity_synonyms).not.toBeNull(); 135 | expect(dataset.rasa_nlu_data.entity_synonyms.length).toEqual(3); 136 | expect(dataset.rasa_nlu_data.common_examples).not.toBeNull(); 137 | expect(dataset.rasa_nlu_data.common_examples.length).toEqual(1000); 138 | fs.unlinkSync(generatedTrainingFile); 139 | expect(fs.existsSync(generatedTrainingFile)).toBeFalsy(); 140 | expect(fs.existsSync(generatedTestingFile)).toBeTruthy(); 141 | const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8')); 142 | expect(testingDataset).not.toBeNull(); 143 | expect(testingDataset.rasa_nlu_data).not.toBeNull(); 144 | expect(testingDataset.rasa_nlu_data.common_examples).not.toBeNull(); 145 | expect(testingDataset.rasa_nlu_data.common_examples.length).toEqual(100); 146 | fs.unlinkSync(generatedTestingFile); 147 | expect(fs.existsSync(generatedTestingFile)).toBeFalsy(); 148 | }); 149 | 150 | test('test npm command line generator for rasa directory examples', () => { 151 | const d = __dirname; 152 | const generatedTrainingFile = path.resolve(`${d}/../../examples/rasa_dataset_training.json`); 153 | const generatedTestingFile = path.resolve(`${d}/../../examples/rasa_dataset_testing.json`); 154 | const npmBin = path.resolve(`${d}/../bin.ts`); 155 | const grammarFile = path.resolve(`${d}/../../examples`); 156 | if (fs.existsSync(generatedTrainingFile)) { 157 | 
fs.unlinkSync(generatedTrainingFile); 158 | } 159 | if (fs.existsSync(generatedTestingFile)) { 160 | fs.unlinkSync(generatedTestingFile); 161 | } 162 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --format=rasa --outputPath=${d}/../../examples`); 163 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 164 | const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 165 | expect(dataset).not.toBeNull(); 166 | expect(dataset.rasa_nlu_data).not.toBeNull(); 167 | expect(dataset.rasa_nlu_data.common_examples).not.toBeNull(); 168 | expect(dataset.rasa_nlu_data.common_examples.length).toEqual(2030); 169 | expect(dataset.rasa_nlu_data.entity_synonyms).not.toBeNull(); 170 | expect(dataset.rasa_nlu_data.entity_synonyms.length).toEqual(3); 171 | fs.unlinkSync(generatedTrainingFile); 172 | expect(fs.existsSync(generatedTrainingFile)).toBeFalsy(); 173 | expect(fs.existsSync(generatedTestingFile)).toBeTruthy(); 174 | const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8')); 175 | expect(testingDataset).not.toBeNull(); 176 | expect(testingDataset.rasa_nlu_data).not.toBeNull(); 177 | expect(testingDataset.rasa_nlu_data.common_examples).not.toBeNull(); 178 | expect(testingDataset.rasa_nlu_data.common_examples.length).toEqual(200); 179 | fs.unlinkSync(generatedTestingFile); 180 | expect(fs.existsSync(generatedTestingFile)).toBeFalsy(); 181 | }); 182 | 183 | test('test npm command line generator for snips medium example', () => { 184 | const d = __dirname; 185 | const generatedTrainingFile = path.resolve(`${d}/../../examples/snips_dataset_training.json`); 186 | const generatedTestingFile = path.resolve(`${d}/../../examples/snips_dataset_testing.json`); 187 | const npmBin = path.resolve(`${d}/../bin.ts`); 188 | const grammarFile = path.resolve(`${d}/../../examples/citySearch_medium.chatito`); 189 | if (fs.existsSync(generatedTrainingFile)) { 190 | fs.unlinkSync(generatedTrainingFile); 191 | } 192 | if (fs.existsSync(generatedTestingFile)) { 193 | fs.unlinkSync(generatedTestingFile); 194 | } 195 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --format=snips --outputPath=${d}/../../examples`); 196 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 197 | const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 198 | expect(dataset).not.toBeNull(); 199 | expect(dataset.entities).not.toBeNull(); 200 | expect(dataset.entities.location).not.toBeNull(); 201 | expect(dataset.entities.location.data).not.toBeNull(); 202 | expect(dataset.entities.location.data.length).toEqual(3); 203 | expect(dataset.intents).not.toBeNull(); 204 | expect(dataset.intents.findByCityAndCategory).not.toBeNull(); 205 | expect(dataset.intents.findByCityAndCategory.utterances).not.toBeNull(); 206 | expect(dataset.intents.findByCityAndCategory.utterances.length).toEqual(1000); 207 | fs.unlinkSync(generatedTrainingFile); 208 | expect(fs.existsSync(generatedTrainingFile)).toBeFalsy(); 209 | expect(fs.existsSync(generatedTestingFile)).toBeTruthy(); 210 | const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8')); 211 | expect(testingDataset).not.toBeNull(); 212 | expect(testingDataset.findByCityAndCategory).not.toBeNull(); 213 | expect(testingDataset.findByCityAndCategory.length).toEqual(100); 214 | fs.unlinkSync(generatedTestingFile); 215 | expect(fs.existsSync(generatedTestingFile)).toBeFalsy(); 216 | }); 217 | 218 | test('test npm command line generator for snips all examples', () => 
{ 219 | const d = __dirname; 220 | const generatedTrainingFile = path.resolve(`${d}/../../examples/snips_dataset_training.json`); 221 | const generatedTestingFile = path.resolve(`${d}/../../examples/snips_dataset_testing.json`); 222 | const npmBin = path.resolve(`${d}/../bin.ts`); 223 | const grammarFile = path.resolve(`${d}/../../examples`); 224 | if (fs.existsSync(generatedTrainingFile)) { 225 | fs.unlinkSync(generatedTrainingFile); 226 | } 227 | if (fs.existsSync(generatedTestingFile)) { 228 | fs.unlinkSync(generatedTestingFile); 229 | } 230 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --format=snips --outputPath=${d}/../../examples`); 231 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 232 | const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 233 | expect(dataset).not.toBeNull(); 234 | expect(dataset.entities).not.toBeNull(); 235 | expect(dataset.entities.location).not.toBeNull(); 236 | expect(dataset.entities.location.data).not.toBeNull(); 237 | expect(dataset.entities.location.data.length).toEqual(3); 238 | expect(dataset.intents).not.toBeNull(); 239 | expect(dataset.intents.findByCityAndCategory).not.toBeNull(); 240 | expect(dataset.intents.findByCityAndCategory.utterances).not.toBeNull(); 241 | expect(dataset.intents.findByCityAndCategory.utterances.length).toEqual(1000); 242 | expect(dataset.intents.bookRestaurantsAtDatetime).not.toBeNull(); 243 | expect(dataset.intents.bookRestaurantsAtDatetime.utterances).not.toBeNull(); 244 | expect(dataset.intents.bookRestaurantsAtDatetime.utterances.length).toEqual(1000); 245 | fs.unlinkSync(generatedTrainingFile); 246 | expect(fs.existsSync(generatedTrainingFile)).toBeFalsy(); 247 | expect(fs.existsSync(generatedTestingFile)).toBeTruthy(); 248 | const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8')); 249 | expect(testingDataset).not.toBeNull(); 250 | expect(testingDataset.findByCityAndCategory).not.toBeNull(); 251 | expect(testingDataset.findByCityAndCategory.length).toEqual(100); 252 | expect(testingDataset.bookRestaurantsAtDatetime).not.toBeNull(); 253 | expect(testingDataset.bookRestaurantsAtDatetime.length).toEqual(100); 254 | fs.unlinkSync(generatedTestingFile); 255 | expect(fs.existsSync(generatedTestingFile)).toBeFalsy(); 256 | }); 257 | 258 | test('test npm command line generator for luis medium example', () => { 259 | const d = __dirname; 260 | const generatedTrainingFile = path.resolve(`${d}/../../examples/luis_dataset_training.json`); 261 | const generatedTestingFile = path.resolve(`${d}/../../examples/luis_dataset_testing.json`); 262 | const npmBin = path.resolve(`${d}/../bin.ts`); 263 | const grammarFile = path.resolve(`${d}/../../examples/citySearch_medium.chatito`); 264 | if (fs.existsSync(generatedTrainingFile)) { 265 | fs.unlinkSync(generatedTrainingFile); 266 | } 267 | if (fs.existsSync(generatedTestingFile)) { 268 | fs.unlinkSync(generatedTestingFile); 269 | } 270 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --format=luis --outputPath=${d}/../../examples`); 271 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 272 | const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 273 | expect(dataset).not.toBeNull(); 274 | expect(dataset.data).not.toBeNull(); 275 | expect(dataset.data.length).toEqual(1000); 276 | fs.unlinkSync(generatedTrainingFile); 277 | expect(fs.existsSync(generatedTrainingFile)).toBeFalsy(); 278 | 
expect(fs.existsSync(generatedTestingFile)).toBeTruthy(); 279 | const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8')); 280 | expect(testingDataset).not.toBeNull(); 281 | expect(testingDataset.data).not.toBeNull(); 282 | expect(testingDataset.data.length).toEqual(100); 283 | fs.unlinkSync(generatedTestingFile); 284 | expect(fs.existsSync(generatedTestingFile)).toBeFalsy(); 285 | }); 286 | 287 | test('test npm command line generator for luis directory examples', () => { 288 | const d = __dirname; 289 | const generatedTrainingFile = path.resolve(`${d}/../../examples/luis_dataset_training.json`); 290 | const generatedTestingFile = path.resolve(`${d}/../../examples/luis_dataset_testing.json`); 291 | const npmBin = path.resolve(`${d}/../bin.ts`); 292 | const grammarFile = path.resolve(`${d}/../../examples`); 293 | if (fs.existsSync(generatedTrainingFile)) { 294 | fs.unlinkSync(generatedTrainingFile); 295 | } 296 | if (fs.existsSync(generatedTestingFile)) { 297 | fs.unlinkSync(generatedTestingFile); 298 | } 299 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --format=luis --outputPath=${d}/../../examples`); 300 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 301 | const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 302 | expect(dataset).not.toBeNull(); 303 | expect(dataset.data).not.toBeNull(); 304 | expect(dataset.data.length).toEqual(2030); 305 | fs.unlinkSync(generatedTrainingFile); 306 | expect(fs.existsSync(generatedTrainingFile)).toBeFalsy(); 307 | expect(fs.existsSync(generatedTestingFile)).toBeTruthy(); 308 | const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8')); 309 | expect(testingDataset).not.toBeNull(); 310 | expect(testingDataset.data).not.toBeNull(); 311 | expect(testingDataset.data.length).toEqual(200); 312 | fs.unlinkSync(generatedTestingFile); 313 | expect(fs.existsSync(generatedTestingFile)).toBeFalsy(); 314 | }); 315 | 316 | test('test npm command line generator for imports example', () => { 317 | const d = __dirname; 318 | const generatedDir = path.resolve(`${d}/../../examples/importing/main`); 319 | const generatedTrainingFile = path.resolve(generatedDir, 'default_dataset_training.json'); 320 | const npmBin = path.resolve(`${d}/../bin.ts`); 321 | const grammarFile = path.resolve(`${d}/../../examples/importing/main.chatito`); 322 | if (fs.existsSync(generatedTrainingFile)) { 323 | fs.unlinkSync(generatedTrainingFile); 324 | } 325 | if (fs.existsSync(generatedDir)) { 326 | fs.rmdirSync(generatedDir); 327 | } 328 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --outputPath=${generatedDir}`); 329 | expect(fs.existsSync(generatedDir)).toBeTruthy(); 330 | expect(fs.existsSync(generatedTrainingFile)).toBeTruthy(); 331 | const trainingDataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8')); 332 | expect(trainingDataset).not.toBeNull(); 333 | expect(trainingDataset.greet).not.toBeNull(); 334 | expect(trainingDataset.greet.length).toEqual(30); 335 | fs.unlinkSync(generatedTrainingFile); 336 | fs.rmdirSync(generatedDir); 337 | }); 338 | 339 | test('test npm command line generator for flair medium example', () => { 340 | const d = __dirname; 341 | const generatedClassificationTrainingFile = path.resolve(`${d}/../../examples/classification_flair_dataset_training.txt`); 342 | const generatedClassificationTestingFile = path.resolve(`${d}/../../examples/classification_flair_dataset_testing.txt`); 343 | const 
generatedNERTrainingFile = path.resolve(`${d}/../../examples/ner_flair_dataset_training.txt`); 344 | const generatedNERTestingFile = path.resolve(`${d}/../../examples/ner_flair_dataset_testing.txt`); 345 | const npmBin = path.resolve(`${d}/../bin.ts`); 346 | const grammarFile = path.resolve(`${d}/../../examples/citySearch_medium.chatito`); 347 | if (fs.existsSync(generatedClassificationTrainingFile)) { 348 | fs.unlinkSync(generatedClassificationTrainingFile); 349 | } 350 | if (fs.existsSync(generatedClassificationTestingFile)) { 351 | fs.unlinkSync(generatedClassificationTestingFile); 352 | } 353 | if (fs.existsSync(generatedNERTrainingFile)) { 354 | fs.unlinkSync(generatedNERTrainingFile); 355 | } 356 | if (fs.existsSync(generatedNERTestingFile)) { 357 | fs.unlinkSync(generatedNERTestingFile); 358 | } 359 | const child = cp.execSync(`node -r ts-node/register ${npmBin} ${grammarFile} --format=flair --outputPath=${d}/../../examples`); 360 | // generatedClassificationTrainingFile 361 | expect(fs.existsSync(generatedClassificationTrainingFile)).toBeTruthy(); 362 | const dataset = fs.readFileSync(generatedClassificationTrainingFile, 'utf8'); 363 | expect(dataset).not.toBeNull(); 364 | expect(dataset.length).toBeGreaterThan(0); 365 | fs.unlinkSync(generatedClassificationTrainingFile); 366 | expect(fs.existsSync(generatedClassificationTrainingFile)).toBeFalsy(); 367 | 368 | // generatedClassificationTestingFile 369 | expect(fs.existsSync(generatedClassificationTestingFile)).toBeTruthy(); 370 | const testingDataset = fs.readFileSync(generatedClassificationTestingFile, 'utf8'); 371 | expect(testingDataset).not.toBeNull(); 372 | expect(testingDataset.length).toBeGreaterThan(0); 373 | fs.unlinkSync(generatedClassificationTestingFile); 374 | expect(fs.existsSync(generatedClassificationTestingFile)).toBeFalsy(); 375 | 376 | // generatedNERTrainingFile 377 | expect(fs.existsSync(generatedNERTrainingFile)).toBeTruthy(); 378 | const nerDataset = fs.readFileSync(generatedNERTrainingFile, 'utf8'); 379 | expect(nerDataset).not.toBeNull(); 380 | expect(nerDataset.length).toBeGreaterThan(0); 381 | fs.unlinkSync(generatedNERTrainingFile); 382 | expect(fs.existsSync(generatedNERTrainingFile)).toBeFalsy(); 383 | 384 | // generatedNERTestingFile 385 | expect(fs.existsSync(generatedNERTestingFile)).toBeTruthy(); 386 | const testingNerDataset = fs.readFileSync(generatedNERTestingFile, 'utf8'); 387 | expect(testingNerDataset).not.toBeNull(); 388 | expect(testingNerDataset.length).toBeGreaterThan(0); 389 | fs.unlinkSync(generatedNERTestingFile); 390 | expect(fs.existsSync(generatedNERTestingFile)).toBeFalsy(); 391 | }); 392 | -------------------------------------------------------------------------------- /web/components/Editor/Editor.tsx: -------------------------------------------------------------------------------- 1 | import { saveAs } from 'file-saver'; 2 | import * as React from 'react'; 3 | import * as luisAdapter from '../../../src/adapters/luis'; 4 | import * as rasaAdapter from '../../../src/adapters/rasa'; 5 | import * as snipsAdapter from '../../../src/adapters/snips'; 6 | import * as webAdapter from '../../../src/adapters/web'; 7 | import * as chatito from '../../../src/main'; 8 | import * as utils from '../../../src/utils'; 9 | import { chatitoPrism, rasaDefaultOptions, snipsDefaultOptions, tabs } from '../../lib/editorConfig'; 10 | import { debounce } from '../../lib/utils'; 11 | import * as es from './editorStyles'; 12 | 13 | const logger = console; 14 | 15 | const adapters = { 16 | default: 
webAdapter, 17 | rasa: rasaAdapter, 18 | snips: snipsAdapter, 19 | luis: luisAdapter 20 | }; 21 | 22 | interface IEditorState { 23 | error: null | string; 24 | warning: null | string; 25 | activeTabIndex: number; 26 | showDrawer: boolean; 27 | dataset: any; 28 | adapterOptions: any; 29 | currentAdapter: 'default' | 'rasa' | 'snips' | 'luis'; 30 | useCustomOptions: boolean; 31 | frequencyDistribution: chatito.distributionType; 32 | autoAliases: chatito.autoAliasesType; 33 | } 34 | 35 | type IDataset = webAdapter.IDefaultDataset | snipsAdapter.ISnipsDataset | rasaAdapter.IRasaDataset | luisAdapter.ILuisDataset; 36 | 37 | // NOTE: for SSR, wrap the require in check for window 38 | let CodeFlask = null; 39 | let ReactJson = null; 40 | if (typeof window !== `undefined`) { 41 | // tslint:disable-next-line:no-var-requires 42 | CodeFlask = require('codeflask').default; 43 | // tslint:disable-next-line:no-var-requires 44 | ReactJson = require('react-json-view').default; 45 | } 46 | 47 | export default class Editor extends React.Component<{}, IEditorState> { 48 | public state: IEditorState = { 49 | error: null, 50 | warning: null, 51 | activeTabIndex: 0, 52 | showDrawer: false, 53 | dataset: null, 54 | adapterOptions: null, 55 | currentAdapter: 'default', 56 | useCustomOptions: false, 57 | frequencyDistribution: 'regular', 58 | autoAliases: 'allow' 59 | }; 60 | private tabsContainer = React.createRef() as React.RefObject; 61 | private codeflask = null; 62 | private editorUpdatesSetupCount = 0; 63 | private codeInputValue = ''; 64 | private tabs: Array<{ title: string; value: string }> = []; 65 | 66 | private debouncedTabDSLValidation = debounce(() => { 67 | if (!this.codeInputValue.length) { 68 | if (this.state.error || this.state.warning) { 69 | this.setState({ error: null, warning: null }); 70 | } 71 | return; 72 | } 73 | const validation = this.getDSLValidation(this.codeInputValue); 74 | let newState = {}; 75 | if (validation && validation.error) { 76 | newState = { error: validation.error, warning: null }; 77 | } else if (validation && validation.warning) { 78 | newState = { error: null, warning: validation.warning }; 79 | } else { 80 | newState = { error: null, warning: null }; 81 | } 82 | this.setState(newState, () => { 83 | this.saveToLocalStorage(true, false, false); 84 | }); 85 | }, 300); 86 | 87 | public componentDidMount() { 88 | if (!CodeFlask) { 89 | return; 90 | } 91 | this.loadFromLocalStorage(() => { 92 | const flask = new CodeFlask('#my-code-editor', { 93 | language: 'chatito', 94 | lineNumbers: true 95 | }); 96 | flask.addLanguage('chatito', chatitoPrism); 97 | flask.onUpdate(code => { 98 | if (!this.tabs || !this.tabs[this.state.activeTabIndex]) { 99 | return; 100 | } 101 | this.codeInputValue = code; 102 | this.tabs[this.state.activeTabIndex].value = code; 103 | // NOTE: ugly hack to know when codeflask is mounted (it makes 2 calls to update on mount) 104 | if (this.editorUpdatesSetupCount < 2) { 105 | this.editorUpdatesSetupCount++; 106 | } else { 107 | this.setState({ dataset: null }); 108 | this.debouncedTabDSLValidation(); 109 | } 110 | }); 111 | if (this.tabs && this.tabs[this.state.activeTabIndex]) { 112 | flask.updateCode(this.tabs[this.state.activeTabIndex].value); 113 | } 114 | flask.setLineNumber(); 115 | this.codeflask = flask; 116 | }); 117 | } 118 | 119 | public render() { 120 | const alertState = !!this.state.error ? 'error' : !!this.state.warning ? 
'warning' : 'success'; 121 | return ( 122 | 123 | 124 | {this.tabs.map(this.renderTabButton)} 125 | 126 | New file 127 | 128 | 129 | Generate Dataset 130 | 131 | 132 | 133 | 134 | {' '} 135 | {this.state.error || this.state.warning || `Correct syntax!`} 136 | 137 | 138 | e.stopPropagation()} showDrawer={this.state.showDrawer}> 139 | x 140 | {this.renderDatasetGeneratorSettings()} 141 | {this.renderDatasetPreviewer()} 142 | 143 | 144 | 145 | ); 146 | } 147 | 148 | /* ================== Renderers ================== */ 149 | private renderDatasetGeneratorSettings = () => { 150 | return ( 151 | 152 | Dataset generation settings 153 |
154 | 155 | 156 | 157 | 168 | 169 | 170 | 171 | 172 | 173 | 182 | 183 | 184 |
185 |
186 | 187 | 188 | 196 | 197 | 198 | 199 | 200 | 201 | 211 | 212 | 213 |
214 |
215 | * NLP providers like DialogFlow, Wit.ai and Watson can be used with a conversion tool. Read the  216 | 217 | resources section 218 | 219 |
220 | {this.renderEditAdapterOptions()} 221 |
222 | Generate and download dataset! 223 |
224 |
225 | ); 226 | }; 227 | 228 | private renderEditAdapterOptions = () => { 229 | if (!this.state.useCustomOptions || !ReactJson) { 230 | return null; 231 | } 232 | return ( 233 |
234 |
235 | Edit the adapter custom initial options: 236 |
237 |
238 | 251 |
252 |
253 | ); 254 | }; 255 | 256 | private renderDatasetPreviewer = () => { 257 | if (!this.state.dataset || !ReactJson) { 258 | return null; 259 | } 260 | return ( 261 | 262 | Review the generated training dataset 263 | 273 | 274 | ); 275 | }; 276 | 277 | private renderTabButton = (t, i) => { 278 | const changeTab = () => this.changeTab(i); 279 | const onCloseTab = this.closerTab(i); 280 | return ( 281 | 282 | {t.title} 283 | 284 | 285 | ); 286 | }; 287 | 288 | /* ================== Event Handlers ================== */ 289 | private onCloseDrawer = () => this.setState({ showDrawer: false, dataset: null }); 290 | 291 | private onCustomOptionsCheckboxChange = e => { 292 | let adapterOptions = {}; 293 | if (this.state.currentAdapter === 'rasa') { 294 | adapterOptions = Object.assign({}, rasaDefaultOptions); 295 | } else if (this.state.currentAdapter === 'snips') { 296 | adapterOptions = Object.assign({}, snipsDefaultOptions); 297 | } 298 | this.setState({ useCustomOptions: e.target.checked, adapterOptions, dataset: null }, () => { 299 | this.saveToLocalStorage(false, true, true); 300 | }); 301 | }; 302 | 303 | private onAdapterChange = e => { 304 | let adapterOptions = {}; 305 | if (e.target.value === 'rasa') { 306 | adapterOptions = Object.assign({}, rasaDefaultOptions); 307 | } else if (e.target.value === 'snips') { 308 | adapterOptions = Object.assign({}, snipsDefaultOptions); 309 | } 310 | this.setState({ currentAdapter: e.target.value, adapterOptions, dataset: null }, () => { 311 | this.saveToLocalStorage(false, true, true); 312 | }); 313 | }; 314 | 315 | private onDistributionChange = e => { 316 | this.setState( 317 | { 318 | frequencyDistribution: e.target.value === 'even' ? 'even' : 'regular', 319 | dataset: null 320 | }, 321 | () => this.saveToLocalStorage(false, true, true) 322 | ); 323 | }; 324 | 325 | private onAutoAliasesChange = (e: React.ChangeEvent) => { 326 | if ((chatito.VALID_AUTO_ALIASES as readonly string[]).includes(e.target.value)) { 327 | this.setState( 328 | { 329 | autoAliases: e.target.value as chatito.autoAliasesType, 330 | dataset: null 331 | }, 332 | () => this.saveToLocalStorage(false, true, true) 333 | ); 334 | } 335 | }; 336 | 337 | private onEditAdapterOptions = changes => { 338 | if (changes && changes.updated_src) { 339 | this.setState({ adapterOptions: changes.updated_src }, () => { 340 | this.saveToLocalStorage(false, true, false); 341 | }); 342 | return null; 343 | } 344 | return false; 345 | }; 346 | 347 | private onAddFile = () => { 348 | let filename = 'newFile'; 349 | if (window && window.prompt) { 350 | filename = prompt('Please enter the new .chatito file name:', filename); 351 | } 352 | if (filename) { 353 | this.tabs.push({ title: `${filename}.chatito`, value: '' }); 354 | this.changeTab(this.tabs.length - 1, () => { 355 | this.tabsContainer.current.scrollTo({ 356 | left: this.tabsContainer.current.scrollWidth, 357 | behavior: 'smooth' 358 | }); 359 | }); 360 | } 361 | }; 362 | 363 | private onToggleDrawer = async () => { 364 | if (!this.state.showDrawer) { 365 | if (this.validateChatitoFiles()) { 366 | try { 367 | this.setState({ showDrawer: !this.state.showDrawer }); 368 | } catch (e) { 369 | return; 370 | } 371 | } else { 372 | if (window && window.alert) { 373 | window.alert('Please fix the errors or warnings found in the code.'); 374 | } 375 | } 376 | } 377 | }; 378 | 379 | /* ================== Utils ================== */ 380 | 381 | private saveToLocalStorage = (saveTabs, saveAdapterOptions, saveCurrentAdapter) => { 382 | if (window && 
localStorage) { 383 | if (saveTabs) { 384 | localStorage.setItem('___tabs', JSON.stringify(this.tabs)); 385 | } 386 | if (saveAdapterOptions) { 387 | localStorage.setItem('___adapterOptions', this.state.useCustomOptions ? JSON.stringify(this.state.adapterOptions) : ''); 388 | localStorage.setItem('___defaultDistribution', this.state.frequencyDistribution); 389 | localStorage.setItem('___autoAliases', this.state.autoAliases); 390 | } 391 | if (saveCurrentAdapter) { 392 | localStorage.setItem('___currentAdapter', this.state.currentAdapter); 393 | } 394 | } 395 | }; 396 | 397 | private loadFromLocalIfPresent = (key: string, parseAsJSON: boolean) => { 398 | if (window && localStorage) { 399 | try { 400 | const item = localStorage.getItem(key); 401 | if (!parseAsJSON) { 402 | return item; 403 | } 404 | if (item) { 405 | try { 406 | return JSON.parse(item); 407 | } catch (e) { 408 | // just catch the error 409 | } 410 | } 411 | } catch (e) { 412 | logger.error(e); 413 | } 414 | } 415 | }; 416 | 417 | private loadFromLocalStorage = (cb: () => void) => { 418 | if (window && localStorage) { 419 | const newState: IEditorState = this.state; 420 | const localTabs = this.loadFromLocalIfPresent('___tabs', true); 421 | const localAdapterOptions = this.loadFromLocalIfPresent('___adapterOptions', true); 422 | const localCurrentAdapter = this.loadFromLocalIfPresent('___currentAdapter', false); 423 | const localDefaultDistribution: string | undefined = this.loadFromLocalIfPresent('___defaultDistribution', false); 424 | const localAutoAliases: string | undefined = this.loadFromLocalIfPresent('___autoAliases', false); 425 | this.tabs = localTabs ? localTabs : tabs; 426 | if (localAdapterOptions) { 427 | newState.adapterOptions = localAdapterOptions; 428 | newState.useCustomOptions = true; 429 | } 430 | if (localCurrentAdapter) { 431 | newState.currentAdapter = localCurrentAdapter; 432 | } 433 | if (localDefaultDistribution && (chatito.VALID_DISTRIBUTIONS as readonly string[]).includes(localDefaultDistribution)) { 434 | newState.frequencyDistribution = localDefaultDistribution as chatito.distributionType; 435 | } 436 | if (localAutoAliases && (chatito.VALID_AUTO_ALIASES as readonly string[]).includes(localAutoAliases)) { 437 | newState.autoAliases = localAutoAliases as chatito.autoAliasesType; 438 | } 439 | this.setState(newState, cb); 440 | } else { 441 | this.tabs = tabs; 442 | } 443 | cb(); 444 | }; 445 | 446 | private changeTab = (i: number, cb?: () => void) => { 447 | this.setState({ activeTabIndex: i }, () => { 448 | this.codeflask.updateCode(this.tabs[this.state.activeTabIndex].value); 449 | this.codeflask.setLineNumber(); 450 | if (cb) { 451 | setTimeout(cb, 600); // note; hack using setTimeout because codeflask uses a timeout on update code 452 | } 453 | }); 454 | }; 455 | 456 | private closerTab = (i: number) => { 457 | return (e: React.SyntheticEvent) => { 458 | if (e) { 459 | e.stopPropagation(); 460 | } 461 | if (this.tabs[i].value) { 462 | if (!window.confirm(`Do you really want to remove '${this.tabs[i].title}'?`)) { 463 | return; 464 | } 465 | } 466 | const ati = this.state.activeTabIndex; 467 | let newActiveTabIndex = this.state.activeTabIndex; 468 | if (ati === i && ati > 0) { 469 | newActiveTabIndex = ati - 1; 470 | } 471 | this.tabs = [...this.tabs.slice(0, i), ...this.tabs.slice(i + 1)]; 472 | if (!this.tabs.length) { 473 | this.tabs.push({ title: 'newFile.chatito', value: '' }); 474 | newActiveTabIndex = 0; 475 | } 476 | this.saveToLocalStorage(true, false, false); 477 | 
this.changeTab(newActiveTabIndex); 478 | }; 479 | }; 480 | 481 | private getDSLValidation = (dsl: string): null | { error?: string; warning?: string } => { 482 | try { 483 | const ast = chatito.astFromString(dsl); 484 | const intentsWithoutLimit = ast.filter(entity => entity.type === 'IntentDefinition' && entity.args === null); 485 | if (intentsWithoutLimit.length) { 486 | return { 487 | warning: `Warning: Limit the number of generated examples for intents. E.g.: %[${intentsWithoutLimit[0].key}]('training': '100')` 488 | }; 489 | } 490 | return null; 491 | } catch (e) { 492 | const error = 493 | e.constructor === Error 494 | ? e.toString() 495 | : `${e.name}: ${e.message} Line: ${e.location.start.line}, Column: ${e.location.start.column}`; 496 | return { error }; 497 | } 498 | }; 499 | 500 | private validateChatitoFiles = () => { 501 | return !this.tabs.some((tab, i) => { 502 | if (tab.value) { 503 | const validation = this.getDSLValidation(tab.value); 504 | if (validation !== null) { 505 | this.changeTab(i); 506 | return true; 507 | } 508 | } 509 | return false; 510 | }); 511 | }; 512 | 513 | private importFile = (startPath: string, endPath: string) => { 514 | const filename = endPath.replace(/^\.\//, ''); 515 | const tabFound = this.tabs.find(t => t.title.trim() === filename); 516 | if (!tabFound) { 517 | throw new Error(`Can't import ${endPath}. Not found.`); 518 | } 519 | // note: returning empty path since there is no actual filesystem 520 | return { filePath: '', dsl: tabFound.value }; 521 | }; 522 | 523 | private generateDataset = async () => { 524 | let dataset: IDataset | null = null; 525 | const testingDataset = {}; 526 | const adapter = adapters[this.state.currentAdapter]; 527 | if (!adapter) { 528 | return; 529 | } 530 | chatito.config.defaultDistribution = this.state.frequencyDistribution; 531 | chatito.config.autoAliases = this.state.autoAliases; 532 | for (const [i, tab] of this.tabs.entries()) { 533 | try { 534 | if (dataset === null && this.state.useCustomOptions && this.state.adapterOptions) { 535 | dataset = JSON.parse(JSON.stringify(this.state.adapterOptions)); 536 | } 537 | const { training, testing } = await adapter.adapter(tab.value, dataset, this.importFile, ''); 538 | dataset = training; 539 | utils.mergeDeep(testingDataset, testing); 540 | } catch (e) { 541 | this.setState({ dataset: null, showDrawer: false }, () => { 542 | this.changeTab(i, () => 543 | this.setState({ error: e.message }, () => { 544 | if (window && window.alert) { 545 | logger.log(e); 546 | window.alert(`Please fix error: ${e.message}`); 547 | } 548 | }) 549 | ); 550 | }); 551 | return; 552 | } 553 | } 554 | const datasetBlob = new Blob([JSON.stringify(dataset)], { type: 'text/json;charset=utf-8' }); 555 | const testingBlob = new Blob([JSON.stringify(testingDataset)], { type: 'text/json;charset=utf-8' }); 556 | saveAs(datasetBlob, `training_dataset_${Math.round(new Date().getTime() / 1000)}.json`); 557 | setTimeout(() => { 558 | saveAs(testingBlob, `testing_dataset_${Math.round(new Date().getTime() / 1000)}.json`); 559 | }, 100); // note: timeout to allow multiple downloads at once 560 | this.setState({ dataset }); 561 | }; 562 | } 563 | -------------------------------------------------------------------------------- /src/main.ts: -------------------------------------------------------------------------------- 1 | import { Chance } from 'chance'; 2 | import { 3 | IChatitoCache, 4 | IChatitoEntityAST, 5 | IChatitoParser, 6 | IEntities, 7 | IEntityDef, 8 | ISentenceTokens, 9 | IStatCache, 10 | 
IUtteranceWriter 11 | } from './types'; 12 | 13 | const logger = console; 14 | 15 | export const VALID_DISTRIBUTIONS = ['regular', 'even'] as const; 16 | export const VALID_AUTO_ALIASES = ['allow', 'warn', 'restrict'] as const; 17 | 18 | export type distributionType = typeof VALID_DISTRIBUTIONS[number]; 19 | export type autoAliasesType = typeof VALID_AUTO_ALIASES[number]; 20 | 21 | export interface IConfigOptions { 22 | defaultDistribution?: distributionType; 23 | autoAliases?: autoAliasesType; 24 | } 25 | 26 | type Configuration = Required; 27 | 28 | export const config: Configuration = { 29 | defaultDistribution: 'regular', 30 | autoAliases: 'allow' 31 | }; 32 | 33 | // tslint:disable-next-line:no-var-requires 34 | const chatito = require('../parser/chatito') as IChatitoParser; 35 | const chance = new Chance(); 36 | 37 | /** 38 | * Returns the entity key for the Alias/Slot that `token` refers to 39 | * @param token Sentence's token 40 | */ 41 | const getEntityKey = (token: ISentenceTokens) => (token.variation ? `${token.value}#${token.variation}` : token.value); 42 | 43 | const chatitoFormatPostProcess = (data: ISentenceTokens[]) => { 44 | const arr = data.reduce( 45 | (accumulator: ISentenceTokens[], next: ISentenceTokens, i, arrShadow) => { 46 | if (accumulator.length) { 47 | const lastWord = accumulator[accumulator.length - 1]; 48 | if (lastWord.type === next.type && lastWord.type === 'Text') { 49 | accumulator[accumulator.length - 1] = { 50 | type: lastWord.type, 51 | value: (lastWord.value + next.value).replace(/\s+/g, ' ') 52 | }; 53 | } else { 54 | accumulator.push(next); 55 | } 56 | } else if (next.value.trim()) { 57 | accumulator.push(next); 58 | } 59 | if (i === arrShadow.length - 1) { 60 | // if its the last token of a sentence 61 | // remove empty strings at the end 62 | if (accumulator.length) { 63 | if (!accumulator[accumulator.length - 1].value.trim()) { 64 | accumulator.pop(); 65 | } 66 | accumulator[accumulator.length - 1] = Object.assign({}, accumulator[accumulator.length - 1], { 67 | value: accumulator[accumulator.length - 1].value.replace(/\s+$/g, '') 68 | }); 69 | } 70 | } 71 | return accumulator; 72 | }, 73 | [] as ISentenceTokens[] 74 | ); 75 | if (arr.length) { 76 | arr[0] = Object.assign({}, arr[0], { 77 | value: arr[0].value.replace(/^\s+/, '') 78 | }); 79 | } 80 | if (!arr.length) { 81 | throw new Error(`Some sentence generated an empty string. 
Can't map empty to an intent.`); 82 | } 83 | return arr; 84 | }; 85 | 86 | const calcSentencesProbabilities = ( 87 | isPercentageProbability: boolean, 88 | isEvenDistribution: boolean, 89 | definedSentenceProbabilities: Array<number | null>, 90 | sumOfTotalProbabilitiesDefined: number, 91 | maxCounts: number[] 92 | ) => { 93 | let sentencesWithNullProbabilityCount = 0; 94 | let totalMaxCountsToShareBetweenNullProbs = 0; 95 | definedSentenceProbabilities.forEach((prob, i) => { 96 | if (prob === null) { 97 | sentencesWithNullProbabilityCount += 1; 98 | totalMaxCountsToShareBetweenNullProbs += maxCounts[i]; 99 | } 100 | }); 101 | let probabilities: number[]; 102 | if (isPercentageProbability) { 103 | // if the defined probabilities are percentages, calculate each sentence's chance as a percentage 104 | probabilities = definedSentenceProbabilities.map((p, i) => { 105 | if (p !== null) { 106 | return p; 107 | } 108 | if (isEvenDistribution) { 109 | return (100 - sumOfTotalProbabilitiesDefined) / sentencesWithNullProbabilityCount; 110 | } 111 | return (((maxCounts[i] * 100) / totalMaxCountsToShareBetweenNullProbs) * (100 - sumOfTotalProbabilitiesDefined)) / 100; 112 | }); 113 | } else { 114 | // if probabilityTypeDefined is weighted, then multiply the weight by max counts 115 | probabilities = definedSentenceProbabilities.map((p, i) => { 116 | if (p !== null) { 117 | return isEvenDistribution ? p : maxCounts[i] * p; 118 | } 119 | if (isEvenDistribution) { 120 | return 1; 121 | } 122 | return maxCounts[i]; 123 | }); 124 | } 125 | return probabilities; 126 | }; 127 | 128 | // recursive function that generates variations using a cache 129 | // that uses counts to avoid repetitions 130 | export const getVariationsFromEntity = async ( 131 | ed: IChatitoEntityAST, 132 | entities: IEntities, 133 | optional: boolean, 134 | cache: IChatitoCache 135 | ): Promise<ISentenceTokens[]> => { 136 | // if this entity is a slot variation, add that as the key 137 | const variationKey = ed.variation ? `#${ed.variation}` : ''; 138 | const cacheKey = `${ed.type}-${ed.key}${variationKey}`; 139 | let cacheStats = cache.get(cacheKey) as IStatCache; 140 | if (!cacheStats) { 141 | // if the entity is not cached yet, create an empty cache for it 142 | const counts: IChatitoCache[] = []; 143 | const maxCounts: number[] = ed.inner.map(s => s.cardinality!); 144 | let probabilityTypeDefined: 'w' | '%' | null = null; 145 | const definedSentenceProbabilities: Array<number | null> = []; // the probability operators defined for the sentences 146 | let isEvenDistribution = config.defaultDistribution === 'even'; 147 | if (ed.args && ed.args.distribution) { 148 | isEvenDistribution = ed.args.distribution === 'even'; 149 | } 150 | let sumOfTotalProbabilitiesDefined = 0; 151 | for (const c of ed.inner) { 152 | // get counts for each of the sentences inside the entity 153 | counts.push(new Map()); 154 | if (c.probability === null) { 155 | definedSentenceProbabilities.push(null); 156 | } else { 157 | const p = c.probability || ''; 158 | const isPercent = p.slice(-1) === '%'; 159 | const sentenceProbabilityType = isPercent ? '%' : 'w'; 160 | if (probabilityTypeDefined === null) { 161 | probabilityTypeDefined = sentenceProbabilityType; 162 | } else if (sentenceProbabilityType !== probabilityTypeDefined) { 163 | throw new Error(`All probability definitions for "${cacheKey}" must be of the same type.`); 164 | } 165 | const prob = parseFloat(isPercent ? p.slice(0, -1) : p); 166 | if (isPercent) { 167 | if (prob <= 0 || prob > 100) { 168 | throw new Error(`Probability "${p}" must be greater than 0 up to 100.
At ${cacheKey}`); 169 | } 170 | } else if (setenceProbabilityType === 'w') { 171 | if (prob <= 0) { 172 | throw new Error(`Probability weight "${p}" must be greater than 0. At ${cacheKey}`); 173 | } 174 | } 175 | sumOfTotalProbabilitiesDefined += prob; 176 | definedSentenceProbabilities.push(prob); 177 | } 178 | } 179 | if (probabilityTypeDefined === '%' && sumOfTotalProbabilitiesDefined && sumOfTotalProbabilitiesDefined > 100) { 180 | throw new Error( 181 | `The sum of sentence probabilities (${sumOfTotalProbabilitiesDefined}) for an entity can't be higher than 100%. At ${cacheKey}` 182 | ); 183 | } 184 | const isPercentageProbability = probabilityTypeDefined === '%'; 185 | const probabilities = calcSentencesProbabilities( 186 | isPercentageProbability, 187 | isEvenDistribution, 188 | definedSentenceProbabilities, 189 | sumOfTotalProbabilitiesDefined, 190 | maxCounts 191 | ); 192 | const currentEntityCache: IStatCache = { counts, maxCounts, probabilities }; 193 | cache.set(cacheKey, currentEntityCache); 194 | cacheStats = cache.get(cacheKey) as IStatCache; 195 | } 196 | // NOTE: if an entity has 5 sentences we add one (the optional empty sentence) and get that probability 197 | const optionalProb = 100 / (cacheStats.probabilities.length + 1); 198 | const sentenceIndex = chance.weighted(Array.from(cacheStats.probabilities.keys()), cacheStats.probabilities); 199 | if (optional && chance.bool({ likelihood: optionalProb })) { 200 | return []; 201 | } 202 | const sentence = ed.inner[sentenceIndex].sentence; 203 | let accumulator: ISentenceTokens[] = []; 204 | // For slots where a sentence is composed of only one alias, we add the synonym tag, 205 | // to denote that the generated alias is a synonym of its alias name 206 | const isSlotDefSentenceWithOnlyOneAlias = ed.type === 'SlotDefinition' && sentence.length === 1 && sentence[0].type === 'Alias'; 207 | for (const t of sentence) { 208 | // slots and alias entities generate the sentences recursively 209 | const slotsInSentenceKeys: Set = new Set([]); 210 | if (t.type === 'Slot' || t.type === 'Alias') { 211 | const def = entities[t.type]; 212 | const innerEntityKey = getEntityKey(t); 213 | const currentCache = slotsInSentenceKeys.has(innerEntityKey) ? cacheStats.counts[sentenceIndex] : new Map(); 214 | slotsInSentenceKeys.add(innerEntityKey); 215 | const sentenceVariation = await getVariationsFromEntity(def[innerEntityKey], entities, !!t.opt, currentCache); 216 | if (sentenceVariation.length) { 217 | const returnSentenceTokens = chatitoFormatPostProcess(sentenceVariation); 218 | for (const returnToken of returnSentenceTokens) { 219 | const ettArgs = def[innerEntityKey].args; 220 | if (isSlotDefSentenceWithOnlyOneAlias && ettArgs && ettArgs.synonym === 'true') { 221 | returnToken.synonym = t.value; 222 | } 223 | if (t.type === 'Slot') { 224 | if (def[innerEntityKey].args) { 225 | returnToken.args = def[innerEntityKey].args; 226 | } 227 | returnToken.value = returnToken.value.trim(); 228 | returnToken.type = t.type; 229 | returnToken.slot = t.value; 230 | } 231 | accumulator = accumulator.concat(returnToken); 232 | } 233 | } 234 | } else { 235 | accumulator = accumulator.concat(t); 236 | } 237 | } 238 | return accumulator; 239 | }; 240 | 241 | /** 242 | * Picks the `combinationNumber`th example amongst all possible `entity` examples. 
243 | * 244 | * @param defs All entities definitions 245 | * @param entity Entity to get the example from 246 | * @param combinationNumber The number of the example 247 | */ 248 | export const getExampleByNumber = (defs: IEntities, entity: IChatitoEntityAST, combinationNumber: number): ISentenceTokens[] => { 249 | let lookupNumber = combinationNumber; 250 | const sentence = entity.inner.find(s => { 251 | if (lookupNumber < s.cardinality!) { 252 | return true; 253 | } 254 | lookupNumber -= s.cardinality!; 255 | return false; 256 | }); 257 | if (!sentence) { 258 | return []; 259 | } 260 | let prevCardinality = 1; 261 | let prevRemaining = 0; 262 | const isSlotDefSentenceWithOnlyOneAlias = 263 | entity.type === 'SlotDefinition' && sentence.sentence.length === 1 && sentence.sentence[0].type === 'Alias'; 264 | const resultTokens = sentence.sentence.reduce( 265 | (example, token) => { 266 | if (token.type === 'Text') { 267 | return example.concat([token]); 268 | } 269 | if (token.type === 'Slot' || token.type === 'Alias') { 270 | let cardinality = token.opt ? 1 : 0; 271 | const innerEntity = token.type === 'Alias' ? defs.Alias : defs.Slot; 272 | const entityKey = getEntityKey(token); 273 | cardinality += innerEntity[entityKey].cardinality!; 274 | lookupNumber = (lookupNumber - prevRemaining) / prevCardinality; 275 | prevRemaining = lookupNumber % cardinality; 276 | prevCardinality = cardinality; 277 | if (prevRemaining === 0 && token.opt) { 278 | return example; 279 | } 280 | const innerNumber = token.opt ? prevRemaining - 1 : prevRemaining; 281 | let tokens = getExampleByNumber(defs, innerEntity[entityKey], innerNumber); 282 | tokens = chatitoFormatPostProcess(tokens).map(t => { 283 | const ettArgs = innerEntity[entityKey].args; 284 | if (isSlotDefSentenceWithOnlyOneAlias && ettArgs && ettArgs.synonym === 'true') { 285 | t.synonym = token.value; 286 | } 287 | if (token.type === 'Slot') { 288 | if (innerEntity[entityKey].args) { 289 | t.args = innerEntity[entityKey].args; 290 | } 291 | t.value = t.value.trim(); 292 | t.type = token.type; 293 | t.slot = token.value; 294 | } 295 | return t; 296 | }); 297 | return example.concat(tokens); 298 | } 299 | throw Error(`Unknown token type: ${token.type}`); 300 | }, 301 | [] as ISentenceTokens[] 302 | ); 303 | return chatitoFormatPostProcess(resultTokens); 304 | }; 305 | 306 | /** 307 | * Returns a generator providing every possible combination of entity's examples 308 | * including duplicates. 309 | * 310 | * @param defs All entities definitions 311 | * @param entity Entity to get all examples for 312 | */ 313 | export function* allExamplesGenerator(defs: IEntities, entity: IChatitoEntityAST) { 314 | for (let i = 0; i < entity.cardinality!; i++) { 315 | yield getExampleByNumber(defs, entity, i); 316 | } 317 | } 318 | 319 | /** 320 | * Calculates the cardinality of the `sentence`. 321 | * All the entities used in the sentence must already have their cardinalities 322 | * calculated. 323 | * 324 | * @param defs All entities definitions 325 | * @param sentence Sentence tokens 326 | */ 327 | const getCardinality = (defs: IEntities, sentence: ISentenceTokens[]) => { 328 | return sentence.reduce((acc, token) => { 329 | if (token.type === 'Text') { 330 | return acc; 331 | } 332 | const entity = token.type === 'Alias' ? 
defs.Alias : defs.Slot; 333 | const entityKey = getEntityKey(token); 334 | 335 | let tokenCardinality = entity[entityKey].cardinality!; 336 | if (token.opt) { 337 | tokenCardinality += 1; 338 | } 339 | return acc * tokenCardinality; 340 | }, 1); 341 | }; 342 | 343 | /** 344 | * Calculates the cardinality of the `entity`. 345 | * All the entities used in the entity must already have their cardinalities 346 | * calculated. 347 | * 348 | * @param defs All entities definitions 349 | * @param entity Entity to calc cardinality for 350 | */ 351 | const calcCardinality = (defs: IEntities, entity: IChatitoEntityAST) => { 352 | entity.inner.forEach(sentence => { 353 | const cardinality = getCardinality(defs, sentence.sentence); 354 | sentence.cardinality = cardinality; 355 | }); 356 | entity.cardinality = entity.inner.reduce((acc, sentence) => acc + sentence.cardinality!, 0); 357 | }; 358 | 359 | /** 360 | * Returns human readable string representing an entity. 361 | * Returns the same string for entity definition and it's use in a token. 362 | * 363 | * @param item Token or Entity definition 364 | */ 365 | const getRefKey = (item: IChatitoEntityAST | ISentenceTokens) => { 366 | const type = item.type.replace('Definition', ''); 367 | const key = 'key' in item ? item.key : getEntityKey(item); 368 | switch (type) { 369 | case 'Intent': 370 | return `%[${key}]`; 371 | case 'Alias': 372 | return `~[${key}]`; 373 | case 'Slot': 374 | return `@[${key}]`; 375 | 376 | default: 377 | return `(${key})`; 378 | } 379 | }; 380 | 381 | /** 382 | * Returns true if the `entity` has any entity with cardinality not yet being 383 | * calculated. 384 | * Also populates `refs` map. 385 | * 386 | * @param defs All entities definitions 387 | * @param entity An Entity 388 | * @param refs A map of entities references 389 | */ 390 | const hasTokenWithoutCardinality = (defs: IEntities, entity: IChatitoEntityAST, refs: { [key: string]: Set }) => { 391 | const parentKey = getRefKey(entity); 392 | return entity.inner.some(sentence => 393 | sentence.sentence.some(token => { 394 | if (token.type === 'Text') { 395 | return false; 396 | } 397 | const entityKey = getEntityKey(token); 398 | const refKey = getRefKey(token); 399 | if (refKey in refs) { 400 | refs[refKey].add(parentKey); 401 | } else { 402 | refs[refKey] = new Set([parentKey]); 403 | } 404 | if (!defs[token.type][entityKey]) { 405 | throw new Error(`${token.type} not defined: ${entityKey}`); 406 | } 407 | return defs[token.type][entityKey].cardinality === undefined; 408 | }) 409 | ); 410 | }; 411 | 412 | /** 413 | * Throws an error showing loop path if there is any in entities references (`refs`) 414 | * starting with `path` path. 415 | * 416 | * @param path Current path 417 | * @param refs Entities references map 418 | */ 419 | const checkLoopIn = (path: string[], refs: { [key: string]: Set }) => { 420 | const last = path[path.length - 1]; 421 | if (refs[last]) { 422 | for (const parent of refs[last]) { 423 | if (parent === path[0]) { 424 | const loop = path.concat([parent]).reverse(); 425 | throw new Error(`You have a circular nesting: ${loop.join(' -> ')}. 
Infinite loop prevented.`); 426 | } else { 427 | checkLoopIn(path.concat([parent]), refs); 428 | } 429 | } 430 | } 431 | }; 432 | 433 | /** 434 | * Throws an error showing loop path if there is any in entities references (`refs`) 435 | * 436 | * @param refs Entities references map 437 | */ 438 | const checkLoop = (refs: { [key: string]: Set }) => { 439 | for (const key of Object.keys(refs)) { 440 | const path = [key]; 441 | checkLoopIn(path, refs); 442 | } 443 | }; 444 | 445 | /** 446 | * Throws an error showing slots nesting path if there is any 447 | * in the entitiesreferences (`refs`) starting with `path` path. 448 | * 449 | * @param path Current path 450 | * @param refs Entities references map 451 | */ 452 | const findNestedSlots = (path: string[], refs: { [key: string]: Set }) => { 453 | const last = path[path.length - 1]; 454 | if (refs[last]) { 455 | for (const parent of refs[last]) { 456 | const firstIndex = path.findIndex(item => item.startsWith('@')); 457 | if (firstIndex !== -1 && parent.startsWith('@')) { 458 | const slotsPath = path 459 | .slice(firstIndex) 460 | .reverse() 461 | .join(' -> '); 462 | throw new Error(`You have nested slots: ${parent} -> ${slotsPath}. A slot can't reference other slot.`); 463 | } else { 464 | findNestedSlots(path.concat([parent]), refs); 465 | } 466 | } 467 | } 468 | }; 469 | 470 | /** 471 | * Throws an error showing slots nesting path if there is any 472 | * in the entitiesreferences (`refs`). 473 | * 474 | * @param refs Entities references map 475 | */ 476 | const checkNestedSlots = (refs: { [key: string]: Set }) => { 477 | for (const key of Object.keys(refs)) { 478 | const path = [key]; 479 | findNestedSlots(path, refs); 480 | } 481 | }; 482 | 483 | /** 484 | * Calculates cardinalities for all entities. 485 | * Also checks for nested slots. 486 | * 487 | * @param defs All entities definitions 488 | */ 489 | const preCalcCardinality = (defs: IEntities) => { 490 | // cycle through uncalculated: 491 | const uncalced = { 492 | Intent: [] as string[], 493 | Alias: [] as string[], 494 | Slot: [] as string[] 495 | }; 496 | const refs: { [key: string]: Set } = {}; 497 | let totalUncalced = 0; 498 | let lastUncalced = -1; 499 | do { 500 | totalUncalced = 0; 501 | for (const type of Object.keys(uncalced) as Array) { 502 | uncalced[type] = Object.keys(defs[type]).filter(key => defs[type][key].cardinality === undefined); 503 | uncalced[type].forEach(key => { 504 | if (!hasTokenWithoutCardinality(defs, defs[type][key], refs)) { 505 | calcCardinality(defs, defs[type][key]); 506 | } else { 507 | totalUncalced += 1; 508 | } 509 | }); 510 | } 511 | if (lastUncalced === totalUncalced) { 512 | checkLoop(refs); 513 | } 514 | lastUncalced = totalUncalced; 515 | } while (totalUncalced > 0); 516 | checkNestedSlots(refs); 517 | }; 518 | 519 | /** 520 | * Adds missing alias definitions. 521 | * When alias is used in sentence tokens but not defined. 
522 | * 523 | * @param defs All entities definitions 524 | */ 525 | const addMissingAliases = (defs: IEntities) => { 526 | const aliases = new Set(); 527 | for (const entities of [defs.Alias, defs.Slot, defs.Intent]) { 528 | for (const key of Object.keys(entities)) { 529 | entities[key].inner.forEach(sentence => { 530 | sentence.sentence.forEach(token => { 531 | if (token.type === 'Alias') { 532 | aliases.add(token.value); 533 | } 534 | }); 535 | }); 536 | } 537 | } 538 | for (const alias of aliases) { 539 | if (!defs.Alias[alias]) { 540 | if (config.autoAliases === 'warn') { 541 | // tslint:disable-next-line: no-console 542 | console.warn(`WARNING! Auto alias creation: '${alias}'`); 543 | } 544 | defs.Alias[alias] = { 545 | inner: [{ sentence: [{ value: alias, type: 'Text' }], probability: null }], 546 | key: alias, 547 | type: 'AliasDefinition' 548 | }; 549 | } 550 | } 551 | }; 552 | 553 | export type IFileImporter = ( 554 | fromPath: string, 555 | importFile: string 556 | ) => { 557 | filePath: string; 558 | dsl: string; 559 | }; 560 | 561 | export const astFromString = (str: string) => chatito.parse(str); 562 | export const datasetFromString = (str: string, writterFn: IUtteranceWriter, importer?: IFileImporter, currentPath?: string) => { 563 | const ast = astFromString(str); 564 | return datasetFromAST(ast, writterFn, importer, currentPath); 565 | }; 566 | 567 | export const getImports = (from: string, to: string, importer: IFileImporter) => { 568 | const fileContent = importer(from, to); 569 | if (!fileContent || !fileContent.dsl) { 570 | throw new Error(`Failed importing ${to}`); 571 | } 572 | try { 573 | const importAst = astFromString(fileContent.dsl); 574 | let outAst: IChatitoEntityAST[] = []; 575 | importAst.forEach(ett => { 576 | if (ett.type === 'ImportFile' && ett.value) { 577 | outAst = [...outAst, ...getImports(fileContent.filePath, ett.value, importer)]; 578 | } else if (ett.type === 'AliasDefinition' || ett.type === 'SlotDefinition') { 579 | outAst = [...outAst, ett]; 580 | } 581 | }); 582 | return outAst; 583 | } catch (e) { 584 | throw new Error(`Failed importing ${to}. ${e.message} - ${JSON.stringify(e.location)}`); 585 | } 586 | }; 587 | 588 | export const definitionsFromAST = (initialAst: IChatitoEntityAST[], importHandler?: IFileImporter, currPath?: string) => { 589 | const operatorDefinitions: IEntities = { Intent: {}, Slot: {}, Alias: {} }; 590 | if (!initialAst || !initialAst.length) { 591 | return; 592 | } 593 | const importer = importHandler ? importHandler : () => ({ filePath: '', dsl: '' }); 594 | const currentPath = currPath ? currPath : ''; 595 | // gete imports first 596 | let ast: IChatitoEntityAST[] = [...initialAst]; 597 | initialAst.forEach(od => { 598 | if (od.type === 'ImportFile' && od.value) { 599 | ast = [...ast, ...getImports(currentPath, od.value, importer)]; 600 | } 601 | }); 602 | ast.forEach(od => { 603 | let entity: IEntityDef; 604 | if (od.type === 'IntentDefinition') { 605 | entity = operatorDefinitions.Intent; 606 | } else if (od.type === 'SlotDefinition') { 607 | entity = operatorDefinitions.Slot; 608 | } else if (od.type === 'AliasDefinition') { 609 | entity = operatorDefinitions.Alias; 610 | } else { 611 | // type is 'Comment' or 'ImportFile' 612 | return; // skip comments 613 | } 614 | const odKey = od.variation ? 
`${od.key}#${od.variation}` : od.key; 615 | if (entity[odKey]) { 616 | throw new Error(`Duplicate definition for ${od.type} '${odKey}'`); 617 | } 618 | entity[odKey] = od; 619 | }); 620 | if (config.autoAliases !== 'restrict') { 621 | addMissingAliases(operatorDefinitions); 622 | } 623 | preCalcCardinality(operatorDefinitions); 624 | return operatorDefinitions; 625 | }; 626 | 627 | export const datasetFromAST = async ( 628 | initialAst: IChatitoEntityAST[], 629 | writterFn: IUtteranceWriter, 630 | importHandler?: IFileImporter, 631 | currPath?: string 632 | ) => { 633 | const operatorDefinitions = definitionsFromAST(initialAst, importHandler, currPath); 634 | if (!operatorDefinitions) { 635 | return; 636 | } 637 | const intentKeys = Object.keys(operatorDefinitions.Intent); 638 | if (!intentKeys || !intentKeys.length) { 639 | return; 640 | } 641 | for (const intentKey of intentKeys) { 642 | // and for all tokens inside the sentence 643 | const maxPossibleCombinations = operatorDefinitions.Intent[intentKey].cardinality!; 644 | let maxIntentExamples = maxPossibleCombinations; // counter that will change 645 | const entityArgs = operatorDefinitions.Intent[intentKey].args; 646 | // by default if no training or testing arguments are declared, all go to training 647 | let trainingN = maxIntentExamples; 648 | let testingN = 0; 649 | let generatedTrainingExamplesCount = 0; 650 | let generatedTestingExamplesCount = 0; 651 | if (entityArgs) { 652 | if (entityArgs.training) { 653 | trainingN = parseInt(entityArgs.training, 10); 654 | if (trainingN < 1) { 655 | throw new Error(`The 'training' argument for ${intentKey} must be higher than 0.`); 656 | } 657 | if (entityArgs.testing) { 658 | testingN = parseInt(entityArgs.testing, 10); 659 | if (testingN < 1) { 660 | throw new Error(`The 'testing' argument for ${intentKey} must be higher than 0.`); 661 | } 662 | } 663 | } 664 | let intentMax = trainingN + testingN; 665 | if (intentMax > maxIntentExamples) { 666 | logger.warn( 667 | `Can't generate ${intentMax} examples. ` + 668 | `Using the maximum possible combinations: ${maxIntentExamples}. ` + 669 | 'NOTE: Using the maximum leads to overfitting.' 
670 | ); 671 | intentMax = maxIntentExamples; 672 | } else if (intentMax < maxIntentExamples) { 673 | maxIntentExamples = intentMax; 674 | } 675 | } 676 | const maxEx = maxIntentExamples; 677 | const globalCache: IChatitoCache = new Map(); 678 | const collitionsCache: { [id: string]: boolean } = {}; 679 | if (maxIntentExamples >= maxPossibleCombinations) { 680 | for (const utterance of allExamplesGenerator(operatorDefinitions, operatorDefinitions.Intent[intentKey])) { 681 | const utteranceString = utterance.reduce((p, n) => p + n.value, ''); 682 | if (!collitionsCache[utteranceString]) { 683 | collitionsCache[utteranceString] = true; 684 | const completedTraining = generatedTrainingExamplesCount >= trainingN; 685 | const completedTesting = generatedTestingExamplesCount >= testingN; 686 | let isTrainingExample = !completedTraining; 687 | if (!completedTraining && !completedTesting) { 688 | const trainingLeft = trainingN - generatedTrainingExamplesCount; 689 | const testingLeft = testingN - generatedTestingExamplesCount; 690 | isTrainingExample = Math.random() < trainingLeft / (trainingLeft + testingLeft); 691 | } 692 | writterFn(utterance, intentKey, isTrainingExample); 693 | if (isTrainingExample) { 694 | generatedTrainingExamplesCount++; 695 | } else { 696 | generatedTestingExamplesCount++; 697 | } 698 | } 699 | } 700 | continue; 701 | } 702 | let duplicatesCounter = 0; 703 | while (maxIntentExamples) { 704 | const intentSentence = await getVariationsFromEntity( 705 | operatorDefinitions.Intent[intentKey], 706 | operatorDefinitions, 707 | false, 708 | globalCache 709 | ); 710 | const utterance = chatitoFormatPostProcess(intentSentence); 711 | const utteranceString = utterance.reduce((p, n) => p + n.value, ''); 712 | if (!collitionsCache[utteranceString]) { 713 | collitionsCache[utteranceString] = true; 714 | const completedTraining = generatedTrainingExamplesCount >= trainingN; 715 | const completedTesting = generatedTestingExamplesCount >= testingN; 716 | let isTrainingExample = !completedTraining; 717 | if (!completedTraining && !completedTesting) { 718 | // reference: https://stackoverflow.com/questions/44263229/generate-a-random-boolean-70-true-30-false 719 | isTrainingExample = Math.random() < 0.7; 720 | } 721 | writterFn(utterance, intentKey, isTrainingExample); 722 | maxIntentExamples--; 723 | if (isTrainingExample) { 724 | generatedTrainingExamplesCount++; 725 | } else { 726 | generatedTestingExamplesCount++; 727 | } 728 | } else { 729 | duplicatesCounter++; 730 | // note: trick to make all combinations for small datasets, but avoid them for large ones 731 | const smallDupesLimit = 10000; 732 | const maxDupes = maxPossibleCombinations * maxPossibleCombinations; 733 | const maxDupesLimit = Math.floor(maxDupes / 2); 734 | const isBigDataset = maxPossibleCombinations > smallDupesLimit; 735 | if ( 736 | (isBigDataset && duplicatesCounter > maxDupesLimit) || 737 | (!isBigDataset && duplicatesCounter > maxDupes * maxPossibleCombinations) 738 | ) { 739 | // prevent cases where duplicates are part of the entity definitions 740 | let m = `Too many duplicates while generating dataset! Looks like we have probably reached `; 741 | m += `the maximum ammount of possible unique generated examples. `; 742 | m += `The generator has stopped at ${maxEx - maxIntentExamples} examples for intent ${intentKey}.`; 743 | logger.warn(m); 744 | maxIntentExamples = 0; 745 | } 746 | } 747 | } 748 | } 749 | }; 750 | --------------------------------------------------------------------------------
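
A minimal usage sketch (not a file from this repository) of the public API exported by src/main.ts above. It assumes the published entry point resolves to src/main.ts and that the writer callback receives the utterance tokens, the intent key and a training/testing flag, which is how writterFn is invoked inside datasetFromAST; the exact IUtteranceWriter signature lives in src/types.ts and is not shown in this section.

import * as chatito from 'chatito'; // assumption: the package entry point re-exports src/main.ts

// a tiny grammar; the ('training': '2') argument syntax matches the hint shown by the web editor above,
// and ~[hi] is undefined on purpose: with the default autoAliases 'allow' it becomes a one-word alias automatically
const dsl = [
    "%[greet]('training': '2')",
    '    ~[hi] friend',
    '    hello there'
].join('\n');

chatito.config.defaultDistribution = 'regular'; // same default as the config object above

const training: { [intent: string]: string[] } = {};
const testing: { [intent: string]: string[] } = {};

chatito.datasetFromString(dsl, (tokens, intentKey, isTraining) => {
    // every generated utterance arrives as sentence tokens; join their values to get the plain text
    const bucket = isTraining ? training : testing;
    if (!bucket[intentKey]) {
        bucket[intentKey] = [];
    }
    bucket[intentKey].push(tokens.map(t => t.value).join(''));
}).then(() => {
    // with no 'testing' argument on the intent, every example is written as training data
    console.log(JSON.stringify(training, null, 2));
});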
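
The IFileImporter contract declared in src/main.ts above leaves file resolution to the caller: the web editor resolves imports against its open tabs (its importFile returns an empty filePath because there is no filesystem), while the CLI resolves them on disk in src/bin.ts, which is not part of this section. Below is a hedged Node sketch of such an importer, assuming fromPath is the path of the file containing the import statement (empty for the entry file) and that the type is re-exported by the package entry point.

import * as fs from 'fs';
import * as path from 'path';
import { IFileImporter } from 'chatito'; // assumption: re-exported from src/main.ts

const fileImporter: IFileImporter = (fromPath, importFile) => {
    // resolve the imported .chatito file relative to the directory of the importing file
    const baseDir = fromPath ? path.dirname(fromPath) : process.cwd();
    const filePath = path.resolve(baseDir, importFile);
    // getImports recurses with the returned filePath, so nested imports keep resolving from the right place
    return { filePath, dsl: fs.readFileSync(filePath, 'utf8') };
};

Such a function can then be passed as the importer argument to datasetFromString, in the same position where the web editor passes its in-memory importFile.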