├── .nvmrc
├── .npmignore
├── screenshot.jpg
├── public
├── favicon.ico
├── favicon-16x16.png
├── favicon-32x32.png
├── apple-touch-icon.png
├── browserconfig.xml
├── site.webmanifest
└── safari-pinned-tab.svg
├── examples
├── importing
│ ├── main.chatito
│ ├── dir
│ │ └── import.chatito
│ └── nestedimport.chatito
├── citySearch_medium.chatito
└── dateBooking_large.chatito
├── .prettierrc
├── web
├── lib
│ ├── utils.ts
│ └── editorConfig.ts
├── pages
│ ├── 404.tsx
│ └── index.tsx
└── components
│ ├── globalStyles.tsx
│ ├── Logo.tsx
│ └── Editor
│ ├── editorStyles.tsx
│ └── Editor.tsx
├── tslint.json
├── tsconfig.json
├── .gitignore
├── gatsby-config.js
├── .circleci
└── config.yml
├── gatsby-node.js
├── .vscode
└── launch.json
├── src
├── adapters
│ ├── web.ts
│ ├── luis.ts
│ ├── flair.ts
│ ├── rasa.ts
│ └── snips.ts
├── utils.ts
├── types.ts
├── bin.ts
├── tests
│ ├── parser.spec.ts
│ └── bin.spec.ts
└── main.ts
├── LICENSE
├── package.json
├── parser
└── chatito.pegjs
├── readme.md
└── spec.md
/.nvmrc:
--------------------------------------------------------------------------------
1 | v10.7.0
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | src/
2 | coverage/
3 | examples/
4 | web/
5 | .vscode/
--------------------------------------------------------------------------------
/screenshot.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/Chatito/master/screenshot.jpg
--------------------------------------------------------------------------------
/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/Chatito/master/public/favicon.ico
--------------------------------------------------------------------------------
/public/favicon-16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/Chatito/master/public/favicon-16x16.png
--------------------------------------------------------------------------------
/public/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/Chatito/master/public/favicon-32x32.png
--------------------------------------------------------------------------------
/examples/importing/main.chatito:
--------------------------------------------------------------------------------
1 | import ./dir/import.chatito
2 |
3 | %[greet]
4 | ~[hi] ~[how are you?]
5 |
--------------------------------------------------------------------------------
/public/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/Chatito/master/public/apple-touch-icon.png
--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "semi": true,
3 | "tabWidth": 4,
4 | "printWidth": 140,
5 | "singleQuote": true
6 | }
--------------------------------------------------------------------------------
/examples/importing/dir/import.chatito:
--------------------------------------------------------------------------------
1 | import ../nestedimport.chatito
2 |
3 | ~[hi]
4 | ~[saludos1] ~[saludos2?]
5 | ~[saludos2]
--------------------------------------------------------------------------------
/examples/importing/nestedimport.chatito:
--------------------------------------------------------------------------------
1 | ~[saludos1]
2 | hola1
3 | hi1
4 | hihi1
5 |
6 | ~[saludos2]
7 | hola2
8 | hhi2
9 | hello2
--------------------------------------------------------------------------------
/web/lib/utils.ts:
--------------------------------------------------------------------------------
/**
 * Wraps `func` so that it only runs once calls have stopped for `wait` ms.
 *
 * Every call cancels the previously scheduled run and schedules a new one, so
 * only the last call's `this` and arguments are used.
 *
 * @param func function to invoke after the quiet period
 * @param wait quiet period in milliseconds
 * @returns the debounced wrapper (its own return value is always undefined)
 */
export function debounce(func, wait) {
    let pendingTimer;
    return function(...args) {
        // Capture the caller's `this` so it can be forwarded to `func` later.
        const self = this;
        const invoke = () => func.apply(self, args);
        clearTimeout(pendingTimer);
        pendingTimer = setTimeout(invoke, wait);
    };
}
9 |
--------------------------------------------------------------------------------
/public/browserconfig.xml:
--------------------------------------------------------------------------------
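1 | <?xml version="1.0" encoding="utf-8"?>
2 | <browserconfig>
3 | <msapplication>
4 | <tile>
5 | <square150x150logo src="/mstile-150x150.png"/>
6 | <TileColor>#da532c</TileColor>
7 | </tile>
8 | </msapplication>
9 | </browserconfig>
10 |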
--------------------------------------------------------------------------------
/web/pages/404.tsx:
--------------------------------------------------------------------------------
1 | import * as React from 'react';
2 |
3 | const NotFoundPage = () => (
4 |
5 |
404: Page not found.
6 |
7 | You've hit the void. Go back.
8 |
9 |
10 | );
11 |
12 | export default NotFoundPage;
13 |
--------------------------------------------------------------------------------
/tslint.json:
--------------------------------------------------------------------------------
1 | {
2 | "defaultSeverity": "error",
3 | "extends": ["tslint:recommended", "tslint-config-prettier"],
4 | "jsRules": {},
5 | "rules": {
6 | "quotemark": [true, "single", "avoid-escape", "jsx-double"],
7 | "object-literal-sort-keys": false,
8 | "prettier": true
9 | },
10 | "rulesDirectory": ["tslint-plugin-prettier"]
11 | }
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2015",
4 | "module": "commonjs",
5 | "jsx": "react",
6 | "strict": true,
7 | "declaration": true,
8 | "removeComments": true,
9 | "outDir": "./dist",
10 | "types": ["node", "jest"]
11 | },
12 | "include": [
13 | "src/**/*"
14 | ],
15 | "exclude": [
16 | "dist",
17 | "node_modules",
18 | "src/tests",
19 | "**/*.spec.ts"
20 | ]
21 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /coverage
2 | /dist
3 | node_modules
4 | .DS_Store
5 | *.swp
6 | .tmp
7 | .cache
8 | /parser/chatito.js
9 | /public/**/*
10 | !/public/android-chrome-192x192.png
11 | !/public/android-chrome-512x512.png
12 | !/public/apple-touch-icon.png
13 | !/public/browserconfig.xml
14 | !/public/favicon-16x16.png
15 | !/public/favicon-32x32.png
16 | !/public/favicon.ico
17 | !/public/mstile-150x150.png
18 | !/public/safari-pinned-tab.svg
19 | !/public/site.webmanifest
20 |
--------------------------------------------------------------------------------
/gatsby-config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | pathPrefix: '/Chatito',
3 | siteMetadata: {
4 | title: 'Chatito'
5 | },
6 | plugins: [
7 | 'gatsby-plugin-typescript',
8 | {
9 | resolve: 'gatsby-plugin-page-creator',
10 | options: {
11 | path: `${__dirname}/web/pages`
12 | }
13 | },
14 | 'gatsby-plugin-react-helmet',
15 | 'gatsby-plugin-styled-components'
16 | ]
17 | };
--------------------------------------------------------------------------------
/public/site.webmanifest:
--------------------------------------------------------------------------------
1 | {
2 | "name": "",
3 | "short_name": "",
4 | "icons": [
5 | {
6 | "src": "/android-chrome-192x192.png",
7 | "sizes": "192x192",
8 | "type": "image/png"
9 | },
10 | {
11 | "src": "/android-chrome-512x512.png",
12 | "sizes": "512x512",
13 | "type": "image/png"
14 | }
15 | ],
16 | "theme_color": "#ffffff",
17 | "background_color": "#ffffff",
18 | "display": "standalone"
19 | }
20 |
--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | jobs:
3 | build:
4 | branches:
5 | only:
6 | - master
7 | docker:
8 | - image: circleci/node:8.11
9 | working_directory: ~/repo
10 | steps:
11 | - checkout
12 | - restore_cache:
13 | keys:
14 | - v1-dependencies-{{ checksum "package.json" }}
15 | - v1-dependencies-
16 | - run: npm install
17 | - save_cache:
18 | paths:
19 | - node_modules
20 | key: v1-dependencies-{{ checksum "package.json" }}
21 | - run: npm run test
22 |
--------------------------------------------------------------------------------
/gatsby-node.js:
--------------------------------------------------------------------------------
1 | exports.onCreateWebpackConfig = ({ actions, stage, loaders }) => {
2 | const jsLoader = loaders.js();
3 | if (stage === 'build-javascript') {
4 | actions.setWebpackConfig({
5 | module: {
6 | rules: [
7 | {
8 | test: /\.tsx?$/,
9 | use: [
10 | {
11 | loader: jsLoader.loader
12 | }
13 | ]
14 | }
15 | ]
16 | }
17 | });
18 | }
19 | };
20 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "0.2.0",
3 | "configurations": [
4 | {
5 | "name": "Debug Current TS File",
6 | "type": "node",
7 | "request": "launch",
8 | "program": "${workspaceRoot}/node_modules/ts-node/dist/bin.js",
9 | "args": ["${relativeFile}"],
10 | "cwd": "${workspaceRoot}",
11 | "protocol": "inspector",
12 | },
13 | {
14 | "name": "Debug Current TS Test File",
15 | "type": "node",
16 | "request": "launch",
17 | "program": "${workspaceRoot}/node_modules/.bin/jest",
18 | "args": ["-i", "${relativeFile}"],
19 | "cwd": "${workspaceRoot}",
20 | "protocol": "inspector"
21 | }
22 | ]
23 | }
24 |
--------------------------------------------------------------------------------
/examples/citySearch_medium.chatito:
--------------------------------------------------------------------------------
1 | %[findByCityAndCategory]('training': '1000', 'testing': '100')
2 | ~[greet?] ~[botName?] ~[please?] ~[find?] ~[restaurants?] ~[nearby] @[city]
3 |
4 | ~[greet]
5 | hey
6 | hi
7 | hello
8 | greetings
9 |
10 | ~[botName]
11 | Pia
12 |
13 | ~[please]
14 | please
15 | pls
16 |
17 | ~[find]
18 | find
19 | search
20 | lookup
21 |
22 | ~[nearby]
23 | close to
24 | in the area of
25 | within
26 | located at
27 | nearby
28 |
29 | ~[restaurants]
30 | restaurants
31 | places to eat
32 | where to eat
33 |
34 | ~[newYork]('synonym': 'true')
35 | new york ~[city?]
36 | ny ~[city?]
37 |
38 | ~[sanFrancisco]('synonym': 'true')
39 | san francisco
40 | san francisco city
41 |
42 | ~[atlanta]('synonym': 'true')
43 | atlanta
44 | atlanta city
45 |
46 | ~[city]
47 | city
48 |
49 | @[city]('entity': 'location')
50 | ~[newYork]
51 | ~[sanFrancisco]
52 | ~[atlanta]
--------------------------------------------------------------------------------
/src/adapters/web.ts:
--------------------------------------------------------------------------------
1 | import * as gen from '../main';
2 | import { ISentenceTokens } from '../types';
3 | import * as utils from '../utils';
4 |
/** Utterances grouped by intent key; each utterance is kept as its list of sentence tokens. */
export interface IDefaultDataset {
    [intent: string]: ISentenceTokens[][];
}

/**
 * Default (web) adapter: collects the raw token lists per intent.
 *
 * @param dsl chatito source text handed to `gen.datasetFromString`
 * @param formatOptions optional object deep-merged into the training dataset before generation
 * @param importer forwarded unchanged to `gen.datasetFromString`
 * @param currentPath forwarded unchanged to `gen.datasetFromString`
 * @returns the `training` and `testing` datasets
 */
export async function adapter(dsl: string, formatOptions?: any, importer?: gen.IFileImporter, currentPath?: string) {
    const training: IDefaultDataset = {};
    const testing: IDefaultDataset = {};
    if (formatOptions) {
        utils.mergeDeep(training, formatOptions);
    }
    const collectUtterance = (tokens: ISentenceTokens[], intent: string, isTraining: boolean) => {
        const bucket = isTraining ? training : testing;
        // Lazily create the per-intent list the first time an intent is seen.
        bucket[intent] = bucket[intent] || [];
        bucket[intent].push(tokens);
    };
    await gen.datasetFromString(dsl, collectUtterance, importer, currentPath);
    return { training, testing };
}
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Rodrigo Pimentel
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/utils.ts:
--------------------------------------------------------------------------------
// Deep merge objects
// https://gist.github.com/Salakar/1d7137de9cb8b704e48a

/** True for non-null values whose typeof is 'object' and that are not arrays. */
const isObject = (item: any): boolean => item !== null && typeof item === 'object' && !Array.isArray(item);

/** True when `item` is an array. */
const isArray = (item: any): boolean => Array.isArray(item);

/**
 * Recursively merges `source` into `target`, mutating and returning `target`.
 *
 * - array values are concatenated onto the target's value (created as [] when undefined)
 * - plain-object values are merged recursively (target slot created as {} when falsy)
 * - any other value overwrites the target's value
 *
 * When either argument is not a plain object, `target` is returned untouched.
 *
 * @param target object that receives the merged values
 * @param source object whose own enumerable keys are merged in
 * @returns `target`, typed as `T`
 */
export const mergeDeep = <T>(target: any, source: any): T => {
    if (!isObject(target) || !isObject(source)) {
        return target;
    }
    Object.keys(source).forEach(key => {
        // JSON.parse can produce an own "__proto__" key; following it would merge
        // into Object.prototype and pollute every object in the process.
        if (key === '__proto__') {
            return;
        }
        const incoming = source[key];
        if (isArray(incoming)) {
            if (target[key] === undefined) {
                target[key] = [];
            }
            target[key] = target[key].concat(incoming);
        } else if (isObject(incoming)) {
            if (!target[key]) {
                target[key] = {};
            }
            mergeDeep(target[key], incoming);
        } else {
            target[key] = incoming;
        }
    });
    return target;
};
31 |
--------------------------------------------------------------------------------
/web/components/globalStyles.tsx:
--------------------------------------------------------------------------------
1 | import styled, { createGlobalStyle } from 'styled-components';
2 |
// Document-wide reset and base layout rules, injected through styled-components.
// tslint:disable-next-line:no-unused-expression
export const Global: any = createGlobalStyle`
    *, *::after, *::before {
        margin: 0;
        padding: 0;
        box-sizing: border-box;
    }
    :focus { outline: none; }
    h1, h2 { display: inline; font-size: 20px; }
    ::-moz-focus-inner { border: 0; }
    html, body, #app {
        padding: 0;
        margin: 0;
        display: flex;
        flex-direction: column;
        flex: 1;
        height: auto !important;
        font-family: 'Helvetica Neue', Arial, Helvetica, sans-serif;
        background-color: #ccc;
    }
    a { text-decoration: none; }
    body {
        box-sizing: border-box;
        min-height: 100vh;
        background: #ececec;
        padding: 0;
    }
    a:focus, a:active, a:any-link { text-decoration: none; }
`;

// Centered header row; links inside it use the purple accent colors.
export const Header = styled('div')`
    display: flex;
    align-items: center;
    justify-content: center;
    h1,
    h2 {
        display: inline;
    }
    a {
        text-decoration: none;
        color: #990adb;
    }
    a:hover {
        color: #b92afb;
    }
    /* The value must be unquoted: a quoted string is not a valid CSS color and the declaration is dropped. */
    color: #444;
    margin: 20px;
`;
51 |
--------------------------------------------------------------------------------
/src/adapters/luis.ts:
--------------------------------------------------------------------------------
1 | import * as gen from '../main';
2 | import { ISentenceTokens } from '../types';
3 | import * as utils from '../utils';
4 |
/** Location of one slot inside an example's text, expressed as character indexes. */
export interface ILuisEntityLabel {
    entityName: string;
    startCharIndex: number;
    endCharIndex: number;
}

/** One utterance: its full text, the intent it belongs to and the entity labels found in it. */
export interface ILuisExample {
    intentName: string;
    text: string;
    entityLabels: ILuisEntityLabel[];
}

/** Container produced by the adapter for both the training and the testing split. */
export interface ILuisDataset {
    data: ILuisExample[];
}

/** Token lists grouped by intent key. */
export interface ILuisTestingDataset {
    [intent: string]: ISentenceTokens[][];
}
22 |
/**
 * LUIS adapter: turns every generated utterance into an `ILuisExample`.
 *
 * The example text is the concatenation of all token values. Every token of
 * type 'Slot' that has a slot name adds an entity label covering its value.
 *
 * @param dsl chatito source text handed to `gen.datasetFromString`
 * @param formatOptions optional object deep-merged into the training dataset before generation
 * @param importer forwarded unchanged to `gen.datasetFromString`
 * @param currentPath forwarded unchanged to `gen.datasetFromString`
 * @returns the `training` and `testing` datasets
 */
export async function adapter(dsl: string, formatOptions?: any, importer?: gen.IFileImporter, currentPath?: string) {
    const training: ILuisDataset = { data: [] };
    const testing: ILuisDataset = { data: [] };
    if (formatOptions) {
        utils.mergeDeep(training, formatOptions);
    }
    const utteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => {
        const example: ILuisExample = { text: '', intentName: intentKey, entityLabels: [] };
        for (const token of utterance) {
            if (token.type === 'Slot' && token.slot) {
                const startCharIndex = example.text.length;
                example.entityLabels.push({
                    // LUIS labels use the index of the entity's LAST character (inclusive),
                    // hence the -1 relative to start + length.
                    endCharIndex: startCharIndex + token.value.length - 1,
                    entityName: token.slot,
                    startCharIndex
                });
            }
            example.text += token.value;
        }
        const dataset = isTrainingExample ? training : testing;
        dataset.data.push(example);
    };
    await gen.datasetFromString(dsl, utteranceWriter, importer, currentPath);
    return { training, testing };
}
53 |
--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
/** Kinds of token that can appear inside a sentence. */
export type IInnerEntitiesTypes = 'Alias' | 'Slot' | 'Text';

/** A single position in the parsed source. */
export interface IASTLocationProperties {
    offset: number;
    line: number;
    column: number;
}

/** Start and end positions of a parsed node. */
export interface IASTLocation {
    start: IASTLocationProperties;
    end: IASTLocationProperties;
}

/**
 * One token of a sentence.
 * The adapters concatenate `value` to build the utterance text and read
 * `slot` / `synonym` on tokens whose `type` is 'Slot'.
 */
export interface ISentenceTokens {
    type: IInnerEntitiesTypes;
    value: string;
    slot?: string;
    synonym?: string;
    variation?: string | null;
    opt?: boolean;
    args?: Record<string, string>;
    location?: IASTLocation;
}

/** A sentence together with its optional probability and cardinality. */
export interface ISingleSentence {
    sentence: ISentenceTokens[];
    probability: null | string;
    cardinality?: number;
}

/** Top-level node returned by the parser: a definition, a comment or an import. */
export interface IChatitoEntityAST {
    type: 'IntentDefinition' | 'AliasDefinition' | 'SlotDefinition' | 'Comment' | 'ImportFile';
    key: string;
    inner: ISingleSentence[];
    value?: string;
    variation?: string | null;
    args?: Record<string, string>;
    cardinality?: number;
    location?: IASTLocation;
}

/** Shape of the generated parser module. */
export interface IChatitoParser {
    parse: (input: string) => IChatitoEntityAST[];
}

/** AST nodes indexed by a string key. */
export interface IEntityDef {
    [key: string]: IChatitoEntityAST;
}

/** Definitions split by kind. */
export interface IEntities {
    Intent: IEntityDef;
    Slot: IEntityDef;
    Alias: IEntityDef;
}
50 |
/** Statistics kept per cache entry; the arrays are presumably indexed per sentence (confirm in main.ts). */
export interface IStatCache {
    // optional: boolean;
    // optionalCounts: number;
    // totalCounts: number[];
    counts: IChatitoCache[];
    // sumOfTotalMax: number;
    maxCounts: number[];
    probabilities: number[]; // value defined at probability operator
    // realProbabilities: number[]; // actual probability calculated from the max possible utterances
    // utterancesToProvide: number[]; // the actual number of utterances each sentence will provide
    // resetedCounts: boolean;
}

/**
 * Cache of statistics keyed by string.
 * `Map` needs explicit type arguments to compile; the value type is the
 * recursive `IStatCache` (whose `counts` holds nested caches).
 */
export type IChatitoCache = Map<string, IStatCache>;

/** Callback that receives each generated utterance, its intent key and whether it is a training example. */
export type IUtteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => void;
65 |
--------------------------------------------------------------------------------
/web/components/Logo.tsx:
--------------------------------------------------------------------------------
1 | import * as React from 'react';
2 |
3 | export default function Logo() {
4 | return (
5 |
13 | );
14 | }
15 |
--------------------------------------------------------------------------------
/src/adapters/flair.ts:
--------------------------------------------------------------------------------
1 | import { WriteStream } from 'fs';
2 | import * as Tokenizer from 'wink-tokenizer';
3 | import * as gen from '../main';
4 | import { ISentenceTokens } from '../types';
5 |
6 | const tokenizer = new Tokenizer();
7 |
/** Token lists grouped by intent key. */
export interface IDefaultDataset {
    [intent: string]: ISentenceTokens[][];
}

/** The four output streams the flair adapter writes to: classification and NER, each for train and test. */
export interface IFlairWriteStreams {
    trainClassification: WriteStream;
    trainNER: WriteStream;
    testClassification: WriteStream;
    testNER: WriteStream;
}
17 |
// NOTE: Flair adapter uses write streams to text files and requires two different formats
// reference https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md
// E.G:
// npm run generate -- ./examples --format=flair --outputPath=./output --trainingFileName=training.txt --testingFileName=testing.txt
/**
 * Streams every generated utterance to the flair output files.
 *
 * For each utterance it writes one `__label__<intent> <text>` line to the
 * classification stream, then one `<word> <tag>` line per word token to the
 * NER stream, followed by an empty line.
 *
 * @param dsl chatito source text handed to `gen.datasetFromString`
 * @param ws the four write streams (train/test x classification/NER)
 * @param imp forwarded unchanged to `gen.datasetFromString`
 * @param currPath forwarded unchanged to `gen.datasetFromString`
 */
export async function streamAdapter(dsl: string, ws: IFlairWriteStreams, imp?: gen.IFileImporter, currPath?: string) {
    // NOTE: the utteranceWriter is called with each sentence with aliases already replaced,
    // so the sentence tokens can only be text or slot types.
    const utteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => {
        // classification dataset in FastText format
        const classificationText = utterance.map(v => v.value).join('');
        const classificationLabel = intentKey.replace(/\s+/g, '');
        const writeStreamClassif = isTrainingExample ? ws.trainClassification : ws.testClassification;
        writeStreamClassif.write(`__label__${classificationLabel} ${classificationText}\n`);
        // named entity recognition dataset in two column with BIO-annotated NER tags (requires tokenization)
        const writeStreamNER = isTrainingExample ? ws.trainNER : ws.testNER;
        utterance.forEach(v => {
            const wordTokens = tokenizer.tokenize(v.value);
            // Same guard as the other adapters: a Slot token without a slot name is written as plain text.
            if (v.type === 'Slot' && v.slot) {
                // Computed once per slot; toUpperCase keeps the tag identical on every machine locale.
                const slotTag = v.slot.toUpperCase().replace(/\s+/g, '');
                wordTokens.forEach((wt, idx) => {
                    const slotBorI = idx === 0 ? 'B' : 'I';
                    writeStreamNER.write(`${wt.value} ${slotBorI}-${slotTag}\n`);
                });
            } else {
                wordTokens.forEach(wt => writeStreamNER.write(`${wt.value} O\n`));
            }
        });
        writeStreamNER.write('\n'); // always write an extra EOL at the end of sentences
    };
    await gen.datasetFromString(dsl, utteranceWriter, imp, currPath);
}
49 |
--------------------------------------------------------------------------------
/web/pages/index.tsx:
--------------------------------------------------------------------------------
1 | import * as React from 'react';
2 | import Helmet from 'react-helmet';
3 | import Editor from '../components/Editor/Editor';
4 | import { Global, Header } from '../components/globalStyles';
5 | import Logo from '../components/Logo';
6 |
7 | // NOTE: gatsby global for prefix
8 | declare var __PATH_PREFIX__;
9 |
10 | export default class Index extends React.Component<{}, {}> {
11 | public render() {
12 | return (
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | Chatito DSL - Generate dataset for chatbots
23 |
27 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
41 |
42 | helps you generate datasets for natural language understanding models using a simple DSL
43 | Read the docs
44 |
45 |
46 |
47 |
48 |
49 | );
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/adapters/rasa.ts:
--------------------------------------------------------------------------------
1 | import * as gen from '../main';
2 | import { ISentenceTokens } from '../types';
3 | import * as utils from '../utils';
4 |
/** One entity annotation inside an example: its character offsets, value and entity name. */
export interface IRasaEntity {
    entity: string;
    value: string;
    start: number;
    end: number;
}

/** One utterance: its text, its intent and the entities found in it. */
export interface IRasaExample {
    intent: string;
    text: string;
    entities: IRasaEntity[];
}

/** Training dataset layout produced by the adapter. */
export interface IRasaDataset {
    rasa_nlu_data: {
        regex_features: any[];
        entity_synonyms: Array<{ value: string; synonyms: string[] }>;
        common_examples: IRasaExample[];
    };
}

/** Token lists grouped by intent key. */
export interface IRasaTestingDataset {
    [intent: string]: ISentenceTokens[][];
}
27 |
/**
 * Rasa adapter: turns every generated utterance into an `IRasaExample` and
 * collects the synonyms seen on slot tokens.
 *
 * The example text is the concatenation of all token values. Every token of
 * type 'Slot' that has a slot name adds an entity whose `value` is the token's
 * synonym when it has one, otherwise the token's own value. After generation,
 * one `entity_synonyms` entry is written per synonym key.
 *
 * @param dsl chatito source text handed to `gen.datasetFromString`
 * @param formatOptions optional object deep-merged into the training dataset before generation
 * @param importer forwarded unchanged to `gen.datasetFromString`
 * @param currentPath forwarded unchanged to `gen.datasetFromString`
 * @returns the `training` and `testing` datasets
 */
export async function adapter(dsl: string, formatOptions?: any, importer?: gen.IFileImporter, currentPath?: string) {
    const training: IRasaDataset = {
        rasa_nlu_data: {
            regex_features: [],
            entity_synonyms: [],
            common_examples: []
        }
    };
    const testing = { rasa_nlu_data: { common_examples: [] as IRasaExample[] } };
    // Synonym key -> distinct surface values seen for it. Set needs its type argument to compile.
    const synonyms: { [key: string]: Set<string> } = {};
    if (formatOptions) {
        utils.mergeDeep(training, formatOptions);
    }
    const utteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => {
        const example = utterance.reduce(
            (acc, next) => {
                if (next.type === 'Slot' && next.slot) {
                    if (next.synonym) {
                        if (!synonyms[next.synonym]) {
                            synonyms[next.synonym] = new Set<string>();
                        }
                        // The synonym key itself is not listed among its own synonyms.
                        if (next.synonym !== next.value) {
                            synonyms[next.synonym].add(next.value);
                        }
                    }
                    acc.entities.push({
                        end: acc.text.length + next.value.length,
                        entity: next.slot,
                        start: acc.text.length,
                        value: next.synonym ? next.synonym : next.value
                    });
                }
                acc.text += next.value;
                return acc;
            },
            { text: '', intent: intentKey, entities: [] } as IRasaExample
        );
        if (isTrainingExample) {
            training.rasa_nlu_data.common_examples.push(example);
        } else {
            testing.rasa_nlu_data.common_examples.push(example);
        }
    };
    await gen.datasetFromString(dsl, utteranceWriter, importer, currentPath);
    Object.keys(synonyms).forEach(k => {
        training.rasa_nlu_data.entity_synonyms.push({
            synonyms: Array.from(synonyms[k]),
            value: k
        });
    });
    return { training, testing };
}
80 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "chatito",
3 | "version": "2.3.4",
4 | "description": "Generate training datasets for NLU chatbots using a simple DSL",
5 | "bin": {
6 | "chatito": "./dist/bin.js"
7 | },
8 | "main": "./dist/main.js",
9 | "scripts": {
10 | "cleanup": "rm -rf .cache && rm -rf dist/* && find public -maxdepth 1 -not -name public -not -iname '*.png' -not -iname '*.ico' -not -iname '*.xml' -not -iname '*.svg' -not -iname '*.webmanifest' -exec rm -rv {} \\;",
11 | "prettier": "prettier --write '{web,src}/**/*.{ts,tsx}'",
12 | "prepublish": "npm run parser:build && npm run ts",
13 | "parser:build": "pegjs parser/chatito.pegjs",
14 | "ts": "tsc",
15 | "web:build": "npm run cleanup && gatsby build",
16 | "web:start": "gatsby develop",
17 | "web:deploy": "npm run cleanup && gatsby build --prefix-paths && gh-pages -d public",
18 | "generate": "node -r ts-node/register ./src/bin.ts",
19 | "test": "npx jest",
20 | "test:debug": "npm run test:kill && NODE_ENV=TEST node --inspect-brk ./node_modules/jest/bin/jest.js --no-cache --runInBand --forceExit --detectOpenHandles",
21 | "test:kill": "lsof -n -i4TCP:5858 | sed '1 d' | awk '{print $2}' | xargs kill -9",
22 | "lint": "npx tslint -c tslint.json 'src/**/*.ts' 'web/**/*.ts'"
23 | },
24 | "engines": {
25 | "node": ">=8.11.2"
26 | },
27 | "repository": {
28 | "type": "git",
29 | "url": "git+https://github.com/rodrigopivi/Chatito.git"
30 | },
31 | "keywords": [
32 | "nlu",
33 | "natural language processing",
34 | "typescript",
35 | "dataset generation",
36 | "named entity recognition",
37 | "nlp",
38 | "natural language understanding",
39 | "chatbot",
40 | "rasa nlu",
41 | "luis ai",
42 | "snips nlu"
43 | ],
44 | "author": {
45 | "name": "Rodrigo Pimentel",
46 | "url": "https://twitter.com/amaru_muru"
47 | },
48 | "license": "MIT",
49 | "homepage": "https://github.com/rodrigopivi/Chatito",
50 | "dependencies": {
51 | "chance": "1.0.18",
52 | "minimist": "1.2.0",
53 | "wink-tokenizer": "5.2.1"
54 | },
55 | "jest": {
56 | "transform": {
57 | "^.+\\.tsx?$": "ts-jest"
58 | },
59 | "testRegex": "(/src/tests/.*|(\\.|/)(test|spec))\\.(tsx?)$",
60 | "moduleFileExtensions": [
61 | "ts",
62 | "tsx",
63 | "js",
64 | "jsx",
65 | "json",
66 | "node"
67 | ],
68 | "collectCoverage": true,
69 | "coveragePathIgnorePatterns": [
70 | "/node_modules/",
71 | "/dist/",
72 | "/src/tests/",
73 | "/parser/"
74 | ]
75 | },
76 | "devDependencies": {
77 | "@babel/core": "7.4.5",
78 | "@types/chance": "1.0.5",
79 | "@types/file-saver": "2.0.1",
80 | "@types/jest": "24.0.15",
81 | "@types/node": "12.0.10",
82 | "@types/react": "16.8.22",
83 | "@types/react-dom": "16.8.4",
84 | "@types/react-helmet": "5.0.8",
85 | "@types/react-router-dom": "4.3.4",
86 | "@types/wink-tokenizer": "4.0.0",
87 | "babel-loader": "8.0.6",
88 | "babel-plugin-import": "1.12.0",
89 | "babel-plugin-styled-components": "1.10.6",
90 | "codeflask": "1.4.1",
91 | "core-js": "3.1.4",
92 | "file-saver": "2.0.2",
93 | "gatsby": "2.12.0",
94 | "gatsby-link": "2.2.0",
95 | "gatsby-plugin-react-helmet": "3.1.0",
96 | "gatsby-plugin-styled-components": "3.1.0",
97 | "gatsby-plugin-typescript": "2.1.0",
98 | "gh-pages": "2.0.1",
99 | "jest": "24.8.0",
100 | "pegjs": "0.10.0",
101 | "prettier": "1.18.2",
102 | "react": "16.8.6",
103 | "react-dom": "16.8.6",
104 | "react-helmet": "5.2.1",
105 | "react-json-view": "1.19.1",
106 | "react-router-dom": "5.0.1",
107 | "regenerator-runtime": "0.13.2",
108 | "styled-components": "4.3.2",
109 | "ts-jest": "24.0.2",
110 | "ts-node": "8.3.0",
111 | "tslint": "5.18.0",
112 | "tslint-config-prettier": "1.18.0",
113 | "tslint-plugin-prettier": "2.0.1",
114 | "typescript": "3.5.2"
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/examples/dateBooking_large.chatito:
--------------------------------------------------------------------------------
1 | %[bookRestaurantsAtDatetime]('training': '1000', 'testing': '100')
2 | ~[find?] ~[some?] ~[restaurants] ~[available?] from @[bookTime] to @[bookTime]
3 | ~[find?] ~[some?] ~[restaurants] ~[available?] from @[bookTime] to @[bookTime] on @[bookDate]
4 |
5 | ~[find]
6 | show
7 | look for
8 | search
9 | show me
10 | find
11 | identify
12 | book
13 |
14 | ~[some]
15 | all
16 | any
17 | some
18 |
19 | ~[available]
20 | available
21 |
22 | ~[restaurants]
23 | restaurants
24 | places where to eat
25 | buffets
26 | sushi
27 | food courts
28 |
29 | @[bookTime]
30 | ~[12hour] ~[timePeriods?]
31 | ~[12hour]~[:]~[minute] ~[timePeriods?]
32 | ~[24hour]~[:]~[minute]
33 |
34 | @[bookDate]
35 | ~[monthNames] ~[monthDays]
36 | ~[monthDays] of ~[monthNames]
37 | ~[monthDayNumbers]/~[monthNumbers]
38 | today
39 | tomorrow
40 | next ~[weekDays]
41 |
42 | ~[:]
43 | :
44 |
45 | ~[timePeriods]
46 | am
47 | pm
48 |
49 |
50 | ~[monthNames]
51 | January
52 | February
53 | March
54 | April
55 | May
56 | June
57 | July
58 | August
59 | September
60 | October
61 | November
62 | December
63 |
64 | ~[monthNumbers]
65 | 1
66 | 2
67 | 3
68 | 4
69 | 5
70 | 6
71 | 7
72 | 8
73 | 9
74 | 10
75 | 11
76 | 12
77 |
78 |
79 | ~[st]
80 | st
81 |
82 | ~[nd]
83 | nd
84 |
85 | ~[rd]
86 | rd
87 |
88 | ~[th]
89 | th
90 |
91 | ~[weekDays]
92 | Monday
93 | Tuesday
94 | Wednesday
95 | Thursday
96 | Friday
97 | Saturday
98 | Sunday
99 | Mon
100 | Tue
101 | Wed
102 | Thu
103 | Fri
104 | Sat
105 | Sun
106 |
107 | ~[monthDayNumbers]
108 | 1
109 | 2
110 | 3
111 | 4
112 | 5
113 | 6
114 | 7
115 | 8
116 | 9
117 | 10
118 | 11
119 | 12
120 | 13
121 | 14
122 | 15
123 | 16
124 | 17
125 | 18
126 | 19
127 | 20
128 | 21
129 | 22
130 | 23
131 | 24
132 | 25
133 | 26
134 | 27
135 | 28
136 | 29
137 | 30
138 | 31
139 |
140 | ~[monthDays]
141 | 1~[st?]
142 | 2~[nd?]
143 | 3~[rd?]
144 | 4~[th?]
145 | 5~[th?]
146 | 6~[th?]
147 | 7~[th?]
148 | 8~[th?]
149 | 9~[th?]
150 | 10~[th?]
151 | 11~[th?]
152 | 12~[th?]
153 | 13~[th?]
154 | 14~[th?]
155 | 15~[th?]
156 | 16~[th?]
157 | 17~[th?]
158 | 18~[th?]
159 | 19~[th?]
160 | 20~[th?]
161 | 21~[st?]
162 | 22~[nd?]
163 | 23~[rd?]
164 | 24~[th?]
165 | 25~[th?]
166 | 26~[th?]
167 | 27~[th?]
168 | 28~[th?]
169 | 29~[th?]
170 | 30~[th?]
171 | 31~[st?]
172 |
173 | ~[12hour]
174 | 0
175 | 1
176 | 2
177 | 3
178 | 4
179 | 5
180 | 6
181 | 7
182 | 8
183 | 9
184 | 10
185 | 11
186 | 12
187 |
188 | ~[24hour]
189 | 0
190 | 1
191 | 2
192 | 3
193 | 4
194 | 5
195 | 6
196 | 7
197 | 8
198 | 9
199 | 10
200 | 11
201 | 12
202 | 13
203 | 14
204 | 15
205 | 16
206 | 17
207 | 18
208 | 19
209 | 20
210 | 21
211 | 22
212 | 23
213 |
214 | ~[minute]
215 | 00
216 | 01
217 | 02
218 | 03
219 | 04
220 | 05
221 | 06
222 | 07
223 | 08
224 | 09
225 | 0
226 | 1
227 | 2
228 | 3
229 | 4
230 | 5
231 | 6
232 | 7
233 | 8
234 | 9
235 | 10
236 | 11
237 | 12
238 | 13
239 | 14
240 | 15
241 | 16
242 | 17
243 | 18
244 | 19
245 | 20
246 | 21
247 | 22
248 | 23
249 | 24
250 | 25
251 | 26
252 | 27
253 | 28
254 | 29
255 | 30
256 | 31
257 | 32
258 | 33
259 | 34
260 | 35
261 | 36
262 | 37
263 | 38
264 | 39
265 | 40
266 | 41
267 | 42
268 | 43
269 | 44
270 | 45
271 | 46
272 | 47
273 | 48
274 | 49
275 | 50
276 | 51
277 | 52
278 | 53
279 | 54
280 | 55
281 | 56
282 | 57
283 | 58
284 | 59
--------------------------------------------------------------------------------
/src/adapters/snips.ts:
--------------------------------------------------------------------------------
1 | import * as gen from '../main';
2 | import { ISentenceTokens } from '../types';
3 | import * as utils from '../utils';
4 |
5 | export interface ISnipsUtteranceData { // One chunk of an utterance; adapter() fills entity/slot_name only for Slot tokens
6 | text: string;
7 | entity?: string;
8 | slot_name?: string;
9 | }
10 | export interface ISnipsUtterance { // A complete utterance expressed as its list of chunks
11 | data: ISnipsUtteranceData[];
12 | }
13 | export interface ISnipsIntent { // All training utterances collected for one intent
14 | utterances: ISnipsUtterance[];
15 | }
16 | export interface ISnipsDataset { // Training dataset built by adapter(): intents and entities keyed by name, plus a language code
17 | intents: { [intentKey: string]: ISnipsIntent };
18 | entities: {
19 | [entityKey: string]: {
20 | data?: Array<{ value: string; synonyms: string[] }>; // adapter() pushes one entry per synonym key, listing the differing values seen for it
21 | use_synonyms?: boolean;
22 | automatically_extensible?: boolean;
23 | };
24 | };
25 | language: string;
26 | }
27 |
28 | export interface ISnipsTestingDataset { // Testing examples per intent, stored as the raw sentence-token arrays
29 | [intent: string]: ISentenceTokens[][];
30 | }
31 |
32 | export async function adapter(dsl: string, formatOptions?: any, importer?: gen.IFileImporter, currentPath?: string) { // Generates examples from a Chatito DSL string and resolves to { training, testing } in the Snips shapes declared above
33 | const training: ISnipsDataset = { language: 'en', entities: {}, intents: {} };
34 | const testing: ISnipsTestingDataset = {};
35 | if (formatOptions) {
36 | utils.mergeDeep(training, formatOptions); // fold caller-supplied options into the default dataset (merge semantics live in ../utils)
37 | }
38 | const synonymsForSlots: { // entity name -> synonym key -> set of alternative values seen for that key
39 | [slot: string]: { [key: string]: Set };
40 | } = {};
41 | // const slots: Set = new Set();
42 | const entities: Set = new Set(); // every entity name referenced by a training slot
43 | const utteranceWriter = (utterance: ISentenceTokens[], intentKey: string, isTrainingExample: boolean) => { // callback handed to gen.datasetFromString; routes each utterance to training or testing
44 | if (isTrainingExample) {
45 | if (!training.intents[intentKey]) {
46 | training.intents[intentKey] = { utterances: [] };
47 | }
48 | const data = utterance.map(u => {
49 | const ret: ISnipsUtteranceData = { text: u.value };
50 | if (u.type === 'Slot' && u.slot) { // only slot tokens carry slot_name/entity; other tokens stay plain text
51 | ret.slot_name = u.slot;
52 | if (u.args) {
53 | Object.keys(u.args).forEach(key => {
54 | if (u.args && key === 'entity') { // an explicit 'entity' argument overrides the entity name
55 | entities.add(u.args[key]);
56 | ret.entity = u.args[key];
57 | }
58 | });
59 | }
60 | if (!ret.entity) { // no explicit entity: the slot name doubles as the entity name
61 | ret.entity = u.slot;
62 | entities.add(u.slot);
63 | }
64 | if (u.synonym && ret.entity) {
65 | if (!synonymsForSlots[ret.entity]) {
66 | synonymsForSlots[ret.entity] = {};
67 | }
68 | const synonyms = synonymsForSlots[ret.entity];
69 | if (!synonyms[u.synonym]) {
70 | synonyms[u.synonym] = new Set();
71 | }
72 | if (u.synonym !== u.value) { // the synonym key itself is never listed among its own synonyms
73 | synonyms[u.synonym].add(u.value);
74 | }
75 | }
76 | }
77 | return ret;
78 | });
79 | training.intents[intentKey].utterances.push({ data });
80 | } else { // testing examples are stored as the untouched token arrays
81 | if (!testing[intentKey]) {
82 | testing[intentKey] = [];
83 | }
84 | testing[intentKey].push(utterance);
85 | }
86 | };
87 | await gen.datasetFromString(dsl, utteranceWriter, importer, currentPath);
88 | entities.forEach(slotKey => { // after generation, make sure every referenced entity has a definition
89 | if (!synonymsForSlots[slotKey]) { // no synonyms recorded: an empty definition is enough (an existing one is kept)
90 | if (!training.entities[slotKey]) {
91 | training.entities[slotKey] = {};
92 | }
93 | return;
94 | }
95 | Object.keys(synonymsForSlots[slotKey]).forEach(synonymsValue => { // one data entry per synonym key
96 | if (!training.entities[slotKey]) {
97 | training.entities[slotKey] = {};
98 | }
99 | training.entities[slotKey].use_synonyms = true;
100 | training.entities[slotKey].automatically_extensible = true;
101 | if (!training.entities[slotKey].data) {
102 | training.entities[slotKey].data = [];
103 | }
104 | const slotSynonymsSet = synonymsForSlots[slotKey][synonymsValue];
105 | const synonymsList = slotSynonymsSet.size ? Array.from(slotSynonymsSet) : [];
106 | (training.entities[slotKey].data as any[]).push({
107 | synonyms: synonymsList,
108 | value: synonymsValue
109 | });
110 | });
111 | });
112 | return { training, testing };
113 | }
114 |
--------------------------------------------------------------------------------
/web/lib/editorConfig.ts:
--------------------------------------------------------------------------------
1 | import * as rasaAdapter from '../../src/adapters/rasa';
2 | import * as snipsAdapter from '../../src/adapters/snips';
3 | import * as webAdapter from '../../src/adapters/web';
4 |
5 | const findRestaurantsByCity = `import ./common.chatito
6 |
7 | # Ways to request a restaurant within a location (using probability operator)
8 | # NOTE: 60% of the examples should come from the first sentence, and 40% from the second
9 |
10 | %[findRestaurantsByCity]('training': '100', 'testing': '100')
11 | *[60%] ~[hi?] ~[please?] ~[find?] ~[restaurants] ~[located at] @[city] ~[city?] ~[thanks?]
12 | *[40%] ~[restaurants] ~[located at] @[city]
13 |
14 | @[city]
15 | ~[new york]
16 | ~[san francisco]
17 | ~[atlanta]
18 |
19 | ~[find]
20 | find
21 | i'm looking for
22 | help me find
23 |
24 | ~[located at]
25 | located at
26 | in the area of
27 | near
28 |
29 | ~[restaurants]
30 | restaurants
31 | places to eat
32 | where to eat
33 | `;
34 |
35 | const affirmative = `// Ways to say yes
36 |
37 | import ./common.chatito
38 |
39 | %[affirmative]('training': '50', 'testing': '50')
40 | *[20%] ~[yes]
41 | ~[yes] ~[please?]
42 | ~[yes] ~[yes?] ~[thanks?]
43 | ~[yes?] ~[that is good] ~[yes?]
44 |
45 | ~[yes]
46 | yes
47 | right
48 | affirmative
49 | agreed
50 | correct
51 | yep
52 | yes sir
53 | sounds good
54 | im ok with that
55 |
56 | ~[that is good]
57 | that is good
58 | i want that
59 | that is fine
60 | that is correct
61 | that is what i want
62 | you understood me
63 | that is right
64 | its fine
65 | good
66 | `;
67 |
68 | const bye = `// Ways to say goodbye
69 |
70 | import ./common.chatito
71 |
72 | %[bye]('training': '50', 'testing': '50')
73 | *[20%] ~[bye]
74 | ~[thanks?] ~[bye]
75 | ~[bye] ~[thanks?]
76 | ~[leaving]
77 | ~[leaving] ~[bye]
78 |
79 | ~[bye]
80 | bye
81 | goodbye
82 | ttyl
83 | gtg
84 | adios
85 | farewell
86 | adieu
87 | chao
88 | chau
89 |
90 | ~[leaving]
91 | leaving
92 | talk to you soon
93 | have to go
94 | got to go
95 | talk to you later
96 | heading out
97 | im leaving now
98 | going out
99 | `;
100 |
101 | const greet = `// Ways to say hello
102 |
103 | import ./common.chatito
104 |
105 | %[greet]('training': '50', 'testing': '50')
106 | *[20%] ~[hi]
107 | ~[greetings]
108 | ~[hi] ~[greetings?]
109 | ~[hi] ~[whats up]
110 | ~[greetings] ~[whats up]
111 | ~[hi] ~[greetings] ~[whats up]
112 |
113 | ~[greetings]
114 | greetings
115 | good morning
116 | good afternoon
117 | good day
118 | good night
119 | morning
120 |
121 | ~[whats up]
122 | how are you
123 | whats up
124 | how are you doing
125 | how is it going
126 | are you there
127 | how are things going
128 | are you around
129 | whatsup
130 | sup
131 | are you around?
132 | `;
133 |
134 | const negative = `// Ways to say no
135 |
136 | import ./common.chatito
137 |
138 | %[negative]('training': '50', 'testing': '50')
139 | *[20%] ~[no]
140 | ~[no] ~[please?] ~[its not ok?]
141 | ~[please?] ~[no] ~[its not ok?]
142 | *[20%] ~[its not ok]
143 |
144 | ~[no]
145 | no
146 | nope
147 | not really
148 | that's not right
149 | incorrect
150 | don't do that
151 |
152 | ~[its not ok]
153 | i don't want that
154 | didnt meant that
155 | dont mean that
156 | that's not what i want
157 | that's not correct
158 | that's wrong
159 | it's not good
160 | that is wrong
161 | its not ok
162 | its not correct
163 | `;
164 |
165 | const common = `// Common entities to be imported and reused
166 | ~[hi]
167 | hi
168 | hello
169 | hey
170 |
171 | ~[please]
172 | please
173 | plz
174 | pls
175 |
176 | ~[thanks]
177 | thanks
178 | thank you
179 |
180 | `;
181 |
182 | export const tabs = [ // Pairs each sample DSL string declared earlier in this file with the file name used as its title
183 | { title: 'findRestaurantsByCity.chatito', value: findRestaurantsByCity },
184 | { title: 'greet.chatito', value: greet },
185 | { title: 'bye.chatito', value: bye },
186 | { title: 'affirmative.chatito', value: affirmative },
187 | { title: 'negative.chatito', value: negative },
188 | { title: 'common.chatito', value: common } // the other samples pull this one in through `import ./common.chatito`
189 | ];
190 |
191 | export const chatitoPrism = { // Token name -> pattern table for highlighting Chatito source; presumably consumed as a Prism grammar (confirm in Editor.tsx)
192 | comments: [{ pattern: /^(\/\/|\#).*/, greedy: true }, { pattern: /((\n|\r\n)+)(\/\/|\#).*/, greedy: true }], // two variants: at the very start of the text, or right after line breaks
193 | imports: [{ pattern: /(\n|\r\n)import\s/, greedy: true }, { pattern: /^import\s/, greedy: true }],
194 | intentDefinition: [ // %[name] optionally followed by "(...)"; `inside` tags that trailing argument list separately
195 | {
196 | pattern: /^%\[[^\]]+\]((\(.+\))?)/,
197 | inside: { intentArguments: /((\(.+\))?)$/ }
198 | },
199 | {
200 | pattern: /((\n|\r\n)+)%\[[^\]]+\]((\(.+\))?)/,
201 | inside: { intentArguments: /((\(.+\))?)$/ }
202 | }
203 | ],
204 | slotDefinition: [ // same shape as intentDefinition, for @[name]
205 | {
206 | pattern: /^\@\[[^\]]+\]((\(.+\))?)/,
207 | inside: { slotArguments: /((\(.+\))?)$/ }
208 | },
209 | {
210 | pattern: /((\n|\r\n)+)\@\[[^\]]+\]((\(.+\))?)/,
211 | inside: { slotArguments: /((\(.+\))?)$/ }
212 | }
213 | ],
214 | aliasDefinition: [ // same shape as intentDefinition, for ~[name]
215 | {
216 | pattern: /^~\[[^\]]+\]((\(.+\))?)/,
217 | inside: { aliasArguments: /((\(.+\))?)$/ }
218 | },
219 | {
220 | pattern: /((\n|\r\n)+)~\[[^\]]+\]((\(.+\))?)/,
221 | inside: { aliasArguments: /((\(.+\))?)$/ }
222 | }
223 | ],
224 | probability: { pattern: /(\n|\r\n)\s\s\s\s\*\[[^\]]+\]/, greedy: true }, // *[...] only directly after a line break plus exactly four whitespace characters
225 | slot: { pattern: /\@\[[^\]]+(\?)?\]/, greedy: true }, // @[...] anywhere, i.e. references inside sentences
226 | alias: { pattern: /~\[[^\]]+(\?)?\]/, greedy: true }, // ~[...] anywhere, i.e. references inside sentences
227 | default: { pattern: /[^\r\n]/i, greedy: true } // any single character that is not a line break
228 | };
229 |
230 | export const webDefaultOptions: webAdapter.IDefaultDataset = {}; // empty starting dataset for the default (web) adapter
231 | export const rasaDefaultOptions: rasaAdapter.IRasaDataset = { // empty Rasa dataset skeleton
232 | rasa_nlu_data: {
233 | regex_features: [],
234 | entity_synonyms: [],
235 | common_examples: []
236 | }
237 | };
238 | export const snipsDefaultOptions: snipsAdapter.ISnipsDataset = { language: 'en', entities: {}, intents: {} }; // same initial value the snips adapter starts from
239 |
--------------------------------------------------------------------------------
/public/safari-pinned-tab.svg:
--------------------------------------------------------------------------------
1 |
2 |
4 |
95 |
--------------------------------------------------------------------------------
/parser/chatito.pegjs:
--------------------------------------------------------------------------------
1 | { var STEP = 4; var level = 0; var entry = false; }
2 |
3 | Start = (ImportFile/TopLevelStatement/CommentLine)+
4 | TopLevelStatement = od:(IntentDefinition/SlotDefinition/AliasDefinition) { return od; }
5 |
6 | // ============= Probability operator =============
7 | ProbabilityOperatorDefinition = "*[" probability:Number percent:"%"? "]" { return `${probability}${percent || ''}`; }
8 | // ============= Entities =============
9 | EntityOpt = "?"
10 | EntityBody = "[" value:EntityKeywordLiteral "]" { return value }
11 | EntityOptionalBody = "[" value:EntityKeywordLiteral opt:EntityOpt? "]"
12 | { return { value: value, opt: !!opt }; }
13 |
14 | // Intent
15 | EntityIntentDefinition = "%" value:EntityBody args:EntityArguments?
16 | { return { value: value, type: "IntentDefinition", args: args, location: location() } }
17 | // Intents allow any text except end of lines, alias and slot definitions (because they are parsed as another value)
18 | AnyTextWithSlotAndAlias = v:(t:((!"\r\n")(!"\n")(!"~[")(!"@[") .) { return t.join(""); })+ { return v.join(""); }
19 | IntentAndSlotKeywordLiterals = value:AnyTextWithSlotAndAlias { return { value: value, type: "Text" }}
20 | IntentAndSlotValidInner = (OptionalSlot/OptionalAlias/IntentAndSlotKeywordLiterals)+
21 | IntentAndSlotInnerStatements = IntentAndSlotInnerStatement+
22 | IntentAndSlotInnerStatement = Samedent p:ProbabilityOperatorDefinition? s:IntentAndSlotValidInner EOS
23 | { return { sentence: s, probability: p }; }
24 | IntentDefinition = EOL? o:EntityIntentDefinition EOL
25 | Indent s:IntentAndSlotInnerStatements Dedent
26 | { return { type: o.type, key: o.value, args: o.args, location: o.location, inner: s } }
27 |
28 | // Slot
29 | SlotVariationStartDefinition = "#"
30 | SlotVariationDefinition = SlotVariationStartDefinition id:SlotKeywordLiteral { return id }
31 | EntitySlotDefinition = "@[" value:SlotKeywordLiteral variation:SlotVariationDefinition? "]" args:EntityArguments?
32 | { return { value: value, type: "SlotDefinition", variation: variation, args: args, location: location() } }
33 | SlotOptionalBody = "[" value:SlotKeywordLiteral variation:SlotVariationDefinition? opt:EntityOpt? "]"
34 | { return { value: value, opt: !!opt, variation: variation }; }
35 | OptionalSlot = "@" op:SlotOptionalBody
36 | { return { value: op.value, type: "Slot", opt: op.opt, location: location(), variation: op.variation } }
37 | // Slots allow any text except end of lines and alias definitions (because they are parsed as another value)
38 | AnyTextWithAlias = v:(t:((!"\r\n")(!"\n")(!"~[") .) { return t.join(""); })+ { return v.join(""); }
39 | SlotKeywordLiterals = value:AnyTextWithAlias { return { value: value, type: "Text" }}
40 | SlotValidInner = (OptionalAlias/SlotKeywordLiterals)+
41 | SlotInnerStatement = Samedent p:ProbabilityOperatorDefinition? s:SlotValidInner EOS { return { sentence: s, probability: p }; }
42 | SlotInnerStatements = SlotInnerStatement+
43 | SlotDefinition = EOL? o:EntitySlotDefinition EOL
44 | Indent s:SlotInnerStatements Dedent
45 | { return { type: o.type, key: o.value, args: o.args, location: o.location, inner: s, variation: o.variation } }
46 |
47 | // Alias
48 | EntityAliasDefinition = "~" value:EntityBody args:EntityArguments?
49 | { return { value: value, type: "AliasDefinition", location: location(), args: args } }
50 | OptionalAlias = "~" op:EntityOptionalBody { return { value: op.value, type: "Alias", opt: op.opt } }
51 | AliasDefinition = EOL? o:EntityAliasDefinition EOL
52 | Indent s:IntentAndSlotInnerStatements Dedent
53 | { return { type: o.type, key: o.value, location: o.location, inner: s, args: o.args } }
54 |
55 | // ============= Indentation =============
56 | Samedent "correct indentation" = s:" "* &{ return s.length === level * STEP; } // matches leading spaces only when their count equals the current level times STEP (STEP is 4 in the initializer)
57 | Indent = &{ level++; return true; } // consumes nothing; always succeeds and raises the expected nesting level
58 | Dedent = &{ level--; return true; } // consumes nothing; always succeeds and lowers the expected nesting level
59 |
60 | // ============= Primitives =============
61 | AnyTextWithoutEOL = v:(t:((!"\r\n")(!"\n") .) { return t.join(""); })+ { return v.join(""); }
62 | DoubleSlashCommentLine = EOL? "//" c:AnyTextWithoutEOL EOS? { return { type: "Comment" , value: c.trim() }; }
63 | HashCommentLine = EOL? "#" c:AnyTextWithoutEOL EOS? { return { type: "Comment" , value: c.trim() }; }
64 | CommentLine = (DoubleSlashCommentLine/HashCommentLine)
65 |
66 | ImportFile = EOL? "import " s:AnyTextWithoutEOL EOS? { return { type: "ImportFile", value: s.trim() }; }
67 |
68 | // KeywordLiteral "word" = v:([a-zA-Z0-9_ \:\+]+) { return v.join(""); }
69 | BasicKeywordLiteral "entity name" = v:(t:((!"\r\n")(!"\n")(!"]") .) { return t.join(""); })+ { return v.join(""); }
70 | EntityKeywordLiteral "entity name" = v:(t:((!"\r\n")(!"\n")(!"]")(!"?") .) { return t.join(""); })+ { return v.join(""); }
71 | SlotKeywordLiteral "entity name" = v:(t:((!"\r\n")(!"\n")(!"#")(!"]")(!"?") .) { return t.join(""); })+ { return v.join(""); }
72 |
73 | // Number
74 | Number "number" = int frac? { return parseFloat(text()); }
75 | DecimalPoint = "."
76 | Digit1_9 = [1-9]
77 | Digit0_9 = [0-9]
78 | frac = DecimalPoint Digit0_9+
79 | int = zero / (Digit1_9 Digit0_9*)
80 | zero = "0"
81 |
82 | EOS "end of sentence" = EOL / EOF
83 | EOL "end of line "= (EOLNonWindows/EOLWindows)+
84 | EOLNonWindows "non windows end of line" = "\n"
85 | EOLWindows "windows end of line" = "\r\n"
86 | EOF = !.
87 |
88 | // ============= Entity arguments =============
89 | EntityArguments = "(" args:(EntityArg)+ ")" {
90 | return args.reduce(function (prev, curr) { prev[curr.key] = curr.value; return prev; }, {});
91 | }
92 | EntityArg = (" "*)? ek:ArgumentKeyValueString (" "*)? ":" (" "*)? ev:ArgumentKeyValueString ((" "*)? ",")? (" "*)? { return { key: ek, value: ev }; }
93 | // EntityValidKeyOrValue = v:(t:((!"\r\n")(!"\n")(!"=")(!",")(!")")(!"(") .) { return t.join(""); })+ { return v.join(""); }
94 | // Based on the JSON parser example at https://github.com/pegjs/pegjs/blob/master/examples/json.pegjs
95 | ArgumentKeyValueString
96 | = '"' chars:DoubleQuotedString* '"' { return chars.join(''); }
97 | / "'" chars:SingleQuotedString* "'" { return chars.join(''); }
98 | DoubleQuotedString
99 | = !('"' / "\\" / "\n") char:. { return char; }
100 | / "\\" sequence:StringEscapedChars { return sequence; }
101 | SingleQuotedString
102 | = !("'" / "\\" / "\n") char:. { return char; }
103 | / "\\" sequence:StringEscapedChars { return sequence; }
104 | StringEscapedChars
105 | = "'"
106 | / '"'
107 | / "\\"
108 | / "b" { return "\b"; }
109 | / "f" { return "\f"; }
110 | / "n" { return "\n"; }
111 | / "r" { return "\r"; }
112 | / "t" { return "\t"; }
113 | / "v" { return "\x0B"; }
114 | / "u" digits:$(HEXDIG HEXDIG HEXDIG HEXDIG) { return String.fromCharCode(parseInt(digits, 16)); }
115 | HEXDIG = [0-9a-f]i
--------------------------------------------------------------------------------
/web/components/Editor/editorStyles.tsx:
--------------------------------------------------------------------------------
1 | import styled from 'styled-components';
2 |
3 | export const AlertNotification = styled('div')`
4 | width: 100%;
5 | background-color: ${({ state }: { state: 'error' | 'warning' | 'success' }) =>
6 | state === 'error' ? '#c80000' : state === 'warning' ? '#7f8000' : '#008800'};
7 | bottom: 0;
8 | margin: auto;
9 | right: 0;
10 | text-align: center;
11 | padding: 12px;
12 | color: white;
13 | z-index: 99;
14 | font-size: 14px;
15 | `;
16 |
17 | export const CodeStyles = styled('div')`
18 | white-space: pre-wrap;
19 | position: relative;
20 | margin: auto;
21 | width: inherit;
22 | height: calc(100vh - 210px) !important;
23 | min-height: 400px;
24 | background-color: #282a35;
25 | > .codeflask {
26 | background-color: #282a35;
27 | > textarea.codeflask__textarea {
28 | color: #282a35;
29 | caret-color: #fff;
30 | }
31 | &.codeflask--has-line-numbers {
32 | :before {
33 | background-color: #3c3c4c;
34 | }
35 | > pre {
36 | width: auto !important;
37 | }
38 | div.codeflask__lines {
39 | z-index: 3;
40 | height: auto !important;
41 | padding: 10px 4px 0 0;
42 | > .codeflask__lines__line {
43 | color: #6473a0;
44 | background-color: #3c3c4c;
45 | }
46 | }
47 | }
48 | *::-webkit-scrollbar {
49 | width: 10px;
50 | height: 10px;
51 | }
52 | *::-webkit-scrollbar-thumb {
53 | background-color: #7c7c9c;
54 | box-shadow: inset 0 0 2px rgba(0, 0, 0, 0.8);
55 | }
56 | *::-webkit-scrollbar-track {
57 | box-shadow: inset 0 0 2px rgba(0, 0, 0, 0.8);
58 | }
59 | *::-webkit-scrollbar-corner {
60 | background-color: transparent;
61 | }
62 | }
63 | .token.imports {
64 | color: #f7717d;
65 | }
66 | .token.comments {
67 | color: #999;
68 | }
69 | .token.intentDefinition {
70 | color: #ef82c3;
71 | }
72 | .token.slotDefinition {
73 | color: #ffaf56;
74 | }
75 | .token.aliasDefinition {
76 | color: #a0e7fb;
77 | }
78 | .token.probability {
79 | color: #00f0b5;
80 | }
81 | .token.slot {
82 | color: #ffaf56;
83 | }
84 | .token.alias {
85 | color: #a0e7fb;
86 | }
87 | .token.default {
88 | color: #e2e2dd;
89 | }
90 | .token.intentArguments {
91 | color: #b5669e;
92 | }
93 | .token.slotArguments {
94 | color: #7a9d98;
95 | }
96 | .token.aliasArguments {
97 | color: #80c7db;
98 | }
99 | `;
100 |
101 | export const TabButton = styled('div')`
102 | cursor: pointer;
103 | display: inline-block;
104 | background-color: ${({ active }: { active: boolean }) => (active ? '#282A35' : '#3c3c4c')};
105 | font-size: 12px;
106 | color: #ededed;
107 | padding: 13px 3px 13px 13px;
108 | border-right: 1px solid #2c2c3c;
109 | zoom: 1;
110 | -webkit-touch-callout: none;
111 | -webkit-user-select: none;
112 | -moz-user-select: none;
113 | -ms-user-select: none;
114 | user-select: none;
115 | `;
116 |
117 | export const CloseTab = styled('div')`
118 | :after {
119 | content: 'x';
120 | }
121 | padding: 8px;
122 | margin-left: 8px;
123 | display: inline-block;
124 | color: #ccf;
125 | line-height: 10px;
126 | font-size: 14px;
127 | cursor: pointer;
128 | font-weight: bold;
129 | `;
130 |
131 | export const EditorHeader = styled('div')`
132 | display: flex;
133 | flex-direction: row;
134 | width: 100%;
135 | max-width: 100%;
136 | background-color: #3c3c4c;
137 | padding-left: 40px;
138 | padding-top: 10px;
139 | `;
140 |
141 | export const TabsAreaButton = styled('button')`
142 | cursor: pointer;
143 | background-color: #6c1de2;
144 | font-size: 12px;
145 | color: #efefef;
146 | line-height: 14px;
147 | padding: 8px 24px;
148 | white-space: nowrap;
149 | margin: auto 10px;
150 | border-radius: 4px;
151 | border-color: #333;
152 | -webkit-transition: 0.25s ease;
153 | -moz-transition: 0.25s ease;
154 | -o-transition: 0.25s ease;
155 | transition: 0.25s ease;
156 | &:first-of-type {
157 | margin-left: 20px;
158 | }
159 | :disabled {
160 | border: 1px solid #999999;
161 | background-color: #cccccc;
162 | color: #666666;
163 | }
164 | `;
165 |
166 | export const TabsArea = styled('div')`
167 | width: auto;
168 | max-width: 100%;
169 | white-space: nowrap;
170 | position: relative;
171 | overflow-x: auto;
172 | overflow-y: hidden;
173 | -webkit-overflow-scrolling: touch;
174 | &::-webkit-scrollbar {
175 | height: 6px;
176 | }
177 | &::-webkit-scrollbar-thumb {
178 | background-color: #7c7c9c;
179 | -webkit-box-shadow: inset 0 0 2px rgba(0, 0, 0, 0.8);
180 | }
181 | &::-webkit-scrollbar-track {
182 | -webkit-box-shadow: inset 0 0 2px rgba(0, 0, 0, 0.8);
183 | }
184 | *::-webkit-scrollbar-corner {
185 | background-color: transparent;
186 | }
187 | `;
188 |
189 | export const EditorWrapper = styled('div')`
190 | width: 90vw;
191 | overflow: auto;
192 | margin: auto;
193 | position: relative;
194 | -webkit-box-shadow: 0px 0px 36px 2px rgba(0, 0, 0, 0.63);
195 | -moz-box-shadow: 0px 0px 36px 2px rgba(0, 0, 0, 0.63);
196 | box-shadow: 0px 0px 36px 2px rgba(0, 0, 0, 0.63);
197 | `;
198 |
199 | export const Drawer = styled('div')`
200 | z-index: 99;
201 | position: absolute;
202 | background-color: #352252;
203 | -webkit-box-shadow: -5px 0px 5px -5px rgba(0, 0, 0, 0.55);
204 | -moz-box-shadow: -5px 0px 5px -5px rgba(0, 0, 0, 0.55);
205 | box-shadow: -5px 0px 5px -5px rgba(0, 0, 0, 0.55);
206 | top: 0;
207 | right: 0;
208 | max-width: 700px;
209 | height: 100%;
210 | width: ${({ showDrawer }: { showDrawer: boolean }) => (showDrawer ? `100%` : `0px`)};
211 | -webkit-transition: 0.65s ease;
212 | -moz-transition: 0.65s ease;
213 | -o-transition: 0.65s ease;
214 | transition: 0.65s ease;
215 | overflow: auto;
216 | `;
217 |
218 | export const EditorOverlay = styled('div')`
219 | z-index: 999;
220 | position: absolute;
221 | top: 0;
222 | left: 0;
223 | width: 100%;
224 | height: 100%;
225 | background: rgba(0, 0, 0, 0.6);
226 | visibility: ${({ showDrawer }: { showDrawer: boolean }) => (showDrawer ? 'visible' : 'hidden')};
227 | -webkit-transition: 0.25s ease;
228 | -moz-transition: 0.25s ease;
229 | -o-transition: 0.25s ease;
230 | transition: 0.25s ease;
231 | `;
232 |
233 | export const BlockWrapper = styled('div')`
234 | background-color: #e4e4e4;
235 | margin: 20px;
236 | overflow: auto;
237 | border-radius: 8px;
238 | -webkit-box-shadow: 0px 0px 50px 0px rgba(0, 0, 0, 0.4);
239 | -moz-box-shadow: 0px 0px 50px 0px rgba(0, 0, 0, 0.4);
240 | box-shadow: 0px 0px 50px 0px rgba(0, 0, 0, 0.4);
241 | clear: both;
242 | `;
243 |
244 | export const BlockWrapperTitle = styled('div')`
245 | background-color: #6b5a86;
246 | color: #efefef;
247 | font-size: 13px;
248 | padding: 8px 10px;
249 | border-top-left-radius: 8px;
250 | border-top-right-radius: 8px;
251 | `;
252 |
253 | export const CloseDrawerButton = styled('div')`
254 | cursor: pointer;
255 | color: white;
256 | font-size: 16px;
257 | padding: 8px;
258 | font-weight: bold;
259 | margin: 8px 20px 8px 20px;
260 | float: right;
261 | `;
262 |
263 | export const DrawerFormField = styled('div')`
264 | padding: 10px 20px;
265 | display: flex;
266 | align-items: center;
267 | flex: 1;
268 | > label {
269 | font-size: 12px;
270 | padding-right: 10px;
271 | }
272 | `;
273 |
274 | export const SelectWrapper = styled('div')`
275 | position: relative;
276 | z-index: 0;
277 | display: inline-block;
278 | overflow: hidden;
279 | height: auto;
280 | padding: 0 5px 0 0;
281 | margin: 0 5px 0 0;
282 | border-radius: 5px;
283 | border: solid 1px #ccc;
284 | background-color: #fff;
285 | :before {
286 | position: absolute;
287 | z-index: 1;
288 | content: '\\25BE';
289 | top: 50%;
290 | right: 10px;
291 | margin-top: -9px;
292 | }
293 | select {
294 | position: relative;
295 | z-index: 2;
296 | outline: none;
297 | width: 120%;
298 | padding: 5px 20px 5px 10px;
299 | background-color: transparent;
300 | background-image: none;
301 | -webkit-appearance: none;
302 | border: none;
303 | box-shadow: none;
304 | }
305 | `;
306 |
307 | export const CheckboxWrapper = styled('div')`
308 | font-size: 12px;
309 | text-decoration: underline;
310 | cursor: pointer;
311 | input {
312 | margin-right: 10px;
313 | cursor: pointer;
314 | }
315 | `;
316 |
--------------------------------------------------------------------------------
/src/bin.ts:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | import * as fs from 'fs';
3 | import * as path from 'path';
4 | import * as flair from './adapters/flair';
5 | import * as luis from './adapters/luis';
6 | import * as rasa from './adapters/rasa';
7 | import * as snips from './adapters/snips';
8 | import * as web from './adapters/web';
9 | import { config, VALID_AUTO_ALIASES, VALID_DISTRIBUTIONS } from './main';
10 | import * as utils from './utils';
11 |
12 | // tslint:disable-next-line:no-var-requires
13 | const argv = require('minimist')(process.argv.slice(2));
14 |
15 | const logger = console;
16 | const adapters = { default: web, rasa, snips, luis, flair };
17 | const workingDirectory = process.cwd();
18 | const getFileWithPath = (filename: string) => path.resolve(workingDirectory, filename);
19 |
20 | const chatitoFilesFromDir = async (startPath: string, cb: (filename: string) => Promise) => { // Recursively walks startPath and awaits cb for each *.chatito file, one file at a time
21 | if (!fs.existsSync(startPath)) {
22 | logger.error(`Invalid directory: ${startPath}`);
23 | process.exit(1); // a missing directory aborts the whole CLI run
24 | }
25 | const files = fs.readdirSync(startPath);
26 | for (const file of files) {
27 | const filename = path.join(startPath, file);
28 | const stat = fs.lstatSync(filename); // lstat does not follow symlinks, so a symlinked directory is not descended into
29 | if (stat.isDirectory()) {
30 | await chatitoFilesFromDir(filename, cb);
31 | } else if (/\.chatito$/.test(filename)) {
32 | await cb(filename);
33 | }
34 | }
35 | };
36 |
37 | const importer = (fromPath: string, importFile: string) => { // Resolves an import statement and returns { filePath, dsl }; throws Error on a wrong extension or a missing file
38 | const filePath = path.resolve(path.dirname(fromPath), importFile); // imports are relative to the directory of the importing file
39 | if (path.extname(filePath) !== '.chatito') {
40 | throw new Error('Only files with .chatito extension can be imported');
41 | }
42 | if (!fs.existsSync(filePath)) {
43 | throw new Error(`Can't import ${filePath}`);
44 | }
45 | const dsl = fs.readFileSync(filePath, 'utf8');
46 | return { filePath, dsl };
47 | };
48 |
49 | const streamedAdapterAccumulator = (format: 'flair', outputPath: string) => { // Opens four append-mode text streams under outputPath and returns { write, save }; throws Error for an unknown adapter
50 | const adapterHandler = adapters[format];
51 | if (!adapterHandler) {
52 | throw new Error(`Invalid adapter: ${format}`);
53 | }
54 | if (!fs.existsSync(outputPath)) {
55 | fs.mkdirSync(outputPath);
56 | }
57 | const trainingFileName = argv.trainingFileName || `${format}_dataset_training.txt`; // CLI-provided name, else a per-format default
58 | const testingFileName = argv.testingFileName || `${format}_dataset_testing.txt`;
59 | const trainingClassificationFilePath = path.resolve(outputPath, `classification_${trainingFileName}`);
60 | const testingClassificationFilePath = path.resolve(outputPath, `classification_${testingFileName}`);
61 | const trainingNerFilePath = path.resolve(outputPath, `ner_${trainingFileName}`);
62 | const testingNerFilePath = path.resolve(outputPath, `ner_${testingFileName}`);
63 | // write streams
64 | const trainClassification = fs.createWriteStream(trainingClassificationFilePath, { flags: 'a' }); // 'a' appends, so every processed file adds to the same outputs
65 | const testClassification = fs.createWriteStream(testingClassificationFilePath, { flags: 'a' });
66 | const trainNER = fs.createWriteStream(trainingNerFilePath, { flags: 'a' });
67 | const testNER = fs.createWriteStream(testingNerFilePath, { flags: 'a' });
68 | trainClassification.on('close', () => logger.log('Train classification dataset done!'));
69 | testClassification.on('close', () => logger.log('Test classification dataset done!'));
70 | trainNER.on('close', () => logger.log('Train NER dataset done!')); // previously logged "Test NER", which made the two NER streams indistinguishable
71 | testNER.on('close', () => logger.log('Test NER dataset done!'));
72 | return {
73 | write: async (fullFilenamePath: string) => { // reads one DSL file and lets the adapter stream its examples into the four outputs
74 | logger.log(`Processing file: ${fullFilenamePath}`);
75 | const dsl = fs.readFileSync(fullFilenamePath, 'utf8');
76 | const streams = { trainClassification, testClassification, trainNER, testNER };
77 | await adapterHandler.streamAdapter(dsl, streams, importer, fullFilenamePath);
78 | },
79 | save: () => { // closes every stream and reports the output paths
80 | trainClassification.close();
81 | testClassification.close();
82 | trainNER.close();
83 | testNER.close();
84 | logger.log(`Saved training dataset: ${trainingClassificationFilePath}`);
85 | logger.log(`Saved testing dataset: ${testingClassificationFilePath}`);
86 | logger.log(`Saved training dataset: ${trainingNerFilePath}`);
87 | logger.log(`Saved testing dataset: ${testingNerFilePath}`);
88 | }
89 | };
90 | };
91 |
92 | type IValidFormat = 'default' | 'rasa' | 'snips' | 'luis' | 'flair'; // mirrors the keys of the `adapters` map declared near the top of this file
93 | const adapterAccumulator = (format: IValidFormat, outputPath: string, formatOptions?: any) => { // Returns { write, save }: write() merges each file's adapter output in memory, save() writes the merged JSON; throws Error for an unknown adapter
94 | const trainingDataset: snips.ISnipsDataset | rasa.IRasaDataset | luis.ILuisDataset | {} = {};
95 | const testingDataset: any = {};
96 | if (format === 'flair') { // flair writes through streams instead of accumulating in memory
97 | return streamedAdapterAccumulator('flair', outputPath);
98 | }
99 | const trainingJsonFileName = argv.trainingFileName || `${format}_dataset_training.json`; // CLI-provided name, else a per-format default
100 | const trainingJsonFilePath = path.resolve(outputPath, trainingJsonFileName);
101 | const testingFileName = argv.testingFileName || `${format}_dataset_testing.json`;
102 | const testingJsonFilePath = path.resolve(outputPath, testingFileName);
103 | const adapterHandler = adapters[format];
104 | if (!adapterHandler) {
105 | throw new Error(`Invalid adapter: ${format}`);
106 | }
107 | return {
108 | write: async (fullFilenamePath: string) => { // runs the adapter on one DSL file and merges its result into the accumulated datasets
109 | logger.log(`Processing file: ${fullFilenamePath}`);
110 | const dsl = fs.readFileSync(fullFilenamePath, 'utf8');
111 | const { training, testing } = await adapterHandler.adapter(dsl, formatOptions, importer, fullFilenamePath);
112 | utils.mergeDeep(trainingDataset, training);
113 | utils.mergeDeep(testingDataset, testing);
114 | },
115 | save: () => {
116 | if (!fs.existsSync(outputPath)) {
117 | fs.mkdirSync(outputPath);
118 | }
119 | fs.writeFileSync(trainingJsonFilePath, JSON.stringify(trainingDataset));
120 | logger.log(`Saved training dataset: ${trainingJsonFilePath}`);
121 |
122 | if (Object.keys(testingDataset).length) { // the testing file is written only when testing data exists
123 | fs.writeFileSync(testingJsonFilePath, JSON.stringify(testingDataset));
124 | logger.log(`Saved testing dataset: ${testingJsonFilePath}`);
125 | }
126 | }
127 | };
128 | };
129 |
130 | // Copies the `defaultDistribution` and `autoAliases` CLI arguments into `config`, throwing on unsupported values.
131 | // An argument that is not provided (falsy) leaves the corresponding `config` entry untouched.
132 | const validateArgs = () => {
133 |     if (argv.defaultDistribution) {
134 |         if (!VALID_DISTRIBUTIONS.includes(argv.defaultDistribution)) {
135 |             const valid = VALID_DISTRIBUTIONS.join(', ');
136 |             throw new Error(`Unknown defaultDistribution value: '${argv.defaultDistribution}'. Valid values are: ${valid}.`);
137 |         }
138 |         config.defaultDistribution = argv.defaultDistribution;
139 |     }
140 |     if (argv.autoAliases) {
141 |         if (!VALID_AUTO_ALIASES.includes(argv.autoAliases)) {
142 |             const valid = VALID_AUTO_ALIASES.join(', ');
143 |             throw new Error(`Unknown autoAliases value: '${argv.autoAliases}'. Valid values are: ${valid}.`);
144 |         }
145 |         config.autoAliases = argv.autoAliases;
146 |     }
147 | };
148 |
149 | // CLI entry point: validates the arguments, passes the chatito file (or the directory, via chatitoFilesFromDir) to
150 | // the accumulator of the requested format and saves the result. Failures are logged and the process exits with 1.
151 | (async () => {
152 |     if (!argv._ || !argv._.length) {
153 |         logger.error('Invalid chatito file.');
154 |         process.exit(1);
155 |     }
156 |     const dslFile = argv._[0];
157 |     // String() guards against a non-string `--format` (e.g. a valueless or numeric flag), which would otherwise throw here, outside the try.
158 |     const format = String(argv.format || 'default').toLowerCase();
159 |     if (['default', 'rasa', 'snips', 'luis', 'flair'].indexOf(format) === -1) {
160 |         logger.error(`Invalid format argument: ${format}`);
161 |         process.exit(1);
162 |     }
163 |     const outputPath = argv.outputPath || process.cwd();
164 |     try {
165 |         validateArgs();
166 |         logger.log(`NOTE: Using ${config.defaultDistribution} as default frequency distribution.`);
167 |         // formatOptions is the parsed JSON file given by --formatOptions, or null when the argument is absent
168 |         const formatOptions = argv.formatOptions ? JSON.parse(fs.readFileSync(path.resolve(argv.formatOptions), 'utf8')) : null;
169 |         const dslFilePath = getFileWithPath(dslFile);
170 |         const isDirectory = fs.existsSync(dslFilePath) && fs.lstatSync(dslFilePath).isDirectory();
171 |         const accumulator = adapterAccumulator(format as IValidFormat, outputPath, formatOptions);
172 |         if (isDirectory) {
173 |             await chatitoFilesFromDir(dslFilePath, accumulator.write);
174 |         } else {
175 |             await accumulator.write(dslFilePath);
176 |         }
177 |         accumulator.save();
178 |     } catch (e) {
179 |         if (e && e.message && e.location) { // errors carrying a `location` are reported as chatito syntax errors
180 |             logger.log('==== CHATITO SYNTAX ERROR ====');
181 |             logger.log(' ', e.message);
182 |             logger.log(` Line: ${e.location.start.line}, Column: ${e.location.start.column}`);
183 |             logger.log('==============================');
184 |         } else {
185 |             logger.error(e && e.stack ? e.stack : e);
186 |         }
187 |         logger.log('FULL ERROR REPORT:');
188 |         logger.error(e);
189 |         process.exit(1);
190 |     }
191 | })();
192 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Chatito
2 |
3 | [](https://www.npmjs.com/package/chatito)
4 | [](https://circleci.com/gh/rodrigopivi/workflows/Chatito)
7 | [](https://www.npmjs.com/package/chatito)
8 | [](https://www.npmjs.com/package/chatito)
9 |
10 |
11 | [](https://rodrigopivi.github.io/Chatito/)
12 |
13 | [Try the online IDE!](https://rodrigopivi.github.io/Chatito/)
14 |
15 |
16 | ## Overview
17 | Chatito helps you generate datasets for training and validating chatbot models using a simple DSL.
18 |
19 | If you are building chatbots using commercial models, open source frameworks or writing your own natural language processing model, you need training and testing examples. Chatito is here to help you.
20 |
21 | This project contains the:
22 | - [Online chatito IDE](https://rodrigopivi.github.io/Chatito/)
23 | - [Chatito DSL specification](https://github.com/rodrigopivi/Chatito/blob/master/spec.md)
24 | - [DSL AST parser in pegjs format](https://github.com/rodrigopivi/Chatito/blob/master/parser/chatito.pegjs)
25 | - [Generator implemented in typescript + npm package](https://github.com/rodrigopivi/Chatito/tree/master/src)
26 |
27 | ### Chatito language
28 | For the full language specification and documentation, please refer to the [DSL spec document](https://github.com/rodrigopivi/Chatito/blob/master/spec.md).
29 |
30 | ### Adapters
31 | The language is independent from the generated output format, and because each model can receive different parameters and settings, these are the currently implemented data formats. If your provider is not listed, the Tools and resources section has more information on how to support more formats.
32 |
33 | NOTE: Samples are not shuffled between intents for easier review and because some adapters stream samples directly to the file.
34 |
35 | #### Default format
36 | Use the default format if you plan to train a custom model or if you are writing a custom adapter. This is the most flexible format because you can annotate `Slots` and `Intents` with custom entity arguments, and they all will be present at the generated output, so for example, you could also include dialog/response generation logic with the DSL. E.g.:
37 |
38 | ```
39 | %[some intent]('context': 'some annotation')
40 |     @[some slot] ~[please?]
41 |
42 | @[some slot]('required': 'true', 'type': 'some type')
43 |     ~[some alias here]
44 |
45 | ```
46 |
47 | Custom entities like 'context', 'required' and 'type' will be available at the output so you can handle these custom arguments as you want.
48 |
49 | #### [Rasa NLU](https://rasa.com/docs/nlu/)
50 | [Rasa NLU](https://rasa.com/docs/nlu/) is a great open source framework for training NLU models.
51 | One particular behavior of the Rasa adapter is that when a slot definition sentence only contains one alias, and that alias defines the 'synonym' argument with 'true', the generated Rasa dataset will map the alias as a synonym. e.g.:
52 |
53 | ```
54 | %[some intent]('training': '1')
55 |     @[some slot]
56 |
57 | @[some slot]
58 |     ~[some slot synonyms]
59 |
60 | ~[some slot synonyms]('synonym': 'true')
61 |     synonym 1
62 |     synonym 2
63 | ```
64 |
65 | In this example, the generated Rasa dataset will contain the `entity_synonyms` of `synonym 1` and `synonym 2` mapping to `some slot synonyms`.
66 |
67 | #### [Flair](https://github.com/zalandoresearch/flair)
68 | [Flair](https://github.com/zalandoresearch/flair) is a very simple framework for state-of-the-art NLP, developed by Zalando Research. It provides state of the art (GPT, BERT, ELMo, etc...) pre trained models and embeddings for many languages that work out of the box. This adapter supports the `text classification` dataset in FastText format and the `named entity recognition` dataset in two column [BIO](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) annotated words, as documented at [flair corpus documentation](https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_6_CORPUS.md). These two data formats are very common and are also used by many other providers and models.
69 |
70 | The NER dataset requires a word tokenization processing that is currently done using [wink-tokenizer](https://github.com/winkjs/wink-tokenizer) npm package. Extending the adapter to add PoS tagging can be explored in the future, but it's not implemented.
71 |
72 | NOTE: Flair adapter is only available for the NodeJS NPM CLI package, not for the IDE.
73 |
74 | #### [LUIS](https://www.luis.ai/)
75 | [LUIS](https://www.luis.ai/) is part of Microsoft's Cognitive services. Chatito supports training a LUIS NLU model through its [batch add labeled utterances endpoint](https://westus.dev.cognitive.microsoft.com/docs/services/5890b47c39e2bb17b84a55ff/operations/5890b47c39e2bb052c5b9c09), and its [batch testing api](https://docs.microsoft.com/en-us/azure/cognitive-services/LUIS/luis-how-to-batch-test).
76 |
77 | To train a LUIS model, you will need to post the utterance in batches to the relevant API for training or testing.
78 |
79 | Reference issue: [#61](https://github.com/rodrigopivi/Chatito/issues/61)
80 |
81 | #### [Snips NLU](https://snips-nlu.readthedocs.io/en/latest/)
82 | [Snips NLU](https://snips-nlu.readthedocs.io/en/latest/) is another great open source framework for NLU. One particular behavior of the Snips adapter is that you can define entity types for the slots. e.g.:
83 |
84 | ```
85 | %[date search]('training':'1')
86 |     for @[date]
87 |
88 | @[date]('entity': 'snips/datetime')
89 |     ~[today]
90 |     ~[tomorrow]
91 | ```
92 |
93 | In the previous example, all `@[date]` values will be tagged with the `snips/datetime` entity tag.
94 |
95 | ### NPM package
96 |
97 | Chatito supports Node.js `v8.11.2 LTS` or higher.
98 |
99 | Install it with yarn or npm:
100 | ```
101 | npm i chatito --save
102 | ```
103 |
104 | Then create a definition file (e.g.: `trainClimateBot.chatito`) with your code.
105 |
106 | Run the npm generator:
107 |
108 | ```
109 | npx chatito trainClimateBot.chatito
110 | ```
111 |
112 | The generated dataset should be available next to your definition file.
113 |
114 | Here are the full npm generator options:
115 | ```
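116 | npx chatito <pathToFileOrDirectory> --format=<format> --formatOptions=<formatOptions> --outputPath=<outputPath> --trainingFileName=<trainingFileName> --testingFileName=<testingFileName> --defaultDistribution=<defaultDistribution> --autoAliases=<autoAliases>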
117 | ```
118 |
119 | - `<pathToFileOrDirectory>` path to a `.chatito` file or a directory that contains chatito files. If it is a directory, will search recursively for all `*.chatito` files inside and use them to generate the dataset. e.g.: `lightsChange.chatito` or `./chatitoFilesFolder`
120 | - `<format>` Optional. `default`, `rasa`, `luis`, `flair` or `snips`.
121 | - `<formatOptions>` Optional. Path to a .json file that each adapter optionally can use
122 | - `<outputPath>` Optional. The directory where to save the generated datasets. Uses the current directory as default.
123 | - `<trainingFileName>` Optional. The name of the generated training dataset file. Do not forget to add a .json extension at the end. Uses `<format>_dataset_training.json` as default file name.
124 | - `<testingFileName>` Optional. The name of the generated testing dataset file. Do not forget to add a .json extension at the end. Uses `<format>_dataset_testing.json` as default file name.
125 | - `<defaultDistribution>` Optional. The default frequency distribution if not defined at the entity level. Defaults to `regular` and can be set to `even`.
126 |
127 | - `<autoAliases>` Optional. The generator behavior when finding an undefined alias. Valid options are `allow`, `warn`, `restrict`. Defaults to `allow`.
128 |
129 | ### Notes to prevent overfitting
130 |
131 | [Overfitting](https://en.wikipedia.org/wiki/Overfitting) is a problem that can be prevented if we use Chatito correctly. The idea behind this tool, is to have an intersection between data augmentation and a probabilistic description of possible sentences combinations. It is not intended to generate deterministic datasets, you should avoid generating all possible combinations.
132 |
133 | ### Tools and resources
134 |
135 | - [Visual Studio Code syntax highlighting plugin](https://marketplace.visualstudio.com/items?itemName=nimfin.chatito) Thanks to [Yuri Golobokov](https://github.com/nimf) for his [work on this](https://github.com/nimf/chatito-vscode).
136 |
137 | - [AI Blueprints: How to build and deploy AI business projects](https://books.google.com.pe/books?id=sR2CDwAAQBAJ) implements practical full chatbot examples using chatito at chapter 7.
138 |
139 | - [3 steps to convert chatbot training data between different NLP Providers](https://medium.com/@benoit.alvarez/3-steps-to-convert-chatbot-training-data-between-different-nlp-providers-fa235f67617c) details a simple way to convert the data format to non implemented adapters. You can use a generated dataset with providers like DialogFlow, Wit.ai and Watson.
140 |
141 | - [Aida-nlp](https://github.com/rodrigopivi/aida) is a tiny experimental NLP deep learning library for text classification and NER. Built with Tensorflow.js, Keras and Chatito. Implemented in JS and Python.
142 |
143 | ### Author and maintainer
144 | Rodrigo Pimentel
145 |
--------------------------------------------------------------------------------
/spec.md:
--------------------------------------------------------------------------------
1 | # Chatito Spec
2 |
3 | ## 1 - Overview
4 |
5 | Chatito is a domain specific language designed to simplify the process of creating, extending and maintaining
6 | datasets for training natural language processing (NLP) models for text classification, named entity recognition, slot filling or equivalent tasks.
7 |
8 | Chatito design principles:
9 |
10 | - Simplicity: should be understandable by someone looking at it for the first time
11 |
12 | - Speed: generate samples by pulling them from a cloud of probabilities on demand
13 |
14 | - Practicality: this tool is meant to help people who use it, the design should be guided by the community needs
15 |
16 | Following those principles this is an example of the language and its generated output:
17 |
18 | ```
19 | %[greet]('training': '2')
20 |     ~[hi] @[name?] ~[whatsUp?]
21 |
22 | ~[hi]
23 |     hi
24 |     hey
25 |
26 | @[name]
27 |     Janis
28 |     Bob
29 |
30 | ~[whatsUp]
31 |     whats up
32 |     how is it going
33 | ```
34 |
35 | This code could produce a maximum of 18 examples, the output format is independent from the DSL language,
36 | although it is recommended to use a newline delimited format to just stream results to a file, a format like ndjson is recommended over plain json and using the `training` entity argument to limit the dataset size is recommended for large dataset where there should be no need to generate all variations.
37 |
38 | That said, the earlier DSL code generates two training examples for the `greet` intent. Here is the `Newline Delimited JSON` (ndjson.org) examples generated from the previous code:
39 |
40 | ```
41 | [{"type":"Text","value":"hi how is it going"}]
42 | [{"type":"Text","value":"hey "},{"type":"Slot","value":"Bob","slot":"name"}]
43 | ```
44 |
45 | With these principles in mind, this document is the specification of such language.
46 |
47 | ## 2 - Language
48 |
49 | A chatito file is a document containing the grammar definitions. Because of the different encoding formats and range of
50 | non printable characters, these are the requirements of the document source text and some terminology:
51 |
52 | - Format: UTF-8
53 | - Valid characters: Allow international language characters.
54 | - White space: allows white space character, not horizontal tab
55 | - Line end: new line, carriage return, carriage return + new line (supporting non windows and windows)
56 | - Indentation: should use a 4 space character to define the scope of entities
57 | - Entities: Special keywords with special behaviors used to declare the sentence combinations
58 | - Sentences: 4 space indented text lines after an entity definition
59 | - Definition order: It does not matter if an entity is defined after it is being referenced
60 | - Comments: Lines of text starting with '//' or '#' (no spaces before)
61 | - Imports: Lines of text starting with 'import' keyword followed by a relative filepath
62 | - Entity arguments: Optional key-values that can be declared at intents and slot definitions
63 | - Probability operator: an optional keyword declared at the start of sentences to control the odds.
64 |
65 | ### 2.1 - Entities
66 | Entities are the way to define keywords that wrap sentence variations and attach some properties to them.
67 | There are three types of entities: `intent`, `slot` and `alias`.
68 |
69 | #### 2.1.1 - Intent
70 |
71 | The intent entity is defined by the `%[` symbols at the start of a line, followed by the entity name and `]`.
72 |
73 | Intent names should be at least 1 character long and can contain any characters except `]`, `line end` and `?`
74 | . e.g.: (%[intentName], %[intent_name], %[intent name])
75 |
76 | Repeating intent name definitions should not be allowed.
77 |
78 | Each intent defined in a file is an entry point for the generation, the intent is the classification tag that is
79 | added to the sentences defined inside. e.g.:
80 |
81 | ```
82 | %[greet]
83 |     hello
84 |     hi
85 | ```
86 |
87 | The previous example will generate all possible unique examples for greet (in this case 2 utterances). But there are cases where there is no need to generate all utterances, or when we want to attach some extra properties to the generated utterance, that is where entity arguments can help.
88 |
89 | Entity arguments are comma separated key-values declared with the entity definition inside parenthesis. Each entity argument is composed of a key, followed by the `:` symbol and the value. The argument key or value are just strings wrapped with single or double quotes, optional spaces between the parenthesis and comma are allowed, the format is similar to ndjson but only for string values.
90 |
91 | By default, intent definitions can expect the `training` and `testing` argument keys, when defined, are used to declare the maximum number of unique examples to generate for the given intent, and splitting them in two datasets, the training dataset is to be used to train the NLU model, and the testing dataset should be used to evaluate the accuracy of the model with examples it never trained with. Creating a testing dataset is not required, but it is important to be aware of the accuracy of your model to detect overfitting and compare against previous accuracies. The generator will first populate the training dataset, then testing dataset until reaching the sum of both values, each value must be `>= 1`. e.g.:
92 |
93 | ```
94 | %[greet]('training': '2', 'testing': '1')
95 |     hello
96 |     hi
97 |     hola
98 |     salute
99 | ```
100 |
101 | In this example, the greet intent could generate a maximum of 4 examples, but the declaration only requests 3. The training dataset will contain 2 utterances for greet intent and the testing dataset 1. Other entity arguments are ignored by default and their functionality depend on the dataset generator/adapter, this means that each adapter may use the other entity arguments differently in its own context (e.g.: Rasa/Snips adapter may expect different entity arguments).
102 |
103 | Nesting entities: Sentences defined inside an intent can refer to slots and alias entities.
104 |
105 | #### 2.1.2 - Slot
106 | The slot entity is defined by the `@[` symbols at the start of a line, followed by the name of the slot and `]`.
107 |
108 | Slot names should be at least 1 character long and can contain any characters except `]`, `line end`, `?` and `#` (as # is used for variations).
109 | . e.g.: (@[slotName], @[slot_name], @[slot name])
110 |
111 | Repeating slot name definitions should not be allowed.
112 |
113 | From the output perspective, a slot is the tag that is added to the relevant words in a generated sentence. e.g.:
114 |
115 | ```
116 | %[greet]
117 |     ~[hi] @[name?]
118 |
119 | ~[hi]
120 |     hi
121 |     hey
122 |
123 | @[name]
124 |     Janis
125 |     Bob
126 | ```
127 |
128 | Slot entities referenced within sentences, can have `?` symbol at the end of the reference name. (e.g.: @[name?]).
129 | In that context, the `?` symbol means that the slot combination is optional, and could be omitted at generation. The probabilities of being omitted are defined by the number of sentence definitions at the entity. If the entity defines only one sentence, then the probability of an empty string will be 50%; if the entity defines 2 sentences, the probability of being omitted is 33.3333%, and so on.
130 |
131 | Slots provide a particular property at their definitions called variations.
132 |
133 | - Variations: There are cases where a slot combination only makes sense in a given context, variations allow to map one slot to different sentences in different contexts. e.g.:
134 |
135 | ```
136 | %[ask_for_delivery]
137 |     my parcel should be delivered in @[delivery_time#time_in_hours]
138 |     my parcel should be delivered @[delivery_time#relative_time]
139 |
140 | @[delivery_time#time_in_hours]
141 |     3 days
142 |     5 hours
143 |
144 | @[delivery_time#relative_time]
145 |     as fast as possible
146 |     quickly
147 | ```
148 |
149 | In this example, both combinations map to the `delivery_time` slot, but
150 | the generated sentences only generate their variations contexts where they make sense.
151 |
152 | Slot definitions can have entity arguments too but there are no default argument keys. Entity arguments are ignored by default and their functionality depends on the dataset adapter, this means that each adapter may use the entity arguments differently in its own context (e.g.: Rasa/Snips adapter may expect different entity arguments like for pre-build date parsing, or text value aliases mappings).
153 |
154 | Nesting entities: Sentences defined inside a slot can only reference alias entities.
155 |
156 | #### 2.1.3 - Alias
157 | The alias entity is defined by the `~[` symbols at the start of a line, followed by the name of the alias and `]`.
158 | Aliases are just variations of a word and do not generate any tag. By default, if an alias is referenced but not defined (like `how are you` in the next example), the generator just uses the alias key name. This is useful for making a word optional without having to add the extra lines of code defining a new alias. (This 'auto alias' behavior is configurable.) e.g.:
159 |
160 | ```
161 | %[greet]
162 |     ~[hi] ~[how are you?]
163 |
164 | ~[hi]
165 |     hi
166 |     hey
167 | ```
168 |
169 | Same as with slots, alias references can be omitted using a `?` symbol at the end of the reference name. (e.g.: ~[hi?]).
170 |
171 | When an alias is referenced inside a slot definition, and it is the only token of the slot sentence, by default the generator will tag the generated alias value as a `synonym` of the alias key name.
172 |
173 | Alias definitions are not allowed to declare entity arguments.
174 |
175 | Nesting entities: Sentences defined inside aliases can reference slots and other aliases but preventing recursive loops.
176 |
177 |
178 | ### 2.2 - Importing chatito files
179 |
180 | To allow reusing entity declarations, it is possible to import another chatito file using the import keyword. Importing another chatito file only allows using the slots and aliases defined there; if the imported file defines intents, they will be ignored since intents are generation entry points.
181 |
182 | As an example, given two chatito files:
183 |
184 | ```
185 | # file slot1.chatito
186 | @[slot1]
187 |     s1v1
188 |     s1v2
189 | ```
190 |
191 | and
192 |
193 | ```
194 | # file main.chatito
195 | import ./slot1.chatito
196 |
197 | %[some intent]
198 |     ~[word] @[slot1]
199 | ```
200 |
201 | The file `main.chatito` will import all alias and slot definitions from `./slot1.chatito`.
202 | The text next to the import statement should be a relative path from the main file to the imported file. Imports can be nested, and the path is always relative to the file that declares the reference.
203 |
204 | Note: Chatito will throw an exception if two imports define the same entity.
205 |
206 |
207 | ### 2.2 - Controlling probabilities
208 |
209 | The way Chatito works, is like pulling samples from a cloud of possible combinations and avoiding duplicates. Once the sentences definitions gain complexity, the max possible combinations increments exponentially, causing a problem where the generator will most likely pick sentences that have more possible combinations, and omit some sentences that may be more important at the dataset. To overcome this problem, semantics for controlling the data generation probabilities are provided.
210 |
211 | #### 2.2.1 - Frequency distribution strategies
212 |
213 | When generating samples for an entity, the generator will randomly pick a sentence model using one of the two frequency distribution strategies available: `regular` or `even`.
214 |
215 | For a regular distribution strategy, each sentence probabilities are defined by their maximum possible combinations, in other words, a sentence that can produce more combinations will have more probabilities. For even distribution strategy, sentence probabilities are the same.
216 |
217 | The distribution strategy can be declared as an argument at the entity level. If not declared, the generator should use the default strategy configured (at the IDE or CLI level), if there is no default definition, then `regular` should be the default.
218 |
219 | Lets look at an example, here, all the alias entities are defined at `./aliases.chatito`, and are named by the maximum possible combinations each provide:
220 |
221 | ```
222 | import ./aliases.chatito
223 |
224 | %[intent with a maximum of 1k combinations]('distribution': 'regular')
225 |     first sentence equals ~[100 maximum combinations]
226 |     second sentence equals ~[50 maximum combinations] multiplied by ~[10 maximum combinations]
227 |     third sentence equals ~[400 maximum combinations]
228 | ```
229 |
230 | Since the intent declares a `regular` distribution, this would be the odds:
231 |
232 | | | Max combinations | Weight | Probability % |
233 | |------------|------------------|--------|---------------|
234 | | sentence 1 | 100 | 100 | 10% |
235 | | sentence 2 | 500 | 500 | 50% |
236 | | sentence 3 | 400 | 400 | 40% |
237 |
238 |
239 | Now the code to get an `even` distribution:
240 |
241 | ```
242 | import ./aliases.chatito
243 |
244 | %[intent with a maximum of 1k combinations]('distribution': 'even')
245 |     first sentence equals ~[100 maximum combinations]
246 |     second sentence equals ~[50 maximum combinations] multiplied by ~[10 maximum combinations]
247 |     third sentence equals ~[400 maximum combinations]
248 | ```
249 |
250 | For `even` distribution using the previous example:
251 |
252 | | | Max combinations | Weight | Probability % |
253 | |------------|------------------|--------|---------------|
254 | | sentence 1 | 100 | 1 | 33.3333% |
255 | | sentence 2 | 500 | 1 | 33.3333% |
256 | | sentence 3 | 400 | 1 | 33.3333% |
257 |
258 |
259 | #### 2.2.2 - Sentence probability operator
260 |
261 | The sentence probability operator is defined by the `*[` symbol at the start of a sentence, followed by the probability value and `]`. The probability value may be expressed in two ways, as a plain number (considered as a weighted probability, e.g.: `1`) or as a percentage value (a number ending with `%`, e.g.: `33.3333%`), but once an entity defines a probability as either weight or percentage, then all the other sentences for that entity should use the same type. Inconsistencies declaring entity sentence probability values should be considered an input error, and if the value is not a valid integer, float or percentage value, the input should be considered as simple text and not as a sentence probability definition.
262 |
263 | NOTE: If the probability value is a percentage type, then the sum of all sentence probability operators declared inside the entity definition should never exceed 100.
264 |
265 | Lets continue with some examples:
266 |
267 | ```
268 | %[intent with a maximum of 1k combinations]
269 |     *[20%] first sentence ~[100 maximum combinations]
270 |     second sentence ~[50 maximum combinations] multiplied by ~[10 maximum combinations]
271 |     third sentence ~[400 maximum combinations]
272 | ```
273 |
274 | The previous example declares a `20%` probability for the first sentence. This would be the odds table for the two distribution strategies:
275 |
276 | | | Max combinations | % with even | % with regular |
277 | |------------|------------------|-------------|-----------------------|
278 | | sentence 1 | 100 | 20% | 20% |
279 | | sentence 2 | 500 | 40% | 44.4444% (500*80/900) |
280 | | sentence 3 | 400 | 40% | 35.5556% (400*80/900) |
281 |
282 |
283 | When the probability value is a weight with regular distribution, multiply that value with the maximum combinations for that sentence; if the distribution is even, that value is the actual weighted probability. E.g.:
284 |
285 | ```
286 | %[intent with a maximum of 1k combinations]
287 |     *[2] first sentence ~[100 maximum combinations]
288 |     second sentence ~[50 maximum combinations] multiplied by ~[10 maximum combinations]
289 |     third sentence ~[400 maximum combinations]
290 | ```
291 |
292 | And the odds table:
293 |
294 | | | Max combinations | even weight | even % | regular weight | regular % |
295 | |------------|------------------|-------------|--------|----------------|-----------|
296 | | sentence 1 | 100 | 2 | 50% | 200 | 18.1818% |
297 | | sentence 2 | 500 | 1 | 25% | 500 | 45.4545% |
298 | | sentence 3 | 400 | 1 | 25% | 400 | 36.3636% |
299 |
300 |
301 | NOTE: Be careful when using the probability operator, because if the sentence reaches its max number of unique generated values, it will start producing duplicates and slowing down the generator that filters duplicates.
302 |
303 | ## 3 - Data Generation
304 |
305 | The entry points for the data generation are the intent definitions, for each intent definition available:
306 | - If the intent does not specify the 'training' or 'testing' arguments, generate all possible unique combinations and add them to the training dataset.
307 |
308 | - Respect probability operator declarations and distribution strategy.
309 |
310 | - Generate unique combinations for the training and testing dataset until the provided sum of both argument numbers are reached.
311 |
312 | - Recursive loop references should be prevented.
313 |
--------------------------------------------------------------------------------
/src/tests/parser.spec.ts:
--------------------------------------------------------------------------------
1 | import { IChatitoParser } from '../types';
2 |
3 | // tslint:disable-next-line:no-var-requires
4 | const chatitoParser = require('../../parser/chatito') as IChatitoParser;
5 |
6 | describe('Simple example', () => { // Parser check: a greet intent referencing two aliases and a slot must parse without throwing.
7 | const firstSpecExample = `
8 | %[greet]
9 | ~[hi] @[name?] ~[whatsUp?] one two three im @[name?]
10 |
11 | ~[hi]
12 | hi
13 | hey
14 |
15 | @[name]
16 | Janis
17 | Bob
18 |
19 | ~[whatsUp]
20 | whats up
21 | how is it going
22 | `; // end of the chatito DSL fixture; its whitespace is part of the parser input
23 | test('correct PEGJS output', () => {
24 | let error = null;
25 | let result = null;
26 | try {
27 | result = chatitoParser.parse(firstSpecExample);
28 | } catch (e) { // capture instead of rethrowing so the failure is reported by the assertion below
29 | error = e;
30 | }
31 | expect(error).toBeNull();
32 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); // the pretty-printed parse result is compared with the stored snapshot
33 | });
34 | });
35 |
36 | describe('Simple examples with max training and testing', () => { // Parser checks for intents declaring 'training'/'testing' entity arguments.
37 | const specExampleWithMaximum = `
38 | %[greet]('training': '3')
39 | ~[hi] @[name?] ~[whatsUp?]
40 | ~[hi]
41 | hi
42 | hey
43 | @[name]
44 | Janis
45 | Bob
46 | ~[whatsUp]
47 | whats up
48 | how is it going
49 | `; // fixture: only the 'training' argument is declared
50 | test('CORRECT parser output specExampleWithMaximum', () => {
51 | let error = null;
52 | let result = null;
53 | try {
54 | result = chatitoParser.parse(specExampleWithMaximum);
55 | } catch (e) { // capture instead of rethrowing so the failure is reported by the assertion below
56 | error = e;
57 | }
58 | expect(error).toBeNull();
59 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); // the pretty-printed parse result is compared with the stored snapshot
60 | });
61 | const specExampleWithTrainingAndTesting = `
62 | %[greet]('training': '3', 'testing': '3')
63 | ~[hi] @[name?] ~[whatsUp?]
64 | ~[hi]
65 | hi
66 | hey
67 | @[name]
68 | Janis
69 | Bob
70 | ~[whatsUp]
71 | whats up
72 | how is it going
73 | `; // fixture: both 'training' and 'testing' arguments are declared
74 | test('CORRECT parser output for specExampleWithTrainingAndTesting', () => {
75 | let error = null;
76 | let result = null;
77 | try {
78 | result = chatitoParser.parse(specExampleWithTrainingAndTesting);
79 | } catch (e) { // capture instead of rethrowing so the failure is reported by the assertion below
80 | error = e;
81 | }
82 | expect(error).toBeNull();
83 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); // the pretty-printed parse result is compared with the stored snapshot
84 | });
85 | const specExampleWithTrainingAndTestingWithSpaces = `
86 | %[greet]( 'training' : '3' , 'testing': '3' )
87 | ~[hi] @[name?] ~[whatsUp?]
88 | ~[hi]
89 | hi
90 | hey
91 | @[name]
92 | Janis
93 | Bob
94 | ~[whatsUp]
95 | whats up
96 | how is it going
97 | `; // fixture: same arguments with extra spaces around the parentheses, ':' and ','
98 | test('CORRECT parser output for specExampleWithTrainingAndTestingWithSpaces', () => {
99 | let error = null;
100 | let result = null;
101 | try {
102 | result = chatitoParser.parse(specExampleWithTrainingAndTestingWithSpaces);
103 | } catch (e) { // capture instead of rethrowing so the failure is reported by the assertion below
104 | error = e;
105 | }
106 | expect(error).toBeNull();
107 | expect(JSON.stringify(result, null, 2)).toMatchSnapshot(); // the pretty-printed parse result is compared with the stored snapshot
108 | });
109 | });
110 |
111 | // Malformed intent arguments: whatever the parser throws is pinned by snapshot.
112 | describe('Simple example with wrong syntax', () => {
113 |     // Stray text directly after the closing parenthesis of the arguments.
114 |     const specExampleWithWrongSyntax = `
115 | %[greet]('training': '3')wrong
116 | hi
117 | `;
118 |     test('ERROR with wrong syntax after maximum', () => {
119 |         let failure = null;
120 |         try {
121 |             chatitoParser.parse(specExampleWithWrongSyntax);
122 |         } catch (err) {
123 |             failure = err;
124 |         }
125 |         expect(failure).toMatchSnapshot();
126 |     });
127 |     // Argument values written without quotes.
128 |     const specExampleWithWrongTestingTrainingSyntax = `
129 | %[greet]('training': 3, 'testing': 3)
130 | hi
131 | `;
132 |     test('ERROR with wrong syntax after training and testing defined', () => {
133 |         let failure = null;
134 |         try {
135 |             chatitoParser.parse(specExampleWithWrongTestingTrainingSyntax);
136 |         } catch (err) {
137 |             failure = err;
138 |         }
139 |         expect(failure).toMatchSnapshot();
140 |     });
141 | });
141 |
142 | // NOTE(review): the grammar literal is whitespace-sensitive; presumably the sentence
143 | // under the intent is mis-indented on purpose. Keep the literal untouched.
144 | // The describe title (including its spelling) is part of the snapshot key.
145 | describe('Simple example with wrong identation', () => {
146 |     const specExampleWithWrongIndentationSyntax = `
147 | %[greet]
148 | wrong
149 | `;
150 |     test('ERROR with wrong indentation syntax', () => {
151 |         let failure = null;
152 |         try {
153 |             chatitoParser.parse(specExampleWithWrongIndentationSyntax);
154 |         } catch (err) {
155 |             failure = err;
156 |         }
157 |         // The captured error (or null when nothing was thrown) is compared to the snapshot.
158 |         expect(failure).toMatchSnapshot();
159 |     });
160 | });
158 |
159 | // A grammar whose lines are separated by "\r\n" must parse and match its snapshot.
160 | describe('Simple example for windows end of line', () => {
161 |     // tslint:disable-next-line:max-line-length
162 |     const specExampleWindowsEOLSyntax = `%[greet]\r\n hi hi\r\n how are you @[full names] sup\r\n@[full names]\r\n jim raynor`;
163 |     test('CORRECT parser output', () => {
164 |         let parsed = null;
165 |         let failure = null;
166 |         try {
167 |             parsed = chatitoParser.parse(specExampleWindowsEOLSyntax);
168 |         } catch (err) {
169 |             failure = err;
170 |         }
171 |         expect(failure).toBeNull();
172 |         const serialized = JSON.stringify(parsed, null, 2);
173 |         expect(serialized).toMatchSnapshot();
174 |     });
175 | });
174 |
175 | // One slot name declared with two '#variation' suffixes, both referenced from an intent.
176 | describe('Example variation spec', () => {
177 |     const slotVariationSpecSyntax = `
178 | %[ask_for_delivery]
179 | my parcel should be delivered in @[delivery_time#time_in_hours]
180 | my parcel should be delivered @[delivery_time#relative_time]
181 |
182 | @[delivery_time#time_in_hours]
183 | 3 days
184 | 5 hours
185 |
186 | @[delivery_time#relative_time]
187 | as fast as possible
188 | quickly
189 | `;
190 |     test('CORRECT parser output', () => {
191 |         let parsed = null;
192 |         let failure: any = null;
193 |         try {
194 |             parsed = chatitoParser.parse(slotVariationSpecSyntax);
195 |         } catch (err) {
196 |             // Wrap the thrown value; add the start position when the error carries one.
197 |             failure = err.location
198 |                 ? {
199 |                       error: err,
200 |                       location: { line: err.location.start.line, column: err.location.start.column }
201 |                   }
202 |                 : { error: err };
203 |         }
204 |         expect(failure).toBeNull();
205 |         const serialized = JSON.stringify(parsed, null, 2);
206 |         expect(serialized).toMatchSnapshot();
207 |     });
208 | });
207 |
208 | // Sentences mixing stray '~', '@' and '[' characters with real alias/slot references,
209 | // plus names that contain spaces. The grammar must still parse and match its snapshot.
210 | describe('Example for weird variations', () => {
211 |     const slotExamplesWithWeirdKeywords = `
212 | %[intent]
213 | [ adfd] adf ~ @ asdfasdf asdf ~[alias_name ok] @[slot name#variation name?]
214 | ~ @~[alias_name ok]@[slot name#variation name?]
215 | @@~[alias_name ok]~~@[slot name#variation name?]
216 |
217 | @[slot name#variation name]
218 | 3 ~[daysOrHours]
219 | 5 ~[daysOrHours]
220 |
221 | ~[alias_name ok]
222 | as fast as possible
223 | quickly
224 |
225 | ~[daysOrHours]
226 | days
227 | hours
228 | `;
229 |     test('CORRECT parser output', () => {
230 |         let parsed = null;
231 |         let failure: any = null;
232 |         try {
233 |             parsed = chatitoParser.parse(slotExamplesWithWeirdKeywords);
234 |         } catch (err) {
235 |             // Wrap the thrown value; add the start position when the error carries one.
236 |             failure = err.location
237 |                 ? {
238 |                       error: err,
239 |                       location: { line: err.location.start.line, column: err.location.start.column }
240 |                   }
241 |                 : { error: err };
242 |         }
243 |         expect(failure).toBeNull();
244 |         const serialized = JSON.stringify(parsed, null, 2);
245 |         expect(serialized).toMatchSnapshot();
246 |     });
247 | });
245 |
246 | // An intent whose name contains ' + ' must parse and match its snapshot.
247 | describe('Example with multi intent', () => {
248 |     const multiIntentGrammar = `
249 | %[hi + bye]
250 | hi, i have to go, bye
251 | `;
252 |     test('CORRECT parser output', () => {
253 |         let parsed = null;
254 |         let failure: any = null;
255 |         try {
256 |             parsed = chatitoParser.parse(multiIntentGrammar);
257 |         } catch (err) {
258 |             // Wrap the thrown value; add the start position when the error carries one.
259 |             failure = err.location
260 |                 ? {
261 |                       error: err,
262 |                       location: { line: err.location.start.line, column: err.location.start.column }
263 |                   }
264 |                 : { error: err };
265 |         }
266 |         expect(failure).toBeNull();
267 |         const serialized = JSON.stringify(parsed, null, 2);
268 |         expect(serialized).toMatchSnapshot();
269 |     });
270 | });
269 |
270 | // Comment handling: '//' and '#' comment lines inside a grammar.
271 | describe('Example with comments spec', () => {
272 |     // '//' comments before, between and after the definitions.
273 |     const exampleWithCorrectComments = `
274 | // this is a comment
275 | %[ask_for_delivery]
276 | my parcel should be delivered in @[delivery_time#time_in_hours]
277 |
278 | // this is two
279 | // line comment
280 | @[delivery_time#time_in_hours]
281 | 3 days
282 | 5 hours
283 | // more comments here
284 | `;
285 |     test('CORRECT parser output for exampleWithCorrectComments', () => {
286 |         let parsed = null;
287 |         let failure: any = null;
288 |         try {
289 |             parsed = chatitoParser.parse(exampleWithCorrectComments);
290 |         } catch (err) {
291 |             // Wrap the thrown value; add the start position when the error carries one.
292 |             failure = err.location
293 |                 ? {
294 |                       error: err,
295 |                       location: { line: err.location.start.line, column: err.location.start.column }
296 |                   }
297 |                 : { error: err };
298 |         }
299 |         expect(failure).toBeNull();
300 |         const serialized = JSON.stringify(parsed, null, 2);
301 |         expect(serialized).toMatchSnapshot();
302 |     });
303 |
304 |     // A '#' comment line ahead of the intent.
305 |     const exampleWithCorrectHashComments = `
306 | #this is a comment
307 | %[ask_for_delivery]
308 | my parcel should be delivered in @[delivery_time#time_in_hours]
309 | `;
310 |     test('CORRECT parser output for exampleWithCorrectHashComments', () => {
311 |         let parsed = null;
312 |         let failure: any = null;
313 |         try {
314 |             parsed = chatitoParser.parse(exampleWithCorrectHashComments);
315 |         } catch (err) {
316 |             // Wrap the thrown value; add the start position when the error carries one.
317 |             failure = err.location
318 |                 ? {
319 |                       error: err,
320 |                       location: { line: err.location.start.line, column: err.location.start.column }
321 |                   }
322 |                 : { error: err };
323 |         }
324 |         expect(failure).toBeNull();
325 |         const serialized = JSON.stringify(parsed, null, 2);
326 |         expect(serialized).toMatchSnapshot();
327 |     });
328 |
329 |     // NOTE(review): the literal below is whitespace-sensitive; presumably the placement
330 |     // of its comment line is what makes it "wrong". Keep the literal untouched.
331 |     const exampleWithWrongComments = `
332 | // this is a comment
333 | %[ask_for_delivery]
334 | my parcel should be delivered in @[delivery_time#time_in_hours]
335 |
336 | @[delivery_time#time_in_hours]
337 | 3 days
338 | 5 hours
339 | `;
340 |     // The title says "CORRECT" although only the captured error is snapshotted; it is
341 |     // left unchanged because the stored snapshot is looked up by test title.
342 |     test('CORRECT parser output for exampleWithWrongComments', () => {
343 |         let failure: any = null;
344 |         try {
345 |             chatitoParser.parse(exampleWithWrongComments);
346 |         } catch (err) {
347 |             failure = err.location
348 |                 ? {
349 |                       error: err,
350 |                       location: { line: err.location.start.line, column: err.location.start.column }
351 |                   }
352 |                 : { error: err };
353 |         }
354 |         expect(failure).toMatchSnapshot();
355 |     });
356 | });
350 |
351 | // Sentences prefixed with '*[N]' (plain number) next to a sentence without a prefix.
352 | // The describe title (including its spelling) is part of the snapshot key.
353 | describe('Example with probability weighted opreator', () => {
354 |     const weightedGrammar = `
355 | %[greet]('training': '10', 'testing': '10')
356 | *[50] ~[phrase1]
357 | *[30] ~[phrase2] ~[phrase3?]
358 | ~[another phrase] ~[something] ~[something else?]
359 | `;
360 |     test('CORRECT parser output', () => {
361 |         let parsed = null;
362 |         let failure: any = null;
363 |         try {
364 |             parsed = chatitoParser.parse(weightedGrammar);
365 |         } catch (err) {
366 |             // Wrap the thrown value; add the start position when the error carries one.
367 |             failure = err.location
368 |                 ? {
369 |                       error: err,
370 |                       location: { line: err.location.start.line, column: err.location.start.column }
371 |                   }
372 |                 : { error: err };
373 |         }
374 |         expect(failure).toBeNull();
375 |         const serialized = JSON.stringify(parsed, null, 2);
376 |         expect(serialized).toMatchSnapshot();
377 |     });
378 | });
376 |
377 | // Sentences prefixed with '*[N%]' (percentage form) next to a sentence without a prefix.
378 | // The describe title (including its spelling) is part of the snapshot key.
379 | describe('Example with probability percentual opreator', () => {
380 |     const percentualGrammar = `
381 | %[greet]('training': '10', 'testing': '10')
382 | *[50%] ~[phrase1]
383 | *[30%] ~[phrase2] ~[phrase3?]
384 | ~[another phrase] ~[something] ~[something else?]
385 | `;
386 |     test('CORRECT parser output', () => {
387 |         let parsed = null;
388 |         let failure: any = null;
389 |         try {
390 |             parsed = chatitoParser.parse(percentualGrammar);
391 |         } catch (err) {
392 |             // Wrap the thrown value; add the start position when the error carries one.
393 |             failure = err.location
394 |                 ? {
395 |                       error: err,
396 |                       location: { line: err.location.start.line, column: err.location.start.column }
397 |                   }
398 |                 : { error: err };
399 |         }
400 |         expect(failure).toBeNull();
401 |         const serialized = JSON.stringify(parsed, null, 2);
402 |         expect(serialized).toMatchSnapshot();
403 |     });
404 | });
402 |
403 | // '*[5c0]' is not a number; per the title it should be parsed as plain text, not rejected.
404 | // The describe title (including its spelling) is part of the snapshot key.
405 | describe('Example with probability opreator but non int or float value parses as text', () => {
406 |     const nonNumericWeightGrammar = `
407 | %[greet]('training': '10', 'testing': '10')
408 | *[5c0] ~[phrase1]
409 | `;
410 |     test('CORRECT parser output', () => {
411 |         let parsed = null;
412 |         let failure: any = null;
413 |         try {
414 |             parsed = chatitoParser.parse(nonNumericWeightGrammar);
415 |         } catch (err) {
416 |             // Wrap the thrown value; add the start position when the error carries one.
417 |             failure = err.location
418 |                 ? {
419 |                       error: err,
420 |                       location: { line: err.location.start.line, column: err.location.start.column }
421 |                   }
422 |                 : { error: err };
423 |         }
424 |         expect(failure).toBeNull();
425 |         const serialized = JSON.stringify(parsed, null, 2);
426 |         expect(serialized).toMatchSnapshot();
427 |     });
428 | });
426 |
427 | // '*[50]' immediately followed by an alias, with no space in between.
428 | // The describe title (including its spelling) is part of the snapshot key.
429 | describe('Example with probability opreator but no after space parses correctly', () => {
430 |     const noSpaceAfterWeightGrammar = `
431 | %[greet]('training': '10', 'testing': '10')
432 | *[50]~[phrase1]
433 | `;
434 |     test('CORRECT parser output', () => {
435 |         let parsed = null;
436 |         let failure: any = null;
437 |         try {
438 |             parsed = chatitoParser.parse(noSpaceAfterWeightGrammar);
439 |         } catch (err) {
440 |             // Wrap the thrown value; add the start position when the error carries one.
441 |             failure = err.location
442 |                 ? {
443 |                       error: err,
444 |                       location: { line: err.location.start.line, column: err.location.start.column }
445 |                   }
446 |                 : { error: err };
447 |         }
448 |         expect(failure).toBeNull();
449 |         const serialized = JSON.stringify(parsed, null, 2);
450 |         expect(serialized).toMatchSnapshot();
451 |     });
452 | });
450 |
451 | // Non-ASCII (Chinese) text used as intent name, slot name, alias name and sentence text.
452 | describe('Example with international language characters', () => {
453 |     const internationalGrammar = `
454 | %[中文]
455 | 中文 @[中文] ~[中文]
456 |
457 | @[中文]
458 | 中文
459 | `;
460 |     test('CORRECT parser output', () => {
461 |         let parsed = null;
462 |         let failure: any = null;
463 |         try {
464 |             parsed = chatitoParser.parse(internationalGrammar);
465 |         } catch (err) {
466 |             // Wrap the thrown value; add the start position when the error carries one.
467 |             failure = err.location
468 |                 ? {
469 |                       error: err,
470 |                       location: { line: err.location.start.line, column: err.location.start.column }
471 |                   }
472 |                 : { error: err };
473 |         }
474 |         expect(failure).toBeNull();
475 |         const serialized = JSON.stringify(parsed, null, 2);
476 |         expect(serialized).toMatchSnapshot();
477 |     });
478 | });
477 |
478 | // Two 'import' lines, preceded by blank lines, placed ahead of the first intent.
479 | describe('Example with import statement at start', () => {
480 |     const grammarWithImports = `
481 |
482 | import ../some/file.chatito
483 | import ../some/file.chatito
484 |
485 | %[greet]
486 | hey yo!
487 | `;
488 |     test('CORRECT parser output', () => {
489 |         let parsed = null;
490 |         let failure: any = null;
491 |         try {
492 |             parsed = chatitoParser.parse(grammarWithImports);
493 |         } catch (err) {
494 |             // Wrap the thrown value; add the start position when the error carries one.
495 |             failure = err.location
496 |                 ? {
497 |                       error: err,
498 |                       location: { line: err.location.start.line, column: err.location.start.column }
499 |                   }
500 |                 : { error: err };
501 |         }
502 |         expect(failure).toBeNull();
503 |         const serialized = JSON.stringify(parsed, null, 2);
504 |         expect(serialized).toMatchSnapshot();
505 |     });
506 | });
505 |
506 | // An alias definition that carries its own ('arg': 'val') argument list.
507 | describe('Example with alias arguments', () => {
508 |     const aliasArgumentsGrammar = `
509 | %[g]('training': '2', 'testing': '1')
510 | ~[g]
511 |
512 | ~[g]('arg': 'val')
513 | g1
514 | g2
515 | g3
516 | `;
517 |     test('CORRECT parser output', () => {
518 |         let parsed = null;
519 |         let failure: any = null;
520 |         try {
521 |             parsed = chatitoParser.parse(aliasArgumentsGrammar);
522 |         } catch (err) {
523 |             // Wrap the thrown value; add the start position when the error carries one.
524 |             failure = err.location
525 |                 ? {
526 |                       error: err,
527 |                       location: { line: err.location.start.line, column: err.location.start.column }
528 |                   }
529 |                 : { error: err };
530 |         }
531 |         expect(failure).toBeNull();
532 |         const serialized = JSON.stringify(parsed, null, 2);
533 |         expect(serialized).toMatchSnapshot();
534 |     });
535 | });
534 |
--------------------------------------------------------------------------------
/src/tests/bin.spec.ts:
--------------------------------------------------------------------------------
1 | import * as cp from 'child_process';
2 | import * as fs from 'fs';
3 | import * as path from 'path';
4 |
5 | // Runs the CLI on the large date-booking grammar with the default output format and
6 | // checks the size of the generated training and testing datasets.
7 | test('test npm command line generator for large example', () => {
8 |     const d = __dirname;
9 |     const generatedDir = path.resolve(`${d}/../../examples/dateBooking_large`);
10 |     const generatedTrainingFile = path.resolve(generatedDir, 'default_dataset_training.json');
11 |     const generatedTestingFile = path.resolve(generatedDir, 'default_dataset_testing.json');
12 |     const npmBin = path.resolve(`${d}/../bin.ts`);
13 |     const grammarFile = path.resolve(`${d}/../../examples/dateBooking_large.chatito`);
14 |     // Deletes the generated files and their directory when they exist.
15 |     const removeArtifacts = () => {
16 |         for (const file of [generatedTrainingFile, generatedTestingFile]) {
17 |             if (fs.existsSync(file)) {
18 |                 fs.unlinkSync(file);
19 |             }
20 |         }
21 |         if (fs.existsSync(generatedDir)) {
22 |             fs.rmdirSync(generatedDir);
23 |         }
24 |     };
25 |     removeArtifacts();
26 |     try {
27 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
28 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFile, `--outputPath=${generatedDir}`]);
29 |         expect(fs.existsSync(generatedDir)).toBeTruthy();
30 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
31 |         expect(fs.existsSync(generatedTestingFile)).toBeTruthy();
32 |         const trainingDataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
33 |         expect(trainingDataset).not.toBeNull();
34 |         expect(trainingDataset.bookRestaurantsAtDatetime).not.toBeNull();
35 |         expect(trainingDataset.bookRestaurantsAtDatetime.length).toEqual(1000);
36 |         const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8'));
37 |         expect(testingDataset).not.toBeNull();
38 |         expect(testingDataset.bookRestaurantsAtDatetime).not.toBeNull();
39 |         expect(testingDataset.bookRestaurantsAtDatetime.length).toEqual(100);
40 |     } finally {
41 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
42 |         removeArtifacts();
43 |     }
44 |     expect(fs.existsSync(generatedTrainingFile)).toBeFalsy();
45 |     expect(fs.existsSync(generatedTestingFile)).toBeFalsy();
46 |     expect(fs.existsSync(generatedDir)).toBeFalsy();
47 | });
40 |
41 | // Runs the CLI on the medium city-search grammar with the default output format and
42 | // checks the size of the generated training and testing datasets.
43 | test('test npm command line generator for medium example', () => {
44 |     const d = __dirname;
45 |     const generatedDir = path.resolve(`${d}/../../examples/citySearch_medium`);
46 |     const generatedTrainingFile = path.resolve(generatedDir, 'default_dataset_training.json');
47 |     const generatedTestingFile = path.resolve(generatedDir, 'default_dataset_testing.json');
48 |     const npmBin = path.resolve(`${d}/../bin.ts`);
49 |     const grammarFile = path.resolve(`${d}/../../examples/citySearch_medium.chatito`);
50 |     // Deletes the generated files and their directory when they exist.
51 |     const removeArtifacts = () => {
52 |         for (const file of [generatedTrainingFile, generatedTestingFile]) {
53 |             if (fs.existsSync(file)) {
54 |                 fs.unlinkSync(file);
55 |             }
56 |         }
57 |         if (fs.existsSync(generatedDir)) {
58 |             fs.rmdirSync(generatedDir);
59 |         }
60 |     };
61 |     removeArtifacts();
62 |     try {
63 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
64 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFile, `--outputPath=${generatedDir}`]);
65 |         expect(fs.existsSync(generatedDir)).toBeTruthy();
66 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
67 |         expect(fs.existsSync(generatedTestingFile)).toBeTruthy();
68 |         const trainingDataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
69 |         expect(trainingDataset).not.toBeNull();
70 |         expect(trainingDataset.findByCityAndCategory).not.toBeNull();
71 |         expect(trainingDataset.findByCityAndCategory.length).toEqual(1000);
72 |         const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8'));
73 |         expect(testingDataset).not.toBeNull();
74 |         expect(testingDataset.findByCityAndCategory).not.toBeNull();
75 |         expect(testingDataset.findByCityAndCategory.length).toEqual(100);
76 |     } finally {
77 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
78 |         removeArtifacts();
79 |     }
80 |     expect(fs.existsSync(generatedTrainingFile)).toBeFalsy();
81 |     expect(fs.existsSync(generatedTestingFile)).toBeFalsy();
82 |     expect(fs.existsSync(generatedDir)).toBeFalsy();
83 | });
76 |
77 | // Runs the CLI on the whole examples/ directory with the default output format and
78 | // checks that both example intents end up in the single generated dataset pair.
79 | test('test npm command line generator for process all directory examples', () => {
80 |     const d = __dirname;
81 |     const generatedDir = path.resolve(`${d}/../../examples/citySearch_medium`);
82 |     const generatedTrainingFile = path.resolve(generatedDir, 'default_dataset_training.json');
83 |     const generatedTestingFile = path.resolve(generatedDir, 'default_dataset_testing.json');
84 |     const npmBin = path.resolve(`${d}/../bin.ts`);
85 |     const grammarFiles = path.resolve(`${d}/../../examples/`);
86 |     // Deletes the generated files and their directory when they exist.
87 |     const removeArtifacts = () => {
88 |         for (const file of [generatedTrainingFile, generatedTestingFile]) {
89 |             if (fs.existsSync(file)) {
90 |                 fs.unlinkSync(file);
91 |             }
92 |         }
93 |         if (fs.existsSync(generatedDir)) {
94 |             fs.rmdirSync(generatedDir);
95 |         }
96 |     };
97 |     removeArtifacts();
98 |     try {
99 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
100 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFiles, `--outputPath=${generatedDir}`]);
101 |         expect(fs.existsSync(generatedDir)).toBeTruthy();
102 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
103 |         expect(fs.existsSync(generatedTestingFile)).toBeTruthy();
104 |         const trainingDataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
105 |         expect(trainingDataset).not.toBeNull();
106 |         expect(trainingDataset.findByCityAndCategory).not.toBeNull();
107 |         expect(trainingDataset.findByCityAndCategory.length).toEqual(1000);
108 |         expect(trainingDataset.bookRestaurantsAtDatetime).not.toBeNull();
109 |         expect(trainingDataset.bookRestaurantsAtDatetime.length).toEqual(1000);
110 |         const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8'));
111 |         expect(testingDataset).not.toBeNull();
112 |         expect(testingDataset.findByCityAndCategory).not.toBeNull();
113 |         expect(testingDataset.findByCityAndCategory.length).toEqual(100);
114 |         expect(testingDataset.bookRestaurantsAtDatetime).not.toBeNull();
115 |         expect(testingDataset.bookRestaurantsAtDatetime.length).toEqual(100);
116 |     } finally {
117 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
118 |         removeArtifacts();
119 |     }
120 |     expect(fs.existsSync(generatedTrainingFile)).toBeFalsy();
121 |     expect(fs.existsSync(generatedTestingFile)).toBeFalsy();
122 |     expect(fs.existsSync(generatedDir)).toBeFalsy();
123 | });
116 |
117 | // Runs the CLI with --format=rasa on the medium city-search grammar and checks the
118 | // shape and size of the generated rasa training and testing datasets.
119 | test('test npm command line generator for rasa medium example', () => {
120 |     const d = __dirname;
121 |     const generatedTrainingFile = path.resolve(`${d}/../../examples/rasa_dataset_training.json`);
122 |     const generatedTestingFile = path.resolve(`${d}/../../examples/rasa_dataset_testing.json`);
123 |     const npmBin = path.resolve(`${d}/../bin.ts`);
124 |     const grammarFile = path.resolve(`${d}/../../examples/citySearch_medium.chatito`);
125 |     // Deletes the generated files when they exist.
126 |     const removeArtifacts = () => {
127 |         for (const file of [generatedTrainingFile, generatedTestingFile]) {
128 |             if (fs.existsSync(file)) {
129 |                 fs.unlinkSync(file);
130 |             }
131 |         }
132 |     };
133 |     removeArtifacts();
134 |     try {
135 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
136 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFile, '--format=rasa', `--outputPath=${d}/../../examples`]);
137 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
138 |         const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
139 |         expect(dataset).not.toBeNull();
140 |         expect(dataset.rasa_nlu_data).not.toBeNull();
141 |         expect(dataset.rasa_nlu_data.entity_synonyms).not.toBeNull();
142 |         expect(dataset.rasa_nlu_data.entity_synonyms.length).toEqual(3);
143 |         expect(dataset.rasa_nlu_data.common_examples).not.toBeNull();
144 |         expect(dataset.rasa_nlu_data.common_examples.length).toEqual(1000);
145 |         expect(fs.existsSync(generatedTestingFile)).toBeTruthy();
146 |         const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8'));
147 |         expect(testingDataset).not.toBeNull();
148 |         expect(testingDataset.rasa_nlu_data).not.toBeNull();
149 |         expect(testingDataset.rasa_nlu_data.common_examples).not.toBeNull();
150 |         expect(testingDataset.rasa_nlu_data.common_examples.length).toEqual(100);
151 |     } finally {
152 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
153 |         removeArtifacts();
154 |     }
155 |     expect(fs.existsSync(generatedTrainingFile)).toBeFalsy();
156 |     expect(fs.existsSync(generatedTestingFile)).toBeFalsy();
157 | });
149 |
150 | // Runs the CLI with --format=rasa on the whole examples/ directory and checks the
151 | // shape and size of the combined rasa training and testing datasets.
152 | test('test npm command line generator for rasa directory examples', () => {
153 |     const d = __dirname;
154 |     const generatedTrainingFile = path.resolve(`${d}/../../examples/rasa_dataset_training.json`);
155 |     const generatedTestingFile = path.resolve(`${d}/../../examples/rasa_dataset_testing.json`);
156 |     const npmBin = path.resolve(`${d}/../bin.ts`);
157 |     const grammarFile = path.resolve(`${d}/../../examples`);
158 |     // Deletes the generated files when they exist.
159 |     const removeArtifacts = () => {
160 |         for (const file of [generatedTrainingFile, generatedTestingFile]) {
161 |             if (fs.existsSync(file)) {
162 |                 fs.unlinkSync(file);
163 |             }
164 |         }
165 |     };
166 |     removeArtifacts();
167 |     try {
168 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
169 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFile, '--format=rasa', `--outputPath=${d}/../../examples`]);
170 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
171 |         const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
172 |         expect(dataset).not.toBeNull();
173 |         expect(dataset.rasa_nlu_data).not.toBeNull();
174 |         expect(dataset.rasa_nlu_data.common_examples).not.toBeNull();
175 |         expect(dataset.rasa_nlu_data.common_examples.length).toEqual(2030);
176 |         expect(dataset.rasa_nlu_data.entity_synonyms).not.toBeNull();
177 |         expect(dataset.rasa_nlu_data.entity_synonyms.length).toEqual(3);
178 |         expect(fs.existsSync(generatedTestingFile)).toBeTruthy();
179 |         const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8'));
180 |         expect(testingDataset).not.toBeNull();
181 |         expect(testingDataset.rasa_nlu_data).not.toBeNull();
182 |         expect(testingDataset.rasa_nlu_data.common_examples).not.toBeNull();
183 |         expect(testingDataset.rasa_nlu_data.common_examples.length).toEqual(200);
184 |     } finally {
185 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
186 |         removeArtifacts();
187 |     }
188 |     expect(fs.existsSync(generatedTrainingFile)).toBeFalsy();
189 |     expect(fs.existsSync(generatedTestingFile)).toBeFalsy();
190 | });
182 |
183 | // Runs the CLI with --format=snips on the medium city-search grammar and checks the
184 | // shape and size of the generated snips training dataset and of the testing dataset.
185 | test('test npm command line generator for snips medium example', () => {
186 |     const d = __dirname;
187 |     const generatedTrainingFile = path.resolve(`${d}/../../examples/snips_dataset_training.json`);
188 |     const generatedTestingFile = path.resolve(`${d}/../../examples/snips_dataset_testing.json`);
189 |     const npmBin = path.resolve(`${d}/../bin.ts`);
190 |     const grammarFile = path.resolve(`${d}/../../examples/citySearch_medium.chatito`);
191 |     // Deletes the generated files when they exist.
192 |     const removeArtifacts = () => {
193 |         for (const file of [generatedTrainingFile, generatedTestingFile]) {
194 |             if (fs.existsSync(file)) {
195 |                 fs.unlinkSync(file);
196 |             }
197 |         }
198 |     };
199 |     removeArtifacts();
200 |     try {
201 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
202 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFile, '--format=snips', `--outputPath=${d}/../../examples`]);
203 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
204 |         const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
205 |         expect(dataset).not.toBeNull();
206 |         expect(dataset.entities).not.toBeNull();
207 |         expect(dataset.entities.location).not.toBeNull();
208 |         expect(dataset.entities.location.data).not.toBeNull();
209 |         expect(dataset.entities.location.data.length).toEqual(3);
210 |         expect(dataset.intents).not.toBeNull();
211 |         expect(dataset.intents.findByCityAndCategory).not.toBeNull();
212 |         expect(dataset.intents.findByCityAndCategory.utterances).not.toBeNull();
213 |         expect(dataset.intents.findByCityAndCategory.utterances.length).toEqual(1000);
214 |         expect(fs.existsSync(generatedTestingFile)).toBeTruthy();
215 |         // The testing file is read with top-level intent keys, unlike the training file.
216 |         const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8'));
217 |         expect(testingDataset).not.toBeNull();
218 |         expect(testingDataset.findByCityAndCategory).not.toBeNull();
219 |         expect(testingDataset.findByCityAndCategory.length).toEqual(100);
220 |     } finally {
221 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
222 |         removeArtifacts();
223 |     }
224 |     expect(fs.existsSync(generatedTrainingFile)).toBeFalsy();
225 |     expect(fs.existsSync(generatedTestingFile)).toBeFalsy();
226 | });
217 |
218 | // Runs the CLI with --format=snips on the whole examples/ directory and checks that
219 | // both example intents are present with the expected sizes.
220 | test('test npm command line generator for snips all examples', () => {
221 |     const d = __dirname;
222 |     const generatedTrainingFile = path.resolve(`${d}/../../examples/snips_dataset_training.json`);
223 |     const generatedTestingFile = path.resolve(`${d}/../../examples/snips_dataset_testing.json`);
224 |     const npmBin = path.resolve(`${d}/../bin.ts`);
225 |     const grammarFile = path.resolve(`${d}/../../examples`);
226 |     // Deletes the generated files when they exist.
227 |     const removeArtifacts = () => {
228 |         for (const file of [generatedTrainingFile, generatedTestingFile]) {
229 |             if (fs.existsSync(file)) {
230 |                 fs.unlinkSync(file);
231 |             }
232 |         }
233 |     };
234 |     removeArtifacts();
235 |     try {
236 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
237 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFile, '--format=snips', `--outputPath=${d}/../../examples`]);
238 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
239 |         const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
240 |         expect(dataset).not.toBeNull();
241 |         expect(dataset.entities).not.toBeNull();
242 |         expect(dataset.entities.location).not.toBeNull();
243 |         expect(dataset.entities.location.data).not.toBeNull();
244 |         expect(dataset.entities.location.data.length).toEqual(3);
245 |         expect(dataset.intents).not.toBeNull();
246 |         expect(dataset.intents.findByCityAndCategory).not.toBeNull();
247 |         expect(dataset.intents.findByCityAndCategory.utterances).not.toBeNull();
248 |         expect(dataset.intents.findByCityAndCategory.utterances.length).toEqual(1000);
249 |         expect(dataset.intents.bookRestaurantsAtDatetime).not.toBeNull();
250 |         expect(dataset.intents.bookRestaurantsAtDatetime.utterances).not.toBeNull();
251 |         expect(dataset.intents.bookRestaurantsAtDatetime.utterances.length).toEqual(1000);
252 |         expect(fs.existsSync(generatedTestingFile)).toBeTruthy();
253 |         // The testing file is read with top-level intent keys, unlike the training file.
254 |         const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8'));
255 |         expect(testingDataset).not.toBeNull();
256 |         expect(testingDataset.findByCityAndCategory).not.toBeNull();
257 |         expect(testingDataset.findByCityAndCategory.length).toEqual(100);
258 |         expect(testingDataset.bookRestaurantsAtDatetime).not.toBeNull();
259 |         expect(testingDataset.bookRestaurantsAtDatetime.length).toEqual(100);
260 |     } finally {
261 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
262 |         removeArtifacts();
263 |     }
264 |     expect(fs.existsSync(generatedTrainingFile)).toBeFalsy();
265 |     expect(fs.existsSync(generatedTestingFile)).toBeFalsy();
266 | });
257 |
258 | // Runs the CLI with --format=luis on the medium city-search grammar and checks the
259 | // size of the `data` array in the generated training and testing datasets.
260 | test('test npm command line generator for luis medium example', () => {
261 |     const d = __dirname;
262 |     const generatedTrainingFile = path.resolve(`${d}/../../examples/luis_dataset_training.json`);
263 |     const generatedTestingFile = path.resolve(`${d}/../../examples/luis_dataset_testing.json`);
264 |     const npmBin = path.resolve(`${d}/../bin.ts`);
265 |     const grammarFile = path.resolve(`${d}/../../examples/citySearch_medium.chatito`);
266 |     // Deletes the generated files when they exist.
267 |     const removeArtifacts = () => {
268 |         for (const file of [generatedTrainingFile, generatedTestingFile]) {
269 |             if (fs.existsSync(file)) {
270 |                 fs.unlinkSync(file);
271 |             }
272 |         }
273 |     };
274 |     removeArtifacts();
275 |     try {
276 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
277 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFile, '--format=luis', `--outputPath=${d}/../../examples`]);
278 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
279 |         const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
280 |         expect(dataset).not.toBeNull();
281 |         expect(dataset.data).not.toBeNull();
282 |         expect(dataset.data.length).toEqual(1000);
283 |         expect(fs.existsSync(generatedTestingFile)).toBeTruthy();
284 |         const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8'));
285 |         expect(testingDataset).not.toBeNull();
286 |         expect(testingDataset.data).not.toBeNull();
287 |         expect(testingDataset.data.length).toEqual(100);
288 |     } finally {
289 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
290 |         removeArtifacts();
291 |     }
292 |     expect(fs.existsSync(generatedTrainingFile)).toBeFalsy();
293 |     expect(fs.existsSync(generatedTestingFile)).toBeFalsy();
294 | });
286 |
287 | // Runs the CLI with --format=luis on the whole examples/ directory and checks the
288 | // size of the `data` array in the combined training and testing datasets.
289 | test('test npm command line generator for luis directory examples', () => {
290 |     const d = __dirname;
291 |     const generatedTrainingFile = path.resolve(`${d}/../../examples/luis_dataset_training.json`);
292 |     const generatedTestingFile = path.resolve(`${d}/../../examples/luis_dataset_testing.json`);
293 |     const npmBin = path.resolve(`${d}/../bin.ts`);
294 |     const grammarFile = path.resolve(`${d}/../../examples`);
295 |     // Deletes the generated files when they exist.
296 |     const removeArtifacts = () => {
297 |         for (const file of [generatedTrainingFile, generatedTestingFile]) {
298 |             if (fs.existsSync(file)) {
299 |                 fs.unlinkSync(file);
300 |             }
301 |         }
302 |     };
303 |     removeArtifacts();
304 |     try {
305 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
306 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFile, '--format=luis', `--outputPath=${d}/../../examples`]);
307 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
308 |         const dataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
309 |         expect(dataset).not.toBeNull();
310 |         expect(dataset.data).not.toBeNull();
311 |         expect(dataset.data.length).toEqual(2030);
312 |         expect(fs.existsSync(generatedTestingFile)).toBeTruthy();
313 |         const testingDataset = JSON.parse(fs.readFileSync(generatedTestingFile, 'utf8'));
314 |         expect(testingDataset).not.toBeNull();
315 |         expect(testingDataset.data).not.toBeNull();
316 |         expect(testingDataset.data.length).toEqual(200);
317 |     } finally {
318 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
319 |         removeArtifacts();
320 |     }
321 |     expect(fs.existsSync(generatedTrainingFile)).toBeFalsy();
322 |     expect(fs.existsSync(generatedTestingFile)).toBeFalsy();
323 | });
315 |
316 | // Runs the CLI on examples/importing/main.chatito with the default output format and
317 | // checks the size of the generated `greet` training dataset.
318 | test('test npm command line generator for imports example', () => {
319 |     const d = __dirname;
320 |     const generatedDir = path.resolve(`${d}/../../examples/importing/main`);
321 |     const generatedTrainingFile = path.resolve(generatedDir, 'default_dataset_training.json');
322 |     const npmBin = path.resolve(`${d}/../bin.ts`);
323 |     const grammarFile = path.resolve(`${d}/../../examples/importing/main.chatito`);
324 |     // Deletes the generated training file and its directory when they exist.
325 |     const removeArtifacts = () => {
326 |         if (fs.existsSync(generatedTrainingFile)) {
327 |             fs.unlinkSync(generatedTrainingFile);
328 |         }
329 |         if (fs.existsSync(generatedDir)) {
330 |             fs.rmdirSync(generatedDir);
331 |         }
332 |     };
333 |     removeArtifacts();
334 |     try {
335 |         // No shell is involved, so paths containing spaces or shell metacharacters are passed intact.
336 |         cp.execFileSync('node', ['-r', 'ts-node/register', npmBin, grammarFile, `--outputPath=${generatedDir}`]);
337 |         expect(fs.existsSync(generatedDir)).toBeTruthy();
338 |         expect(fs.existsSync(generatedTrainingFile)).toBeTruthy();
339 |         const trainingDataset = JSON.parse(fs.readFileSync(generatedTrainingFile, 'utf8'));
340 |         expect(trainingDataset).not.toBeNull();
341 |         expect(trainingDataset.greet).not.toBeNull();
342 |         expect(trainingDataset.greet.length).toEqual(30);
343 |     } finally {
344 |         // Clean up even when an assertion above fails, so no artifacts are left in examples/.
345 |         removeArtifacts();
346 |     }
347 | });
338 |
// Runs the CLI with `--format=flair` against the medium city-search grammar and
// checks that the four flair dataset files are written and are not empty.
test('test npm command line generator for flair medium example', () => {
    const examplesDir = path.resolve(__dirname, '../../examples');
    const generatedFiles = [
        'classification_flair_dataset_training.txt',
        'classification_flair_dataset_testing.txt',
        'ner_flair_dataset_training.txt',
        'ner_flair_dataset_testing.txt'
    ].map(name => path.resolve(examplesDir, name));
    const npmBin = path.resolve(__dirname, '../bin.ts');
    const grammarFile = path.resolve(examplesDir, 'citySearch_medium.chatito');
    const removeIfPresent = (file: string) => {
        if (fs.existsSync(file)) {
            fs.unlinkSync(file);
        }
    };
    // start from a clean slate so stale output can't satisfy the assertions
    generatedFiles.forEach(removeIfPresent);
    try {
        // paths are quoted so a checkout directory containing spaces still works
        cp.execSync(`node -r ts-node/register "${npmBin}" "${grammarFile}" --format=flair --outputPath="${examplesDir}"`);
        for (const file of generatedFiles) {
            expect(fs.existsSync(file)).toBeTruthy();
            const contents = fs.readFileSync(file, 'utf8');
            expect(contents).not.toBeNull();
            expect(contents.length).toBeGreaterThan(0);
        }
    } finally {
        // always clean up, even when one of the assertions above fails
        generatedFiles.forEach(removeIfPresent);
    }
    for (const file of generatedFiles) {
        expect(fs.existsSync(file)).toBeFalsy();
    }
});
392 |
--------------------------------------------------------------------------------
/web/components/Editor/Editor.tsx:
--------------------------------------------------------------------------------
1 | import { saveAs } from 'file-saver';
2 | import * as React from 'react';
3 | import * as luisAdapter from '../../../src/adapters/luis';
4 | import * as rasaAdapter from '../../../src/adapters/rasa';
5 | import * as snipsAdapter from '../../../src/adapters/snips';
6 | import * as webAdapter from '../../../src/adapters/web';
7 | import * as chatito from '../../../src/main';
8 | import * as utils from '../../../src/utils';
9 | import { chatitoPrism, rasaDefaultOptions, snipsDefaultOptions, tabs } from '../../lib/editorConfig';
10 | import { debounce } from '../../lib/utils';
11 | import * as es from './editorStyles';
12 |
13 | const logger = console;
14 |
15 | const adapters = {
16 | default: webAdapter,
17 | rasa: rasaAdapter,
18 | snips: snipsAdapter,
19 | luis: luisAdapter
20 | };
21 |
22 | interface IEditorState {
23 | error: null | string;
24 | warning: null | string;
25 | activeTabIndex: number;
26 | showDrawer: boolean;
27 | dataset: any;
28 | adapterOptions: any;
29 | currentAdapter: 'default' | 'rasa' | 'snips' | 'luis';
30 | useCustomOptions: boolean;
31 | frequencyDistribution: chatito.distributionType;
32 | autoAliases: chatito.autoAliasesType;
33 | }
34 |
35 | type IDataset = webAdapter.IDefaultDataset | snipsAdapter.ISnipsDataset | rasaAdapter.IRasaDataset | luisAdapter.ILuisDataset;
36 |
37 | // NOTE: for SSR, wrap the require in check for window
38 | let CodeFlask = null;
39 | let ReactJson = null;
40 | if (typeof window !== `undefined`) {
41 | // tslint:disable-next-line:no-var-requires
42 | CodeFlask = require('codeflask').default;
43 | // tslint:disable-next-line:no-var-requires
44 | ReactJson = require('react-json-view').default;
45 | }
46 |
47 | export default class Editor extends React.Component<{}, IEditorState> {
48 | public state: IEditorState = {
49 | error: null,
50 | warning: null,
51 | activeTabIndex: 0,
52 | showDrawer: false,
53 | dataset: null,
54 | adapterOptions: null,
55 | currentAdapter: 'default',
56 | useCustomOptions: false,
57 | frequencyDistribution: 'regular',
58 | autoAliases: 'allow'
59 | };
60 | private tabsContainer = React.createRef() as React.RefObject;
61 | private codeflask = null;
62 | private editorUpdatesSetupCount = 0;
63 | private codeInputValue = '';
64 | private tabs: Array<{ title: string; value: string }> = [];
65 |
// Validates the code currently held in `codeInputValue` (wrapped in `debounce(..., 300)`).
// Empty input only clears a previously shown error/warning; otherwise the
// validation outcome is stored in state and the tabs are persisted.
private debouncedTabDSLValidation = debounce(() => {
    if (!this.codeInputValue.length) {
        const hasAlert = this.state.error || this.state.warning;
        if (hasAlert) {
            this.setState({ error: null, warning: null });
        }
        return;
    }
    const result = this.getDSLValidation(this.codeInputValue);
    // an error takes precedence over a warning; at most one of them is set
    const error = (result && result.error) || null;
    const warning = error ? null : (result && result.warning) || null;
    this.setState({ error, warning }, () => {
        this.saveToLocalStorage(true, false, false);
    });
}, 300);
86 |
// Creates the CodeFlask editor once the persisted state has been loaded.
// Does nothing when CodeFlask was not required (no `window`).
public componentDidMount() {
    if (!CodeFlask) {
        return;
    }
    this.loadFromLocalStorage(() => {
        const editor = new CodeFlask('#my-code-editor', {
            language: 'chatito',
            lineNumbers: true
        });
        editor.addLanguage('chatito', chatitoPrism);
        editor.onUpdate(code => {
            const activeTab = this.tabs && this.tabs[this.state.activeTabIndex];
            if (!activeTab) {
                return;
            }
            this.codeInputValue = code;
            activeTab.value = code;
            // NOTE: ugly hack to know when codeflask is mounted (it makes 2 calls to update on mount)
            if (this.editorUpdatesSetupCount < 2) {
                this.editorUpdatesSetupCount++;
                return;
            }
            this.setState({ dataset: null });
            this.debouncedTabDSLValidation();
        });
        const initialTab = this.tabs && this.tabs[this.state.activeTabIndex];
        if (initialTab) {
            editor.updateCode(initialTab.value);
        }
        editor.setLineNumber();
        this.codeflask = editor;
    });
}
118 |
119 | public render() {
120 | const alertState = !!this.state.error ? 'error' : !!this.state.warning ? 'warning' : 'success';
121 | return (
122 |
123 |
124 | {this.tabs.map(this.renderTabButton)}
125 |
126 | New file
127 |
128 |
129 | Generate Dataset
130 |
131 |
132 |
133 |
134 | {' '}
135 | {this.state.error || this.state.warning || `Correct syntax!`}
136 |
137 |
138 | e.stopPropagation()} showDrawer={this.state.showDrawer}>
139 | x
140 | {this.renderDatasetGeneratorSettings()}
141 | {this.renderDatasetPreviewer()}
142 |
143 |
144 |
145 | );
146 | }
147 |
148 | /* ================== Renderers ================== */
149 | private renderDatasetGeneratorSettings = () => {
150 | return (
151 |
152 | Dataset generation settings
153 |
154 |
155 |
156 |
157 |
168 |
169 |
170 |
171 |
172 |
173 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
196 |
197 |
198 |
199 |
200 |
201 |
211 |
212 |
213 |
214 |
220 | {this.renderEditAdapterOptions()}
221 |
222 | Generate and download dataset!
223 |
224 |
225 | );
226 | };
227 |
228 | private renderEditAdapterOptions = () => {
229 | if (!this.state.useCustomOptions || !ReactJson) {
230 | return null;
231 | }
232 | return (
233 |
234 |
235 | Edit the adapter custom initial options:
236 |
237 |
238 |
251 |
252 |
253 | );
254 | };
255 |
256 | private renderDatasetPreviewer = () => {
257 | if (!this.state.dataset || !ReactJson) {
258 | return null;
259 | }
260 | return (
261 |
262 | Review the generated training dataset
263 |
273 |
274 | );
275 | };
276 |
277 | private renderTabButton = (t, i) => {
278 | const changeTab = () => this.changeTab(i);
279 | const onCloseTab = this.closerTab(i);
280 | return (
281 |
282 | {t.title}
283 |
284 |
285 | );
286 | };
287 |
288 | /* ================== Event Handlers ================== */
// Hides the drawer and discards the dataset held in state.
private onCloseDrawer = () => {
    this.setState({ showDrawer: false, dataset: null });
};
290 |
// Toggles custom adapter options: resets `adapterOptions` to a copy of the
// current adapter's defaults (empty object for adapters without defaults),
// clears the dataset and persists the settings.
private onCustomOptionsCheckboxChange = e => {
    const adapterName = this.state.currentAdapter;
    const defaults = adapterName === 'rasa' ? rasaDefaultOptions : adapterName === 'snips' ? snipsDefaultOptions : {};
    const adapterOptions = Object.assign({}, defaults);
    this.setState({ useCustomOptions: e.target.checked, adapterOptions, dataset: null }, () => {
        this.saveToLocalStorage(false, true, true);
    });
};
302 |
// Switches the current adapter to the selected value, resets `adapterOptions`
// to a copy of that adapter's defaults (empty object when it has none),
// clears the dataset and persists the settings.
private onAdapterChange = e => {
    const selected = e.target.value;
    const defaults = selected === 'rasa' ? rasaDefaultOptions : selected === 'snips' ? snipsDefaultOptions : {};
    const adapterOptions = Object.assign({}, defaults);
    this.setState({ currentAdapter: selected, adapterOptions, dataset: null }, () => {
        this.saveToLocalStorage(false, true, true);
    });
};
314 |
// Stores the selected frequency distribution (anything other than 'even'
// falls back to 'regular'), clears the dataset and persists the settings.
private onDistributionChange = e => {
    const frequencyDistribution: chatito.distributionType = e.target.value === 'even' ? 'even' : 'regular';
    this.setState({ frequencyDistribution, dataset: null }, () => this.saveToLocalStorage(false, true, true));
};
324 |
// Stores the selected auto-aliases mode; values outside
// `chatito.VALID_AUTO_ALIASES` are ignored.
private onAutoAliasesChange = (e: React.ChangeEvent) => {
    const isKnownMode = (chatito.VALID_AUTO_ALIASES as readonly string[]).includes(e.target.value);
    if (!isKnownMode) {
        return;
    }
    const autoAliases = e.target.value as chatito.autoAliasesType;
    this.setState({ autoAliases, dataset: null }, () => this.saveToLocalStorage(false, true, true));
};
336 |
// Applies an edit to the adapter options.
// Returns null after storing `changes.updated_src`, or false when there is nothing to store.
private onEditAdapterOptions = changes => {
    if (!changes || !changes.updated_src) {
        return false;
    }
    this.setState({ adapterOptions: changes.updated_src }, () => {
        this.saveToLocalStorage(false, true, false);
    });
    return null;
};
346 |
// Asks for a file name (defaulting to 'newFile'), appends an empty tab for it,
// activates that tab and scrolls the tab bar to its end.
private onAddFile = () => {
    const suggestedName = 'newFile';
    const filename = window && window.prompt ? prompt('Please enter the new .chatito file name:', suggestedName) : suggestedName;
    if (!filename) {
        // prompt was cancelled or left empty
        return;
    }
    this.tabs.push({ title: `${filename}.chatito`, value: '' });
    const scrollTabsToEnd = () => {
        this.tabsContainer.current.scrollTo({
            left: this.tabsContainer.current.scrollWidth,
            behavior: 'smooth'
        });
    };
    this.changeTab(this.tabs.length - 1, scrollTabsToEnd);
};
362 |
// Opens the drawer when every tab validates; otherwise alerts the user.
// Does nothing when the drawer is already shown.
private onToggleDrawer = async () => {
    if (this.state.showDrawer) {
        return;
    }
    if (!this.validateChatitoFiles()) {
        if (window && window.alert) {
            window.alert('Please fix the errors or warnings found in the code.');
        }
        return;
    }
    try {
        this.setState({ showDrawer: !this.state.showDrawer });
    } catch (e) {
        return;
    }
};
378 |
379 | /* ================== Utils ================== */
380 |
// Persists the selected parts of the editor to localStorage:
// tabs, adapter options (plus distribution and auto-aliases), and/or the current adapter.
private saveToLocalStorage = (saveTabs, saveAdapterOptions, saveCurrentAdapter) => {
    if (!(window && localStorage)) {
        return;
    }
    const { useCustomOptions, adapterOptions, frequencyDistribution, autoAliases, currentAdapter } = this.state;
    if (saveTabs) {
        localStorage.setItem('___tabs', JSON.stringify(this.tabs));
    }
    if (saveAdapterOptions) {
        // an empty string is stored when custom options are disabled
        const serializedOptions = useCustomOptions ? JSON.stringify(adapterOptions) : '';
        localStorage.setItem('___adapterOptions', serializedOptions);
        localStorage.setItem('___defaultDistribution', frequencyDistribution);
        localStorage.setItem('___autoAliases', autoAliases);
    }
    if (saveCurrentAdapter) {
        localStorage.setItem('___currentAdapter', currentAdapter);
    }
};
396 |
// Reads `key` from localStorage. Returns the raw item when `parseAsJSON` is
// false, the parsed value when it is true, and undefined when the item is
// missing/empty, unparsable, or reading fails (read failures are logged).
private loadFromLocalIfPresent = (key: string, parseAsJSON: boolean) => {
    if (!(window && localStorage)) {
        return;
    }
    let stored: string | null;
    try {
        stored = localStorage.getItem(key);
    } catch (e) {
        logger.error(e);
        return;
    }
    if (!parseAsJSON) {
        return stored;
    }
    if (!stored) {
        return;
    }
    try {
        return JSON.parse(stored);
    } catch (e) {
        // invalid JSON is treated as "nothing stored"
        return;
    }
};
416 |
/**
 * Restores the tabs and generator settings saved in localStorage (using the
 * default `tabs` when none are stored) and then invokes `cb` exactly once.
 *
 * @param cb Called after the restored state has been applied through
 *           `setState`, or right away when localStorage is not available.
 */
private loadFromLocalStorage = (cb: () => void) => {
    if (!(window && localStorage)) {
        this.tabs = tabs;
        cb();
        return;
    }
    // work on a copy: component state must only change through setState
    const newState: IEditorState = Object.assign({}, this.state);
    const localTabs = this.loadFromLocalIfPresent('___tabs', true);
    const localAdapterOptions = this.loadFromLocalIfPresent('___adapterOptions', true);
    const localCurrentAdapter = this.loadFromLocalIfPresent('___currentAdapter', false);
    const localDefaultDistribution: string | undefined = this.loadFromLocalIfPresent('___defaultDistribution', false);
    const localAutoAliases: string | undefined = this.loadFromLocalIfPresent('___autoAliases', false);
    this.tabs = localTabs ? localTabs : tabs;
    if (localAdapterOptions) {
        // stored options only exist when custom options were enabled at save time
        newState.adapterOptions = localAdapterOptions;
        newState.useCustomOptions = true;
    }
    if (localCurrentAdapter) {
        newState.currentAdapter = localCurrentAdapter;
    }
    // stored strings are only trusted when they are one of the known values
    if (localDefaultDistribution && (chatito.VALID_DISTRIBUTIONS as readonly string[]).includes(localDefaultDistribution)) {
        newState.frequencyDistribution = localDefaultDistribution as chatito.distributionType;
    }
    if (localAutoAliases && (chatito.VALID_AUTO_ALIASES as readonly string[]).includes(localAutoAliases)) {
        newState.autoAliases = localAutoAliases as chatito.autoAliasesType;
    }
    // cb runs once, after the state update; it is not called a second time here
    this.setState(newState, cb);
};
445 |
// Activates tab `i`, loads its code into the editor and, if given,
// schedules `cb` afterwards.
private changeTab = (i: number, cb?: () => void) => {
    const showActiveTab = () => {
        const activeTab = this.tabs[this.state.activeTabIndex];
        this.codeflask.updateCode(activeTab.value);
        this.codeflask.setLineNumber();
        if (cb) {
            setTimeout(cb, 600); // note; hack using setTimeout because codeflask uses a timeout on update code
        }
    };
    this.setState({ activeTabIndex: i }, showActiveTab);
};
455 |
/**
 * Builds the close handler for tab `i`.
 *
 * The handler asks for confirmation when the tab has content, removes the tab,
 * keeps at least one (empty) tab around, persists the tabs and re-selects the
 * active tab.
 *
 * @param i Index of the tab the returned handler closes
 */
private closerTab = (i: number) => {
    return (e: React.SyntheticEvent) => {
        if (e) {
            e.stopPropagation();
        }
        if (this.tabs[i].value) {
            if (!window.confirm(`Do you really want to remove '${this.tabs[i].title}'?`)) {
                return;
            }
        }
        const ati = this.state.activeTabIndex;
        let newActiveTabIndex = ati;
        // Removing the active tab, or any tab before it, moves the selection one
        // slot to the left. Without the `i < ati` case the index would keep
        // pointing one past the tab that was active (or past the end of the list).
        if (i <= ati && ati > 0) {
            newActiveTabIndex = ati - 1;
        }
        this.tabs = [...this.tabs.slice(0, i), ...this.tabs.slice(i + 1)];
        if (!this.tabs.length) {
            // never leave the editor without a tab
            this.tabs.push({ title: 'newFile.chatito', value: '' });
            newActiveTabIndex = 0;
        }
        this.saveToLocalStorage(true, false, false);
        this.changeTab(newActiveTabIndex);
    };
};
480 |
/**
 * Parses `dsl` and reports the first problem found.
 *
 * @param dsl Chatito source to validate
 * @returns null when the code parses and every intent has arguments,
 *          `{ warning }` when some intent definition has `args === null`,
 *          `{ error }` when parsing throws
 */
private getDSLValidation = (dsl: string): null | { error?: string; warning?: string } => {
    try {
        const ast = chatito.astFromString(dsl);
        const intentsWithoutLimit = ast.filter(entity => entity.type === 'IntentDefinition' && entity.args === null);
        if (intentsWithoutLimit.length) {
            return {
                warning: `Warning: Limit the number of generated examples for intents. E.g.: %[${intentsWithoutLimit[0].key}]('training': '100')`
            };
        }
        return null;
    } catch (e) {
        // Only exceptions that carry a source location get the line/column
        // suffix; anything else (plain Error, TypeError, thrown primitives) is
        // stringified instead of being dereferenced blindly.
        const start = e && e.location && e.location.start;
        const error =
            start && e.constructor !== Error
                ? `${e.name}: ${e.message} Line: ${start.line}, Column: ${start.column}`
                : String(e);
        return { error };
    }
};
499 |
// Returns true when every non-empty tab validates cleanly. Otherwise switches
// to the first tab with an error or warning and returns false.
private validateChatitoFiles = () => {
    const firstInvalidIndex = this.tabs.findIndex(tab => !!tab.value && this.getDSLValidation(tab.value) !== null);
    if (firstInvalidIndex === -1) {
        return true;
    }
    this.changeTab(firstInvalidIndex);
    return false;
};
512 |
// Resolves an import against the open tabs: `endPath` (minus a leading "./")
// must match a tab title. Throws when no tab matches.
private importFile = (startPath: string, endPath: string) => {
    const filename = endPath.replace(/^\.\//, '');
    for (const tab of this.tabs) {
        if (tab.title.trim() === filename) {
            // note: returning empty path since there is no actual filesystem
            return { filePath: '', dsl: tab.value };
        }
    }
    throw new Error(`Can't import ${endPath}. Not found.`);
};
522 |
// Runs the current adapter over every tab, accumulating the training dataset
// and merging the testing datasets, then downloads both as JSON files and
// stores the training dataset in state. On the first failing tab it closes the
// drawer, switches to that tab, shows the error and stops.
private generateDataset = async () => {
    const adapter = adapters[this.state.currentAdapter];
    if (!adapter) {
        return;
    }
    let dataset: IDataset | null = null;
    const testingDataset = {};
    chatito.config.defaultDistribution = this.state.frequencyDistribution;
    chatito.config.autoAliases = this.state.autoAliases;
    for (const [i, tab] of this.tabs.entries()) {
        try {
            const seedWithCustomOptions = dataset === null && this.state.useCustomOptions && this.state.adapterOptions;
            if (seedWithCustomOptions) {
                // deep copy so the adapter can't modify the options kept in state
                dataset = JSON.parse(JSON.stringify(this.state.adapterOptions));
            }
            const generated = await adapter.adapter(tab.value, dataset, this.importFile, '');
            dataset = generated.training;
            utils.mergeDeep(testingDataset, generated.testing);
        } catch (e) {
            const alertError = () => {
                if (window && window.alert) {
                    logger.log(e);
                    window.alert(`Please fix error: ${e.message}`);
                }
            };
            this.setState({ dataset: null, showDrawer: false }, () => {
                this.changeTab(i, () => this.setState({ error: e.message }, alertError));
            });
            return;
        }
    }
    const jsonType = { type: 'text/json;charset=utf-8' };
    const unixSeconds = () => Math.round(new Date().getTime() / 1000);
    const datasetBlob = new Blob([JSON.stringify(dataset)], jsonType);
    const testingBlob = new Blob([JSON.stringify(testingDataset)], jsonType);
    saveAs(datasetBlob, `training_dataset_${unixSeconds()}.json`);
    setTimeout(() => {
        saveAs(testingBlob, `testing_dataset_${unixSeconds()}.json`);
    }, 100); // note: timeout to allow multiple downloads at once
    this.setState({ dataset });
};
562 | }
563 |
--------------------------------------------------------------------------------
/src/main.ts:
--------------------------------------------------------------------------------
1 | import { Chance } from 'chance';
2 | import {
3 | IChatitoCache,
4 | IChatitoEntityAST,
5 | IChatitoParser,
6 | IEntities,
7 | IEntityDef,
8 | ISentenceTokens,
9 | IStatCache,
10 | IUtteranceWriter
11 | } from './types';
12 |
13 | const logger = console;
14 |
15 | export const VALID_DISTRIBUTIONS = ['regular', 'even'] as const;
16 | export const VALID_AUTO_ALIASES = ['allow', 'warn', 'restrict'] as const;
17 |
18 | export type distributionType = typeof VALID_DISTRIBUTIONS[number];
19 | export type autoAliasesType = typeof VALID_AUTO_ALIASES[number];
20 |
21 | export interface IConfigOptions {
22 | defaultDistribution?: distributionType;
23 | autoAliases?: autoAliasesType;
24 | }
25 |
// The live configuration always carries a concrete value for every option.
type Configuration = Required<IConfigOptions>;
27 |
28 | export const config: Configuration = {
29 | defaultDistribution: 'regular',
30 | autoAliases: 'allow'
31 | };
32 |
33 | // tslint:disable-next-line:no-var-requires
34 | const chatito = require('../parser/chatito') as IChatitoParser;
35 | const chance = new Chance();
36 |
/**
 * Builds the lookup key of the Alias/Slot that `token` refers to:
 * `value#variation` when the token has a variation, plain `value` otherwise.
 * @param token Sentence's token
 */
const getEntityKey = (token: ISentenceTokens) => {
    if (token.variation) {
        return `${token.value}#${token.variation}`;
    }
    return token.value;
};
42 |
/**
 * Normalises a generated sentence: merges adjacent Text tokens (collapsing
 * whitespace runs to one space), drops leading whitespace-only tokens and a
 * whitespace-only final token, and trims the sentence's outer whitespace.
 *
 * @param data Raw sentence tokens
 * @returns The cleaned token list
 * @throws Error when nothing is left after cleaning
 */
const chatitoFormatPostProcess = (data: ISentenceTokens[]) => {
    const tokens: ISentenceTokens[] = [];
    for (const current of data) {
        const previous = tokens[tokens.length - 1];
        if (!previous) {
            // nothing kept yet: skip tokens that are only whitespace
            if (current.value.trim()) {
                tokens.push(current);
            }
            continue;
        }
        if (previous.type === 'Text' && current.type === 'Text') {
            tokens[tokens.length - 1] = {
                type: previous.type,
                value: (previous.value + current.value).replace(/\s+/g, ' ')
            };
        } else {
            tokens.push(current);
        }
    }
    if (!tokens.length) {
        throw new Error(`Some sentence generated an empty string. Can't map empty to an intent.`);
    }
    // remove an empty string at the end, then trim the new last token's tail
    if (!tokens[tokens.length - 1].value.trim()) {
        tokens.pop();
    }
    const lastIndex = tokens.length - 1;
    tokens[lastIndex] = Object.assign({}, tokens[lastIndex], {
        value: tokens[lastIndex].value.replace(/\s+$/g, '')
    });
    tokens[0] = Object.assign({}, tokens[0], {
        value: tokens[0].value.replace(/^\s+/, '')
    });
    return tokens;
};
85 |
/**
 * Computes the selection weight of every sentence of an entity.
 *
 * Percentage mode: sentences with a defined probability keep it; the remaining
 * `100 - sumOfTotalProbabilitiesDefined` is split between the undefined ones,
 * either equally (even distribution) or proportionally to their max counts.
 *
 * Weighted mode: a defined weight is used as is (even distribution) or
 * multiplied by the sentence's max count; an undefined weight counts as 1
 * (even distribution) or as the sentence's max count.
 *
 * @param isPercentageProbability True when the defined probabilities are percentages
 * @param isEvenDistribution True for 'even' distribution, false for 'regular'
 * @param definedSentenceProbabilities Probability per sentence, null when not defined
 * @param sumOfTotalProbabilitiesDefined Sum of all defined probabilities
 * @param maxCounts Max count per sentence, index-aligned with the probabilities
 * @returns One weight per sentence, in the same order
 */
const calcSentencesProbabilities = (
    isPercentageProbability: boolean,
    isEvenDistribution: boolean,
    definedSentenceProbabilities: Array<number | null>,
    sumOfTotalProbabilitiesDefined: number,
    maxCounts: number[]
) => {
    let sentencesWithNullProbabilityCount = 0;
    let totalMaxCountsToShareBetweenNullProbs = 0;
    definedSentenceProbabilities.forEach((prob, i) => {
        if (prob === null) {
            sentencesWithNullProbabilityCount += 1;
            totalMaxCountsToShareBetweenNullProbs += maxCounts[i];
        }
    });
    let probabilities: number[];
    if (isPercentageProbability) {
        // if defined probabilities is percentual, then calculate each sentence chances in percent
        const undefinedShare = 100 - sumOfTotalProbabilitiesDefined;
        probabilities = definedSentenceProbabilities.map((p, i) => {
            if (p !== null) {
                return p;
            }
            if (isEvenDistribution) {
                return undefinedShare / sentencesWithNullProbabilityCount;
            }
            return (((maxCounts[i] * 100) / totalMaxCountsToShareBetweenNullProbs) * undefinedShare) / 100;
        });
    } else {
        // if probabilityTypeDefined is weighted, then multiply the weight by max counts
        probabilities = definedSentenceProbabilities.map((p, i) => {
            if (p !== null) {
                return isEvenDistribution ? p : maxCounts[i] * p;
            }
            if (isEvenDistribution) {
                return 1;
            }
            return maxCounts[i];
        });
    }
    return probabilities;
};
127 |
/**
 * Recursively generates one random variation (a list of sentence tokens) of
 * the entity `ed`. The entity's sentence stats (counts, max counts and
 * probabilities) are built on first use and stored in `cache` under
 * `<type>-<key>[#variation]`.
 *
 * @param ed Entity definition to generate a variation for
 * @param entities All entities definitions, used to resolve nested slots and aliases
 * @param optional When true, the empty variation may be returned instead of a sentence
 * @param cache Stats cache looked up and filled for `ed`
 * @returns The generated tokens, or an empty array when the optional empty sentence was picked
 * @throws Error when probability operators mix types, are out of range, or percentages sum above 100
 */
export const getVariationsFromEntity = async (
    ed: IChatitoEntityAST,
    entities: IEntities,
    optional: boolean,
    cache: IChatitoCache
): Promise<ISentenceTokens[]> => {
    // if this entity is a slot variation, add that as the key
    const variationKey = ed.variation ? `#${ed.variation}` : '';
    const cacheKey = `${ed.type}-${ed.key}${variationKey}`;
    let cacheStats = cache.get(cacheKey) as IStatCache;
    if (!cacheStats) {
        // if the entity is not cached, create an empty cache for it
        const counts: IChatitoCache[] = [];
        const maxCounts: number[] = ed.inner.map(s => s.cardinality!);
        let probabilityTypeDefined: 'w' | '%' | null = null;
        const definedSentenceProbabilities: Array<number | null> = []; // the probability operators defined for sentences
        let isEvenDistribution = config.defaultDistribution === 'even';
        if (ed.args && ed.args.distribution) {
            // an explicit distribution argument on the entity overrides the global default
            isEvenDistribution = ed.args.distribution === 'even';
        }
        let sumOfTotalProbabilitiesDefined = 0;
        for (const c of ed.inner) {
            // get counts for each of the sentences inside the entity
            counts.push(new Map());
            if (c.probability === null) {
                definedSentenceProbabilities.push(null);
            } else {
                const p = c.probability || '';
                const isPercent = p.slice(-1) === '%';
                const sentenceProbabilityType = isPercent ? '%' : 'w';
                if (probabilityTypeDefined === null) {
                    probabilityTypeDefined = sentenceProbabilityType;
                } else if (sentenceProbabilityType !== probabilityTypeDefined) {
                    throw new Error(`All probability definitions for "${cacheKey}" must be of the same type.`);
                }
                const prob = parseFloat(isPercent ? p.slice(0, -1) : p);
                if (isPercent) {
                    if (prob <= 0 || prob > 100) {
                        throw new Error(`Probability "${p}" must be greater than 0 up to 100. At ${cacheKey}`);
                    }
                } else {
                    if (prob <= 0) {
                        throw new Error(`Probability weight "${p}" must be greater than 0. At ${cacheKey}`);
                    }
                }
                sumOfTotalProbabilitiesDefined += prob;
                definedSentenceProbabilities.push(prob);
            }
        }
        if (probabilityTypeDefined === '%' && sumOfTotalProbabilitiesDefined && sumOfTotalProbabilitiesDefined > 100) {
            throw new Error(
                `The sum of sentence probabilities (${sumOfTotalProbabilitiesDefined}) for an entity can't be higher than 100%. At ${cacheKey}`
            );
        }
        const isPercentageProbability = probabilityTypeDefined === '%';
        const probabilities = calcSentencesProbabilities(
            isPercentageProbability,
            isEvenDistribution,
            definedSentenceProbabilities,
            sumOfTotalProbabilitiesDefined,
            maxCounts
        );
        const currentEntityCache: IStatCache = { counts, maxCounts, probabilities };
        cache.set(cacheKey, currentEntityCache);
        cacheStats = cache.get(cacheKey) as IStatCache;
    }
    // NOTE: if an entity has 5 sentences we add one (the optional empty sentence) and get that probability
    const optionalProb = 100 / (cacheStats.probabilities.length + 1);
    const sentenceIndex = chance.weighted(Array.from(cacheStats.probabilities.keys()), cacheStats.probabilities);
    if (optional && chance.bool({ likelihood: optionalProb })) {
        return [];
    }
    const sentence = ed.inner[sentenceIndex].sentence;
    let accumulator: ISentenceTokens[] = [];
    // For slots where a sentence is composed of only one alias, we add the synonym tag,
    // to denote that the generated alias is a synonym of its alias name
    const isSlotDefSentenceWithOnlyOneAlias = ed.type === 'SlotDefinition' && sentence.length === 1 && sentence[0].type === 'Alias';
    for (const t of sentence) {
        // slots and alias entities generate the sentences recursively
        // NOTE(review): this set is re-created for every token, so the `has`
        // check below is always false and a fresh Map is always used - confirm
        // whether it was meant to live outside the loop.
        const slotsInSentenceKeys: Set<string> = new Set([]);
        if (t.type === 'Slot' || t.type === 'Alias') {
            const def = entities[t.type];
            const innerEntityKey = getEntityKey(t);
            const currentCache = slotsInSentenceKeys.has(innerEntityKey) ? cacheStats.counts[sentenceIndex] : new Map();
            slotsInSentenceKeys.add(innerEntityKey);
            const sentenceVariation = await getVariationsFromEntity(def[innerEntityKey], entities, !!t.opt, currentCache);
            if (sentenceVariation.length) {
                const returnSentenceTokens = chatitoFormatPostProcess(sentenceVariation);
                for (const returnToken of returnSentenceTokens) {
                    const ettArgs = def[innerEntityKey].args;
                    if (isSlotDefSentenceWithOnlyOneAlias && ettArgs && ettArgs.synonym === 'true') {
                        returnToken.synonym = t.value;
                    }
                    if (t.type === 'Slot') {
                        // tokens produced for a slot are tagged with that slot and its arguments
                        if (def[innerEntityKey].args) {
                            returnToken.args = def[innerEntityKey].args;
                        }
                        returnToken.value = returnToken.value.trim();
                        returnToken.type = t.type;
                        returnToken.slot = t.value;
                    }
                    accumulator = accumulator.concat(returnToken);
                }
            }
        } else {
            accumulator = accumulator.concat(t);
        }
    }
    return accumulator;
};
240 |
/**
 * Returns the example at position `combinationNumber` in the enumeration of
 * all possible `entity` examples.
 *
 * The number first selects a sentence (by walking the sentences' cardinalities)
 * and is then decomposed token by token to select each nested entity's example.
 *
 * @param defs All entities definitions
 * @param entity Entity to get the example from
 * @param combinationNumber The number of the example
 * @returns The example's tokens, or an empty array when the number is out of range
 */
export const getExampleByNumber = (defs: IEntities, entity: IChatitoEntityAST, combinationNumber: number): ISentenceTokens[] => {
    let remaining = combinationNumber;
    const chosen = entity.inner.find(candidate => {
        if (remaining < candidate.cardinality!) {
            return true;
        }
        remaining -= candidate.cardinality!;
        return false;
    });
    if (!chosen) {
        return [];
    }
    const isSlotDefSentenceWithOnlyOneAlias =
        entity.type === 'SlotDefinition' && chosen.sentence.length === 1 && chosen.sentence[0].type === 'Alias';
    let previousCardinality = 1;
    let previousRemainder = 0;
    let example: ISentenceTokens[] = [];
    for (const token of chosen.sentence) {
        if (token.type === 'Text') {
            example = example.concat([token]);
            continue;
        }
        if (token.type === 'Slot' || token.type === 'Alias') {
            const definitions = token.type === 'Alias' ? defs.Alias : defs.Slot;
            const referenced = definitions[getEntityKey(token)];
            // an optional token has one extra combination: being left out
            const cardinality = (token.opt ? 1 : 0) + referenced.cardinality!;
            remaining = (remaining - previousRemainder) / previousCardinality;
            previousRemainder = remaining % cardinality;
            previousCardinality = cardinality;
            if (previousRemainder === 0 && token.opt) {
                // combination 0 of an optional token is the omitted one
                continue;
            }
            const innerNumber = token.opt ? previousRemainder - 1 : previousRemainder;
            const innerTokens = chatitoFormatPostProcess(getExampleByNumber(defs, referenced, innerNumber)).map(t => {
                const ettArgs = referenced.args;
                if (isSlotDefSentenceWithOnlyOneAlias && ettArgs && ettArgs.synonym === 'true') {
                    t.synonym = token.value;
                }
                if (token.type === 'Slot') {
                    if (referenced.args) {
                        t.args = referenced.args;
                    }
                    t.value = t.value.trim();
                    t.type = token.type;
                    t.slot = token.value;
                }
                return t;
            });
            example = example.concat(innerTokens);
            continue;
        }
        throw Error(`Unknown token type: ${token.type}`);
    }
    return chatitoFormatPostProcess(example);
};
305 |
/**
 * Generator yielding the example for every combination number from 0 up to
 * (not including) the entity's cardinality, duplicates included.
 *
 * @param defs All entities definitions
 * @param entity Entity to get all examples for
 */
export function* allExamplesGenerator(defs: IEntities, entity: IChatitoEntityAST) {
    let combinationNumber = 0;
    while (combinationNumber < entity.cardinality!) {
        yield getExampleByNumber(defs, entity, combinationNumber);
        combinationNumber += 1;
    }
}
318 |
/**
 * Computes how many combinations `sentence` can produce: the product of the
 * cardinalities of its Slot/Alias tokens, where an optional token counts one
 * extra combination. Text tokens do not affect the result.
 * The referenced entities must already have their cardinalities calculated.
 *
 * @param defs All entities definitions
 * @param sentence Sentence tokens
 */
const getCardinality = (defs: IEntities, sentence: ISentenceTokens[]) => {
    let total = 1;
    for (const token of sentence) {
        if (token.type === 'Text') {
            continue;
        }
        const definitions = token.type === 'Alias' ? defs.Alias : defs.Slot;
        const ownCardinality = definitions[getEntityKey(token)].cardinality!;
        total *= token.opt ? ownCardinality + 1 : ownCardinality;
    }
    return total;
};
342 |
/**
 * Stores on each sentence of `entity` its cardinality, and on the entity the
 * sum of those values.
 * The entities used inside `entity` must already have their cardinalities
 * calculated.
 *
 * @param defs All entities definitions
 * @param entity Entity to calc cardinality for
 */
const calcCardinality = (defs: IEntities, entity: IChatitoEntityAST) => {
    let total = 0;
    for (const sentence of entity.inner) {
        sentence.cardinality = getCardinality(defs, sentence.sentence);
        total += sentence.cardinality!;
    }
    entity.cardinality = total;
};
358 |
/**
 * Builds a human readable reference for an entity: `%[key]` for intents,
 * `~[key]` for aliases, `@[key]` for slots and `(key)` for anything else.
 * An entity definition and a token that uses it produce the same string.
 *
 * @param item Token or Entity definition
 */
const getRefKey = (item: IChatitoEntityAST | ISentenceTokens) => {
    const kind = item.type.replace('Definition', '');
    const key = 'key' in item ? item.key : getEntityKey(item);
    if (kind === 'Intent') {
        return `%[${key}]`;
    }
    if (kind === 'Alias') {
        return `~[${key}]`;
    }
    if (kind === 'Slot') {
        return `@[${key}]`;
    }
    return `(${key})`;
};
380 |
/**
 * Returns true if the `entity` uses any entity whose cardinality has not been
 * calculated yet.
 * Also populates the `refs` map: for every alias/slot token visited, the
 * entity using it is added to the set stored under the token's reference key.
 * Scanning stops at the first token without cardinality, so `refs` only
 * receives the tokens visited up to (and including) that one.
 *
 * @param defs   All entities definitions
 * @param entity An Entity
 * @param refs   A map of entities references (used entity -> entities using it)
 * @throws Error when a token references an alias/slot that is not defined
 */
const hasTokenWithoutCardinality = (defs: IEntities, entity: IChatitoEntityAST, refs: { [key: string]: Set<string> }) => {
    const parentKey = getRefKey(entity);
    for (const sentence of entity.inner) {
        for (const token of sentence.sentence) {
            if (token.type === 'Text') {
                continue;
            }
            const entityKey = getEntityKey(token);
            const refKey = getRefKey(token);
            if (refKey in refs) {
                refs[refKey].add(parentKey);
            } else {
                refs[refKey] = new Set([parentKey]);
            }
            const definitions = defs[token.type];
            // Own-property check: names such as `constructor` or `toString` must not
            // resolve to members inherited from Object.prototype.
            const definition = Object.prototype.hasOwnProperty.call(definitions, entityKey) ? definitions[entityKey] : undefined;
            if (!definition) {
                throw new Error(`${token.type} not defined: ${entityKey}`);
            }
            if (definition.cardinality === undefined) {
                return true;
            }
        }
    }
    return false;
};
411 |
/**
 * Walks the entities references (`refs`) starting from the end of `path` and
 * throws an error showing the loop path as soon as a reference leads back to
 * an entity that is already part of the current path.
 *
 * The whole path is checked, not only its first element: a cycle that is
 * reachable from `path[0]` but does not contain it would otherwise be followed
 * forever and end in a stack overflow instead of a readable error.
 *
 * @param path Current path (reference keys, starting entity first)
 * @param refs Entities references map (used entity -> entities using it)
 * @throws Error describing the circular nesting
 */
const checkLoopIn = (path: string[], refs: { [key: string]: Set<string> }) => {
    const parents = refs[path[path.length - 1]];
    if (!parents) {
        return;
    }
    for (const parent of parents) {
        const loopStart = path.indexOf(parent);
        if (loopStart !== -1) {
            // Report only the cyclic part, reversed so it reads in "uses" direction.
            const loop = path
                .slice(loopStart)
                .concat([parent])
                .reverse();
            throw new Error(`You have a circular nesting: ${loop.join(' -> ')}. Infinite loop prevented.`);
        }
        checkLoopIn(path.concat([parent]), refs);
    }
};
432 |
/**
 * Throws an error showing the loop path if there is any loop in the entities
 * references (`refs`). Every key of `refs` is used as a starting point.
 *
 * @param refs Entities references map
 */
const checkLoop = (refs: { [key: string]: Set<string> }) => {
    Object.keys(refs).forEach(key => checkLoopIn([key], refs));
};
444 |
/**
 * Throws an error showing the slots nesting path if there is any in the
 * entities references (`refs`), starting with the `path` path.
 * Nesting is reported when the current path already contains a slot (a key
 * starting with `@`) and an entity using the last path element is a slot too.
 *
 * @param path Current path
 * @param refs Entities references map (used entity -> entities using it)
 * @throws Error describing the nested slots
 */
const findNestedSlots = (path: string[], refs: { [key: string]: Set<string> }) => {
    const parents = refs[path[path.length - 1]];
    if (!parents) {
        return;
    }
    // The path does not change while iterating the parents, so look this up once.
    const firstSlotIndex = path.findIndex(item => item.startsWith('@'));
    for (const parent of parents) {
        if (firstSlotIndex !== -1 && parent.startsWith('@')) {
            const slotsPath = path
                .slice(firstSlotIndex)
                .reverse()
                .join(' -> ');
            throw new Error(`You have nested slots: ${parent} -> ${slotsPath}. A slot can't reference other slot.`);
        }
        findNestedSlots(path.concat([parent]), refs);
    }
};
469 |
/**
 * Throws an error showing the slots nesting path if there is any in the
 * entities references (`refs`). Every key of `refs` is used as a starting point.
 *
 * @param refs Entities references map
 */
const checkNestedSlots = (refs: { [key: string]: Set<string> }) => {
    Object.keys(refs).forEach(key => findNestedSlots([key], refs));
};
482 |
/**
 * Calculates cardinalities for all entities.
 * Entities are resolved in passes: an entity is calculated as soon as every
 * entity it uses has a cardinality. Also checks for nested slots once
 * everything is resolved.
 *
 * @param defs All entities definitions
 * @throws Error when entities reference each other in a loop, when the
 *               remaining entities can never be resolved, or when slots are nested
 */
const preCalcCardinality = (defs: IEntities) => {
    // keys still waiting for their cardinality, refreshed on every pass
    const uncalced = {
        Intent: [] as string[],
        Alias: [] as string[],
        Slot: [] as string[]
    };
    const types = Object.keys(uncalced) as Array<keyof typeof uncalced>;
    const refs: { [key: string]: Set<string> } = {};
    let totalUncalced = 0;
    let lastUncalced = -1;
    do {
        totalUncalced = 0;
        for (const type of types) {
            uncalced[type] = Object.keys(defs[type]).filter(key => defs[type][key].cardinality === undefined);
            uncalced[type].forEach(key => {
                if (!hasTokenWithoutCardinality(defs, defs[type][key], refs)) {
                    calcCardinality(defs, defs[type][key]);
                } else {
                    totalUncalced += 1;
                }
            });
        }
        if (lastUncalced === totalUncalced) {
            // No progress since the previous pass: report the loop if there is one.
            checkLoop(refs);
            // Another pass would repeat this one exactly, so stop instead of spinning forever.
            const pending: string[] = [];
            for (const type of types) {
                for (const key of uncalced[type]) {
                    pending.push(`${type} '${key}'`);
                }
            }
            throw new Error(`Can't calculate the cardinality of: ${pending.join(', ')}. Infinite loop prevented.`);
        }
        lastUncalced = totalUncalced;
    } while (totalUncalced > 0);
    checkNestedSlots(refs);
};
518 |
/**
 * Adds missing alias definitions.
 * Every alias that is used in sentence tokens but has no definition gets a
 * definition with a single sentence containing the alias name as text.
 * Prints a warning for each created alias when `config.autoAliases` is 'warn'.
 *
 * @param defs All entities definitions
 */
const addMissingAliases = (defs: IEntities) => {
    const aliases = new Set<string>();
    for (const entities of [defs.Alias, defs.Slot, defs.Intent]) {
        for (const key of Object.keys(entities)) {
            for (const sentence of entities[key].inner) {
                for (const token of sentence.sentence) {
                    if (token.type === 'Alias') {
                        aliases.add(token.value);
                    }
                }
            }
        }
    }
    for (const alias of aliases) {
        // Own-property check: an alias named like an Object.prototype member
        // (`constructor`, `toString`, ...) must not be mistaken for a defined one.
        if (Object.prototype.hasOwnProperty.call(defs.Alias, alias) && defs.Alias[alias]) {
            continue;
        }
        if (config.autoAliases === 'warn') {
            // tslint:disable-next-line: no-console
            console.warn(`WARNING! Auto alias creation: '${alias}'`);
        }
        defs.Alias[alias] = {
            inner: [{ sentence: [{ value: alias, type: 'Text' }], probability: null }],
            key: alias,
            type: 'AliasDefinition'
        };
    }
};
552 |
/**
 * Callback that resolves an imported file.
 * Receives the path the import is made from and the imported file reference,
 * and returns the resolved file path together with its DSL source text.
 */
export type IFileImporter = (fromPath: string, importFile: string) => { filePath: string; dsl: string };
560 |
/** Parses DSL source text into its AST by delegating to `chatito.parse`. */
export const astFromString = (str: string) => {
    return chatito.parse(str);
};
/**
 * Parses `str` and generates the dataset from the resulting AST.
 * All remaining arguments are forwarded unchanged to `datasetFromAST`, whose
 * result is returned.
 */
export const datasetFromString = (str: string, writterFn: IUtteranceWriter, importer?: IFileImporter, currentPath?: string) =>
    datasetFromAST(astFromString(str), writterFn, importer, currentPath);
566 |
/**
 * Loads the file `to` (imported from `from`) through `importer` and returns
 * the alias and slot definitions it contains, in file order. Nested imports
 * are resolved recursively and their definitions are placed where the import
 * statement appears. Every other entity of the imported file is ignored.
 *
 * @param from     Path the import is made from
 * @param to       Imported file reference
 * @param importer Callback resolving the file path and its DSL source
 * @throws Error when the importer returns nothing (or an empty DSL), or when
 *               parsing or a nested import fails
 */
export const getImports = (from: string, to: string, importer: IFileImporter) => {
    const fileContent = importer(from, to);
    if (!fileContent || !fileContent.dsl) {
        throw new Error(`Failed importing ${to}`);
    }
    try {
        const collected: IChatitoEntityAST[] = [];
        for (const ett of astFromString(fileContent.dsl)) {
            if (ett.type === 'ImportFile' && ett.value) {
                for (const imported of getImports(fileContent.filePath, ett.value, importer)) {
                    collected.push(imported);
                }
            } else if (ett.type === 'AliasDefinition' || ett.type === 'SlotDefinition') {
                collected.push(ett);
            }
        }
        return collected;
    } catch (e) {
        throw new Error(`Failed importing ${to}. ${e.message} - ${JSON.stringify(e.location)}`);
    }
};
587 |
/**
 * Builds the intent, slot and alias definitions from an AST.
 * Entities returned by `getImports` for every `ImportFile` entry are appended
 * to the AST first. Definitions are keyed by `key`, or by `key#variation`
 * when a variation is present. Unless `config.autoAliases` is 'restrict',
 * missing aliases are added; finally all cardinalities are calculated.
 *
 * @param initialAst    Parsed entities
 * @param importHandler Optional callback resolving imported files
 * @param currPath      Optional path imports are resolved from
 * @returns The definitions, or `undefined` when `initialAst` is empty
 * @throws Error when the same definition appears twice
 */
export const definitionsFromAST = (initialAst: IChatitoEntityAST[], importHandler?: IFileImporter, currPath?: string) => {
    if (!initialAst || !initialAst.length) {
        return;
    }
    const operatorDefinitions: IEntities = { Intent: {}, Slot: {}, Alias: {} };
    const importer = importHandler ? importHandler : () => ({ filePath: '', dsl: '' });
    const currentPath = currPath ? currPath : '';
    // get imports first
    let ast: IChatitoEntityAST[] = [...initialAst];
    initialAst.forEach(od => {
        if (od.type === 'ImportFile' && od.value) {
            ast = [...ast, ...getImports(currentPath, od.value, importer)];
        }
    });
    ast.forEach(od => {
        let entity: IEntityDef;
        if (od.type === 'IntentDefinition') {
            entity = operatorDefinitions.Intent;
        } else if (od.type === 'SlotDefinition') {
            entity = operatorDefinitions.Slot;
        } else if (od.type === 'AliasDefinition') {
            entity = operatorDefinitions.Alias;
        } else {
            // type is 'Comment' or 'ImportFile'
            return; // skip comments
        }
        const odKey = od.variation ? `${od.key}#${od.variation}` : od.key;
        // Own-property check: a first definition named like an Object.prototype
        // member (`constructor`, `toString`, ...) is not a duplicate.
        if (Object.prototype.hasOwnProperty.call(entity, odKey)) {
            throw new Error(`Duplicate definition for ${od.type} '${odKey}'`);
        }
        entity[odKey] = od;
    });
    if (config.autoAliases !== 'restrict') {
        addMissingAliases(operatorDefinitions);
    }
    preCalcCardinality(operatorDefinitions);
    return operatorDefinitions;
};
626 |
/**
 * Generates the examples of every intent found in `initialAst` and hands each
 * unique utterance to `writterFn(utterance, intentKey, isTrainingExample)`.
 *
 * Without arguments an intent produces all its possible combinations as
 * training examples. The `training` / `testing` intent arguments limit the
 * number of examples; `testing` is only read when `training` is present. When
 * more examples are requested than combinations exist, a warning is logged
 * and all combinations are generated.
 *
 * Nothing is generated when the AST is empty or declares no intents.
 *
 * @param initialAst    Parsed entities
 * @param writterFn     Receives every generated utterance
 * @param importHandler Optional callback resolving imported files
 * @param currPath      Optional path imports are resolved from
 * @throws Error when the `training` or `testing` argument is not a number higher than 0
 */
export const datasetFromAST = async (
    initialAst: IChatitoEntityAST[],
    writterFn: IUtteranceWriter,
    importHandler?: IFileImporter,
    currPath?: string
) => {
    const operatorDefinitions = definitionsFromAST(initialAst, importHandler, currPath);
    if (!operatorDefinitions) {
        return;
    }
    const intentKeys = Object.keys(operatorDefinitions.Intent);
    if (!intentKeys.length) {
        return;
    }
    for (const intentKey of intentKeys) {
        const intent = operatorDefinitions.Intent[intentKey];
        const maxPossibleCombinations = intent.cardinality!;
        let maxIntentExamples = maxPossibleCombinations; // counter that will change
        const entityArgs = intent.args;
        // by default if no training or testing arguments are declared, all go to training
        let trainingN = maxIntentExamples;
        let testingN = 0;
        let generatedTrainingExamplesCount = 0;
        let generatedTestingExamplesCount = 0;
        if (entityArgs) {
            if (entityArgs.training) {
                trainingN = parseInt(entityArgs.training, 10);
                // NaN (non numeric argument) is rejected too, it would break every count comparison below
                if (Number.isNaN(trainingN) || trainingN < 1) {
                    throw new Error(`The 'training' argument for ${intentKey} must be higher than 0.`);
                }
                if (entityArgs.testing) {
                    testingN = parseInt(entityArgs.testing, 10);
                    if (Number.isNaN(testingN) || testingN < 1) {
                        throw new Error(`The 'testing' argument for ${intentKey} must be higher than 0.`);
                    }
                }
            }
            const intentMax = trainingN + testingN;
            if (intentMax > maxIntentExamples) {
                logger.warn(
                    `Can't generate ${intentMax} examples. ` +
                        `Using the maximum possible combinations: ${maxIntentExamples}. ` +
                        'NOTE: Using the maximum leads to overfitting.'
                );
            } else if (intentMax < maxIntentExamples) {
                maxIntentExamples = intentMax;
            }
        }
        const maxEx = maxIntentExamples;
        // A Set (not a plain object) so that utterances such as "constructor" or
        // "toString" are not mistaken for already generated ones.
        const seenUtterances = new Set<string>();
        if (maxIntentExamples >= maxPossibleCombinations) {
            // Everything is needed: enumerate all combinations instead of sampling.
            for (const utterance of allExamplesGenerator(operatorDefinitions, intent)) {
                const utteranceString = utterance.reduce((p, n) => p + n.value, '');
                if (seenUtterances.has(utteranceString)) {
                    continue;
                }
                seenUtterances.add(utteranceString);
                const completedTraining = generatedTrainingExamplesCount >= trainingN;
                const completedTesting = generatedTestingExamplesCount >= testingN;
                let isTrainingExample = !completedTraining;
                if (!completedTraining && !completedTesting) {
                    // pick proportionally to what is still missing on each side
                    const trainingLeft = trainingN - generatedTrainingExamplesCount;
                    const testingLeft = testingN - generatedTestingExamplesCount;
                    isTrainingExample = Math.random() < trainingLeft / (trainingLeft + testingLeft);
                }
                writterFn(utterance, intentKey, isTrainingExample);
                if (isTrainingExample) {
                    generatedTrainingExamplesCount++;
                } else {
                    generatedTestingExamplesCount++;
                }
            }
            continue;
        }
        const globalCache: IChatitoCache = new Map();
        // note: trick to make all combinations for small datasets, but avoid them for large ones
        const smallDupesLimit = 10000;
        const maxDupes = maxPossibleCombinations * maxPossibleCombinations;
        const isBigDataset = maxPossibleCombinations > smallDupesLimit;
        const duplicatesLimit = isBigDataset ? Math.floor(maxDupes / 2) : maxDupes * maxPossibleCombinations;
        let duplicatesCounter = 0;
        while (maxIntentExamples) {
            const intentSentence = await getVariationsFromEntity(intent, operatorDefinitions, false, globalCache);
            const utterance = chatitoFormatPostProcess(intentSentence);
            const utteranceString = utterance.reduce((p, n) => p + n.value, '');
            if (!seenUtterances.has(utteranceString)) {
                seenUtterances.add(utteranceString);
                const completedTraining = generatedTrainingExamplesCount >= trainingN;
                const completedTesting = generatedTestingExamplesCount >= testingN;
                let isTrainingExample = !completedTraining;
                if (!completedTraining && !completedTesting) {
                    // reference: https://stackoverflow.com/questions/44263229/generate-a-random-boolean-70-true-30-false
                    isTrainingExample = Math.random() < 0.7;
                }
                writterFn(utterance, intentKey, isTrainingExample);
                maxIntentExamples--;
                if (isTrainingExample) {
                    generatedTrainingExamplesCount++;
                } else {
                    generatedTestingExamplesCount++;
                }
            } else {
                duplicatesCounter++;
                if (duplicatesCounter > duplicatesLimit) {
                    // prevent cases where duplicates are part of the entity definitions
                    let m = `Too many duplicates while generating dataset! Looks like we have probably reached `;
                    m += `the maximum ammount of possible unique generated examples. `;
                    m += `The generator has stopped at ${maxEx - maxIntentExamples} examples for intent ${intentKey}.`;
                    logger.warn(m);
                    maxIntentExamples = 0;
                }
            }
        }
    }
};
750 |
--------------------------------------------------------------------------------