├── .gitignore
├── 10kGNAD-master
    ├── LICENSE
    ├── LICENSE.txt
    ├── README.md
    ├── README.txt
    ├── articles.csv
    ├── code
    │   ├── extract_dataset_from_sqlite.py
    │   ├── generate_lowshot_sets.py
    │   └── split_articles_into_train_test.py
    ├── database_schema.md
    ├── requirements.txt
    ├── test.csv
    └── train.csv
├── README.md
├── bert-base-german-cased
    └── bert_config.json
├── bert_classify.py
├── bert_finetuner_ner.py
├── bert_finetuner_splitset.py
├── bert_rasa_classify.py
├── create_train_test_split.py
├── create_xlsx_dataset_from_rasa_nlu.py
├── data
    └── articles.csv
├── huggingface_finetune.py
├── logging.conf
├── rasa_bot_generator.py
├── requirements.txt
└── test_bot
    ├── __init__.py
    ├── actions.py
    ├── config.yml
    ├── credentials.yml
    ├── domain.yml
    └── endpoints.yml


/.gitignore:
--------------------------------------------------------------------------------
 1 | finetuning/*
 2 | data/*
 3 | !data/articles.csv
 4 | .idea
 5 | test_bot/data
 6 | test_bot/test_data
 7 | test_bot/evaluation
 8 | test_bot/models
 9 | 10kGNAD-master/corpus.sqlite3
10 | 10kGNAD-master/million_post_corpus.tar.bz2
11 | __pycache__/


--------------------------------------------------------------------------------
/10kGNAD-master/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Timo Block
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/10kGNAD-master/LICENSE.txt:
--------------------------------------------------------------------------------
1 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
2 | 


--------------------------------------------------------------------------------
/10kGNAD-master/README.md:
--------------------------------------------------------------------------------
 1 | # Ten Thousand German News Articles Dataset
 2 | 
 3 | For more information visit the detailed [project page](https://tblock.github.io/10kGNAD/).
 4 | 
 5 | 1. Install the required python packages `pip install -r requirements.txt`.
 6 | 2. Download the `corpus.sqlite3` file into the project root from [here (compressed)](https://github.com/OFAI/million-post-corpus/releases/download/v1.0.0/million_post_corpus.tar.bz2) or directly from [here](https://github.com/tblock/10kGNAD/releases/download/v1.0/corpus.sqlite3).
 7 | 3. Run `python code/extract_dataset_from_sqlite.py corpus.sqlite3 articles.csv` to extract the articles.
 8 | 4. Run `python code/split_articles_into_train_test.py` to split the dataset.
 9 | 
10 | ## License
11 | 
12 | All code in this repository is licensed under a MIT License.
13 | 
14 | The dataset is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/).
15 | 


--------------------------------------------------------------------------------
/10kGNAD-master/README.txt:
--------------------------------------------------------------------------------
 1 | Million Post Corpus
 2 | ===================
 3 | 
 4 | Please see the corpus website at
 5 | 
 6 | https://ofai.github.io/million-post-corpus/
 7 | 
 8 | If you use this data set for your research, please cite our paper (more details
 9 | on the above website):
10 | 
11 | Dietmar Schabus, Marcin Skowron, Martin Trapp
12 | One Million Posts: A Data Set of German Online Discussions
13 | Proceedings of the 40th International ACM SIGIR Conference on Research and
14 | Development in Information Retrieval (SIGIR)
15 | Tokyo, Japan, August 2017
16 | 


--------------------------------------------------------------------------------
/10kGNAD-master/code/extract_dataset_from_sqlite.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | """
 5 | extract_dataset_from_sqlite.py.py
 6 | 
 7 | This exports the articles from the _One Million Posts Corpus_ and generates a 
 8 | CSV file containing a label and the text for each article.
 9 | """
10 | 
11 | import re
12 | import sys
13 | import csv
14 | import sqlite3
15 | 
16 | from tqdm import tqdm
17 | from bs4 import BeautifulSoup
18 | from argparse import ArgumentParser
19 | 
20 | 
21 | ARTICLE_QUERY = "SELECT Path, Body FROM Articles WHERE PATH LIKE 'Newsroom/%' AND PATH NOT LIKE 'Newsroom/User%' ORDER BY Path"
22 | 
23 | 
24 | if __name__ == '__main__':
25 |     parser = ArgumentParser()
26 |     parser.add_argument(dest="sqlite_file", action="store", help="sqlite input filename", metavar="<SQLite Database>")
27 |     parser.add_argument(dest="csv_file", action="store", help="csv output filebame", metavar="<CSV Filename>")
28 |     args = parser.parse_args()
29 | 
30 |     
31 |     conn = sqlite3.connect(args.sqlite_file)
32 |     cursor = conn.cursor()
33 | 
34 |     with open(args.csv_file, "w", encoding='utf-8') as csvfile:
35 |         writer = csv.writer(csvfile, delimiter=';',quotechar='\'', quoting=csv.QUOTE_MINIMAL)
36 | 
37 |         for row in tqdm(cursor.execute(ARTICLE_QUERY).fetchall(), unit_scale=True):
38 |             path = row[0]
39 |             body = row[1]
40 |             text = ""
41 |             description = ""
42 | 
43 |             soup = BeautifulSoup(body, 'html.parser')
44 | 
45 |             # get description from subheadline 
46 |             description_obj = soup.find('h2',{'itemprop':'description'})
47 |             if description_obj is not None:
48 |                 description = description_obj.text
49 |                 description = description.replace("\n"," ").replace("\t"," ").strip() + ". "
50 | 
51 |             # get text from paragraphs
52 |             text_container = soup.find('div',{'class':'copytext'})
53 |             if text_container is not None:
54 |                 for p in text_container.findAll('p'):
55 |                     text += p.text.replace("\n"," ").replace("\t"," ").replace("\"","").replace("'","") + " "
56 |             text = text.strip()
57 |             
58 |             # remove article autors
59 |             for author in re.findall(r"\.\ \(.+,.+2[0-9]+\)", text[-50:]): # some articles have a year of 21015..
60 |                 text = text.replace(author, ".")
61 | 
62 |             # get category from path
63 |             category = path.split("/")[1]
64 |             sample = [category, description + text]
65 | 
66 |             # filter empty samples, then write to csv
67 |             if sample[1] != "":
68 |                 writer.writerow(sample)
69 | 
70 |     conn.close()
71 |    
72 | 


--------------------------------------------------------------------------------
/10kGNAD-master/code/generate_lowshot_sets.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os  
 5 | import csv
 6 | import random as rnd
 7 | 
 8 | from argparse import ArgumentParser
 9 | 
10 | """ ompc_generate_lowshot_sets.py
11 | Generates low shot sets from the full train set
12 | """
13 | 
14 | SIZES = [.01,.02,.05,.075,.1,.2,.5,.75,1.]
15 | ITERATIONS = 10
16 | 
17 | full_dataset = []
18 | 
19 | def stratifyed_shuffled_subset(data, size):
20 | 	""" returns a stratifyed, but shuffled subset of lenght *size* """
21 | 	dataset_dict = {}
22 | 	subset = []
23 | 
24 | 	rnd.shuffle(data)
25 | 
26 | 	for entry in data:
27 | 		label = entry[0]
28 | 		text = entry[1]
29 | 		if label not in dataset_dict:  # if the label is not yet in the dict
30 | 		   dataset_dict[label] = []
31 | 		f = dataset_dict[label]
32 | 		f.append(text)
33 | 
34 | 	for key in dataset_dict:
35 | 		key_set = dataset_dict[key]
36 | 		for entry in key_set[: int(len(key_set) * size)]:
37 | 			subset.append([key, entry])
38 | 
39 | 	rnd.shuffle(subset)
40 | 	return subset
41 | 
42 | def write_dataset(data, size, iteration):
43 | 	with open("lowshot/lowshot_" + str(size) +"_" + str(iteration) + ".csv", "w") as file_write:
44 | 		writer = csv.writer(file_write, delimiter=';', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
45 | 		for entry in data:
46 | 			writer.writerow(entry)
47 | 
48 | def write_dataset_fasttext(data, size, iteration):
49 | 	with open("lowshot/lowshot_" + str(size) +"_" + str(iteration) + ".csv", "w") as file_write:
50 | 		writer = csv.writer(file_write, delimiter='\t', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
51 | 		for entry in data:
52 | 			label = "__label__" + entry[0]
53 | 			writer.writerow([label, entry[1]])
54 | 
55 | 
56 | def createFolder(directory):
57 |     try:
58 |         if not os.path.exists(directory):
59 |             os.makedirs(directory)
60 |     except OSError:
61 |         print ('Error: Creating directory. ' +  directory)
62 | 
63 | 
64 | if __name__ == "__main__":
65 | 	parser = ArgumentParser()
66 | 	parser.add_argument('-fastText', action='store_true')
67 | 	args = parser.parse_args()
68 | 
69 | 	print("writing files in fastText format:", args.fastText)
70 | 
71 | 	createFolder('./lowshot/')	
72 | 
73 | 	with open("train.csv") as full_file:
74 | 		reader = csv.reader(full_file, delimiter=';', quotechar='\'')
75 | 		for row in reader:
76 | 			full_dataset.append(row)
77 | 
78 | 	# sets a seed for reproducability
79 | 	rnd.seed(42)
80 | 
81 | 	size_counter = 0
82 | 	for size in SIZES:
83 | 		for iteration in range(ITERATIONS):
84 | 			print("Generating iteration %.0f of size %.3f" % (iteration, size))
85 | 			subset = stratifyed_shuffled_subset(full_dataset, size)
86 | 
87 | 			if not args.fastText:
88 | 				write_dataset(subset, str(size_counter) + "_" + str(size), iteration)
89 | 			if args.fastText:
90 | 				write_dataset_fasttext(subset, str(size_counter) + "_" + str(size), iteration)
91 | 			
92 | 		size_counter+=1
93 | 


--------------------------------------------------------------------------------
/10kGNAD-master/code/split_articles_into_train_test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import csv
 5 | import collections
 6 | 
 7 | from argparse import ArgumentParser
 8 | from sklearn.model_selection import train_test_split
 9 | 
10 | """ split_articles_into_train_test.py:
11 | 
12 | processes the dataset and splits the dataset into a training- and testset
13 | """
14 | 
15 | SPLIT = .1
16 | 
17 | 
18 | def write_datasets(data, name, args):
19 |     """ write a csv file in a normal and optinally in the fastText format """
20 | 
21 |     with open(name + ".csv", "w", encoding='utf-8') as file_write:
22 |         writer = csv.writer(file_write, delimiter=';', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
23 |         for row in data:
24 |             writer.writerow(row)
25 | 
26 |     # optionally output files in the fastText format
27 |     if args.fastText:
28 |         with open("fastText_" + name + ".csv", "w", encoding='utf-8') as file_write:
29 |             writer = csv.writer(file_write, delimiter='\t', quotechar='\'', quoting=csv.QUOTE_MINIMAL)
30 |             for row in data:
31 |                 label = row[0]
32 |                 label = "__label__" +  label
33 |                 writer.writerow([label, row[1]])
34 | 
35 | 
36 | if __name__ == "__main__":
37 | 
38 |     parser = ArgumentParser()
39 |     parser.add_argument('-fastText', action='store_true')
40 |     args = parser.parse_args()
41 | 
42 |     labels = []
43 |     texts = []
44 | 
45 |     # read full dataset file
46 |     with open("articles.csv", "r", encoding='utf-8') as csvfile:
47 |         reader = csv.reader(csvfile, delimiter=';', quotechar='\'')
48 |         for row in reader:
49 |             if len(row) == 2:
50 |                 labels.append(row[0])
51 |                 texts.append(row[1])
52 |     print(len(texts))
53 | 
54 |     # split dataset
55 |     trn_texts, tst_texts, trn_labels, tst_labels = train_test_split(texts, labels, test_size=SPLIT, random_state=42, stratify=labels)
56 | 
57 |     # write train and test datasets
58 |     train = []
59 |     test = []
60 |     
61 |     for i in range(len(trn_labels)):
62 |         train.append([trn_labels[i], trn_texts[i]])
63 |     
64 |     for i in range(len(tst_labels)):
65 |         test.append([tst_labels[i], tst_texts[i]])
66 | 
67 |     write_datasets(train, "train", args)
68 |     write_datasets(test, "test", args)
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/10kGNAD-master/database_schema.md:
--------------------------------------------------------------------------------
 1 | ```sql
 2 | CREATE TABLE Articles (
 3 |     ID_Article INTEGER PRIMARY KEY,     --
 4 |     Path TEXT NOT NULL,                 -- Topic path, e.g.: 'Newsroom/Sports/Motorsports/Formula 1'
 5 |     publishingDate TIMESTAMP NOT NULL,  --
 6 |     Title TEXT NOT NULL,                --
 7 |     Body TEXT                           -- Main article body, contains HTML markup
 8 | );
 9 | 
10 | CREATE TABLE Posts (
11 |     ID_Post INTEGER PRIMARY KEY,        --
12 |     ID_Parent_Post INTEGER,             -- if this post is a reply: parent post's ID, otherwise NULL
13 |     ID_Article INTEGER NOT NULL,        -- foreign key to 'Articles' table
14 |     ID_User INTEGER NOT NULL,           --
15 |     CreatedAt TIMESTAMP NOT NULL,       --
16 |     Status TEXT,                        -- 'online' or 'deleted' (if deleted by moderator)
17 |     Headline TEXT,                      -- Post headline (may be NULL if Body isn't)
18 |     Body TEXT,                          -- Post main body (may be NULL if Headline isn't)
19 |     PositiveVotes INTEGER NOT NULL,     -- Number of positive votes by other users
20 |     NegativeVotes INTEGER NOT NULL      -- Number of negative votes by other users
21 | );
22 | 
23 | -- This table lists all users who work for the newspaper (e.g. moderators, editorial journalists)
24 | CREATE TABLE Newspaper_Staff (
25 |     ID_User INTEGER PRIMARY KEY         -- matches with Posts.ID_User
26 | );
27 | 
28 | -- This table may contain multiple annotator opinions for a given (ID_Post, Category) pair
29 | CREATE TABLE Annotations (
30 |     ID_Post INTEGER NOT NULL,           -- foreign key to 'Posts' table
31 |     ID_Annotator INTEGER NOT NULL,      --
32 |     Category TEXT NOT NULL,             -- name of the category, e.g. 'SentimentNegative'
33 |     Value INTEGER NOT NULL,             -- 0 or 1, where 1 means the category does apply to the post
34 |     PRIMARY KEY(ID_Post, ID_Annotator, Category)
35 | );
36 | 
37 | -- This table will contain only one consolidated judgment for a given (ID_Post, Category) pair,
38 | -- determined by a majority vote across all opinions in the 'Annotations' table
39 | CREATE TABLE Annotations_consolidated (
40 |     ID_Post INTEGER NOT NULL,           -- 
41 |     Category TEXT NOT NULL,             -- name of the category, e.g. 'SentimentNegative'
42 |     Value INTEGER NOT NULL,             -- 0 or 1, where 1 means the category does apply to the post
43 |     PRIMARY KEY(ID_Post, Category)
44 | );
45 | 
46 | -- This table is meant for reproducible cross validation. For each category, all posts are split
47 | -- into ten folds in a stratified manner
48 | -- https://en.wikipedia.org/wiki/Cross-validation_%28statistics%29#k-fold_cross-validation
49 | CREATE TABLE CrossValSplit(
50 |     ID_Post INTEGER NOT NULL,           -- foreign key to 'Posts' table
51 |     Category TEXT NOT NULL,             -- name of the category, e.g. 'SentimentNegative'
52 |     Fold INTEGER NOT NULL,              -- from [1,10]
53 |     PRIMARY KEY(ID_Post, Category, Fold)
54 | );
55 | 
56 | -- This table defines the default ordering of the categories
57 | CREATE TABLE Categories (
58 |     Name TEXT PRIMARY KEY,              -- name of the category, e.g. 'SentimentNegative'
59 |     Ord INTEGER                         -- ordering index
60 | );
61 | ```
62 | 


--------------------------------------------------------------------------------
/10kGNAD-master/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn>=0.20.2
2 | beautifulsoup4>=4.6.3
3 | tqdm>=4.26.0
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Finetune BERT Embeddings with spaCy and Rasa
  2 | 
  3 | **For whom this repository might be of interest:**
  4 | 
  5 | This repository describes the process of finetuning the *german pretrained BERT* model of [deepset.ai](https://deepset.ai/german-bert)
  6 | on a domain-specific dataset, converting it into a [spaCy](https://spacy.io/) packaged model and loading it in [Rasa](https://rasa.com/) to evaluate its
  7 | performance on domain-specific **Conversational AI** tasks like *intent detection* and *NER*.
  8 | If there are questions though, feel free to ask.
  9 | 
 10 | This repository is meant for those who want to have a quick dive into the matter. 
 11 | 
 12 | I am going to use the [10kGNAD](https://tblock.github.io/10kGNAD/) dataset for this task but it should be easy to
 13 | modify the files for your specific use case.
 14 | 
 15 | **Short-term Roadmap**:
 16 | 
 17 | - [x] Add [DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) support
 18 | - [ ] Add CUDA Installation Guide
 19 | - [x] Add [RoBERTa](https://arxiv.org/abs/1907.11692) support
 20 | - [x] Add NER support
 21 | 
 22 | ___
 23 | ## Updates
 24 | 
 25 | **Update 06.07.2020: Alternative ways of usage**
 26 | 
 27 | Hi everyone,
 28 | 
 29 | long time no see! 
 30 | 
 31 | Though the content of this repository should still do the job even with the newest Rasa version, a lot happened the past months.
 32 | 
 33 | * [Spacy updated to version 2.3](https://spacy.io/usage/v2-3#_title)
 34 | * HuggingFace released version 3 of their [transformers](https://github.com/huggingface/transformers) library
 35 | * Rasa released version [1.10.5](https://rasa.com/docs/rasa/changelog/#id1) of their library
 36 | 
 37 | Since there is a lot of change going on regarding Spacy version 3, the currently used spacy-transformers library most likely
 38 | won't get any more updates. I therefore strongly recommend to use the transformers library to achieve the same finetuning 
 39 | results as with Spacy. In order to do so, simply use the script "huggingface_finetune.py" alongside a [HFTransformersNLP](https://rasa.com/docs/rasa/nlu/components/#hftransformersnlp) component the following way:
 40 | 
 41 | ```
 42 | pipeline:
 43 |  - name: HFTransformersNLP
 44 |    model_name: "bert"
 45 |    model_weights: "PATH_TO_YOUR_FINETUNED_MODEL_DIRECTORY"
 46 |    cache_dir: "PATH_TO_SOME_CACHE_FOLDER"
 47 |  - name: LanguageModelFeaturizer
 48 |  - name: DIETClassifier
 49 |    random_seed: 42
 50 |    intent_classification: True
 51 |    entity_recognition: False
 52 |    use_masked_language_model: True
 53 |    epochs: 80
 54 |    number_of_transformer_layers: 4
 55 |    transformer_size: 256
 56 |    drop_rate: 0.2
 57 |    weight_sparsity: 0.7
 58 |    batch_size: [64, 256]
 59 |    embedding_dimension: 50
 60 |    hidden_layer_sizes:
 61 |      text: [512, 128]
 62 | ```
 63 | 
 64 | Please change the settings according to your situation, especially the Hyperparameters for DIET. The given ones prove to perform
 65 | good on the german language.
 66 | 
 67 | Tested with:
 68 | 
 69 | * python = 3.6.8
 70 | * transformers = 3.0.0
 71 | * rasa = 1.10.5
 72 | 
 73 | 
 74 | **Update 24.03.2020: Changes to Rasa and Spacy**
 75 | 
 76 | I verified that everything is still working with:
 77 | 
 78 | * python = 3.6.8
 79 | * spacy = 2.2.4
 80 | * spacy-transformers = 0.5.1
 81 | * rasa = 1.8.2
 82 | 
 83 | Besides, Rasa added the [HFTransformersNLP](https://rasa.com/docs/rasa/nlu/components/#hftransformersnlp) pipeline element to its core which enables the user to use every [pretrained model](https://huggingface.co/transformers/pretrained_models.html) accordingly. However, this currently doesn't replace the finetuning aspect which still significantly boosts the models performance. I am currently working on a finetuning CustomComponent for Rasa.
 84 | 
 85 | **Update 28.12.2019: DistilBERT**
 86 | 
 87 | I finally got the time to add a [DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) version
 88 | that can be used for finetuning and as a Spacy model used in Rasa.
 89 | 
 90 | I order to use this one you need to follow these steps:
 91 | 
 92 | 1. Modify the files changed in [this PR](https://github.com/explosion/spacy-transformers/pull/120) in your local spacy-transformers installation
 93 | 2. Modify the files changed in [this PR](https://github.com/explosion/spacy-transformers/pull/121) in your local spacy-transformers installation
 94 | 3. Download [this DistilBERT model](https://huggingface.co/distilbert-base-german-cased) from HuggingFace into your repository
 95 | 4. In the downloaded directory make sure to have the following files present: `config.json`, `pytorch_model.bin`, `vocab.txt`
 96 | 5. Use the command `python examples/init_model.py --lang de --name distilbert-base-german-cased /path/to/model` from the `spacy-transformers` repo
 97 | 6. You should see a new folder *distilbert-base-german-cased* with the spacy-initiated model files. Use:
 98 | 
 99 | ```
100 | python -m spacy package distilbert-base-german-cased/ /packaged_model
101 | cd /packaged_model/de_distilbert_base_german_cased-0.0.1
102 | python setup.py sdist
103 | pip install dist/de_distilbert_base_german_cased-0.0.1.tar.gz
104 | ```
105 | 
106 | * (Optional) If you want to **finetune** de_distilbert_base_german_cased, change the `trf_textcat` architecture to `softmax_last_hidden`
107 | * (Optional) If you want to **create an Excel-file** for finetuning out of an existing **`nlu.md` from Rasa**, you can use `create_xlsx_dataset_from_rasa_nlu.py` to create one.
108 | 
109 | It is worth to mention, that every model that is supported by the `transformers` library can be converted and used
110 | this way. If you want to do that, simply use the `init_model.py` of `spacy-transformers` this way:
111 | 
112 | ```
113 | python examples/init_model.py --lang xx --name TRANSFOMERS_MODEL_NAME /path/to/model
114 | ```
115 | ___
116 | 
117 | **Update 28.12.2019: NER finetuning**
118 | 
119 | I finally got the time to evaluate the NER support for training an already finetuned BERT/DistilBERT model on
120 | a *Named Entity Recognition* task. 
121 | 
122 | In order to use this one, follow these steps:
123 | 
124 | 1. Modify the files in [this PR](https://github.com/explosion/spacy-transformers/pull/95) in your current spacy-transformers installation
125 | 2. Modify the files changed in [this PR](https://github.com/explosion/spacy-transformers/pull/120) in your local spacy-transformers installation
126 | 3. Modify the files changed in [this PR](https://github.com/explosion/spacy-transformers/pull/121) in your local spacy-transformers installation
127 | 4. Use the added `bert_finetuner_ner.py` script from the spacy-transformers library on any pretrained BERT-architectured model
128 | 
129 | After the finetuning process finished, you can treat the resulting model as later explained in this guide by *packaging* it
130 | for the usage in Rasa.
131 | 
132 | ## Installation
133 | 
134 | ### Requirements
135 | 
136 | Basically all you need to to is execute:
137 | 
138 | ```
139 | pip install -r requirements.txt
140 | ```
141 | 
142 | The scripts are tested using the following libraries:
143 | 
144 | * python = 3.6.8
145 | * spacy = 2.2.3
146 | * spacy-transformers = 0.5.1
147 | * rasa = 1.6.0
148 | * transformers 2.3.0
149 | 
150 | Please keep in mind that some of the dependencies are work in progress and there might be inter-incompatibilities. 
151 | However, at the time of writing this, the libraries can simply be installed by using `pip`.
152 | 
153 | I strongly suggest do finetune and test BERT with GPU support since finetuning on even a good CPU
154 | can last several hours per epoch. 
155 | ___
156 | ### Getting started
157 | 
158 | #### Preparing the dataset
159 | 
160 | The *split* is done by the finetuning script. If you want to have a different setting,
161 | feel free to modify the script.
162 | 
163 | As suggested, we do a simple but stratified train-test split with 15% as the test subset and 85% as the training subset, which results in 8732 training
164 | samples and 1541 evaluation samples. As there are many possibilities left, this is only one
165 | possible approach. While converting the `articles.csv` into a pandas dataframe, there were some broken lines
166 | which currently are omitted.
167 | ___
168 | #### Loading the pretrained BERT
169 | 
170 | The script assumes the pretrained BERT to be installed with:
171 | 
172 | ```
173 | python -m spacy download de_trf_bertbasecased_lg
174 | ```
175 | 
176 | For the sake of interest, I have added the ``bert_config.json`` from Deepset's awesome work
177 | if someone wonders how the ``de_trf_bertbasecased_lg`` was trained.
178 | ___
179 | #### Finetune the pretrained BERT
180 | 
181 | You can start the finetuning process by using:
182 | 
183 | ```
184 | python bert_finetuner_splitset.py de_trf_bertbasecased_lg -o finetuning\output
185 | ```
186 | 
187 | Currently, I am using a ```softmax_pooler_ouput``` configuration for the ``trf_textcat``component.
188 | I'd suggest a ``softmax_last_hidden`` as the next approach. The other parameters
189 | were set based on several evaluations and might be modified for your specific use case.
190 | ___
191 | #### Package the finetuned BERT with spaCy and install it
192 | 
193 | You can easily package your newly trained model by using:
194 | 
195 | ```
196 | python -m spacy package finetuning/output /packaged_model
197 | cd /packaged_model/de_trf_bertbasecased_lg-1.0.0
198 | python setup.py sdist
199 | pip install dist/de_trf_bertbasecased_lg-1.0.0.tar.gz
200 | ```
201 | 
202 | I recommend **changing the model's name** to avoid unnecessary inconveniences
203 | by editting the config file and modifying the ``name`` value of `/finetuning/output/meta.json`.
204 | 
205 | ___
206 | #### Load the spaCy model as part of your Rasa pipeline (optional)
207 | 
208 | At the time of writing this, BERT outperforms most of the recent state-of-the-art approaches
209 | in NLP/NLU tasks, e.g. document classification. 
210 | Since those techniques are used in several **conversational AI** tasks like **intent detection**, I thought it might be a good idea to evaluate its performance with **Rasa** - IMHO one of the
211 | best open source CAI engines currently available.
212 | 
213 | If someone is interested in building a chatbot with Rasa, it might be a good idea to read the
214 | [Getting started](https://rasa.com/docs/getting-started/) guide.
215 | 
216 | Assuming that someone is familiar with Rasa, here is one possible configuration proposal which
217 | loads the newly added finetuned BERT model as a part of the training pipeline:
218 | 
219 | ```
220 | language: de
221 | pipeline: 
222 |  - name: SpacyNLP
223 |    case_sensitive: 1
224 |    model: de_trf_bertbasecased_lg_gnad
225 |  - name: SpacyTokenizer
226 |  - name: SpacyFeaturizer
227 |  - name: SklearnIntentClassifier
228 | ```
229 | 
230 | As you can see, I just specified the model's name, using the spaCy architecture with
231 | Rasa. This works, even if ``python -m spacy validate`` does **not** show your model.
232 | 
233 | Assuming that you might want to test the performance with Rasa, you can use the ``test_bot`` directory
234 | which contains the skeletton for a Rasa bot to do so. In advance, use:
235 | 
236 | ```
237 | python rasa_bot_generator.py
238 | cp test.md test_bot/test_data/
239 | cp train.md test_bot/data/
240 | cd test_bot
241 | rasa train --data data/ -c config.yml -d domain.yml --out models/
242 | rasa run -m models/ --enable-api
243 | ```
244 | 
245 | to create a valid ``stories.md`` and a valid ``domain.yml``. Please keep in mind that
246 | this will be a minimal sample from which I don't recommend to use it productively.
247 | 
248 | If the bot is loaded, you can use the endpoint:
249 | 
250 | ```
251 | http://localhost:5005/model/parse
252 | 
253 | POST
254 | {
255 | 	"text": "<any article you want to get its domain for>"
256 | }
257 | 
258 | ```
259 | ___
260 | #### Evaluate different pipelines
261 | 
262 | To keep things simple, there are two scripts which will do the work for you.
263 | 
264 | **bert_classify** evaluates the finetuned BERT by training a logistic regression
265 | and a simple SVM classifier.
266 | 
267 | ```
268 | python -m bert_classify.py 
269 | ```
270 | 
271 | **bert_rasa_classify** loads the trained Rasa model and uses the pretrained BERT features to evaluate the
272 | model's performance on the test data. Keep in mind that Rasa *compresses* your model, so you simply
273 | have to unzip/untar it and also modify the path to the NLU model in the script.
274 | 
275 | ```
276 | python -m bert_rasa_classify.py 
277 | ```
278 | 
279 | Please be aware of the fact that to evaluate the **generalization capabilities** of the model,
280 | it would be better to split the original dataset into three parts such that there is a dataset
281 | completely unknown by the model (i.e. train/validation/test split).
282 | ___
283 | #### Productive usage of a large BERT model
284 | 
285 | TBD
286 | ___
287 | 
288 | #### A note on NER (Named Entity Recognition)
289 | 
290 | As soon as I realized that I won’t be able to use the finetuned BERT-spaCy model in rasa for e.g. extracting entities like PERSON (in fact, duckling is currently not able to do that), I thought about how this would be done in general:
291 | 
292 | 1. Use the SpacyFeaturizer and SpacyEntityExtractor which currently would be recommended but which is not possible due to manual effort on the side of BERT (as mentioned, I am working on that).
293 | 2. Finetuning the pretrained BERT that afterwards is converted into a spaCy-compatible model on any NER dataset is absolutely possible and intended. We can finetune the BERT on both tasks alongside. If so, the model contains everything we are going to need to derive entities from it. Currently just not with spaCy directly. Instead we could use a CustomBERTEntityExtractor which loads the model that the pipeline already has loaded and do the work, that spaCy is currently not “able” to do.
294 | 
295 | 3. Since 2 seems to be an overhead at least for the moment, why not do the following:
296 | ```
297 | language: de
298 | pipeline: 
299 |  - name: SpacyNLP
300 |    case_sensitive: 1
301 |    model: de_trf_bertbasecased_lg_gnad
302 |  - name: SpacyTokenizer
303 |  - name: SpacyFeaturizer
304 |  - name: SklearnIntentClassifier
305 |  - name: SpacyNLP
306 |    case_sensitive: 1
307 |    model: de_core_news_md
308 |  - name: RegexFeaturizer
309 |  - name: CRFEntityExtractor
310 |  - name: DucklingHTTPExtractor
311 |    dimensions: ['time', 'duration', 'email']
312 |    locale: de_DE
313 |    timezone: Europe/Berlin
314 |    url: http://localhost:8001
315 |  - name: SpacyEntityExtractor
316 |    dimensions: ['PER', 'LOC', 'CARDINAL']
317 |  - name: rasa_mod_regex.RegexEntityExtractor
318 |  - name: EntitySynonymMapper
319 | 
320 | ```
321 | This pipeline will then load and use the features of de_trf_bertbasecased_lg_gnad for SklearnIntentClassifier, and the features of de_core_news_md for SpacyEntityExtractor.
322 | 
323 | This is not a neat solution and it should only be used until there is a smarter way (1,2) but it works.
324 | 
325 | It should be mentioned, that of course you are able to even train your own with spaCy.
326 | 
327 | 
328 | #### Troubleshooting
329 | 
330 | 
331 | ##### CUDA Out of Memory
332 | 
333 | As discussed in a [spacy-trf-issue](https://github.com/explosion/spacy-pytorch-transformers/issues/48) you may run into
334 | memory problems. I have tested the finetuning script on a *GTX 1080 with 8GB VRAM* and even with a batch size of
335 | 2 (which is absolutely *not* recommended), I got memory problems.
336 | 
337 | One way to deal with it is to use the sentencizer which splits larger documents into sentences while keeping their original labels.
338 | Another way is to reduce the batch size by half, to 12. BERT models usually need bigger batches but for the sake of functionality, I tried it.
339 | 
340 | Currently I am using a *T80 with 12 GB VRAM*, sentencizing and a lowered batch size and that setup worked fine.
341 | 
342 | 
343 | ##### AttributeError: module 'thinc_gpu_ops' has no attribute 'mean_pool'
344 | 
345 | As discussed [here](https://github.com/explosion/spacy-pytorch-transformers/issues/27) you might run into the mentioned
346 | error. I was able to resolve it by manually cloning thinc-gpu-ops, running ``pip install -r requirements.txt`` (that actually installed cython) and then running ``pip install`` .
347 | 
348 | ___
349 | 
350 | 
351 | 
352 | 
353 | A *thank you* goes to all of the **amazing open source workers** out there:
354 | 
355 | * [Rasa](https://github.com/RasaHQ)
356 | * [spaCy](https://github.com/explosion/spaCy)
357 | * [Deepset](https://deepset.ai/german-bert)
358 | * [HuggingFace](https://github.com/huggingface/pytorch-transformers)
359 | * [MKaze](https://github.com/mkaze/)
360 | 
361 | 
362 | 
363 | 


--------------------------------------------------------------------------------
/bert-base-german-cased/bert_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 768,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 3072,
 8 |   "max_position_embeddings": 512,
 9 |   "num_attention_heads": 12,
10 |   "num_hidden_layers": 12,
11 |   "type_vocab_size": 2,
12 |   "vocab_size": 30000
13 | }
14 | 


--------------------------------------------------------------------------------
/bert_classify.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf-8
 3 | 
 4 | import json
 5 | import logging.config
 6 | import spacy
 7 | import numpy as np
 8 | from sklearn.linear_model import LogisticRegression
 9 | from sklearn.svm import SVC
10 | 
11 | 
12 | logging.config.fileConfig('logging.conf')
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | def get_features(nlp, texts):
17 |     """
18 | 
19 |     :param nlp:
20 |     :param texts:
21 |     :return:
22 |     """
23 |     features = []
24 |     for doc in nlp.pipe(texts, batch_size=32):
25 |         features.append(doc.vector)
26 |     return np.array(features)
27 | 
28 | 
29 | # Load fine-tuned model
30 | model_dir = 'de_pytt_bertbasecased_lg_gnad'
31 | logger.info('Loading fine-tuned model...')
32 | nlp = spacy.load(model_dir)
33 | 
34 | # Load test data
35 | logger.info('Loading test data...')
36 | with open('data/test.json', 'r', encoding='utf-8') as handle:
37 |     test_data = json.load(handle)
38 | 
39 | with open('data/train.json', 'r', encoding='utf-8') as handle:
40 |     train_data = json.load(handle)
41 | 
42 | train_cats, train_texts = zip(*train_data)
43 | test_cats, test_texts = zip(*test_data)
44 | 
45 | 
46 | # Get the features of training and test data
47 | logger.info('Get features of training and test data...')
48 | train_feats = get_features(nlp, train_texts)
49 | test_feats = get_features(nlp, test_texts)
50 | 
51 | 
52 | # Train a logistic regression model
53 | logger.info('Train a Logistic Regression model...')
54 | clsr = LogisticRegression()
55 | clsr.fit(train_feats, train_cats)
56 | logger.info('Accuracy of Logistic Regression model on test data: {}'.format(clsr.score(test_feats, test_cats)))
57 | 
58 | # Train a SVM model
59 | logger.info('Train a SVM model...')
60 | svc = SVC(C=1, gamma=0.1, kernel='linear')
61 | svc.fit(train_feats, train_cats)
62 | logger.info('Accuracy of SVM model on test data: {}'.format(svc.score(test_feats, test_cats)))
63 | 
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/bert_finetuner_ner.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf8
  3 | """Example of training spaCy's named entity recognizer, starting off with an
  4 | existing model or a blank model.
  5 | 
  6 | For more details, see the documentation:
  7 | * Training: https://spacy.io/usage/training
  8 | * NER: https://spacy.io/usage/linguistic-features#named-entities
  9 | 
 10 | Compatible with: spaCy v2.0.0+
 11 | Last tested with: v2.1.0
 12 | """
 13 | from __future__ import unicode_literals, print_function
 14 | 
 15 | import plac
 16 | import random
 17 | from pathlib import Path
 18 | import spacy
 19 | from spacy.util import minibatch, compounding
 20 | 
 21 | 
 22 | # training data
 23 | TRAIN_DATA = [
 24 |     ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
 25 |     ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
 26 | ]
 27 | 
 28 | 
 29 | @plac.annotations(
 30 |     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
 31 |     output_dir=("Optional output directory", "option", "o", Path),
 32 |     n_iter=("Number of training iterations", "option", "n", int),
 33 | )
 34 | def main(model=None, output_dir=None, n_iter=100):
 35 |     """Load the model, set up the pipeline and train the entity recognizer."""
 36 |     nlp = spacy.load(model)  # load existing spaCy model
 37 |     print("Loaded model '%s'" % model)
 38 | 
 39 |     # create the built-in pipeline components and add them to the pipeline
 40 |     # nlp.create_pipe works for built-ins that are registered with spaCy
 41 |     ner = nlp.create_pipe("trf_ner")
 42 |     nlp.add_pipe(ner, last=True)
 43 | 
 44 |     # add labels
 45 |     for _, annotations in TRAIN_DATA:
 46 |         for ent in annotations.get("entities"):
 47 |             ner.add_label(ent[2])
 48 | 
 49 |     # reset and initialize the weights randomly – but only if we're
 50 |     # training a new model
 51 |     nlp.resume_training()
 52 |     for itn in range(n_iter):
 53 |         random.shuffle(TRAIN_DATA)
 54 |         losses = {}
 55 |         # batch up the examples using spaCy's minibatch
 56 |         batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
 57 |         for batch in batches:
 58 |             texts, annotations = zip(*batch)
 59 |             nlp.update(
 60 |                 texts,  # batch of texts
 61 |                 annotations,  # batch of annotations
 62 |                 drop=0.5,  # dropout - make it harder to memorise data
 63 |                 losses=losses,
 64 |             )
 65 |         print("Losses", losses)
 66 | 
 67 |     # test the trained model
 68 |     for text, _ in TRAIN_DATA:
 69 |         doc = nlp(text)
 70 |         print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
 71 |         print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 72 | 
 73 |     # save model to output directory
 74 |     if output_dir is not None:
 75 |         output_dir = Path(output_dir)
 76 |         if not output_dir.exists():
 77 |             output_dir.mkdir()
 78 |         nlp.to_disk(output_dir)
 79 |         print("Saved model to", output_dir)
 80 | 
 81 |         # test the saved model
 82 |         print("Loading from", output_dir)
 83 |         nlp2 = spacy.load(output_dir)
 84 |         for text, _ in TRAIN_DATA:
 85 |             doc = nlp2(text)
 86 |             print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
 87 |             print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 88 | 
 89 | 
 90 | if __name__ == "__main__":
 91 |     plac.call(main)
 92 | 
 93 |     # Expected output:
 94 |     # Entities [('Shaka Khan', 'PERSON')]
 95 |     # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
 96 |     # ('Khan', 'PERSON', 1), ('?', '', 2)]
 97 |     # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
 98 |     # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
 99 |     # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
100 | 


--------------------------------------------------------------------------------
/bert_finetuner_splitset.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from collections import Counter
  3 | 
  4 | import plac
  5 | import re
  6 | import random
  7 | import json
  8 | import spacy
  9 | import torch
 10 | import tqdm
 11 | import unicodedata
 12 | import logging.config
 13 | import numpy as np
 14 | import pandas as pd
 15 | from pathlib import Path
 16 | 
 17 | import wasabi
 18 | from sklearn.model_selection import train_test_split
 19 | from spacy.util import minibatch
 20 | from spacy_transformers.util import cyclic_triangular_rate
 21 | 
 22 | logging.config.fileConfig('logging.conf')
 23 | logger = logging.getLogger(__name__)
 24 | 
 25 | 
 26 | @plac.annotations(
 27 |     model=("Model name", "positional", None, str),
 28 |     output_dir=("Optional output directory", "option", "o", Path),
 29 |     batch_size=("Number of docs per batch", "option", "bs", int),
 30 |     learn_rate=("Learning rate", "option", "lr", float),
 31 |     n_iter=("Number of training epochs", "option", "n", int),
 32 |     sentence_split=("If the script should split docs into their sentences", "option", "s", bool),
 33 | )
 34 | def main(
 35 |     model,
 36 |     output_dir=None,
 37 |     n_iter=4,
 38 |     batch_size=24,
 39 |     learn_rate=2e-5,
 40 |     sentence_split=False
 41 | ):
 42 |     """
 43 | 
 44 |     :param model:
 45 |     :param output_dir:
 46 |     :param n_iter:
 47 |     :param batch_size:
 48 |     :param learn_rate:
 49 |     :return:
 50 |     """
 51 | 
 52 |     max_wpb = 1000
 53 | 
 54 |     spacy.util.fix_random_seed(0)
 55 |     is_using_gpu = spacy.prefer_gpu()
 56 |     if is_using_gpu:
 57 |         torch.set_default_tensor_type("torch.cuda.FloatTensor")
 58 |     if output_dir is not None:
 59 |         output_dir = Path(output_dir)
 60 |         if not output_dir.exists():
 61 |             output_dir.mkdir()
 62 | 
 63 |     # Creating the output directory if it's not already present
 64 |     if output_dir is not None:
 65 |         output_dir = Path(output_dir)
 66 |         if not output_dir.exists():
 67 |             output_dir.mkdir()
 68 | 
 69 |     # Load the pretrained BERT with spacy and check the pipeline names
 70 |     nlp = spacy.load(model)
 71 |     logger.info('Loaded model: {}'.format(model))
 72 |     logger.info('Loaded models pipeline names: {}'.format(nlp.pipe_names))
 73 | 
 74 |     # Using a softmax pooler output as first approach.
 75 |     textcat = nlp.create_pipe(
 76 |         "trf_textcat", config={"architecture": "softmax_pooler_output", "words_per_batch": max_wpb}
 77 |     )
 78 | 
 79 |     # Loading the 10kGNAD dataset with pandas, representing its labels as a list
 80 |     logger.info("Loading domain specific data...")
 81 |     df_train_set = pd.read_csv('data/articles.csv', delimiter=';', error_bad_lines=False, names=['label', 'article'])
 82 |     train_label_list = df_train_set['label'].unique().tolist()
 83 | 
 84 |     logger.info(
 85 |         f"Using {len(df_train_set)} training docs overall.")
 86 | 
 87 |     # Do a stratified train test split and persist the result for later usage
 88 |     train_dataframe, eval_dataframe_first = train_test_split(df_train_set, test_size=0.4, stratify=df_train_set['label'])
 89 |     eval_dataframe, dev_dataframe = train_test_split(eval_dataframe_first, test_size=0.5, stratify=eval_dataframe_first['label'])
 90 | 
 91 |     train_data = list(train_dataframe.itertuples(index=False, name=None))
 92 |     test_data = list(eval_dataframe.itertuples(index=False, name=None))
 93 |     dev_data = list(dev_dataframe.itertuples(index=False, name=None))
 94 | 
 95 |     logger.info(f"Using {len(train_data)} training docs, {len(test_data)} evaluation) and {len(dev_data)} for development.")
 96 | 
 97 |     # Some of the evaluation scripts are loading JSON so we are going to provide the split as JSON aswell
 98 |     with open('data/train.json', 'w') as handle:
 99 |         json.dump(train_data, handle)
100 | 
101 |     with open('data/test.json', 'w') as handle:
102 |         json.dump(test_data, handle)
103 | 
104 |     with open('data/dev.json', 'w') as handle:
105 |         json.dump(dev_data, handle)
106 | 
107 |     # Since rasa usually reads from markdown files, we are going ti provide the split as MD aswell
108 |     create_rasa_training_set(train_dataframe)
109 |     create_rasa_test_set(eval_dataframe)
110 |     create_rasa_dev_set(dev_dataframe)
111 | 
112 |     # For later usage, we persist the labels separate from the rest aswell
113 |     with open('data/labels.json', 'w', encoding='utf-8') as file:
114 |         json.dump(train_label_list, file)
115 | 
116 |     # Add all the labels to the finetuner
117 |     for label in train_label_list:
118 |         textcat.add_label(str(label))
119 | 
120 |     # Proper represent the labels
121 |     (train_texts, train_cats), (eval_texts, eval_cats) = load_data(
122 |         train_dataframe=train_dataframe, eval_dataframe=eval_dataframe, label_list=train_label_list
123 |     )
124 | 
125 |     # Configuring the pipeline for the finetuning process
126 |     nlp.add_pipe(textcat, last=True)
127 | 
128 |     # It might be a good idea to split sentences of an article into separate training samples
129 |     # For the moment, we are skipping that step to keep things simple.
130 | 
131 |     if sentence_split:
132 |         logger.info(f"Sentencicing ...")
133 |         train_texts, train_cats = make_sentence_examples(nlp, train_texts, train_cats)
134 |         logger.info(f"Extracted {len(train_texts)} training sentences.")
135 | 
136 |     total_words = sum(len(text.split()) for text in train_texts)
137 |     train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
138 | 
139 |     # Initialize the TextCategorizer, and create an optimizer.
140 |     optimizer = nlp.resume_training()
141 |     optimizer.alpha = 0.001
142 |     optimizer.trf_weight_decay = 0.005
143 |     optimizer.L2 = 0.0
144 |     learn_rates = cyclic_triangular_rate(
145 |         learn_rate / 3, learn_rate * 3, 2 * len(train_data) // batch_size
146 |     )
147 | 
148 |     pbar = tqdm.tqdm(total=100, leave=False)
149 |     results = []
150 |     epoch = 0
151 |     step = 0
152 |     eval_every = 100
153 |     patience = 3
154 |     while True:
155 |         # Train and evaluate
156 |         losses = Counter()
157 |         random.shuffle(train_data)
158 |         batches = minibatch(train_data, size=batch_size)
159 |         for batch in batches:
160 |             optimizer.trf_lr = next(learn_rates)
161 |             texts, annotations = zip(*batch)
162 |             nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses)
163 |             pbar.update(1)
164 |             if step and (step % eval_every) == 0:
165 |                 pbar.close()
166 |                 with nlp.use_params(optimizer.averages):
167 |                     scores = evaluate_multiclass(nlp, eval_texts, eval_cats)
168 |                 results.append((scores["textcat_acc"], step, epoch))
169 |                 print(
170 |                     "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
171 |                         losses["trf_textcat"],
172 |                         scores["textcat_acc"],
173 |                         scores["textcat_cor"],
174 |                         scores["textcat_wrg"],
175 |                     )
176 |                 )
177 |                 pbar = tqdm.tqdm(total=eval_every, leave=False)
178 |             step += 1
179 |         epoch += 1
180 |         # Stop if no improvement in HP.patience checkpoints
181 |         if results:
182 |             best_score, best_step, best_epoch = max(results)
183 |             if ((step - best_step) // eval_every) >= patience:
184 |                 break
185 | 
186 |     msg = wasabi.Printer()
187 |     table_widths = [2, 4, 6]
188 |     msg.info(f"Best scoring checkpoints")
189 |     msg.row(["Epoch", "Step", "Score"], widths=table_widths)
190 |     msg.row(["-" * width for width in table_widths])
191 |     for score, step, epoch in sorted(results, reverse=True)[:10]:
192 |         msg.row([epoch, step, "%.2f" % (score * 100)], widths=table_widths)
193 | 
194 |     if output_dir is not None:
195 |         nlp.to_disk(output_dir)
196 |         logger.info("Saved model to {}".format(output_dir))
197 | 
198 | 
199 | def make_sentence_examples(nlp, texts, labels):
200 |     """
201 | 
202 |     :param nlp:
203 |     :param texts:
204 |     :param labels:
205 |     :return:
206 |     """
207 |     sents = []
208 |     sent_cats = []
209 |     for text, cats in zip(texts, labels):
210 |         doc = nlp.make_doc(text)
211 |         doc = nlp.get_pipe("sentencizer")(doc)
212 |         for sent in doc.sents:
213 |             sents.append(sent.text)
214 |             sent_cats.append(cats)
215 |     return sents, sent_cats
216 | 
217 | def preprocess_text(text):
218 |     """
219 | 
220 |     :param text:
221 |     :return:
222 |     """
223 | 
224 |     white_re = re.compile(r"\s\s+")
225 |     text = text.replace("<s>", "<open-s-tag>")
226 |     text = text.replace("</s>", "<close-s-tag>")
227 |     text = white_re.sub(" ", text).strip()
228 |     return "".join(
229 |         c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn"
230 |     )
231 | 
232 | 
233 | def load_data(train_dataframe, eval_dataframe, label_list):
234 |     """
235 | 
236 |     :param train_dataframe:
237 |     :param eval_dataframe:
238 |     :param label_list:
239 |     :return:
240 |     """
241 | 
242 |     train_data = list(train_dataframe.itertuples(index=False, name=None))
243 |     dev_data = list(eval_dataframe.itertuples(index=False, name=None))
244 |     train_labels, train_texts = _prepare_partition(train_data, preprocess=False, label_list=label_list)
245 |     dev_labels, dev_texts = _prepare_partition(dev_data, preprocess=False, label_list=label_list)
246 |     return (train_labels, train_texts), (dev_labels, dev_texts)
247 | 
248 | 
249 | def _prepare_partition(text_label_tuples, *, preprocess=False, label_list):
250 |     """
251 | 
252 |     :param text_label_tuples:
253 |     :param preprocess:
254 |     :param label_list:
255 |     :return:
256 |     """
257 |     labels, texts = zip(*text_label_tuples)
258 |     if preprocess:
259 |         texts = [preprocess_text(text) for text in texts]
260 |     cats = [{str(i): 1.0 if i == y else 0.0 for i in label_list} for y in labels]
261 | 
262 |     return texts, cats
263 | 
264 | 
265 | def find_max_prob_cat(cats_probs):
266 |     """
267 | 
268 |     :param cats_probs:
269 |     :return:
270 |     """
271 |     cats, probs = zip(*cats_probs.items())
272 |     idx = np.argmax(probs)
273 |     return cats[idx]
274 | 
275 | 
276 | def evaluate_multiclass(nlp, texts, cats):
277 |     """
278 | 
279 |     :param nlp:
280 |     :param texts:
281 |     :param cats:
282 |     :return:
283 |     """
284 |     correct = 0
285 |     total_words = sum(len(text.split()) for text in texts)
286 |     with tqdm.tqdm(total=total_words, leave=False) as pbar:
287 |         for i, doc in enumerate(nlp.pipe(texts, batch_size=8)):
288 |             true_label = find_max_prob_cat(cats[i])
289 |             pred_label = find_max_prob_cat(doc.cats)
290 |             if true_label == pred_label:
291 |                 correct += 1
292 |             pbar.update(len(doc.text.split()))
293 |     return {'textcat_acc': float(correct) / len(texts),
294 |             'textcat_cor': correct,
295 |             'textcat_wrg': len(texts) - correct}
296 | 
297 | 
298 | def create_rasa_training_set(df_train_set):
299 |     """
300 | 
301 |     :param df_train_set:
302 |     :return:
303 |     """
304 |     label_samples = {}
305 |     with open('data/train.md', 'w', encoding='utf-8') as file:
306 |         for index, entry in df_train_set.iterrows():
307 |             if entry['label'] not in label_samples:
308 |                 label_samples[entry['label']] = []
309 |                 label_samples[entry['label']].append(entry['article'])
310 |             else:
311 |                 label_samples[entry['label']].append(entry['article'])
312 |         for label, articles in label_samples.items():
313 |             file.write('## intent:' + label + '\n')
314 |             for article in articles:
315 |                 file.write('- ' + article + '\n')
316 | 
317 | 
318 | def create_rasa_dev_set(df_dev_set):
319 |     """
320 | 
321 |     :param df_dev_set:
322 |     :return:
323 |     """
324 |     label_samples = {}
325 |     with open('data/dev.md', 'w', encoding='utf-8') as file:
326 |         for index, entry in df_dev_set.iterrows():
327 |             if entry['label'] not in label_samples:
328 |                 label_samples[entry['label']] = []
329 |                 label_samples[entry['label']].append(entry['article'])
330 |             else:
331 |                 label_samples[entry['label']].append(entry['article'])
332 |         for label, articles in label_samples.items():
333 |             file.write('## intent:' + label + '\n')
334 |             for article in articles:
335 |                 file.write('- ' + article + '\n')
336 | 
337 | 
338 | def create_rasa_test_set(df_test_set):
339 |     """
340 | 
341 |     :param df_test_set:
342 |     :return:
343 |     """
344 |     label_samples = {}
345 |     with open('data/test.md', 'w', encoding='utf-8') as file:
346 |         for index, entry in df_test_set.iterrows():
347 |             if entry['label'] not in label_samples:
348 |                 label_samples[entry['label']] = []
349 |                 label_samples[entry['label']].append(entry['article'])
350 |             else:
351 |                 label_samples[entry['label']].append(entry['article'])
352 |         for label, articles in label_samples.items():
353 |             file.write('## intent:' + label + '\n')
354 |             for article in articles:
355 |                 file.write('- ' + article + '\n')
356 | 
357 | 
358 | if __name__ == "__main__":
359 |     plac.call(main)
360 | 


--------------------------------------------------------------------------------
/bert_rasa_classify.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import spacy
 3 | import numpy as np
 4 | import logging.config
 5 | from rasa.nlu.utils import json_unpickle
 6 | from sklearn.preprocessing import LabelEncoder
 7 | from sklearn import metrics
 8 | 
 9 | logging.config.fileConfig('logging.conf')
10 | logger = logging.getLogger(__name__)
11 | 
12 | # Set the paths
13 | classifier_file = '<path_to_model>/nlu/component_3_SklearnIntentClassifier_classifier.pkl'
14 | encoder_file = '<path_to_model>/nlu/component_3_SklearnIntentClassifier_encoder.pkl'
15 | 
16 | logger.info('Load Rasa classifier model')
17 | classifier = json_unpickle(classifier_file)
18 | classes = json_unpickle(encoder_file)
19 | encoder = LabelEncoder()
20 | encoder.classes_ = classes
21 | 
22 | 
23 | def get_features(nlp, texts):
24 |     """
25 | 
26 |     :param nlp:
27 |     :param texts:
28 |     :return:
29 |     """
30 |     features = []
31 |     for doc in nlp.pipe(texts, batch_size=32):
32 |         features.append(doc.vector)
33 |     return np.array(features)
34 | 
35 | # Load fine-tuned model
36 | model_dir = 'de_pytt_bertbasecased_lg_gnad'
37 | logger.info('Loading fine-tuned model...')
38 | nlp = spacy.load(model_dir)
39 | 
40 | # Load test data
41 | logger.info('Loading test data...')
42 | with open('data/test.json', 'r', encoding='utf-8') as handle:
43 |     test_data = json.load(handle)
44 | test_cats, test_texts = zip(*test_data)
45 | 
46 | # Get the features of test data
47 | logger.info('Get features of test data...')
48 | test_feats = get_features(nlp, test_texts)
49 | 
50 | # Encode labels
51 | test_labels = encoder.transform(test_cats)
52 | 
53 | preds = classifier.predict(test_feats)
54 | logger.info('Micro: {}'.format(metrics.precision_score(test_labels, preds, average='micro')))
55 | logger.info('Macro: {}'.format(metrics.precision_score(test_labels, preds, average='macro')))
56 | logger.info('Weighted: {}'.format(metrics.precision_score(test_labels, preds, average='weighted')))
57 | logger.info('Accuracy: {}'.format(classifier.score(test_feats, test_labels)))
58 | 


--------------------------------------------------------------------------------
/create_train_test_split.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | import pandas as pd
 4 | from sklearn.model_selection import train_test_split
 5 | 
 6 | 
 7 | def create_rasa_training_set(df_train_set):
 8 |     label_samples = {}
 9 |     with open('train.md', 'w', encoding='utf-8') as file:
10 |         for index, entry in df_train_set.iterrows():
11 |             if entry['label'] not in label_samples:
12 |                 label_samples[entry['label']] = []
13 |                 label_samples[entry['label']].append(entry['article'])
14 |             else:
15 |                 label_samples[entry['label']].append(entry['article'])
16 |         for label, articles in label_samples.items():
17 |             file.write('## intent:' + label + '\n')
18 |             for article in articles:
19 |                 file.write('- ' + article + '\n')
20 | 
21 | 
22 | def create_rasa_test_set(df_test_set):
23 |     label_samples = {}
24 |     with open('test.md', 'w', encoding='utf-8') as file:
25 |         for index, entry in df_test_set.iterrows():
26 |             if entry['label'] not in label_samples:
27 |                 label_samples[entry['label']] = []
28 |                 label_samples[entry['label']].append(entry['article'])
29 |             else:
30 |                 label_samples[entry['label']].append(entry['article'])
31 |         for label, articles in label_samples.items():
32 |             file.write('## intent:' + label + '\n')
33 |             for article in articles:
34 |                 file.write('- ' + article + '\n')
35 | 
36 | 
37 | df_train_set = pd.read_excel('mennekes_full_v2.xlsx')
38 | train_label_list = df_train_set['label'].unique().tolist()
39 | # do a stratified train test split and persist the result
40 | train_dataframe, eval_dataframe = train_test_split(df_train_set, test_size=0.10, stratify=df_train_set['label'])
41 | train_data = list(train_dataframe.itertuples(index=False, name=None))
42 | test_data = list(eval_dataframe.itertuples(index=False, name=None))
43 | 
44 | create_rasa_training_set(train_dataframe)
45 | create_rasa_test_set(eval_dataframe)
46 | 
47 | with open('train.json', 'w') as handle:
48 |     json.dump(train_data, handle)
49 | 
50 | with open('test.json', 'w') as handle:
51 |     json.dump(test_data, handle)
52 | 


--------------------------------------------------------------------------------
/create_xlsx_dataset_from_rasa_nlu.py:
--------------------------------------------------------------------------------
 1 | from rasa.nlu.training_data.formats import MarkdownReader
 2 | import xlsxwriter
 3 | 
 4 | workbook = xlsxwriter.Workbook('filename.xlsx')
 5 | worksheet = workbook.add_worksheet()
 6 | worksheet.write('A1', 'question')
 7 | worksheet.write('B1', 'label')
 8 | worksheet.write('C1', 'answer')
 9 | 
10 | training_data = ()
11 | row = 1
12 | col = 0
13 | 
14 | doc = "PATH\\TO\\nlu.md"
15 | 
16 | reader = MarkdownReader()
17 | reader.read(doc, language='de', fformat='MARKDOWN')
18 | for message in reader.training_examples:
19 |     training_data = training_data + ([message.text, message.get('intent')],)
20 | 
21 | for question, label in (training_data):
22 |     worksheet.write_string(row, col, question)
23 |     worksheet.write_string(row, col + 1, label)
24 |     worksheet.write_string(row, col + 2, '')
25 |     row += 1
26 | 
27 | workbook.close()
28 | 
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/huggingface_finetune.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import numpy as np
 4 | import tensorflow as tf
 5 | from transformers import BertConfig, BertTokenizer, TFBertForSequenceClassification, TFBertModel
 6 | from sklearn.preprocessing import LabelEncoder
 7 | from rasa.nlu.training_data.formats import MarkdownReader
 8 | 
 9 | # disable GPU
10 | #os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
11 | 
12 | 
13 | def read_inputs_json(input_path):
14 |     texts = []
15 |     cats = []
16 |     with open(input_path, mode="r") as file_:
17 |         data = json.load(file_)['rasa_nlu_data']['common_examples']
18 |         for d in data:
19 |             texts.append(d['text'])
20 |             cats.append(d['intent'])
21 | 
22 |     return texts, cats
23 | 
24 | 
25 | def read_inputs_md(input_path):
26 |     reader = MarkdownReader()
27 |     reader.read(input_path, language='de', fformat='MARKDOWN')
28 |     texts = []
29 |     cats = []
30 |     for message in reader.training_examples:
31 |         texts.append(message.text)
32 |         cats.append(message.get('intent'))
33 | 
34 |     return texts, cats
35 | 
36 | train_texts, train_labels = read_inputs_md(
37 |     os.path.join('train_test_split', 'training_data.md')
38 | )
39 | test_texts, test_labels = read_inputs_json(
40 |     os.path.join('train_test_split', 'test_data.md')
41 | )
42 | 
43 | le = LabelEncoder().fit(train_labels)
44 | train_labels = le.transform(train_labels)
45 | test_labels = le.transform(test_labels)
46 | 
47 | 
48 | config = BertConfig.from_pretrained("dbmdz/bert-base-german-cased",
49 |                                     num_labels=len(le.classes_))
50 | tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
51 | model = TFBertForSequenceClassification.from_pretrained(
52 |     "dbmdz/bert-base-german-cased",
53 |     config=config,
54 |     trainable=True,
55 | )
56 | 
57 | def encode(texts):
58 |     input_ids = []
59 |     attention_mask = []
60 |     for text in texts:
61 |         tokens = tokenizer.encode_plus(text,
62 |                                        max_length=128,
63 |                                        pad_to_max_length=True,
64 |                                        return_token_type_ids=False,
65 |                                        return_attention_mask=True)
66 |         input_ids.append(tokens['input_ids'])
67 |         attention_mask.append(tokens['attention_mask'])
68 |     
69 |     return np.array(input_ids), np.array(attention_mask)
70 | 
71 | 
72 | train_input_ids, train_attention_mask = encode(train_texts)
73 | test_input_ids, test_attention_maks = encode(test_texts)
74 | print(train_input_ids.shape, train_attention_mask.shape)
75 | 
76 | opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
77 | loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
78 | metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
79 | 
80 | model.compile(loss=loss, optimizer=opt, metrics=[metric])
81 | 
82 | print('Model compiled; start training...')
83 | history = model.fit([train_input_ids, train_attention_mask],
84 |                     train_labels,
85 |                     epochs=20,
86 |                     batch_size=8,
87 |                     validation_data=([test_input_ids, test_attention_maks], test_labels))
88 | 
89 | bert = TFBertModel.from_pretrained("dbmdz/bert-base-german-cased")
90 | bert.bert = model.bert
91 | bert.save_pretrained('model/')
92 | 
93 | tokenizer.save_pretrained('model/')
94 | 


--------------------------------------------------------------------------------
/logging.conf:
--------------------------------------------------------------------------------
 1 | [loggers]
 2 | keys=root,eva_core_api
 3 | 
 4 | [handlers]
 5 | keys=console
 6 | 
 7 | [formatters]
 8 | keys=simple
 9 | 
10 | [logger_root]
11 | level=DEBUG
12 | handlers=console
13 | 
14 | [logger_eva_core_api]
15 | level=DEBUG
16 | handlers=console
17 | qualname=eva_core_api
18 | propagate=0
19 | 
20 | [handler_console]
21 | class=StreamHandler
22 | level=DEBUG
23 | formatter=simple
24 | args=(sys.stdout,)
25 | 
26 | [formatter_simple]
27 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
28 | datefmt=
29 | 


--------------------------------------------------------------------------------
/rasa_bot_generator.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | def faq_domain_generator():
 4 |     """
 5 | 
 6 |     :return:
 7 |     """
 8 |     with open('test_bot/domain.yml', 'w', encoding='utf-8') as story_file:
 9 |         with open('labels.json', 'r', encoding='utf-8') as label_file:
10 |             labels_as_intents = json.load(label_file)
11 |             story_file.write('intents: \n')
12 |             for intent in labels_as_intents:
13 |                 story_file.write(f'  - {intent}\n')
14 |             story_file.write('actions: \n')
15 |             for intent in labels_as_intents:
16 |                 story_file.write(f'  - utter_{intent}\n')
17 | 
18 | def faq_story_generator():
19 |     """"""
20 | 
21 |     with open('test_bot/data/stories.md', 'w', encoding='utf-8') as story_file:
22 |         with open('labels.json', 'r', encoding='utf-8') as label_file:
23 |             labels_as_intents = json.load(label_file)
24 |             for intent in labels_as_intents:
25 |                 story_file.write(f'## faq_story_{intent}\n')
26 |                 story_file.write(f'* {intent}\n')
27 |                 story_file.write(f'  - utter_{intent}\n')
28 |                 story_file.write(f'  - action_restart\n')
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     faq_story_generator()
33 |     faq_domain_generator()


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | spacy
2 | rasa
3 | spacy-transformers
4 | pandas


--------------------------------------------------------------------------------
/test_bot/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JulianGerhard21/bert_spacy_rasa/0ee3739680c5015df65d963f85b3e4e9deded8a4/test_bot/__init__.py


--------------------------------------------------------------------------------
/test_bot/actions.py:
--------------------------------------------------------------------------------
 1 | # This files contains your custom actions which can be used to run
 2 | # custom Python code.
 3 | #
 4 | # See this guide on how to implement these action:
 5 | # https://rasa.com/docs/rasa/core/actions/#custom-actions/
 6 | 
 7 | 
 8 | from typing import Any, Text, Dict, List
 9 | 
10 | from rasa_sdk import Action, Tracker
11 | from rasa_sdk.executor import CollectingDispatcher


--------------------------------------------------------------------------------
/test_bot/config.yml:
--------------------------------------------------------------------------------
 1 | language: de
 2 | pipeline:
 3 |  - name: SpacyNLP
 4 |    case_sensitive: 1
 5 |    model: de_pytt_bertbasecased_lg_gnad
 6 |  - name: SpacyTokenizer
 7 |  - name: SpacyFeaturizer
 8 |  - name: SklearnIntentClassifier
 9 | policies:
10 |  - name: KerasPolicy
11 |    batch_size: 50
12 |    epochs: 200
13 |    max_training_samples: 300
14 |  - name: MemoizationPolicy
15 |    max_history: 5
16 |  - name: FormPolicy
17 |  - name: MappingPolicy
18 | 


--------------------------------------------------------------------------------
/test_bot/credentials.yml:
--------------------------------------------------------------------------------
 1 | # This file contains the credentials for the voice & chat platforms
 2 | # which your bot is using.
 3 | # https://rasa.com/docs/rasa/user-guide/messaging-and-voice-channels/
 4 | 
 5 | rest:
 6 | #  # you don't need to provide anything here - this channel doesn't
 7 | #  # require any credentials
 8 | 
 9 | 
10 | #facebook:
11 | #  verify: "<verify>"
12 | #  secret: "<your secret>"
13 | #  page-access-token: "<your page access token>"
14 | 
15 | #slack:
16 | #  slack_token: "<your slack token>"
17 | #  slack_channel: "<the slack channel>"
18 | 
19 | #socketio:
20 | #  user_message_evt: <event name for user message>
21 | #  bot_message_evt: <event name for but messages>
22 | #  session_persistence: <true/false>
23 | 
24 | rasa:
25 |   url: "http://localhost:5002/api"
26 | 


--------------------------------------------------------------------------------
/test_bot/domain.yml:
--------------------------------------------------------------------------------
 1 | intents:
 2 |   - Etat
 3 |   - Inland
 4 |   - International
 5 |   - Kultur
 6 |   - Panorama
 7 |   - Sport
 8 |   - Web
 9 |   - Wirtschaft
10 |   - Wissenschaft
11 | actions:
12 |   - utter_Etat
13 |   - utter_Inland
14 |   - utter_International
15 |   - utter_Kultur
16 |   - utter_Panorama
17 |   - utter_Sport
18 |   - utter_Web
19 |   - utter_Wirtschaft
20 |   - utter_Wissenschaft
21 | 


--------------------------------------------------------------------------------
/test_bot/endpoints.yml:
--------------------------------------------------------------------------------
 1 | # This file contains the different endpoints your bot can use.
 2 | 
 3 | # Server where the models are pulled from.
 4 | # https://rasa.com/docs/rasa/user-guide/running-the-server/#fetching-models-from-a-server/
 5 | 
 6 | #models:
 7 | #  url: http://my-server.com/models/default_core@latest
 8 | #  wait_time_between_pulls:  10   # [optional](default: 100)
 9 | 
10 | # Server which runs your custom actions.
11 | # https://rasa.com/docs/rasa/core/actions/#custom-actions/
12 | 
13 | action_endpoint:
14 |   url: "http://localhost:5055/webhook"
15 | 
16 | # Tracker store which is used to store the conversations.
17 | # By default the conversations are stored in memory.
18 | # https://rasa.com/docs/rasa/api/tracker-stores/
19 | 
20 | #tracker_store:
21 | #    type: redis
22 | #    url: <host of the redis instance, e.g. localhost>
23 | #    port: <port of your redis instance, usually 6379>
24 | #    db: <number of your database within redis, e.g. 0>
25 | #    password: <password used for authentication>
26 | 
27 | #tracker_store:
28 | #    type: mongod
29 | #    url: <url to your mongo instance, e.g. mongodb://localhost:27017>
30 | #    db: <name of the db within your mongo instance, e.g. rasa>
31 | #    username: <username used for authentication>
32 | #    password: <password used for authentication>
33 | 
34 | # Event broker which all conversation events should be streamed to.
35 | # https://rasa.com/docs/rasa/api/event-brokers/
36 | 
37 | #event_broker:
38 | #  url: localhost
39 | #  username: username
40 | #  password: password
41 | #  queue: queue
42 | 


--------------------------------------------------------------------------------