├── .gitignore ├── 10kGNAD-master ├── LICENSE ├── LICENSE.txt ├── README.md ├── README.txt ├── articles.csv ├── code │ ├── extract_dataset_from_sqlite.py │ ├── generate_lowshot_sets.py │ └── split_articles_into_train_test.py ├── database_schema.md ├── requirements.txt ├── test.csv └── train.csv ├── README.md ├── bert-base-german-cased └── bert_config.json ├── bert_classify.py ├── bert_finetuner_ner.py ├── bert_finetuner_splitset.py ├── bert_rasa_classify.py ├── create_train_test_split.py ├── create_xlsx_dataset_from_rasa_nlu.py ├── data └── articles.csv ├── huggingface_finetune.py ├── logging.conf ├── rasa_bot_generator.py ├── requirements.txt └── test_bot ├── __init__.py ├── actions.py ├── config.yml ├── credentials.yml ├── domain.yml └── endpoints.yml /.gitignore: -------------------------------------------------------------------------------- 1 | finetuning/* 2 | data/* 3 | !data/articles.csv 4 | .idea 5 | test_bot/data 6 | test_bot/test_data 7 | test_bot/evaluation 8 | test_bot/models 9 | 10kGNAD-master/corpus.sqlite3 10 | 10kGNAD-master/million_post_corpus.tar.bz2 11 | __pycache__/ -------------------------------------------------------------------------------- /10kGNAD-master/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Timo Block 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /10kGNAD-master/LICENSE.txt: -------------------------------------------------------------------------------- 1 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. 2 | -------------------------------------------------------------------------------- /10kGNAD-master/README.md: -------------------------------------------------------------------------------- 1 | # Ten Thousand German News Articles Dataset 2 | 3 | For more information visit the detailed [project page](https://tblock.github.io/10kGNAD/). 4 | 5 | 1. Install the required python packages `pip install -r requirements.txt`. 6 | 2. 
Download the `corpus.sqlite3` file into the project root from [here (compressed)](https://github.com/OFAI/million-post-corpus/releases/download/v1.0.0/million_post_corpus.tar.bz2) or directly from [here](https://github.com/tblock/10kGNAD/releases/download/v1.0/corpus.sqlite3). 7 | 3. Run `python code/extract_dataset_from_sqlite.py corpus.sqlite3 articles.csv` to extract the articles. 8 | 4. Run `python code/split_articles_into_train_test.py` to split the dataset. 9 | 10 | ## License 11 | 12 | All code in this repository is licensed under a MIT License. 13 | 14 | The dataset is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/). 15 | -------------------------------------------------------------------------------- /10kGNAD-master/README.txt: -------------------------------------------------------------------------------- 1 | Million Post Corpus 2 | =================== 3 | 4 | Please see the corpus website at 5 | 6 | https://ofai.github.io/million-post-corpus/ 7 | 8 | If you use this data set for your research, please cite our paper (more details 9 | on the above website): 10 | 11 | Dietmar Schabus, Marcin Skowron, Martin Trapp 12 | One Million Posts: A Data Set of German Online Discussions 13 | Proceedings of the 40th International ACM SIGIR Conference on Research and 14 | Development in Information Retrieval (SIGIR) 15 | Tokyo, Japan, August 2017 16 | -------------------------------------------------------------------------------- /10kGNAD-master/code/extract_dataset_from_sqlite.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | extract_dataset_from_sqlite.py.py 6 | 7 | This exports the articles from the _One Million Posts Corpus_ and generates a 8 | CSV file containing a label and the text for each article. 9 | """ 10 | 11 | import re 12 | import sys 13 | import csv 14 | import sqlite3 15 | 16 | from tqdm import tqdm 17 | from bs4 import BeautifulSoup 18 | from argparse import ArgumentParser 19 | 20 | 21 | ARTICLE_QUERY = "SELECT Path, Body FROM Articles WHERE PATH LIKE 'Newsroom/%' AND PATH NOT LIKE 'Newsroom/User%' ORDER BY Path" 22 | 23 | 24 | if __name__ == '__main__': 25 | parser = ArgumentParser() 26 | parser.add_argument(dest="sqlite_file", action="store", help="sqlite input filename", metavar="") 27 | parser.add_argument(dest="csv_file", action="store", help="csv output filebame", metavar="") 28 | args = parser.parse_args() 29 | 30 | 31 | conn = sqlite3.connect(args.sqlite_file) 32 | cursor = conn.cursor() 33 | 34 | with open(args.csv_file, "w", encoding='utf-8') as csvfile: 35 | writer = csv.writer(csvfile, delimiter=';',quotechar='\'', quoting=csv.QUOTE_MINIMAL) 36 | 37 | for row in tqdm(cursor.execute(ARTICLE_QUERY).fetchall(), unit_scale=True): 38 | path = row[0] 39 | body = row[1] 40 | text = "" 41 | description = "" 42 | 43 | soup = BeautifulSoup(body, 'html.parser') 44 | 45 | # get description from subheadline 46 | description_obj = soup.find('h2',{'itemprop':'description'}) 47 | if description_obj is not None: 48 | description = description_obj.text 49 | description = description.replace("\n"," ").replace("\t"," ").strip() + ". 
" 50 | 51 | # get text from paragraphs 52 | text_container = soup.find('div',{'class':'copytext'}) 53 | if text_container is not None: 54 | for p in text_container.findAll('p'): 55 | text += p.text.replace("\n"," ").replace("\t"," ").replace("\"","").replace("'","") + " " 56 | text = text.strip() 57 | 58 | # remove article autors 59 | for author in re.findall(r"\.\ \(.+,.+2[0-9]+\)", text[-50:]): # some articles have a year of 21015.. 60 | text = text.replace(author, ".") 61 | 62 | # get category from path 63 | category = path.split("/")[1] 64 | sample = [category, description + text] 65 | 66 | # filter empty samples, then write to csv 67 | if sample[1] != "": 68 | writer.writerow(sample) 69 | 70 | conn.close() 71 | 72 | -------------------------------------------------------------------------------- /10kGNAD-master/code/generate_lowshot_sets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import csv 6 | import random as rnd 7 | 8 | from argparse import ArgumentParser 9 | 10 | """ ompc_generate_lowshot_sets.py 11 | Generates low shot sets from the full train set 12 | """ 13 | 14 | SIZES = [.01,.02,.05,.075,.1,.2,.5,.75,1.] 15 | ITERATIONS = 10 16 | 17 | full_dataset = [] 18 | 19 | def stratifyed_shuffled_subset(data, size): 20 | """ returns a stratifyed, but shuffled subset of lenght *size* """ 21 | dataset_dict = {} 22 | subset = [] 23 | 24 | rnd.shuffle(data) 25 | 26 | for entry in data: 27 | label = entry[0] 28 | text = entry[1] 29 | if label not in dataset_dict: # if the label is not yet in the dict 30 | dataset_dict[label] = [] 31 | f = dataset_dict[label] 32 | f.append(text) 33 | 34 | for key in dataset_dict: 35 | key_set = dataset_dict[key] 36 | for entry in key_set[: int(len(key_set) * size)]: 37 | subset.append([key, entry]) 38 | 39 | rnd.shuffle(subset) 40 | return subset 41 | 42 | def write_dataset(data, size, iteration): 43 | with open("lowshot/lowshot_" + str(size) +"_" + str(iteration) + ".csv", "w") as file_write: 44 | writer = csv.writer(file_write, delimiter=';', quotechar='\'', quoting=csv.QUOTE_MINIMAL) 45 | for entry in data: 46 | writer.writerow(entry) 47 | 48 | def write_dataset_fasttext(data, size, iteration): 49 | with open("lowshot/lowshot_" + str(size) +"_" + str(iteration) + ".csv", "w") as file_write: 50 | writer = csv.writer(file_write, delimiter='\t', quotechar='\'', quoting=csv.QUOTE_MINIMAL) 51 | for entry in data: 52 | label = "__label__" + entry[0] 53 | writer.writerow([label, entry[1]]) 54 | 55 | 56 | def createFolder(directory): 57 | try: 58 | if not os.path.exists(directory): 59 | os.makedirs(directory) 60 | except OSError: 61 | print ('Error: Creating directory. 
' + directory) 62 | 63 | 64 | if __name__ == "__main__": 65 | parser = ArgumentParser() 66 | parser.add_argument('-fastText', action='store_true') 67 | args = parser.parse_args() 68 | 69 | print("writing files in fastText format:", args.fastText) 70 | 71 | createFolder('./lowshot/') 72 | 73 | with open("train.csv") as full_file: 74 | reader = csv.reader(full_file, delimiter=';', quotechar='\'') 75 | for row in reader: 76 | full_dataset.append(row) 77 | 78 | # sets a seed for reproducability 79 | rnd.seed(42) 80 | 81 | size_counter = 0 82 | for size in SIZES: 83 | for iteration in range(ITERATIONS): 84 | print("Generating iteration %.0f of size %.3f" % (iteration, size)) 85 | subset = stratifyed_shuffled_subset(full_dataset, size) 86 | 87 | if not args.fastText: 88 | write_dataset(subset, str(size_counter) + "_" + str(size), iteration) 89 | if args.fastText: 90 | write_dataset_fasttext(subset, str(size_counter) + "_" + str(size), iteration) 91 | 92 | size_counter+=1 93 | -------------------------------------------------------------------------------- /10kGNAD-master/code/split_articles_into_train_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import csv 5 | import collections 6 | 7 | from argparse import ArgumentParser 8 | from sklearn.model_selection import train_test_split 9 | 10 | """ split_articles_into_train_test.py: 11 | 12 | processes the dataset and splits the dataset into a training- and testset 13 | """ 14 | 15 | SPLIT = .1 16 | 17 | 18 | def write_datasets(data, name, args): 19 | """ write a csv file in a normal and optinally in the fastText format """ 20 | 21 | with open(name + ".csv", "w", encoding='utf-8') as file_write: 22 | writer = csv.writer(file_write, delimiter=';', quotechar='\'', quoting=csv.QUOTE_MINIMAL) 23 | for row in data: 24 | writer.writerow(row) 25 | 26 | # optionally output files in the fastText format 27 | if args.fastText: 28 | with open("fastText_" + name + ".csv", "w", encoding='utf-8') as file_write: 29 | writer = csv.writer(file_write, delimiter='\t', quotechar='\'', quoting=csv.QUOTE_MINIMAL) 30 | for row in data: 31 | label = row[0] 32 | label = "__label__" + label 33 | writer.writerow([label, row[1]]) 34 | 35 | 36 | if __name__ == "__main__": 37 | 38 | parser = ArgumentParser() 39 | parser.add_argument('-fastText', action='store_true') 40 | args = parser.parse_args() 41 | 42 | labels = [] 43 | texts = [] 44 | 45 | # read full dataset file 46 | with open("articles.csv", "r", encoding='utf-8') as csvfile: 47 | reader = csv.reader(csvfile, delimiter=';', quotechar='\'') 48 | for row in reader: 49 | if len(row) == 2: 50 | labels.append(row[0]) 51 | texts.append(row[1]) 52 | print(len(texts)) 53 | 54 | # split dataset 55 | trn_texts, tst_texts, trn_labels, tst_labels = train_test_split(texts, labels, test_size=SPLIT, random_state=42, stratify=labels) 56 | 57 | # write train and test datasets 58 | train = [] 59 | test = [] 60 | 61 | for i in range(len(trn_labels)): 62 | train.append([trn_labels[i], trn_texts[i]]) 63 | 64 | for i in range(len(tst_labels)): 65 | test.append([tst_labels[i], tst_texts[i]]) 66 | 67 | write_datasets(train, "train", args) 68 | write_datasets(test, "test", args) 69 | 70 | 71 | -------------------------------------------------------------------------------- /10kGNAD-master/database_schema.md: -------------------------------------------------------------------------------- 1 | ```sql 2 | CREATE TABLE Articles ( 3 | ID_Article INTEGER 
PRIMARY KEY, -- 4 | Path TEXT NOT NULL, -- Topic path, e.g.: 'Newsroom/Sports/Motorsports/Formula 1' 5 | publishingDate TIMESTAMP NOT NULL, -- 6 | Title TEXT NOT NULL, -- 7 | Body TEXT -- Main article body, contains HTML markup 8 | ); 9 | 10 | CREATE TABLE Posts ( 11 | ID_Post INTEGER PRIMARY KEY, -- 12 | ID_Parent_Post INTEGER, -- if this post is a reply: parent post's ID, otherwise NULL 13 | ID_Article INTEGER NOT NULL, -- foreign key to 'Articles' table 14 | ID_User INTEGER NOT NULL, -- 15 | CreatedAt TIMESTAMP NOT NULL, -- 16 | Status TEXT, -- 'online' or 'deleted' (if deleted by moderator) 17 | Headline TEXT, -- Post headline (may be NULL if Body isn't) 18 | Body TEXT, -- Post main body (may be NULL if Headline isn't) 19 | PositiveVotes INTEGER NOT NULL, -- Number of positive votes by other users 20 | NegativeVotes INTEGER NOT NULL -- Number of negative votes by other users 21 | ); 22 | 23 | -- This table lists all users who work for the newspaper (e.g. moderators, editorial journalists) 24 | CREATE TABLE Newspaper_Staff ( 25 | ID_User INTEGER PRIMARY KEY -- matches with Posts.ID_User 26 | ); 27 | 28 | -- This table may contain multiple annotator opinions for a given (ID_Post, Category) pair 29 | CREATE TABLE Annotations ( 30 | ID_Post INTEGER NOT NULL, -- foreign key to 'Posts' table 31 | ID_Annotator INTEGER NOT NULL, -- 32 | Category TEXT NOT NULL, -- name of the category, e.g. 'SentimentNegative' 33 | Value INTEGER NOT NULL, -- 0 or 1, where 1 means the category does apply to the post 34 | PRIMARY KEY(ID_Post, ID_Annotator, Category) 35 | ); 36 | 37 | -- This table will contain only one consolidated judgment for a given (ID_Post, Category) pair, 38 | -- determined by a majority vote across all opinions in the 'Annotations' table 39 | CREATE TABLE Annotations_consolidated ( 40 | ID_Post INTEGER NOT NULL, -- 41 | Category TEXT NOT NULL, -- name of the category, e.g. 'SentimentNegative' 42 | Value INTEGER NOT NULL, -- 0 or 1, where 1 means the category does apply to the post 43 | PRIMARY KEY(ID_Post, Category) 44 | ); 45 | 46 | -- This table is meant for reproducible cross validation. For each category, all posts are split 47 | -- into ten folds in a stratified manner 48 | -- https://en.wikipedia.org/wiki/Cross-validation_%28statistics%29#k-fold_cross-validation 49 | CREATE TABLE CrossValSplit( 50 | ID_Post INTEGER NOT NULL, -- foreign key to 'Posts' table 51 | Category TEXT NOT NULL, -- name of the category, e.g. 'SentimentNegative' 52 | Fold INTEGER NOT NULL, -- from [1,10] 53 | PRIMARY KEY(ID_Post, Category, Fold) 54 | ); 55 | 56 | -- This table defines the default ordering of the categories 57 | CREATE TABLE Categories ( 58 | Name TEXT PRIMARY KEY, -- name of the category, e.g. 
'SentimentNegative'
    Ord      INTEGER                -- ordering index
);
```

--------------------------------------------------------------------------------
/10kGNAD-master/requirements.txt:
--------------------------------------------------------------------------------
scikit-learn>=0.20.2
beautifulsoup4>=4.6.3
tqdm>=4.26.0

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Finetune BERT Embeddings with spaCy and Rasa

**For whom this repository might be of interest:**

This repository describes the process of finetuning the *German pretrained BERT* model of [deepset.ai](https://deepset.ai/german-bert) on a domain-specific dataset, converting it into a [spaCy](https://spacy.io/) packaged model and loading it in [Rasa](https://rasa.com/) to evaluate its performance on domain-specific **Conversational AI** tasks like *intent detection* and *NER*. If there are any questions, feel free to ask.

This repository is meant for those who want to take a quick dive into the matter.

I am going to use the [10kGNAD](https://tblock.github.io/10kGNAD/) dataset for this task, but it should be easy to modify the files for your specific use case.

**Short-term Roadmap**:

- [x] Add [DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) support
- [ ] Add CUDA Installation Guide
- [x] Add [RoBERTa](https://arxiv.org/abs/1907.11692) support
- [x] Add NER support

___
## Updates

**Update 06.07.2020: Alternative ways of usage**

Hi everyone,

long time no see!

Though the content of this repository should still do the job even with the newest Rasa version, a lot has happened over the past months:

* [Spacy updated to version 2.3](https://spacy.io/usage/v2-3#_title)
* HuggingFace released version 3 of their [transformers](https://github.com/huggingface/transformers) library
* Rasa released version [1.10.5](https://rasa.com/docs/rasa/changelog/#id1) of their library

Since there is a lot of change going on around Spacy version 3, the currently used spacy-transformers library most likely won't get any more updates. I therefore strongly recommend using the transformers library to achieve the same finetuning results as with Spacy. In order to do so, simply use the script "huggingface_finetune.py" alongside a [HFTransformersNLP](https://rasa.com/docs/rasa/nlu/components/#hftransformersnlp) component the following way:

```
pipeline:
- name: HFTransformersNLP
  model_name: "bert"
  model_weights: "PATH_TO_YOUR_FINETUNED_MODEL_DIRECTORY"
  cache_dir: "PATH_TO_SOME_CACHE_FOLDER"
- name: LanguageModelFeaturizer
- name: DIETClassifier
  random_seed: 42
  intent_classification: True
  entity_recognition: False
  use_masked_language_model: True
  epochs: 80
  number_of_transformer_layers: 4
  transformer_size: 256
  drop_rate: 0.2
  weight_sparsity: 0.7
  batch_size: [64, 256]
  embedding_dimension: 50
  hidden_layer_sizes:
    text: [512, 128]
```

Please change the settings according to your situation, especially the hyperparameters for DIET. The given values have proven to perform well for German.
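As a rough sketch of how this can be wired together: `huggingface_finetune.py`, as shipped here, reads its train and test files from a `train_test_split/` folder and saves the finetuned weights and tokenizer to `model/`, which is the directory you would then reference in `model_weights`. The NLU data paths below are placeholders for your own bot:

```
# finetune the German BERT (expects the train_test_split/ files used in huggingface_finetune.py)
python huggingface_finetune.py

# point model_weights in config.yml to the resulting model/ directory, then train and evaluate with Rasa
rasa train nlu --config config.yml --nlu data/nlu.md --out models/
rasa test nlu --model models/ --nlu test_data/nlu.md
```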

Tested with:

* python = 3.6.8
* transformers = 3.0.0
* rasa = 1.10.5


**Update 24.03.2020: Changes to Rasa and Spacy**

I verified that everything is still working with:

* python = 3.6.8
* spacy = 2.2.4
* spacy-transformers = 0.5.1
* rasa = 1.8.2

Besides, Rasa added the [HFTransformersNLP](https://rasa.com/docs/rasa/nlu/components/#hftransformersnlp) pipeline element to its core, which enables the user to use every [pretrained model](https://huggingface.co/transformers/pretrained_models.html) accordingly. However, this currently doesn't replace the finetuning aspect, which still significantly boosts the model's performance. I am currently working on a finetuning CustomComponent for Rasa.

**Update 28.12.2019: DistilBERT**

I finally got the time to add a [DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) version that can be used for finetuning and as a spaCy model in Rasa.

In order to use it, you need to follow these steps:

1. Modify the files changed in [this PR](https://github.com/explosion/spacy-transformers/pull/120) in your local spacy-transformers installation
2. Modify the files changed in [this PR](https://github.com/explosion/spacy-transformers/pull/121) in your local spacy-transformers installation
3. Download [this DistilBERT model](https://huggingface.co/distilbert-base-german-cased) from HuggingFace into your repository
4. In the downloaded directory make sure to have the following files present: `config.json`, `pytorch_model.bin`, `vocab.txt`
5. Use the command `python examples/init_model.py --lang de --name distilbert-base-german-cased /path/to/model` from the `spacy-transformers` repo
6. You should see a new folder *distilbert-base-german-cased* with the spacy-initiated model files. Use:

```
python -m spacy package distilbert-base-german-cased/ /packaged_model
cd /packaged_model/de_distilbert_base_german_cased-0.0.1
python setup.py sdist
pip install dist/de_distilbert_base_german_cased-0.0.1.tar.gz
```

* (Optional) If you want to **finetune** de_distilbert_base_german_cased, change the `trf_textcat` architecture to `softmax_last_hidden`
* (Optional) If you want to **create an Excel-file** for finetuning out of an existing **`nlu.md` from Rasa**, you can use `create_xlsx_dataset_from_rasa_nlu.py` to create one.

It is worth mentioning that every model supported by the `transformers` library can be converted and used this way. If you want to do that, simply use the `init_model.py` of `spacy-transformers` like this:

```
python examples/init_model.py --lang xx --name TRANSFORMERS_MODEL_NAME /path/to/model
```
___

**Update 28.12.2019: NER finetuning**

I finally got the time to evaluate the NER support for training an already finetuned BERT/DistilBERT model on a *Named Entity Recognition* task.

In order to use this one, follow these steps:

1. Modify the files in [this PR](https://github.com/explosion/spacy-transformers/pull/95) in your current spacy-transformers installation
2. Modify the files changed in [this PR](https://github.com/explosion/spacy-transformers/pull/120) in your local spacy-transformers installation
3. Modify the files changed in [this PR](https://github.com/explosion/spacy-transformers/pull/121) in your local spacy-transformers installation
4. Use the added `bert_finetuner_ner.py` script from the spacy-transformers library on any pretrained BERT-architectured model (see the example invocation below)
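A possible invocation could look like the following; the model and output names are placeholders, and the `TRAIN_DATA` inside `bert_finetuner_ner.py` is only a toy example that you would replace with your own annotations:

```
python bert_finetuner_ner.py -m de_trf_bertbasecased_lg_gnad -o finetuning/ner_output -n 10
```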

After the finetuning process has finished, you can treat the resulting model as explained later in this guide by *packaging* it for usage in Rasa.

## Installation

### Requirements

Basically, all you need to do is execute:

```
pip install -r requirements.txt
```

The scripts are tested using the following libraries:

* python = 3.6.8
* spacy = 2.2.3
* spacy-transformers = 0.5.1
* rasa = 1.6.0
* transformers = 2.3.0

Please keep in mind that some of the dependencies are work in progress and there might be incompatibilities between them. However, at the time of writing this, the libraries can simply be installed by using `pip`.

I strongly suggest finetuning and testing BERT with GPU support, since finetuning on even a good CPU can take several hours per epoch.
___
### Getting started

#### Preparing the dataset

The *split* is done by the finetuning script. If you want a different setting, feel free to modify the script.

As suggested, we do a simple but stratified train-test split with 15% as the test subset and 85% as the training subset, which results in 8732 training samples and 1541 evaluation samples. As there are many possibilities left, this is only one possible approach. While converting the `articles.csv` into a pandas dataframe, there were some broken lines which are currently omitted.
___
#### Loading the pretrained BERT

The script assumes the pretrained BERT to be installed with:

```
python -m spacy download de_trf_bertbasecased_lg
```

For the sake of interest, I have added the ``bert_config.json`` from Deepset's awesome work in case someone wonders how ``de_trf_bertbasecased_lg`` was trained.
___
#### Finetune the pretrained BERT

You can start the finetuning process by using:

```
python bert_finetuner_splitset.py de_trf_bertbasecased_lg -o finetuning/output
```

Currently, I am using a ``softmax_pooler_output`` configuration for the ``trf_textcat`` component. I'd suggest ``softmax_last_hidden`` as the next approach. The other parameters were set based on several evaluations and might be modified for your specific use case.
___
#### Package the finetuned BERT with spaCy and install it

You can easily package your newly trained model by using:

```
python -m spacy package finetuning/output /packaged_model
cd /packaged_model/de_trf_bertbasecased_lg-1.0.0
python setup.py sdist
pip install dist/de_trf_bertbasecased_lg-1.0.0.tar.gz
```

I recommend **changing the model's name** to avoid unnecessary inconveniences, by editing the config file and modifying the ``name`` value of `/finetuning/output/meta.json` (see the sketch below).
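A minimal sketch of that renaming step; the new name is only an example, and spaCy prefixes it with the language code, so this one ends up as `de_trf_bertbasecased_lg_gnad`:

```
# rename the finetuned model before packaging it
import json
from pathlib import Path

meta_path = Path("finetuning/output/meta.json")
meta = json.loads(meta_path.read_text(encoding="utf-8"))
meta["name"] = "trf_bertbasecased_lg_gnad"  # example name, pick your own
meta_path.write_text(json.dumps(meta, indent=2, ensure_ascii=False), encoding="utf-8")
```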
___
#### Load the spaCy model as part of your Rasa pipeline (optional)

At the time of writing this, BERT outperforms most of the recent state-of-the-art approaches in NLP/NLU tasks, e.g. document classification. Since those techniques are used in several **conversational AI** tasks like **intent detection**, I thought it might be a good idea to evaluate its performance with **Rasa** - IMHO one of the best open source CAI engines currently available.

If someone is interested in building a chatbot with Rasa, it might be a good idea to read the [Getting started](https://rasa.com/docs/getting-started/) guide.

Assuming that someone is familiar with Rasa, here is one possible configuration proposal which loads the newly added finetuned BERT model as part of the training pipeline:

```
language: de
pipeline:
- name: SpacyNLP
  case_sensitive: 1
  model: de_trf_bertbasecased_lg_gnad
- name: SpacyTokenizer
- name: SpacyFeaturizer
- name: SklearnIntentClassifier
```

As you can see, I just specified the model's name, using the spaCy architecture with Rasa. This works even if ``python -m spacy validate`` does **not** show your model.

Assuming that you might want to test the performance with Rasa, you can use the ``test_bot`` directory which contains the skeleton of a Rasa bot to do so. Beforehand, use:

```
python rasa_bot_generator.py
cp test.md test_bot/test_data/
cp train.md test_bot/data/
cd test_bot
rasa train --data data/ -c config.yml -d domain.yml --out models/
rasa run -m models/ --enable-api
```

to create a valid ``stories.md`` and a valid ``domain.yml``. Please keep in mind that this is a minimal sample which I don't recommend using productively.

If the bot is loaded, you can use the endpoint:

```
http://localhost:5005/model/parse

POST
{
  "text": ""
}

```
___
#### Evaluate different pipelines

To keep things simple, there are two scripts which will do the work for you.

**bert_classify** evaluates the finetuned BERT by training a logistic regression and a simple SVM classifier.

```
python bert_classify.py
```

**bert_rasa_classify** loads the trained Rasa model and uses the pretrained BERT features to evaluate the model's performance on the test data. Keep in mind that Rasa *compresses* your model, so you have to unzip/untar it and also modify the path to the NLU model in the script (see the sketch at the end of this section).

```
python bert_rasa_classify.py
```

Please be aware that to evaluate the **generalization capabilities** of the model, it would be better to split the original dataset into three parts such that there is a dataset completely unknown to the model (i.e. a train/validation/test split).
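The untar step mentioned above could look like this; the model file name and target folder are placeholders, and inside the archive you should find the `nlu/` folder with the pickled components referenced in `bert_rasa_classify.py`:

```
cd test_bot/models
mkdir nlu_model
tar -xzf <YOUR_TRAINED_MODEL>.tar.gz -C nlu_model
# then adjust classifier_file and encoder_file in bert_rasa_classify.py accordingly
```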
___
#### Productive usage of a large BERT model

TBD
___

#### A note on NER (Named Entity Recognition)

As soon as I realized that I won't be able to use the finetuned BERT-spaCy model in Rasa for e.g. extracting entities like PERSON (in fact, duckling is currently not able to do that), I thought about how this would be done in general:

1. Use the SpacyFeaturizer and SpacyEntityExtractor, which would currently be the recommended way, but which is not yet possible without manual effort on the BERT side (as mentioned, I am working on that).
2. Finetune the pretrained BERT (which is afterwards converted into a spaCy-compatible model) on any NER dataset. This is absolutely possible and intended, and we can finetune BERT on both tasks alongside each other. If we do so, the model contains everything we need to derive entities from it, currently just not with spaCy directly. Instead, we could use a CustomBERTEntityExtractor which reuses the model the pipeline has already loaded and does the work that spaCy is currently not "able" to do.

3. Since 2 seems to be overhead, at least for the moment, why not do the following:
```
language: de
pipeline:
- name: SpacyNLP
  case_sensitive: 1
  model: de_trf_bertbasecased_lg_gnad
- name: SpacyTokenizer
- name: SpacyFeaturizer
- name: SklearnIntentClassifier
- name: SpacyNLP
  case_sensitive: 1
  model: de_core_news_md
- name: RegexFeaturizer
- name: CRFEntityExtractor
- name: DucklingHTTPExtractor
  dimensions: ['time', 'duration', 'email']
  locale: de_DE
  timezone: Europe/Berlin
  url: http://localhost:8001
- name: SpacyEntityExtractor
  dimensions: ['PER', 'LOC', 'CARDINAL']
- name: rasa_mod_regex.RegexEntityExtractor
- name: EntitySynonymMapper

```
This pipeline will then load and use the features of de_trf_bertbasecased_lg_gnad for the SklearnIntentClassifier, and the features of de_core_news_md for the SpacyEntityExtractor.

This is not a neat solution and it should only be used until there is a smarter way (1, 2), but it works.

It should be mentioned that you are of course able to train your own NER model with spaCy as well.


#### Troubleshooting


##### CUDA Out of Memory

As discussed in a [spacy-trf-issue](https://github.com/explosion/spacy-pytorch-transformers/issues/48) you may run into memory problems. I have tested the finetuning script on a *GTX 1080 with 8GB VRAM* and even with a batch size of 2 (which is absolutely *not* recommended), I got memory problems.

One way to deal with this is to use the sentencizer, which splits larger documents into sentences while keeping their original labels. Another way is to reduce the batch size by half, to 12. BERT models usually need bigger batches, but for the sake of functionality, I tried it.

Currently I am using a *T80 with 12 GB VRAM*, sentencizing and a lowered batch size, and that setup worked fine.


##### AttributeError: module 'thinc_gpu_ops' has no attribute 'mean_pool'

As discussed [here](https://github.com/explosion/spacy-pytorch-transformers/issues/27) you might run into the mentioned error. I was able to resolve it by manually cloning thinc-gpu-ops, running ``pip install -r requirements.txt`` (which actually installed cython) and then running ``pip install .`` (see the commands below).
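For reference, the manual fix roughly corresponds to the commands below (the repository location is an assumption, adjust it to wherever you obtain thinc-gpu-ops from):

```
git clone https://github.com/explosion/thinc_gpu_ops   # assumed upstream repository
cd thinc_gpu_ops
pip install -r requirements.txt   # this pulls in cython
pip install .
```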
347 | 348 | ___ 349 | 350 | 351 | 352 | 353 | A *thank you* goes to all of the **amazing open source workers** out there: 354 | 355 | * [Rasa](https://github.com/RasaHQ) 356 | * [spaCy](https://github.com/explosion/spaCy) 357 | * [Deepset](https://deepset.ai/german-bert) 358 | * [HuggingFace](https://github.com/huggingface/pytorch-transformers) 359 | * [MKaze](https://github.com/mkaze/) 360 | 361 | 362 | 363 | -------------------------------------------------------------------------------- /bert-base-german-cased/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30000 13 | } 14 | -------------------------------------------------------------------------------- /bert_classify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import json 5 | import logging.config 6 | import spacy 7 | import numpy as np 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.svm import SVC 10 | 11 | 12 | logging.config.fileConfig('logging.conf') 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def get_features(nlp, texts): 17 | """ 18 | 19 | :param nlp: 20 | :param texts: 21 | :return: 22 | """ 23 | features = [] 24 | for doc in nlp.pipe(texts, batch_size=32): 25 | features.append(doc.vector) 26 | return np.array(features) 27 | 28 | 29 | # Load fine-tuned model 30 | model_dir = 'de_pytt_bertbasecased_lg_gnad' 31 | logger.info('Loading fine-tuned model...') 32 | nlp = spacy.load(model_dir) 33 | 34 | # Load test data 35 | logger.info('Loading test data...') 36 | with open('data/test.json', 'r', encoding='utf-8') as handle: 37 | test_data = json.load(handle) 38 | 39 | with open('data/train.json', 'r', encoding='utf-8') as handle: 40 | train_data = json.load(handle) 41 | 42 | train_cats, train_texts = zip(*train_data) 43 | test_cats, test_texts = zip(*test_data) 44 | 45 | 46 | # Get the features of training and test data 47 | logger.info('Get features of training and test data...') 48 | train_feats = get_features(nlp, train_texts) 49 | test_feats = get_features(nlp, test_texts) 50 | 51 | 52 | # Train a logistic regression model 53 | logger.info('Train a Logistic Regression model...') 54 | clsr = LogisticRegression() 55 | clsr.fit(train_feats, train_cats) 56 | logger.info('Accuracy of Logistic Regression model on test data: {}'.format(clsr.score(test_feats, test_cats))) 57 | 58 | # Train a SVM model 59 | logger.info('Train a SVM model...') 60 | svc = SVC(C=1, gamma=0.1, kernel='linear') 61 | svc.fit(train_feats, train_cats) 62 | logger.info('Accuracy of SVM model on test data: {}'.format(svc.score(test_feats, test_cats))) 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /bert_finetuner_ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf8 3 | """Example of training spaCy's named entity recognizer, starting off with an 4 | existing model or a blank model. 
5 | 6 | For more details, see the documentation: 7 | * Training: https://spacy.io/usage/training 8 | * NER: https://spacy.io/usage/linguistic-features#named-entities 9 | 10 | Compatible with: spaCy v2.0.0+ 11 | Last tested with: v2.1.0 12 | """ 13 | from __future__ import unicode_literals, print_function 14 | 15 | import plac 16 | import random 17 | from pathlib import Path 18 | import spacy 19 | from spacy.util import minibatch, compounding 20 | 21 | 22 | # training data 23 | TRAIN_DATA = [ 24 | ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), 25 | ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), 26 | ] 27 | 28 | 29 | @plac.annotations( 30 | model=("Model name. Defaults to blank 'en' model.", "option", "m", str), 31 | output_dir=("Optional output directory", "option", "o", Path), 32 | n_iter=("Number of training iterations", "option", "n", int), 33 | ) 34 | def main(model=None, output_dir=None, n_iter=100): 35 | """Load the model, set up the pipeline and train the entity recognizer.""" 36 | nlp = spacy.load(model) # load existing spaCy model 37 | print("Loaded model '%s'" % model) 38 | 39 | # create the built-in pipeline components and add them to the pipeline 40 | # nlp.create_pipe works for built-ins that are registered with spaCy 41 | ner = nlp.create_pipe("trf_ner") 42 | nlp.add_pipe(ner, last=True) 43 | 44 | # add labels 45 | for _, annotations in TRAIN_DATA: 46 | for ent in annotations.get("entities"): 47 | ner.add_label(ent[2]) 48 | 49 | # reset and initialize the weights randomly – but only if we're 50 | # training a new model 51 | nlp.resume_training() 52 | for itn in range(n_iter): 53 | random.shuffle(TRAIN_DATA) 54 | losses = {} 55 | # batch up the examples using spaCy's minibatch 56 | batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) 57 | for batch in batches: 58 | texts, annotations = zip(*batch) 59 | nlp.update( 60 | texts, # batch of texts 61 | annotations, # batch of annotations 62 | drop=0.5, # dropout - make it harder to memorise data 63 | losses=losses, 64 | ) 65 | print("Losses", losses) 66 | 67 | # test the trained model 68 | for text, _ in TRAIN_DATA: 69 | doc = nlp(text) 70 | print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) 71 | print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) 72 | 73 | # save model to output directory 74 | if output_dir is not None: 75 | output_dir = Path(output_dir) 76 | if not output_dir.exists(): 77 | output_dir.mkdir() 78 | nlp.to_disk(output_dir) 79 | print("Saved model to", output_dir) 80 | 81 | # test the saved model 82 | print("Loading from", output_dir) 83 | nlp2 = spacy.load(output_dir) 84 | for text, _ in TRAIN_DATA: 85 | doc = nlp2(text) 86 | print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) 87 | print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) 88 | 89 | 90 | if __name__ == "__main__": 91 | plac.call(main) 92 | 93 | # Expected output: 94 | # Entities [('Shaka Khan', 'PERSON')] 95 | # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3), 96 | # ('Khan', 'PERSON', 1), ('?', '', 2)] 97 | # Entities [('London', 'LOC'), ('Berlin', 'LOC')] 98 | # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), 99 | # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)] 100 | -------------------------------------------------------------------------------- /bert_finetuner_splitset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from collections import 
Counter 3 | 4 | import plac 5 | import re 6 | import random 7 | import json 8 | import spacy 9 | import torch 10 | import tqdm 11 | import unicodedata 12 | import logging.config 13 | import numpy as np 14 | import pandas as pd 15 | from pathlib import Path 16 | 17 | import wasabi 18 | from sklearn.model_selection import train_test_split 19 | from spacy.util import minibatch 20 | from spacy_transformers.util import cyclic_triangular_rate 21 | 22 | logging.config.fileConfig('logging.conf') 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | @plac.annotations( 27 | model=("Model name", "positional", None, str), 28 | output_dir=("Optional output directory", "option", "o", Path), 29 | batch_size=("Number of docs per batch", "option", "bs", int), 30 | learn_rate=("Learning rate", "option", "lr", float), 31 | n_iter=("Number of training epochs", "option", "n", int), 32 | sentence_split=("If the script should split docs into their sentences", "option", "s", bool), 33 | ) 34 | def main( 35 | model, 36 | output_dir=None, 37 | n_iter=4, 38 | batch_size=24, 39 | learn_rate=2e-5, 40 | sentence_split=False 41 | ): 42 | """ 43 | 44 | :param model: 45 | :param output_dir: 46 | :param n_iter: 47 | :param batch_size: 48 | :param learn_rate: 49 | :return: 50 | """ 51 | 52 | max_wpb = 1000 53 | 54 | spacy.util.fix_random_seed(0) 55 | is_using_gpu = spacy.prefer_gpu() 56 | if is_using_gpu: 57 | torch.set_default_tensor_type("torch.cuda.FloatTensor") 58 | if output_dir is not None: 59 | output_dir = Path(output_dir) 60 | if not output_dir.exists(): 61 | output_dir.mkdir() 62 | 63 | # Creating the output directory if it's not already present 64 | if output_dir is not None: 65 | output_dir = Path(output_dir) 66 | if not output_dir.exists(): 67 | output_dir.mkdir() 68 | 69 | # Load the pretrained BERT with spacy and check the pipeline names 70 | nlp = spacy.load(model) 71 | logger.info('Loaded model: {}'.format(model)) 72 | logger.info('Loaded models pipeline names: {}'.format(nlp.pipe_names)) 73 | 74 | # Using a softmax pooler output as first approach. 
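# Note: the README suggests "softmax_last_hidden" as an alternative trf_textcat
# architecture; changing the "architecture" value below is enough to try it.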
75 | textcat = nlp.create_pipe( 76 | "trf_textcat", config={"architecture": "softmax_pooler_output", "words_per_batch": max_wpb} 77 | ) 78 | 79 | # Loading the 10kGNAD dataset with pandas, representing its labels as a list 80 | logger.info("Loading domain specific data...") 81 | df_train_set = pd.read_csv('data/articles.csv', delimiter=';', error_bad_lines=False, names=['label', 'article']) 82 | train_label_list = df_train_set['label'].unique().tolist() 83 | 84 | logger.info( 85 | f"Using {len(df_train_set)} training docs overall.") 86 | 87 | # Do a stratified train test split and persist the result for later usage 88 | train_dataframe, eval_dataframe_first = train_test_split(df_train_set, test_size=0.4, stratify=df_train_set['label']) 89 | eval_dataframe, dev_dataframe = train_test_split(eval_dataframe_first, test_size=0.5, stratify=eval_dataframe_first['label']) 90 | 91 | train_data = list(train_dataframe.itertuples(index=False, name=None)) 92 | test_data = list(eval_dataframe.itertuples(index=False, name=None)) 93 | dev_data = list(dev_dataframe.itertuples(index=False, name=None)) 94 | 95 | logger.info(f"Using {len(train_data)} training docs, {len(test_data)} evaluation) and {len(dev_data)} for development.") 96 | 97 | # Some of the evaluation scripts are loading JSON so we are going to provide the split as JSON aswell 98 | with open('data/train.json', 'w') as handle: 99 | json.dump(train_data, handle) 100 | 101 | with open('data/test.json', 'w') as handle: 102 | json.dump(test_data, handle) 103 | 104 | with open('data/dev.json', 'w') as handle: 105 | json.dump(dev_data, handle) 106 | 107 | # Since rasa usually reads from markdown files, we are going ti provide the split as MD aswell 108 | create_rasa_training_set(train_dataframe) 109 | create_rasa_test_set(eval_dataframe) 110 | create_rasa_dev_set(dev_dataframe) 111 | 112 | # For later usage, we persist the labels separate from the rest aswell 113 | with open('data/labels.json', 'w', encoding='utf-8') as file: 114 | json.dump(train_label_list, file) 115 | 116 | # Add all the labels to the finetuner 117 | for label in train_label_list: 118 | textcat.add_label(str(label)) 119 | 120 | # Proper represent the labels 121 | (train_texts, train_cats), (eval_texts, eval_cats) = load_data( 122 | train_dataframe=train_dataframe, eval_dataframe=eval_dataframe, label_list=train_label_list 123 | ) 124 | 125 | # Configuring the pipeline for the finetuning process 126 | nlp.add_pipe(textcat, last=True) 127 | 128 | # It might be a good idea to split sentences of an article into separate training samples 129 | # For the moment, we are skipping that step to keep things simple. 130 | 131 | if sentence_split: 132 | logger.info(f"Sentencicing ...") 133 | train_texts, train_cats = make_sentence_examples(nlp, train_texts, train_cats) 134 | logger.info(f"Extracted {len(train_texts)} training sentences.") 135 | 136 | total_words = sum(len(text.split()) for text in train_texts) 137 | train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats])) 138 | 139 | # Initialize the TextCategorizer, and create an optimizer. 
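# The schedule below cycles the transformer learning rate between learn_rate / 3 and
# learn_rate * 3, evaluates every `eval_every` steps and stops once no checkpoint has
# improved for `patience` evaluations.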
140 | optimizer = nlp.resume_training() 141 | optimizer.alpha = 0.001 142 | optimizer.trf_weight_decay = 0.005 143 | optimizer.L2 = 0.0 144 | learn_rates = cyclic_triangular_rate( 145 | learn_rate / 3, learn_rate * 3, 2 * len(train_data) // batch_size 146 | ) 147 | 148 | pbar = tqdm.tqdm(total=100, leave=False) 149 | results = [] 150 | epoch = 0 151 | step = 0 152 | eval_every = 100 153 | patience = 3 154 | while True: 155 | # Train and evaluate 156 | losses = Counter() 157 | random.shuffle(train_data) 158 | batches = minibatch(train_data, size=batch_size) 159 | for batch in batches: 160 | optimizer.trf_lr = next(learn_rates) 161 | texts, annotations = zip(*batch) 162 | nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses) 163 | pbar.update(1) 164 | if step and (step % eval_every) == 0: 165 | pbar.close() 166 | with nlp.use_params(optimizer.averages): 167 | scores = evaluate_multiclass(nlp, eval_texts, eval_cats) 168 | results.append((scores["textcat_acc"], step, epoch)) 169 | print( 170 | "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( 171 | losses["trf_textcat"], 172 | scores["textcat_acc"], 173 | scores["textcat_cor"], 174 | scores["textcat_wrg"], 175 | ) 176 | ) 177 | pbar = tqdm.tqdm(total=eval_every, leave=False) 178 | step += 1 179 | epoch += 1 180 | # Stop if no improvement in HP.patience checkpoints 181 | if results: 182 | best_score, best_step, best_epoch = max(results) 183 | if ((step - best_step) // eval_every) >= patience: 184 | break 185 | 186 | msg = wasabi.Printer() 187 | table_widths = [2, 4, 6] 188 | msg.info(f"Best scoring checkpoints") 189 | msg.row(["Epoch", "Step", "Score"], widths=table_widths) 190 | msg.row(["-" * width for width in table_widths]) 191 | for score, step, epoch in sorted(results, reverse=True)[:10]: 192 | msg.row([epoch, step, "%.2f" % (score * 100)], widths=table_widths) 193 | 194 | if output_dir is not None: 195 | nlp.to_disk(output_dir) 196 | logger.info("Saved model to {}".format(output_dir)) 197 | 198 | 199 | def make_sentence_examples(nlp, texts, labels): 200 | """ 201 | 202 | :param nlp: 203 | :param texts: 204 | :param labels: 205 | :return: 206 | """ 207 | sents = [] 208 | sent_cats = [] 209 | for text, cats in zip(texts, labels): 210 | doc = nlp.make_doc(text) 211 | doc = nlp.get_pipe("sentencizer")(doc) 212 | for sent in doc.sents: 213 | sents.append(sent.text) 214 | sent_cats.append(cats) 215 | return sents, sent_cats 216 | 217 | def preprocess_text(text): 218 | """ 219 | 220 | :param text: 221 | :return: 222 | """ 223 | 224 | white_re = re.compile(r"\s\s+") 225 | text = text.replace("", "") 226 | text = text.replace("", "") 227 | text = white_re.sub(" ", text).strip() 228 | return "".join( 229 | c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn" 230 | ) 231 | 232 | 233 | def load_data(train_dataframe, eval_dataframe, label_list): 234 | """ 235 | 236 | :param train_dataframe: 237 | :param eval_dataframe: 238 | :param label_list: 239 | :return: 240 | """ 241 | 242 | train_data = list(train_dataframe.itertuples(index=False, name=None)) 243 | dev_data = list(eval_dataframe.itertuples(index=False, name=None)) 244 | train_labels, train_texts = _prepare_partition(train_data, preprocess=False, label_list=label_list) 245 | dev_labels, dev_texts = _prepare_partition(dev_data, preprocess=False, label_list=label_list) 246 | return (train_labels, train_texts), (dev_labels, dev_texts) 247 | 248 | 249 | def _prepare_partition(text_label_tuples, *, preprocess=False, label_list): 250 | """ 251 | 252 | :param 
text_label_tuples: 253 | :param preprocess: 254 | :param label_list: 255 | :return: 256 | """ 257 | labels, texts = zip(*text_label_tuples) 258 | if preprocess: 259 | texts = [preprocess_text(text) for text in texts] 260 | cats = [{str(i): 1.0 if i == y else 0.0 for i in label_list} for y in labels] 261 | 262 | return texts, cats 263 | 264 | 265 | def find_max_prob_cat(cats_probs): 266 | """ 267 | 268 | :param cats_probs: 269 | :return: 270 | """ 271 | cats, probs = zip(*cats_probs.items()) 272 | idx = np.argmax(probs) 273 | return cats[idx] 274 | 275 | 276 | def evaluate_multiclass(nlp, texts, cats): 277 | """ 278 | 279 | :param nlp: 280 | :param texts: 281 | :param cats: 282 | :return: 283 | """ 284 | correct = 0 285 | total_words = sum(len(text.split()) for text in texts) 286 | with tqdm.tqdm(total=total_words, leave=False) as pbar: 287 | for i, doc in enumerate(nlp.pipe(texts, batch_size=8)): 288 | true_label = find_max_prob_cat(cats[i]) 289 | pred_label = find_max_prob_cat(doc.cats) 290 | if true_label == pred_label: 291 | correct += 1 292 | pbar.update(len(doc.text.split())) 293 | return {'textcat_acc': float(correct) / len(texts), 294 | 'textcat_cor': correct, 295 | 'textcat_wrg': len(texts) - correct} 296 | 297 | 298 | def create_rasa_training_set(df_train_set): 299 | """ 300 | 301 | :param df_train_set: 302 | :return: 303 | """ 304 | label_samples = {} 305 | with open('data/train.md', 'w', encoding='utf-8') as file: 306 | for index, entry in df_train_set.iterrows(): 307 | if entry['label'] not in label_samples: 308 | label_samples[entry['label']] = [] 309 | label_samples[entry['label']].append(entry['article']) 310 | else: 311 | label_samples[entry['label']].append(entry['article']) 312 | for label, articles in label_samples.items(): 313 | file.write('## intent:' + label + '\n') 314 | for article in articles: 315 | file.write('- ' + article + '\n') 316 | 317 | 318 | def create_rasa_dev_set(df_dev_set): 319 | """ 320 | 321 | :param df_dev_set: 322 | :return: 323 | """ 324 | label_samples = {} 325 | with open('data/dev.md', 'w', encoding='utf-8') as file: 326 | for index, entry in df_dev_set.iterrows(): 327 | if entry['label'] not in label_samples: 328 | label_samples[entry['label']] = [] 329 | label_samples[entry['label']].append(entry['article']) 330 | else: 331 | label_samples[entry['label']].append(entry['article']) 332 | for label, articles in label_samples.items(): 333 | file.write('## intent:' + label + '\n') 334 | for article in articles: 335 | file.write('- ' + article + '\n') 336 | 337 | 338 | def create_rasa_test_set(df_test_set): 339 | """ 340 | 341 | :param df_test_set: 342 | :return: 343 | """ 344 | label_samples = {} 345 | with open('data/test.md', 'w', encoding='utf-8') as file: 346 | for index, entry in df_test_set.iterrows(): 347 | if entry['label'] not in label_samples: 348 | label_samples[entry['label']] = [] 349 | label_samples[entry['label']].append(entry['article']) 350 | else: 351 | label_samples[entry['label']].append(entry['article']) 352 | for label, articles in label_samples.items(): 353 | file.write('## intent:' + label + '\n') 354 | for article in articles: 355 | file.write('- ' + article + '\n') 356 | 357 | 358 | if __name__ == "__main__": 359 | plac.call(main) 360 | -------------------------------------------------------------------------------- /bert_rasa_classify.py: -------------------------------------------------------------------------------- 1 | import json 2 | import spacy 3 | import numpy as np 4 | import logging.config 5 | from 
rasa.nlu.utils import json_unpickle 6 | from sklearn.preprocessing import LabelEncoder 7 | from sklearn import metrics 8 | 9 | logging.config.fileConfig('logging.conf') 10 | logger = logging.getLogger(__name__) 11 | 12 | # Set the paths 13 | classifier_file = '/nlu/component_3_SklearnIntentClassifier_classifier.pkl' 14 | encoder_file = '/nlu/component_3_SklearnIntentClassifier_encoder.pkl' 15 | 16 | logger.info('Load Rasa classifier model') 17 | classifier = json_unpickle(classifier_file) 18 | classes = json_unpickle(encoder_file) 19 | encoder = LabelEncoder() 20 | encoder.classes_ = classes 21 | 22 | 23 | def get_features(nlp, texts): 24 | """ 25 | 26 | :param nlp: 27 | :param texts: 28 | :return: 29 | """ 30 | features = [] 31 | for doc in nlp.pipe(texts, batch_size=32): 32 | features.append(doc.vector) 33 | return np.array(features) 34 | 35 | # Load fine-tuned model 36 | model_dir = 'de_pytt_bertbasecased_lg_gnad' 37 | logger.info('Loading fine-tuned model...') 38 | nlp = spacy.load(model_dir) 39 | 40 | # Load test data 41 | logger.info('Loading test data...') 42 | with open('data/test.json', 'r', encoding='utf-8') as handle: 43 | test_data = json.load(handle) 44 | test_cats, test_texts = zip(*test_data) 45 | 46 | # Get the features of test data 47 | logger.info('Get features of test data...') 48 | test_feats = get_features(nlp, test_texts) 49 | 50 | # Encode labels 51 | test_labels = encoder.transform(test_cats) 52 | 53 | preds = classifier.predict(test_feats) 54 | logger.info('Micro: {}'.format(metrics.precision_score(test_labels, preds, average='micro'))) 55 | logger.info('Macro: {}'.format(metrics.precision_score(test_labels, preds, average='macro'))) 56 | logger.info('Weighted: {}'.format(metrics.precision_score(test_labels, preds, average='weighted'))) 57 | logger.info('Accuracy: {}'.format(classifier.score(test_feats, test_labels))) 58 | -------------------------------------------------------------------------------- /create_train_test_split.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | 6 | 7 | def create_rasa_training_set(df_train_set): 8 | label_samples = {} 9 | with open('train.md', 'w', encoding='utf-8') as file: 10 | for index, entry in df_train_set.iterrows(): 11 | if entry['label'] not in label_samples: 12 | label_samples[entry['label']] = [] 13 | label_samples[entry['label']].append(entry['article']) 14 | else: 15 | label_samples[entry['label']].append(entry['article']) 16 | for label, articles in label_samples.items(): 17 | file.write('## intent:' + label + '\n') 18 | for article in articles: 19 | file.write('- ' + article + '\n') 20 | 21 | 22 | def create_rasa_test_set(df_test_set): 23 | label_samples = {} 24 | with open('test.md', 'w', encoding='utf-8') as file: 25 | for index, entry in df_test_set.iterrows(): 26 | if entry['label'] not in label_samples: 27 | label_samples[entry['label']] = [] 28 | label_samples[entry['label']].append(entry['article']) 29 | else: 30 | label_samples[entry['label']].append(entry['article']) 31 | for label, articles in label_samples.items(): 32 | file.write('## intent:' + label + '\n') 33 | for article in articles: 34 | file.write('- ' + article + '\n') 35 | 36 | 37 | df_train_set = pd.read_excel('mennekes_full_v2.xlsx') 38 | train_label_list = df_train_set['label'].unique().tolist() 39 | # do a stratified train test split and persist the result 40 | train_dataframe, eval_dataframe = 
train_test_split(df_train_set, test_size=0.10, stratify=df_train_set['label']) 41 | train_data = list(train_dataframe.itertuples(index=False, name=None)) 42 | test_data = list(eval_dataframe.itertuples(index=False, name=None)) 43 | 44 | create_rasa_training_set(train_dataframe) 45 | create_rasa_test_set(eval_dataframe) 46 | 47 | with open('train.json', 'w') as handle: 48 | json.dump(train_data, handle) 49 | 50 | with open('test.json', 'w') as handle: 51 | json.dump(test_data, handle) 52 | -------------------------------------------------------------------------------- /create_xlsx_dataset_from_rasa_nlu.py: -------------------------------------------------------------------------------- 1 | from rasa.nlu.training_data.formats import MarkdownReader 2 | import xlsxwriter 3 | 4 | workbook = xlsxwriter.Workbook('filename.xlsx') 5 | worksheet = workbook.add_worksheet() 6 | worksheet.write('A1', 'question') 7 | worksheet.write('B1', 'label') 8 | worksheet.write('C1', 'answer') 9 | 10 | training_data = () 11 | row = 1 12 | col = 0 13 | 14 | doc = "PATH\\TO\\nlu.md" 15 | 16 | reader = MarkdownReader() 17 | reader.read(doc, language='de', fformat='MARKDOWN') 18 | for message in reader.training_examples: 19 | training_data = training_data + ([message.text, message.get('intent')],) 20 | 21 | for question, label in (training_data): 22 | worksheet.write_string(row, col, question) 23 | worksheet.write_string(row, col + 1, label) 24 | worksheet.write_string(row, col + 2, '') 25 | row += 1 26 | 27 | workbook.close() 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /huggingface_finetune.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import tensorflow as tf 5 | from transformers import BertConfig, BertTokenizer, TFBertForSequenceClassification, TFBertModel 6 | from sklearn.preprocessing import LabelEncoder 7 | from rasa.nlu.training_data.formats import MarkdownReader 8 | 9 | # disable GPU 10 | #os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 11 | 12 | 13 | def read_inputs_json(input_path): 14 | texts = [] 15 | cats = [] 16 | with open(input_path, mode="r") as file_: 17 | data = json.load(file_)['rasa_nlu_data']['common_examples'] 18 | for d in data: 19 | texts.append(d['text']) 20 | cats.append(d['intent']) 21 | 22 | return texts, cats 23 | 24 | 25 | def read_inputs_md(input_path): 26 | reader = MarkdownReader() 27 | reader.read(input_path, language='de', fformat='MARKDOWN') 28 | texts = [] 29 | cats = [] 30 | for message in reader.training_examples: 31 | texts.append(message.text) 32 | cats.append(message.get('intent')) 33 | 34 | return texts, cats 35 | 36 | train_texts, train_labels = read_inputs_md( 37 | os.path.join('train_test_split', 'training_data.md') 38 | ) 39 | test_texts, test_labels = read_inputs_json( 40 | os.path.join('train_test_split', 'test_data.md') 41 | ) 42 | 43 | le = LabelEncoder().fit(train_labels) 44 | train_labels = le.transform(train_labels) 45 | test_labels = le.transform(test_labels) 46 | 47 | 48 | config = BertConfig.from_pretrained("dbmdz/bert-base-german-cased", 49 | num_labels=len(le.classes_)) 50 | tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased") 51 | model = TFBertForSequenceClassification.from_pretrained( 52 | "dbmdz/bert-base-german-cased", 53 | config=config, 54 | trainable=True, 55 | ) 56 | 57 | def encode(texts): 58 | input_ids = [] 59 | attention_mask = [] 60 | for text in texts: 61 | tokens = 
/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,eva_core_api 3 | 4 | [handlers] 5 | keys=console 6 | 7 | [formatters] 8 | keys=simple 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=console 13 | 14 | [logger_eva_core_api] 15 | level=DEBUG 16 | handlers=console 17 | qualname=eva_core_api 18 | propagate=0 19 | 20 | [handler_console] 21 | class=StreamHandler 22 | level=DEBUG 23 | formatter=simple 24 | args=(sys.stdout,) 25 | 26 | [formatter_simple] 27 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s 28 | datefmt= 29 | -------------------------------------------------------------------------------- /rasa_bot_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def faq_domain_generator(): 4 | """ 5 | Generate test_bot/domain.yml with one intent and one utter action per label in labels.json. 6 | :return: None; the domain file is written to disk. 7 | """ 8 | with open('test_bot/domain.yml', 'w', encoding='utf-8') as domain_file: 9 | with open('labels.json', 'r', encoding='utf-8') as label_file: 10 | labels_as_intents = json.load(label_file) 11 | domain_file.write('intents: \n') 12 | for intent in labels_as_intents: 13 | domain_file.write(f' - {intent}\n') 14 | domain_file.write('actions: \n') 15 | for intent in labels_as_intents: 16 | domain_file.write(f' - utter_{intent}\n') 17 | 18 | def faq_story_generator(): 19 | """Generate test_bot/data/stories.md with one FAQ story per label in labels.json.""" 20 | 21 | with open('test_bot/data/stories.md', 'w', encoding='utf-8') as story_file: 22 | with open('labels.json', 'r', encoding='utf-8') as label_file: 23 | labels_as_intents = json.load(label_file) 24 | for intent in labels_as_intents: 25 | story_file.write(f'## faq_story_{intent}\n') 26 | story_file.write(f'* {intent}\n') 27 | story_file.write(f' - utter_{intent}\n') 28 | story_file.write(f' - action_restart\n') 29 | 30 | 31 | if __name__ == "__main__": 32 | faq_story_generator() 33 | faq_domain_generator() --------------------------------------------------------------------------------
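rasa_bot_generator.py above expects a labels.json file holding the list of intent labels, but none of the scripts in this repository persists one (create_train_test_split.py computes train_label_list and then never writes it). A minimal sketch that fills this gap, assuming the same Excel source and 'label' column as create_train_test_split.py:

    import json

    import pandas as pd

    # Persist the unique labels so rasa_bot_generator.py can build domain.yml and stories.md from them.
    df = pd.read_excel('mennekes_full_v2.xlsx')
    labels = sorted(df['label'].unique().tolist())

    with open('labels.json', 'w', encoding='utf-8') as handle:
        json.dump(labels, handle, ensure_ascii=False)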
/requirements.txt: -------------------------------------------------------------------------------- 1 | spacy 2 | rasa 3 | spacy-transformers 4 | pandas -------------------------------------------------------------------------------- /test_bot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JulianGerhard21/bert_spacy_rasa/0ee3739680c5015df65d963f85b3e4e9deded8a4/test_bot/__init__.py -------------------------------------------------------------------------------- /test_bot/actions.py: -------------------------------------------------------------------------------- 1 | # This file contains your custom actions which can be used to run 2 | # custom Python code. 3 | # 4 | # See this guide on how to implement these actions: 5 | # https://rasa.com/docs/rasa/core/actions/#custom-actions/ 6 | 7 | 8 | from typing import Any, Text, Dict, List 9 | 10 | from rasa_sdk import Action, Tracker 11 | from rasa_sdk.executor import CollectingDispatcher -------------------------------------------------------------------------------- /test_bot/config.yml: -------------------------------------------------------------------------------- 1 | language: de 2 | pipeline: 3 | - name: SpacyNLP 4 | case_sensitive: true 5 | model: de_pytt_bertbasecased_lg_gnad 6 | - name: SpacyTokenizer 7 | - name: SpacyFeaturizer 8 | - name: SklearnIntentClassifier 9 | policies: 10 | - name: KerasPolicy 11 | batch_size: 50 12 | epochs: 200 13 | max_training_samples: 300 14 | - name: MemoizationPolicy 15 | max_history: 5 16 | - name: FormPolicy 17 | - name: MappingPolicy 18 | -------------------------------------------------------------------------------- /test_bot/credentials.yml: -------------------------------------------------------------------------------- 1 | # This file contains the credentials for the voice & chat platforms 2 | # which your bot is using. 3 | # https://rasa.com/docs/rasa/user-guide/messaging-and-voice-channels/ 4 | 5 | rest: 6 | # # you don't need to provide anything here - this channel doesn't 7 | # # require any credentials 8 | 9 | 10 | #facebook: 11 | # verify: "" 12 | # secret: "" 13 | # page-access-token: "" 14 | 15 | #slack: 16 | # slack_token: "" 17 | # slack_channel: "" 18 | 19 | #socketio: 20 | # user_message_evt: 21 | # bot_message_evt: 22 | # session_persistence: 23 | 24 | rasa: 25 | url: "http://localhost:5002/api" 26 | -------------------------------------------------------------------------------- /test_bot/domain.yml: -------------------------------------------------------------------------------- 1 | intents: 2 | - Etat 3 | - Inland 4 | - International 5 | - Kultur 6 | - Panorama 7 | - Sport 8 | - Web 9 | - Wirtschaft 10 | - Wissenschaft 11 | actions: 12 | - utter_Etat 13 | - utter_Inland 14 | - utter_International 15 | - utter_Kultur 16 | - utter_Panorama 17 | - utter_Sport 18 | - utter_Web 19 | - utter_Wirtschaft 20 | - utter_Wissenschaft 21 | -------------------------------------------------------------------------------- /test_bot/endpoints.yml: -------------------------------------------------------------------------------- 1 | # This file contains the different endpoints your bot can use. 2 | 3 | # Server where the models are pulled from. 4 | # https://rasa.com/docs/rasa/user-guide/running-the-server/#fetching-models-from-a-server/ 5 | 6 | #models: 7 | # url: http://my-server.com/models/default_core@latest 8 | # wait_time_between_pulls: 10 # [optional](default: 100) 9 | 10 | # Server which runs your custom actions. 11 | # https://rasa.com/docs/rasa/core/actions/#custom-actions/ 12 | 13 | action_endpoint: 14 | url: "http://localhost:5055/webhook" 15 | 16 | # Tracker store which is used to store the conversations. 17 | # By default the conversations are stored in memory.
18 | # https://rasa.com/docs/rasa/api/tracker-stores/ 19 | 20 | #tracker_store: 21 | # type: redis 22 | # url: 23 | # port: 24 | # db: 25 | # password: 26 | 27 | #tracker_store: 28 | # type: mongod 29 | # url: 30 | # db: 31 | # username: 32 | # password: 33 | 34 | # Event broker which all conversation events should be streamed to. 35 | # https://rasa.com/docs/rasa/api/event-brokers/ 36 | 37 | #event_broker: 38 | # url: localhost 39 | # username: username 40 | # password: password 41 | # queue: queue 42 | --------------------------------------------------------------------------------
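Once an NLU model has been trained from the generated data (e.g. rasa train inside test_bot/) and the bot is served with rasa run, the REST channel enabled in credentials.yml can be exercised with a few lines of Python. The port 5005 below is Rasa's default and, like the example sentence, only an assumption:

    import requests

    # Send a test message to the bot via Rasa's REST channel.
    payload = {'sender': 'test_user', 'message': 'Der FC Bayern gewinnt erneut die Meisterschaft.'}
    response = requests.post('http://localhost:5005/webhooks/rest/webhook', json=payload)

    # Each element of the reply contains the recipient_id and the text the bot answered with.
    for bot_message in response.json():
        print(bot_message)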