├── .gitignore
├── LICENSE
├── README.md
├── database_entity_extractor.py
├── luis_entity_extractor.py
├── rasa_cer_config_sample_db.json
├── rasa_config_sample_db.yml
├── rasa_simple_config_sample.json
├── requirements.txt
└── simple_entity_extractor.py

/.gitignore:
--------------------------------------------------------------------------------
.venv
venv
__pycache__
.vscode
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 ESC Deutschland GmbH

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Custom Entity Extraction for RASA
This repository contains some custom entity extractors for RASA. Details for each extractor follow below.

## Simple Entity Extractor
An entity extractor configured by a plain JSON file. A sample of such a JSON file can be found [here](rasa_simple_config_sample.json).

A sample config for RASA:

```yml
language: en
pipeline:
  - name: WhitespaceTokenizer
  - name: RegexFeaturizer
  - name: simple_entity_extractor.SimpleEntityExtractor
    config: "myconfig.json"
    min_confidence: 0.8
```


## Database Entity Extractor
An entity extractor for [MySQL](https://www.mysql.com/). You can simply use a database to extract entities via fuzzy sets. You have to define one query per entity type.
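Each query must return a single column of entity values; the extractor loads the results of every query into a fuzzy set and matches each message token against it. Below is a minimal sketch of that matching idea, using only the `fuzzyset` package from `requirements.txt`; the names and the threshold are made up for illustration:

```python
from fuzzyset import FuzzySet  # the extractors fall back to this when cfuzzyset is unavailable

# Hypothetical values, e.g. rows returned by the "firstnames" query
firstnames = FuzzySet()
for name in ("Alice", "Alicia", "Bob"):
    firstnames.add(name)

# FuzzySet.get() returns a list of (confidence, value) pairs, or None if nothing is close
matches = firstnames.get("Alce") or []
matches = [m for m in matches if m[0] >= 0.81]  # same idea as "minimumConfidence"
print(matches)  # close matches above the threshold, if any
```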

A sample config for the extractor is [here](rasa_cer_config_sample_db.json):

```json
{
    "database_config": {
        "host": "",
        "user": "",
        "password": "",
        "database": ""
    },
    "database_queries": {
        "firstnames": "SELECT name FROM NamesDB WHERE EntityType = 'firstname';",
        "lastnames": "SELECT name FROM NamesDB WHERE EntityType = 'lastname';"
    },
    "minimumConfidence": 0.81
}
```

A sample config for RASA could look like the one [here](rasa_config_sample_db.yml):
```yml
language: en
pipeline:
  - name: WhitespaceTokenizer
  - name: RegexFeaturizer
  - name: database_entity_extractor.DatabaseEntityExtractor
    config: "/path/to/sample_config.json"
```

## LUIS Entity Extractor
An entity extractor for [LUIS](https://www.luis.ai). You can simply use the exported LUIS model to extract entities via fuzzy sets. Currently, only list entities are supported.

A sample config for RASA:

```yml
language: en
pipeline:
  - name: WhitespaceTokenizer
  - name: RegexFeaturizer
  - name: luis_entity_extractor.LuisEntityExtractor
    config: "luis.json"
    min_confidence: 0.8
```
--------------------------------------------------------------------------------
/database_entity_extractor.py:
--------------------------------------------------------------------------------
import json
from typing import Any, Dict, Optional, Text

import pymysql
import rasa.utils.io
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.extractors.extractor import EntityExtractor
from rasa.nlu.model import Metadata
from rasa.nlu.training_data import Message, TrainingData

try:
    from cfuzzyset import cFuzzySet as FuzzySet
except ImportError:
    from fuzzyset import FuzzySet


class DatabaseEntityExtractor(EntityExtractor):
    """
    This is a custom entity extractor accessing a database that contains a list of entities.
    Information about the database is stored in the JSON config file passed via the "config" parameter.
    This class performs (fuzzy-)matching of an input against every known entity. The most similar entities
    are returned as entities.
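
    Expected layout of the config file (see rasa_cer_config_sample_db.json):
      - "database_config": MySQL connection settings (host, user, password, database)
      - "database_queries": one SQL query per entity type; each query must return a single column
      - "minimumConfidence": fuzzy matches scoring below this value are discarded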
    """

    name = "DatabaseEntityExtractor"
    provides = ["entities"]
    requires = ["tokens"]

    def __init__(self, parameters: Dict[Text, Text]) -> None:
        super(DatabaseEntityExtractor, self).__init__(parameters)

        if parameters is None:
            raise AttributeError("No valid config given!")
        if not isinstance(parameters, dict):
            raise AttributeError(f"config has type {type(parameters)}")
        if "config" not in parameters.keys():
            raise AttributeError(f"config not given: parameters contains {parameters.keys()}")

        with open(parameters["config"]) as json_file:
            component_config = json.load(json_file)

        self.min_confidence = float(component_config["minimumConfidence"])
        self.ents = {}
        try:
            self._get_entity_groups(component_config["database_config"], component_config["database_queries"])
        except Exception:
            import warnings
            warnings.warn("An error occurred while fetching the database")

    def _get_entity_groups(self, database_config: Dict[Text, Text], database_queries: Dict[Text, Text]):
        db = pymysql.connect(host=database_config["host"],
                             user=database_config["user"],
                             passwd=database_config["password"],
                             db=database_config["database"])
        cur = db.cursor()
        print(f"Queries are: {database_queries.keys()}")
        for entity_key in database_queries.keys():
            cur.execute(database_queries[entity_key])
            current_entity = FuzzySet()
            for row in cur.fetchall():
                if len(row) != 1:
                    raise SyntaxError(f"{entity_key}: query returned more than one column!")
                current_entity.add(row[0])
            self.ents[entity_key] = current_entity
        db.close()

    def train(self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any) -> None:
        """
        Currently no training is needed for fuzzy matching.
        """
        pass

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        """
        Persist this component to disk for future loading.
        Currently does nothing because there is nothing to be persisted.
        """
        pass

    def process(self, message: Message, **kwargs: Any) -> None:
        """
        Process an incoming message by determining the most similar (or matching) names.
        """
        extracted = self.match_entities(message)
        message.set("entities", message.get("entities", []) + extracted, add_to_output=True)

    def match_entities(self, message: Message):
        """
        Perform fuzzy matching on each token of the message.
        A token contains its text, its offset, its end and optionally additional data.
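
        Returns a list of entity dicts with "start", "end", "value", "confidence" and
        "entity" keys, where "entity" is the name of the matching entity group.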
        """
        extracted_entities = []
        tokens = message.get("tokens")
        for token in tokens:
            for entity_type in self.ents.keys():
                fuzzy_matches = self.ents[entity_type].get(token.text)
                if fuzzy_matches is None:
                    continue  # no fuzzy candidate for this token
                for match in fuzzy_matches:
                    if match[0] < self.min_confidence:
                        continue  # skip low-confidence entities
                    entity = {
                        "start": token.start,
                        "end": token.end,
                        "value": match[1],
                        "confidence": match[0],
                        "entity": entity_type,
                    }
                    extracted_entities.append(entity)
        return extracted_entities
--------------------------------------------------------------------------------
/luis_entity_extractor.py:
--------------------------------------------------------------------------------
import json
from typing import Any, Dict, Optional, Text

import rasa.utils.io
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.extractors.extractor import EntityExtractor
from rasa.nlu.model import Metadata
from rasa.nlu.training_data import Message, TrainingData

try:
    from cfuzzyset import cFuzzySet as FuzzySet
except ImportError:
    from fuzzyset import FuzzySet


class LuisEntityExtractor(EntityExtractor):
    """
    This is a custom entity extractor accessing a LUIS configuration file that contains a list of entities.
    This class performs (fuzzy-)matching of an input against every known list entity. The most similar entities
    are returned as entities.
    """

    name = "LuisEntityExtractor"
    provides = ["entities"]
    requires = ["tokens"]

    def __init__(self, parameters: Dict[Text, Text]) -> None:
        super(LuisEntityExtractor, self).__init__(parameters)

        if parameters is None:
            raise AttributeError("No valid config given!")
        if not isinstance(parameters, dict):
            raise AttributeError(f"config has type {type(parameters)}")
        if "config" not in parameters.keys():
            raise AttributeError(f"config not given: parameters contains {parameters.keys()}")

        with open(parameters["config"], encoding="utf-8-sig") as json_file:
            parsed = json.load(json_file)
            self._entities = self._load(parsed["closedLists"])

        self._min_confidence = 0.7 if "min_confidence" not in parameters.keys() else float(parameters["min_confidence"])

    def process(self, message: Message, **kwargs: Any) -> None:
        """
        Process an incoming message by determining the most similar (or matching) names.
        """
        extracted = self._match_entities(message)
        message.set("entities", message.get("entities", []) + extracted, add_to_output=True)

    def _load(self, parsed_entities):
        entities = []
        for group in parsed_entities:
            group_name = group["name"]
            for group_element in group["subLists"]:
                fuzzy = FuzzySet()
                for x in [group_element["canonicalForm"]] + group_element["list"]:
                    fuzzy.add(x)

                entity = {
                    "group": group_name,
                    "canonical": group_element["canonicalForm"],
                    "fuzzy": fuzzy
                }
                entities.append(entity)
        return entities

    def _match_entities(self, message: Message):
        """
        Perform fuzzy matching on each token of the message.
        A token contains its text, its offset, its end and optionally additional data.
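
        Matches are reported with the sublist's canonical form as the entity value and the
        LUIS list name as the entity type; tokens without any fuzzy candidate are skipped.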
        """
        extracted_entities = []
        tokens = message.get("tokens")
        for token in tokens:
            for entity in self._entities:
                matches = entity["fuzzy"].get(token.text)

                if matches is None:
                    continue
                for match in matches:
                    if match[0] < self._min_confidence:
                        continue
                    extracted_entity = {
                        "start": token.start,
                        "end": token.end,
                        "value": entity["canonical"],
                        "confidence": match[0],
                        "entity": entity["group"],
                    }
                    extracted_entities.append(extracted_entity)
        return extracted_entities
--------------------------------------------------------------------------------
/rasa_cer_config_sample_db.json:
--------------------------------------------------------------------------------
{
    "database_config": {
        "host": "",
        "user": "",
        "password": "",
        "database": ""
    },
    "database_queries": {
        "firstnames": "SELECT name FROM NamesDB WHERE EntityType = 'firstname';",
        "lastnames": "SELECT name FROM NamesDB WHERE EntityType = 'lastname';"
    },
    "minimumConfidence": 0.81
}
--------------------------------------------------------------------------------
/rasa_config_sample_db.yml:
--------------------------------------------------------------------------------
language: en
pipeline:
  - name: WhitespaceTokenizer
  - name: RegexFeaturizer
  - name: database_entity_extractor.DatabaseEntityExtractor
    config: "/path/to/sample_config.json"
--------------------------------------------------------------------------------
/rasa_simple_config_sample.json:
--------------------------------------------------------------------------------
{
    "Group1": {
        "normalized": [
            "synonym",
            "synonym2"
        ]
    }
}
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
fuzzyset==0.0.19
PyMySQL==0.9.3
rasa==1.10.0
--------------------------------------------------------------------------------
/simple_entity_extractor.py:
--------------------------------------------------------------------------------
import json
from typing import Any, Dict, Optional, Text

import rasa.utils.io
from rasa.nlu.config import RasaNLUModelConfig
from rasa.nlu.extractors.extractor import EntityExtractor
from rasa.nlu.model import Metadata
from rasa.nlu.training_data import Message, TrainingData

try:
    from cfuzzyset import cFuzzySet as FuzzySet
except ImportError:
    from fuzzyset import FuzzySet


class SimpleEntityExtractor(EntityExtractor):
    """
    This is a custom entity extractor accessing a simple JSON file that contains a dictionary of entity groups.
    This class performs (fuzzy-)matching of an input against every known list entity. The most similar entities
    are returned as entities.
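
    Expected JSON layout (see rasa_simple_config_sample.json):
        {"<group>": {"<canonical form>": ["synonym", "synonym2", ...]}}
    Matches are reported with the canonical form as the entity value and the group name as the entity type.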
    """

    name = "SimpleEntityExtractor"
    provides = ["entities"]
    requires = ["tokens"]

    def __init__(self, parameters: Dict[Text, Text]) -> None:
        super(SimpleEntityExtractor, self).__init__(parameters)

        if parameters is None:
            raise AttributeError("No valid config given!")
        if not isinstance(parameters, dict):
            raise AttributeError(f"config has type {type(parameters)}")
        if "config" not in parameters.keys():
            raise AttributeError(f"config not given: parameters contains {parameters.keys()}")

        with open(parameters["config"], encoding="utf-8-sig") as json_file:
            parsed = json.load(json_file)
            self._entities = self._load(parsed)

        self._min_confidence = 0.7 if "min_confidence" not in parameters.keys() else float(parameters["min_confidence"])

    def process(self, message: Message, **kwargs: Any) -> None:
        """
        Process an incoming message by determining the most similar (or matching) names.
        """
        extracted = self._match_entities(message)
        message.set("entities", message.get("entities", []) + extracted, add_to_output=True)

    def _load(self, parsed_entities):
        entities = []
        for group in parsed_entities.keys():
            for element in parsed_entities[group].keys():
                fuzzy = FuzzySet()
                for x in [element] + parsed_entities[group][element]:
                    fuzzy.add(x)

                entity = {
                    "group": group,
                    "canonical": element,
                    "fuzzy": fuzzy
                }
                entities.append(entity)
        return entities

    def _match_entities(self, message: Message):
        """
        Perform fuzzy matching on each token of the message.
        A token contains its text, its offset, its end and optionally additional data.
        """
        extracted_entities = []
        tokens = message.get("tokens")
        for token in tokens:
            for entity in self._entities:
                matches = entity["fuzzy"].get(token.text)

                if matches is None:
                    continue
                for match in matches:
                    if match[0] < self._min_confidence:
                        continue
                    extracted_entity = {
                        "start": token.start,
                        "end": token.end,
                        "value": entity["canonical"],
                        "confidence": match[0],
                        "entity": entity["group"],
                    }
                    extracted_entities.append(extracted_entity)
        return extracted_entities
--------------------------------------------------------------------------------