├── macaw ├── __init__.py ├── core │ ├── __init__.py │ ├── input_handler │ │ ├── __init__.py │ │ ├── actions.py │ │ └── action_detection.py │ ├── output_handler │ │ ├── __init__.py │ │ ├── output_selection.py │ │ └── naive_output_selection.py │ ├── interaction_handler │ │ ├── __init__.py │ │ ├── user_requests_db.py │ │ └── msg.py │ ├── mrc │ │ ├── __init__.py │ │ └── drqa_mrc.py │ └── retrieval │ │ ├── __init__.py │ │ ├── search_engine.py │ │ ├── doc.py │ │ ├── bing_api.py │ │ ├── indri.py │ │ └── query_generation.py ├── interface │ ├── interface.py │ ├── __init__.py │ ├── stdio.py │ ├── fileio.py │ ├── speech_recognition.py │ └── telegram.py ├── util │ ├── logging.py │ ├── __init__.py │ └── text_parser.py ├── batch_exp_main.py ├── cis.py ├── live_main.py └── wizard_of_oz_main.py ├── .idea ├── .gitignore ├── vcs.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml ├── misc.xml └── macaw.iml ├── data ├── example_qa_output.txt ├── example_retrieval_input.txt ├── example_qa_input.txt └── example_retrieval_output.txt ├── macaw-arch.jpg ├── macaw-example-tax.jpg ├── macaw-example-shakespeare.jpg ├── requirements.txt ├── CODE_OF_CONDUCT.md ├── setup.py ├── LICENSE ├── .gitignore ├── SECURITY.md └── README.md /macaw/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /macaw/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /macaw/core/input_handler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /macaw/core/output_handler/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Default ignored files 3 | /workspace.xml -------------------------------------------------------------------------------- /data/example_qa_output.txt: -------------------------------------------------------------------------------- 1 | Q21 2020 2 | Q34 F. Wilfrid Lancaster 3 | -------------------------------------------------------------------------------- /macaw-arch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/macaw/HEAD/macaw-arch.jpg -------------------------------------------------------------------------------- /macaw-example-tax.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/macaw/HEAD/macaw-example-tax.jpg -------------------------------------------------------------------------------- /macaw-example-shakespeare.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/macaw/HEAD/macaw-example-shakespeare.jpg -------------------------------------------------------------------------------- /data/example_retrieval_input.txt: -------------------------------------------------------------------------------- 1 | 123 lung cancer symptoms treatments 2 | 267 information retrieval IR tutorials 3 | -------------------------------------------------------------------------------- /macaw/core/interaction_handler/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | FILE DESCRIPTION 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ -------------------------------------------------------------------------------- /data/example_qa_input.txt: 
-------------------------------------------------------------------------------- 1 | Q21 who is the president of the united states? when is the next presidential election? 2 | Q34 who is the father of information retrieval research? 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | func_timeout==4.3.5 2 | pymongo==3.9.0 3 | justext==2.2.0 4 | SpeechRecognition 5 | pydub 6 | python-telegram-bot==12.0.0 7 | stanfordcorenlp 8 | google-cloud-texttospeech 9 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /data/example_retrieval_output.txt: -------------------------------------------------------------------------------- 1 | 123 Q0 LA040190-0030 1 -10.726514210775541 macaw 2 | 123 Q0 FR940831-2-00098 2 -10.781756119063719 macaw 3 | 123 Q0 FR941028-2-00076 3 -10.803175601098255 macaw 4 | 267 Q0 FBIS4-20702 1 -4.786052863876007 macaw 5 | 267 Q0 FBIS4-20699 2 -4.849902126042474 macaw 6 | 267 Q0 FBIS4-20701 3 -5.229284984955124 macaw 7 | 
-------------------------------------------------------------------------------- /macaw/interface/interface.py: -------------------------------------------------------------------------------- 1 | """ 2 | The abstract interface class. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | 9 | 10 | class Interface(ABC): 11 | def __init__(self, params): 12 | self.params = params 13 | 14 | @abstractmethod 15 | def run(self): 16 | pass 17 | 18 | @abstractmethod 19 | def result_presentation(self, response_msg, params): 20 | pass -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='macaw', 5 | version='0.1', 6 | packages=['macaw', 'macaw.core', 'macaw.core.mrc', 'macaw.core.retrieval', 'macaw.core.input_handler', 7 | 'macaw.core.output_handler', 'macaw.core.interaction_handler', 'macaw.util', 'macaw.interface'], 8 | url='https://github.com/microsoft/macaw/', 9 | license='MIT', 10 | author='Hamed Zamani', 11 | author_email='hazamani@microsoft.com', 12 | description='An extensible framework for conversational information seeking research' 13 | ) 14 | 
-------------------------------------------------------------------------------- /.idea/macaw.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 12 | 14 | 15 | 17 | -------------------------------------------------------------------------------- /macaw/core/mrc/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The MRC module init. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | from macaw.core.mrc import drqa_mrc 8 | 9 | 10 | def get_mrc_model(params): 11 | """ 12 | This method returns the MRC class requested in the parameter dict. 13 | Args: 14 | params(dict): A dict of parameters. In this method, the parameters 'logger' and 'mrc' are required. Currently, 15 | only one MRC model (i.e., 'drqa') is implemented. 16 | 17 | Returns: 18 | An MRC object for machine reading comprehension. 19 | """ 20 | params['logger'].info('The MRC model for QA: ' + params['mrc']) 21 | if params['mrc'] == 'drqa': 22 | return drqa_mrc.DrQA(params) 23 | else: 24 | raise Exception('The requested MRC model does not exist!') 25 | 26 | -------------------------------------------------------------------------------- /macaw/interface/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The interface module init. 
3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | from macaw.interface import speech_recognition, telegram, stdio, fileio 8 | 9 | 10 | def get_interface(params): 11 | if 'asr_model' in params and params['asr_model'] == 'google': 12 | params['asr'] = speech_recognition.GoogleASR(params) 13 | if 'asg_model' in params and params['asg_model'] == 'google': 14 | params['asg'] = speech_recognition.GoogleText2Speech(params) 15 | 16 | if params['interface'] == 'telegram': 17 | return telegram.TelegramBot(params) 18 | elif params['interface'] == 'stdio': 19 | return stdio.StdioInterface(params) 20 | elif params['interface'] == 'fileio': 21 | return fileio.FileioInterface(params) 22 | else: 23 | raise Exception('The requested interface does not exist!') -------------------------------------------------------------------------------- /macaw/util/logging.py: -------------------------------------------------------------------------------- 1 | """ 2 | The internal logger. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | import logging 8 | 9 | 10 | class Logger(logging.Logger): 11 | def __init__(self, params): 12 | """ 13 | A simple logging class, inherited from the standard logging.Logger. 14 | 15 | Args: 16 | params(dict): A dict containing some parameters. 'logging_file' is an optional parameter, otherwise STDIO 17 | will be used for logging. 
18 | """ 19 | super().__init__('Macaw Logger') 20 | self.params = params 21 | if 'logging_file' in params: 22 | self.handler_ = logging.FileHandler(params['logging_file']) 23 | else: 24 | self.handler_ = logging.StreamHandler() 25 | 26 | self.format = logging.Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s') 27 | self.handler_.setFormatter(self.format) 28 | self.addHandler(self.handler_) 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /macaw/core/output_handler/output_selection.py: -------------------------------------------------------------------------------- 1 | """ 2 | The output post processing unit. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | 9 | 10 | class OutputProcessing(ABC): 11 | @abstractmethod 12 | def __init__(self, params): 13 | """ 14 | The post-processing unit for producing the response message. 15 | 16 | Args: 17 | params(dict): A dict of parameters. 18 | """ 19 | self.params = params 20 | 21 | @abstractmethod 22 | def get_output(self, conv, candidate_outputs): 23 | """ 24 | The response message generator method. 25 | 26 | Args: 27 | conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the 28 | user. This list is in reverse order, meaning that the first elements is the last interaction made by user. 29 | candidate_outputs(dict): A dict of str (i.e., action) to list of Documents (i.e., the action's result) as 30 | the response. This dict is produced by action dispatcher, which means this is the aggregation of all the 31 | executed actions. 32 | 33 | Returns: 34 | A response Message to be sent to the user. 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /macaw/util/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some util functions. 
3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | import json 8 | import time 9 | 10 | from stanfordcorenlp import StanfordCoreNLP 11 | 12 | 13 | def current_time_in_milliseconds(): 14 | """ 15 | A method that returns the current time in milliseconds. 16 | 17 | Returns: 18 | An int representing the current time in milliseconds. 19 | """ 20 | return int(round(time.time() * 1000)) 21 | 22 | 23 | class NLPUtil: 24 | def __init__(self, params): 25 | """ 26 | A simple NLP helper class. 27 | 28 | Args: 29 | params(dict): A dict containing some parameters. 30 | """ 31 | self.params = params 32 | self.corenlp = StanfordCoreNLP(self.params['corenlp_path'], quiet=False) 33 | 34 | # Pre-fetching the required models. 35 | props = {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False} 36 | self.corenlp.annotate('', properties=props) 37 | 38 | def get_coref(self, text): 39 | """ 40 | Run co-reference resolution on the input text. 41 | Args: 42 | text(str): It can be the concatenation of all conversation history. 43 | 44 | Returns: 45 | A json object containing all co-reference resolutions extracted from the input text. 
46 | """ 47 | props = {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False} 48 | result = json.loads(self.corenlp.annotate(text, properties=props)) 49 | 50 | return result 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # 
mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /macaw/core/interaction_handler/user_requests_db.py: -------------------------------------------------------------------------------- 1 | """ 2 | The conversation (or interaction) database implemented using MongoDB. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | from pymongo import MongoClient 8 | 9 | from macaw import util 10 | from macaw.core.interaction_handler.msg import Message 11 | 12 | 13 | class InteractionDB: 14 | def __init__(self, host, port, dbname): 15 | self.client = MongoClient(host, port) 16 | self.db = self.client[dbname] 17 | self.col = self.db['macaw_msgs'] 18 | 19 | def insert_one(self, msg): 20 | if msg.user_id is None or msg.text is None or msg.timestamp is None or msg.user_interface is None: 21 | raise Exception('Each message should include a user_interface, user_id, text, and timestamp.') 22 | self.col.insert_one(msg.__dict__) 23 | 24 | def get_all(self): 25 | print('Using get_all is only recommended for development purposes. 
It is not efficient!') 26 | return self.dict_list_to_msg_list(self.col.find({})) 27 | 28 | def get_conv_history(self, user_id, max_time, max_count): 29 | if max_time is None: 30 | res = self.col.find({'user_id': user_id}).sort([('timestamp', -1)]) 31 | else: 32 | res = self.col.find({'user_id': user_id, 33 | 'timestamp': {'$gt': util.current_time_in_milliseconds() - max_time}}).sort([('timestamp', -1)]) 34 | 35 | if max_count is not None: 36 | res = res.limit(max_count) 37 | return self.dict_list_to_msg_list(res) 38 | 39 | def close(self): 40 | self.client.close() 41 | 42 | @staticmethod 43 | def dict_list_to_msg_list(msg_dict_list): 44 | return [Message.from_dict(msg_dict) for msg_dict in msg_dict_list] 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /macaw/core/interaction_handler/msg.py: -------------------------------------------------------------------------------- 1 | """ 2 | The message used to represent each interaction in Macaw. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | 8 | class Message: 9 | def __init__(self, user_interface, user_id, user_info, msg_info, text, timestamp): 10 | """ 11 | An object for input and output Message. 12 | 13 | Args: 14 | user_interface(str): The interface name used for this message (e.g., 'telegram') 15 | user_id(str or int): The user ID. 16 | user_info(dict): The dict containing some more information about the user. 17 | msg_info(dict): The dict containing some more information about the message. 18 | text(str): The message text. 19 | timestamp(int): The timestamp of message in milliseconds. 20 | """ 21 | self.user_id = user_id 22 | self.user_info = user_info 23 | self.msg_info = msg_info 24 | self.text = text 25 | self.timestamp = timestamp 26 | self.user_interface = user_interface 27 | 28 | @classmethod 29 | def from_dict(cls, msg_dict): 30 | """ 31 | Get a Message object from dict. 
32 | Args: 33 | msg_dict(dict): A dict containing all the information required to construct a Message object. 34 | 35 | Returns: 36 | A Message object. 37 | """ 38 | user_interface = msg_dict['user_interface'] if 'user_interface' in msg_dict else None 39 | user_id = msg_dict['user_id'] if 'user_id' in msg_dict else None 40 | user_info = msg_dict['user_info'] if 'user_info' in msg_dict else None 41 | msg_info = msg_dict['msg_info'] if 'msg_info' in msg_dict else None 42 | text = msg_dict['text'] if 'text' in msg_dict else None 43 | timestamp = msg_dict['timestamp'] if 'timestamp' in msg_dict else None 44 | return cls(user_interface, user_id, user_info, msg_info, text, timestamp) -------------------------------------------------------------------------------- /macaw/core/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The retrieval module init. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | import macaw.core.retrieval.bing_api 7 | import macaw.core.retrieval.indri 8 | from macaw.core.retrieval import search_engine, query_generation 9 | 10 | 11 | def get_retrieval_model(params): 12 | """ 13 | This method returns the Retrieval class requested in the parameter dict. 14 | Args: 15 | params(dict): A dict of parameters. In this method, the parameters 'logger', 'query_generation', and 16 | 'search_engine' are required. Based on the requested retrieval model, some more parameters may be mandatory. 17 | Currently, Macaw serves two different search engines. One is based on indri (http://lemurproject.org/indri.php), 18 | and the other one is the Microsoft Bing API. If you want to retrieve results from your own document collection, 19 | indri is a useful search engine, otherwise you can rely on Bing's Web search. 20 | 21 | Returns: 22 | A Retrieval object for document retrieval. 
23 | """ 24 | params['logger'].info('The query generation model for retrieval: ' + params['query_generation']) 25 | if params['query_generation'] == 'simple': 26 | q_generation = query_generation.SimpleQueryGeneration(params) 27 | else: 28 | raise Exception('The requested query generation model does not exist!') 29 | 30 | params['logger'].info('The search engine for retrieval: ' + params['search_engine']) 31 | if params['search_engine'] == 'indri': 32 | return macaw.core.retrieval.indri.Indri({'query_generation': q_generation, 33 | 'indri_path': params['search_engine_path'], 34 | 'index': params['col_index'], 35 | 'text_format': params['col_text_format'], 36 | 'results_requested': params['results_requested'], 37 | 'logger': params['logger']}) 38 | elif params['search_engine'] == 'bing': 39 | return macaw.core.retrieval.bing_api.BingWebSearch({'query_generation': q_generation, 40 | 'bing_key': params['bing_key'], 41 | 'results_requested': params['results_requested'], 42 | 'logger': params['logger']}) 43 | else: 44 | raise Exception('The requested retrieval model does not exist!') -------------------------------------------------------------------------------- /macaw/batch_exp_main.py: -------------------------------------------------------------------------------- 1 | """ 2 | The main file for an experimental CIS with batch interaction support. 
3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | from macaw.cis import CIS 8 | from macaw.core import mrc, retrieval 9 | from macaw.core.input_handler.action_detection import RequestDispatcher 10 | from macaw.core.output_handler import naive_output_selection 11 | 12 | 13 | class ConvSearch(CIS): 14 | def __init__(self, params): 15 | super().__init__(params) 16 | self.retrieval = retrieval.get_retrieval_model(params=self.params) 17 | self.qa = mrc.get_mrc_model(params=self.params) 18 | self.request_dispatcher = RequestDispatcher({'retrieval': self.retrieval, 'qa': self.qa}) 19 | self.output_selection = naive_output_selection.NaiveOutputProcessing({}) 20 | 21 | def request_handler_func(self, conv_list): 22 | # identify action 23 | dispatcher_output = self.request_dispatcher.dispatch(conv_list) 24 | 25 | output_msg = self.output_selection.get_output(conv_list, dispatcher_output) 26 | return output_msg 27 | 28 | def run(self): 29 | self.interface.run() 30 | 31 | 32 | if __name__ == '__main__': 33 | basic_params = {'timeout': -1, # timeout is in terms of second. 34 | 'mode': 'exp'} # mode can be either live or exp. 35 | interface_params = {'interface': 'fileio', 36 | 'input_file_path': 'INPUT_FILE', 37 | 'output_file_path': 'OUTPUT_FILE', 38 | 'output_format': 'text'} 39 | retrieval_params = {'query_generation': 'simple', 40 | 'search_engine': 'bing', # 'bing' or 'indri' 41 | 'use_coref': True, # True, if query generator can use coreference resolution, otherwise False. 42 | 'bing_key': 'YOUR_BING_SUBSCRIPTION_TOKEN', # only for Bing Web Search 43 | 'search_engine_path': 'PATH_TO_INDRI', # only for Indri 44 | 'col_index': 'PATH_TO_INDRI_INDEX', # only for Indri 45 | 'col_text_format': 'trectext', # trectext or trecweb. Only for Indri. 
46 | 'results_requested': 3} 47 | mrc_params = {'mrc': 'drqa', 48 | 'mrc_model_path': 'PATH_TO_PRETRAINED_MRC_MODEL', 49 | 'mrc_path': 'PATH_TO_MRC_DIRECTORY', 50 | 'corenlp_path': 'PATH_TO_STANFORD_CORE_NLP_DIRECTORY', 51 | 'qa_results_requested': 3} 52 | 53 | params = {**basic_params, **interface_params, **retrieval_params, **mrc_params} 54 | ConvSearch(params).run() -------------------------------------------------------------------------------- /macaw/interface/stdio.py: -------------------------------------------------------------------------------- 1 | """ 2 | The STDIO interface for interactive CIS. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | import time 8 | import traceback 9 | 10 | from macaw import util 11 | from macaw.interface.interface import Interface 12 | from macaw.core.interaction_handler.msg import Message 13 | 14 | 15 | class StdioInterface(Interface): 16 | def __init__(self, params): 17 | super().__init__(params) 18 | self.msg_id = int(time.time()) 19 | 20 | def run(self): 21 | while True: 22 | try: 23 | request = input('ENTER YOUR COMMAND: ').strip() 24 | if len(request) == 0: 25 | continue 26 | user_info = {'first_name': 'STDIO', 27 | 'is_bot': 'False' 28 | } 29 | msg_info = {'msg_id': self.msg_id, 30 | 'msg_type': 'command' if request.startswith('#') else 'text', 31 | 'msg_source': 'user'} 32 | self.msg_id += 1 33 | msg = Message(user_interface='stdio', 34 | user_id=-1, 35 | user_info=user_info, 36 | msg_info=msg_info, 37 | text=request, 38 | timestamp=util.current_time_in_milliseconds()) 39 | output = self.params['live_request_handler'](msg) 40 | self.result_presentation(output, {}) 41 | except Exception as ex: 42 | traceback.print_exc() 43 | 44 | def result_presentation(self, response_msg, params): 45 | try: 46 | print('THE RESPONSE STARTS') 47 | print('----------------------------------------------------------------------') 48 | if response_msg.msg_info['msg_type'] == 'text': 49 | print(response_msg.text) 50 | 
elif response_msg.msg_info['msg_type'] == 'options': 51 | for (option_text, option_data, output_score) in response_msg.msg_info['options']: 52 | print(option_data, ' | ', option_text) 53 | elif response_msg.msg_info['msg_type'] == 'error': 54 | print('ERROR: NO RESULT!') 55 | else: 56 | raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type']) 57 | print('----------------------------------------------------------------------') 58 | print('THE RESPONSE STARTS') 59 | except Exception as ex: 60 | traceback.print_exc() 61 | 62 | -------------------------------------------------------------------------------- /macaw/core/retrieval/search_engine.py: -------------------------------------------------------------------------------- 1 | """ 2 | Abstract classes for retrieval and ranking models. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | 9 | 10 | class Retrieval(ABC): 11 | @abstractmethod 12 | def __init__(self, params): 13 | """ 14 | An abstract class for retrieval models. 15 | 16 | Args: 17 | params(dict): A dict containing some mandatory and optional parameters. 'query_generation' and 'logger' are 18 | required for all retrieval models. 19 | """ 20 | self.params = params 21 | self.query_generation = self.params['query_generation'] 22 | 23 | @abstractmethod 24 | def retrieve(self, query): 25 | """ 26 | This method should retrieve documents for the given query. 27 | 28 | Args: 29 | query(str): The query string. 30 | """ 31 | pass 32 | 33 | def get_results(self, conv_list): 34 | """ 35 | This method is the one that should be called. It simply calls the query generation model to generate a query 36 | from a conversation list and then runs the retrieval model and returns the results. 37 | Args: 38 | conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the 39 | user. 
This list is in reverse order, meaning that the first elements is the last interaction made by user. 40 | 41 | Returns: 42 | A list of Documents retrieved by the search engine. 43 | """ 44 | query = self.query_generation.get_query(conv_list) 45 | self.params['logger'].info('New query: ' + query) 46 | result_list = self.retrieve(query) 47 | if 'reranker' in self.params: 48 | return self.params['reranker'].rerank(query, conv_list, result_list, self.params) 49 | return result_list 50 | 51 | 52 | class ReRanker(ABC): 53 | @abstractmethod 54 | def __init__(self, params): 55 | """ 56 | This is an abstract class for a re-ranking model, e.g., learning to rank models. 57 | 58 | Args: 59 | params(dict): A dict containing some mandatory and optional parameters, such as the hyper-parameters for the 60 | re-ranking model. 61 | """ 62 | self.params = params 63 | 64 | def rerank(self, query, conv_list, result_list, params): 65 | """ 66 | This method is called for re-ranking the result_list in response to the query. 67 | 68 | Args: 69 | query(str): A query generated by a query generation model 70 | conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the 71 | user. This list is in reverse order, meaning that the first elements is the last interaction made by user. 72 | result_list(list): A list of Documents retrieved by a first stage retrieval model. 73 | params(dict): A dict containing some parameters required by the re-ranking model. 74 | 75 | Returns: 76 | A list of Documents. This list contains a subset of result_list with the highest re-ranking scores. 
77 | """ 78 | pass 79 | 80 | 81 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /macaw/interface/fileio.py: -------------------------------------------------------------------------------- 1 | """ 2 | The FileIO interface (for experimental batch interactions). 
class FileioInterface(Interface):
    def __init__(self, params):
        """
        A batch interface that reads conversations from a file and writes the responses to another file.

        Args:
            params(dict): A dict of parameters. The parameters read by this class are:
            'input_file_path': The path to the tab-separated input file.
            'output_file_path': The path to the output file (it is truncated on every run).
            'output_format': Either 'trec' (retrieval results) or 'text' (textual answers).
            'experimental_request_handler': A callable that maps a conversation list to an output Message.
        """
        super().__init__(params)
        self.msg_id = int(time.time())

    def run(self):
        """
        This method processes the input file line by line and writes one response per line to the output file.

        Each input line contains a query ID followed by one or more tab-separated utterances. The utterances are
        converted to Messages in reverse order, so the first element of the list is the last utterance of the line.

        Raises:
            Exception: If an input line contains fewer than two tab-separated elements.
        """
        # Both files are managed by the same 'with' statement, so the output file is closed (and its buffer is
        # flushed) even if a malformed line or the request handler raises an exception.
        with open(self.params['output_file_path'], 'w+') as output_file, \
                open(self.params['input_file_path']) as input_file:
            for line in input_file:
                str_list = line.strip().split('\t')
                if len(str_list) < 2:
                    raise Exception('Each input line should contain at least 2 elements: a query ID and a query text.')
                qid = str_list[0]

                # Every message gets its own user_info and msg_info dicts.
                conv_list = [Message(user_interface='NONE',
                                     user_id=-1,
                                     user_info={'first_name': 'NONE'},
                                     msg_info={'msg_id': qid,
                                               'msg_type': 'text',
                                               'msg_source': 'user'},
                                     text=utterance,
                                     timestamp=-1)
                             for utterance in reversed(str_list[1:])]
                output_msg = self.params['experimental_request_handler'](conv_list)
                self.result_presentation(output_msg, {'output_file': output_file, 'qid': qid})

    def result_presentation(self, output_msg, params):
        """
        This method writes the output message to the output file using the requested output format.

        Args:
            output_msg(Message): The response message produced by the request handler.
            params(dict): A dict containing 'qid' (the query ID as a str) and 'output_file' (an open, writable file
            object).

        Raises:
            Exception: If the output format is unknown or if the message type does not match the output format.
        """
        qid = params['qid']
        output_file = params['output_file']
        output_format = self.params['output_format']
        msg_type = output_msg.msg_info['msg_type']
        if output_format == 'trec':
            if msg_type != 'options':
                raise Exception('TREC output format is only recognized for retrieval results. '
                                'Therefore, the message type should be options.')
            # The rank is the 1-based position of the option in the list.
            for rank, (option_name, _, output_score) in enumerate(output_msg.msg_info['options'], 1):
                output_file.write(qid + '\tQ0\t' + option_name + '\t' + str(rank) + '\t' + str(output_score)
                                  + '\tmacaw\n')
        elif output_format == 'text':
            if msg_type != 'text':
                raise Exception('text output format is only recognized for text outputs.')
            # Newlines and tabs are replaced by spaces so that each answer stays on a single tab-separated line.
            output_file.write(qid + '\t' + output_msg.text.replace('\n', ' ').replace('\t', ' ') + '\n')
        else:
            raise Exception('Unknown output file format!')
31 | """ 32 | text = '' 33 | if isinstance(doc, str): 34 | return doc.strip() + '\n' 35 | elif isinstance(doc, dict): 36 | for key in doc: 37 | text += get_recursive_content_as_str(doc[key]) 38 | elif isinstance(doc, list): 39 | for t in doc: 40 | text += get_recursive_content_as_str(t) 41 | else: 42 | raise Exception('cannot parse document recursively, ' + str(type(doc))) 43 | return text 44 | 45 | 46 | # def get_trec_doc(doc): 47 | # doc_dict = xml_text_to_dict(doc) 48 | # id = doc_dict['DOCNO'] 49 | # title = None 50 | # text = get_recursive_content_as_str(doc_dict['TEXT']) 51 | # return Document(id, title, text, 0) 52 | 53 | 54 | def get_trec_doc(trec_doc, format='trectext'): 55 | """ 56 | This method returns a Document given a standard trectext or trecweb document. NOTE: There are much better parsers 57 | for TREC documents. 58 | Args: 59 | trec_doc(str): The document content with the trectext or trecweb format. 60 | format(str): The document format. Either 'trectext' or 'trecweb'. The default value is 'trectext'. 61 | 62 | Returns: 63 | An instance of Document. Note that the score is assigned to 0 and should be set later. 64 | """ 65 | trec_doc_lower = trec_doc.lower() 66 | id = trec_doc[trec_doc_lower.find('') + len(''):trec_doc_lower.find('')].strip() 67 | title = id # for some presentation reasons, the title of document is set to ids ID. 68 | if format == 'trectext': 69 | text = trec_doc[trec_doc_lower.find('') + len(''):trec_doc_lower.find('')] 70 | elif format == 'trecweb': 71 | text = trec_doc[trec_doc_lower.find('') + len(''):trec_doc_lower.find('')] 72 | else: 73 | raise Exception('Undefined TREC document format. Supported document formats are trectext and trecweb') 74 | text = re.sub('\s+', ' ', text).strip() # removing multiple consecutive whitespaces 75 | 76 | # Removing other tags in the text, e.g.,

class BingWebSearch(Retrieval):
    def __init__(self, params):
        """
        The Microsoft Bing Web search API. This class uses the Bing's API to get the retrieval results from the Web.
        Note that for some reasons, the results returned by the Bing API are usually different from the Bing search
        (without API).

        Args:
            params(dict): A dict containing some parameters. Here is the list of all required parameters:
            'bing_key': The Bing API key.
            'results_requested': The maximum number of requested documents for retrieval. If not given, it is set to 1.
            Note that this is limited by the number of results returned by the API.
            'logger': The logger used for reporting warnings.
        """
        super().__init__(params)
        self.results_requested = self.params['results_requested'] if 'results_requested' in self.params else 1
        self.subscription_key = self.params['bing_key']
        self.bing_api_url = 'https://api.cognitive.microsoft.com/bing/v7.0/search'
        self.header = {"Ocp-Apim-Subscription-Key": self.subscription_key}
        params['logger'].warning('There is a maximum number of transactions per second for the Bing API.')

    def retrieve(self, query):
        """
        This method retrieves documents in response to the given query. The content of every returned web page is
        downloaded and converted to clean text.

        Args:
            query(str): The query string.

        Returns:
            A list of Documents with the maximum length of the 'results_requested' parameter. The list is empty if the
            API response contains no web page results.
        """
        params = {"q": query, "textDecorations": True, "textFormat": "HTML"}
        response = requests.get(self.bing_api_url, headers=self.header, params=params)
        response.raise_for_status()
        search_results = response.json()

        # The 'webPages' section is missing from the response when the query has no web results, so it is read
        # defensively instead of raising a KeyError.
        web_pages = search_results.get('webPages', {}).get('value', [])
        # A non-positive 'results_requested' yields no results (a negative slice bound would drop from the end).
        limit = max(self.results_requested, 0)
        # The page requests use a browser-like user agent; it is the same for every page.
        page_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'}

        results = []
        for rank, page in enumerate(web_pages[:limit]):
            url = page['url']
            text = html_to_clean_text(requests.get(url, headers=page_headers).content)
            score = 10 - rank  # this is not a score returned by Bing (just 10 - document rank)
            results.append(Document(url, page['name'], text, score))
        return results

    def get_doc_from_index(self, doc_id):
        """
        This method retrieves a document content for a given document id (i.e., URL).

        Args:
            doc_id(str): The document ID.

        Returns:
            A Document from the collection whose ID is equal to the given doc_id. For some reasons, the method returns
            a list of Documents with a length of 1.
        """
        # Telegram has a nice interface for loading websites. Therefore, we decided to only pass the doc_id (URL). This
        # can be simply enhanced by the title and the content of the document.
        return [Document(doc_id, doc_id, doc_id, -1)]
32 | Note that the parameters 'query_generation' and 'logger' are required by the parent class. 33 | """ 34 | super().__init__(params) 35 | self.results_requested = self.params['results_requested'] if 'results_requested' in self.params else 1 36 | self.indri_path = self.params['indri_path'] 37 | self.index = pyndri.Index(self.params['index']) 38 | self.term2id, self.id2term, self.id2df = self.index.get_dictionary() 39 | self.id2tf = self.index.get_term_frequencies() 40 | 41 | def retrieve(self, query): 42 | """ 43 | This method retrieve documents in response to the given query. 44 | 45 | Args: 46 | query(str): The query string. 47 | 48 | Returns: 49 | A list of Documents with the maximum length of the 'results_requested' parameter. 50 | """ 51 | int_results = self.index.query(query, results_requested=self.results_requested) 52 | results = [] 53 | for int_doc_id, score in int_results: 54 | # ext_doc_id, content_term_id = self.index.document(int_doc_id) 55 | # index_content = [self.id2term[term_id] if term_id> 0 else 'UNK' for term_id in content_term_id] 56 | doc = self.get_doc_from_index(int_doc_id)[0] 57 | doc.score = score 58 | doc.id = str(int_doc_id) 59 | results.append(doc) 60 | return results 61 | 62 | def get_doc_from_index(self, doc_id): 63 | """ 64 | This method retrieves a document content for a given document id. 65 | 66 | Args: 67 | doc_id(str): The document ID. 68 | 69 | Returns: 70 | A Document from the collection whose ID is equal to the given doc_id. For some reasons, the method returns 71 | a list of Documents with a length of 1. 
class MRC(ABC):
    @abstractmethod
    def __init__(self, params):
        """
        The base class of all machine reading comprehension (MRC) models in Macaw. It cannot be instantiated
        directly; a concrete model should inherit from it and implement both abstract methods.

        Args:
            params(dict): The mandatory and optional parameters of the model. The dict is stored on the instance.
        """
        self.params = params

    @abstractmethod
    def get_results(self, conv_list, doc):
        """
        This method finds the answer(s) to the user's question in the given document.

        Args:
            conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.
            doc(Document): A document (core.retrieval.doc.Document) that potentially contains the answer.

        Returns:
            The inherited class should implement this method and return a list of Documents, each containing a
            candidate answer and its confidence score. This base implementation returns None.
        """
        return None
class XmlListConfig(list):
    def __init__(self, aList):
        """
        THIS CLASS IS DEPRECATED!

        A list built from the children of an XML element. A child with sub-elements is converted to an XmlDictConfig
        (a single sub-element, or sub-elements whose first two tags differ) or to a nested XmlListConfig (the first
        two sub-elements share a tag). A child without sub-elements contributes its stripped text, if it is not empty.

        Args:
            aList(iterable): An iterable of xml.etree.ElementTree.Element, e.g., a parent Element.
        """
        super().__init__()
        for element in aList:
            # The number of children is tested explicitly: testing the truth value of an Element is deprecated.
            if len(element) > 0:
                if len(element) == 1 or element[0].tag != element[1].tag:
                    self.append(XmlDictConfig(element))
                else:
                    self.append(XmlListConfig(element))
            elif element.text:
                text = element.text.strip()
                if text:
                    self.append(text)


class XmlDictConfig(dict):
    def __init__(self, parent_element):
        """
        THIS CLASS IS DEPRECATED!

        A dict built from an XML element. The attributes of the element are copied first. Then every child is stored
        under its tag: a child with sub-elements becomes a nested dict (extended with the child's attributes), a
        child with only attributes becomes a dict of its attributes, and any other child is mapped to its text
        (None for an empty element). If several children share a tag, the last one wins.

        Args:
            parent_element(xml.etree.ElementTree.Element): The element to be converted.
        """
        super().__init__()
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            # The number of children is tested explicitly: testing the truth value of an Element is deprecated.
            if len(element) > 0:
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                else:
                    # The first two children share a tag, so the children are treated as a list.
                    aDict = {element[0].tag: XmlListConfig(element)}
                if element.items():
                    aDict.update(dict(element.items()))
                self[element.tag] = aDict
            elif element.items():
                self[element.tag] = dict(element.items())
            else:
                self[element.tag] = element.text
55 | """ 56 | print(xml_text) 57 | root = ElementTree.XML(xml_text) 58 | return XmlDictConfig(root) 59 | 60 | 61 | def xml_file_to_dict(xml_file): 62 | """ 63 | THIS CLASS IS DEPRECATED! 64 | """ 65 | tree = ElementTree.parse(xml_file) 66 | root = tree.getroot() 67 | return XmlDictConfig(root) 68 | 69 | 70 | # def html_to_clean_text(html): 71 | # """ 72 | # Converting an HTML document to clean text. 73 | # Args: 74 | # html(str): The content of an HTML web page. 75 | # 76 | # Returns: 77 | # A str containing the clean content of the web page. 78 | # """ 79 | # def visible(element): 80 | # if element.parent.name in ['style', 'script', '[document]', 'head', 'title']: 81 | # return False 82 | # elif re.match('', str(element.encode('utf-8'))): 83 | # return False 84 | # return True 85 | # 86 | # soup = BeautifulSoup(html, features='html.parser') #.stripped_strings 87 | # data = soup.findAll(text=True) 88 | # result = filter(visible, data) 89 | # return ' '.join(result) 90 | 91 | def html_to_clean_text(html): 92 | """ 93 | Converting an HTML document to clean text. 94 | Args: 95 | html(str): The content of an HTML web page. 96 | 97 | Returns: 98 | A str containing the clean content of the web page. 99 | """ 100 | paragraphs = justext.justext(html, justext.get_stoplist("English")) 101 | clean_text_list = [] 102 | for paragraph in paragraphs: 103 | if not paragraph.is_boilerplate: 104 | clean_text_list.append(paragraph.text) 105 | return '\n'.join(clean_text_list) 106 | -------------------------------------------------------------------------------- /macaw/interface/speech_recognition.py: -------------------------------------------------------------------------------- 1 | """ 2 | Speech recognition and generation and some utility functions. 
class GoogleText2Speech(ASG):
    def __init__(self, params):
        """
        Speech generation using the Google Cloud text-to-speech client.

        Args:
            params(dict): A dict of parameters. The required parameter is:
            'google-speech-to-text-credential-file': The path to the Google credential file. It is exported through
            the GOOGLE_APPLICATION_CREDENTIALS environment variable before the client is created.
        """
        super().__init__(params)
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.params['google-speech-to-text-credential-file']
        # Instantiates a client
        self.client = texttospeech.TextToSpeechClient()
        # Build the voice request, select the language code ("en-US") and the ssml
        # voice gender ("neutral")
        self.voice = texttospeech.types.VoiceSelectionParams(
            language_code='en-US',
            ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
        # Select the type of audio file you want returned
        self.audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    def text_to_speech(self, text):
        """
        This method synthesizes the given text and stores the audio in an ogg file.

        Args:
            text(str): The text to be synthesized.

        Returns:
            The path (str) to the generated ogg file. The caller should delete the file afterwards.
        """
        # Set the text input to be synthesized
        synthesis_input = texttospeech.types.SynthesisInput(text=text)

        # Perform the text-to-speech request on the text input with the selected
        # voice parameters and audio file type
        response = self.client.synthesize_speech(synthesis_input, self.voice, self.audio_config)

        # The 'with' statement deletes the temporary mp3 file even if the conversion fails.
        with tempfile.NamedTemporaryFile(delete=True) as mp3_file:
            mp3_file.write(response.audio_content)
            # The converter re-opens the file by its name, so the buffered bytes must be flushed to disk first.
            # Otherwise, the end of the audio may be missing from the converted file.
            mp3_file.flush()
            return mp3_to_ogg(mp3_file.name)
20 | 21 | Args: 22 | params(dict): A dict containing some parameters. 23 | """ 24 | self.params = params 25 | if params['mode'] == 'live': 26 | self.params['live_request_handler'] = self.live_request_handler 27 | self.msg_db = InteractionDB(host=self.params['interaction_db_host'], 28 | port=self.params['interaction_db_port'], 29 | dbname=self.params['interaction_db_name']) 30 | elif params['mode'] == 'exp': 31 | self.params['experimental_request_handler'] = self.request_handler_func 32 | 33 | self.interface = interface.get_interface(params) 34 | try: 35 | self.nlp_util = util.NLPUtil(self.params) 36 | self.params['nlp_util'] = self.nlp_util 37 | except Exception as ex: 38 | self.params['logger'].warning('WARNING: There is a problem with setting up the NLP utility module.') 39 | self.timeout = self.params['timeout'] if 'timeout' in self.params else -1 40 | 41 | def live_request_handler(self, msg): 42 | try: 43 | # load conversation from the database and add the current message to the database 44 | conv = [msg] + self.msg_db.get_conv_history(user_id=msg.user_id, max_time=10 * 60 * 1000, max_count=10) 45 | self.msg_db.insert_one(msg) 46 | 47 | # output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv]) 48 | output_msg = self.request_handler_func(conv) 49 | self.msg_db.insert_one(output_msg) 50 | return output_msg 51 | 52 | except FunctionTimedOut: 53 | msg_info = dict() 54 | msg_info['msg_id'] = msg.msg_info['msg_id'] 55 | msg_info['msg_source'] = 'system' 56 | msg_info['msg_type'] = 'error' 57 | text = 'Time out, no result!' 
58 | timestamp = util.current_time_in_milliseconds() 59 | error_msg = Message(msg.user_interface, msg.user_id, msg.user_info, msg_info, text, timestamp) 60 | self.msg_db.insert_one(error_msg) 61 | return error_msg 62 | 63 | # def experimental_request_handler(self, str_list): 64 | # if not isinstance(str_list, list): 65 | # raise Exception('The input should be a list!') 66 | # 67 | # conv_list = [] 68 | # for i in range(len(str_list)): 69 | # if not isinstance(str_list[i], str): 70 | # raise Exception('Each element of the input should be a string!') 71 | # user_info = {'first_name': 'NONE'} 72 | # msg_info = {'msg_id': -1, 73 | # 'msg_type': 'command' if str_list[i].startswith('#') else 'text', 74 | # 'msg_source': 'user'} 75 | # msg = Message(user_interface='NONE', 76 | # user_id=-1, 77 | # user_info=user_info, 78 | # msg_info=msg_info, 79 | # text=str_list[i], 80 | # timestamp=util.current_time_in_milliseconds()) 81 | # conv_list.append(msg) 82 | # conv_list.reverse() 83 | # 84 | # if self.timeout > 0: 85 | # output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv_list]) 86 | # else: 87 | # output_msg = self.request_handler_func(conv_list) 88 | # return output_msg 89 | 90 | @abstractmethod 91 | def request_handler_func(self, conv_list): 92 | pass 93 | 94 | @abstractmethod 95 | def run(self): 96 | pass -------------------------------------------------------------------------------- /macaw/core/input_handler/actions.py: -------------------------------------------------------------------------------- 1 | """ 2 | All actions supported by CIS. 3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | from func_timeout import func_timeout, FunctionTimedOut 9 | import traceback 10 | 11 | 12 | class Action(ABC): 13 | @staticmethod 14 | @abstractmethod 15 | def run(conv_list, params): 16 | """ 17 | This is a static method for an abstract class. This method should run the corresponding action. 
class QAAction(Action):
    @staticmethod
    def run(conv_list, params):
        """
        The question answering action. It first runs the retrieval action and then applies the MRC model to the text
        of the first retrieved document that is not blank.
        Args:
            conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.
            params(dict): A dict containing some parameters. The MRC model is read from params['actions']['qa'] and
            the same params dict is passed to the retrieval action.

        Returns:
            The output of the MRC model's get_results method for the selected document text.
        """
        passage = ''
        for candidate in RetrievalAction.run(conv_list, params):
            passage = candidate.text
            # Stop at the first document that has some content. If all documents are blank, the text of the last
            # document is used.
            if passage.strip():
                break
        return params['actions']['qa'].get_results(conv_list, passage)
class NaiveOutputProcessing(OutputProcessing):
    """
    The naive output post processing unit: it picks one action's output using a fixed priority order
    ('#get_doc' command, then QA, then retrieval) and wraps it in a response Message.
    """

    # Lower-cased prefixes that mark the last user message as a question.
    _QUESTION_PREFIXES = ('what', 'who', 'when', 'where', 'how')

    def __init__(self, params):
        """
        This module simply prioritizes the action outputs. If the message was a command, it returns the command's
        output. Otherwise, it prioritizes QA results and then retrieval results (the rationale is that if there is an
        exact answer for the user's question, there is no need to show the retrieval results).

        Args:
            params(dict): A dict of parameters.
        """
        super().__init__(params)

    def output_selection(self, conv_list, candidate_outputs):
        """
        This method selects one of the outputs produced by the actions.

        Args:
            conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.
            candidate_outputs(dict): A dict of str (i.e., action) to list of Documents (i.e., the action's result) as
            the response. This dict is produced by action dispatcher, which means this is the aggregation of all the
            executed actions.

        Returns:
            A str denoting the selected action. If none is selected, None is returned.
        """
        # An empty '#get_doc' result is treated as "no output", because get_output reads its first element.
        if candidate_outputs.get('#get_doc'):
            return '#get_doc'

        # Guard against an empty QA result list before looking at the first answer.
        qa_results = candidate_outputs.get('qa')
        if qa_results and len(qa_results[0].text) > 0:
            request = conv_list[0].text
            if request.endswith('?') or request.lower().startswith(self._QUESTION_PREFIXES):
                return 'qa'

        if candidate_outputs.get('retrieval'):
            return 'retrieval'
        return None

    def get_output(self, conv, candidate_outputs):
        """
        The response Message generation method.

        Args:
            conv(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.
            candidate_outputs(dict): A dict of str (i.e., action) to list of Documents (i.e., the action's result) as
            the response. This dict is produced by action dispatcher, which means this is the aggregation of all the
            executed actions.

        Returns:
            A response Message to be sent to the user.

        Raises:
            Exception: If the selected action is unknown, or if the response timestamp is not strictly later than
            the timestamp of the last user message.
        """
        last_msg = conv[0]
        msg_info = {'msg_id': last_msg.msg_info['msg_id'],
                    'msg_source': 'system'}

        selected_action = self.output_selection(conv, candidate_outputs)
        if selected_action is None:
            msg_info['msg_type'] = 'text'
            msg_info['msg_creator'] = 'no answer error'
            text = 'No response has been found! Please try again!'
        elif selected_action == 'qa':
            # The answer is returned in the same modality (e.g., text or voice) as the request.
            msg_info['msg_type'] = last_msg.msg_info['msg_type']
            msg_info['msg_creator'] = 'qa'
            text = candidate_outputs['qa'][0].text
        elif selected_action == 'retrieval':
            msg_info['msg_type'] = 'options'
            msg_info['msg_creator'] = 'retrieval'
            text = 'Retrieved document list (click to see the document content):'
            # Each option is (displayed title, command sent back on click, retrieval score).
            msg_info['options'] = [(output.title, '#get_doc ' + output.id, output.score)
                                   for output in candidate_outputs['retrieval']]
        elif selected_action == '#get_doc':
            msg_info['msg_type'] = 'text'
            msg_info['msg_creator'] = '#get_doc'
            text = candidate_outputs['#get_doc'][0].text
        else:
            raise Exception('The candidate output key is not familiar!')

        timestamp = util.current_time_in_milliseconds()
        if timestamp <= last_msg.timestamp:
            raise Exception('There is a problem in the output timestamp!')
        return Message(last_msg.user_interface, last_msg.user_id, last_msg.user_info, msg_info, text, timestamp)
class QueryGeneration(ABC):
    """An abstract class for query generation models."""

    @abstractmethod
    def __init__(self, params):
        """
        Stores the parameters for use by the concrete query generation model.

        Args:
            params(dict): A dict containing some mandatory and optional parameters.
        """
        self.params = params

    @abstractmethod
    def get_query(self, conv_list):
        """
        This method is called to get the query generated from a list of conversational interactions.

        Args:
            conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.

        Returns:
            The inherited class should implement this method and return a str containing a query for retrieval
            purposes.
        """
        pass


class SimpleQueryGeneration(QueryGeneration):
    """A query generator that uses the last user request, optionally expanded with its co-references."""

    # Maps every ASCII punctuation character to a space; built once instead of on every get_query call.
    _PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def __init__(self, params):
        """
        This class is a simple implementation of query generation that only focuses on the last interaction in the
        conversation and uses the last interaction as the query.

        Args:
            params(dict): A dict containing some mandatory and optional parameters. 'use_coref' (optional) enables
            co-reference expansion, in which case 'nlp_util' (an object providing 'get_coref') is read as well.
        """
        super().__init__(params)

    def get_query(self, conv_list):
        """
        This method generates a query from a list of conversational interactions by using the last user request, with
        some pre-processing (punctuation characters are replaced by spaces and the result is stripped).

        Args:
            conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.

        Returns:
            A str containing the query for retrieval.
        """
        q = conv_list[0].text
        if self.params.get('use_coref'):
            # Append every mention that co-refers with a term of the last request.
            for mentions in self.get_query_coref(conv_list).values():
                q += ' ' + ' '.join(mentions)
        return q.translate(self._PUNCTUATION_TO_SPACE).strip()

    def get_query_coref(self, conv_list):
        """
        This method computes all co-references in the conversation history for the query terms (i.e., those in the
        last interaction).

        Args:
            conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.

        Returns:
            A dict from terms in the last user request to a list of all identified co-references.
        """
        coref_result = self.compute_corefs(conv_list)
        # Mentions whose 'sentNum' equals the number of sentences belong to the last sentence (the current request).
        last_index = len(coref_result['sentences'])
        q_coref = dict()
        for chain in coref_result['corefs'].values():
            query_mention = next((item['text'] for item in chain if item['sentNum'] == last_index), None)
            if query_mention is None:
                continue  # this chain has no mention in the last request.
            # setdefault/extend keeps the antecedents of earlier chains when two chains share the same query mention
            # text (a plain assignment would silently drop them).
            q_coref.setdefault(query_mention, []).extend(
                item['text'] for item in chain if item['sentNum'] != last_index)
        return q_coref

    def compute_corefs(self, conv_list):
        """
        This method runs co-reference resolution (via params['nlp_util'].get_coref) on the requests made by the user
        in the conversation. Note: this method ignores system responses.

        Args:
            conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.

        Returns:
            A dict containing all sentence and co-reference information.

        Raises:
            Exception: If the conversation contains no text or voice message from the user.
        """
        # Walk the conversation in chronological order and terminate each user request with a question mark.
        conv_history = [msg.text if msg.text.endswith('?') else (msg.text + '?')
                        for msg in reversed(conv_list)
                        if msg.msg_info['msg_source'] == 'user' and msg.msg_info['msg_type'] in ['text', 'voice']]
        if len(conv_history) == 0:
            raise Exception('The query generation model cannot generate any query! There should be a problem')
        return self.params['nlp_util'].get_coref(' '.join(conv_history))
class RequestDispatcher:
    """The main request dispatcher: runs every configured action in its own process and collects the results."""

    def __init__(self, params):
        """
        The main request dispatcher class. This module runs multiple actions in parallel for a pre-specified timeout
        and returns all of the obtained results.

        Args:
            params(dict): A dict of parameters. Required params include 'actions' and 'timeout'.
        """
        self.params = params

    def dispatch(self, conv_list):
        """
        The request dispatcher method. Command messages are executed directly; for all other messages, every action
        in params['actions'] is run in parallel using multiprocessing.

        Args:
            conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.

        Returns:
            A dict of str (i.e., action) to list of Documents (i.e., the action's result) as the response. Actions
            with an empty or missing result are left out.
        """
        if conv_list[0].msg_info['msg_type'] == 'command':
            command = conv_list[0].text.split(' ')[0]
            return self.execute_command(conv_list, command)

        # Nothing to run: avoid spawning a manager server process for no work.
        if not self.params['actions']:
            return dict()

        # The context manager shuts the manager's server process down deterministically, even if starting or joining
        # a worker raises, instead of leaving the shutdown to garbage collection.
        with multiprocessing.Manager() as manager:
            action_results = manager.dict()
            action_processes = []
            for action in self.params['actions']:
                p = multiprocessing.Process(target=actions.run_action,
                                            args=[action, conv_list.copy(), self.params, action_results])
                action_processes.append(p)
                p.start()

            for p in action_processes:
                p.join()

            # Copy the results out of the shared dict (one proxy call) before the manager is shut down.
            return {action: result for action, result in action_results.items() if result}

    def execute_command(self, conv_list, command):
        """
        The command executor method.

        Args:
            conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.
            command(str): A str showing the command.

        Returns:
            A dict of str (i.e., command) to list of Documents (i.e., the action's result) as the response.

        Raises:
            Exception: If the command is not recognized.
        """
        if command == '#get_doc':
            # Everything after the command name is the document id (which may itself contain spaces).
            doc_id = ' '.join(conv_list[0].text.split(' ')[1:])
            return {'#get_doc': actions.GetDocFromIndex.run(None, {**self.params, **{'doc_id': doc_id}})}
        raise Exception('Command not found!')
starting up...') 29 | self.retrieval = retrieval.get_retrieval_model(params=self.params) 30 | self.qa = mrc.get_mrc_model(params=self.params) 31 | self.params['actions'] = {'retrieval': self.retrieval, 'qa': self.qa} 32 | self.request_dispatcher = RequestDispatcher(self.params) 33 | self.output_selection = naive_output_selection.NaiveOutputProcessing({}) 34 | 35 | def request_handler_func(self, conv_list): 36 | """ 37 | This function is called for each conversational interaction made by the user. In fact, this function calls the 38 | dispatcher to send the user request to the information seeking components. 39 | 40 | Args: 41 | conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the 42 | user. This list is in reverse order, meaning that the first elements is the last interaction made by user. 43 | 44 | Returns: 45 | output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user. 46 | """ 47 | self.logger.info(conv_list) 48 | dispatcher_output = self.request_dispatcher.dispatch(conv_list) 49 | output_msg = self.output_selection.get_output(conv_list, dispatcher_output) 50 | return output_msg 51 | 52 | def run(self): 53 | """ 54 | This function is called to run the ConvQA system. In live mode, it never stops until the program is killed. 55 | """ 56 | self.interface.run() 57 | 58 | 59 | if __name__ == '__main__': 60 | basic_params = {'timeout': 15, # timeout is in terms of second. 61 | 'mode': 'live', # mode can be either live or exp. 62 | 'logger': Logger({})} # for logging into file, pass the filepath to the Logger class. 63 | 64 | # These are required database parameters if the mode is 'live'. The host and port of the machine hosting the 65 | # database, as well as the database name. 66 | db_params = {'interaction_db_host': 'localhost', 67 | 'interaction_db_port': 27017, 68 | 'interaction_db_name': 'macaw_test'} 69 | 70 | # These are interface parameters. 
They are interface specific. 71 | interface_params = {'interface': 'telegram', # interface can be 'telegram' or 'stdio' for live mode, and 'fileio' 72 | # for exp mode. 73 | 'bot_token': 'YOUR_TELECGRAM_BOT_TOKEN', # Telegram bot token. 74 | 'asr_model': 'google', # The API used for speech recognition. 75 | 'asg_model': 'google', # The API used for speech generation. 76 | 'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE'} 77 | 78 | # These are parameters used by the retrieval model. 79 | retrieval_params = {'query_generation': 'simple', # the model that generates a query from a conversation history. 80 | 'use_coref': True, # True, if query generator can use coreference resolution, otherwise False. 81 | 'search_engine': 'bing', # the search engine. It can be either 'indri' or 'bing'. 82 | 'bing_key': 'YOUR_BING_SUBSCRIPTION_KEY', # Bing API key 83 | 'search_engine_path': 'PATH_TO_INDRI', # The path to the indri toolkit. 84 | 'col_index': 'PATH_TO_INDRI_INDEX', # The path to the indri index. 85 | 'col_text_format': 'trectext', # collection text format. Standard 'trectext' is only supported. 86 | 'results_requested': 3} # Maximum number of docs that should be retrieved by search engine. 87 | # Note: If you want to have a re-ranking model (e.g., learning to rank), you just need to simply extend the class 88 | # core.retrieval.search_engine.ReRanker and implement the method 'rerank'. Then simply add a 'reranker' parameter to 89 | # retrieval_params that points to an instance of your favorite ReRanker class. If there is a 'reranker' parameter in 90 | # retrieval_params, the retrieval model automatically calls the re-ranking method. For more details, see the method 91 | # 'get_results' in class core.retrieval.search_engine.Retrieval. 92 | 93 | # These are parameters used by the machine reading comprehension model. 94 | mrc_params = {'mrc': 'drqa', # MRC model. 95 | 'mrc_model_path': 'PATH_TO_PRETRAINED_MRC_MODEL', # The path to the model parameters. 
class TelegramBot(Interface):
    """A Telegram bot interface for Macaw supporting text, voice, and button-click interactions."""

    def __init__(self, params):
        """
        A Telegram bot interface for Macaw.

        Args:
            params(dict): A dict of parameters. The params 'logger' and 'bot_token' are mandatory.
        """
        super().__init__(params)
        self.logger = self.params['logger']

        self.MAX_MSG_LEN = 1000  # maximum number of characters in each response message.
        self.MAX_OPTION_LEN = 30  # maximum number of characters in each clickable option text.

        # Starting the bot by creating the Updater.
        # Make sure to set use_context=True to use the new context based callbacks
        # If you don't have a bot_token, add 'botfather' to your personal Telegram account and follow the instructions
        # to get a token for your bot.
        self.updater = Updater(self.params['bot_token'], use_context=True)
        self.dp = self.updater.dispatcher

        # Telegram command handlers (e.g., /start)
        self.dp.add_handler(CommandHandler('start', self.start))
        self.dp.add_handler(CommandHandler('help', self.help))

        # Telegram message handlers
        self.dp.add_handler(MessageHandler(Filters.text, self.request_handler))
        self.dp.add_handler(MessageHandler(Filters.voice, self.voice_request_handler))
        self.dp.add_handler(CallbackQueryHandler(self.button_click_handler))

        # logging all errors
        self.dp.add_error_handler(self.error)

    def start(self, update, context):
        """Send a message when the command /start is issued."""
        update.message.reply_text('Hi, welcome to Macaw! Macaw is an open-source extensible framework for '
                                  'conversational information seeking. Visit: https://github.com/microsoft/macaw')

    def help(self, update, context):
        """Send a message when the command /help is issued."""
        update.message.reply_text('Macaw should be able to answer your questions. Just ask a question!')

    def _handle_request(self, update, source_message, msg_type, text):
        """Build a Macaw Message from a Telegram message, run the live request handler, and present its output."""
        user_info = {'first_name': source_message.chat.first_name,
                     'last_name': source_message.chat.last_name,
                     # Public accessor instead of the private cache attribute 'update._effective_user'.
                     'is_bot': update.effective_user.is_bot
                     }
        msg_info = {'msg_id': source_message.message_id,
                    'msg_type': msg_type,
                    'msg_source': 'user'}
        msg = Message(user_interface='telegram',
                      user_id=source_message.chat.id,
                      user_info=user_info,
                      msg_info=msg_info,
                      text=text,
                      timestamp=util.current_time_in_milliseconds())
        output = self.params['live_request_handler'](msg)
        self.result_presentation(output, {'update': update})

    def request_handler(self, update, context):
        """This method handles all text messages, and asks result_presentation to send the response to the user."""
        try:
            self.logger.info(update.message)
            self._handle_request(update, update.message, 'text', update.message.text)
        except Exception:
            traceback.print_exc()

    def voice_request_handler(self, update, context):
        """This method handles all voice messages, and asks result_presentation to send the response to the user."""
        try:
            # The context manager removes the temporary audio file even if download or recognition fails.
            with tempfile.NamedTemporaryFile(delete=True) as ogg_file:
                update.message.voice.get_file().download(ogg_file.name)
                text = self.params['asr'].speech_to_text(ogg_file.name)
            update.message.reply_text('Macaw heard: ' + text)
            self._handle_request(update, update.message, 'voice', text)
        except Exception:
            traceback.print_exc()

    def button_click_handler(self, update, context):
        """This method handles clicks, and asks result_presentation to send the response to the user."""
        try:
            self.logger.info(update)
            # The data attached to the clicked button (e.g., '#get_doc <id>') is treated as a command.
            self._handle_request(update, update.callback_query.message, 'command', update.callback_query.data)
        except Exception:
            traceback.print_exc()

    def result_presentation(self, response_msg, params):
        """This method produces an appropriate response to be sent to the client.

        Args:
            response_msg(Message): The response to present, or None if there is nothing to send.
            params(dict): A dict containing the Telegram 'update' that triggered the response.
        """
        try:
            if response_msg is None:
                return
            update = params['update']
            # The message to reply to: the user's message, or for button clicks the message carrying the button.
            # NOTE(review): relies on python-telegram-bot's Update.effective_message covering callback queries.
            reply_target = update.effective_message
            msg_type = response_msg.msg_info['msg_type']
            if msg_type == 'text':
                if reply_target is not None:
                    reply_target.reply_text(response_msg.text[:self.MAX_MSG_LEN])
            elif msg_type == 'voice':
                ogg_file_name = self.params['asg'].text_to_speech(response_msg.text[:self.MAX_MSG_LEN])
                try:
                    with open(ogg_file_name, 'rb') as voice_file:
                        self.updater.bot.send_voice(chat_id=reply_target.chat.id, voice=voice_file)
                finally:
                    os.remove(ogg_file_name)  # removing audio files for privacy reasons, even if sending failed.
            elif msg_type == 'options':
                keyboard = [[InlineKeyboardButton(option_text[:self.MAX_OPTION_LEN],
                                                  callback_data=urllib.parse.unquote(option_data))]
                            for (option_text, option_data, output_score) in response_msg.msg_info['options']]
                reply_markup = InlineKeyboardMarkup(keyboard)
                reply_target.reply_text(response_msg.text[:self.MAX_MSG_LEN], reply_markup=reply_markup)
            elif msg_type == 'error':
                if reply_target is not None:
                    reply_target.reply_text('ERROR: NO RESULT!')
            else:
                raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type'])
        except Exception:
            traceback.print_exc()

    def error(self, update, context):
        """Log Errors caused by Updates."""
        self.logger.warning('Update "%s" caused error "%s"', update, context.error)

    def send_msg(self, chat_id, msg_text):
        """This method is used for sending a message to a user. It can be used for mixed-initiative interactions, as
        well as Wizard of Oz settings."""
        self.updater.bot.sendMessage(chat_id=chat_id, text=msg_text)

    def run(self):
        """Starting the bot!"""
        self.logger.info('Running the Telegram bot!')
        self.updater.start_polling()
        # Run the bot until you press Ctrl-C or the process receives SIGINT,
        # SIGTERM or SIGABRT. This should be used most of the time, since
        # start_polling() is non-blocking and will stop the bot gracefully.
        self.updater.idle()
The high-level architecture of Macaw 29 | is presented below: 30 | 31 | ![The high-level architecture of Macaw](macaw-arch.jpg) 32 | 33 | For more information on each module in Macaw, refer to [this paper](https://arxiv.org/pdf/1912.08904.pdf). 34 | 35 | #### Interfaces 36 | Macaw supports the following interfaces: 37 | + Standard IO: For *development* purposes 38 | + File IO: For *batch experiments* (see the examples in the `data` folder for input and output file formats) 39 | + Telegram bot: For interaction with real users 40 | 41 | Here is an example of the Telegram interface for Macaw. It supports multi-modal interactions (text, speech, click, etc.). 42 | 43 | ![Telegram interface for Macaw](macaw-example-tax.jpg) 44 | ![Telegram interface for Macaw](macaw-example-shakespeare.jpg) 45 | 46 | 47 | #### Retrieval 48 | Macaw features the following search engines: 49 | + [Indri](http://lemurproject.org/indri.php): an open-source search engine that can be used for any arbitrary text 50 | collection. 51 | + Bing web search API: sending a request to the Bing API and getting the results. 52 | 53 | #### Answer Selection and Generation 54 | For question answering, Macaw only features [the DrQA model](https://github.com/facebookresearch/DrQA) in its current 55 | version. 56 | 57 | 58 | ## Installation 59 | Macaw requires `Python >= 3.6` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`. 60 | To install Macaw, first **clone macaw** from this repo and then follow the installation steps below. The 61 | mentioned installation commands can be executed on Ubuntu. You can use the same or similar commands on other Linux 62 | distributions. If you are using Windows 10, we recommend installing Macaw and all the required packages on 63 | [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10). 64 | 65 | #### Step 1: Installing MongoDB server 66 | Macaw uses MongoDB for storing and retrieving user interactions (conversations).
To install MongoDB server, run the 67 | following command: 68 | ``` 69 | sudo apt-get install mongodb-server-core 70 | ``` 71 | 72 | #### Step 2: Installing Indri and Pyndri 73 | [Indri](http://lemurproject.org/indri.php) is an open-source search engine for information retrieval research, 74 | implemented as part of the [Lemur Project](http://lemurproject.org/). 75 | [Pyndri](https://github.com/cvangysel/pyndri) is a python interface to Indri. Macaw uses Indri for retrieving documents 76 | from an arbitrary text collection. 77 | To install Indri, first download Indri from https://sourceforge.net/projects/lemur/files/lemur/. As suggested by pyndri, 78 | we have used Indri-5.11. This Indri version can be installed as follows: 79 | ``` 80 | # download indri-5.11.tar.gz 81 | sudo apt install g++ zlib1g-dev 82 | tar xzvf indri-5.11.tar.gz 83 | rm indri-5.11.tar.gz 84 | cd indri-5.11 85 | ./configure CXX="g++ -D_GLIBCXX_USE_CXX11_ABI=0" 86 | make 87 | sudo make install 88 | ``` 89 | 90 | Then, clone the pyndri repository from https://github.com/cvangysel/pyndri and run the following command: 91 | ``` 92 | python3 setup.py install 93 | ``` 94 | 95 | At this step, you can make sure your installation is complete by running the pyndri tests. 96 | 97 | #### Step 3: Installing Stanford Core NLP 98 | Stanford Core NLP can be used for tokenization and most importantly for co-reference resolution. If you do not need 99 | co-reference resolution, you can ignore this step. Stanford Core NLP requires `java`. 
Get it by following these 100 | commands: 101 | ``` 102 | wget -O "stanford-corenlp-full-2017-06-09.zip" "http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip" 103 | sudo apt-get install unzip 104 | unzip "stanford-corenlp-full-2017-06-09.zip" 105 | rm "stanford-corenlp-full-2017-06-09.zip" 106 | ``` 107 | 108 | If you don't have `java`, install it using: 109 | ``` 110 | sudo apt-get install default-jre 111 | ``` 112 | 113 | #### Step 4: Installing DrQA 114 | Macaw also supports answer extraction / generation for user queries from retrieved documents. For this purpose, it 115 | features [DrQA](https://github.com/facebookresearch/DrQA). If you do not need this functionality, ignore this step (you 116 | can also install this later). 117 | To install DrQA, run the following commands: 118 | ``` 119 | git clone https://github.com/facebookresearch/DrQA.git 120 | cd DrQA 121 | pip3 install -r requirements.txt 122 | pip3 install torch 123 | sudo python3 setup.py develop 124 | ``` 125 | 126 | To use pre-trained DrQA model, use the following command. 127 | ``` 128 | ./download.sh 129 | ``` 130 | This downloads a 7.5GB (compressed) file and requires 25GB (uncompressed) space. This may take a while! 131 | 132 | 133 | 134 | #### Step 5: Installing FFmpeg 135 | To support speech interactions with users, Macaw requires FFmpeg for some multimedia processing steps. If you don't 136 | need a speech support from Macaw, you can skip this step. 
To install FFmpeg, run the following command: 137 | ``` 138 | sudo apt-get install ffmpeg 139 | ``` 140 | 141 | #### Step 6: Installing Macaw 142 | After cloning Macaw, use the following commands for installation: 143 | ``` 144 | cd macaw 145 | sudo pip3 install -r requirements.txt 146 | sudo python3 setup.py install 147 | ``` 148 | 149 | ## Running Macaw 150 | If you run Macaw in interactive (or live) mode, you should first start the MongoDB server using the following command: 151 | ``` 152 | sudo mongod 153 | ``` 154 | Note that this command uses the default database directory (`/data/db`) for storing the data. You may need to create 155 | this directory if it does not exist yet. You can also use another location via the `--dbpath` argument. 156 | 157 | 158 | We provide three different main scripts (i.e., apps): 159 | + `live_main.py`: An interactive conversational search and question answering system. It can use both STDIO and Telegram 160 | interfaces. 161 | + `batch_exp_main.py`: A main script for running experiments on a reusable dataset. This main script uses FILEIO as the 162 | interface. 163 | + `wizard_of_oz_main.py`: A main script for Wizard of Oz experiments. 164 | 165 | After selecting the desired main script, open the python file and provide the required parameters. For example, you need 166 | to provide your Bing subscription key (if using Bing), the path to the Indri index (if using Indri), the Telegram bot token (if 167 | using the Telegram interface), etc. in order to run the `live_main.py` script. You can then run the selected main script 168 | as below: 169 | 170 | ``` 171 | python3 live_main.py 172 | ``` 173 | 174 | 175 | ## Bug Report and Feature Request 176 | For bug reports and feature requests, you can open an issue on GitHub, or send an email to 177 | [Hamed Zamani](http://hamedz.ir) at `hazamani@microsoft.com`. 
178 | 179 | ## Citation 180 | If you found Macaw useful, you can cite the following article: 181 | ``` 182 | Hamed Zamani and Nick Craswell, "Macaw: An Extensible Conversational Information Seeking Platform", arXiv preprint arXiv:1912.08904. 183 | ``` 184 | 185 | bibtex: 186 | ``` 187 | @article{macaw, 188 | title={Macaw: An Extensible Conversational Information Seeking Platform}, 189 | author={Zamani, Hamed and Craswell, Nick}, 190 | journal={arXiv preprint arXiv:1912.08904}, 191 | year={2019}, 192 | } 193 | ``` 194 | 195 | ## License 196 | Macaw is distributed under the **MIT License**. See the `LICENSE` file for more information. 197 | 198 | 199 | ## Contribution 200 | 201 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 202 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 203 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 204 | 205 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 206 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 207 | provided by the bot. You will only need to do this once across all repos using our CLA. 208 | 209 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 210 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 211 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 212 | -------------------------------------------------------------------------------- /macaw/wizard_of_oz_main.py: -------------------------------------------------------------------------------- 1 | """ 2 | The interactive CIS main file. 
3 | 4 | Authors: Hamed Zamani (hazamani@microsoft.com) 5 | """ 6 | 7 | import multiprocessing 8 | 9 | from macaw import interface 10 | from macaw.core import retrieval 11 | from macaw.core.input_handler.action_detection import RequestDispatcher 12 | from macaw.core.interaction_handler.user_requests_db import InteractionDB 13 | from macaw.core.output_handler import naive_output_selection 14 | from macaw.util.logging import Logger 15 | 16 | 17 | class Seeker: 18 | def __init__(self, params): 19 | """ 20 | The constructor for Conversational Question Answering. This is a Conversational application class and is 21 | inherited from the CIS class. 22 | 23 | Args: 24 | params(dict): A dict of parameters. These are mandatory parameters for this class: 'logger' which is an 25 | instance of the util.logging.Logger class. ConvQA requires both a retrieval and machine reading 26 | comprehension engines. Each of them requires some additional parameters. Refer to the corresponding class 27 | for more information on the required parameters. 28 | """ 29 | self.params = params 30 | self.logger = params['logger'] 31 | self.logger.info('Conversational Wirzard of Oz System... starting up...') 32 | self.wizard = None 33 | self.params['live_request_handler'] = self.live_request_handler 34 | 35 | self.interface = interface.get_interface(params) 36 | 37 | self.retrieval = retrieval.get_retrieval_model(params=self.params) 38 | self.request_dispatcher = RequestDispatcher({'retrieval': self.retrieval}) 39 | self.output_selection = naive_output_selection.NaiveOutputProcessing({}) 40 | 41 | def live_request_handler(self, msg): 42 | """ 43 | This function is called for each conversational interaction made by the user. In fact, this function calls the 44 | dispatcher to send the user request to the information seeking components. 45 | 46 | Args: 47 | conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the 48 | user. 
This list is in reverse order, meaning that the first elements is the last interaction made by user. 49 | 50 | Returns: 51 | output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user. 52 | """ 53 | msg_db = InteractionDB(host=self.params['interaction_db_host'], 54 | port=self.params['interaction_db_port'], 55 | dbname=self.params['interaction_db_name']) 56 | msg_db.insert_one(msg) 57 | msg_db.close() 58 | self.logger.info(msg) 59 | # dispatcher_output = self.request_dispatcher.dispatch(conv_list) 60 | # output_msg = self.output_selection.get_output(conv_list, dispatcher_output) 61 | 62 | self.wizard.send_msg(msg.text) 63 | 64 | def set_wizard(self, wizard): 65 | self.wizard = wizard 66 | 67 | def send_msg(self, msg_text): 68 | self.interface.send_msg(self.params['user_id'], msg_text) 69 | 70 | def run(self): 71 | """ 72 | This function is called to run the ConvQA system. In live mode, it never stops until the program is killed. 73 | """ 74 | self.interface.run() 75 | 76 | 77 | class Wizard: 78 | def __init__(self, params): 79 | """ 80 | The constructor for Conversational Question Answering. This is a Conversational application class and is 81 | inherited from the CIS class. 82 | 83 | Args: 84 | params(dict): A dict of parameters. These are mandatory parameters for this class: 'logger' which is an 85 | instance of the util.logging.Logger class. ConvQA requires both a retrieval and machine reading 86 | comprehension engines. Each of them requires some additional parameters. Refer to the corresponding class 87 | for more information on the required parameters. 88 | """ 89 | self.params = params 90 | self.logger = params['logger'] 91 | self.logger.info('Conversational Wirzard of Oz System... 
starting up...') 92 | self.params['live_request_handler'] = self.live_request_handler 93 | self.seeker = None 94 | 95 | self.interface = interface.get_interface(params) 96 | 97 | self.retrieval = retrieval.get_retrieval_model(params=self.params) 98 | self.request_dispatcher = RequestDispatcher({'retrieval': self.retrieval}) 99 | self.output_selection = naive_output_selection.NaiveOutputProcessing({}) 100 | 101 | def live_request_handler(self, msg): 102 | """ 103 | This function is called for each conversational interaction made by the user. In fact, this function calls the 104 | dispatcher to send the user request to the information seeking components. 105 | 106 | Args: 107 | conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the 108 | user. This list is in reverse order, meaning that the first elements is the last interaction made by user. 109 | 110 | Returns: 111 | output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user. 
112 | """ 113 | msg_db = InteractionDB(host=self.params['interaction_db_host'], 114 | port=self.params['interaction_db_port'], 115 | dbname=self.params['interaction_db_name']) 116 | msg_db.insert_one(msg) 117 | self.logger.info(msg) 118 | 119 | if msg.text.startswith('@seeker'): 120 | self.seeker.send_msg(msg.text[7:].strip()) 121 | output_msg = None 122 | elif msg.text.startswith('@system'): 123 | msg.text = msg.text[7:].strip() 124 | dispatcher_output = self.request_dispatcher.dispatch([msg]) 125 | output_msg = self.output_selection.get_output([msg], dispatcher_output) 126 | msg_db.insert_one(output_msg) 127 | elif msg.text.startswith('@logger'): 128 | msg_db.close() 129 | output_msg = None 130 | else: 131 | self.send_msg('The message should starts with @system, @seeker, or @logger') 132 | output_msg = None 133 | 134 | msg_db.close() 135 | return output_msg 136 | 137 | def set_seeker(self, seeker): 138 | self.seeker = seeker 139 | 140 | def send_msg(self, msg_text): 141 | self.interface.send_msg(self.params['user_id'], msg_text) 142 | 143 | def run(self): 144 | """ 145 | This function is called to run the ConvQA system. In live mode, it never stops until the program is killed. 146 | """ 147 | self.interface.run() 148 | 149 | 150 | if __name__ == '__main__': 151 | basic_params = {'timeout': 15, # timeout is in terms of second. 152 | 'mode': 'live', # mode can be either live or exp. 153 | 'logger': Logger({})} # for logging into file, pass the filepath to the Logger class. 154 | 155 | # These are required database parameters if the mode is 'live'. The host and port of the machine hosting the 156 | # database, as well as the database name. 157 | db_params = {'interaction_db_host': 'localhost', 158 | 'interaction_db_port': 27017, 159 | 'interaction_db_name': 'macaw_test'} 160 | 161 | # These are interface parameters. They are interface specific. 162 | seeker_interface_params = {'interface': 'telegram', # interface can be 'telegram' or 'stdio'. 
163 | 'bot_token': 'YOUR_TELECGRAM_BOT_TOKEN_FOR_SEEKER', # Telegram bot token. 164 | 'asr_model': 'google', # The API used for speech recognition. 165 | 'asg_model': 'google', # The API used for speech generation. 166 | 'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE', 167 | 'user_id': 'TELEGRAM_USER_ID_FOR_SEEKER'} 168 | 169 | wizard_interface_params = {'interface': 'telegram', # interface can be 'telegram' or 'stdio'. 170 | 'bot_token': 'YOUR_TELECGRAM_BOT_TOKEN_FOR_WIZARD', # Telegram bot token. 171 | 'asr_model': 'google', # The API used for speech recognition. 172 | 'asg_model': 'google', # The API used for speech generation. 173 | 'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE', 174 | 'user_id': 'TELEGRAM_USER_ID_FOR_WIZARD'} 175 | 176 | # These are parameters used by the retrieval model. 177 | retrieval_params = {'query_generation': 'simple', # the model that generates a query from a conversation history. 178 | 'use_coref': False, # True, if query generator can use coreference resolution, otherwise False. 179 | 'search_engine': 'indri', # the search engine. It can be either 'indri' or 'bing'. 180 | 'bing_key': 'YOUR_BING_SUBSCRIPTION_KEY', # Bing API key 181 | 'search_engine_path': 'PATH_TO_INDRI', # The path to the indri toolkit. 182 | 'col_index': 'PATH_TO_INDRI_INDEX', # The path to the indri index. 183 | 'col_text_format': 'trectext', # collection text format. Standard 'trectext' is only supported. 184 | 'results_requested': 3} # Maximum number of docs that should be retrieved by search engine. 185 | # Note: If you want to have a re-ranking model (e.g., learning to rank), you just need to simply extend the class 186 | # core.retrieval.search_engine.ReRanker and implement the method 'rerank'. Then simply add a 'reranker' parameter to 187 | # retrieval_params that points to an instance of your favorite ReRanker class. 
If there is a 'reranker' parameter in 188 | # retrieval_params, the retrieval model automatically calls the re-ranking method. For more details, see the method 189 | # 'get_results' in class core.retrieval.search_engine.Retrieval. 190 | 191 | seeker_params = {**basic_params, **db_params, **seeker_interface_params, **retrieval_params} 192 | wizard_params = {**basic_params, **db_params, **wizard_interface_params, **retrieval_params} 193 | basic_params['logger'].info(seeker_params) 194 | basic_params['logger'].info(wizard_params) 195 | 196 | seeker = Seeker(seeker_params) 197 | wizard = Wizard(wizard_params) 198 | seeker.set_wizard(wizard) 199 | wizard.set_seeker(seeker) 200 | 201 | seeker_process = multiprocessing.Process(target=seeker.run) 202 | wizard_process = multiprocessing.Process(target=wizard.run) 203 | 204 | seeker_process.start() 205 | wizard_process.start() 206 | 207 | basic_params['logger'].info('Seeker Process ID: {}'.format(seeker_process.pid)) 208 | basic_params['logger'].info('Wizard Process ID: {}'.format(wizard_process.pid)) 209 | 210 | --------------------------------------------------------------------------------