├── macaw
    ├── __init__.py
    ├── core
    │   ├── __init__.py
    │   ├── input_handler
    │   │   ├── __init__.py
    │   │   ├── actions.py
    │   │   └── action_detection.py
    │   ├── output_handler
    │   │   ├── __init__.py
    │   │   ├── output_selection.py
    │   │   └── naive_output_selection.py
    │   ├── interaction_handler
    │   │   ├── __init__.py
    │   │   ├── user_requests_db.py
    │   │   └── msg.py
    │   ├── mrc
    │   │   ├── __init__.py
    │   │   └── drqa_mrc.py
    │   └── retrieval
    │   │   ├── __init__.py
    │   │   ├── search_engine.py
    │   │   ├── doc.py
    │   │   ├── bing_api.py
    │   │   ├── indri.py
    │   │   └── query_generation.py
    ├── interface
    │   ├── interface.py
    │   ├── __init__.py
    │   ├── stdio.py
    │   ├── fileio.py
    │   ├── speech_recognition.py
    │   └── telegram.py
    ├── util
    │   ├── logging.py
    │   ├── __init__.py
    │   └── text_parser.py
    ├── batch_exp_main.py
    ├── cis.py
    ├── live_main.py
    └── wizard_of_oz_main.py
├── .idea
    ├── .gitignore
    ├── vcs.xml
    ├── inspectionProfiles
    │   └── profiles_settings.xml
    ├── modules.xml
    ├── misc.xml
    └── macaw.iml
├── data
    ├── example_qa_output.txt
    ├── example_retrieval_input.txt
    ├── example_qa_input.txt
    └── example_retrieval_output.txt
├── macaw-arch.jpg
├── macaw-example-tax.jpg
├── macaw-example-shakespeare.jpg
├── requirements.txt
├── CODE_OF_CONDUCT.md
├── setup.py
├── LICENSE
├── .gitignore
├── SECURITY.md
└── README.md


/macaw/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/macaw/core/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/macaw/core/input_handler/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/macaw/core/output_handler/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | # Default ignored files
3 | /workspace.xml


--------------------------------------------------------------------------------
/data/example_qa_output.txt:
--------------------------------------------------------------------------------
1 | Q21	2020
2 | Q34	F. Wilfrid Lancaster
3 | 


--------------------------------------------------------------------------------
/macaw-arch.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/macaw/HEAD/macaw-arch.jpg


--------------------------------------------------------------------------------
/macaw-example-tax.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/macaw/HEAD/macaw-example-tax.jpg


--------------------------------------------------------------------------------
/macaw-example-shakespeare.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/macaw/HEAD/macaw-example-shakespeare.jpg


--------------------------------------------------------------------------------
/data/example_retrieval_input.txt:
--------------------------------------------------------------------------------
1 | 123	lung cancer symptoms    treatments
2 | 267	information retrieval   IR tutorials
3 | 


--------------------------------------------------------------------------------
/macaw/core/interaction_handler/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The interaction handler module init.
3 | 
4 | Authors: Hamed Zamani (hazamani@microsoft.com)
5 | """


--------------------------------------------------------------------------------
/data/example_qa_input.txt:
--------------------------------------------------------------------------------
1 | Q21	who is the president of the united states?	when is the next presidential election?
2 | Q34	who is the father of information retrieval research?
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | func_timeout==4.3.5
2 | pymongo==3.9.0
3 | justext==2.2.0
4 | SpeechRecognition
5 | pydub
6 | python-telegram-bot==12.0.0
7 | stanfordcorenlp
8 | google-cloud-texttospeech
9 | 


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="USE_PROJECT_PROFILE" value="false" />
4 |     <version value="1.0" />
5 |   </settings>
6 | </component>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/macaw.iml" filepath="$PROJECT_DIR$/.idea/macaw.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="3.6 @ Ubuntu" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/data/example_retrieval_output.txt:
--------------------------------------------------------------------------------
1 | 123	Q0	LA040190-0030	1	-10.726514210775541	macaw
2 | 123	Q0	FR940831-2-00098	2	-10.781756119063719	macaw
3 | 123	Q0	FR941028-2-00076	3	-10.803175601098255	macaw
4 | 267	Q0	FBIS4-20702	1	-4.786052863876007	macaw
5 | 267	Q0	FBIS4-20699	2	-4.849902126042474	macaw
6 | 267	Q0	FBIS4-20701	3	-5.229284984955124	macaw
7 | 


--------------------------------------------------------------------------------
/macaw/interface/interface.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The abstract interface class.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from abc import ABC, abstractmethod
 8 | 
 9 | 
class Interface(ABC):
    """
    The base class that every Macaw user interface extends.

    Args:
        params(dict): A dict of parameters. It is stored unchanged on the instance as 'params'.
    """

    def __init__(self, params):
        self.params = params

    @abstractmethod
    def run(self):
        """
        Start the interface. Every concrete interface has to override this method.
        """

    @abstractmethod
    def result_presentation(self, response_msg, params):
        """
        Present a response to the user. Every concrete interface has to override this method.

        Args:
            response_msg: The response to be presented.
            params: Additional parameters for presenting the response.
        """


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |     name='macaw',
 5 |     version='0.1',
 6 |     packages=['macaw', 'macaw.core', 'macaw.core.mrc', 'macaw.core.retrieval', 'macaw.core.input_handler',
 7 |               'macaw.core.output_handler', 'macaw.core.interaction_handler', 'macaw.util', 'macaw.interface'],
 8 |     url='https://github.com/microsoft/macaw/',
 9 |     license='MIT',
10 |     author='Hamed Zamani',
11 |     author_email='hazamani@microsoft.com',
12 |     description='An extensible framework for conversational information seeking research'
13 | )
14 | 


--------------------------------------------------------------------------------
/.idea/macaw.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$" />
 5 |     <orderEntry type="jdk" jdkName="3.6 @ Ubuntu" jdkType="Python SDK" />
 6 |     <orderEntry type="sourceFolder" forTests="false" />
 7 |   </component>
 8 |   <component name="PackageRequirementsSettings">
 9 |     <option name="requirementsPath" value="E:\msft-repos\macaw\requirements.txt" />
10 |   </component>
11 |   <component name="PyDocumentationSettings">
12 |     <option name="myDocStringFormat" value="Google" />
13 |   </component>
14 |   <component name="TestRunnerService">
15 |     <option name="PROJECT_TEST_RUNNER" value="Unittests" />
16 |   </component>
17 | </module>


--------------------------------------------------------------------------------
/macaw/core/mrc/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The MRC module init.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from macaw.core.mrc import drqa_mrc
 8 | 
 9 | 
def get_mrc_model(params):
    """
    Build the machine reading comprehension (MRC) model selected in the parameter dict.

    Args:
        params(dict): A dict of parameters. The parameters 'logger' and 'mrc' are required in this method. The only
        supported value for 'mrc' is 'drqa'. The whole dict is passed on to the created model.

    Returns:
        An MRC object for machine reading comprehension.

    Raises:
        Exception: If the requested MRC model is not 'drqa'.
    """
    params['logger'].info('The MRC model for QA: ' + params['mrc'])
    # Reject unknown models up front, so that the supported case is the straight-line path.
    if params['mrc'] != 'drqa':
        raise Exception('The requested MRC model does not exist!')
    return drqa_mrc.DrQA(params)
25 | 
26 | 


--------------------------------------------------------------------------------
/macaw/interface/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The interface module init.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from macaw.interface import speech_recognition, telegram, stdio, fileio
 8 | 
 9 | 
def get_interface(params):
    """
    Build the user interface selected in the parameter dict.

    Args:
        params(dict): A dict of parameters. 'interface' is required and should be one of 'telegram', 'stdio', or
        'fileio'. If 'asr_model' is 'google', a GoogleASR object is stored in params['asr']. If 'asg_model' is
        'google', a GoogleText2Speech object is stored in params['asg']. Both are created before the interface.

    Returns:
        The interface object created for the requested interface.

    Raises:
        Exception: If the requested interface is not one of the supported ones.
    """
    # The optional speech components are added to the same dict that is handed to the interface.
    if params.get('asr_model') == 'google':
        params['asr'] = speech_recognition.GoogleASR(params)
    if params.get('asg_model') == 'google':
        params['asg'] = speech_recognition.GoogleText2Speech(params)

    requested = params['interface']
    if requested == 'telegram':
        return telegram.TelegramBot(params)
    if requested == 'stdio':
        return stdio.StdioInterface(params)
    if requested == 'fileio':
        return fileio.FileioInterface(params)
    raise Exception('The requested interface does not exist!')


--------------------------------------------------------------------------------
/macaw/util/logging.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The internal logger.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | import logging
 8 | 
 9 | 
class Logger(logging.Logger):
    """
    A simple logging class, inherited from the standard logging.Logger and named 'Macaw Logger'.

    Args:
        params(dict): A dict containing some parameters. 'logging_file' is optional: when it is given, the log
        records are written to that file, otherwise a default logging.StreamHandler() is used.
    """

    def __init__(self, params):
        super().__init__('Macaw Logger')
        self.params = params

        write_to_file = 'logging_file' in params
        self.handler_ = logging.FileHandler(params['logging_file']) if write_to_file else logging.StreamHandler()

        # Every record carries the logger name, the time, the level, and the message.
        self.format = logging.Formatter('%(name)s - %(asctime)s - %(levelname)s - %(message)s')
        self.handler_.setFormatter(self.format)
        self.addHandler(self.handler_)
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/macaw/core/output_handler/output_selection.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The output post processing unit.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from abc import ABC, abstractmethod
 8 | 
 9 | 
class OutputProcessing(ABC):
    """
    The abstract post-processing unit that produces the response message.
    """

    @abstractmethod
    def __init__(self, params):
        """
        Store the parameters of the post-processing unit.

        Args:
            params(dict): A dict of parameters.
        """
        self.params = params

    @abstractmethod
    def get_output(self, conv, candidate_outputs):
        """
        Generate the response message. Every concrete output processing unit has to override this method.

        Args:
            conv(list): List of Message objects, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by the
            user.
            candidate_outputs(dict): A dict of str (i.e., action) to list of Documents (i.e., the action's result) as
            the response. This dict is produced by the action dispatcher, which means it is the aggregation of all
            the executed actions.

        Returns:
            A response Message to be sent to the user.
        """
37 | 


--------------------------------------------------------------------------------
/macaw/util/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Some util functions.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | import json
 8 | import time
 9 | 
10 | from stanfordcorenlp import StanfordCoreNLP
11 | 
12 | 
def current_time_in_milliseconds():
    """
    Get the current time, rounded to the nearest millisecond.

    Returns:
        An int representing the current time in milliseconds.
    """
    millis = time.time() * 1000
    return int(round(millis))
21 | 
22 | 
class NLPUtil:
    """
    A simple NLP helper class built on top of Stanford CoreNLP.

    Args:
        params(dict): A dict containing some parameters. 'corenlp_path' is required and is passed to
        StanfordCoreNLP.
    """

    def __init__(self, params):
        self.params = params
        self.corenlp = StanfordCoreNLP(self.params['corenlp_path'], quiet=False)

        # Pre-fetching the required models by annotating an empty text.
        self.corenlp.annotate('', properties=self._coref_properties())

    @staticmethod
    def _coref_properties():
        """
        Build the annotation properties used for co-reference resolution.

        Returns:
            A new dict of properties for each call.
        """
        return {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False}

    def get_coref(self, text):
        """
        Run co-reference resolution on the input text.

        Args:
            text(str): It can be the concatenation of all conversation history.

        Returns:
            A json object containing all co-reference resolutions extracted from the input text.
        """
        annotation = self.corenlp.annotate(text, properties=self._coref_properties())
        return json.loads(annotation)
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/macaw/core/interaction_handler/user_requests_db.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The conversation (or interaction) database implemented using MongoDB.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from pymongo import MongoClient
 8 | 
 9 | from macaw import util
10 | from macaw.core.interaction_handler.msg import Message
11 | 
12 | 
class InteractionDB:
    """
    The conversation (or interaction) database. Messages are stored in the 'macaw_msgs' collection.

    Args:
        host: The MongoDB host, passed to MongoClient.
        port: The MongoDB port, passed to MongoClient.
        dbname(str): The name of the database that holds the messages.
    """

    def __init__(self, host, port, dbname):
        self.client = MongoClient(host, port)
        self.db = self.client[dbname]
        self.col = self.db['macaw_msgs']

    def insert_one(self, msg):
        """
        Store one message in the database.

        Args:
            msg: The message to be stored. Its attribute dict is the inserted document.

        Raises:
            Exception: If the user_id, text, timestamp, or user_interface of the message is None.
        """
        mandatory = ('user_id', 'text', 'timestamp', 'user_interface')
        if any(getattr(msg, field) is None for field in mandatory):
            raise Exception('Each message should include a user_interface, user_id, text, and timestamp.')
        # NOTE(review): the attribute dict itself (not a copy) is inserted. PyMongo presumably adds an '_id' key to
        # it, which would then show up as an attribute of msg - confirm this is intended.
        self.col.insert_one(vars(msg))

    def get_all(self):
        """
        Get every message stored in the collection.

        Returns:
            A list of Message objects.
        """
        print('Using get_all is only recommended for development purposes. It is not efficient!')
        everything = self.col.find({})
        return self.dict_list_to_msg_list(everything)

    def get_conv_history(self, user_id, max_time, max_count):
        """
        Get the messages of a user, sorted from the most recent one to the oldest one.

        Args:
            user_id: The user ID whose messages are requested.
            max_time: If not None, only the messages whose timestamp is greater than the current time in
            milliseconds minus max_time are returned.
            max_count: If not None, at most this many messages are returned.

        Returns:
            A list of Message objects.
        """
        query = {'user_id': user_id}
        if max_time is not None:
            query['timestamp'] = {'$gt': util.current_time_in_milliseconds() - max_time}

        cursor = self.col.find(query).sort([('timestamp', -1)])
        if max_count is not None:
            cursor = cursor.limit(max_count)
        return self.dict_list_to_msg_list(cursor)

    def close(self):
        """
        Close the database client.
        """
        self.client.close()

    @staticmethod
    def dict_list_to_msg_list(msg_dict_list):
        """
        Convert an iterable of message dicts to Message objects.

        Args:
            msg_dict_list: An iterable of dicts, each one describing a message.

        Returns:
            A list of Message objects, in the same order as the input.
        """
        return list(map(Message.from_dict, msg_dict_list))
45 | 
46 | 
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/macaw/core/interaction_handler/msg.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The message used to represent each interaction in Macaw.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | 
 8 | class Message:
 9 |     def __init__(self, user_interface, user_id, user_info, msg_info, text, timestamp):
10 |         """
11 |         An object for input and output Message.
12 | 
13 |         Args:
14 |             user_interface(str): The interface name used for this message (e.g., 'telegram')
15 |             user_id(str or int): The user ID.
16 |             user_info(dict): The dict containing some more information about the user.
17 |             msg_info(dict): The dict containing some more information about the message.
18 |             text(str): The message text.
19 |             timestamp(int): The timestamp of message in milliseconds.
20 |         """
21 |         self.user_id = user_id
22 |         self.user_info = user_info
23 |         self.msg_info = msg_info
24 |         self.text = text
25 |         self.timestamp = timestamp
26 |         self.user_interface = user_interface
27 | 
28 |     @classmethod
29 |     def from_dict(cls, msg_dict):
30 |         """
31 |         Get a Message object from dict.
32 |         Args:
33 |             msg_dict(dict): A dict containing all the information required to construct a Message object.
34 | 
35 |         Returns:
36 |             A Message object.
37 |         """
38 |         user_interface = msg_dict['user_interface'] if 'user_interface' in msg_dict else None
39 |         user_id = msg_dict['user_id'] if 'user_id' in msg_dict else None
40 |         user_info = msg_dict['user_info'] if 'user_info' in msg_dict else None
41 |         msg_info = msg_dict['msg_info'] if 'msg_info' in msg_dict else None
42 |         text = msg_dict['text'] if 'text' in msg_dict else None
43 |         timestamp = msg_dict['timestamp'] if 'timestamp' in msg_dict else None
44 |         return cls(user_interface, user_id, user_info, msg_info, text, timestamp)


--------------------------------------------------------------------------------
/macaw/core/retrieval/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The retrieval module init.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | import macaw.core.retrieval.bing_api
 7 | import macaw.core.retrieval.indri
 8 | from macaw.core.retrieval import search_engine, query_generation
 9 | 
10 | 
def get_retrieval_model(params):
    """
    Build the retrieval model requested in the parameter dict.

    Args:
        params(dict): A dict of parameters. The parameters 'logger', 'query_generation', and 'search_engine' are
        required in this method. The only supported value for 'query_generation' is 'simple'. Macaw currently
        serves two search engines, selected by 'search_engine':
            'indri' (http://lemurproject.org/indri.php): useful for retrieving results from your own document
            collection. It requires 'search_engine_path', 'col_index', 'col_text_format', and 'results_requested'.
            'bing': the Microsoft Bing Web search API. It requires 'bing_key' and 'results_requested'.

    Returns:
        A Retrieval object for document retrieval.

    Raises:
        Exception: If the requested query generation model or the requested search engine does not exist.
    """
    logger = params['logger']

    logger.info('The query generation model for retrieval: ' + params['query_generation'])
    if params['query_generation'] != 'simple':
        raise Exception('The requested query generation model does not exist!')
    q_generation = query_generation.SimpleQueryGeneration(params)

    logger.info('The search engine for retrieval: ' + params['search_engine'])
    engine = params['search_engine']

    if engine == 'indri':
        indri_cls = macaw.core.retrieval.indri.Indri
        # The search engine gets its own parameter dict with the names it expects.
        indri_params = {'query_generation': q_generation,
                        'indri_path': params['search_engine_path'],
                        'index': params['col_index'],
                        'text_format': params['col_text_format'],
                        'results_requested': params['results_requested'],
                        'logger': params['logger']}
        return indri_cls(indri_params)

    if engine == 'bing':
        bing_cls = macaw.core.retrieval.bing_api.BingWebSearch
        bing_params = {'query_generation': q_generation,
                       'bing_key': params['bing_key'],
                       'results_requested': params['results_requested'],
                       'logger': params['logger']}
        return bing_cls(bing_params)

    raise Exception('The requested retrieval model does not exist!')


--------------------------------------------------------------------------------
/macaw/batch_exp_main.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The main file for an experimental CIS with batch interaction support.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from macaw.cis import CIS
 8 | from macaw.core import mrc, retrieval
 9 | from macaw.core.input_handler.action_detection import RequestDispatcher
10 | from macaw.core.output_handler import naive_output_selection
11 | 
12 | 
class ConvSearch(CIS):
    """
    An experimental conversational search system with batch interaction support.

    Args:
        params(dict): A dict of parameters. It is passed to the CIS base class, and the retrieval and MRC models
        are created from self.params (presumably set by CIS - verify against macaw.cis).
    """

    def __init__(self, params):
        super().__init__(params)
        self.retrieval = retrieval.get_retrieval_model(params=self.params)
        self.qa = mrc.get_mrc_model(params=self.params)

        # The dispatcher gets access to both the retrieval and the QA models.
        dispatcher_params = {'retrieval': self.retrieval, 'qa': self.qa}
        self.request_dispatcher = RequestDispatcher(dispatcher_params)
        self.output_selection = naive_output_selection.NaiveOutputProcessing({})

    def request_handler_func(self, conv_list):
        """
        Produce the response for a conversation.

        Args:
            conv_list: The conversation, passed to both the request dispatcher and the output selection.

        Returns:
            The output message chosen by the output selection unit.
        """
        # Identify and run the actions, then choose the response among their outputs.
        candidate_outputs = self.request_dispatcher.dispatch(conv_list)
        return self.output_selection.get_output(conv_list, candidate_outputs)

    def run(self):
        """
        Run the interface (self.interface is presumably created by CIS - verify against macaw.cis).
        """
        self.interface.run()
30 | 
31 | 
if __name__ == '__main__':
    params = {
        # Basic parameters.
        'timeout': -1,  # timeout is in seconds.
        'mode': 'exp',  # mode can be either live or exp.

        # Interface parameters.
        'interface': 'fileio',
        'input_file_path': 'INPUT_FILE',
        'output_file_path': 'OUTPUT_FILE',
        'output_format': 'text',

        # Retrieval parameters.
        'query_generation': 'simple',
        'search_engine': 'bing',  # 'bing' or 'indri'
        'use_coref': True,  # True, if the query generator can use coreference resolution, otherwise False.
        'bing_key': 'YOUR_BING_SUBSCRIPTION_TOKEN',  # only for Bing Web Search
        'search_engine_path': 'PATH_TO_INDRI',  # only for Indri
        'col_index': 'PATH_TO_INDRI_INDEX',  # only for Indri
        'col_text_format': 'trectext',  # trectext or trecweb. Only for Indri.
        'results_requested': 3,

        # MRC parameters.
        'mrc': 'drqa',
        'mrc_model_path': 'PATH_TO_PRETRAINED_MRC_MODEL',
        'mrc_path': 'PATH_TO_MRC_DIRECTORY',
        'corenlp_path': 'PATH_TO_STANFORD_CORE_NLP_DIRECTORY',
        'qa_results_requested': 3,
    }
    ConvSearch(params).run()


--------------------------------------------------------------------------------
/macaw/interface/stdio.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The STDIO interface for interactive CIS.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | import time
 8 | import traceback
 9 | 
10 | from macaw import util
11 | from macaw.interface.interface import Interface
12 | from macaw.core.interaction_handler.msg import Message
13 | 
14 | 
class StdioInterface(Interface):
    """
    The interface that reads the requests from the standard input and prints the responses.

    Args:
        params(dict): A dict of parameters. 'live_request_handler' is required by run(): it is called with each
        input Message and its return value is presented as the response.
    """

    def __init__(self, params):
        super().__init__(params)
        # Message IDs start from the current time in seconds and grow by one per request.
        self.msg_id = int(time.time())

    def run(self):
        """
        Read the requests one by one and present the response of each, until the input is exhausted.

        Empty requests are skipped. A request starting with '#' is marked as a 'command', anything else as
        'text'. An error raised while handling a request is printed and the loop moves on to the next request.
        """
        while True:
            try:
                request = input('ENTER YOUR COMMAND: ').strip()
                if len(request) == 0:
                    continue
                user_info = {'first_name': 'STDIO',
                             'is_bot': 'False'
                             }
                msg_info = {'msg_id': self.msg_id,
                            'msg_type': 'command' if request.startswith('#') else 'text',
                            'msg_source': 'user'}
                self.msg_id += 1
                msg = Message(user_interface='stdio',
                              user_id=-1,
                              user_info=user_info,
                              msg_info=msg_info,
                              text=request,
                              timestamp=util.current_time_in_milliseconds())
                output = self.params['live_request_handler'](msg)
                self.result_presentation(output, {})
            except EOFError:
                # The input is closed, so no more requests can arrive. Without this, the generic handler below
                # would catch the error on every iteration and the loop would never end.
                break
            except Exception:
                traceback.print_exc()

    def result_presentation(self, response_msg, params):
        """
        Print the response between a starting and an ending banner.

        Args:
            response_msg: The response Message. Its msg_info['msg_type'] should be 'text', 'options', or 'error'.
            For 'options', msg_info['options'] holds (option_text, option_data, output_score) tuples.
            params(dict): Not used by this interface.
        """
        try:
            print('THE RESPONSE STARTS')
            print('----------------------------------------------------------------------')
            msg_type = response_msg.msg_info['msg_type']
            if msg_type == 'text':
                print(response_msg.text)
            elif msg_type == 'options':
                for (option_text, option_data, output_score) in response_msg.msg_info['options']:
                    print(option_data, ' | ', option_text)
            elif msg_type == 'error':
                print('ERROR: NO RESULT!')
            else:
                raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type'])
            print('----------------------------------------------------------------------')
            print('THE RESPONSE ENDS')
        except Exception:
            # Presentation problems are reported, but they should not stop the interface.
            traceback.print_exc()
61 | 
62 | 


--------------------------------------------------------------------------------
/macaw/core/retrieval/search_engine.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Abstract classes for retrieval and ranking models.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from abc import ABC, abstractmethod
 8 | 
 9 | 
10 | class Retrieval(ABC):
11 | 	@abstractmethod
12 | 	def __init__(self, params):
13 | 		"""
14 | 		An abstract class for retrieval models.
15 | 
16 | 		Args:
17 | 			params(dict): A dict containing some mandatory and optional parameters. 'query_generation' and 'logger' are
18 | 			required for all retrieval models.
19 | 		"""
20 | 		self.params = params
21 | 		self.query_generation = self.params['query_generation']
22 | 
23 | 	@abstractmethod
24 | 	def retrieve(self, query):
25 | 		"""
26 | 		This method should retrieve documents for the given query.
27 | 
28 | 		Args:
29 | 			query(str): The query string.
30 | 		"""
31 | 		pass
32 | 
33 | 	def get_results(self, conv_list):
34 | 		"""
35 | 		This method is the one that should be called. It simply calls the query generation model to generate a query
36 | 		from a conversation list and then runs the retrieval model and returns the results.
37 | 		Args:
38 | 			conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
39 | 			user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
40 | 
41 | 		Returns:
42 | 			A list of Documents retrieved by the search engine.
43 | 		"""
44 | 		query = self.query_generation.get_query(conv_list)
45 | 		self.params['logger'].info('New query: ' + query)
46 | 		result_list = self.retrieve(query)
47 | 		if 'reranker' in self.params:
48 | 			return self.params['reranker'].rerank(query, conv_list, result_list, self.params)
49 | 		return result_list
50 | 
51 | 
52 | class ReRanker(ABC):
53 | 	@abstractmethod
54 | 	def __init__(self, params):
55 | 		"""
56 | 		This is an abstract class for a re-ranking model, e.g., learning to rank models.
57 | 
58 | 		Args:
59 | 			params(dict): A dict containing some mandatory and optional parameters, such as the hyper-parameters for the
60 | 			re-ranking model.
61 | 		"""
62 | 		self.params = params
63 | 
64 | 	def rerank(self, query, conv_list, result_list, params):
65 | 		"""
66 | 		This method is called for re-ranking the result_list in response to the query.
67 | 
68 | 		Args:
69 | 			query(str): A query generated by a query generation model
70 | 			conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
71 | 			user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
72 | 			result_list(list): A list of Documents retrieved by a first stage retrieval model.
73 | 			params(dict): A dict containing some parameters required by the re-ranking model.
74 | 
75 | 		Returns:
76 | 			A list of Documents. This list contains a subset of result_list with the highest re-ranking scores.
77 | 		"""
78 | 		pass
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.3 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 


--------------------------------------------------------------------------------
/macaw/interface/fileio.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The FileIO interface (for experimental batch interactions).
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | import time
 8 | 
 9 | from macaw.interface.interface import Interface
10 | from macaw.core.interaction_handler.msg import Message
11 | 
12 | 
13 | class FileioInterface(Interface):
14 |     def __init__(self, params):
15 |         super().__init__(params)
16 |         self.msg_id = int(time.time())
17 | 
18 |     def run(self):
19 |         output_file = open(self.params['output_file_path'], 'w+')
20 |         with open(self.params['input_file_path']) as input_file:
21 |             for line in input_file:
22 |                 str_list = line.strip().split('\t')
23 |                 if len(str_list) < 2:
24 |                     raise Exception('Each input line should contain at least 2 elements: a query ID and a query text.')
25 |                 qid = str_list[0]
26 | 
27 |                 conv_list = []
28 |                 for i in range(1, len(str_list)):
29 |                     user_info = {'first_name': 'NONE'}
30 |                     msg_info = {'msg_id': qid,
31 |                                 'msg_type': 'text',
32 |                                 'msg_source': 'user'}
33 |                     msg = Message(user_interface='NONE',
34 |                                   user_id=-1,
35 |                                   user_info=user_info,
36 |                                   msg_info=msg_info,
37 |                                   text=str_list[i],
38 |                                   timestamp=-1)
39 |                     conv_list.append(msg)
40 |                 conv_list.reverse()
41 |                 output_msg = self.params['experimental_request_handler'](conv_list)
42 |                 self.result_presentation(output_msg, {'output_file': output_file, 'qid': qid})
43 |         output_file.close()
44 | 
45 |     def result_presentation(self, output_msg, params):
46 |         qid = params['qid']
47 |         output_file = params['output_file']
48 |         if self.params['output_format'] == 'trec':
49 |             if output_msg.msg_info['msg_type'] == 'options':
50 |                 for (i, (option_name, option_id, output_score)) in enumerate(output_msg.msg_info['options']):
51 |                     output_file.write(qid + '\tQ0\t' + option_name + '\t' + str(i+1) + '\t' + str(output_score) + '\tmacaw\n')
52 |             else:
53 |                 raise Exception('TREC output format is only recognized for retrieval results. '
54 |                                 'Therefore, the message type should be options.')
55 |         elif self.params['output_format'] == 'text':
56 |             if output_msg.msg_info['msg_type'] == 'text':
57 |                 output_file.write(qid + '\t' + output_msg.text.replace('\n', ' ').replace('\t', ' ') + '\n')
58 |             else:
59 |                 raise Exception('text output format is only recognized for text outputs.')
60 |         else:
61 |             raise Exception('Unknown output file format!')
62 | 


--------------------------------------------------------------------------------
/macaw/core/retrieval/doc.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The document class and some util functions useful for retrieval result list.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | import re
 8 | 
 9 | import justext
10 | 
11 | 
12 | class Document:
13 |     def __init__(self, id, title, text, score):
14 |         """
15 |             A simple class representing a document for retrieval.
16 |         Args:
17 |             id(str): Document ID.
18 |             title(str): Document title (if any).
19 |             text(str): Document content.
20 |             score(float): The retrieval score.
21 |         """
22 |         self.id = id
23 |         self.title = title
24 |         self.text = text
25 |         self.score = score
26 | 
27 | 
28 | def get_recursive_content_as_str(doc):
29 |     """
30 |     THIS METHOD IS DEPRECATED!
31 |     """
32 |     text = ''
33 |     if isinstance(doc, str):
34 |         return doc.strip() + '\n'
35 |     elif isinstance(doc, dict):
36 |         for key in doc:
37 |             text += get_recursive_content_as_str(doc[key])
38 |     elif isinstance(doc, list):
39 |         for t in doc:
40 |             text += get_recursive_content_as_str(t)
41 |     else:
42 |         raise Exception('cannot parse document recursively, ' + str(type(doc)))
43 |     return text
44 | 
45 | 
46 | # def get_trec_doc(doc):
47 | #     doc_dict = xml_text_to_dict(doc)
48 | #     id = doc_dict['DOCNO']
49 | #     title = None
50 | #     text = get_recursive_content_as_str(doc_dict['TEXT'])
51 | #     return Document(id, title, text, 0)
52 | 
53 | 
54 | def get_trec_doc(trec_doc, format='trectext'):
55 |     """
56 |     This method returns a Document given a standard trectext or trecweb document. NOTE: There are much better parsers
57 |     for TREC documents.
58 |     Args:
59 |         trec_doc(str): The document content with the trectext or trecweb format.
60 |         format(str): The document format. Either 'trectext' or 'trecweb'. The default value is 'trectext'.
61 | 
62 |     Returns:
63 |         An instance of Document. Note that the score is assigned to 0 and should be set later.
64 |     """
65 |     trec_doc_lower = trec_doc.lower()
66 |     id = trec_doc[trec_doc_lower.find('<docno>') + len('<docno>'):trec_doc_lower.find('</docno>')].strip()
67 |     title = id  # for some presentation reasons, the title of document is set to ids ID.
68 |     if format == 'trectext':
69 |         text = trec_doc[trec_doc_lower.find('<text>') + len('<text>'):trec_doc_lower.find('</text>')]
70 |     elif format == 'trecweb':
71 |         text = trec_doc[trec_doc_lower.find('<body>') + len('<body>'):trec_doc_lower.find('</body>')]
72 |     else:
73 |         raise Exception('Undefined TREC document format. Supported document formats are trectext and trecweb')
74 |     text = re.sub('\s+', ' ', text).strip()  # removing multiple consecutive whitespaces
75 | 
76 |     # Removing other tags in the text, e.g., <p>.
77 |     clean_text_list = []
78 |     paragraphs = justext.justext(text, justext.get_stoplist("English"))
79 |     for paragraph in paragraphs:
80 |         if not paragraph.is_boilerplate:
81 |             clean_text_list.append(paragraph.text)
82 | 
83 |     return Document(id, title, '\n'.join(clean_text_list), 0.)
84 | 


--------------------------------------------------------------------------------
/macaw/core/retrieval/bing_api.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The Bing Web Search API retrieval model.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | import requests
 8 | 
 9 | from macaw.core.retrieval.doc import Document
10 | from macaw.core.retrieval.search_engine import Retrieval
11 | from macaw.util.text_parser import html_to_clean_text
12 | 
13 | 
14 | class BingWebSearch(Retrieval):
15 | 	def __init__(self, params):
16 | 		"""
17 | 		The Microsoft Bing Web search API. This class uses the Bing's API to get the retrieval results from the Web.
18 | 		Note that for some reasons, the results returned by the Bing API are usually different from the Bing search
19 | 		(without API).
20 | 
21 | 		Args:
22 | 			params(dict): A dict containing some parameters. Here is the list of all required parameters:
23 | 			'bing_key': The Bing API key.
24 | 			'results_requested': The maximum number of requested documents for retrieval. If not given, it is set to 1.
25 | 			Note that this is limited by the number of results returned by the API.
26 | 		"""
27 | 		super().__init__(params)
28 | 		self.results_requested = self.params['results_requested'] if 'results_requested' in self.params else 1
29 | 		self.subscription_key = self.params['bing_key']
30 | 		self.bing_api_url = 'https://api.cognitive.microsoft.com/bing/v7.0/search'
31 | 		self.header = {"Ocp-Apim-Subscription-Key": self.subscription_key}
32 | 		params['logger'].warning('There is a maximum number of transactions per second for the Bing API.')
33 | 
34 | 	def retrieve(self, query):
35 | 		"""
36 | 		This method retrieve documents in response to the given query.
37 | 
38 | 		Args:
39 | 			query(str): The query string.
40 | 
41 | 		Returns:
42 | 			A list of Documents with the maximum length of the 'results_requested' parameter.
43 | 		"""
44 | 		params = {"q": query, "textDecorations": True, "textFormat": "HTML"}
45 | 		response = requests.get(self.bing_api_url, headers=self.header, params=params)
46 | 		response.raise_for_status()
47 | 		search_results = response.json()
48 | 		results = []
49 | 		for i in range(min(len(search_results['webPages']['value']), self.results_requested)):
50 | 			id = search_results['webPages']['value'][i]['url']
51 | 			title = search_results['webPages']['value'][i]['name']
52 | 			snippet = search_results['webPages']['value'][i]['snippet']
53 | 			headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'}
54 | 			text = html_to_clean_text(requests.get(id, headers=headers).content)
55 | 			score = 10 - i  # this is not a score returned by Bing (just 10 - document rank)
56 | 			results.append(Document(id, title, text, score))
57 | 		return results
58 | 
59 | 	def get_doc_from_index(self, doc_id):
60 | 		"""
61 | 		This method retrieves a document content for a given document id (i.e., URL).
62 | 
63 | 		Args:
64 | 			doc_id(str): The document ID.
65 | 
66 | 		Returns:
67 | 			A Document from the collection whose ID is equal to the given doc_id. For some reasons, the method returns
68 | 			a list of Documents with a length of 1.
69 | 		"""
70 | 		# Telegram has a nice interface for loading websites. Therefore, we decided to only pass the doc_id (URL). This
71 | 		# can be simply enhanced by the title and the content of the document.
72 | 		doc = Document(doc_id, doc_id, doc_id, -1)
73 | 		return [doc]


--------------------------------------------------------------------------------
/macaw/core/retrieval/indri.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The Indri search engine.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | import os
 8 | import subprocess
 9 | 
10 | import pyndri
11 | 
12 | from macaw.core.retrieval.doc import get_trec_doc
13 | from macaw.core.retrieval.search_engine import Retrieval
14 | 
15 | 
16 | class Indri(Retrieval):
17 | 	def __init__(self, params):
18 | 		"""
19 | 		The Indri retrieval model. Indri is an open-source search engine implemented as part of the lemur project by
20 | 		UMass Amherst and CMU. Refer to http://lemurproject.org/indri.php for more information.
21 | 		The retrieval model used here is based on language modeling framework and retrieves documents using the query
22 | 		likelihood retrieval model [Ponte & Croft; SIGIR 1998] and Dirichlet prior smoothing [Zhai and Lafferty; SIGIR
23 | 		2001]. It is implemented using the Pyndri [Van Gysel et al.; ECIR 2017], which is a python interface to Indri.
24 | 		Refer to http://lemurproject.org/indri.php for more information on the Lemur toolkit.
25 | 
26 | 		Args:
27 | 			params(dict): A dict containing some parameters. Here is the list of all required parameters:
28 | 			'indri_path': The path to the installed Indri toolkit.
29 | 			'index': The path to the Indri index constructed from the collection.
30 | 			'results_requested': The maximum number of requested documents for retrieval. If not given, it is set to 1.
31 | 			'text_format': The text format for document collection (e.g., 'trectext').
32 | 			Note that the parameters 'query_generation' and 'logger' are required by the parent class.
33 | 		"""
34 | 		super().__init__(params)
35 | 		self.results_requested = self.params['results_requested'] if 'results_requested' in self.params else 1
36 | 		self.indri_path = self.params['indri_path']
37 | 		self.index = pyndri.Index(self.params['index'])
38 | 		self.term2id, self.id2term, self.id2df = self.index.get_dictionary()
39 | 		self.id2tf = self.index.get_term_frequencies()
40 | 
41 | 	def retrieve(self, query):
42 | 		"""
43 | 		This method retrieve documents in response to the given query.
44 | 
45 | 		Args:
46 | 			query(str): The query string.
47 | 
48 | 		Returns:
49 | 			A list of Documents with the maximum length of the 'results_requested' parameter.
50 | 		"""
51 | 		int_results = self.index.query(query, results_requested=self.results_requested)
52 | 		results = []
53 | 		for int_doc_id, score in int_results:
54 | 			# ext_doc_id, content_term_id = self.index.document(int_doc_id)
55 | 			# index_content = [self.id2term[term_id] if term_id> 0 else 'UNK' for term_id in content_term_id]
56 | 			doc = self.get_doc_from_index(int_doc_id)[0]
57 | 			doc.score = score
58 | 			doc.id = str(int_doc_id)
59 | 			results.append(doc)
60 | 		return results
61 | 
62 | 	def get_doc_from_index(self, doc_id):
63 | 		"""
64 | 		This method retrieves a document content for a given document id.
65 | 
66 | 		Args:
67 | 			doc_id(str): The document ID.
68 | 
69 | 		Returns:
70 | 			A Document from the collection whose ID is equal to the given doc_id. For some reasons, the method returns
71 | 			a list of Documents with a length of 1.
72 | 		"""
73 | 		content = subprocess.run([os.path.join(self.indri_path, 'dumpindex/dumpindex'), self.params['index'],
74 | 								  'dt', str(doc_id)], stdout=subprocess.PIPE).stdout.decode('UTF-8')
75 | 		if self.params['text_format'] == 'trectext':
76 | 			doc = get_trec_doc(content)
77 | 		else:
78 | 			raise Exception('The requested text format is not supported!')
79 | 		return [doc]


--------------------------------------------------------------------------------
/macaw/core/mrc/drqa_mrc.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from abc import ABC, abstractmethod
 4 | 
 5 | import drqa
 6 | from drqa.reader import Predictor
 7 | 
 8 | """
 9 | A wrapper to the DrQA model from FAIR: https://github.com/facebookresearch/DrQA 
10 | 
11 | Authors: Hamed Zamani (hazamani@microsoft.com)
12 | """
13 | 
14 | from macaw.core.retrieval.doc import Document
15 | 
16 | 
17 | class MRC(ABC):
18 |     @abstractmethod
19 |     def __init__(self, params):
20 |         """
21 |         An abstract class for machine reading comprehension models implemented in Macaw.
22 | 
23 |         Args:
24 |             params(dict): A dict containing some mandatory and optional parameters.
25 |         """
26 |         self.params = params
27 | 
28 |     @abstractmethod
29 |     def get_results(self, conv_list, doc):
30 |         """
31 |             This method is called to get the answer(s) to a question.
32 | 
33 |         Args:
34 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
35 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
36 |             doc(Document): A document (core.retrieval.doc.Document) that potentially contains the answer.
37 | 
38 |         Returns:
39 |             The inherited class should implements this method and return a list of Documents each containing a candidate
40 |             answer and its confidence score.
41 |         """
42 |         pass
43 | 
44 | 
45 | class DrQA(MRC):
46 |     def __init__(self, params):
47 |         """
48 |         A machine reading comprehension model based on DrQA (https://github.com/facebookresearch/DrQA).
49 | 
50 |         Args:
51 |             params(dict): A dict of parameters. Required parameters are:
52 |             'mrc_path': The path to the DrQA repository.
53 |             'corenlp_path': The path to the Stanford's corenlp toolkit. DrQA requires corenlp.
54 |             'mrc_model_path': The path to the learned DrQA parameters.
55 |             'qa_results_requested': The maximum number of candidate answers that should be found by DrQA.
56 |         """
57 |         super().__init__(params)
58 |         sys.path.insert(0, self.params['mrc_path'])
59 |         drqa.tokenizers.set_default('corenlp_classpath', os.path.join(self.params['corenlp_path'], '*'))
60 |         self.predictor = Predictor(self.params['mrc_model_path'], tokenizer='simple', num_workers=0, normalize=False)
61 | 
62 |     def get_results(self, conv_list, doc):
63 |         """
64 |         This method returns the answers to the question.
65 | 
66 |         Args:
67 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
68 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
69 |             doc(Document): A document (core.retrieval.doc.Document) that potentially contains the answer.
70 | 
71 |         Returns:
72 |             Returns a list of Documents each containing a candidate answer and its confidence score. The length of this
73 |             list is less than or equal to the parameter 'qa_results_requested'.
74 |         """
75 |         q = conv_list[0].text
76 |         predictions = self.predictor.predict(doc, q, None, self.params['qa_results_requested'])
77 |         results = []
78 |         for i, p in enumerate(predictions, 1):
79 |             results.append(Document(None, None, p[0], p[1]))
80 |         return results
81 | 
82 | 
83 | 
84 | 
85 | 


--------------------------------------------------------------------------------
/macaw/util/text_parser.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Some text parser for document cleaning.
  3 | 
  4 | Authors: Hamed Zamani (hazamani@microsoft.com)
  5 | """
  6 | 
  7 | import justext
  8 | from xml.etree import cElementTree as ElementTree
  9 | 
 10 | 
 11 | class XmlListConfig(list):
 12 |     def __init__(self, aList):
 13 |         """
 14 |         THIS CLASS IS DEPRECATED!
 15 |         """
 16 |         super().__init__()
 17 |         for element in aList:
 18 |             if element:
 19 |                 if len(element) == 1 or element[0].tag != element[1].tag:
 20 |                     self.append(XmlDictConfig(element))
 21 |                 elif element[0].tag == element[1].tag:
 22 |                     self.append(XmlListConfig(element))
 23 |             elif element.text:
 24 |                 text = element.text.strip()
 25 |                 if text:
 26 |                     self.append(text)
 27 | 
 28 | 
 29 | class XmlDictConfig(dict):
 30 |     def __init__(self, parent_element):
 31 |         """
 32 |         THIS CLASS IS DEPRECATED!
 33 |         """
 34 |         super().__init__()
 35 |         if parent_element.items():
 36 |             self.update(dict(parent_element.items()))
 37 |         for element in parent_element:
 38 |             if element:
 39 |                 if len(element) == 1 or element[0].tag != element[1].tag:
 40 |                     aDict = XmlDictConfig(element)
 41 |                 else:
 42 |                     aDict = {element[0].tag: XmlListConfig(element)}
 43 |                 if element.items():
 44 |                     aDict.update(dict(element.items()))
 45 |                 self.update({element.tag: aDict})
 46 |             elif element.items():
 47 |                 self.update({element.tag: dict(element.items())})
 48 |             else:
 49 |                 self.update({element.tag: element.text})
 50 | 
 51 | 
 52 | def xml_text_to_dict(xml_text):
 53 |     """
 54 |     THIS CLASS IS DEPRECATED!
 55 |     """
 56 |     print(xml_text)
 57 |     root = ElementTree.XML(xml_text)
 58 |     return XmlDictConfig(root)
 59 | 
 60 | 
 61 | def xml_file_to_dict(xml_file):
 62 |     """
 63 |     THIS CLASS IS DEPRECATED!
 64 |     """
 65 |     tree = ElementTree.parse(xml_file)
 66 |     root = tree.getroot()
 67 |     return XmlDictConfig(root)
 68 | 
 69 | 
 70 | # def html_to_clean_text(html):
 71 | #     """
 72 | #     Converting an HTML document to clean text.
 73 | #     Args:
 74 | #         html(str): The content of an HTML web page.
 75 | #
 76 | #     Returns:
 77 | #         A str containing the clean content of the web page.
 78 | #     """
 79 | #     def visible(element):
 80 | #         if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
 81 | #             return False
 82 | #         elif re.match('<!--.*-->', str(element.encode('utf-8'))):
 83 | #             return False
 84 | #         return True
 85 | #
 86 | #     soup = BeautifulSoup(html, features='html.parser') #.stripped_strings
 87 | #     data = soup.findAll(text=True)
 88 | #     result = filter(visible, data)
 89 | #     return ' '.join(result)
 90 | 
 91 | def html_to_clean_text(html):
 92 |     """
 93 |     Converting an HTML document to clean text.
 94 |     Args:
 95 |         html(str): The content of an HTML web page.
 96 | 
 97 |     Returns:
 98 |         A str containing the clean content of the web page.
 99 |     """
100 |     paragraphs = justext.justext(html, justext.get_stoplist("English"))
101 |     clean_text_list = []
102 |     for paragraph in paragraphs:
103 |         if not paragraph.is_boilerplate:
104 |             clean_text_list.append(paragraph.text)
105 |     return '\n'.join(clean_text_list)
106 | 


--------------------------------------------------------------------------------
/macaw/interface/speech_recognition.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Speech recognition and generation and some utility functions.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from abc import ABC, abstractmethod
 8 | import os
 9 | import tempfile
10 | 
11 | import speech_recognition as sr
12 | from google.cloud import texttospeech
13 | from pydub import AudioSegment
14 | 
15 | 
16 | def mp3_to_ogg(input_file_name): # caller should delete the file afterwards.
17 |     ogg_file = tempfile.NamedTemporaryFile(delete=False)
18 |     AudioSegment.from_mp3(input_file_name).export(ogg_file.name, format='ogg', parameters=["-acodec", "libopus"])
19 |     ogg_file.close()
20 |     return ogg_file.name
21 | 
22 | 
23 | def ogg_to_wav(input_file_name): # caller should delete the file afterwards.
24 |     wav_file = tempfile.NamedTemporaryFile(delete=False)
25 |     AudioSegment.from_ogg(input_file_name).export(wav_file.name, format='wav')
26 |     wav_file.close()
27 |     return wav_file.name
28 | 
29 | 
30 | class ASR(ABC): # Automatic Speech Recognition
31 |     def __init__(self, params):
32 |         self.params = params
33 | 
34 |     @abstractmethod
35 |     def speech_to_text(self, file_path):
36 |         pass
37 | 
38 | 
39 | class ASG(ABC): # Automatic Speech Generation
40 |     def __init__(self, params):
41 |         self.params = params
42 | 
43 |     @abstractmethod
44 |     def text_to_speech(self, text):
45 |         pass
46 | 
47 | 
48 | class GoogleASR(ASR):
49 |     def __init__(self, params):
50 |         super().__init__(params)
51 |         self.asr = sr.Recognizer()
52 | 
53 |     def speech_to_text(self, file_path):
54 |         print(file_path)
55 |         wav_file_name = ogg_to_wav(file_path)
56 |         with sr.AudioFile(wav_file_name) as source:
57 |             audio = self.asr.record(source)
58 |         try:
59 |             text = self.asr.recognize_google(audio)
60 |             os.remove(wav_file_name)
61 |             return text
62 |         except sr.UnknownValueError:
63 |             print("Google Speech Recognition could not understand audio")
64 |         except sr.RequestError as e:
65 |             print("Could not request results from Google Speech Recognition service; {0}".format(e))
66 | 
67 | 
class GoogleText2Speech(ASG):
    """
    Speech generator backed by a `texttospeech.TextToSpeechClient`.
    """

    def __init__(self, params):
        """
        Sets the GOOGLE_APPLICATION_CREDENTIALS environment variable and prepares the client, voice and audio
        configuration that are reused for every request.

        Args:
            params(dict): A dict containing some parameters. 'google-speech-to-text-credential-file' is required; its
            value is exported as GOOGLE_APPLICATION_CREDENTIALS.
        """
        super().__init__(params)
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.params['google-speech-to-text-credential-file']
        # Instantiates a client
        self.client = texttospeech.TextToSpeechClient()
        # Build the voice request, select the language code ("en-US") and the ssml
        # voice gender ("neutral")
        self.voice = texttospeech.types.VoiceSelectionParams(
            language_code='en-US',
            ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
        # Select the type of audio file you want returned
        self.audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    def text_to_speech(self, text):
        """
        Synthesizes the given text and converts the resulting MP3 audio to an OGG file.

        Args:
            text(str): The text to be spoken.

        Returns:
            The value returned by mp3_to_ogg for the synthesized audio (the name of the generated OGG file).
        """
        # Set the text input to be synthesized
        synthesis_input = texttospeech.types.SynthesisInput(text=text)

        # Perform the text-to-speech request on the text input with the selected
        # voice parameters and audio file type
        response = self.client.synthesize_speech(synthesis_input, self.voice, self.audio_config)

        # The MP3 is only an intermediate artifact: the context manager closes (and thereby deletes) it on every
        # path, including a failing conversion.
        with tempfile.NamedTemporaryFile(delete=True) as mp3_file:
            mp3_file.write(response.audio_content)
            # mp3_to_ogg re-opens the file by name, so buffered bytes must reach the disk before it reads them.
            mp3_file.flush()
            return mp3_to_ogg(mp3_file.name)
96 | 
97 | 


--------------------------------------------------------------------------------
/macaw/cis.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The CIS class.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from abc import ABC, abstractmethod
 8 | from func_timeout import FunctionTimedOut
 9 | 
10 | from macaw import interface, util
11 | from macaw.core.interaction_handler.user_requests_db import InteractionDB
12 | from macaw.core.interaction_handler.msg import Message
13 | 
14 | 
class CIS(ABC):
    def __init__(self, params):
        """
        A Conversational Information Seeking class containing some abstract methods. Each CIS application is expected to
        be inherited from this class.

        The given dict is stored and extended in place: depending on the mode a request handler is registered in it,
        and the NLP utility is added under 'nlp_util' when it can be created.

        Args:
            params(dict): A dict containing some parameters. 'mode' is required. In 'live' mode the keys
            'interaction_db_host', 'interaction_db_port' and 'interaction_db_name' are required as well. 'logger' is
            used to report a failing NLP utility set-up. 'timeout' is optional and defaults to -1.
        """
        self.params = params
        if params['mode'] == 'live':
            self.params['live_request_handler'] = self.live_request_handler
            self.msg_db = InteractionDB(host=self.params['interaction_db_host'],
                                        port=self.params['interaction_db_port'],
                                        dbname=self.params['interaction_db_name'])
        elif params['mode'] == 'exp':
            self.params['experimental_request_handler'] = self.request_handler_func

        self.interface = interface.get_interface(params)
        # Define the attribute up front so that a failed set-up shows up as None instead of an AttributeError later.
        self.nlp_util = None
        try:
            self.nlp_util = util.NLPUtil(self.params)
            self.params['nlp_util'] = self.nlp_util
        except Exception as ex:
            # Best effort: the application keeps running without the NLP utility, but the cause is reported.
            self.params['logger'].warning('WARNING: There is a problem with setting up the NLP utility module: %s', ex)
        self.timeout = self.params.get('timeout', -1)

    def live_request_handler(self, msg):
        """
        Handles one incoming message in live mode: the message is stored, the request handler is run on the
        conversation (the new message followed by the stored history) and the produced response is stored as well.

        Args:
            msg(Message): The message that has just been received.

        Returns:
            The output of request_handler_func, or an error Message with the text 'Time out, no result!' if a
            FunctionTimedOut is raised while handling the request.
        """
        try:
            # load conversation from the database and add the current message to the database
            conv = [msg] + self.msg_db.get_conv_history(user_id=msg.user_id, max_time=10 * 60 * 1000, max_count=10)
            self.msg_db.insert_one(msg)

            # NOTE: the func_timeout wrapper is currently disabled, so self.timeout is not enforced here.
            # output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv])
            output_msg = self.request_handler_func(conv)
            self.msg_db.insert_one(output_msg)
            return output_msg

        except FunctionTimedOut:
            msg_info = dict()
            msg_info['msg_id'] = msg.msg_info['msg_id']
            msg_info['msg_source'] = 'system'
            msg_info['msg_type'] = 'error'
            text = 'Time out, no result!'
            timestamp = util.current_time_in_milliseconds()
            error_msg = Message(msg.user_interface, msg.user_id, msg.user_info, msg_info, text, timestamp)
            self.msg_db.insert_one(error_msg)
            return error_msg

    # def experimental_request_handler(self, str_list):
    #     if not isinstance(str_list, list):
    #         raise Exception('The input should be a list!')
    #
    #     conv_list = []
    #     for i in range(len(str_list)):
    #         if not isinstance(str_list[i], str):
    #             raise Exception('Each element of the input should be a string!')
    #         user_info = {'first_name': 'NONE'}
    #         msg_info = {'msg_id': -1,
    #                     'msg_type': 'command' if str_list[i].startswith('#') else 'text',
    #                     'msg_source': 'user'}
    #         msg = Message(user_interface='NONE',
    #                       user_id=-1,
    #                       user_info=user_info,
    #                       msg_info=msg_info,
    #                       text=str_list[i],
    #                       timestamp=util.current_time_in_milliseconds())
    #         conv_list.append(msg)
    #     conv_list.reverse()
    #
    #     if self.timeout > 0:
    #         output_msg = func_timeout(self.timeout, self.request_handler_func, args=[conv_list])
    #     else:
    #         output_msg = self.request_handler_func(conv_list)
    #     return output_msg

    @abstractmethod
    def request_handler_func(self, conv_list):
        """
        Produces the response for a conversation. Must be implemented by each CIS application.

        Args:
            conv_list(list): The conversation messages, most recent first (live_request_handler puts the new message
            at index 0).
        """
        pass

    @abstractmethod
    def run(self):
        """
        Starts the application. Must be implemented by each CIS application.
        """
        pass


--------------------------------------------------------------------------------
/macaw/core/input_handler/actions.py:
--------------------------------------------------------------------------------
  1 | """
  2 | All actions supported by CIS.
  3 | 
  4 | Authors: Hamed Zamani (hazamani@microsoft.com)
  5 | """
  6 | 
  7 | from abc import ABC, abstractmethod
  8 | from func_timeout import func_timeout, FunctionTimedOut
  9 | import traceback
 10 | 
 11 | 
 12 | class Action(ABC):
 13 |     @staticmethod
 14 |     @abstractmethod
 15 |     def run(conv_list, params):
 16 |         """
 17 |         This is a static method for an abstract class. This method should run the corresponding action.
 18 | 
 19 |         Args:
 20 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 21 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 22 |             params(dict): A dict containing some mandatory and optional parameters.
 23 |         """
 24 |         pass
 25 | 
 26 | 
 27 | class RetrievalAction(Action):
 28 |     @staticmethod
 29 |     def run(conv_list, params):
 30 |         """
 31 |         The retrieval action runs the retrieval model and returns a list of documents.
 32 |         Args:
 33 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 34 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 35 |             params(dict): A dict containing some parameters. The parameter 'retrieval' is required, which should be the
 36 |             retrieval model object.
 37 | 
 38 |         Returns:
 39 |             A list of Documents.
 40 |         """
 41 |         return params['actions']['retrieval'].get_results(conv_list)
 42 | 
 43 | 
 44 | class GetDocFromIndex(Action):
 45 |     @staticmethod
 46 |     def run(conv_list, params):
 47 |         """
 48 |         Getting document from the collection index.
 49 |         Args:
 50 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 51 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 52 |             params(dict): A dict containing some parameters. The parameters 'retrieval' and 'doc_id' are required.
 53 | 
 54 |         Returns:
 55 |             A list of Documents with a length of 1.
 56 |         """
 57 |         return params['actions']['retrieval'].get_doc_from_index(params['doc_id'])
 58 | 
 59 | 
 60 | class QAAction(Action):
 61 |     @staticmethod
 62 |     def run(conv_list, params):
 63 |         """
 64 |         The question answering action runs the MRC model and returns a list of answers.
 65 |         Args:
 66 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 67 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 68 |             params(dict): A dict containing some parameters. The parameters 'qa' and 'doc' are required, which are the
 69 |             MRC model and the candidate document, respectively.
 70 | 
 71 |         Returns:
 72 |             A list of Documents containing the answers.
 73 |         """
 74 | 
 75 |         doc_list = RetrievalAction.run(conv_list, params)
 76 |         doc = ''
 77 |         for i in range(len(doc_list)):
 78 |             doc = doc_list[i].text
 79 |             if len(doc.strip()) > 0:
 80 |                 break
 81 |         return params['actions']['qa'].get_results(conv_list, doc)
 82 | 
 83 | 
 84 | def run_action(action, conv_list, params, return_dict):
 85 |     """
 86 |     This method runs the specified action.
 87 | 
 88 |     Args:
 89 |         action(str): The action name, e.g., 'retrieval', 'qa', etc.
 90 |         conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 91 |         user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 92 |         params(dict): A dict containing some parameters.
 93 |         return_dict(dict): A shared dict for all processes running this action. The actions' outputs should be added to
 94 |         this dict.
 95 |     """
 96 |     if action == 'retrieval':
 97 |         action_func = RetrievalAction.run
 98 |     elif action == 'qa':
 99 |         action_func = QAAction.run
100 |     else:
101 |         raise Exception('Unknown Action!')
102 | 
103 |     try:
104 |         return_dict[action] = func_timeout(params['timeout'], action_func, args=[conv_list, params])
105 |     except FunctionTimedOut:
106 |         params['logger'].warning('The action "%s" did not respond in %d seconds.', action, params['timeout'])
107 |     except Exception:
108 |         return_dict[action] = None
109 |         traceback.print_exc()
110 | 


--------------------------------------------------------------------------------
/macaw/core/output_handler/naive_output_selection.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The naive output post processing unit.
 3 | 
 4 | Authors: Hamed Zamani (hazamani@microsoft.com)
 5 | """
 6 | 
 7 | from macaw import util
 8 | from macaw.core.output_handler.output_selection import OutputProcessing
 9 | from macaw.core.interaction_handler.msg import Message
10 | 
11 | 
class NaiveOutputProcessing(OutputProcessing):
    # Leading words that mark a request as a question even when it does not end with a question mark.
    _QUESTION_PREFIXES = ('what', 'who', 'when', 'where', 'how')

    def __init__(self, params):
        """
        This module simply prioritizes the action outputs. If the message was a command, it returns the command's
        output. Otherwise, it prioritizes QA results and then retrieval results (the rationale is that if there is an
        exact answer for the user's question, there is no need to show the retrieval results).

        Args:
            params(dict): A dict of parameters.
        """
        super().__init__(params)

    def output_selection(self, conv_list, candidate_outputs):
        """
        This method selects one of the outputs produced by the actions.

        The priority is: the '#get_doc' command output; then the QA output, provided that its first answer is not
        empty and the last user message looks like a question (it ends with '?' or starts with a question word); then
        a non-empty retrieval output.

        Args:
            conv_list(list): List of Message objects, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.
            candidate_outputs(dict): A dict of str (i.e., action) to list of Documents (i.e., the action's result) as
            the response. This dict is produced by action dispatcher, which means this is the aggregation of all the
            executed actions.

        Returns:
            A str denoting the selected action. If none is selected, None is returned.
        """
        if '#get_doc' in candidate_outputs:
            return '#get_doc'
        if 'qa' in candidate_outputs:
            answers = candidate_outputs['qa']
            # An empty or missing answer list means "no QA answer" rather than an IndexError / TypeError.
            if answers and answers[0].text:
                request = conv_list[0].text
                if request.endswith('?') or request.lower().startswith(self._QUESTION_PREFIXES):
                    return 'qa'
        if candidate_outputs.get('retrieval'):
            return 'retrieval'
        return None

    def get_output(self, conv, candidate_outputs):
        """
        The response Message generation method.

        Args:
            conv(list): List of Message objects, each corresponding to a conversational message from / to the
            user. This list is in reverse order, meaning that the first element is the last interaction made by user.
            candidate_outputs(dict): A dict of str (i.e., action) to list of Documents (i.e., the action's result) as
            the response. This dict is produced by action dispatcher, which means this is the aggregation of all the
            executed actions.

        Returns:
            A response Message to be sent to the user.

        Raises:
            Exception: If the selected action is not known, or if the response timestamp is not strictly later than
            the timestamp of the last user message.
        """
        user_id = conv[0].user_id
        user_info = conv[0].user_info
        msg_info = dict()
        msg_info['msg_id'] = conv[0].msg_info['msg_id']
        msg_info['msg_source'] = 'system'
        text = ''
        user_interface = conv[0].user_interface

        selected_action = self.output_selection(conv, candidate_outputs)
        if selected_action is None:
            msg_info['msg_type'] = 'text'
            msg_info['msg_creator'] = 'no answer error'
            text = 'No response has been found! Please try again!'
        elif selected_action == 'qa':
            # The answer is returned in the same modality (message type) as the request.
            msg_info['msg_type'] = conv[0].msg_info['msg_type']
            msg_info['msg_creator'] = 'qa'
            text = candidate_outputs['qa'][0].text
        elif selected_action == 'retrieval':
            msg_info['msg_type'] = 'options'
            msg_info['msg_creator'] = 'retrieval'
            text = 'Retrieved document list (click to see the document content):'
            # Each option carries the title to show, the command that fetches the document, and its score.
            msg_info['options'] = [(output.title, '#get_doc ' + output.id, output.score)
                                   for output in candidate_outputs['retrieval']]
        elif selected_action == '#get_doc':
            msg_info['msg_type'] = 'text'
            msg_info['msg_creator'] = '#get_doc'
            text = candidate_outputs['#get_doc'][0].text
        else:
            raise Exception('The candidate output key is not familiar!')
        timestamp = util.current_time_in_milliseconds()
        if timestamp <= conv[0].timestamp:
            raise Exception('There is a problem in the output timestamp!')
        return Message(user_interface, user_id, user_info, msg_info, text, timestamp)


--------------------------------------------------------------------------------
/macaw/core/retrieval/query_generation.py:
--------------------------------------------------------------------------------
  1 | """
  2 | The query generation model for search engine.
  3 | 
  4 | Authors: Hamed Zamani (hazamani@microsoft.com)
  5 | """
  6 | 
  7 | from abc import ABC, abstractmethod
  8 | import string
  9 | 
 10 | 
 11 | class QueryGeneration(ABC):
 12 |     @abstractmethod
 13 |     def __init__(self, params):
 14 |         """
 15 |         An abstract class for query generation models.
 16 | 
 17 |         Args:
 18 |             params(dict): A dict containing some mandatory and optional parameters.
 19 |         """
 20 |         self.params = params
 21 | 
 22 |     @abstractmethod
 23 |     def get_query(self, conv_list):
 24 |         """
 25 |         This method is called to get the query generated from a list of conversational interactions.
 26 | 
 27 |         Args:
 28 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 29 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 30 | 
 31 |         Returns:
 32 |             The inherited class should implements this method and return a str containing a query for retrieval purpose.
 33 |         """
 34 |         pass
 35 | 
 36 | 
 37 | class SimpleQueryGeneration(QueryGeneration):
 38 |     def __init__(self, params):
 39 |         """
 40 |         This class is a simple implementation of query generation that only focuses on the last interaction in the
 41 |         conversation and use the last interaction as the query.
 42 | 
 43 |         Args:
 44 |             params(dict): A dict containing some mandatory and optional parameters.
 45 |         """
 46 |         super().__init__(params)
 47 | 
 48 |     def get_query(self, conv_list):
 49 |         """
 50 |         This method generates a query from a list of conversational interactions by using the last user request, with
 51 |         some pre-processing (e.g., removing punctuations).
 52 | 
 53 |         Args:
 54 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 55 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 56 | 
 57 |         Returns:
 58 |             A str containing the query for retrieval.
 59 |         """
 60 |         # q = ' '.join(msg.text for msg in conv_list)
 61 |         q = conv_list[0].text
 62 |         if 'use_coref' in self.params and self.params['use_coref']:
 63 |             q_coref = self.get_query_coref(conv_list)
 64 |             for key in q_coref:
 65 |                 q += ' ' + ' '.join(q_coref[key])
 66 | 
 67 |         q = q.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))).strip()
 68 | 
 69 |         # print(q)
 70 |         return q
 71 | 
 72 |     def get_query_coref(self, conv_list):
 73 |         """
 74 |         This methods compute all co-references in the conversation history for the query terms (i.e., those in the last
 75 |         interaction).
 76 | 
 77 |         Args:
 78 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 79 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 80 | 
 81 |         Returns:
 82 |             A dict from terms in the last user request to a list of all identified co-references.
 83 | 
 84 |         """
 85 |         corenlp_coref_result = self.compute_corefs(conv_list)
 86 |         q_coref = dict()
 87 |         last_index = len(corenlp_coref_result['sentences'])
 88 |         for key in corenlp_coref_result['corefs']:
 89 |             has_coref = False
 90 |             for item in corenlp_coref_result['corefs'][key]:
 91 |                 if item['sentNum'] == last_index:
 92 |                     has_coref = True
 93 |                     text = item['text']
 94 |                     break
 95 |             if has_coref:
 96 |                 q_coref[text] = []
 97 |                 for item in corenlp_coref_result['corefs'][key]:
 98 |                     if item['sentNum'] == last_index:
 99 |                         continue
100 |                     q_coref[text].append(item['text'])
101 |         return q_coref
102 | 
103 |     def compute_corefs(self, conv_list):
104 |         """
105 |         This method runs CoreNLP co-reference resolution on the requests made by the user in the conversation.
106 |         Note: this method ignores system responses.
107 | 
108 |         Args:
109 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
110 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
111 | 
112 |         Returns:
113 |             A dict containing all sentence and co-reference information.
114 | 
115 |         """
116 |         conv_history = []
117 |         for msg in reversed(conv_list):
118 |             if msg.msg_info['msg_source'] == 'user' and msg.msg_info['msg_type'] in ['text', 'voice']:
119 |                 temp = msg.text if msg.text.endswith('?') else (msg.text + '?')
120 |                 conv_history.append(temp)
121 |             # elif msg.msg_info['msg_source'] == 'system' and msg.msg_info['msg_type'] == 'text' and len(msg.text.split()) < 30:
122 |             #     temp = msg.text + '.'
123 |             #     conv_history.append(temp)
124 |         if len(conv_history) == 0:
125 |             raise Exception('The query generation model cannot generate any query! There should be a problem')
126 |         coref_results = self.params['nlp_util'].get_coref(' '.join(conv_history))
127 |         return coref_results
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/macaw/core/input_handler/action_detection.py:
--------------------------------------------------------------------------------
  1 | """
  2 | The request dispatcher module.
  3 | 
  4 | Authors: Hamed Zamani (hazamani@microsoft.com)
  5 | """
  6 | 
  7 | import multiprocessing
  8 | 
  9 | from macaw.core.input_handler import actions
 10 | 
 11 | 
 12 | class PreActionRequestDispatcher:
 13 |     def __init__(self, params):
 14 |         """
 15 |         A simple pre-action request dispatcher module that selects one of the actions based on the user's message and
 16 |         only run one action.
 17 | 
 18 |         Args:
 19 |             params(dict): A dict of parameters.
 20 |         """
 21 |         self.params = params
 22 | 
 23 |     def action_detection(self, conv_list):
 24 |         """
 25 |         Action detection based on the conversation. This method simply identifies if a message is a command or if it's a
 26 |         question based on the starting word.
 27 | 
 28 |         Args:
 29 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 30 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 31 | 
 32 |         Returns:
 33 |             A str denoting the identified action, e.g., 'retrieval', 'qa', or a command.
 34 | 
 35 |         """
 36 |         if conv_list[0].msg_info['msg_type'] == 'command':
 37 |             command = conv_list[0].text.split(' ')[0]
 38 |             return command
 39 | 
 40 |         if 'qa' in self.params:
 41 |             if conv_list[0].text.lower().startswith('what') \
 42 |                     or conv_list[0].text.lower().startswith('who') \
 43 |                     or conv_list[0].text.lower().startswith('when') \
 44 |                     or conv_list[0].text.lower().startswith('where') \
 45 |                     or conv_list[0].text.lower().startswith('how'):
 46 |                 return 'qa'
 47 |         if 'retrieval' in self.params:
 48 |             return 'retrieval'
 49 | 
 50 |     def dispatch(self, conv_list):
 51 |         """
 52 |         A dispatcher function that runs the action identified by 'action_detection'.
 53 | 
 54 |         Args:
 55 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 56 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 57 | 
 58 |         Returns:
 59 |             A dict of str (i.e., action) to list of Documents (i.e., the action's result) as the response.
 60 | 
 61 |         """
 62 |         action = self.action_detection(conv_list)
 63 |         if action == 'retrieval':
 64 |             return {'retrieval': actions.RetrievalAction.run(conv_list, self.params)}
 65 |         if action == 'qa':
 66 |             return {'qa': actions.QAAction.run(conv_list, self.params)}
 67 |         if action == '#get_doc':
 68 |             doc_id = ' '.join(conv_list[0].text.split(' ')[1:])
 69 |             return {'#get_doc': actions.GetDocFromIndex.run(None, {**self.params, **{'doc_id': doc_id}})}
 70 | 
 71 | 
 72 | class RequestDispatcher:
 73 |     def __init__(self, params):
 74 |         """
 75 |         The main request dispatcher class. This module runs multiple actions in parallel for a pre-specified timeout and
 76 |         returns all of the obtained results.
 77 | 
 78 |         Args:
 79 |             params(dict): A dict of parameters. Required params include 'actions' and 'timeout'.
 80 |         """
 81 |         self.params = params
 82 | 
 83 |     def dispatch(self, conv_list):
 84 |         """
 85 |         The request dispatcher method. This method runs all non-command messages in parallel using multiprocessing.
 86 | 
 87 |         Args:
 88 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 89 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 90 | 
 91 |         Returns:
 92 |             A dict of str (i.e., action) to list of Documents (i.e., the action's result) as the response.
 93 |         """
 94 |         if conv_list[0].msg_info['msg_type'] == 'command':
 95 |             command = conv_list[0].text.split(' ')[0]
 96 |             return self.execute_command(conv_list, command)
 97 | 
 98 |         action_processes = []
 99 |         manager = multiprocessing.Manager()
100 |         action_results = manager.dict()
101 |         for action in self.params['actions']:
102 |             p = multiprocessing.Process(target=actions.run_action, args=[action, conv_list.copy(), self.params, action_results])
103 |             action_processes.append(p)
104 |             p.start()
105 | 
106 |         for p in action_processes:
107 |             p.join()
108 | 
109 |         candidate_outputs = dict()
110 |         for key in action_results:
111 |             if action_results[key]:
112 |                 candidate_outputs[key] = action_results[key]
113 |         return candidate_outputs
114 | 
115 |     def execute_command(self, conv_list, command):
116 |         """
117 |         The command executor method.
118 | 
119 |         Args:
120 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
121 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
122 |             command(str): A str showing the command.
123 | 
124 |         Returns:
125 |             A dict of str (i.e., command) to list of Documents (i.e., the action's result) as the response.
126 |         """
127 |         if command == '#get_doc':
128 |             doc_id = ' '.join(conv_list[0].text.split(' ')[1:])
129 |             return {'#get_doc': actions.GetDocFromIndex.run(None, {**self.params, **{'doc_id': doc_id}})}
130 |         else:
131 |             raise Exception('Command not found!')
132 | 


--------------------------------------------------------------------------------
/macaw/live_main.py:
--------------------------------------------------------------------------------
  1 | """
  2 | The interactive CIS main file.
  3 | 
  4 | Authors: Hamed Zamani (hazamani@microsoft.com)
  5 | """
  6 | 
  7 | from macaw.cis import CIS
  8 | from macaw.core import mrc, retrieval
  9 | from macaw.core.input_handler.action_detection import RequestDispatcher
 10 | from macaw.core.output_handler import naive_output_selection
 11 | from macaw.util.logging import Logger
 12 | 
 13 | 
 14 | class ConvQA(CIS):
 15 |     def __init__(self, params):
 16 |         """
 17 |         The constructor for Conversational Question Answering. This is a Conversational application class and is
 18 |         inherited from the CIS class.
 19 | 
 20 |         Args:
 21 |             params(dict): A dict of parameters. These are mandatory parameters for this class: 'logger' which is an
 22 |             instance of the util.logging.Logger class. ConvQA requires both a retrieval and machine reading
 23 |             comprehension engines. Each of them requires some additional parameters. Refer to the corresponding class
 24 |             for more information on the required parameters.
 25 |         """
 26 |         super().__init__(params)
 27 |         self.logger = params['logger']
 28 |         self.logger.info('Conversational QA Model... starting up...')
 29 |         self.retrieval = retrieval.get_retrieval_model(params=self.params)
 30 |         self.qa = mrc.get_mrc_model(params=self.params)
 31 |         self.params['actions'] = {'retrieval': self.retrieval, 'qa': self.qa}
 32 |         self.request_dispatcher = RequestDispatcher(self.params)
 33 |         self.output_selection = naive_output_selection.NaiveOutputProcessing({})
 34 | 
 35 |     def request_handler_func(self, conv_list):
 36 |         """
 37 |         This function is called for each conversational interaction made by the user. In fact, this function calls the
 38 |         dispatcher to send the user request to the information seeking components.
 39 | 
 40 |         Args:
 41 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 42 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 43 | 
 44 |         Returns:
 45 |             output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user.
 46 |         """
 47 |         self.logger.info(conv_list)
 48 |         dispatcher_output = self.request_dispatcher.dispatch(conv_list)
 49 |         output_msg = self.output_selection.get_output(conv_list, dispatcher_output)
 50 |         return output_msg
 51 | 
 52 |     def run(self):
 53 |         """
 54 |             This function is called to run the ConvQA system. In live mode, it never stops until the program is killed.
 55 |         """
 56 |         self.interface.run()
 57 | 
 58 | 
if __name__ == '__main__':
    # Script entry point: the parameter groups below are merged into one flat dict that is passed to ConvQA.
    basic_params = {'timeout': 15,  # timeout, in seconds.
                    'mode': 'live',  # mode can be either live or exp.
                    'logger': Logger({})}  # for logging into file, pass the filepath to the Logger class.

    # These are required database parameters if the mode is 'live'. The host and port of the machine hosting the
    # database, as well as the database name.
    db_params = {'interaction_db_host': 'localhost',
                 'interaction_db_port': 27017,
                 'interaction_db_name': 'macaw_test'}

    # These are interface parameters. They are interface specific.
    interface_params = {'interface': 'telegram',  # interface can be 'telegram' or 'stdio' for live mode, and 'fileio'
                                                  # for exp mode.
                        'bot_token': 'YOUR_TELECGRAM_BOT_TOKEN',  # Placeholder: replace with your Telegram bot token.
                        'asr_model': 'google',  # The API used for speech recognition.
                        'asg_model': 'google',  # The API used for speech generation.
                        'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE'}

    # These are parameters used by the retrieval model.
    retrieval_params = {'query_generation': 'simple',  # the model that generates a query from a conversation history.
                        'use_coref': True,  # True, if query generator can use coreference resolution, otherwise False.
                        'search_engine': 'bing',  # the search engine. It can be either 'indri' or 'bing'.
                        'bing_key': 'YOUR_BING_SUBSCRIPTION_KEY',  # Bing API key
                        'search_engine_path': 'PATH_TO_INDRI',  # The path to the indri toolkit.
                        'col_index': 'PATH_TO_INDRI_INDEX',  # The path to the indri index.
                        'col_text_format': 'trectext',  # collection text format. Only standard 'trectext' is supported.
                        'results_requested': 3}  # Maximum number of docs that should be retrieved by search engine.
    # Note: If you want to have a re-ranking model (e.g., learning to rank), you simply need to extend the class
    # core.retrieval.search_engine.ReRanker and implement the method 'rerank'. Then add a 'reranker' parameter to
    # retrieval_params that points to an instance of your favorite ReRanker class. If there is a 'reranker' parameter in
    # retrieval_params, the retrieval model automatically calls the re-ranking method. For more details, see the method
    # 'get_results' in class core.retrieval.search_engine.Retrieval.

    # These are parameters used by the machine reading comprehension model.
    mrc_params = {'mrc': 'drqa',  # MRC model.
                  'mrc_model_path': 'PATH_TO_PRETRAINED_MRC_MODEL',  # The path to the model parameters.
                  'mrc_path': 'PATH_TO_MRC_DIRECTORY',  # The path to the model toolkit.
                  'corenlp_path': 'PATH_TO_STANFORD_CORE_NLP_DIRECTORY',  # The path to the corenlp toolkit.
                  'qa_results_requested': 3}  # The number of candidate answers returned by the MRC model.

    # Merge all groups into one dict; on a key collision the later group wins.
    params = {**basic_params, **db_params, **interface_params, **retrieval_params, **mrc_params}
    basic_params['logger'].info(params)
    ConvQA(params).run()
103 | 
104 | 


--------------------------------------------------------------------------------
/macaw/interface/telegram.py:
--------------------------------------------------------------------------------
  1 | """
  2 | The Telegram bot (supports interactive multi-modal interactions with different devices).
  3 | 
  4 | Authors: Hamed Zamani (hazamani@microsoft.com)
  5 | """
  6 | 
  7 | import urllib.parse
  8 | import os
  9 | import tempfile
 10 | import traceback
 11 | 
 12 | from telegram import InlineKeyboardButton, InlineKeyboardMarkup
 13 | from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackQueryHandler
 14 | 
 15 | from macaw import util
 16 | from macaw.core.interaction_handler.msg import Message
 17 | from macaw.interface.interface import Interface
 18 | 
 19 | 
 20 | class TelegramBot(Interface):
 21 |     def __init__(self, params):
 22 |         """
 23 |         A Telegram bot interface for Macaw.
 24 | 
 25 |         Args:
 26 |             params(dict): A dict of parameters. The params 'logger' and 'bot_token' are mandatory.
 27 |         """
 28 |         super().__init__(params)
 29 |         self.logger = self.params['logger']
 30 | 
 31 |         self.MAX_MSG_LEN = 1000  # maximum number of characters in each response message.
 32 |         self.MAX_OPTION_LEN = 30  # maximum number of characters in each clickable option text.
 33 | 
 34 |         # Starting the bot by creating the Updater.
 35 |         # Make sure to set use_context=True to use the new context based callbacks
 36 |         # If you don't have a bot_token, add 'botfather' to your personal Telegram account and follow the instructions
 37 |         # to get a token for your bot.
 38 |         self.updater = Updater(self.params['bot_token'], use_context=True)
 39 |         self.dp = self.updater.dispatcher
 40 | 
 41 |         # Telegram command handlers (e.g., /start)
 42 |         self.dp.add_handler(CommandHandler('start', self.start))
 43 |         self.dp.add_handler(CommandHandler('help', self.help))
 44 | 
 45 |         # Telegram message handlers
 46 |         self.dp.add_handler(MessageHandler(Filters.text, self.request_handler))
 47 |         self.dp.add_handler(MessageHandler(Filters.voice, self.voice_request_handler))
 48 |         self.dp.add_handler(CallbackQueryHandler(self.button_click_handler))
 49 | 
 50 |         # logging all errors
 51 |         self.dp.add_error_handler(self.error)
 52 | 
 53 |     def start(self, update, context):
 54 |         """Send a message when the command /start is issued."""
 55 |         update.message.reply_text('Hi, welcome to Macaw! Macaw is an open-source extensible framework for '
 56 |                                   'conversational information seeking. Visit: https://github.com/microsoft/macaw')
 57 | 
 58 |     def help(self, update, context):
 59 |         """Send a message when the command /help is issued."""
 60 |         update.message.reply_text('Macaw should be able to answer your questions. Just ask a question!')
 61 | 
 62 |     def request_handler(self, update, context):
 63 |         """This method handles all text messages, and asks result_presentation to send the response to the user."""
 64 |         try:
 65 |             self.logger.info(update.message)
 66 |             user_info = {'first_name': update.message.chat.first_name,
 67 |                          'last_name': update.message.chat.last_name,
 68 |                          'is_bot': update._effective_user.is_bot
 69 |             }
 70 |             msg_info = {'msg_id': update.message.message_id,
 71 |                         'msg_type': 'text',
 72 |                         'msg_source': 'user'}
 73 |             msg = Message(user_interface='telegram',
 74 |                           user_id=update.message.chat.id,
 75 |                           user_info=user_info,
 76 |                           msg_info=msg_info,
 77 |                           text=update.message.text,
 78 |                           timestamp=util.current_time_in_milliseconds())
 79 |             output = self.params['live_request_handler'](msg)
 80 |             self.result_presentation(output, {'update': update})
 81 |         except Exception:
 82 |             traceback.print_exc()
 83 | 
 84 |     def voice_request_handler(self, update, context):
 85 |         """This method handles all voice messages, and asks result_presentation to send the response to the user."""
 86 |         try:
 87 |             ogg_file = tempfile.NamedTemporaryFile(delete=True)
 88 |             update.message.voice.get_file().download(ogg_file.name)
 89 |             text = self.params['asr'].speech_to_text(ogg_file.name)
 90 |             ogg_file.close()
 91 |             update.message.reply_text('Macaw heard: ' + text)
 92 | 
 93 |             user_info = {'first_name': update.message.chat.first_name,
 94 |                          'last_name': update.message.chat.last_name,
 95 |                          'is_bot': update._effective_user.is_bot
 96 |                          }
 97 |             msg_info = {'msg_id': update.message.message_id,
 98 |                         'msg_type': 'voice',
 99 |                         'msg_source': 'user'}
100 |             msg = Message(user_interface='telegram',
101 |                           user_id=update.message.chat.id,
102 |                           user_info=user_info,
103 |                           msg_info=msg_info,
104 |                           text=text,
105 |                           timestamp=util.current_time_in_milliseconds())
106 |             output = self.params['live_request_handler'](msg)
107 |             self.result_presentation(output, {'update': update})
108 |         except Exception:
109 |             traceback.print_exc()
110 | 
111 |     def button_click_handler(self, update, context):
112 |         """This method handles clicks, and asks result_presentation to send the response to the user."""
113 |         try:
114 |             self.logger.info(update)
115 |             user_info = {'first_name': update.callback_query.message.chat.first_name,
116 |                          'last_name': update.callback_query.message.chat.last_name,
117 |                          'is_bot': update._effective_user.is_bot
118 |                          }
119 |             msg_info = {'msg_id': update.callback_query.message.message_id,
120 |                         'msg_type': 'command',
121 |                         'msg_source': 'user'}
122 |             msg = Message(user_interface='telegram',
123 |                           user_id=update.callback_query.message.chat.id,
124 |                           user_info=user_info,
125 |                           msg_info=msg_info,
126 |                           text=update.callback_query.data,
127 |                           timestamp=util.current_time_in_milliseconds())
128 |             output = self.params['live_request_handler'](msg)
129 |             self.result_presentation(output, {'update': update})
130 |         except Exception as ex:
131 |             traceback.print_exc()
132 | 
133 |     def result_presentation(self, response_msg, params):
134 |         """This method produces an appropriate response to be sent to the client."""
135 |         try:
136 |             if response_msg is None:
137 |                 return
138 |             update = params['update']
139 |             if response_msg.msg_info['msg_type'] == 'text':
140 |                 if update.message is not None:
141 |                     update.message.reply_text(response_msg.text[:self.MAX_MSG_LEN])
142 |                 elif update.callback_query.message is not None:
143 |                     update.callback_query.message.reply_text(response_msg.text[:self.MAX_MSG_LEN])
144 |             elif response_msg.msg_info['msg_type'] == 'voice':
145 |                 ogg_file_name = self.params['asg'].text_to_speech(response_msg.text[:self.MAX_MSG_LEN])
146 |                 self.updater.bot.send_voice(chat_id=update.message.chat.id, voice=open(ogg_file_name, 'rb'))
147 |                 os.remove(ogg_file_name)  # removing audio files for privacy reasons.
148 |             elif response_msg.msg_info['msg_type'] == 'options':
149 |                 keyboard = [[InlineKeyboardButton(option_text[:self.MAX_OPTION_LEN],
150 |                                                   callback_data=urllib.parse.unquote(option_data))]
151 |                             for (option_text, option_data, output_score) in response_msg.msg_info['options']]
152 |                 reply_markup = InlineKeyboardMarkup(keyboard)
153 |                 update.message.reply_text(response_msg.text[:self.MAX_MSG_LEN], reply_markup=reply_markup)
154 |             elif response_msg.msg_info['msg_type'] == 'error':
155 |                 error_msg = 'ERROR: NO RESULT!'
156 |                 if update.message is not None:
157 |                     update.message.reply_text(error_msg)
158 |                 elif update.callback_query.message is not None:
159 |                     update.callback_query.message.reply_text(error_msg)
160 |             else:
161 |                 raise Exception('The msg_type is not recognized:', response_msg.msg_info['msg_type'])
162 |         except Exception:
163 |             traceback.print_exc()
164 | 
165 |     def error(self, update, context):
166 |         """Log Errors caused by Updates."""
167 |         self.logger.warning('Update "%s" caused error "%s"', update, context.error)
168 | 
169 |     def send_msg(self, chat_id, msg_text):
170 |         """This method is used for sending a message to a user. It can be used for mixed-initiative interactions, as
171 |         well as Wizard of Oz settings."""
172 |         self.updater.bot.sendMessage(chat_id=chat_id, text=msg_text)
173 | 
174 |     def run(self):
175 |         """Starting the bot!"""
176 |         self.logger.info('Running the Telegram bot!')
177 |         self.updater.start_polling()
178 |         # Run the bot until you press Ctrl-C or the process receives SIGINT,
179 |         # SIGTERM or SIGABRT. This should be used most of the time, since
180 |         # start_polling() is non-blocking and will stop the bot gracefully.
181 |         self.updater.idle()
182 | 
183 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Macaw: An Extensible Conversational Information Seeking Platform
  2 | Conversational information seeking (CIS) has been recognized as a major emerging research area in information retrieval.
  3 | Such research will require data and tools, to allow the implementation and study of conversational systems. Macaw is
  4 | an open-source framework with a modular architecture for CIS research. Macaw supports *multi-turn*, *multi-modal*, and 
  5 | *mixed-initiative* interactions, for tasks such as document retrieval, question answering, recommendation, and 
  6 | structured data exploration. It has a modular design to encourage the study of new CIS algorithms, which can be 
  7 | evaluated in batch mode. It can also integrate with a user interface, which allows user studies and data collection in 
  8 | an interactive mode, where the back end can be *fully algorithmic* or a *wizard of oz* setup. 
  9 | 
 10 | Macaw could be of interest to the researchers and practitioners working on information retrieval, natural language 
 11 | processing, and dialogue systems.
 12 | 
 13 | For more information on Macaw, please refer to [this paper](https://arxiv.org/pdf/1912.08904.pdf).
 14 | 
 15 | Table of contents:
 16 | + [Macaw Architecture](#macaw-architecture)
 17 |     + [Interfaces](#interfaces)
 18 |     + [Retrieval](#retrieval)
 19 |     + [Answer Selection and Generation](#answer-selection-and-generation)
 20 | + [Installation](#installation) 
 21 | + [Running Macaw](#running-macaw)
 22 | + [Bug Report and Feature Request](#bug-report-and-feature-request)
 23 | + [Citation](#citation)
 24 | + [License](#license)
 25 | + [Contribution](#contribution)
 26 | 
 27 | ## Macaw Architecture
 28 | Macaw has a modular architecture, which allows further development and extension. The high-level architecture of Macaw
 29 | is presented below:
 30 | 
 31 | ![The high-level architecture of Macaw](macaw-arch.jpg)
 32 | 
 33 | For more information on each module in Macaw, refer to [this paper](https://arxiv.org/pdf/1912.08904.pdf).
 34 | 
 35 | #### Interfaces
 36 | Macaw supports the following interfaces:
 37 | + Standard IO: For *development* purposes
 38 | + File IO: For *batch experiments* (see the examples in the `data` folder for input and output file formats)
 39 | + Telegram bot: For interaction with real users
 40 | 
 41 | Here is an example of the Telegram interface for Macaw. It supports multi-modal interactions (text, speech, click, etc).
 42 | 
 43 | ![Telegram interface for Macaw](macaw-example-tax.jpg) 
 44 | ![Telegram interface for Macaw](macaw-example-shakespeare.jpg)
 45 | 
 46 | 
 47 | #### Retrieval
 48 | Macaw features the following search engines:
 49 | + [Indri](http://lemurproject.org/indri.php): an open-source search engine that can be used for any arbitrary text 
 50 | collection. 
 51 | + Bing web search API: sending a request to the Bing API and getting the results.
 52 | 
 53 | #### Answer Selection and Generation
 54 | For question answering, Macaw only features [the DrQA model](https://github.com/facebookresearch/DrQA) in its current 
 55 | version.
 56 | 
 57 | 
 58 | ## Installation
 59 | Macaw requires `Python >= 3.6` and `pip3`. If you don't have `setuptools`, run `sudo pip3 install setuptools`. 
 60 | To install Macaw, first **clone macaw** from this repo and then follow the following installation steps. The
 61 | mentioned installation commands can be executed on Ubuntu. You can use the same or similar commands on other Linux 
 62 | distributions. If you are using Windows 10, we recommend installing Macaw and all the required packages on 
 63 | [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10).
 64 | 
 65 | #### Step 1: Installing MongoDB server
 66 | Macaw uses MongoDB for storing and retrieving user interactions (conversations). To install MongoDB server, run the
 67 | following command:
 68 | ```
 69 | sudo apt-get install mongodb-server-core
 70 | ```
 71 | 
 72 | #### Step 2: Installing Indri and Pyndri
 73 | [Indri](http://lemurproject.org/indri.php) is an open-source search engine for information retrieval research, 
 74 | implemented as part of the [Lemur Project](http://lemurproject.org/).
 75 | [Pyndri](https://github.com/cvangysel/pyndri) is a python interface to Indri. Macaw uses Indri for retrieving documents 
 76 | from an arbitrary text collection.
 77 | To install Indri, first download Indri from https://sourceforge.net/projects/lemur/files/lemur/. As suggested by pyndri,
 78 | we have used Indri-5.11. This Indri version can be installed as follows:
 79 | ```
 80 | # download indri-5.11.tar.gz
 81 | sudo apt install g++ zlib1g-dev
 82 | tar xzvf indri-5.11.tar.gz
 83 | rm indri-5.11.tar.gz
 84 | cd indri-5.11
 85 | ./configure CXX="g++ -D_GLIBCXX_USE_CXX11_ABI=0"
 86 | make
 87 | sudo make install
 88 | ```
 89 | 
 90 | Then, clone the pyndri repository from https://github.com/cvangysel/pyndri and run the following command:
 91 | ```
 92 | python3 setup.py install
 93 | ```
 94 | 
 95 | At this step, you can make sure your installation is complete by running the pyndri tests.
 96 | 
 97 | #### Step 3: Installing Stanford Core NLP
 98 | Stanford Core NLP can be used for tokenization and, most importantly, for co-reference resolution. If you do not 
 99 | need co-reference resolution, you can skip this step. Stanford Core NLP requires `java`. Download Stanford Core NLP 
100 | with the following commands:
101 | ```
102 | wget -O "stanford-corenlp-full-2017-06-09.zip" "http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip"
103 | sudo apt-get install unzip
104 | unzip "stanford-corenlp-full-2017-06-09.zip"
105 | rm "stanford-corenlp-full-2017-06-09.zip"
106 | ``` 
107 | 
108 | If you don't have `java`, install it using:
109 | ```
110 | sudo apt-get install default-jre
111 | ```
112 | 
113 | #### Step 4: Installing DrQA
114 | Macaw also supports answer extraction / generation for user queries from retrieved documents. For this purpose, it 
115 | features [DrQA](https://github.com/facebookresearch/DrQA). If you do not need this functionality, ignore this step (you
116 | can also install this later). 
117 | To install DrQA, run the following commands:
118 | ```
119 | git clone https://github.com/facebookresearch/DrQA.git
120 | cd DrQA
121 | pip3 install -r requirements.txt
122 | pip3 install torch
123 | sudo python3 setup.py develop
124 | ```
125 | 
126 | To use pre-trained DrQA model, use the following command. 
127 | ```
128 | ./download.sh
129 | ```
130 | This downloads a 7.5GB (compressed) file and requires 25GB (uncompressed) space. This may take a while!
131 |  
132 | 
133 | 
134 | #### Step 5: Installing FFmpeg
135 | To support speech interactions with users, Macaw requires FFmpeg for some multimedia processing steps. If you don't 
136 | need speech support from Macaw, you can skip this step. To install FFmpeg, run the following command:
137 | ```
138 | sudo apt-get install ffmpeg
139 | ```
140 | 
141 | #### Step 6: Installing Macaw
142 | After cloning Macaw, use the following commands for installation:
143 | ```
144 | cd macaw
145 | sudo pip3 install -r requirements.txt
146 | sudo python3 setup.py install
147 | ```
148 | 
149 | ## Running Macaw
150 | If you run Macaw in interactive (or live) mode, you should first run the MongoDB server using the following command:
151 | ```
152 | sudo mongod
153 | ```
154 | Note that this command uses the default database directory (`/data/db`) for storing the data. You may need to create 
155 | this directory if you haven't. You can also use other locations using the `--dbpath` argument. 
156 | 
157 | 
158 | We provide three different main scripts (i.e., apps):
159 | + `live_main.py`: An interactive conversational search and question answering system. It can use both STDIO and Telegram
160 | interfaces.
161 | + `batch_exp_main.py`: A model for running experiments on a reusable dataset. This main script uses FILEIO as the 
162 | interface.
163 | + `wizard_of_oz_main.py`: A main script for Wizard of Oz experiments.
164 |  
165 | After selecting the desired main script, open the python file and provide the required parameters. For example, you need
166 | to use your Bing subscription key (if using Bing), the path to Indri index (if using Indri), Telegram bot token (if 
167 | using Telegram interface), etc. in order to run the `live_main.py` script. You can then run the chosen main script
168 | as below:
169 | 
170 | ```
171 | python3 live_main.py
172 | ```
173 | 
174 | 
175 | ## Bug Report and Feature Request
176 | To report a bug or request a feature, you can open an issue on GitHub or send an email to 
177 | [Hamed Zamani](http://hamedz.ir) at `hazamani@microsoft.com`.
178 | 
179 | ## Citation
180 | If you found Macaw useful, you can cite the following article:
181 | ```
182 | Hamed Zamani and Nick Craswell, "Macaw: An Extensible Conversational Information Seeking System", arxiv pre-print.
183 | ```
184 | 
185 | bibtex:
186 | ```
187 | @article{macaw,
188 |   title={Macaw: An Extensible Conversational Information Seeking Platform},
189 |   author={Zamani, Hamed and Craswell, Nick},
190 |   journal={arXiv preprint arXiv:1912.08904},
191 |   year={2019},
192 | }
193 | ```
194 | 
195 | ## License
196 | Macaw is distributed under the **MIT License**. See the `LICENSE` file for more information.
197 | 
198 | 
199 | ## Contribution
200 | 
201 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
202 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
203 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
204 | 
205 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
206 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
207 | provided by the bot. You will only need to do this once across all repos using our CLA.
208 | 
209 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
210 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
211 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
212 | 


--------------------------------------------------------------------------------
/macaw/wizard_of_oz_main.py:
--------------------------------------------------------------------------------
  1 | """
  2 | The Wizard of Oz CIS main file.
  3 | 
  4 | Authors: Hamed Zamani (hazamani@microsoft.com)
  5 | """
  6 | 
  7 | import multiprocessing
  8 | 
  9 | from macaw import interface
 10 | from macaw.core import retrieval
 11 | from macaw.core.input_handler.action_detection import RequestDispatcher
 12 | from macaw.core.interaction_handler.user_requests_db import InteractionDB
 13 | from macaw.core.output_handler import naive_output_selection
 14 | from macaw.util.logging import Logger
 15 | 
 16 | 
 17 | class Seeker:
 18 |     def __init__(self, params):
 19 |         """
 20 |         The constructor for Conversational Question Answering. This is a Conversational application class and is
 21 |         inherited from the CIS class.
 22 | 
 23 |         Args:
 24 |             params(dict): A dict of parameters. These are mandatory parameters for this class: 'logger' which is an
 25 |             instance of the util.logging.Logger class. ConvQA requires both a retrieval and machine reading
 26 |             comprehension engines. Each of them requires some additional parameters. Refer to the corresponding class
 27 |             for more information on the required parameters.
 28 |         """
 29 |         self.params = params
 30 |         self.logger = params['logger']
 31 |         self.logger.info('Conversational Wirzard of Oz System... starting up...')
 32 |         self.wizard = None
 33 |         self.params['live_request_handler'] = self.live_request_handler
 34 | 
 35 |         self.interface = interface.get_interface(params)
 36 | 
 37 |         self.retrieval = retrieval.get_retrieval_model(params=self.params)
 38 |         self.request_dispatcher = RequestDispatcher({'retrieval': self.retrieval})
 39 |         self.output_selection = naive_output_selection.NaiveOutputProcessing({})
 40 | 
 41 |     def live_request_handler(self, msg):
 42 |         """
 43 |         This function is called for each conversational interaction made by the user. In fact, this function calls the
 44 |         dispatcher to send the user request to the information seeking components.
 45 | 
 46 |         Args:
 47 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
 48 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
 49 | 
 50 |         Returns:
 51 |             output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user.
 52 |         """
 53 |         msg_db = InteractionDB(host=self.params['interaction_db_host'],
 54 |                                port=self.params['interaction_db_port'],
 55 |                                dbname=self.params['interaction_db_name'])
 56 |         msg_db.insert_one(msg)
 57 |         msg_db.close()
 58 |         self.logger.info(msg)
 59 |         # dispatcher_output = self.request_dispatcher.dispatch(conv_list)
 60 |         # output_msg = self.output_selection.get_output(conv_list, dispatcher_output)
 61 | 
 62 |         self.wizard.send_msg(msg.text)
 63 | 
 64 |     def set_wizard(self, wizard):
 65 |         self.wizard = wizard
 66 | 
 67 |     def send_msg(self, msg_text):
 68 |         self.interface.send_msg(self.params['user_id'], msg_text)
 69 | 
 70 |     def run(self):
 71 |         """
 72 |             This function is called to run the ConvQA system. In live mode, it never stops until the program is killed.
 73 |         """
 74 |         self.interface.run()
 75 | 
 76 | 
 77 | class Wizard:
 78 |     def __init__(self, params):
 79 |         """
 80 |         The constructor for Conversational Question Answering. This is a Conversational application class and is
 81 |         inherited from the CIS class.
 82 | 
 83 |         Args:
 84 |             params(dict): A dict of parameters. These are mandatory parameters for this class: 'logger' which is an
 85 |             instance of the util.logging.Logger class. ConvQA requires both a retrieval and machine reading
 86 |             comprehension engines. Each of them requires some additional parameters. Refer to the corresponding class
 87 |             for more information on the required parameters.
 88 |         """
 89 |         self.params = params
 90 |         self.logger = params['logger']
 91 |         self.logger.info('Conversational Wirzard of Oz System... starting up...')
 92 |         self.params['live_request_handler'] = self.live_request_handler
 93 |         self.seeker = None
 94 | 
 95 |         self.interface = interface.get_interface(params)
 96 | 
 97 |         self.retrieval = retrieval.get_retrieval_model(params=self.params)
 98 |         self.request_dispatcher = RequestDispatcher({'retrieval': self.retrieval})
 99 |         self.output_selection = naive_output_selection.NaiveOutputProcessing({})
100 | 
101 |     def live_request_handler(self, msg):
102 |         """
103 |         This function is called for each conversational interaction made by the user. In fact, this function calls the
104 |         dispatcher to send the user request to the information seeking components.
105 | 
106 |         Args:
107 |             conv_list(list): List of util.msg.Message, each corresponding to a conversational message from / to the
108 |             user. This list is in reverse order, meaning that the first elements is the last interaction made by user.
109 | 
110 |         Returns:
111 |             output_msg(Message): Returns an output message that should be sent to the UI to be presented to the user.
112 |         """
113 |         msg_db = InteractionDB(host=self.params['interaction_db_host'],
114 |                                port=self.params['interaction_db_port'],
115 |                                dbname=self.params['interaction_db_name'])
116 |         msg_db.insert_one(msg)
117 |         self.logger.info(msg)
118 | 
119 |         if msg.text.startswith('@seeker'):
120 |             self.seeker.send_msg(msg.text[7:].strip())
121 |             output_msg = None
122 |         elif msg.text.startswith('@system'):
123 |             msg.text = msg.text[7:].strip()
124 |             dispatcher_output = self.request_dispatcher.dispatch([msg])
125 |             output_msg = self.output_selection.get_output([msg], dispatcher_output)
126 |             msg_db.insert_one(output_msg)
127 |         elif msg.text.startswith('@logger'):
128 |             msg_db.close()
129 |             output_msg = None
130 |         else:
131 |             self.send_msg('The message should starts with @system, @seeker, or @logger')
132 |             output_msg = None
133 | 
134 |         msg_db.close()
135 |         return output_msg
136 | 
137 |     def set_seeker(self, seeker):
138 |         self.seeker = seeker
139 | 
140 |     def send_msg(self, msg_text):
141 |         self.interface.send_msg(self.params['user_id'], msg_text)
142 | 
143 |     def run(self):
144 |         """
145 |             This function is called to run the ConvQA system. In live mode, it never stops until the program is killed.
146 |         """
147 |         self.interface.run()
148 | 
149 | 
if __name__ == '__main__':
    # Entry point of the wizard-of-oz setup: build the parameter dictionaries for the two participants (seeker and
    # wizard), create both objects, link them to each other, and run each one in its own process.

    # Parameters shared by both participants.
    main_logger = Logger({})  # to log into a file, pass the file path to the Logger class.
    basic_params = {'timeout': 15,  # in seconds.
                    'mode': 'live',  # either 'live' or 'exp'.
                    'logger': main_logger}

    # Database parameters, required when the mode is 'live': host and port of the machine that hosts the database,
    # plus the database name.
    db_params = {'interaction_db_host': 'localhost',
                 'interaction_db_port': 27017,
                 'interaction_db_name': 'macaw_test'}

    # Interface parameters of the seeker. These are specific to the chosen interface.
    seeker_interface_params = {'interface': 'telegram',  # 'telegram' or 'stdio'.
                               'bot_token': 'YOUR_TELECGRAM_BOT_TOKEN_FOR_SEEKER',  # Telegram bot token.
                               'asr_model': 'google',  # API used for speech recognition.
                               'asg_model': 'google',  # API used for speech generation.
                               'google-speech-to-text-credential-file': 'YOUR_GOOGLE_CREDENTIAL_FILE',
                               'user_id': 'TELEGRAM_USER_ID_FOR_SEEKER'}

    # The wizard uses the same interface settings except for its own bot token and user id. Overriding existing keys
    # while unpacking keeps the key order of the seeker dictionary.
    wizard_interface_params = {**seeker_interface_params,
                               'bot_token': 'YOUR_TELECGRAM_BOT_TOKEN_FOR_WIZARD',  # Telegram bot token.
                               'user_id': 'TELEGRAM_USER_ID_FOR_WIZARD'}

    # Parameters consumed by the retrieval model.
    retrieval_params = {'query_generation': 'simple',  # model that turns a conversation history into a query.
                        'use_coref': False,  # True if the query generator may use coreference resolution.
                        'search_engine': 'indri',  # either 'indri' or 'bing'.
                        'bing_key': 'YOUR_BING_SUBSCRIPTION_KEY',  # Bing API key.
                        'search_engine_path': 'PATH_TO_INDRI',  # path to the indri toolkit.
                        'col_index': 'PATH_TO_INDRI_INDEX',  # path to the indri index.
                        'col_text_format': 'trectext',  # collection text format; only standard 'trectext' works.
                        'results_requested': 3}  # maximum number of documents the search engine should retrieve.
    # Re-ranking (e.g., learning to rank): extend core.retrieval.search_engine.ReRanker, implement its 'rerank'
    # method, and add a 'reranker' entry to retrieval_params holding an instance of that class. When such an entry
    # exists, the retrieval model calls the re-ranking method automatically; see 'get_results' in
    # core.retrieval.search_engine.Retrieval for details.

    seeker_params = {**basic_params, **db_params, **seeker_interface_params, **retrieval_params}
    wizard_params = {**basic_params, **db_params, **wizard_interface_params, **retrieval_params}
    # NOTE(review): the dictionaries logged here include 'bot_token' and 'bing_key'; consider redacting them if the
    # log output is shared.
    for merged_params in (seeker_params, wizard_params):
        main_logger.info(merged_params)

    # Create both participants and make each one aware of the other.
    seeker = Seeker(seeker_params)
    wizard = Wizard(wizard_params)
    seeker.set_wizard(wizard)
    wizard.set_seeker(seeker)

    # Each participant runs in a separate process; the seeker is started first.
    seeker_process = multiprocessing.Process(target=seeker.run)
    wizard_process = multiprocessing.Process(target=wizard.run)
    for child_process in (seeker_process, wizard_process):
        child_process.start()

    main_logger.info('Seeker Process ID: {}'.format(seeker_process.pid))
    main_logger.info('Wizard Process ID: {}'.format(wizard_process.pid))
209 | 
210 | 


--------------------------------------------------------------------------------