├── .gitignore
├── Image
    ├── NB_weight.PNG
    ├── data_diagram.png
    ├── evaluation_3.PNG
    ├── feature_selection.PNG
    ├── idf.PNG
    ├── probability_function.PNG
    ├── run_naive_bayes.PNG
    ├── run_rf.PNG
    ├── tf.PNG
    ├── tfidf.PNG
    └── training_size.PNG
├── LICENSE
├── README.md
├── aml_config
    ├── conda_dependencies.yml
    ├── docker.compute
    ├── docker.runconfig
    ├── jupyter_notebook_config.py
    ├── local.compute
    ├── local.runconfig
    └── spark_dependencies.yml
├── modules
    ├── __init__.py
    ├── feature_extractor.py
    └── phrase_learning.py
├── notebooks
    ├── Part_1_Data_Preparation.ipynb
    ├── Part_2_Phrase_Learning.ipynb
    └── Part_3_Model_Training_and_Evaluation.ipynb
└── scripts
    ├── naive_bayes.py
    └── random_forest.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 


--------------------------------------------------------------------------------
/Image/NB_weight.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/NB_weight.PNG


--------------------------------------------------------------------------------
/Image/data_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/data_diagram.png


--------------------------------------------------------------------------------
/Image/evaluation_3.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/evaluation_3.PNG


--------------------------------------------------------------------------------
/Image/feature_selection.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/feature_selection.PNG


--------------------------------------------------------------------------------
/Image/idf.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/idf.PNG


--------------------------------------------------------------------------------
/Image/probability_function.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/probability_function.PNG


--------------------------------------------------------------------------------
/Image/run_naive_bayes.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/run_naive_bayes.PNG


--------------------------------------------------------------------------------
/Image/run_rf.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/run_rf.PNG


--------------------------------------------------------------------------------
/Image/tf.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/tf.PNG


--------------------------------------------------------------------------------
/Image/tfidf.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/tfidf.PNG


--------------------------------------------------------------------------------
/Image/training_size.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/training_size.PNG


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation. All rights reserved.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # QnA Matching
 2 | 
 3 | > **NOTE** This content is no longer maintained. Visit the [Azure Machine Learning Notebook](https://github.com/Azure/MachineLearningNotebooks) project for sample Jupyter notebooks for ML and deep learning with Azure Machine Learning.
 4 | 
 5 | 
 6 | ## Link to the Microsoft DOCS site
 7 | 
 8 | The detailed documentation for this Q & A matching example includes the step-by-step walk-through:
 9 | [https://docs.microsoft.com/azure/machine-learning/preview/scenario-qna-matching](https://docs.microsoft.com/azure/machine-learning/preview/scenario-qna-matching)
10 | 
11 | 
12 | ## Link to the Gallery GitHub repository
13 | 
14 | The public GitHub repository for this Q & A matching example contains all the code samples:
15 | [https://github.com/Azure/MachineLearningSamples-QnAMatching](https://github.com/Azure/MachineLearningSamples-QnAMatching)
16 | 
17 | 
18 | ## Overview
19 | 
20 | This example addresses the problem of mapping user questions to pre-existing Question & Answer (Q&A) pairs as is typically provided in a list of Frequently Asked Questions (that is, a FAQ) or in the Q&A pairs present on websites like [Stack Overflow](https://stackoverflow.com/). There are many approaches to match a question to its correct answer, such as finding the answer that is the most similar to the question. However, in this example open ended questions are matched to previously asked questions by assuming that each answer in the FAQ can answer multiple semantically equivalent questions.
21 | 
22 | The key steps required to deliver this solution are as follows:
23 | 
24 | 1. Clean and process text data.
25 | 2. Learn informative phrases, which are multi-word sequences that provide more information when viewed in sequence than when treated independently.
26 | 3. Extract features from text data.
27 | 4. Train text classification models and evaluate model performance.
28 | 
29 | 
30 | ## Key components needed to run this example
31 | 
32 | 1. An [Azure account](https://azure.microsoft.com/free/) (free trials are available).
33 | 2. An installed copy of Azure Machine Learning Workbench with a workspace created.
34 | 3. This example could be run on any compute context. However, it is recommended to run it on a multi-core machine with at least of 16-GB memory and 5-GB disk space.
35 | 
36 | 
37 | ## Data / Telemetry
38 | QnA Matching collects usage data and sends it to Microsoft to help improve our products and services. Read our [privacy statement](http://go.microsoft.com/fwlink/?LinkId=521839) to learn more. 
39 | 
40 | 
41 | ## Contributing
42 | 
43 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
44 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
45 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
46 | 
47 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
48 | a CLA and decorate the PR appropriately (for example, label, comment). Simply follow the instructions
49 | provided by the bot. You will only need to do this once across all repos using our CLA.
50 | 
51 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
52 | For more information, see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
53 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
54 | 
55 | 


--------------------------------------------------------------------------------
/aml_config/conda_dependencies.yml:
--------------------------------------------------------------------------------
 1 | # Conda environment specification. The dependencies defined in this file will be
 2 | # automatically provisioned for runs against docker, VM, and HDI cluster targets.
 3 | 
 4 | # Details about the Conda environment file format:
 5 | # https://conda.io/docs/using/envs.html#create-environment-file-by-hand
 6 | 
 7 | # For Spark packages and configuration, see spark_dependencies.yml.
 8 | 
 9 | name: project_environment
10 | dependencies:
11 |   - python=3.5.2
12 |   - scikit-learn
13 |   - pip:
14 |     - notebook
15 |     - nltk
16 |     # The API for Azure Machine Learning Model Management Service.
17 |     - azure-ml-api-sdk==0.1.0a11
18 | 
19 |     # Helper utilities for calculating dataprofiles from Pandas DataFrames.
20 |     - https://azuremldownloads.blob.core.windows.net/wheels/latest/azureml.pyrecipes.dataframe-1.0.12-py3-none-any.whl?sv=2016-05-31&si=ro-2017&sr=c&sig=xnUdTm0B%2F%2FfknhTaRInBXyu2QTTt8wA3OsXwGVgU%2BJk%3D
21 | 
22 |     # Helper utilities for dealing with Azure ML Workbench Assets.
23 |     - https://azuremldownloads.blob.core.windows.net/wheels/latest/azureml.assets-1.0.0-py3-none-any.whl?sv=2016-05-31&si=ro-2017&sr=c&sig=xnUdTm0B%2F%2FfknhTaRInBXyu2QTTt8wA3OsXwGVgU%2BJk%3D
24 | 


--------------------------------------------------------------------------------
/aml_config/docker.compute:
--------------------------------------------------------------------------------
1 | type: "docker"
2 | baseDockerImage: "microsoft/mmlspark:0.7.91"
3 | 
4 | # Enabling Docker shared volumes will increase increase execution performance,
5 | # but the shared volume feature of Docker isn't stable on Windows yet.
6 | sharedVolumes: true
7 | 


--------------------------------------------------------------------------------
/aml_config/docker.runconfig:
--------------------------------------------------------------------------------
 1 | ArgumentVector:
 2 |   - "$file"
 3 | Target: "docker-python"
 4 | EnvironmentVariables:
 5 |   "EXAMPLE_ENV_VAR": "Example Value"
 6 | Framework: "Python"
 7 | CondaDependenciesFile: "aml_config/conda_dependencies.yml"
 8 | SparkDependenciesFile: "aml_config/spark_dependencies.yml"
 9 | PrepareEnvironment: true
10 | TrackedRun: true


--------------------------------------------------------------------------------
/aml_config/jupyter_notebook_config.py:
--------------------------------------------------------------------------------
  1 | # Configuration file for jupyter-notebook.
  2 | 
  3 | #------------------------------------------------------------------------------
  4 | # Application(SingletonConfigurable) configuration
  5 | #------------------------------------------------------------------------------
  6 | 
  7 | ## This is an application.
  8 | 
  9 | ## The date format used by logging formatters for %(asctime)s
 10 | #c.Application.log_datefmt = '%Y-%m-%d %H:%M:%S'
 11 | 
 12 | ## The Logging format template
 13 | #c.Application.log_format = '[%(name)s]%(highlevel)s %(message)s'
 14 | 
 15 | ## Set the log level by value or name.
 16 | #c.Application.log_level = 30
 17 | 
 18 | #------------------------------------------------------------------------------
 19 | # JupyterApp(Application) configuration
 20 | #------------------------------------------------------------------------------
 21 | 
 22 | ## Base class for Jupyter applications
 23 | 
 24 | ## Answer yes to any prompts.
 25 | #c.JupyterApp.answer_yes = False
 26 | 
 27 | ## Full path of a config file.
 28 | #c.JupyterApp.config_file = ''
 29 | 
 30 | ## Specify a config file to load.
 31 | #c.JupyterApp.config_file_name = ''
 32 | 
 33 | ## Generate default config file.
 34 | #c.JupyterApp.generate_config = False
 35 | 
 36 | #------------------------------------------------------------------------------
 37 | # NotebookApp(JupyterApp) configuration
 38 | #------------------------------------------------------------------------------
 39 | 
 40 | ## Set the Access-Control-Allow-Credentials: true header
 41 | #c.NotebookApp.allow_credentials = False
 42 | 
 43 | ## Set the Access-Control-Allow-Origin header
 44 | #  
 45 | #  Use '*' to allow any origin to access your server.
 46 | #  
 47 | #  Takes precedence over allow_origin_pat.
 48 | #c.NotebookApp.allow_origin = ''
 49 | 
 50 | ## Use a regular expression for the Access-Control-Allow-Origin header
 51 | #  
 52 | #  Requests from an origin matching the expression will get replies with:
 53 | #  
 54 | #      Access-Control-Allow-Origin: origin
 55 | #  
 56 | #  where `origin` is the origin of the request.
 57 | #  
 58 | #  Ignored if allow_origin is set.
 59 | #c.NotebookApp.allow_origin_pat = ''
 60 | 
 61 | ## Whether to allow the user to run the notebook as root.
 62 | #c.NotebookApp.allow_root = False
 63 | 
 64 | ## DEPRECATED use base_url
 65 | #c.NotebookApp.base_project_url = '/'
 66 | 
 67 | ## The base URL for the notebook server.
 68 | #  
 69 | #  Leading and trailing slashes can be omitted, and will automatically be added.
 70 | #c.NotebookApp.base_url = '/'
 71 | 
 72 | ## Specify what command to use to invoke a web browser when opening the notebook.
 73 | #  If not specified, the default browser will be determined by the `webbrowser`
 74 | #  standard library module, which allows setting of the BROWSER environment
 75 | #  variable to override it.
 76 | #c.NotebookApp.browser = ''
 77 | 
 78 | ## The full path to an SSL/TLS certificate file.
 79 | #c.NotebookApp.certfile = ''
 80 | 
 81 | ## The full path to a certificate authority certificate for SSL/TLS client
 82 | #  authentication.
 83 | #c.NotebookApp.client_ca = ''
 84 | 
 85 | ## The config manager class to use
 86 | #c.NotebookApp.config_manager_class = 'notebook.services.config.manager.ConfigManager'
 87 | 
 88 | ## The notebook manager class to use.
 89 | #c.NotebookApp.contents_manager_class = 'notebook.services.contents.largefilemanager.LargeFileManager'
 90 | 
 91 | ## Extra keyword arguments to pass to `set_secure_cookie`. See tornado's
 92 | #  set_secure_cookie docs for details.
 93 | #c.NotebookApp.cookie_options = {}
 94 | 
 95 | ## The random bytes used to secure cookies. By default this is a new random
 96 | #  number every time you start the Notebook. Set it to a value in a config file
 97 | #  to enable logins to persist across server sessions.
 98 | #  
 99 | #  Note: Cookie secrets should be kept private, do not share config files with
100 | #  cookie_secret stored in plaintext (you can read the value from a file).
101 | #c.NotebookApp.cookie_secret = b''
102 | 
103 | ## The file where the cookie secret is stored.
104 | #c.NotebookApp.cookie_secret_file = ''
105 | 
106 | ## The default URL to redirect to from `/`
107 | #c.NotebookApp.default_url = '/tree'
108 | 
109 | ## Disable cross-site-request-forgery protection
110 | #  
111 | #  Jupyter notebook 4.3.1 introduces protection from cross-site request
112 | #  forgeries, requiring API requests to either:
113 | #  
114 | #  - originate from pages served by this server (validated with XSRF cookie and
115 | #  token), or - authenticate with a token
116 | #  
117 | #  Some anonymous compute resources still desire the ability to run code,
118 | #  completely without authentication. These services can disable all
119 | #  authentication and security checks, with the full knowledge of what that
120 | #  implies.
121 | #c.NotebookApp.disable_check_xsrf = False
122 | 
123 | ## Whether to enable MathJax for typesetting math/TeX
124 | #  
125 | #  MathJax is the javascript library Jupyter uses to render math/LaTeX. It is
126 | #  very large, so you may want to disable it if you have a slow internet
127 | #  connection, or for offline use of the notebook.
128 | #  
129 | #  When disabled, equations etc. will appear as their untransformed TeX source.
130 | #c.NotebookApp.enable_mathjax = True
131 | 
132 | ## extra paths to look for Javascript notebook extensions
133 | #c.NotebookApp.extra_nbextensions_path = []
134 | 
135 | ## Extra paths to search for serving static files.
136 | #  
137 | #  This allows adding javascript/css to be available from the notebook server
138 | #  machine, or overriding individual files in the IPython
139 | #c.NotebookApp.extra_static_paths = []
140 | 
141 | ## Extra paths to search for serving jinja templates.
142 | #  
143 | #  Can be used to override templates from notebook.templates.
144 | #c.NotebookApp.extra_template_paths = []
145 | 
146 | ## 
147 | #c.NotebookApp.file_to_run = ''
148 | 
149 | ## Deprecated: Use minified JS file or not, mainly use during dev to avoid JS
150 | #  recompilation
151 | #c.NotebookApp.ignore_minified_js = False
152 | 
153 | ## (bytes/sec) Maximum rate at which messages can be sent on iopub before they
154 | #  are limited.
155 | #c.NotebookApp.iopub_data_rate_limit = 1000000
156 | 
157 | ## (msgs/sec) Maximum rate at which messages can be sent on iopub before they are
158 | #  limited.
159 | #c.NotebookApp.iopub_msg_rate_limit = 1000
160 | 
161 | ## The IP address the notebook server will listen on.
162 | #c.NotebookApp.ip = 'localhost'
163 | 
164 | ## Supply extra arguments that will be passed to Jinja environment.
165 | #c.NotebookApp.jinja_environment_options = {}
166 | 
167 | ## Extra variables to supply to jinja templates when rendering.
168 | #c.NotebookApp.jinja_template_vars = {}
169 | 
170 | ## The kernel manager class to use.
171 | #c.NotebookApp.kernel_manager_class = 'notebook.services.kernels.kernelmanager.MappingKernelManager'
172 | 
173 | ## The kernel spec manager class to use. Should be a subclass of
174 | #  `jupyter_client.kernelspec.KernelSpecManager`.
175 | #  
176 | #  The Api of KernelSpecManager is provisional and might change without warning
177 | #  between this version of Jupyter and the next stable one.
178 | #c.NotebookApp.kernel_spec_manager_class = 'jupyter_client.kernelspec.KernelSpecManager'
179 | 
180 | ## The full path to a private key file for usage with SSL/TLS.
181 | #c.NotebookApp.keyfile = ''
182 | 
183 | ## The login handler class to use.
184 | #c.NotebookApp.login_handler_class = 'notebook.auth.login.LoginHandler'
185 | 
186 | ## The logout handler class to use.
187 | #c.NotebookApp.logout_handler_class = 'notebook.auth.logout.LogoutHandler'
188 | 
189 | ## The MathJax.js configuration file that is to be used.
190 | #c.NotebookApp.mathjax_config = 'TeX-AMS-MML_HTMLorMML-full,Safe'
191 | 
192 | ## A custom url for MathJax.js. Should be in the form of a case-sensitive url to
193 | #  MathJax, for example:  /static/components/MathJax/MathJax.js
194 | #c.NotebookApp.mathjax_url = ''
195 | 
196 | ## Dict of Python modules to load as notebook server extensions.Entry values can
197 | #  be used to enable and disable the loading ofthe extensions. The extensions
198 | #  will be loaded in alphabetical order.
199 | #c.NotebookApp.nbserver_extensions = {}
200 | 
201 | ## The directory to use for notebooks and kernels.
202 | #c.NotebookApp.notebook_dir = ''
203 | 
204 | ## Whether to open in a browser after starting. The specific browser used is
205 | #  platform dependent and determined by the python standard library `webbrowser`
206 | #  module, unless it is overridden using the --browser (NotebookApp.browser)
207 | #  configuration option.
208 | #c.NotebookApp.open_browser = True
209 | 
210 | ## Hashed password to use for web authentication.
211 | #  
212 | #  To generate, type in a python/IPython shell:
213 | #  
214 | #    from notebook.auth import passwd; passwd()
215 | #  
216 | #  The string should be of the form type:salt:hashed-password.
217 | #c.NotebookApp.password = ''
218 | 
219 | ## Forces users to use a password for the Notebook server. This is useful in a
220 | #  multi user environment, for instance when everybody in the LAN can access each
221 | #  other's machine though ssh.
222 | #  
223 | #  In such a case, server the notebook server on localhost is not secure since
224 | #  any user can connect to the notebook server via ssh.
225 | #c.NotebookApp.password_required = False
226 | 
227 | ## The port the notebook server will listen on.
228 | #c.NotebookApp.port = 8888
229 | 
230 | ## The number of additional ports to try if the specified port is not available.
231 | #c.NotebookApp.port_retries = 50
232 | 
233 | ## DISABLED: use %pylab or %matplotlib in the notebook to enable matplotlib.
234 | #c.NotebookApp.pylab = 'disabled'
235 | 
236 | ## (sec) Time window used to  check the message and data rate limits.
237 | #c.NotebookApp.rate_limit_window = 3
238 | 
239 | ## Reraise exceptions encountered loading server extensions?
240 | #c.NotebookApp.reraise_server_extension_failures = False
241 | 
242 | ## DEPRECATED use the nbserver_extensions dict instead
243 | #c.NotebookApp.server_extensions = []
244 | 
245 | ## The session manager class to use.
246 | #c.NotebookApp.session_manager_class = 'notebook.services.sessions.sessionmanager.SessionManager'
247 | 
248 | ## Supply SSL options for the tornado HTTPServer. See the tornado docs for
249 | #  details.
250 | #c.NotebookApp.ssl_options = {}
251 | 
252 | ## Supply overrides for terminado. Currently only supports "shell_command".
253 | #c.NotebookApp.terminado_settings = {}
254 | 
255 | ## Token used for authenticating first-time connections to the server.
256 | #  
257 | #  When no password is enabled, the default is to generate a new, random token.
258 | #  
259 | #  Setting to an empty string disables authentication altogether, which is NOT
260 | #  RECOMMENDED.
261 | #c.NotebookApp.token = '<generated>'
262 | 
263 | ## Supply overrides for the tornado.web.Application that the Jupyter notebook
264 | #  uses.
265 | #c.NotebookApp.tornado_settings = {}
266 | 
267 | ## Whether to trust or not X-Scheme/X-Forwarded-Proto and X-Real-Ip/X-Forwarded-
268 | #  For headerssent by the upstream reverse proxy. Necessary if the proxy handles
269 | #  SSL
270 | #c.NotebookApp.trust_xheaders = False
271 | 
272 | ## DEPRECATED, use tornado_settings
273 | #c.NotebookApp.webapp_settings = {}
274 | 
275 | ## The base URL for websockets, if it differs from the HTTP server (hint: it
276 | #  almost certainly doesn't).
277 | #  
278 | #  Should be in the form of an HTTP origin: ws[s]://hostname[:port]
279 | #c.NotebookApp.websocket_url = ''
280 | 
281 | #------------------------------------------------------------------------------
282 | # ConnectionFileMixin(LoggingConfigurable) configuration
283 | #------------------------------------------------------------------------------
284 | 
285 | ## Mixin for configurable classes that work with connection files
286 | 
287 | ## JSON file in which to store connection info [default: kernel-<pid>.json]
288 | #  
289 | #  This file will contain the IP, ports, and authentication key needed to connect
290 | #  clients to this kernel. By default, this file will be created in the security
291 | #  dir of the current profile, but can be specified by absolute path.
292 | #c.ConnectionFileMixin.connection_file = ''
293 | 
294 | ## set the control (ROUTER) port [default: random]
295 | #c.ConnectionFileMixin.control_port = 0
296 | 
297 | ## set the heartbeat port [default: random]
298 | #c.ConnectionFileMixin.hb_port = 0
299 | 
300 | ## set the iopub (PUB) port [default: random]
301 | #c.ConnectionFileMixin.iopub_port = 0
302 | 
303 | ## Set the kernel's IP address [default localhost]. If the IP address is
304 | #  something other than localhost, then Consoles on other machines will be able
305 | #  to connect to the Kernel, so be careful!
306 | #c.ConnectionFileMixin.ip = ''
307 | 
308 | ## set the shell (ROUTER) port [default: random]
309 | #c.ConnectionFileMixin.shell_port = 0
310 | 
311 | ## set the stdin (ROUTER) port [default: random]
312 | #c.ConnectionFileMixin.stdin_port = 0
313 | 
314 | ## 
315 | #c.ConnectionFileMixin.transport = 'tcp'
316 | 
317 | #------------------------------------------------------------------------------
318 | # KernelManager(ConnectionFileMixin) configuration
319 | #------------------------------------------------------------------------------
320 | 
321 | ## Manages a single kernel in a subprocess on this host.
322 | #  
323 | #  This version starts kernels with Popen.
324 | 
325 | ## Should we autorestart the kernel if it dies.
326 | #c.KernelManager.autorestart = True
327 | 
328 | ## DEPRECATED: Use kernel_name instead.
329 | #  
330 | #  The Popen Command to launch the kernel. Override this if you have a custom
331 | #  kernel. If kernel_cmd is specified in a configuration file, Jupyter does not
332 | #  pass any arguments to the kernel, because it cannot make any assumptions about
333 | #  the arguments that the kernel understands. In particular, this means that the
334 | #  kernel does not receive the option --debug if it given on the Jupyter command
335 | #  line.
336 | #c.KernelManager.kernel_cmd = []
337 | 
338 | ## Time to wait for a kernel to terminate before killing it, in seconds.
339 | #c.KernelManager.shutdown_wait_time = 5.0
340 | 
341 | #------------------------------------------------------------------------------
342 | # Session(Configurable) configuration
343 | #------------------------------------------------------------------------------
344 | 
345 | ## Object for handling serialization and sending of messages.
346 | #  
347 | #  The Session object handles building messages and sending them with ZMQ sockets
348 | #  or ZMQStream objects.  Objects can communicate with each other over the
349 | #  network via Session objects, and only need to work with the dict-based IPython
350 | #  message spec. The Session will handle serialization/deserialization, security,
351 | #  and metadata.
352 | #  
353 | #  Sessions support configurable serialization via packer/unpacker traits, and
354 | #  signing with HMAC digests via the key/keyfile traits.
355 | #  
356 | #  Parameters ----------
357 | #  
358 | #  debug : bool
359 | #      whether to trigger extra debugging statements
360 | #  packer/unpacker : str : 'json', 'pickle' or import_string
361 | #      importstrings for methods to serialize message parts.  If just
362 | #      'json' or 'pickle', predefined JSON and pickle packers will be used.
363 | #      Otherwise, the entire importstring must be used.
364 | #  
365 | #      The functions must accept at least valid JSON input, and output *bytes*.
366 | #  
367 | #      For example, to use msgpack:
368 | #      packer = 'msgpack.packb', unpacker='msgpack.unpackb'
369 | #  pack/unpack : callables
370 | #      You can also set the pack/unpack callables for serialization directly.
371 | #  session : bytes
372 | #      the ID of this Session object.  The default is to generate a new UUID.
373 | #  username : unicode
374 | #      username added to message headers.  The default is to ask the OS.
375 | #  key : bytes
376 | #      The key used to initialize an HMAC signature.  If unset, messages
377 | #      will not be signed or checked.
378 | #  keyfile : filepath
379 | #      The file containing a key.  If this is set, `key` will be initialized
380 | #      to the contents of the file.
381 | 
382 | ## Threshold (in bytes) beyond which an object's buffer should be extracted to
383 | #  avoid pickling.
384 | #c.Session.buffer_threshold = 1024
385 | 
386 | ## Whether to check PID to protect against calls after fork.
387 | #  
388 | #  This check can be disabled if fork-safety is handled elsewhere.
389 | #c.Session.check_pid = True
390 | 
391 | ## Threshold (in bytes) beyond which a buffer should be sent without copying.
392 | #c.Session.copy_threshold = 65536
393 | 
394 | ## Debug output in the Session
395 | #c.Session.debug = False
396 | 
397 | ## The maximum number of digests to remember.
398 | #  
399 | #  The digest history will be culled when it exceeds this value.
400 | #c.Session.digest_history_size = 65536
401 | 
402 | ## The maximum number of items for a container to be introspected for custom
403 | #  serialization. Containers larger than this are pickled outright.
404 | #c.Session.item_threshold = 64
405 | 
406 | ## execution key, for signing messages.
407 | #c.Session.key = b''
408 | 
409 | ## path to file containing execution key.
410 | #c.Session.keyfile = ''
411 | 
412 | ## Metadata dictionary, which serves as the default top-level metadata dict for
413 | #  each message.
414 | #c.Session.metadata = {}
415 | 
416 | ## The name of the packer for serializing messages. Should be one of 'json',
417 | #  'pickle', or an import name for a custom callable serializer.
418 | #c.Session.packer = 'json'
419 | 
420 | ## The UUID identifying this session.
421 | #c.Session.session = ''
422 | 
423 | ## The digest scheme used to construct the message signatures. Must have the form
424 | #  'hmac-HASH'.
425 | #c.Session.signature_scheme = 'hmac-sha256'
426 | 
427 | ## The name of the unpacker for unserializing messages. Only used with custom
428 | #  functions for `packer`.
429 | #c.Session.unpacker = 'json'
430 | 
431 | ## Username for the Session. Default is your system username.
432 | #c.Session.username = 'username'
433 | 
434 | #------------------------------------------------------------------------------
435 | # MultiKernelManager(LoggingConfigurable) configuration
436 | #------------------------------------------------------------------------------
437 | 
438 | ## A class for managing multiple kernels.
439 | 
440 | ## The name of the default kernel to start
441 | #c.MultiKernelManager.default_kernel_name = 'python3'
442 | 
443 | ## The kernel manager class.  This is configurable to allow subclassing of the
444 | #  KernelManager for customized behavior.
445 | #c.MultiKernelManager.kernel_manager_class = 'jupyter_client.ioloop.IOLoopKernelManager'
446 | 
447 | #------------------------------------------------------------------------------
448 | # MappingKernelManager(MultiKernelManager) configuration
449 | #------------------------------------------------------------------------------
450 | 
451 | ## A KernelManager that handles notebook mapping and HTTP error handling
452 | 
453 | ## 
454 | #c.MappingKernelManager.root_dir = ''
455 | 
456 | #------------------------------------------------------------------------------
457 | # ContentsManager(LoggingConfigurable) configuration
458 | #------------------------------------------------------------------------------
459 | 
460 | ## Base class for serving files and directories.
461 | #  
462 | #  This serves any text or binary file, as well as directories, with special
463 | #  handling for JSON notebook documents.
464 | #  
465 | #  Most APIs take a path argument, which is always an API-style unicode path, and
466 | #  always refers to a directory.
467 | #  
468 | #  - unicode, not url-escaped
469 | #  - '/'-separated
470 | #  - leading and trailing '/' will be stripped
471 | #  - if unspecified, path defaults to '',
472 | #    indicating the root path.
473 | 
474 | ## 
475 | #c.ContentsManager.checkpoints = None
476 | 
477 | ## 
478 | #c.ContentsManager.checkpoints_class = 'notebook.services.contents.checkpoints.Checkpoints'
479 | 
480 | ## 
481 | #c.ContentsManager.checkpoints_kwargs = {}
482 | 
483 | ## Glob patterns to hide in file and directory listings.
484 | #c.ContentsManager.hide_globs = ['__pycache__', '*.pyc', '*.pyo', '.DS_Store', '*.so', '*.dylib', '*~']
485 | 
486 | ## Python callable or importstring thereof
487 | #  
488 | #  To be called on a contents model prior to save.
489 | #  
490 | #  This can be used to process the structure, such as removing notebook outputs
491 | #  or other side effects that should not be saved.
492 | #  
493 | #  It will be called as (all arguments passed by keyword)::
494 | #  
495 | #      hook(path=path, model=model, contents_manager=self)
496 | #  
497 | #  - model: the model to be saved. Includes file contents.
498 | #    Modifying this dict will affect the file that is stored.
499 | #  - path: the API path of the save destination
500 | #  - contents_manager: this ContentsManager instance
501 | #c.ContentsManager.pre_save_hook = None
502 | 
503 | ## 
504 | #c.ContentsManager.root_dir = '/'
505 | 
506 | ## The base name used when creating untitled directories.
507 | #c.ContentsManager.untitled_directory = 'Untitled Folder'
508 | 
509 | ## The base name used when creating untitled files.
510 | #c.ContentsManager.untitled_file = 'untitled'
511 | 
512 | ## The base name used when creating untitled notebooks.
513 | #c.ContentsManager.untitled_notebook = 'Untitled'
514 | 
515 | #------------------------------------------------------------------------------
516 | # FileManagerMixin(Configurable) configuration
517 | #------------------------------------------------------------------------------
518 | 
519 | ## Mixin for ContentsAPI classes that interact with the filesystem.
520 | #  
521 | #  Provides facilities for reading, writing, and copying both notebooks and
522 | #  generic files.
523 | #  
524 | #  Shared by FileContentsManager and FileCheckpoints.
525 | #  
526 | #  Note ---- Classes using this mixin must provide the following attributes:
527 | #  
528 | #  root_dir : unicode
529 | #      A directory against against which API-style paths are to be resolved.
530 | #  
531 | #  log : logging.Logger
532 | 
533 | ## By default notebooks are saved on disk on a temporary file and then if
534 | #  succefully written, it replaces the old ones. This procedure, namely
535 | #  'atomic_writing', causes some bugs on file system whitout operation order
536 | #  enforcement (like some networked fs). If set to False, the new notebook is
537 | #  written directly on the old one which could fail (eg: full filesystem or quota
538 | #  )
539 | #c.FileManagerMixin.use_atomic_writing = True
540 | 
541 | #------------------------------------------------------------------------------
542 | # FileContentsManager(FileManagerMixin,ContentsManager) configuration
543 | #------------------------------------------------------------------------------
544 | 
545 | ## Python callable or importstring thereof
546 | #  
547 | #  to be called on the path of a file just saved.
548 | #  
549 | #  This can be used to process the file on disk, such as converting the notebook
550 | #  to a script or HTML via nbconvert.
551 | #  
552 | #  It will be called as (all arguments passed by keyword)::
553 | #  
554 | #      hook(os_path=os_path, model=model, contents_manager=instance)
555 | #  
556 | #  - path: the filesystem path to the file just written - model: the model
557 | #  representing the file - contents_manager: this ContentsManager instance
558 | #c.FileContentsManager.post_save_hook = None
559 | 
560 | ## 
561 | #c.FileContentsManager.root_dir = ''
562 | 
563 | ## DEPRECATED, use post_save_hook. Will be removed in Notebook 5.0
564 | #c.FileContentsManager.save_script = False
565 | 
566 | #------------------------------------------------------------------------------
567 | # NotebookNotary(LoggingConfigurable) configuration
568 | #------------------------------------------------------------------------------
569 | 
570 | ## A class for computing and verifying notebook signatures.
571 | 
572 | ## The hashing algorithm used to sign notebooks.
573 | #c.NotebookNotary.algorithm = 'sha256'
574 | 
575 | ## The sqlite file in which to store notebook signatures. By default, this will
576 | #  be in your Jupyter data directory. You can set it to ':memory:' to disable
577 | #  sqlite writing to the filesystem.
578 | #c.NotebookNotary.db_file = ''
579 | 
580 | ## The secret key with which notebooks are signed.
581 | #c.NotebookNotary.secret = b''
582 | 
583 | ## The file where the secret key is stored.
584 | #c.NotebookNotary.secret_file = ''
585 | 
586 | ## A callable returning the storage backend for notebook signatures. The default
587 | #  uses an SQLite database.
588 | #c.NotebookNotary.store_factory = traitlets.Undefined
589 | 
590 | #------------------------------------------------------------------------------
591 | # KernelSpecManager(LoggingConfigurable) configuration
592 | #------------------------------------------------------------------------------
593 | 
594 | ## If there is no Python kernelspec registered and the IPython kernel is
595 | #  available, ensure it is added to the spec list.
596 | #c.KernelSpecManager.ensure_native_kernel = True
597 | 
598 | ## The kernel spec class.  This is configurable to allow subclassing of the
599 | #  KernelSpecManager for customized behavior.
600 | #c.KernelSpecManager.kernel_spec_class = 'jupyter_client.kernelspec.KernelSpec'
601 | 
602 | ## Whitelist of allowed kernel names.
603 | #  
604 | #  By default, all installed kernels are allowed.
605 | #c.KernelSpecManager.whitelist = set()
606 | 


--------------------------------------------------------------------------------
/aml_config/local.compute:
--------------------------------------------------------------------------------
1 | type: "local"
2 | pythonLocation: "python"
3 | sparkSubmitLocation: "spark-submit"
4 | nativeSharedDirectory: "~/.azureml/share/"


--------------------------------------------------------------------------------
/aml_config/local.runconfig:
--------------------------------------------------------------------------------
 1 | ArgumentVector:
 2 |   - "$file"
 3 | Target: "local"
 4 | EnvironmentVariables:
 5 |   "EXAMPLE_ENV_VAR": "Example Value"
 6 | Framework: "Python"
 7 | CondaDependenciesFile: "aml_config/conda_dependencies.yml"
 8 | SparkDependenciesFile: "aml_config/spark_dependencies.yml"
 9 | PrepareEnvironment: true
10 | TrackedRun: true


--------------------------------------------------------------------------------
/aml_config/spark_dependencies.yml:
--------------------------------------------------------------------------------
 1 | # Spark configuration and packages specification. The dependencies defined in
 2 | # this file will be automatically provisioned for each run that uses Spark.
 3 | 
 4 | # Spark configuration values can be set through the configuration dictionary.
 5 | # Spark packages can be added through the repositories and packages lists.
 6 | 
 7 | # For third-party python libraries, see conda_dependencies.yml.
 8 | 
 9 | configuration:
10 |   "spark.app.name": "AzureML Experiment"
11 | repositories:
12 |   - "https://mmlspark.azureedge.net/maven"
13 | packages:
14 |   - group: "com.microsoft.ml.spark"
15 |     artifact: "mmlspark_2.11"
16 |     version: "0.7.91"
17 | 


--------------------------------------------------------------------------------
/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .feature_extractor import *
2 | from .phrase_learning import *


--------------------------------------------------------------------------------
/modules/feature_extractor.py:
--------------------------------------------------------------------------------
  1 | ################################################
  2 | # User Defined Functions for Feature Extraction
  3 | ################################################
  4 | 
  5 | import pandas as pd
  6 | import numpy as np
  7 | from azureml.logging import get_azureml_logger
  8 | 
  9 | run_logger = get_azureml_logger()
 10 | run_logger.log('amlrealworld.QnA-matching.feature-extractor','true')
 11 | 
 12 | 
 13 | # get Token to ID mapping: {Token: tokenId}
 14 | def tokensToIds(tokens, featureHash):
 15 |     token2IdHash = {}
 16 |     for i in range(len(tokens)):
 17 |         tokenList = tokens.iloc[i].split(',')
 18 |         if featureHash is None:
 19 |             for t in tokenList:
 20 |                 if t not in token2IdHash.keys():
 21 |                     token2IdHash[t] = len(token2IdHash)
 22 |         else:
 23 |             for t in tokenList:
 24 |                 if t not in token2IdHash.keys() and t in list(featureHash.keys()):
 25 |                     token2IdHash[t] = len(token2IdHash)
 26 |             
 27 |     return token2IdHash
 28 | 
 29 | # create a matrix to store the token frequency.
 30 | def countMatrix(frame, token2IdHash, labelColumnName=None, uniqueLabel=None):
 31 |     # create am empty matrix with the shape of:
 32 |     # num_row = num of unique tokens
 33 |     # num_column = num of unique answerIds (N_wA) or num of questions in testQ (tfMatrix)
 34 |     # rowIdx = token2IdHash.values()
 35 |     # colIdx = index of uniqueClass (N_wA) or index of questions in testQ (tfMatrix)
 36 |     num_row = len(token2IdHash)
 37 |     if uniqueLabel is not None:  # get N_wA
 38 |         num_column = len(uniqueLabel)
 39 |     else:
 40 |         num_column = len(frame)
 41 |     countMatrix = np.zeros(shape=(num_row, num_column))
 42 | 
 43 |     # loop through each question in the frame to fill in the countMatrix with corresponding counts
 44 |     for i in range(len(frame)):
 45 |         tokens = frame['Tokens'].iloc[i].split(',')
 46 |         if uniqueLabel is not None:   # get N_wA
 47 |             label = frame[labelColumnName].iloc[i]
 48 |             colIdx = uniqueLabel.index(label)
 49 |         else:
 50 |             colIdx = i
 51 |             
 52 |         for t in tokens:
 53 |             if t in token2IdHash.keys():
 54 |                 rowIdx = token2IdHash[t]
 55 |                 countMatrix[rowIdx, colIdx] += 1
 56 | 
 57 |     return countMatrix
 58 | 
 59 | # calculate the prior probability of each answer class P(A): [P_A1, P_A2, ...]
 60 | def priorProbabilityAnswer(answerIds, uniqueLabel): 
 61 |     P_A = []
 62 |     # convert a pandas series to a list
 63 |     answerIds = list(answerIds)
 64 |     
 65 |     for id in uniqueLabel:
 66 |         P_A.append(answerIds.count(id)/len(answerIds))
 67 |     return np.array(P_A)
 68 | 
 69 | # calculate the conditional probability of each answer class by giving a token P(A|w).
 70 | def posterioriProb(N_wAInit, P_A, uniqueLabel):
 71 |     # N_A is the total number of answers
 72 |     N_A = len(uniqueLabel)
 73 |     # N_w is the total number of times w appears over all documents 
 74 |     # rowSum of count matrix (N_wAInit)
 75 |     N_wInit = np.sum(N_wAInit, axis = 1)
 76 |     # P(A|w) = (N_w|A + N_A * P(A))/(N_w + N_A)
 77 |     N = N_wAInit + N_A * P_A
 78 |     D = N_wInit + N_A
 79 |     P_Aw = np.divide(N.T, D).T    
 80 |     
 81 |     return P_Aw
 82 | 
 83 | # select the top N tokens w which maximize P(A|w) for each A.
 84 | # get FeatureHash: {token: 1}
 85 | def feature_selection(P_Aw, token2IdHashInit, topN):
 86 |     featureHash = {}
 87 |     # for each answer A, sort tokens w by P(A|w)
 88 |     sortedIdxMatrix = np.argsort(P_Aw, axis=0)[::-1]
 89 |     # select top N tokens for each answer A
 90 |     topMatrix = sortedIdxMatrix[0:topN, :]
 91 |     # for each token w in topMatrix, add w to FeatureHash if it has not already been included
 92 |     topTokenIdList = np.reshape(topMatrix, topMatrix.shape[0] * topMatrix.shape[1])
 93 |     # get ID to Token mapping: {tokenId: Token}
 94 |     Id2TokenHashInit = {y:x for x, y in token2IdHashInit.items()}
 95 |     
 96 |     for tokenId in topTokenIdList:
 97 |         token = Id2TokenHashInit[tokenId]
 98 |         if token not in featureHash.keys():
 99 |             featureHash[token] = 1
100 |     return featureHash
101 | 
102 | # calculate the weight for each feature. 
103 | def featureWeights(N_wA, alpha):
104 |     # N_w is the total number of times w appears over all documents 
105 |     # rowSum of count matrix (N_wA)
106 |     N_w = np.sum(N_wA, axis = 1)
107 |     # N_W is the total count of all words
108 |     N_W = np.sum(N_wA)
109 |     # N_V is the count of unique words in the vocabulary
110 |     N_V = N_wA.shape[0]
111 |     # P(w) = (N_w + 1*alpha) / (N_W +N_V*alpha)
112 |     N2 = N_w + 1 * alpha
113 |     D2 = N_W + alpha * N_V
114 |     P_w = N2/D2
115 | 
116 |     return P_w
117 | 
118 | # calculate the conditional probability of each token within an answer class P(A|w).
119 | def wordProbabilityInAnswer(N_wA, P_w, beta):
120 |     # N_V is the count of unique words in the vocabulary
121 |     N_V = N_wA.shape[0]
122 |     # N_WA is the total count of all words in questions on answer A 
123 |     # colSum of count matrix (N_wA)
124 |     N_WA = np.sum(N_wA, axis=0)
125 |     # P(w|A) = (N_w|A + beta N_V P(w))/(N_W|A + beta * N_V)
126 |     N = (N_wA.T + beta * N_V * P_w).T
127 |     D = N_WA + beta * N_V
128 |     P_wA = N / D
129 |     
130 |     return P_wA
131 | 
132 | # calculate the conditional probability of each token not within an answer class P(notA|w).
133 | def wordProbabilityNotinAnswer(N_wA, P_w, beta):
134 |     # N_V is the count of unique words in the vocabulary
135 |     N_V = N_wA.shape[0]
136 |     # N_wNotA is the count of w over all documents but not on answer A
137 |     # N_wNotA = N_w - N_wA
138 |     N_w = np.sum(N_wA, axis = 1)
139 |     N_wNotA = (N_w - N_wA.T).T
140 |     # N_WNotA is the count of all words over all documents but not on answer A
141 |     # N_WNotA = N_W - N_WA
142 |     N_W = np.sum(N_wA)
143 |     N_WA = np.sum(N_wA, axis=0)
144 |     N_WNotA = N_W - N_WA
145 |     # P(w|NotA) = (N_w|NotA + beta * N_V * P(w))/(N_W|NotA + beta * N_V)
146 |     N = (N_wNotA.T + beta * N_V * P_w).T
147 |     D = N_WNotA + beta * N_V
148 |     P_wNotA = N / D
149 |     
150 |     return P_wNotA
151 | 
152 | # calculate the normalized Term Frequency.
153 | def normalizeTF(frame, token2IdHash):
154 |     
155 |     N_wQ = countMatrix(frame, token2IdHash)
156 |     N_WQ = np.sum(N_wQ, axis=0)
157 |     
158 |     # find the index where N_WQ is zero
159 |     zeroIdx = np.where(N_WQ == 0)[0]
160 |     
161 |     # if N_WQ is zero, then the x_w for that particular question would be zero.
162 |     # for a simple calculation, we convert the N_WQ to 1 in those cases so the demoninator is not zero. 
163 |     if len(zeroIdx) > 0:
164 |         N_WQ[zeroIdx] = 1
165 |     
166 |     # x_w = P_wd = count(w)/sum(count(i in V))
167 |     x_w = N_wQ / N_WQ
168 |     
169 |     return x_w
170 | 
171 | # calculate the Inverse Document Frequency.
172 | def getIDF(N_wQ):
173 |     # N is total number of documents in the corpus
174 |     # N_V is the number of tokens in the vocabulary
175 |     N_V, N = N_wQ.shape
176 |     # D is the number of documents where the token w appears
177 |     D = np.zeros(shape=(0, N_V))
178 |     for i in range(N_V):
179 |         D = np.append(D, len(np.nonzero(N_wQ[i, ])[0]))
180 |     return np.log(N/D)
181 | 
182 | # create a softmax function.
183 | def softmax(scores2D):
184 |     # input: scores from different models
185 |     # row: test example
186 |     # column: label
187 |     return np.exp(scores2D)/np.sum(np.exp(scores2D), axis=1)[:, None]


--------------------------------------------------------------------------------
/modules/phrase_learning.py:
--------------------------------------------------------------------------------
  1 | ##############################################
  2 | # User Defined Functions for Phrase Learning
  3 | ##############################################
  4 | 
  5 | import pandas as pd
  6 | import numpy as np
  7 | import re, nltk, time, gc, math
  8 | from azureml.logging import get_azureml_logger
  9 | 
 10 | run_logger = get_azureml_logger()
 11 | run_logger.log('amlrealworld.QnA-matching.phrase-learning','true')
 12 | 
 13 | 
 14 | def CleanAndSplitText(frame):
 15 |     
 16 |     global EMPTY, SPACE, NLTK_PUNKT_EN, SENTENCE_BREAKER
 17 |     EMPTY = ''
 18 |     SPACE = ' '
 19 |     nltk.download("punkt")
 20 |     NLTK_PUNKT_EN = 'tokenizers/punkt/english.pickle'
 21 |     SENTENCE_BREAKER = nltk.data.load(NLTK_PUNKT_EN)
 22 | 
 23 |     textDataOut = [] 
 24 | 
 25 |     # This regular expression is for punctuation that we wish to clean out
 26 |     # We also will split sentences into smaller phrase like units using this expression
 27 |     rePhraseBreaks = re.compile("[\"\!\?\)\]\}\,\:\;\*\-]*\s+\([0-9]+\)\s+[\(\[\{\"\*\-]*"                         
 28 |                                 "|[\"\!\?\)\]\}\,\:\;\*\-]+\s+[\(\[\{\"\*\-]*" 
 29 |                                 "|\.\.+"       # ..
 30 |                                 "|\s*\-\-+\s*" # --
 31 |                                 "|\s+\-\s+"    # -  
 32 |                                 "|\:\:+"       # ::
 33 |                                 "|\s+[\/\(\[\{\"\-\*]+\s*"  
 34 |                                 "|[\,!\?\"\)\(\]\[\}\{\:\;\*](?=[a-zA-Z])"
 35 |                                 "|[\"\!\?\)\]\}\,\:\;]+[\.]*$"
 36 |                              )
 37 |     
 38 |     # Regex for underbars
 39 |     regexUnderbar = re.compile('_|_+')
 40 |     
 41 |     # Regex for space
 42 |     regexSpace = re.compile(' +')
 43 |  
 44 |     # Regex for sentence final period
 45 |     regexPeriod = re.compile("\.$")
 46 |     
 47 |     # Regex for parentheses
 48 |     regexParentheses = re.compile("\(\$?")
 49 |     
 50 |     # Regex for equal sign
 51 |     regexEqual = re.compile("=")
 52 | 
 53 |     # Iterate through each document and do:
 54 |     #    (1) Split documents into sections based on section headers and remove section headers
 55 |     #    (2) Split the sections into sentences using NLTK sentence tokenizer
 56 |     #    (3) Further split sentences into phrasal units based on punctuation and remove punctuation
 57 |     #    (4) Remove sentence final periods when not part of a abbreviation 
 58 | 
 59 |     for i in range(0,len(frame)):
 60 |         
 61 |         # Extract one document from frame
 62 |         docID = frame.index.values[i]
 63 |         docText = frame['Text'].iloc[i] 
 64 | 
 65 |         # Set counter for output line count for this document
 66 |         lineIndex=0
 67 | 
 68 |         sentences = SENTENCE_BREAKER.tokenize(docText)
 69 |         
 70 |         for sentence in sentences:
 71 | 
 72 |             # Split each sentence into phrase level chunks based on punctuation
 73 |             textSegs = rePhraseBreaks.split(sentence)
 74 |             numSegs = len(textSegs)
 75 | 
 76 |             for j in range(0,numSegs):
 77 |                 if len(textSegs[j])>0:
 78 |                     # Convert underbars to spaces 
 79 |                     # Underbars are reserved for building the compound word phrases                   
 80 |                     textSegs[j] = regexUnderbar.sub(" ",textSegs[j])
 81 |                     
 82 |                     # Split out the words so we can specially handle the last word
 83 |                     words = regexSpace.split(textSegs[j])
 84 |                     
 85 |                     # Remove parentheses and equal signs
 86 |                     words = [regexEqual.sub("", regexParentheses.sub("", w)) for w in words]
 87 |                     
 88 |                     phraseOut = ""
 89 |                     last = len(words) -1
 90 |                     for i in range(0, last):
 91 |                         phraseOut += words[i] + " "
 92 |                     # If the last word ends in a period then remove the period
 93 |                     lastWord = regexPeriod.sub("", words[last])
 94 |                     # If the last word is an abbreviation like "U.S."
 95 |                     # then add the word final perios back on
 96 |                     if "\." in lastWord:
 97 |                         lastWord += "."
 98 |                     phraseOut += lastWord    
 99 | 
100 |                     textDataOut.append([docID,lineIndex,phraseOut, phraseOut.lower()])
101 |                     lineIndex += 1
102 |                         
103 |     # Convert to pandas frame 
104 |     frameOut = pd.DataFrame(textDataOut, columns=['DocID','DocLine','CleanedText', 'LowercaseText'])                      
105 |     
106 |     return frameOut
107 | 
108 | # count the number of occurances of all 2-gram, 3-ngram, and 4-gram word sequences.
109 | def ComputeNgramStats(textData,functionwordHash,blacklistHash):
110 |     
111 |     # Create an array to store the total count of all ngrams up to 4-grams
112 |     # Array element 0 is unused, element 1 is unigrams, element 2 is bigrams, etc.
113 |     ngramCounts = [0]*5;
114 |        
115 |     # Create a list of structures to tabulate ngram count statistics
116 |     # Array element 0 is the array of total ngram counts,
117 |     # Array element 1 is a hash table of individual unigram counts
118 |     # Array element 2 is a hash table of individual bigram counts
119 |     # Array element 3 is a hash table of individual trigram counts
120 |     # Array element 4 is a hash table of individual 4-gram counts
121 |     ngramStats = [ngramCounts, {}, {}, {}, {}]
122 |           
123 |     # Create a regular expression for assessing validity of words
124 |     # for phrase modeling. The expression says words in phrases
125 |     # must either:
126 |     # (1) contain an alphabetic character, or 
127 |     # (2) be the single charcater '&', or
128 |     # (3) be a one or two digit number
129 |     reWordIsValid = re.compile('[A-Za-z]|^&$|^\d\d?$')
130 |     
131 |     # Go through the text data line by line collecting count statistics
132 |     # for all valid n-grams that could appear in a potential phrase
133 |     numLines = len(textData)
134 |     for i in range(0, numLines):
135 | 
136 |         # Split the text line into an array of words
137 |         wordArray = textData[i].split()
138 |         numWords = len(wordArray)
139 |         
140 |         # Create an array marking each word as valid or invalid
141 |         validArray = [];
142 |         for word in wordArray:
143 |             validArray.append(reWordIsValid.match(word) != None)        
144 |             
145 |         # Tabulate total raw ngrams for this line into counts for each ngram bin
146 |         # The total ngrams counts include the counts of all ngrams including those
147 |         # that we won't consider as parts of phrases
148 |         for j in range(1,5):
149 |             if j<=numWords:
150 |                 ngramCounts[j] += numWords - j + 1 
151 |         
152 |         # Collect counts for viable phrase ngrams and left context sub-phrases
153 |         for j in range(0,numWords):
154 |             word = wordArray[j]
155 | 
156 |             # Only bother counting the ngrams that start with a valid content word
157 |             # i.e., valids words not in the function word list or the black list
158 |             if ( ( word not in functionwordHash ) and ( word not in blacklistHash ) and validArray[j] ):
159 | 
160 |                 # Initialize ngram string with first content word and add it to unigram counts
161 |                 ngramSeq = word 
162 |                 if ngramSeq in ngramStats[1]:
163 |                     ngramStats[1][ngramSeq] += 1
164 |                 else:
165 |                     ngramStats[1][ngramSeq] = 1
166 | 
167 |                 # Count valid ngrams from bigrams up to 4-grams
168 |                 stop = 0
169 |                 k = 1
170 |                 while (k<4) and (j+k<numWords) and not stop:
171 |                     n = k + 1
172 |                     nextNgramWord = wordArray[j+k]
173 |                     # Only count ngrams with valid words not in the blacklist
174 |                     if ( validArray[j+k] and nextNgramWord not in blacklistHash ):
175 |                         ngramSeq += " " + nextNgramWord
176 |                         if ngramSeq in ngramStats[n]:
177 |                             ngramStats[n][ngramSeq] += 1
178 |                         else:
179 |                             ngramStats[n][ngramSeq] = 1 
180 |                         k += 1
181 |                         if nextNgramWord not in functionwordHash:
182 |                             # Stop counting new ngrams after second content word in 
183 |                             # ngram is reached and ngram is a viable full phrase
184 |                             stop = 1
185 |                     else:
186 |                         stop = 1
187 |     return ngramStats
188 | 
189 | # rank potential phrases by the Weighted Pointwise Mutual Information of their constituent words
190 | def RankNgrams(ngramStats,functionwordHash,minCount):
191 |     # Create a hash table to store weighted pointwise mutual 
192 |     # information scores for each viable phrase
193 |     ngramWPMIHash = {}
194 |         
195 |     # Go through each of the ngram tables and compute the phrase scores
196 |     # for the viable phrases
197 |     for n in range(2,5):
198 |         i = n-1
199 |         for ngram in ngramStats[n].keys():
200 |             ngramCount = ngramStats[n][ngram]
201 |             if ngramCount >= minCount:
202 |                 wordArray = ngram.split()
203 |                 # If the final word in the ngram is not a function word then
204 |                 # the ngram is a valid phrase candidate we want to score
205 |                 if wordArray[i] not in functionwordHash: 
206 |                     leftNgram = wordArray[0]
207 |                     for j in range(1,i):
208 |                         leftNgram += ' ' + wordArray[j]
209 |                     rightWord = wordArray[i]
210 |                     
211 |                     # Compute the weighted pointwise mutual information (WPMI) for the phrase
212 |                     probNgram = float(ngramStats[n][ngram])/float(ngramStats[0][n])
213 |                     probLeftNgram = float(ngramStats[n-1][leftNgram])/float(ngramStats[0][n-1])
214 |                     probRightWord = float(ngramStats[1][rightWord])/float(ngramStats[0][1])
215 |                     WPMI = probNgram * math.log(probNgram/(probLeftNgram*probRightWord));
216 | 
217 |                     # Add the phrase into the list of scored phrases only if WMPI is positive
218 |                     if WPMI > 0:
219 |                         ngramWPMIHash[ngram] = WPMI  
220 |     
221 |     # Create a sorted list of the phrase candidates
222 |     rankedNgrams = sorted(ngramWPMIHash, key=ngramWPMIHash.__getitem__, reverse=True)
223 | 
224 |     # Force a memory clean-up
225 |     ngramWPMIHash = None
226 |     gc.collect()
227 | 
228 |     return rankedNgrams
229 | 
230 | # apply the phrase rewrites to training data.
231 | def ApplyPhraseRewrites(rankedNgrams,textData,learnedPhrases,                 
232 |                         maxPhrasesToAdd,maxPhraseLength,verbose):
233 |     
234 |     if len(rankedNgrams) == 0:
235 |         return
236 |     
237 |     # This function will consider at most maxRewrite 
238 |     # new phrases to be added into the learned phrase 
239 |     # list as specified by the calling fuinction
240 |     maxRewrite=maxPhrasesToAdd
241 | 
242 |     # If the remaining number of proposed ngram phrases is less 
243 |     # than the max allowed, then reset maxRewrite to the size of 
244 |     # the proposed ngram phrases list
245 |     numNgrams = len(rankedNgrams)
246 |     if numNgrams < maxRewrite:
247 |         maxRewrite = numNgrams
248 |     
249 |     # Create empty hash tables to keep track of phrase overlap conflicts
250 |     leftConflictHash = {}
251 |     rightConflictHash = {}
252 |     
253 |     # Create an empty hash table collecting the set of rewrite rules
254 |     # to be applied during this iteration of phrase learning
255 |     ngramRewriteHash = {}
256 |     
257 |     # Precompile the regex for finding spaces in ngram phrases
258 |     regexSpace = re.compile(' ')
259 | 
260 |     # Initialize some bookkeeping variables
261 |     numLines = len(textData)
262 |     numPhrasesAdded = 0
263 |     numConsidered = 0
264 |     lastSkippedNgram = ""
265 |     lastAddedNgram = ""
266 |   
267 |     # Collect list up to maxRewrite ngram phrase rewrites
268 |     stop = False
269 |     index = 0
270 |     while not stop:
271 | 
272 |         # Get the next phrase to consider adding to the phrase list
273 |         inputNgram = rankedNgrams[index]
274 | 
275 |         # Create the output compound word version of the phrase
276 |         # The extra space is added to make the regex rewrite easier
277 |         outputNgram = " " + regexSpace.sub("_",inputNgram)
278 | 
279 |         # Count the total number of words in the proposed phrase
280 |         numWords = len(outputNgram.split("_"))
281 | 
282 |         # Only add phrases that don't exceed the max phrase length
283 |         if (numWords <= maxPhraseLength):
284 |     
285 |             # Keep count of phrases considered for inclusion during this iteration
286 |             numConsidered += 1
287 | 
288 |             # Extract the left and right words in the phrase to use
289 |             # in checks for phrase overlap conflicts
290 |             ngramArray = inputNgram.split()
291 |             leftWord = ngramArray[0]
292 |             rightWord = ngramArray[len(ngramArray)-1]
293 | 
294 |             # Skip any ngram phrases that conflict with earlier phrases added
295 |             # These ngram phrases will be reconsidered in the next iteration
296 |             if (leftWord in leftConflictHash) or (rightWord in rightConflictHash): 
297 |                 if verbose: 
298 |                     print ("(%d) Skipping (context conflict): %s" % (numConsidered,inputNgram))
299 |                 lastSkippedNgram = inputNgram
300 |                 
301 |             # If no conflict exists then add this phrase into the list of phrase rewrites     
302 |             else: 
303 |                 if verbose:
304 |                     print ("(%d) Adding: %s" % (numConsidered,inputNgram))
305 |                 ngramRewriteHash[" " + inputNgram] = outputNgram
306 |                 learnedPhrases.append(inputNgram) 
307 |                 lastAddedNgram = inputNgram
308 |                 numPhrasesAdded += 1
309 |             
310 |             # Keep track of all context words that might conflict with upcoming
311 |             # propose phrases (even when phrases are skipped instead of added)
312 |             leftConflictHash[rightWord] = 1
313 |             rightConflictHash[leftWord] = 1
314 | 
315 |             # Stop when we've considered the maximum number of phrases per iteration
316 |             if ( numConsidered >= maxRewrite ):
317 |                 stop = True
318 |             
319 |         # Increment to next phrase
320 |         index += 1
321 |     
322 |         # Stop if we've reached the end of the ranked ngram list
323 |         if index >= len(rankedNgrams):
324 |             stop = True
325 | 
326 |     # Now do the phrase rewrites over the entire set of text data
327 |     if numPhrasesAdded == 1:
328 |         # If only one phrase to add use a single regex rule to do this phrase rewrite        
329 |         inputNgram = " " + lastAddedNgram
330 |         outputNgram = ngramRewriteHash[inputNgram]
331 |         regexNgram = re.compile (r'%s(?= )' % re.escape(inputNgram)) 
332 |         # Apply the regex over the full data set
333 |         for j in range(0,numLines):
334 |             textData[j] = regexNgram.sub(outputNgram, textData[j])
335 |     elif numPhrasesAdded > 1:
336 |         # Compile a single regex rule from the collected set of phrase rewrites for this iteration
337 |         ngramRegex = re.compile(r'%s(?= )' % "|".join(map(re.escape, ngramRewriteHash.keys())))
338 |         # Apply the regex over the full data set
339 |         for i in range(0,len(textData)):
340 |             # The regex substituion looks up the output string rewrite  
341 |             # in the hash table for each matched input phrase regex
342 |             textData[i] = ngramRegex.sub(lambda mo: ngramRewriteHash[mo.string[mo.start():mo.end()]], textData[i]) 
343 |       
344 |     return
345 | 
346 | # run the full iterative phrase learning process.
347 | def ApplyPhraseLearning(textData, learnedPhrases, maxNumPhrases=200, maxPhraseLength=7, maxPhrasesPerIter=50, 
348 |     minCount=5, functionwordHash={}, blacklistHash={}, verbose=False):
349 |     
350 |     stop = 0
351 |     iterNum = 0
352 |     
353 |     # Start timing the process
354 |     functionStartTime = time.clock()
355 |     
356 |     numPhrasesLearned = len(learnedPhrases)
357 |     print ("Start phrase learning with %d phrases of %d phrases learned" % (numPhrasesLearned,maxNumPhrases))
358 | 
359 |     while not stop:
360 |         iterNum += 1
361 |                 
362 |         # Start timing this iteration
363 |         startTime = time.clock()
364 |  
365 |         # Collect ngram stats
366 |         ngramStats = ComputeNgramStats(textData,functionwordHash,blacklistHash)
367 | 
368 |         # Rank ngrams
369 |         rankedNgrams = RankNgrams(ngramStats,functionwordHash,minCount)
370 |         
371 |         # Incorporate top ranked phrases into phrase list
372 |         # and rewrite the text to use these phrases
373 |         maxPhrasesToAdd = maxNumPhrases - numPhrasesLearned
374 |         if maxPhrasesToAdd > maxPhrasesPerIter:
375 |             maxPhrasesToAdd = maxPhrasesPerIter
376 |         ApplyPhraseRewrites(rankedNgrams,textData,learnedPhrases,maxPhrasesToAdd,maxPhraseLength,verbose)
377 |         numPhrasesAdded = len(learnedPhrases) - numPhrasesLearned
378 | 
379 |         # Garbage collect
380 |         ngramStats = None
381 |         rankedNgrams = None
382 |         gc.collect();
383 |                
384 |         elapsedTime = time.clock() - startTime
385 | 
386 |         numPhrasesLearned = len(learnedPhrases)
387 |         print ("Iteration %d: Added %d new phrases in %.2f seconds (Learned %d of max %d)" % 
388 |                (iterNum,numPhrasesAdded,elapsedTime,numPhrasesLearned,maxNumPhrases))
389 |         
390 |         if numPhrasesAdded >= maxPhrasesToAdd or numPhrasesAdded == 0:
391 |             stop = 1
392 |         
393 |     # Remove the space padding at the start and end of each line
394 |     regexSpacePadding = re.compile('^ +| +$')
395 |     for i in range(0,len(textData)):
396 |         textData[i] = regexSpacePadding.sub("",textData[i])
397 |     
398 |     gc.collect()
399 |  
400 |     elapsedTime = time.clock() - functionStartTime
401 |     elapsedTimeHours = elapsedTime/3600.0;
402 |     print ("*** Phrase learning completed in %.2f hours ***" % elapsedTimeHours) 
403 | 
404 |     return
405 | 
406 | # apply the learned phrases to test data.
407 | def ApplyPhraseRewritesInPlace(textFrame, textColumnName, phraseRules):
408 |         
409 |     # Get text data column from frame
410 |     textData = textFrame[textColumnName]
411 |     numLines = len(textData)
412 |     
413 |     # initial a list to store output text
414 |     textOutput = [None] * numLines
415 |     
416 |     # Add leading and trailing spaces to make regex matching easier
417 |     for i in range(0,numLines):
418 |         textOutput[i] = " " + textData[i] + " "  
419 | 
420 |     # Make sure we have phrase to add
421 |     numPhraseRules = len(phraseRules)
422 |     if numPhraseRules == 0: 
423 |         print ("Warning: phrase rule lise is empty - no phrases being applied to text data")
424 |         return
425 | 
426 |     # Precompile the regex for finding spaces in ngram phrases
427 |     regexSpace = re.compile(' ')
428 |    
429 |     # Initialize some bookkeeping variables
430 | 
431 |     # Iterate through full set of phrases to find sets of 
432 |     # non-conflicting phrases that can be apply simultaneously
433 |     index = 0
434 |     outerStop = False
435 |     while not outerStop:
436 |        
437 |         # Create empty hash tables to keep track of phrase overlap conflicts
438 |         leftConflictHash = {}
439 |         rightConflictHash = {}
440 |         prevConflictHash = {}
441 |     
442 |         # Create an empty hash table collecting the next set of rewrite rules
443 |         # to be applied during this iteration of phrase rewriting
444 |         phraseRewriteHash = {}
445 |     
446 |         # Progress through phrases until the next conflicting phrase is found
447 |         innerStop = 0
448 |         numPhrasesAdded = 0
449 |         while not innerStop:
450 |         
451 |             # Get the next phrase to consider adding to the phrase list
452 |             nextPhrase = phraseRules[index]            
453 |             
454 |             # Extract the left and right sides of the phrase to use
455 |             # in checks for phrase overlap conflicts
456 |             ngramArray = nextPhrase.split()
457 |             leftWord = ngramArray[0]
458 |             rightWord = ngramArray[len(ngramArray)-1] 
459 | 
460 |             # Stop if we reach any phrases that conflicts with earlier phrases in this iteration
461 |             # These ngram phrases will be reconsidered in the next iteration
462 |             if ((leftWord in leftConflictHash) or (rightWord in rightConflictHash) 
463 |                 or (leftWord in prevConflictHash) or (rightWord in prevConflictHash)): 
464 |                 innerStop = True
465 |                 
466 |             # If no conflict exists then add this phrase into the list of phrase rewrites     
467 |             else: 
468 |                 # Create the output compound word version of the phrase
469 |                                 
470 |                 outputPhrase = regexSpace.sub("_",nextPhrase);
471 |                 
472 |                 # Keep track of all context words that might conflict with upcoming
473 |                 # propose phrases (even when phrases are skipped instead of added)
474 |                 leftConflictHash[rightWord] = 1
475 |                 rightConflictHash[leftWord] = 1
476 |                 prevConflictHash[outputPhrase] = 1           
477 |                 
478 |                 # Add extra space to input an output versions of the current phrase 
479 |                 # to make the regex rewrite easier
480 |                 outputPhrase = " " + outputPhrase
481 |                 lastAddedPhrase = " " + nextPhrase
482 |                 
483 |                 # Add the phrase to the rewrite hash
484 |                 phraseRewriteHash[lastAddedPhrase] = outputPhrase
485 |                   
486 |                 # Increment to next phrase
487 |                 index += 1
488 |                 numPhrasesAdded  += 1
489 |     
490 |                 # Stop if we've reached the end of the phrases list
491 |                 if index >= numPhraseRules:
492 |                     innerStop = True
493 |                     outerStop = True
494 |                     
495 |         # Now do the phrase rewrites over the entire set of text data
496 |         if numPhrasesAdded == 1:
497 |         
498 |             # If only one phrase to add use a single regex rule to do this phrase rewrite        
499 |             outputPhrase = phraseRewriteHash[lastAddedPhrase]
500 |             regexPhrase = re.compile (r'%s(?= )' % re.escape(lastAddedPhrase)) 
501 |         
502 |             # Apply the regex over the full data set
503 |             for j in range(0,numLines):
504 |                 textOutput[j] = regexPhrase.sub(outputPhrase, textOutput[j])
505 |        
506 |         elif numPhrasesAdded > 1:
507 |             # Compile a single regex rule from the collected set of phrase rewrites for this iteration
508 |             regexPhrase = re.compile(r'%s(?= )' % "|".join(map(re.escape, phraseRewriteHash.keys())))
509 |             
510 |             # Apply the regex over the full data set
511 |             for i in range(0,numLines):
512 |                 # The regex substituion looks up the output string rewrite  
513 |                 # in the hash table for each matched input phrase regex
514 |                 textOutput[i] = regexPhrase.sub(lambda mo: phraseRewriteHash[mo.string[mo.start():mo.end()]], textOutput[i]) 
515 |     
516 |     # Remove the space padding at the start and end of each line
517 |     regexSpacePadding = re.compile('^ +| +$')
518 |     for i in range(0,len(textOutput)):
519 |         textOutput[i] = regexSpacePadding.sub("",textOutput[i])
520 |     
521 |     return textOutput
522 | 
523 | # reconstruct the full processed text and put it back into a new data frame.
524 | def ReconstituteDocsFromChunks(textData, idColumnName, textColumnName):
525 |     dataOut = []
526 |     
527 |     currentDoc = "";
528 |     currentDocID = "";
529 |     
530 |     for i in range(0,len(textData)):
531 |         textChunk = textData[textColumnName][i]
532 |         docID = textData[idColumnName][i]
533 |         if docID != currentDocID:
534 |             if currentDocID != "":
535 |                 dataOut.append(currentDoc)
536 |             currentDoc = textChunk
537 |             currentDocID = docID
538 |         else:
539 |             currentDoc += " " + textChunk
540 |     dataOut.append(currentDoc)
541 |     
542 |     return dataOut
543 | 
544 | # create the Vocabulary with some filtering criteria.
545 | def CreateVocabForTopicModeling(textData,stopwordHash):
546 | 
547 |     print ("Counting words")
548 |     numDocs = len(textData) 
549 |     globalWordCountHash = {} 
550 |     globalDocCountHash = {} 
551 |     for textLine in textData:
552 |         docWordCountHash = {}
553 |         for word in textLine.split():
554 |             if word in globalWordCountHash:
555 |                 globalWordCountHash[word] += 1
556 |             else:
557 |                 globalWordCountHash[word] = 1
558 |             if word not in docWordCountHash: 
559 |                 docWordCountHash[word] = 1
560 |                 if word in globalDocCountHash:
561 |                     globalDocCountHash[word] += 1
562 |                 else:
563 |                     globalDocCountHash[word] = 1
564 | 
565 |     minWordCount = 5;
566 |     minDocCount = 2;
567 |     maxDocFreq = .25;
568 |     vocabCount = 0;
569 |     vocabHash = {}
570 | 
571 |     excStopword = 0
572 |     excNonalphabetic = 0
573 |     excMinwordcount = 0
574 |     excNotindochash = 0
575 |     excMindoccount = 0
576 |     excMaxdocfreq =0
577 | 
578 |     print ("Building vocab")
579 |     for word in globalWordCountHash.keys():
580 |         # Test vocabulary exclusion criteria for each word
581 |         if ( word in stopwordHash ):
582 |             excStopword += 1
583 |         elif ( not re.search(r'[a-zA-Z]', word, 0) ):
584 |             excNonalphabetic += 1
585 |         elif ( globalWordCountHash[word] < minWordCount ):
586 |             excMinwordcount += 1
587 |         elif ( word not in globalDocCountHash ):
588 |             print ("Warning: Word '%s' not in doc count hash") % (word)
589 |             excNotindochash += 1
590 |         elif ( globalDocCountHash[word] < minDocCount ):
591 |             excMindoccount += 1
592 |         elif ( float(globalDocCountHash[word])/float(numDocs) > maxDocFreq ):
593 |             excMaxdocfreq += 1
594 |         else:
595 |             # Add word to vocab
596 |             vocabHash[word]= globalWordCountHash[word];
597 |             vocabCount += 1 
598 |     print ("Excluded %d stop words" % (excStopword))       
599 |     print ("Excluded %d non-alphabetic words" % (excNonalphabetic))  
600 |     print ("Excluded %d words below word count threshold" % (excMinwordcount)) 
601 |     print ("Excluded %d words below doc count threshold" % (excMindoccount))
602 |     print ("Excluded %d words above max doc frequency" % (excMaxdocfreq)) 
603 |     print ("Final Vocab Size: %d words" % vocabCount)
604 |             
605 |     return vocabHash


--------------------------------------------------------------------------------
/notebooks/Part_1_Data_Preparation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Part 1: Data Preparation\n",
  8 |     "\n",
  9 |     "Please make sure you have __notebook__ and __nltk__ Python packages installed in the compute context you choose as kernel. For demonstration purpose, this series of notebooks uses the `local` compute context.\n",
 10 |     "\n",
 11 |     "**NOTE**: Python 3 kernel doesn't include Azure Machine Learning Workbench functionalities. Please switch the kernel to `local` before continuing further. \n",
 12 |     "\n",
 13 |     "To install __notebook__ and __nltk__, please uncomment and run the following script."
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 1,
 19 |    "metadata": {
 20 |     "collapsed": true
 21 |    },
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "# !pip install --upgrade notebook\n",
 25 |     "# !pip install --upgrade nltk"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "markdown",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "### Import Required Python Modules"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 2,
 38 |    "metadata": {
 39 |     "collapsed": true
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "import pandas as pd\n",
 44 |     "import numpy as np\n",
 45 |     "import re, os, gzip, requests, warnings\n",
 46 |     "from azureml.logging import get_azureml_logger\n",
 47 |     "warnings.filterwarnings(\"ignore\")"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": null,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "run_logger = get_azureml_logger()\n",
 59 |     "run_logger.log('amlrealworld.QnA-matching.part1-data-preparation','true')"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "## Access Sample Data\n",
 67 |     "\n",
 68 |     "In this example, we have collected a set of Q&A pairs from Stack Overflow site tagged as `JavaScript` questions. The data contains 1,201 original Q&A pairs as well as many duplicate questions, i.e. new questions that Stack Overflow users have linked back to pre-existing Q&A pairs that effectively provide answers to these new questions. The data schema of the original questions (Q), duplicate questions (D), and answers (A) can be found in the following table:\n",
 69 |     "\n",
 70 |     "| Dataset | Field | Type | Description\n",
 71 |     "| ----------|------------|------------|--------\n",
 72 |     "| question (Q) | Id | String | The unique question ID (primary key)\n",
 73 |     "|  | AnswerId | String | The unique answer ID per question\n",
 74 |     "|  | Text0 | String | The raw text data including the question's title and body\n",
 75 |     "|  | CreationDate | Timestamp | The timestamp of when the question has been asked\n",
 76 |     "| dupes (D) | Id | String | The unique duplication ID (primary key)\n",
 77 |     "|  | AnswerId | String | The answer ID associated with the duplication\n",
 78 |     "|  | Text0 | String | The raw text data including the duplication's title and body\n",
 79 |     "|  | CreationDate | Timestamp | The timestamp of when the duplication has been asked\n",
 80 |     "| answers (A) | Id | String | The unique answer ID (primary key)\n",
 81 |     "|  | text0 | String | The raw text data of the answer\n",
 82 |     "\n",
 83 |     "The datasets are compressed and stored in Azure Blob storage as `.tsv.gz` files and this section provides you the code to retreive the data in the notebook."
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 3,
 89 |    "metadata": {
 90 |     "collapsed": true
 91 |    },
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "# load raw data from a .tsv.gz file into Pandas data frame.\n",
 95 |     "def read_csv_gz(url, **kwargs):\n",
 96 |     "    df = pd.read_csv(gzip.open(requests.get(url, stream=True).raw, mode='rb'), sep='\\t', encoding='utf8', **kwargs)\n",
 97 |     "    return df.set_index('Id')"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": 4,
103 |    "metadata": {
104 |     "collapsed": true
105 |    },
106 |    "outputs": [],
107 |    "source": [
108 |     "# URLs to Original questions, Duplications, and Answers.\n",
109 |     "questions_url = 'https://bostondata.blob.core.windows.net/stackoverflow/orig-q.tsv.gz'\n",
110 |     "dupes_url = 'https://bostondata.blob.core.windows.net/stackoverflow/dup-q.tsv.gz'\n",
111 |     "answers_url = 'https://bostondata.blob.core.windows.net/stackoverflow/ans.tsv.gz'\n",
112 |     "\n",
113 |     "# load datasets.\n",
114 |     "questions = read_csv_gz(questions_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n",
115 |     "dupes = read_csv_gz(dupes_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n",
116 |     "answers = read_csv_gz(answers_url, names=('Id', 'Text0'))"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "To provide some example, here are the first five rows of the __questions__ table:"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 5,
129 |    "metadata": {
130 |     "collapsed": false,
131 |     "scrolled": true
132 |    },
133 |    "outputs": [
134 |     {
135 |      "data": {
136 |       "text/html": [
137 |        "<div>\n",
138 |        "<table border=\"1\" class=\"dataframe\">\n",
139 |        "  <thead>\n",
140 |        "    <tr style=\"text-align: right;\">\n",
141 |        "      <th></th>\n",
142 |        "      <th>AnswerId</th>\n",
143 |        "      <th>Text0</th>\n",
144 |        "      <th>CreationDate</th>\n",
145 |        "    </tr>\n",
146 |        "    <tr>\n",
147 |        "      <th>Id</th>\n",
148 |        "      <th></th>\n",
149 |        "      <th></th>\n",
150 |        "      <th></th>\n",
151 |        "    </tr>\n",
152 |        "  </thead>\n",
153 |        "  <tbody>\n",
154 |        "    <tr>\n",
155 |        "      <th>220231</th>\n",
156 |        "      <td>220233</td>\n",
157 |        "      <td>Accessing the web page's HTTP Headers in JavaS...</td>\n",
158 |        "      <td>2008-10-20 22:54:38.767</td>\n",
159 |        "    </tr>\n",
160 |        "    <tr>\n",
161 |        "      <th>391979</th>\n",
162 |        "      <td>810461</td>\n",
163 |        "      <td>Get client IP using just JavaScript?. &lt;p&gt;I nee...</td>\n",
164 |        "      <td>2008-12-24 18:22:30.780</td>\n",
165 |        "    </tr>\n",
166 |        "    <tr>\n",
167 |        "      <th>109086</th>\n",
168 |        "      <td>109091</td>\n",
169 |        "      <td>Stop setInterval call in JavaScript. &lt;p&gt;I am u...</td>\n",
170 |        "      <td>2008-09-20 19:29:55.377</td>\n",
171 |        "    </tr>\n",
172 |        "    <tr>\n",
173 |        "      <th>46155</th>\n",
174 |        "      <td>46181</td>\n",
175 |        "      <td>Validate email address in JavaScript?. &lt;p&gt;How ...</td>\n",
176 |        "      <td>2008-09-05 16:10:11.093</td>\n",
177 |        "    </tr>\n",
178 |        "    <tr>\n",
179 |        "      <th>121499</th>\n",
180 |        "      <td>121708</td>\n",
181 |        "      <td>When onblur occurs, how can I find out which e...</td>\n",
182 |        "      <td>2008-09-23 14:48:43.483</td>\n",
183 |        "    </tr>\n",
184 |        "  </tbody>\n",
185 |        "</table>\n",
186 |        "</div>"
187 |       ],
188 |       "text/plain": [
189 |        "        AnswerId                                              Text0  \\\n",
190 |        "Id                                                                    \n",
191 |        "220231    220233  Accessing the web page's HTTP Headers in JavaS...   \n",
192 |        "391979    810461  Get client IP using just JavaScript?. <p>I nee...   \n",
193 |        "109086    109091  Stop setInterval call in JavaScript. <p>I am u...   \n",
194 |        "46155      46181  Validate email address in JavaScript?. <p>How ...   \n",
195 |        "121499    121708  When onblur occurs, how can I find out which e...   \n",
196 |        "\n",
197 |        "                   CreationDate  \n",
198 |        "Id                               \n",
199 |        "220231  2008-10-20 22:54:38.767  \n",
200 |        "391979  2008-12-24 18:22:30.780  \n",
201 |        "109086  2008-09-20 19:29:55.377  \n",
202 |        "46155   2008-09-05 16:10:11.093  \n",
203 |        "121499  2008-09-23 14:48:43.483  "
204 |       ]
205 |      },
206 |      "execution_count": 5,
207 |      "metadata": {},
208 |      "output_type": "execute_result"
209 |     }
210 |    ],
211 |    "source": [
212 |     "questions.head(5)"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "metadata": {},
218 |    "source": [
219 |     "Here is the full text of one __original__ question, whose is `Id` is `220231`. The `AnswerId` associated with this question is `220233`."
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": 6,
225 |    "metadata": {
226 |     "collapsed": false,
227 |     "scrolled": true
228 |    },
229 |    "outputs": [
230 |     {
231 |      "name": "stdout",
232 |      "output_type": "stream",
233 |      "text": [
234 |       "Accessing the web page's HTTP Headers in JavaScript. <p>How do I access a page's HTTP response headers via JavaScript?</p> <p>Related to <a href=\"http://stackoverflow.com/questions/220149/how-do-i-access-the-http-request-header-fields-via-javascript\"><strong>this question</strong></a>, which was modified to ask about accessing two specific HTTP headers.</p> <blockquote> <p><strong>Related:</strong><br> <a href=\"http://stackoverflow.com/questions/220149/how-do-i-access-the-http-request-header-fields-via-javascript\">How do I access the HTTP request header fields via JavaScript?</a></p> </blockquote>\n"
235 |      ]
236 |     }
237 |    ],
238 |    "source": [
239 |     "# This text include the HTML code.\n",
240 |     "print(questions[\"Text0\"][220231])"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "markdown",
245 |    "metadata": {},
246 |    "source": [
247 |     "Here is the full text of the __answer__ associated with the above original question:"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": 7,
253 |    "metadata": {
254 |     "collapsed": false,
255 |     "scrolled": true
256 |    },
257 |    "outputs": [
258 |     {
259 |      "name": "stdout",
260 |      "output_type": "stream",
261 |      "text": [
262 |       "<p>Unfortunately, there isn't an API to give you the HTTP response headers for your initial page request. That was the original question posted here. It has been <a href=\"http://stackoverflow.com/questions/12258705/how-can-i-read-the-current-headers-without-making-a-new-request-with-js\">repeatedly asked</a>, too, because some people would like to get the actual response headers of the original page request without issuing another one.</p> <h1><br/>For AJAX Requests:</h1> <p>If an HTTP request is made over AJAX, it is possible to get the response headers with the <strong><code>getAllResponseHeaders()</code></strong> method. It's part of the XMLHttpRequest API. To see how this can be applied, check out the <em><code>fetchSimilarHeaders()</code></em> function below. Note that this is a work-around to the problem that won't be reliable for some applications.</p> <pre><code>myXMLHttpRequest.getAllResponseHeaders(); </code></pre> <ul> <li><p>The API was specified in the following candidate recommendation for XMLHttpRequest: <a href=\"http://www.w3.org/TR/XMLHttpRequest/#the-getresponseheader-method\" rel=\"nofollow\">XMLHttpRequest - W3C Candidate Recommendation 3 August 2010</a></p></li> <li><p>Specifically, the <code>getAllResponseHeaders()</code> method was specified in the following section: <a href=\"http://www.w3.org/TR/XMLHttpRequest/#the-getallresponseheaders()-method\" rel=\"nofollow\">w3.org: <code>XMLHttpRequest</code>: the <code>getallresponseheaders()</code> method</a> </p></li> <li><p>The MDN documentation is good, too: <a href=\"https://developer.mozilla.org/en-US/docs/Web/API/XMLHttpRequest\" rel=\"nofollow\">developer.mozilla.org: <code>XMLHttpRequest</code></a>.</p></li> </ul> <p>This will not give you information about the original page request's HTTP response headers, but it could be used to make educated guesses about what those headers were. More on that is described next.</p> <h1><br/>Getting header values from the Initial Page Request:</h1> <p>This question was first asked several years ago, asking specifically about how to get at the original HTTP response headers for the <em>current page</em> (i.e. the same page inside of which the javascript was running). This is quite a different question than simply getting the response headers for any HTTP request. For the initial page request, the headers aren't readily available to javascript. Whether the header values you need will be reliably and sufficiently consistent if you request the same page again via AJAX will depend on your particular application.</p> <p>The following are a few suggestions for getting around that problem.</p> <h2><br/>1. Requests on Resources which are largely static</h2> <p>If the response is largely static and the headers are not expected to change much between requests, you could make an AJAX request for the same page you're currently on and assume that they're they are the same values which were part of the page's HTTP response. This could allow you to access the headers you need using the nice XMLHttpRequest API described above.</p> <pre><code>function fetchSimilarHeaders (callback) { var request = new XMLHttpRequest(); request.onreadystatechange = function () { if (request.readyState === 4) { // // The following headers may often be similar // to those of the original page request... // if (callback &amp;&amp; typeof callback === 'function') { callback(request.getAllResponseHeaders()); } } }; // // Re-request the same page (document.location) // We hope to get the same or similar response headers to those which // came with the current page, but we have no guarantee. // Since we are only after the headers, a HEAD request may be sufficient. // request.open('HEAD', document.location, true); request.send(null); } </code></pre> <p>This approach will be problematic if you truly have to rely on the values being consistent between requests, since you can't fully guarantee that they are the same. It's going to depend on your specific application and\n"
263 |      ]
264 |     }
265 |    ],
266 |    "source": [
267 |     "print(answers[\"Text0\"][220233])"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "markdown",
272 |    "metadata": {},
273 |    "source": [
274 |     "__Duplicate__ questions share the same `AnswerId` as the original question they link to. Here is the first duplicate question linked to the above original question:"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": 8,
280 |    "metadata": {
281 |     "collapsed": false,
282 |     "scrolled": true
283 |    },
284 |    "outputs": [
285 |     {
286 |      "name": "stdout",
287 |      "output_type": "stream",
288 |      "text": [
289 |       "Monitoring http request header on a page. <blockquote> <p><strong>Possible Duplicates:</strong><br> <a href=\"http://stackoverflow.com/questions/220231/accessing-http-headers-in-javascript\">Accessing HTTP Headers in Javascript?</a><br> <a href=\"http://stackoverflow.com/questions/220149/how-do-i-access-the-http-request-header-fields-via-javascript\">How do I access the HTTP request header fields via JavaScript?</a> </p> </blockquote> <p>We can use httpwatch on IE or httpfox on Firefox to monitor http activity</p> <p>If i don't want to use any plugs on browser...</p> <p>Is it possible to monitor http request header on a page just by javascript?</p>\n"
290 |      ]
291 |     }
292 |    ],
293 |    "source": [
294 |     "print(dupes.query(\"AnswerId == 220233\").iloc[0][\"Text0\"])"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "markdown",
299 |    "metadata": {},
300 |    "source": [
301 |     "## Pre-process Text Data\n",
302 |     "\n",
303 |     "### Clean up text\n",
304 |     "\n",
305 |     "The raw data is in `HTML` format and needs to be cleaned up for any further analysis. We exclude HTML tags, links and code snippets from the data."
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 9,
311 |    "metadata": {
312 |     "collapsed": true
313 |    },
314 |    "outputs": [],
315 |    "source": [
316 |     "# remove embedded code chunks, HTML tags and links/URLs.\n",
317 |     "def clean_text(text):\n",
318 |     "    global EMPTY\n",
319 |     "    EMPTY = ''\n",
320 |     "    \n",
321 |     "    if not isinstance(text, str): \n",
322 |     "        return text\n",
323 |     "    text = re.sub('<pre><code>.*?</code></pre>', EMPTY, text)\n",
324 |     "\n",
325 |     "    def replace_link(match):\n",
326 |     "        return EMPTY if re.match('[a-z]+://', match.group(1)) else match.group(1)\n",
327 |     "    \n",
328 |     "    text = re.sub('<a[^>]+>(.*)</a>', replace_link, text)\n",
329 |     "    return re.sub('<[^>]+>', EMPTY, text)"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 10,
335 |    "metadata": {
336 |     "collapsed": true
337 |    },
338 |    "outputs": [],
339 |    "source": [
340 |     "for df in (questions, dupes, answers):\n",
341 |     "    df['Text'] = df['Text0'].apply(clean_text).str.lower()\n",
342 |     "    df['NumChars'] = df['Text'].str.len()"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "markdown",
347 |    "metadata": {},
348 |    "source": [
349 |     "### Set data selection criteria\n",
350 |     "\n",
351 |     "To obtain the high quality datasets for phrase learning and model training, we requires a minimum length of characters in the text field. Different thresholds are considered for original questions, duplications, and answers, respectively. Also, each Q&A pair in our set must have a minimum of 3 additional semantically equivalent duplicate questions linked to it. "
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 11,
357 |    "metadata": {
358 |     "collapsed": true
359 |    },
360 |    "outputs": [],
361 |    "source": [
362 |     "# find the AnswerIds has at least 3 dupes.\n",
363 |     "def find_answerId(answersC, dupesC, num_dupes):\n",
364 |     "       \n",
365 |     "    countHash = {}\n",
366 |     "    for i in dupesC.AnswerId:\n",
367 |     "        if i not in answersC.index.values:\n",
368 |     "            continue\n",
369 |     "        if i not in countHash.keys():\n",
370 |     "            countHash[i] = 1\n",
371 |     "        else:\n",
372 |     "            countHash[i] += 1\n",
373 |     "            \n",
374 |     "    countHash = {k: v for k, v in countHash.items() if v >= num_dupes}\n",
375 |     "    commonAnswerId = countHash.keys()\n",
376 |     "    \n",
377 |     "    return commonAnswerId\n",
378 |     "\n",
379 |     "# extract data based on the selection criteria.\n",
380 |     "def select_data(questions, dupes, answers):\n",
381 |     "    # exclude the records without any text\n",
382 |     "    questions_nz = questions.query('NumChars > 0')\n",
383 |     "    dupes_nz = dupes.query('NumChars > 0')\n",
384 |     "    answers_nz = answers.query('NumChars > 0')\n",
385 |     "\n",
386 |     "    # get the 10th percentile of text length as the minimum length of characters to consider in the text field\n",
387 |     "    minLenQ = questions_nz.quantile(.1)['NumChars']\n",
388 |     "    minLenD = dupes_nz.quantile(.1)['NumChars']\n",
389 |     "    minLenA = answers_nz.quantile(.1)['NumChars']\n",
390 |     "    \n",
391 |     "    # eliminate records with text less than the minimum length\n",
392 |     "    questionsC = questions.query('NumChars >' + str(int(minLenQ)))\n",
393 |     "    dupesC = dupes.query('NumChars >' + str(minLenD))\n",
394 |     "    answersC = answers.query('NumChars >' + str(minLenA))\n",
395 |     "    \n",
396 |     "    # remove the records in dupesC whose questionId has already existed in questionsC\n",
397 |     "    duplicatedIndex = list(set(questionsC.index).intersection(set(dupesC.index)))\n",
398 |     "    dupesC.drop(duplicatedIndex, inplace=True)\n",
399 |     "    \n",
400 |     "    # make sure Questions 1:1 match with Answers \n",
401 |     "    matches = questionsC.merge(answersC, left_on = 'AnswerId', right_index = True)\n",
402 |     "    questionsC = matches[['AnswerId', 'Text0_x', 'CreationDate', 'Text_x', 'NumChars_x']]\n",
403 |     "    questionsC.columns = ['AnswerId', 'Text0', 'CreationDate', 'Text', 'NumChars']\n",
404 |     "\n",
405 |     "    answersC = matches[['Text0_y', 'Text_y', 'NumChars_y']]\n",
406 |     "    answersC.index = matches['AnswerId']\n",
407 |     "    answersC.columns = ['Text0', 'Text', 'NumChars']\n",
408 |     "    \n",
409 |     "    # find the AnswerIds has at least 3 dupes\n",
410 |     "    commonAnswerId = find_answerId(answersC, dupesC, 3)\n",
411 |     "    \n",
412 |     "    # select the records with those AnswerIds\n",
413 |     "    questionsC = questionsC.loc[questionsC.AnswerId.isin(commonAnswerId)]\n",
414 |     "    dupesC = dupesC.loc[dupesC.AnswerId.isin(commonAnswerId)]\n",
415 |     "    \n",
416 |     "    return questionsC, dupesC"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": 12,
422 |    "metadata": {
423 |     "collapsed": true
424 |    },
425 |    "outputs": [],
426 |    "source": [
427 |     "# some questions have been linked to multiple AnswerIds.\n",
428 |     "# we keep the first AnswerId associated with that question and remove the rest.\n",
429 |     "questions = questions.groupby(questions.index).first()\n",
430 |     "dupes = dupes.groupby(dupes.index).first()\n",
431 |     "\n",
432 |     "# execute the data selection function on questions, dupes and answers.\n",
433 |     "questionsC, dupesC = select_data(questions, dupes, answers)"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "markdown",
438 |    "metadata": {},
439 |    "source": [
440 |     "## Prepare Training and Test datasets\n",
441 |     "\n",
442 |     "In this example, we retain original question and 75% of the duplicate questions for training, and hold-out the most recently posted 25% of duplicate questions as test data. The training and test data are split by `CreationDate`.\n",
443 |     "\n",
444 |     "- training set = Original questions + 75% of oldest Duplications per original question\n",
445 |     "- test set = remaining 25% of Duplications per original question"
446 |    ]
447 |   },
448 |   {
449 |    "cell_type": "code",
450 |    "execution_count": 13,
451 |    "metadata": {
452 |     "collapsed": true
453 |    },
454 |    "outputs": [],
455 |    "source": [
456 |     "# split Original questions and their Duplications into training and test sets.\n",
457 |     "def split_data(questions, dupes, frac):\n",
458 |     "    trainQ = questions\n",
459 |     "    testQ = pd.DataFrame(columns = dupes.columns.values) # create an empty data frame\n",
460 |     "\n",
461 |     "    for answerId in np.unique(dupes.AnswerId):\n",
462 |     "        df = dupes.query('AnswerId == ' + str(answerId))\n",
463 |     "        totalCount = len(df)\n",
464 |     "        splitPoint = int(totalCount * frac)\n",
465 |     "        dfSort = df.sort_values(by = ['CreationDate'])\n",
466 |     "        trainQ = trainQ.append(dfSort.head(splitPoint)) # oldest N percent of duplications\n",
467 |     "        testQ = testQ.append(dfSort.tail(totalCount - splitPoint))\n",
468 |     "\n",
469 |     "    # convert data type to int\n",
470 |     "    testQ[[\"AnswerId\", \"NumChars\"]] = testQ[[\"AnswerId\", \"NumChars\"]].astype(int) \n",
471 |     "    # rename the index \n",
472 |     "    testQ.index.rename(\"Id\", inplace=True)\n",
473 |     "    \n",
474 |     "    return trainQ, testQ"
475 |    ]
476 |   },
477 |   {
478 |    "cell_type": "code",
479 |    "execution_count": 14,
480 |    "metadata": {
481 |     "collapsed": true
482 |    },
483 |    "outputs": [],
484 |    "source": [
485 |     "trainQ, testQ = split_data(questionsC, dupesC, 0.75)"
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "code",
490 |    "execution_count": 15,
491 |    "metadata": {
492 |     "collapsed": false
493 |    },
494 |    "outputs": [
495 |     {
496 |      "data": {
497 |       "text/html": [
498 |        "<div>\n",
499 |        "<table border=\"1\" class=\"dataframe\">\n",
500 |        "  <thead>\n",
501 |        "    <tr style=\"text-align: right;\">\n",
502 |        "      <th></th>\n",
503 |        "      <th>AnswerId</th>\n",
504 |        "      <th>Text0</th>\n",
505 |        "      <th>CreationDate</th>\n",
506 |        "      <th>Text</th>\n",
507 |        "      <th>NumChars</th>\n",
508 |        "    </tr>\n",
509 |        "    <tr>\n",
510 |        "      <th>Id</th>\n",
511 |        "      <th></th>\n",
512 |        "      <th></th>\n",
513 |        "      <th></th>\n",
514 |        "      <th></th>\n",
515 |        "      <th></th>\n",
516 |        "    </tr>\n",
517 |        "  </thead>\n",
518 |        "  <tbody>\n",
519 |        "    <tr>\n",
520 |        "      <th>3713</th>\n",
521 |        "      <td>3777</td>\n",
522 |        "      <td>Call ASP.NET Function From Javascript?. &lt;p&gt;I'm...</td>\n",
523 |        "      <td>2008-08-06 17:16:36.630</td>\n",
524 |        "      <td>call asp.net function from javascript?. i'm wr...</td>\n",
525 |        "      <td>227</td>\n",
526 |        "    </tr>\n",
527 |        "    <tr>\n",
528 |        "      <th>5223</th>\n",
529 |        "      <td>6700</td>\n",
530 |        "      <td>Length of a JavaScript object (that is, associ...</td>\n",
531 |        "      <td>2008-08-07 19:42:21.060</td>\n",
532 |        "      <td>length of a javascript object (that is, associ...</td>\n",
533 |        "      <td>313</td>\n",
534 |        "    </tr>\n",
535 |        "    <tr>\n",
536 |        "      <th>7477</th>\n",
537 |        "      <td>7523</td>\n",
538 |        "      <td>Autosizing textarea using Prototype. &lt;p&gt;I'm cu...</td>\n",
539 |        "      <td>2008-08-11 01:43:13.493</td>\n",
540 |        "      <td>autosizing textarea using prototype. i'm curre...</td>\n",
541 |        "      <td>1664</td>\n",
542 |        "    </tr>\n",
543 |        "    <tr>\n",
544 |        "      <th>18082</th>\n",
545 |        "      <td>1830844</td>\n",
546 |        "      <td>Validate decimal numbers in JavaScript - IsNum...</td>\n",
547 |        "      <td>2008-08-20 14:21:13.793</td>\n",
548 |        "      <td>validate decimal numbers in javascript - isnum...</td>\n",
549 |        "      <td>231</td>\n",
550 |        "    </tr>\n",
551 |        "    <tr>\n",
552 |        "      <th>21294</th>\n",
553 |        "      <td>242607</td>\n",
554 |        "      <td>Dynamically load a JavaScript file. &lt;p&gt;How can...</td>\n",
555 |        "      <td>2008-08-21 21:59:31.080</td>\n",
556 |        "      <td>dynamically load a javascript file. how can yo...</td>\n",
557 |        "      <td>980</td>\n",
558 |        "    </tr>\n",
559 |        "  </tbody>\n",
560 |        "</table>\n",
561 |        "</div>"
562 |       ],
563 |       "text/plain": [
564 |        "       AnswerId                                              Text0  \\\n",
565 |        "Id                                                                   \n",
566 |        "3713       3777  Call ASP.NET Function From Javascript?. <p>I'm...   \n",
567 |        "5223       6700  Length of a JavaScript object (that is, associ...   \n",
568 |        "7477       7523  Autosizing textarea using Prototype. <p>I'm cu...   \n",
569 |        "18082   1830844  Validate decimal numbers in JavaScript - IsNum...   \n",
570 |        "21294    242607  Dynamically load a JavaScript file. <p>How can...   \n",
571 |        "\n",
572 |        "                  CreationDate  \\\n",
573 |        "Id                               \n",
574 |        "3713   2008-08-06 17:16:36.630   \n",
575 |        "5223   2008-08-07 19:42:21.060   \n",
576 |        "7477   2008-08-11 01:43:13.493   \n",
577 |        "18082  2008-08-20 14:21:13.793   \n",
578 |        "21294  2008-08-21 21:59:31.080   \n",
579 |        "\n",
580 |        "                                                    Text  NumChars  \n",
581 |        "Id                                                                  \n",
582 |        "3713   call asp.net function from javascript?. i'm wr...       227  \n",
583 |        "5223   length of a javascript object (that is, associ...       313  \n",
584 |        "7477   autosizing textarea using prototype. i'm curre...      1664  \n",
585 |        "18082  validate decimal numbers in javascript - isnum...       231  \n",
586 |        "21294  dynamically load a javascript file. how can yo...       980  "
587 |       ]
588 |      },
589 |      "execution_count": 15,
590 |      "metadata": {},
591 |      "output_type": "execute_result"
592 |     }
593 |    ],
594 |    "source": [
595 |     "trainQ.head(5)"
596 |    ]
597 |   },
598 |   {
599 |    "cell_type": "markdown",
600 |    "metadata": {},
601 |    "source": [
602 |     "## Select Subsets with Sufficient Training Questions per Answer Class\n",
603 |     "\n",
604 |     "In our past experiments, we notice that some Q&A pairs only link to a small number of duplicate questions. This means those answer classes may contain an insufficient amount of examples to train an accurate model. We examine the effect of the number of duplicate questions available for training for each Q&A pair. \n",
605 |     "\n",
606 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/training_size.PNG?token=APoO9rnKXamwVdXu8luA_Dd28UUBncwrks5ZwtRowA%3D%3D\">\n",
607 |     "\n",
608 |     "The above Figure shows results for questions relative to the number of training examples available for the correct Q&A pair that should be returned. Most of our Q&A pairs (857 out of 1201) have 5 or fewer known duplicate questions available for training. Performance on these questions is relatively weak, with the correct Q&A pair landing in the top 10 results less than 40% of the time. However, when greater numbers of duplicate questions are available for training, performance improves dramatically; when Q&A pairs have 50 or more duplicate questions available for training, the classification model places these pairs in the top 10 of the retrieved results 98% of the time when they correctly match the query. The most duplicated question contains 962 duplications. \n",
609 |     "\n",
610 |     "For the study in this notebook, we only consider the answer classes that have more than 13 training questions (original and duplicate questions) in this notebook. This reduces the entire dataset to 5,153 training questions, 1,735 test questions, and 103 unique answer classes."
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": 16,
616 |    "metadata": {
617 |     "collapsed": true
618 |    },
619 |    "outputs": [],
620 |    "source": [
621 |     "countPerAns = pd.DataFrame({\"NumTrain\" : trainQ.groupby(\"AnswerId\").size()})\n",
622 |     "trainQwithCount = trainQ.merge(countPerAns, left_on=\"AnswerId\", right_index=True)\n",
623 |     "testQwithCount = testQ.merge(countPerAns, left_on=\"AnswerId\", right_index=True)\n",
624 |     "\n",
625 |     "# for each Answer class, we request more than 13 training questions.\n",
626 |     "trainQ = trainQwithCount[trainQwithCount[\"NumTrain\"] > 13]\n",
627 |     "testQ = testQwithCount[testQwithCount[\"NumTrain\"] > 13]"
628 |    ]
629 |   },
630 |   {
631 |    "cell_type": "code",
632 |    "execution_count": 17,
633 |    "metadata": {
634 |     "collapsed": false,
635 |     "scrolled": false
636 |    },
637 |    "outputs": [
638 |     {
639 |      "name": "stdout",
640 |      "output_type": "stream",
641 |      "text": [
642 |       "# of training examples: 5153\n",
643 |       "# of testing examples: 1735\n",
644 |       "\n",
645 |       "A quick glance of the training data: \n",
646 |       "\n"
647 |      ]
648 |     },
649 |     {
650 |      "data": {
651 |       "text/html": [
652 |        "<div>\n",
653 |        "<table border=\"1\" class=\"dataframe\">\n",
654 |        "  <thead>\n",
655 |        "    <tr style=\"text-align: right;\">\n",
656 |        "      <th></th>\n",
657 |        "      <th>AnswerId</th>\n",
658 |        "      <th>Text</th>\n",
659 |        "    </tr>\n",
660 |        "    <tr>\n",
661 |        "      <th>Id</th>\n",
662 |        "      <th></th>\n",
663 |        "      <th></th>\n",
664 |        "    </tr>\n",
665 |        "  </thead>\n",
666 |        "  <tbody>\n",
667 |        "    <tr>\n",
668 |        "      <th>69913</th>\n",
669 |        "      <td>69984</td>\n",
670 |        "      <td>why don't self-closing script tags work?. what...</td>\n",
671 |        "    </tr>\n",
672 |        "    <tr>\n",
673 |        "      <th>392561</th>\n",
674 |        "      <td>69984</td>\n",
675 |        "      <td>firefox script tag error. while adding some ve...</td>\n",
676 |        "    </tr>\n",
677 |        "    <tr>\n",
678 |        "      <th>1297308</th>\n",
679 |        "      <td>69984</td>\n",
680 |        "      <td>weird javascript/jquery behavior.  possible du...</td>\n",
681 |        "    </tr>\n",
682 |        "    <tr>\n",
683 |        "      <th>3352182</th>\n",
684 |        "      <td>69984</td>\n",
685 |        "      <td>html: why script tags should always have full ...</td>\n",
686 |        "    </tr>\n",
687 |        "    <tr>\n",
688 |        "      <th>5355867</th>\n",
689 |        "      <td>69984</td>\n",
690 |        "      <td>loading scripts in javascript.  possible dupli...</td>\n",
691 |        "    </tr>\n",
692 |        "  </tbody>\n",
693 |        "</table>\n",
694 |        "</div>"
695 |       ],
696 |       "text/plain": [
697 |        "         AnswerId                                               Text\n",
698 |        "Id                                                                  \n",
699 |        "69913       69984  why don't self-closing script tags work?. what...\n",
700 |        "392561      69984  firefox script tag error. while adding some ve...\n",
701 |        "1297308     69984  weird javascript/jquery behavior.  possible du...\n",
702 |        "3352182     69984  html: why script tags should always have full ...\n",
703 |        "5355867     69984  loading scripts in javascript.  possible dupli..."
704 |       ]
705 |      },
706 |      "execution_count": 17,
707 |      "metadata": {},
708 |      "output_type": "execute_result"
709 |     }
710 |    ],
711 |    "source": [
712 |     "print(\"# of training examples: \" + str(len(trainQ)))\n",
713 |     "print(\"# of testing examples: \" + str(len(testQ)) + \"\\n\")\n",
714 |     "print(\"A quick glance of the training data: \\n\")\n",
715 |     "trainQ[[\"AnswerId\", \"Text\"]].head(5)"
716 |    ]
717 |   },
718 |   {
719 |    "cell_type": "markdown",
720 |    "metadata": {},
721 |    "source": [
722 |     "## Save Outputs to a Share Directory in the Workbench\n",
723 |     "\n",
724 |     "Azure Machine Learning Workbench provides a flexible way of saving intermediate files. `os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')` retrieves a share directory where the files are stored. Those files can be accessed from other notebooks or Python files."
725 |    ]
726 |   },
727 |   {
728 |    "cell_type": "code",
729 |    "execution_count": 18,
730 |    "metadata": {
731 |     "collapsed": true
732 |    },
733 |    "outputs": [],
734 |    "source": [
735 |     "workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')\n",
736 |     "trainQ.to_csv(os.path.join(workfolder, 'trainQ_part1'), sep='\\t', header=True, index=True, index_label='Id')\n",
737 |     "testQ.to_csv(os.path.join(workfolder, 'testQ_part1'), sep='\\t', header=True, index=True, index_label='Id')"
738 |    ]
739 |   }
740 |  ],
741 |  "metadata": {
742 |   "kernelspec": {
743 |    "display_name": "Python [default]",
744 |    "language": "python",
745 |    "name": "python3"
746 |   },
747 |   "language_info": {
748 |    "codemirror_mode": {
749 |     "name": "ipython",
750 |     "version": 3
751 |    },
752 |    "file_extension": ".py",
753 |    "mimetype": "text/x-python",
754 |    "name": "python",
755 |    "nbconvert_exporter": "python",
756 |    "pygments_lexer": "ipython3",
757 |    "version": "3.5.2"
758 |   }
759 |  },
760 |  "nbformat": 4,
761 |  "nbformat_minor": 2
762 | }
763 | 


--------------------------------------------------------------------------------
/notebooks/Part_2_Phrase_Learning.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Part 2: Phrase Learning\n",
  8 |     "\n",
  9 |     "If you haven't complete the **Part 1: Data Preparation**, please complete it before moving forward with **Part 2: Phrase Learning**. Part 2 requires files created from Part 1.\n",
 10 |     "\n",
 11 |     "Please make sure you have __notebook__ and __nltk__ Python packages installed in the compute context you choose as kernel. For demonstration purpose, this series of notebooks uses the `local` compute context.\n",
 12 |     "\n",
 13 |     "**NOTE**: Python 3 kernel doesn't include Azure Machine Learning Workbench functionalities. Please switch the kernel to `local` before continuing further. \n",
 14 |     "\n",
 15 |     "To install __notebook__ and __nltk__, please uncomment and run the following script."
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 1,
 21 |    "metadata": {
 22 |     "collapsed": true
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "# !pip install --upgrade notebook\n",
 27 |     "# !pip install --upgrade nltk"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "### Import Required Python Modules\n",
 35 |     "\n",
 36 |     "`modules.phrase_learning` contains a list of Python user-defined Python modules to learn informative phrases that are used in this examples. You can find the source code of those modules in the directory of `modules/phrase_learning.py`."
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 2,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "import pandas as pd\n",
 48 |     "import numpy as np\n",
 49 |     "import re, os, requests, warnings\n",
 50 |     "from collections import (namedtuple, Counter)\n",
 51 |     "from modules.phrase_learning import (CleanAndSplitText, ComputeNgramStats, RankNgrams, ApplyPhraseRewrites,\n",
 52 |     "                            ApplyPhraseLearning, ApplyPhraseRewritesInPlace, ReconstituteDocsFromChunks,\n",
 53 |     "                            CreateVocabForTopicModeling)\n",
 54 |     "from azureml.logging import get_azureml_logger\n",
 55 |     "warnings.filterwarnings(\"ignore\")"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "metadata": {
 62 |     "collapsed": true
 63 |    },
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "run_logger = get_azureml_logger()\n",
 67 |     "run_logger.log('amlrealworld.QnA-matching.part2-phrase-learning','true')"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "## Access trainQ and testQ from Part 1\n",
 75 |     "\n",
 76 |     "As we have prepared the _trainQ_ and _testQ_ from the `Part 1: Data Preparation`, we retrieve the datasets here for the further process.\n",
 77 |     "\n",
 78 |     "_trainQ_ contains 5,153 training examples and _testQ_ contains 1,735 test examples. Also, there are 103 unique answer classes in both datasets."
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 3,
 84 |    "metadata": {
 85 |     "collapsed": true
 86 |    },
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "# load non-content bearing function words (.txt file) into a Python dictionary. \n",
 90 |     "def LoadListAsHash(fileURL):\n",
 91 |     "    response = requests.get(fileURL, stream=True)\n",
 92 |     "    wordsList = response.text.split('\\n')\n",
 93 |     "\n",
 94 |     "    # Read in lines one by one and strip away extra spaces, \n",
 95 |     "    # leading spaces, and trailing spaces and inserting each\n",
 96 |     "    # cleaned up line into a hash table.\n",
 97 |     "    listHash = {}\n",
 98 |     "    re1 = re.compile(' +')\n",
 99 |     "    re2 = re.compile('^ +| +$')\n",
100 |     "    for stringIn in wordsList:\n",
101 |     "        term = re2.sub(\"\",re1.sub(\" \",stringIn.strip('\\n')))\n",
102 |     "        if term != '':\n",
103 |     "            listHash[term] = 1\n",
104 |     "    return listHash"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 4,
110 |    "metadata": {
111 |     "collapsed": true
112 |    },
113 |    "outputs": [],
114 |    "source": [
115 |     "workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')\n",
116 |     "\n",
117 |     "# paths to trainQ, testQ and function words.\n",
118 |     "trainQ_path = os.path.join(workfolder, 'trainQ_part1')\n",
119 |     "testQ_path = os.path.join(workfolder, 'testQ_part1')\n",
120 |     "function_words_url = 'https://bostondata.blob.core.windows.net/stackoverflow/function_words.txt'\n",
121 |     "\n",
122 |     "# load the training and test data.\n",
123 |     "trainQ = pd.read_csv(trainQ_path, sep='\\t', index_col='Id', encoding='latin1')\n",
124 |     "testQ = pd.read_csv(testQ_path, sep='\\t', index_col='Id', encoding='latin1')\n",
125 |     "\n",
126 |     "# Load the list of non-content bearing function words.\n",
127 |     "functionwordHash = LoadListAsHash(function_words_url)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "## Clean and Split the Text\n",
135 |     "\n",
136 |     "The CleanAndSplitText function from __phrase_learning__ takes as input a list where each row element is a single cohesive long string of text, i.e. a \"question\". The function first splits each string by various forms of punctuation into chunks of text that are likely sentences, phrases or sub-phrases. The splitting is designed to prohibit the phrase learning process from using cross-sentence or cross-phrase word strings when learning phrases.\n",
137 |     "\n",
138 |     "The function returns a table where each row represents a chunk of text from the questions. The `DocID` coulmn indicates the original row index from associated question in the input from which the chunk of text originated. The `DocLine` column contains the original text excluding the punctuation marks and `HTML` markup that have been during the cleaning process. The `Lowercase Taxt` column contains a fully lower-cased version of the text in the `CleanedText` column."
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 5,
144 |    "metadata": {
145 |     "collapsed": false
146 |    },
147 |    "outputs": [
148 |     {
149 |      "name": "stdout",
150 |      "output_type": "stream",
151 |      "text": [
152 |       "[nltk_data] Downloading package punkt to\n",
153 |       "[nltk_data]     C:\\Users\\mez\\AppData\\Roaming\\nltk_data...\n",
154 |       "[nltk_data]   Package punkt is already up-to-date!\n",
155 |       "[nltk_data] Downloading package punkt to\n",
156 |       "[nltk_data]     C:\\Users\\mez\\AppData\\Roaming\\nltk_data...\n",
157 |       "[nltk_data]   Package punkt is already up-to-date!\n"
158 |      ]
159 |     }
160 |    ],
161 |    "source": [
162 |     "CleanedTrainQ = CleanAndSplitText(trainQ)\n",
163 |     "CleanedTestQ = CleanAndSplitText(testQ)"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 6,
169 |    "metadata": {
170 |     "collapsed": false
171 |    },
172 |    "outputs": [
173 |     {
174 |      "data": {
175 |       "text/html": [
176 |        "<div>\n",
177 |        "<table border=\"1\" class=\"dataframe\">\n",
178 |        "  <thead>\n",
179 |        "    <tr style=\"text-align: right;\">\n",
180 |        "      <th></th>\n",
181 |        "      <th>DocID</th>\n",
182 |        "      <th>DocLine</th>\n",
183 |        "      <th>CleanedText</th>\n",
184 |        "      <th>LowercaseText</th>\n",
185 |        "    </tr>\n",
186 |        "  </thead>\n",
187 |        "  <tbody>\n",
188 |        "    <tr>\n",
189 |        "      <th>0</th>\n",
190 |        "      <td>69913</td>\n",
191 |        "      <td>0</td>\n",
192 |        "      <td>why don't self-closing script tags work</td>\n",
193 |        "      <td>why don't self-closing script tags work</td>\n",
194 |        "    </tr>\n",
195 |        "    <tr>\n",
196 |        "      <th>1</th>\n",
197 |        "      <td>69913</td>\n",
198 |        "      <td>1</td>\n",
199 |        "      <td>what is the reason browsers do not correctly r...</td>\n",
200 |        "      <td>what is the reason browsers do not correctly r...</td>\n",
201 |        "    </tr>\n",
202 |        "    <tr>\n",
203 |        "      <th>2</th>\n",
204 |        "      <td>69913</td>\n",
205 |        "      <td>2</td>\n",
206 |        "      <td>only this is recognized</td>\n",
207 |        "      <td>only this is recognized</td>\n",
208 |        "    </tr>\n",
209 |        "    <tr>\n",
210 |        "      <th>3</th>\n",
211 |        "      <td>69913</td>\n",
212 |        "      <td>3</td>\n",
213 |        "      <td>does this break the concept of xhtml support</td>\n",
214 |        "      <td>does this break the concept of xhtml support</td>\n",
215 |        "    </tr>\n",
216 |        "    <tr>\n",
217 |        "      <th>4</th>\n",
218 |        "      <td>69913</td>\n",
219 |        "      <td>4</td>\n",
220 |        "      <td>note</td>\n",
221 |        "      <td>note</td>\n",
222 |        "    </tr>\n",
223 |        "  </tbody>\n",
224 |        "</table>\n",
225 |        "</div>"
226 |       ],
227 |       "text/plain": [
228 |        "   DocID  DocLine                                        CleanedText  \\\n",
229 |        "0  69913        0            why don't self-closing script tags work   \n",
230 |        "1  69913        1  what is the reason browsers do not correctly r...   \n",
231 |        "2  69913        2                            only this is recognized   \n",
232 |        "3  69913        3       does this break the concept of xhtml support   \n",
233 |        "4  69913        4                                               note   \n",
234 |        "\n",
235 |        "                                       LowercaseText  \n",
236 |        "0            why don't self-closing script tags work  \n",
237 |        "1  what is the reason browsers do not correctly r...  \n",
238 |        "2                            only this is recognized  \n",
239 |        "3       does this break the concept of xhtml support  \n",
240 |        "4                                               note  "
241 |       ]
242 |      },
243 |      "execution_count": 6,
244 |      "metadata": {},
245 |      "output_type": "execute_result"
246 |     }
247 |    ],
248 |    "source": [
249 |     "CleanedTrainQ.head(5)"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "markdown",
254 |    "metadata": {},
255 |    "source": [
256 |     "## Learn Informative Phrases \n",
257 |     "The phrases can be treated as single compound word units in down-stream processes such as discriminative training. To learn the phrases, we have implemented the basic framework for key phrase learning as described in the paper entitled [\"Modeling Multiword Phrases with Constrained Phrases Tree for Improved Topic Modeling of Conversational Speech\"](http://people.csail.mit.edu/hazen/publications/Hazen-SLT-2012.pdf) which was originally presented in the 2012 IEEE Workshop on Spoken Language Technology. Although the paper examines the use of the technology for analyzing human-to-human conversations, the techniques are quite general and can be applied to a wide range of natural language data including news stories, legal documents, research publications, social media forum discussions, customer feedback forms, product reviews, and many more.\n",
258 |     "\n",
259 |     "`ApplyPhraseLearning` module takes the following arguments:\n",
260 |     "- `textData`: array, a list of text data.\n",
261 |     "- `learnedPhrases`: array, a list of learned phrases. For initialization, an empty list should be given.\n",
262 |     "- `maxNumPhrases`: int, (default=200), maximium number of phrases to learn. If you want to test the code out quickly then set this to a small value (e.g. 100) and set `verbose` to True when running the quick test.\n",
263 |     "- `maxPhraseLength`: int, (default=7), maximum number of words allowed in the learned phrases.\n",
264 |     "- `maxPhrasesPerIter`: int, (default=50), maximum number of phrases to learn per iteration. Increasing this number may speed up processing but will affect the ordering of the phrases learned and good phrases could be by-passed if the maxNumPhrases is set to a small number.\n",
265 |     "- `minCount`: int, (default=5), minimum number of times a phrase must occur in the data to be considered during the phrase learning process.\n",
266 |     "- `functionwordHash`: dict, (default={}), a precreated hash table containing the list of function words used during phrase learning. \n",
267 |     "- `blacklistHash`: dict, (default={}), a precreated hash table containing the list of black list words to be ignored during phrase learning.\n",
268 |     "- `verbose`: boolean, (default=False). If verbose=True, it prints out the learned phrases to stdout buffer while its learning. This will generate a lot of text to stdout, so best to turn this off except for testing and debugging."
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 7,
274 |    "metadata": {
275 |     "collapsed": false
276 |    },
277 |    "outputs": [
278 |     {
279 |      "name": "stdout",
280 |      "output_type": "stream",
281 |      "text": [
282 |       "Start phrase learning with 0 phrases of 200 phrases learned\n",
283 |       "Iteration 1: Added 42 new phrases in 1.34 seconds (Learned 42 of max 200)\n",
284 |       "Iteration 2: Added 35 new phrases in 1.41 seconds (Learned 77 of max 200)\n",
285 |       "Iteration 3: Added 32 new phrases in 1.14 seconds (Learned 109 of max 200)\n",
286 |       "Iteration 4: Added 34 new phrases in 1.34 seconds (Learned 143 of max 200)\n",
287 |       "Iteration 5: Added 31 new phrases in 1.27 seconds (Learned 174 of max 200)\n",
288 |       "Iteration 6: Added 11 new phrases in 1.36 seconds (Learned 185 of max 200)\n",
289 |       "Iteration 7: Added 3 new phrases in 1.14 seconds (Learned 188 of max 200)\n",
290 |       "Iteration 8: Added 4 new phrases in 1.21 seconds (Learned 192 of max 200)\n",
291 |       "Iteration 9: Added 1 new phrases in 1.27 seconds (Learned 193 of max 200)\n",
292 |       "Iteration 10: Added 1 new phrases in 1.16 seconds (Learned 194 of max 200)\n",
293 |       "Iteration 11: Added 1 new phrases in 1.23 seconds (Learned 195 of max 200)\n",
294 |       "Iteration 12: Added 1 new phrases in 1.16 seconds (Learned 196 of max 200)\n",
295 |       "Iteration 13: Added 1 new phrases in 1.23 seconds (Learned 197 of max 200)\n",
296 |       "Iteration 14: Added 1 new phrases in 1.32 seconds (Learned 198 of max 200)\n",
297 |       "Iteration 15: Added 1 new phrases in 1.19 seconds (Learned 199 of max 200)\n",
298 |       "Iteration 16: Added 1 new phrases in 1.19 seconds (Learned 200 of max 200)\n",
299 |       "*** Phrase learning completed in 0.01 hours ***\n"
300 |      ]
301 |     }
302 |    ],
303 |    "source": [
304 |     "# Initialize an empty list of learned phrases\n",
305 |     "# If you have completed a partial run of phrase learning\n",
306 |     "# and want to add more phrases, you can use the pre-learned \n",
307 |     "# phrases as a starting point instead and the new phrases\n",
308 |     "# will be appended to the list\n",
309 |     "learnedPhrasesQ = []\n",
310 |     "\n",
311 |     "# Create a copy of the original text data that will be used during learning\n",
312 |     "# The copy is needed because the algorithm does in-place replacement of learned\n",
313 |     "# phrases directly on the text data structure it is provided\n",
314 |     "phraseTextDataQ = []\n",
315 |     "for textLine in CleanedTrainQ['LowercaseText']:\n",
316 |     "    phraseTextDataQ.append(' ' + textLine + ' ')\n",
317 |     "\n",
318 |     "# Run the phrase learning algorithm.\n",
319 |     "ApplyPhraseLearning(phraseTextDataQ, learnedPhrasesQ, maxNumPhrases=200, maxPhraseLength=7, maxPhrasesPerIter=50,\n",
320 |     "                    minCount=5, functionwordHash=functionwordHash)\n",
321 |     "\n",
322 |     "# Add text with learned phrases back into data frame\n",
323 |     "CleanedTrainQ['TextWithPhrases'] = phraseTextDataQ\n",
324 |     "\n",
325 |     "# Apply the phrase learning to test data.\n",
326 |     "CleanedTestQ['TextWithPhrases'] = ApplyPhraseRewritesInPlace(CleanedTestQ, 'LowercaseText', learnedPhrasesQ)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 8,
332 |    "metadata": {
333 |     "collapsed": false
334 |    },
335 |    "outputs": [
336 |     {
337 |      "name": "stdout",
338 |      "output_type": "stream",
339 |      "text": [
340 |       "\n",
341 |       "Here are some phrases we learned in this part of the tutorial: \n",
342 |       "\n",
343 |       "['possible duplicate', \"i'm trying\", 'works fine', 'doing wrong', 'click event', 'following code', 'using jquery', 'uncaught typeerror', 'ajax request', 'global variable', 'div class', 'json object', 'callback function', \"i'm not sure\", 'anonymous function', 'php file', 'return value', 'user clicks', 'dynamically created', 'input type']\n"
344 |      ]
345 |     }
346 |    ],
347 |    "source": [
348 |     "print(\"\\nHere are some phrases we learned in this part of the tutorial: \\n\")\n",
349 |     "print(learnedPhrasesQ[:20])"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "markdown",
354 |    "metadata": {},
355 |    "source": [
356 |     "## Reconstruct the Full Processed Text\n",
357 |     "\n",
358 |     "After replacing the text with learned phrases, we reconstruct the sentences from the chunks of text and insert the sentences in the `TextWithPhrases` field.  "
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 9,
364 |    "metadata": {
365 |     "collapsed": true
366 |    },
367 |    "outputs": [],
368 |    "source": [
369 |     "# reconstitue the text from seperated chunks.\n",
370 |     "trainQ['TextWithPhrases'] = ReconstituteDocsFromChunks(CleanedTrainQ, 'DocID', 'TextWithPhrases')\n",
371 |     "testQ['TextWithPhrases'] = ReconstituteDocsFromChunks(CleanedTestQ, 'DocID', 'TextWithPhrases')"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "markdown",
376 |    "metadata": {},
377 |    "source": [
378 |     "## Tokenize Text with Learned Phrases\n",
379 |     "\n",
380 |     "We learn a vocabulary by considering some text exclusion criteria, such as stop words, non-alphabetic words, the words below word count threshold, etc. \n",
381 |     "\n",
382 |     "`TokenizeText` module breaks the reconstituted text into individual tokens and excludes any word that doesn't exist in the vocabulary."
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": 10,
388 |    "metadata": {
389 |     "collapsed": true
390 |    },
391 |    "outputs": [],
392 |    "source": [
393 |     "def TokenizeText(textData, vocabHash):\n",
394 |     "    tokenizedText = ''\n",
395 |     "    for token in textData.split():\n",
396 |     "        if token in vocabHash:\n",
397 |     "            tokenizedText += (token.strip() + ',')\n",
398 |     "    return tokenizedText.strip(',')"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 11,
404 |    "metadata": {
405 |     "collapsed": false
406 |    },
407 |    "outputs": [
408 |     {
409 |      "name": "stdout",
410 |      "output_type": "stream",
411 |      "text": [
412 |       "Counting words\n",
413 |       "Building vocab\n",
414 |       "Excluded 307 stop words\n",
415 |       "Excluded 911 non-alphabetic words\n",
416 |       "Excluded 15265 words below word count threshold\n",
417 |       "Excluded 142 words below doc count threshold\n",
418 |       "Excluded 3 words above max doc frequency\n",
419 |       "Final Vocab Size: 3115 words\n"
420 |      ]
421 |     }
422 |    ],
423 |    "source": [
424 |     "# create the vocabulary.\n",
425 |     "vocabHashQ = CreateVocabForTopicModeling(trainQ['TextWithPhrases'], functionwordHash)\n",
426 |     "\n",
427 |     "# tokenize the text.\n",
428 |     "trainQ['Tokens'] = trainQ['TextWithPhrases'].apply(lambda x: TokenizeText(x, vocabHashQ))\n",
429 |     "testQ['Tokens'] = testQ['TextWithPhrases'].apply(lambda x: TokenizeText(x, vocabHashQ))"
430 |    ]
431 |   },
432 |   {
433 |    "cell_type": "code",
434 |    "execution_count": 12,
435 |    "metadata": {
436 |     "collapsed": false,
437 |     "scrolled": true
438 |    },
439 |    "outputs": [
440 |     {
441 |      "data": {
442 |       "text/html": [
443 |        "<div>\n",
444 |        "<table border=\"1\" class=\"dataframe\">\n",
445 |        "  <thead>\n",
446 |        "    <tr style=\"text-align: right;\">\n",
447 |        "      <th></th>\n",
448 |        "      <th>AnswerId</th>\n",
449 |        "      <th>Tokens</th>\n",
450 |        "    </tr>\n",
451 |        "    <tr>\n",
452 |        "      <th>Id</th>\n",
453 |        "      <th></th>\n",
454 |        "      <th></th>\n",
455 |        "    </tr>\n",
456 |        "  </thead>\n",
457 |        "  <tbody>\n",
458 |        "    <tr>\n",
459 |        "      <th>69913</th>\n",
460 |        "      <td>69984</td>\n",
461 |        "      <td>self-closing,script,tags,work,reason,browsers,...</td>\n",
462 |        "    </tr>\n",
463 |        "    <tr>\n",
464 |        "      <th>392561</th>\n",
465 |        "      <td>69984</td>\n",
466 |        "      <td>firefox,script,tag,error,adding,basic,script,t...</td>\n",
467 |        "    </tr>\n",
468 |        "    <tr>\n",
469 |        "      <th>1297308</th>\n",
470 |        "      <td>69984</td>\n",
471 |        "      <td>weird,javascript/jquery,behavior,possible_dupl...</td>\n",
472 |        "    </tr>\n",
473 |        "    <tr>\n",
474 |        "      <th>3352182</th>\n",
475 |        "      <td>69984</td>\n",
476 |        "      <td>html,script,tags,ending,possible_duplicate,t,s...</td>\n",
477 |        "    </tr>\n",
478 |        "    <tr>\n",
479 |        "      <th>5355867</th>\n",
480 |        "      <td>69984</td>\n",
481 |        "      <td>loading,scripts,possible_duplicate,don&amp;#39,t,s...</td>\n",
482 |        "    </tr>\n",
483 |        "  </tbody>\n",
484 |        "</table>\n",
485 |        "</div>"
486 |       ],
487 |       "text/plain": [
488 |        "         AnswerId                                             Tokens\n",
489 |        "Id                                                                  \n",
490 |        "69913       69984  self-closing,script,tags,work,reason,browsers,...\n",
491 |        "392561      69984  firefox,script,tag,error,adding,basic,script,t...\n",
492 |        "1297308     69984  weird,javascript/jquery,behavior,possible_dupl...\n",
493 |        "3352182     69984  html,script,tags,ending,possible_duplicate,t,s...\n",
494 |        "5355867     69984  loading,scripts,possible_duplicate,don&#39,t,s..."
495 |       ]
496 |      },
497 |      "execution_count": 12,
498 |      "metadata": {},
499 |      "output_type": "execute_result"
500 |     }
501 |    ],
502 |    "source": [
503 |     "trainQ[['AnswerId', 'Tokens']].head(5)"
504 |    ]
505 |   },
506 |   {
507 |    "cell_type": "markdown",
508 |    "metadata": {},
509 |    "source": [
510 |     "## Save Outputs to a Share Directory in the Workbench"
511 |    ]
512 |   },
513 |   {
514 |    "cell_type": "code",
515 |    "execution_count": 13,
516 |    "metadata": {
517 |     "collapsed": true
518 |    },
519 |    "outputs": [],
520 |    "source": [
521 |     "trainQ.to_csv(os.path.join(workfolder, 'trainQ_part2'), sep='\\t', header=True, index=True, index_label='Id')\n",
522 |     "testQ.to_csv(os.path.join(workfolder, 'testQ_part2'), sep='\\t', header=True, index=True, index_label='Id')"
523 |    ]
524 |   }
525 |  ],
526 |  "metadata": {
527 |   "kernelspec": {
528 |    "display_name": "Python [default]",
529 |    "language": "python",
530 |    "name": "python3"
531 |   },
532 |   "language_info": {
533 |    "codemirror_mode": {
534 |     "name": "ipython",
535 |     "version": 3
536 |    },
537 |    "file_extension": ".py",
538 |    "mimetype": "text/x-python",
539 |    "name": "python",
540 |    "nbconvert_exporter": "python",
541 |    "pygments_lexer": "ipython3",
542 |    "version": "3.5.2"
543 |   }
544 |  },
545 |  "nbformat": 4,
546 |  "nbformat_minor": 2
547 | }
548 | 


--------------------------------------------------------------------------------
/notebooks/Part_3_Model_Training_and_Evaluation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Part 3: Model Training and Evaluation\n",
  8 |     "\n",
  9 |     "If you haven't complete the **Part 1: Data Preparation** and **Part 2: Phrase Learning**, please complete them before moving forward with **Part 3: Model Training and Evaluation**.\n",
 10 |     "\n",
 11 |     "**NOTE**: Python 3 kernel doesn't include Azure Machine Learning Workbench functionalities. Please switch the kernel to `local` before continuing further. \n",
 12 |     "\n",
 13 |     "This example is designed to score new questions against the pre-existing Q&A pairs by training text classification models where each pre-existing Q&A pair is a unique class and a subset of the duplicate questions for each Q&A pair are available as training material. \n",
 14 |     "\n",
 15 |     "In the Part 3, the classification model uses an ensemble method to aggregate the following three base classifiers. In each base classifier, the `AnswerId` is used as the class label and the BOWs representations is used as the features.\n",
 16 |     "\n",
 17 |     "1. Naive Bayes Classifier\n",
 18 |     "2. Support Vector Machine (TF-IDF as features)\n",
 19 |     "3. Random Forest (NB Scores as features)\n",
 20 |     "\n",
 21 |     "Two different evaluation metrics are used to assess performance.\n",
 22 |     "1. `Average Rank (AR)`: indicates the average position where the correct answer is found in the list of retrieved Q&A pairs (out of the full set of 103 answer classes). \n",
 23 |     "2. `Top 3 Percentage`: indicates the percentage of test questions that the correct answer can be retrieved in the top three choices in the returned ranked list. \n",
 24 |     "\n",
 25 |     "`Average Rank (AR)` and `Top 3 Percentage` on the test set are calculated using the following formula:\n",
 26 |     "\n",
 27 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/evaluation_3.PNG?token=APoO9pHTAVmmb7YsGlsyWXgMHXDUz0xkks5Zwt4ywA%3D%3D\">"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "### Import Required Python Modules\n",
 35 |     "\n",
 36 |     "`modules.feature_extractor` contains a list of user-defined Python modules to extract effective features that are used in this examples. You can find the source code of those modules in the directory of `modules/feature_extractor.py`."
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 1,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "import pandas as pd\n",
 48 |     "import numpy as np\n",
 49 |     "import os, warnings\n",
 50 |     "from sklearn import svm\n",
 51 |     "from sklearn.ensemble import RandomForestClassifier\n",
 52 |     "from modules.feature_extractor import (tokensToIds, countMatrix, priorProbabilityAnswer, posterioriProb, \n",
 53 |     "                               feature_selection, featureWeights, wordProbabilityInAnswer, \n",
 54 |     "                               wordProbabilityNotinAnswer, normalizeTF, getIDF, softmax)\n",
 55 |     "from azureml.logging import get_azureml_logger\n",
 56 |     "warnings.filterwarnings(\"ignore\")"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {
 63 |     "collapsed": true
 64 |    },
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "run_logger = get_azureml_logger()\n",
 68 |     "run_logger.log('amlrealworld.QnA-matching.part3-model-training-eval','true')"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "metadata": {},
 74 |    "source": [
 75 |     "## Access trainQ and testQ from Part 2\n",
 76 |     "\n",
 77 |     "As we have prepared the _trainQ_ and _testQ_ with learned phrases and tokens from `Part 2: Phrase Learning`, we retrieve the datasets here for the further process.\n",
 78 |     "\n",
 79 |     "_trainQ_ contains 5,153 training examples and _testQ_ contains 1,735 test examples. Also, there are 103 unique answer classes in both datasets."
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 2,
 85 |    "metadata": {
 86 |     "collapsed": true
 87 |    },
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')\n",
 91 |     "\n",
 92 |     "# paths to trainQ and testQ.\n",
 93 |     "trainQ_path = os.path.join(workfolder, 'trainQ_part2')\n",
 94 |     "testQ_path = os.path.join(workfolder, 'testQ_part2')\n",
 95 |     "\n",
 96 |     "# load the training and test data.\n",
 97 |     "trainQ = pd.read_csv(trainQ_path, sep='\\t', index_col='Id', encoding='latin1')\n",
 98 |     "testQ = pd.read_csv(testQ_path, sep='\\t', index_col='Id', encoding='latin1')"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {},
104 |    "source": [
105 |     "## Extract Features\n",
106 |     "\n",
107 |     "Selecting the right set of features is very critical for the model training. In this section, we show you several feature extraction approaches that have proved to yield good performance in text classification use cases.\n",
108 |     "\n",
109 |     "### Term Frequency and Inverse Document Frequency (TF-IDF) \n",
110 |     "\n",
111 |     "TF-IDF is a commonly used feature weighting approach for text classification. \n",
112 |     "\n",
113 |     "Each question `d` is typically represented by a feature vector `x` that represents the contents of `d`. Because different questions may have different lengths, it can be useful to apply L1 normalization on the feature vector `x`. Therefore, a normalized `Term Frequency` matrix can be obtained based on the following formula.\n",
114 |     "\n",
115 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/tf.PNG?token=APoO9vtYknxorWSIoJ-dvhbNdu-3pjSIks5ZwuKzwA%3D%3D\">\n",
116 |     "\n",
117 |     "Considering all tokens observed in the training questions, we compute the `Inverse Document Frequency` for each token based on the following formula.\n",
118 |     "\n",
119 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/idf.PNG?token=APoO9gVRgPlRbg7OSaV56CO0-yj2178Iks5ZwuK-wA%3D%3D\">\n",
120 |     "\n",
121 |     "By knowing the `Term Frequency (TF)` matrix and `Inverse Document Frequency (IDF)` vector, we can simply compute `TF-IDF` matrix by multiplying them together.\n",
122 |     "\n",
123 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/tfidf.PNG?token=APoO9pllkWjHQTsshFCEGIUbyknjvq8Vks5ZwuMxwA%3D%3D\">"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 3,
129 |    "metadata": {
130 |     "collapsed": true
131 |    },
132 |    "outputs": [],
133 |    "source": [
134 |     "token2IdHashInit = tokensToIds(trainQ['Tokens'], featureHash=None)\n",
135 |     "\n",
136 |     "# get unique answerId in ascending order\n",
137 |     "uniqueAnswerId = list(np.unique(trainQ['AnswerId']))\n",
138 |     "\n",
139 |     "N_wQ = countMatrix(trainQ, token2IdHashInit)\n",
140 |     "idf = getIDF(N_wQ)\n",
141 |     "\n",
142 |     "x_wTrain = normalizeTF(trainQ, token2IdHashInit)\n",
143 |     "x_wTest = normalizeTF(testQ, token2IdHashInit)\n",
144 |     "\n",
145 |     "tfidfTrain = (x_wTrain.T * idf).T\n",
146 |     "tfidfTest = (x_wTest.T * idf).T"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "### Naive Bayes Scores\n",
154 |     "\n",
155 |     "Besides using the IDF as the word weighting mechnism, a hypothesis testing likelihood ratio approach is also implemented here. \n",
156 |     "\n",
157 |     "In this approach, the word weights are associated with the answer classes and are calculated using the following formula.\n",
158 |     "\n",
159 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/NB_weight.PNG?token=APoO9kRUjFMeslJIVyY3wpBy8ycfyddKks5ZwuNjwA%3D%3D\">\n",
160 |     "\n",
161 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/probability_function.PNG?token=APoO9v8kpp4bnjH00Tcr9qPA-tTs5Hezks5ZySQ8wA%3D%3D\">\n",
162 |     "\n",
163 |     "By knowing the `Term Frequency (TF)` matrix and `Weight` vector for each class, we can simply compute the `Naive Bayes Scores` matrix for each class by multiplying them together.\n",
164 |     "\n",
165 |     "#### Feature selection\n",
166 |     "\n",
167 |     "Text classification models often pre-select a set of features (i.e., tokens) which carry the most class relevant information for further processing while ignoring words that carry little to no value for identifying classes. A variety of feature selection methods have been previously explored for both text processing. In this example, we have had the most success selecting features based on the estimated class posterior probability `P(A|w)`, where `A` is a specific answer class and `w` is a specific token. The maximum a posteriori probability (MAP) estimate of `P(A|w)` is expressed as\n",
168 |     "\n",
169 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/feature_selection.PNG?token=APoO9uPZca25b_A2_7-I4m3v1P2K0jSrks5ZySROwA%3D%3D\">\n",
170 |     "\n",
171 |     "Feature selection in this example is performed by selecting the top `N` tokens which maximize for each `P(A|w)`. In order to determine the best value for the `TopN` parameter, you can simply run the `scripts/naive_bayes.py` with `local` compute context in the Azure Machine Learning Workbench and enter different integer values as `Arguments`.\n",
172 |     "\n",
173 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/run_naive_bayes.PNG?token=APoO9pKfKs4--gnpxNfM8Pueedv5oOwAks5ZwuXpwA%3D%3D\">\n",
174 |     "\n",
175 |     "Based our experiments, the `TopN = 19` yields the best result and is demonstrated in this notebook. "
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 4,
181 |    "metadata": {
182 |     "collapsed": true
183 |    },
184 |    "outputs": [],
185 |    "source": [
186 |     "# calculate the count matrix of all training questions.\n",
187 |     "N_wAInit = countMatrix(trainQ, token2IdHashInit, 'AnswerId', uniqueAnswerId)\n",
188 |     "\n",
189 |     "P_A = priorProbabilityAnswer(trainQ['AnswerId'], uniqueAnswerId)\n",
190 |     "P_Aw = posterioriProb(N_wAInit, P_A, uniqueAnswerId)\n",
191 |     "\n",
192 |     "# select top N important tokens per answer class.\n",
193 |     "featureHash = feature_selection(P_Aw, token2IdHashInit, topN=19)\n",
194 |     "token2IdHash = tokensToIds(trainQ['Tokens'], featureHash=featureHash)\n",
195 |     "\n",
196 |     "N_wA = countMatrix(trainQ, token2IdHash, 'AnswerId', uniqueAnswerId)\n",
197 |     "\n",
198 |     "alpha = 0.0001\n",
199 |     "P_w = featureWeights(N_wA, alpha)\n",
200 |     "\n",
201 |     "beta = 0.0001\n",
202 |     "P_wA = wordProbabilityInAnswer(N_wA, P_w, beta)\n",
203 |     "P_wNotA = wordProbabilityNotinAnswer(N_wA, P_w, beta)\n",
204 |     "\n",
205 |     "NBWeights = np.log(P_wA / P_wNotA)"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "metadata": {},
211 |    "source": [
212 |     "## Train Classification Models and Predict on Test Data"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "metadata": {},
218 |    "source": [
219 |     "### Naive Bayes Classifier\n",
220 |     "\n",
221 |     "We implement the _Naive Bayes Classifier_ as described in the paper entitled [\"MCE Training Techniques for Topic Identification of Spoken Audio Documents\"](http://ieeexplore.ieee.org/abstract/document/5742980/)."
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": 5,
227 |    "metadata": {
228 |     "collapsed": true
229 |    },
230 |    "outputs": [],
231 |    "source": [
232 |     "beta_A = 0\n",
233 |     "\n",
234 |     "x_wTest = normalizeTF(testQ, token2IdHash)\n",
235 |     "Y_test_prob1 = softmax(-beta_A + np.dot(x_wTest.T, NBWeights))"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "markdown",
240 |    "metadata": {},
241 |    "source": [
242 |     "### Support Vector Machine (TF-IDF as features)\n",
243 |     "\n",
244 |     "Traditionally, Support Vector Machine (SVM) model finds a hyperplane which maximally seperates positive and negative training tokens in a vector space. In its standard form, an SVM is a two-class classifier. To create a SVM model for a problem with multiple classes, a one-versus-rest (OVR) SVM classifier is typically learned for each answer class.\n",
245 |     "\n",
246 |     "The `sklearn` Python package implement such a classifier and we use the implementation in this example. More information about this `LinearSVC` classifier can be found [here](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html)."
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 6,
252 |    "metadata": {
253 |     "collapsed": true
254 |    },
255 |    "outputs": [],
256 |    "source": [
257 |     "X_train, Y_train = tfidfTrain.T, np.array(trainQ['AnswerId'])\n",
258 |     "clf = svm.LinearSVC(dual=True, multi_class='ovr', penalty='l2', C=1, loss=\"squared_hinge\", random_state=1)\n",
259 |     "clf.fit(X_train, Y_train)\n",
260 |     "\n",
261 |     "X_test = tfidfTest.T\n",
262 |     "Y_test_prob2 = softmax(clf.decision_function(X_test))"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "markdown",
267 |    "metadata": {},
268 |    "source": [
269 |     "### Random Forest (NB Scores as features)\n",
270 |     "\n",
271 |     "Similar to the above one-versus-rest SVM classifier, we also implement a one-versus-rest Random Forest classifier based on a base two-class Random Forest classifier from `sklearn`. More information about the `RandomForestClassifier` can be found [here](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html).\n",
272 |     "\n",
273 |     "In each base classifier, we dynamically compute the naive bayes scores for the positive class as the features. Since the number of negative examples is much larger than the number of positive examples, we hold all positive example and randomly select negative examples based on a negative to positive ratio to obtain a balanced training data. This is controlled by the `ratio` parameter in the `ovrClassifier` function below.\n",
274 |     "\n",
275 |     "In this classifier, we need to tune two hyper-parameters: `TopN` and `n_estimators`. `TopN` is the same parameter as we learned in the _Feature Selection_ step and `n_estimators` indicates the number of trees to be constructed in the Random Forest classifier. To identify the best values for the hyper-parameters, you can run `scripts/random_forest.py` with `local` compute context in the Azure Machine Learning Workbench and enter different integer values `Arguments`. The value of `TopN` and the value of `n_estimators` should be space delimited.\n",
276 |     "\n",
277 |     "<img src=\"https://raw.githubusercontent.com/Azure/MachineLearningSamples-QnAMatching/master/Image/run_rf.PNG?token=APoO9qTD6OH201WZFpAETKAWN3MII-Ocks5ZwumRwA%3D%3D\">\n",
278 |     "\n",
279 |     "Based our experiments, the `TopN = 19` and `n_estimators = 250` yields the best result, and are demonstrated in this notebook."
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": 7,
285 |    "metadata": {
286 |     "collapsed": true
287 |    },
288 |    "outputs": [],
289 |    "source": [
290 |     "# train one-vs-rest classifier using NB scores as features.\n",
291 |     "def ovrClassifier(trainLabels, x_wTrain, x_wTest, NBWeights, clf, ratio):\n",
292 |     "    uniqueLabel = np.unique(trainLabels)\n",
293 |     "    dummyLabels = pd.get_dummies(trainLabels)\n",
294 |     "    numTest = x_wTest.shape[1]\n",
295 |     "    Y_test_prob = np.zeros(shape=(numTest, len(uniqueLabel)))\n",
296 |     "\n",
297 |     "    for i in range(len(uniqueLabel)):\n",
298 |     "        X_train_all, Y_train_all = x_wTrain.T * NBWeights[:, i], dummyLabels.iloc[:, i]\n",
299 |     "        X_test = x_wTest.T * NBWeights[:, i]\n",
300 |     "        \n",
301 |     "        # with sample selection.\n",
302 |     "        if ratio is not None:\n",
303 |     "            # ratio = # of Negative/# of Positive\n",
304 |     "            posIdx = np.where(Y_train_all == 1)[0]\n",
305 |     "            negIdx = np.random.choice(np.where(Y_train_all == 0)[0], ratio*len(posIdx))\n",
306 |     "            allIdx = np.concatenate([posIdx, negIdx])\n",
307 |     "            X_train, Y_train = X_train_all[allIdx], Y_train_all.iloc[allIdx]\n",
308 |     "        else: # without sample selection.\n",
309 |     "            X_train, Y_train = X_train_all, Y_train_all\n",
310 |     "            \n",
311 |     "        clf.fit(X_train, Y_train)\n",
312 |     "        if hasattr(clf, \"decision_function\"):\n",
313 |     "            Y_test_prob[:, i] = clf.decision_function(X_test)\n",
314 |     "        else:\n",
315 |     "            Y_test_prob[:, i] = clf.predict_proba(X_test)[:, 1]\n",
316 |     "\n",
317 |     "    return softmax(Y_test_prob)"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": 8,
323 |    "metadata": {
324 |     "collapsed": true
325 |    },
326 |    "outputs": [],
327 |    "source": [
328 |     "x_wTrain = normalizeTF(trainQ, token2IdHash)\n",
329 |     "x_wTest = normalizeTF(testQ, token2IdHash)\n",
330 |     "\n",
331 |     "clf = RandomForestClassifier(n_estimators=250, criterion='entropy', random_state=1)\n",
332 |     "Y_test_prob3 = ovrClassifier(trainQ[\"AnswerId\"], x_wTrain, x_wTest, NBWeights, clf, ratio=3)"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "markdown",
337 |    "metadata": {},
338 |    "source": [
339 |     "### Ensemble Model\n",
340 |     "\n",
341 |     "We build an ensemble model by aggregating the predicted probabilities from three previously trained classifiers. The base classifiers are equally weighted in this ensemble method. "
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 9,
347 |    "metadata": {
348 |     "collapsed": true
349 |    },
350 |    "outputs": [],
351 |    "source": [
352 |     "Y_test_prob_aggr = np.mean([Y_test_prob1, Y_test_prob2, Y_test_prob3], axis=0)"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "markdown",
357 |    "metadata": {},
358 |    "source": [
359 |     "## Evaluate Model Performance\n",
360 |     "\n",
361 |     "Two different evaluation metrics are used to assess performance. \n",
362 |     "1. `Average Rank (AR)`: indicates the average position where the correct answer is found in the list of retrieved Q&A pairs (out of the full set of 103 answer classes). \n",
363 |     "2. `Top 3 Percentage`: indicates the percentage of test questions that the correct answer can be retrieved in the top three choices in the returned ranked list. "
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 10,
369 |    "metadata": {
370 |     "collapsed": true
371 |    },
372 |    "outputs": [],
373 |    "source": [
374 |     "# get the rank of answerIds for a given question. \n",
375 |     "def rank(frame, scores, uniqueAnswerId):\n",
376 |     "    frame['SortedAnswers'] = list(np.array(uniqueAnswerId)[np.argsort(-scores, axis=1)])\n",
377 |     "    \n",
378 |     "    rankList = []\n",
379 |     "    for i in range(len(frame)):\n",
380 |     "        rankList.append(np.where(frame['SortedAnswers'].iloc[i] == frame['AnswerId'].iloc[i])[0][0] + 1)\n",
381 |     "    frame['Rank'] = rankList\n",
382 |     "    \n",
383 |     "    return frame"
384 |    ]
385 |   },
386 |   {
387 |    "cell_type": "code",
388 |    "execution_count": 11,
389 |    "metadata": {
390 |     "collapsed": false
391 |    },
392 |    "outputs": [
393 |     {
394 |      "name": "stdout",
395 |      "output_type": "stream",
396 |      "text": [
397 |       "Average of rank: 5.0\n",
398 |       "Percentage of questions find answers in the first 3 choices: 0.684\n"
399 |      ]
400 |     }
401 |    ],
402 |    "source": [
403 |     "testQ = rank(testQ, Y_test_prob_aggr, uniqueAnswerId)\n",
404 |     "\n",
405 |     "AR = np.floor(testQ['Rank'].mean())\n",
406 |     "top3 = round(len(testQ.query('Rank <= 3'))/len(testQ), 3)\n",
407 |     " \n",
408 |     "print('Average of rank: ' + str(AR))\n",
409 |     "print('Percentage of questions find answers in the first 3 choices: ' + str(top3))"
410 |    ]
411 |   }
412 |  ],
413 |  "metadata": {
414 |   "kernelspec": {
415 |    "display_name": "Python [default]",
416 |    "language": "python",
417 |    "name": "python3"
418 |   },
419 |   "language_info": {
420 |    "codemirror_mode": {
421 |     "name": "ipython",
422 |     "version": 3
423 |    },
424 |    "file_extension": ".py",
425 |    "mimetype": "text/x-python",
426 |    "name": "python",
427 |    "nbconvert_exporter": "python",
428 |    "pygments_lexer": "ipython3",
429 |    "version": "3.5.2"
430 |   }
431 |  },
432 |  "nbformat": 4,
433 |  "nbformat_minor": 2
434 | }
435 | 


--------------------------------------------------------------------------------
/scripts/naive_bayes.py:
--------------------------------------------------------------------------------
  1 | ###################################################################################################################################
  2 | # This script is used for hyperparameter tunning of the Naive Bayes model described in the Part 3: Model Training and Evaluation.
  3 | # To run this script, please enter a non-negative integer in the "Argument" box above the screen.
  4 | # This argument indicates the number of important tokens that we selected for each answer class. 
  5 | # The total number of tokens that are selected for all classes construct the whole feature space.
  6 | ###################################################################################################################################
  7 | 
  8 | import pandas as pd
  9 | import numpy as np
 10 | import os, sys, warnings
 11 | from azureml.logging import get_azureml_logger
 12 | sys.path.append("")
 13 | from modules.feature_extractor import (tokensToIds, countMatrix, priorProbabilityAnswer, posterioriProb, 
 14 |                                        feature_selection, featureWeights, wordProbabilityInAnswer, 
 15 |                                        wordProbabilityNotinAnswer, normalizeTF, softmax)
 16 | warnings.filterwarnings("ignore")
 17 | 
 18 | run_logger = get_azureml_logger()
 19 | run_logger.log('amlrealworld.QnA-matching.naive-bayes','true')
 20 | 
 21 | 
 22 | 
 23 | #########################################
 24 | # User Defined Functions
 25 | #########################################
 26 | 
 27 | # get the rank of answerIds for a given question. 
 28 | def rank(frame, scores, uniqueAnswerId):
 29 |     frame['SortedAnswers'] = list(np.array(uniqueAnswerId)[np.argsort(-scores, axis=1)])
 30 |     
 31 |     rankList = []
 32 |     for i in range(len(frame)):
 33 |         rankList.append(np.where(frame['SortedAnswers'].iloc[i] == frame['AnswerId'].iloc[i])[0][0] + 1)
 34 |     frame['Rank'] = rankList
 35 |     
 36 |     return frame
 37 | 
 38 | 
 39 | #########################################
 40 | # Main Function
 41 | #########################################
 42 | 
 43 | def main():
 44 | 
 45 |     #########################################
 46 |     # Accept One Argument as Input
 47 |     #########################################
 48 | 
 49 |     try:
 50 |         topN = int(sys.argv[1])
 51 |     except IndexError:
 52 |         print("This script takes one argument. Please enter a valid non-negative integer number.\n")
 53 |         raise
 54 | 
 55 | 
 56 |     #########################################
 57 |     # Access trainQ and testQ from Part 2
 58 |     #########################################
 59 | 
 60 |     workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')
 61 |    
 62 |     # paths to trainQ and testQ.
 63 |     trainQ_path = os.path.join(workfolder, 'trainQ_part2')
 64 |     testQ_path = os.path.join(workfolder, 'testQ_part2')
 65 | 
 66 |     # load the training and test data.
 67 |     trainQ = pd.read_csv(trainQ_path, sep='\t', index_col='Id', encoding='latin1')
 68 |     testQ = pd.read_csv(testQ_path, sep='\t', index_col='Id', encoding='latin1')
 69 | 
 70 | 
 71 |     #########################################
 72 |     # Extract Features
 73 |     #########################################
 74 |     
 75 |     token2IdHashInit = tokensToIds(trainQ['Tokens'], featureHash=None)
 76 | 
 77 |     # get unique answerId in ascending order
 78 |     uniqueAnswerId = list(np.unique(trainQ['AnswerId']))
 79 | 
 80 |     # calculate the count matrix of all training questions.
 81 |     N_wAInit = countMatrix(trainQ, token2IdHashInit, 'AnswerId', uniqueAnswerId)
 82 | 
 83 |     P_A = priorProbabilityAnswer(trainQ['AnswerId'], uniqueAnswerId)
 84 |     P_Aw = posterioriProb(N_wAInit, P_A, uniqueAnswerId)
 85 | 
 86 |     # select top N important tokens per answer class.
 87 |     featureHash = feature_selection(P_Aw, token2IdHashInit, topN=topN)
 88 |     token2IdHash = tokensToIds(trainQ['Tokens'], featureHash=featureHash)
 89 | 
 90 |     N_wA = countMatrix(trainQ, token2IdHash, 'AnswerId', uniqueAnswerId)
 91 | 
 92 |     alpha = 0.0001
 93 |     P_w = featureWeights(N_wA, alpha)
 94 | 
 95 |     beta = 0.0001
 96 |     P_wA = wordProbabilityInAnswer(N_wA, P_w, beta)
 97 |     P_wNotA = wordProbabilityNotinAnswer(N_wA, P_w, beta)   
 98 | 
 99 | 
100 |     #########################################
101 |     # Train Naive Bayes Classifier
102 |     #########################################
103 | 
104 |     NBWeights = np.log(P_wA / P_wNotA)
105 | 
106 | 
107 |     #########################################
108 |     # Predict Probabilities on Test
109 |     #########################################
110 | 
111 |     beta_A = 0
112 |     x_wTest = normalizeTF(testQ, token2IdHash)
113 |     Y_test_prob = softmax(-beta_A + np.dot(x_wTest.T, NBWeights))
114 | 
115 | 
116 |     #########################################
117 |     # Evaluate Model Performance
118 |     #########################################
119 |     # We use two evaluation matrices (Average Rank and Top 3 Percentage) to test our model performance. 
120 |     # The Average Rank can be interpreted as in average at which position we can find the correct answer among all available answers for a given question.
121 |     # The Top 3 Percentage can be interpreted as how many percentage of the new questions that we can find their correct answers in the first 3 choices.
122 |     # sort the similarity scores in descending order and map them to the corresponding AnswerId in Answer set
123 | 
124 |     testQ = rank(testQ, Y_test_prob, uniqueAnswerId)
125 | 
126 |     AR = np.floor(testQ['Rank'].mean())
127 |     top3 = round(len(testQ.query('Rank <= 3'))/len(testQ), 3)
128 | 
129 |     print('Top %d important tokens selected per Class.' %topN)  
130 |     print('Average of rank: ' + str(AR))
131 |     print('Percentage of questions find answers in the first 3 choices: ' + str(top3))
132 | 
133 | 
134 |     #########################################
135 |     # Log Parameters and Performance
136 |     #########################################
137 | 
138 |     # initialize the logger
139 |     run_logger = get_azureml_logger()
140 | 
141 |     # log performance.
142 |     run_logger.log("Top N Tokens Selected", topN)
143 |     run_logger.log("Average Rank", AR)
144 |     run_logger.log("Top 3 Percentage", top3)
145 | 
146 | 
147 | 
148 | if __name__ == "__main__":
149 |     main()
150 |     print("\nRun is complete!")
151 | 


--------------------------------------------------------------------------------
/scripts/random_forest.py:
--------------------------------------------------------------------------------
  1 | ###################################################################################################################################
  2 | # This script is used for hyperparameter tunning of the Random Forest model described in the Part 3: Model Training and Evaluation.
  3 | # To run this script, please enter two non-negative integers seperated by a space in the "Argument" box above the screen.
  4 | # The first argument indicates the number of important tokens that we selected for each answer class. 
  5 | # The second argument indicates the number of trees to be constructed in the Random Forest model.
  6 | ###################################################################################################################################
  7 | 
  8 | 
  9 | import pandas as pd
 10 | import numpy as np
 11 | import os, sys, warnings
 12 | from azureml.logging import get_azureml_logger
 13 | from sklearn.ensemble import RandomForestClassifier
 14 | sys.path.append("")
 15 | from modules.feature_extractor import (tokensToIds, countMatrix, priorProbabilityAnswer, posterioriProb, 
 16 |                                        feature_selection, featureWeights, wordProbabilityInAnswer, 
 17 |                                        wordProbabilityNotinAnswer, normalizeTF, softmax)
 18 | from naive_bayes import (rank)
 19 | warnings.filterwarnings("ignore")
 20 | 
 21 | run_logger = get_azureml_logger()
 22 | run_logger.log('amlrealworld.QnA-matching.random-forest','true')
 23 | 
 24 | #########################################
 25 | # User Defined Functions
 26 | #########################################
 27 | 
 28 | # train one-vs-rest classifier using NB scores as features.
 29 | def ovrClassifier(trainLabels, x_wTrain, x_wTest, NBWeights, clf, ratio):
 30 |     uniqueLabel = np.unique(trainLabels)
 31 |     dummyLabels = pd.get_dummies(trainLabels)
 32 |     numTest = x_wTest.shape[1]
 33 |     Y_test_prob = np.zeros(shape=(numTest, len(uniqueLabel)))
 34 | 
 35 |     for i in range(len(uniqueLabel)):
 36 |         X_train_all, Y_train_all = x_wTrain.T * NBWeights[:, i], dummyLabels.iloc[:, i]
 37 |         X_test = x_wTest.T * NBWeights[:, i]
 38 |         
 39 |         # with sample selection.
 40 |         if ratio is not None:
 41 |             # ratio = # of Negative/# of Positive
 42 |             posIdx = np.where(Y_train_all == 1)[0]
 43 |             negIdx = np.random.choice(np.where(Y_train_all == 0)[0], ratio*len(posIdx))
 44 |             allIdx = np.concatenate([posIdx, negIdx])
 45 |             X_train, Y_train = X_train_all[allIdx], Y_train_all.iloc[allIdx]
 46 |         else: # without sample selection.
 47 |             X_train, Y_train = X_train_all, Y_train_all
 48 |             
 49 |         clf.fit(X_train, Y_train)
 50 |         if hasattr(clf, "decision_function"):
 51 |             Y_test_prob[:, i] = clf.decision_function(X_test)
 52 |         else:
 53 |             Y_test_prob[:, i] = clf.predict_proba(X_test)[:, 1]
 54 | 
 55 |     return softmax(Y_test_prob)
 56 | 
 57 | 
 58 | #########################################
 59 | # Main Function
 60 | #########################################
 61 | 
 62 | def main():
 63 | 
 64 |     #########################################
 65 |     # Accept One Argument as Input
 66 |     #########################################
 67 |     
 68 |     try:
 69 |         topN = int(sys.argv[1])
 70 |         n_estimators = int(sys.argv[2])
 71 |     except IndexError:
 72 |         print("This script takes two arguments. Please enter valid non-negative integer numbers.\n")
 73 |         raise
 74 | 
 75 | 
 76 |     #########################################
 77 |     # Access trainQ and testQ from Part 2
 78 |     #########################################
 79 | 
 80 |     workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')
 81 | 
 82 |     # paths to trainQ and testQ.
 83 |     trainQ_path = os.path.join(workfolder, 'trainQ_part2')
 84 |     testQ_path = os.path.join(workfolder, 'testQ_part2')
 85 | 
 86 |     # load the training and test data.
 87 |     trainQ = pd.read_csv(trainQ_path, sep='\t', index_col='Id', encoding='latin1')
 88 |     testQ = pd.read_csv(testQ_path, sep='\t', index_col='Id', encoding='latin1')
 89 | 
 90 | 
 91 |     #########################################
 92 |     # Extract Features
 93 |     #########################################
 94 |     
 95 |     token2IdHashInit = tokensToIds(trainQ['Tokens'], featureHash=None)
 96 | 
 97 |     # get unique answerId in ascending order
 98 |     uniqueAnswerId = list(np.unique(trainQ['AnswerId']))
 99 | 
100 |     # calculate the count matrix of all training questions.
101 |     N_wAInit = countMatrix(trainQ, token2IdHashInit, 'AnswerId', uniqueAnswerId)
102 | 
103 |     P_A = priorProbabilityAnswer(trainQ['AnswerId'], uniqueAnswerId)
104 |     P_Aw = posterioriProb(N_wAInit, P_A, uniqueAnswerId)
105 | 
106 |     # select top N important tokens per answer class.
107 |     featureHash = feature_selection(P_Aw, token2IdHashInit, topN=topN)
108 |     token2IdHash = tokensToIds(trainQ['Tokens'], featureHash=featureHash)
109 | 
110 |     N_wA = countMatrix(trainQ, token2IdHash, 'AnswerId', uniqueAnswerId)
111 | 
112 |     alpha = 0.0001
113 |     P_w = featureWeights(N_wA, alpha)
114 | 
115 |     beta = 0.0001
116 |     P_wA = wordProbabilityInAnswer(N_wA, P_w, beta)
117 |     P_wNotA = wordProbabilityNotinAnswer(N_wA, P_w, beta)   
118 | 
119 |     x_wTrain = normalizeTF(trainQ, token2IdHash)
120 |     x_wTest = normalizeTF(testQ, token2IdHash)
121 | 
122 | 
123 |     #########################################
124 |     # Train Naive Bayes Classifier
125 |     #########################################
126 | 
127 |     NBWeights = np.log(P_wA / P_wNotA)
128 |     clf = RandomForestClassifier(n_estimators=n_estimators, criterion='entropy', random_state=1)
129 | 
130 | 
131 |     #########################################
132 |     # Predict Probabilities on Test
133 |     #########################################
134 | 
135 |     Y_test_prob = ovrClassifier(trainQ["AnswerId"], x_wTrain, x_wTest, NBWeights, clf, ratio=3)
136 | 
137 | 
138 |     #########################################
139 |     # Evaluate Model Performance
140 |     #########################################
141 |     # We use two evaluation matrices (Average Rank and Top 3 Percentage) to test our model performance. 
142 |     # The Average Rank can be interpreted as in average at which position we can find the correct answer among all available answers for a given question.
143 |     # The Top 3 Percentage can be interpreted as how many percentage of the new questions that we can find their correct answers in the first 3 choices.
144 |     # sort the similarity scores in descending order and map them to the corresponding AnswerId in Answer set
145 | 
146 |     testQ = rank(testQ, Y_test_prob, uniqueAnswerId)
147 | 
148 |     AR = np.floor(testQ['Rank'].mean())
149 |     top3 = round(len(testQ.query('Rank <= 3'))/len(testQ), 3)
150 | 
151 |     print('Top %d important tokens selected per Class.' %topN)  
152 |     print('# of trees in the Random Forest: ' + str(n_estimators))
153 |     print('Average of rank: ' + str(AR))
154 |     print('Percentage of questions find answers in the first 3 choices: ' + str(top3))
155 | 
156 | 
157 |     #########################################
158 |     # Log Parameters and Performance
159 |     #########################################
160 | 
161 |     # initialize the logger.
162 |     run_logger = get_azureml_logger() 
163 | 
164 |     # log performance.
165 |     run_logger.log("Top N Tokens Selected", topN)
166 |     run_logger.log("Number of Trees", n_estimators)
167 |     run_logger.log("Average Rank", AR)
168 |     run_logger.log("Top 3 Percentage", top3)
169 | 
170 | 
171 | if __name__ == "__main__":
172 |     main()
173 |     print("\nRun is complete!")


--------------------------------------------------------------------------------