├── .gitignore ├── Image ├── NB_weight.PNG ├── data_diagram.png ├── evaluation_3.PNG ├── feature_selection.PNG ├── idf.PNG ├── probability_function.PNG ├── run_naive_bayes.PNG ├── run_rf.PNG ├── tf.PNG ├── tfidf.PNG └── training_size.PNG ├── LICENSE ├── README.md ├── aml_config ├── conda_dependencies.yml ├── docker.compute ├── docker.runconfig ├── jupyter_notebook_config.py ├── local.compute ├── local.runconfig └── spark_dependencies.yml ├── modules ├── __init__.py ├── feature_extractor.py └── phrase_learning.py ├── notebooks ├── Part_1_Data_Preparation.ipynb ├── Part_2_Phrase_Learning.ipynb └── Part_3_Model_Training_and_Evaluation.ipynb └── scripts ├── naive_bayes.py └── random_forest.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /Image/NB_weight.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/NB_weight.PNG -------------------------------------------------------------------------------- /Image/data_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/data_diagram.png -------------------------------------------------------------------------------- /Image/evaluation_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/evaluation_3.PNG -------------------------------------------------------------------------------- /Image/feature_selection.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/feature_selection.PNG -------------------------------------------------------------------------------- /Image/idf.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/idf.PNG -------------------------------------------------------------------------------- /Image/probability_function.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/probability_function.PNG -------------------------------------------------------------------------------- /Image/run_naive_bayes.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/run_naive_bayes.PNG -------------------------------------------------------------------------------- /Image/run_rf.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/run_rf.PNG -------------------------------------------------------------------------------- /Image/tf.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/tf.PNG -------------------------------------------------------------------------------- /Image/tfidf.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/tfidf.PNG -------------------------------------------------------------------------------- /Image/training_size.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure-Samples/MachineLearningSamples-QnAMatching/2683d34929037867eb1d89e79064828225b33f26/Image/training_size.PNG -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # QnA Matching
2 | 
3 | > **NOTE** This content is no longer maintained. Visit the [Azure Machine Learning Notebook](https://github.com/Azure/MachineLearningNotebooks) project for sample Jupyter notebooks for ML and deep learning with Azure Machine Learning.
4 | 
5 | 
6 | ## Link to the Microsoft DOCS site
7 | 
8 | The detailed documentation for this Q & A matching example includes the step-by-step walk-through:
9 | [https://docs.microsoft.com/azure/machine-learning/preview/scenario-qna-matching](https://docs.microsoft.com/azure/machine-learning/preview/scenario-qna-matching)
10 | 
11 | 
12 | ## Link to the Gallery GitHub repository
13 | 
14 | The public GitHub repository for this Q & A matching example contains all the code samples:
15 | [https://github.com/Azure/MachineLearningSamples-QnAMatching](https://github.com/Azure/MachineLearningSamples-QnAMatching)
16 | 
17 | 
18 | ## Overview
19 | 
20 | This example addresses the problem of mapping user questions to pre-existing Question & Answer (Q&A) pairs, as are typically provided in a list of Frequently Asked Questions (that is, a FAQ) or in the Q&A pairs present on websites like [Stack Overflow](https://stackoverflow.com/). There are many approaches to matching a question to its correct answer, such as finding the answer that is the most similar to the question. However, in this example, open-ended questions are matched to previously asked questions by assuming that each answer in the FAQ can answer multiple semantically equivalent questions.
21 | 
22 | The key steps required to deliver this solution are as follows:
23 | 
24 | 1. Clean and process text data.
25 | 2. Learn informative phrases, which are multi-word sequences that provide more information when viewed in sequence than when treated independently.
26 | 3. Extract features from text data.
27 | 4. Train text classification models and evaluate model performance.
28 | 
29 | 
30 | ## Key components needed to run this example
31 | 
32 | 1. An [Azure account](https://azure.microsoft.com/free/) (free trials are available).
33 | 2. An installed copy of Azure Machine Learning Workbench with a workspace created.
34 | 3. This example can be run on any compute context. However, it is recommended to run it on a multi-core machine with at least 16 GB of memory and 5 GB of disk space.
35 | 
36 | 
37 | ## Data / Telemetry
38 | QnA Matching collects usage data and sends it to Microsoft to help improve our products and services. Read our [privacy statement](http://go.microsoft.com/fwlink/?LinkId=521839) to learn more.
39 | 
40 | 
41 | ## Contributing
42 | 
43 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
44 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
45 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
46 | 47 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 48 | a CLA and decorate the PR appropriately (for example, label, comment). Simply follow the instructions 49 | provided by the bot. You will only need to do this once across all repos using our CLA. 50 | 51 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 52 | For more information, see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 53 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 54 | 55 | -------------------------------------------------------------------------------- /aml_config/conda_dependencies.yml: -------------------------------------------------------------------------------- 1 | # Conda environment specification. The dependencies defined in this file will be 2 | # automatically provisioned for runs against docker, VM, and HDI cluster targets. 3 | 4 | # Details about the Conda environment file format: 5 | # https://conda.io/docs/using/envs.html#create-environment-file-by-hand 6 | 7 | # For Spark packages and configuration, see spark_dependencies.yml. 8 | 9 | name: project_environment 10 | dependencies: 11 | - python=3.5.2 12 | - scikit-learn 13 | - pip: 14 | - notebook 15 | - nltk 16 | # The API for Azure Machine Learning Model Management Service. 17 | - azure-ml-api-sdk==0.1.0a11 18 | 19 | # Helper utilities for calculating dataprofiles from Pandas DataFrames. 20 | - https://azuremldownloads.blob.core.windows.net/wheels/latest/azureml.pyrecipes.dataframe-1.0.12-py3-none-any.whl?sv=2016-05-31&si=ro-2017&sr=c&sig=xnUdTm0B%2F%2FfknhTaRInBXyu2QTTt8wA3OsXwGVgU%2BJk%3D 21 | 22 | # Helper utilities for dealing with Azure ML Workbench Assets. 23 | - https://azuremldownloads.blob.core.windows.net/wheels/latest/azureml.assets-1.0.0-py3-none-any.whl?sv=2016-05-31&si=ro-2017&sr=c&sig=xnUdTm0B%2F%2FfknhTaRInBXyu2QTTt8wA3OsXwGVgU%2BJk%3D 24 | -------------------------------------------------------------------------------- /aml_config/docker.compute: -------------------------------------------------------------------------------- 1 | type: "docker" 2 | baseDockerImage: "microsoft/mmlspark:0.7.91" 3 | 4 | # Enabling Docker shared volumes will increase increase execution performance, 5 | # but the shared volume feature of Docker isn't stable on Windows yet. 6 | sharedVolumes: true 7 | -------------------------------------------------------------------------------- /aml_config/docker.runconfig: -------------------------------------------------------------------------------- 1 | ArgumentVector: 2 | - "$file" 3 | Target: "docker-python" 4 | EnvironmentVariables: 5 | "EXAMPLE_ENV_VAR": "Example Value" 6 | Framework: "Python" 7 | CondaDependenciesFile: "aml_config/conda_dependencies.yml" 8 | SparkDependenciesFile: "aml_config/spark_dependencies.yml" 9 | PrepareEnvironment: true 10 | TrackedRun: true -------------------------------------------------------------------------------- /aml_config/jupyter_notebook_config.py: -------------------------------------------------------------------------------- 1 | # Configuration file for jupyter-notebook. 2 | 3 | #------------------------------------------------------------------------------ 4 | # Application(SingletonConfigurable) configuration 5 | #------------------------------------------------------------------------------ 6 | 7 | ## This is an application. 
8 | 9 | ## The date format used by logging formatters for %(asctime)s 10 | #c.Application.log_datefmt = '%Y-%m-%d %H:%M:%S' 11 | 12 | ## The Logging format template 13 | #c.Application.log_format = '[%(name)s]%(highlevel)s %(message)s' 14 | 15 | ## Set the log level by value or name. 16 | #c.Application.log_level = 30 17 | 18 | #------------------------------------------------------------------------------ 19 | # JupyterApp(Application) configuration 20 | #------------------------------------------------------------------------------ 21 | 22 | ## Base class for Jupyter applications 23 | 24 | ## Answer yes to any prompts. 25 | #c.JupyterApp.answer_yes = False 26 | 27 | ## Full path of a config file. 28 | #c.JupyterApp.config_file = '' 29 | 30 | ## Specify a config file to load. 31 | #c.JupyterApp.config_file_name = '' 32 | 33 | ## Generate default config file. 34 | #c.JupyterApp.generate_config = False 35 | 36 | #------------------------------------------------------------------------------ 37 | # NotebookApp(JupyterApp) configuration 38 | #------------------------------------------------------------------------------ 39 | 40 | ## Set the Access-Control-Allow-Credentials: true header 41 | #c.NotebookApp.allow_credentials = False 42 | 43 | ## Set the Access-Control-Allow-Origin header 44 | # 45 | # Use '*' to allow any origin to access your server. 46 | # 47 | # Takes precedence over allow_origin_pat. 48 | #c.NotebookApp.allow_origin = '' 49 | 50 | ## Use a regular expression for the Access-Control-Allow-Origin header 51 | # 52 | # Requests from an origin matching the expression will get replies with: 53 | # 54 | # Access-Control-Allow-Origin: origin 55 | # 56 | # where `origin` is the origin of the request. 57 | # 58 | # Ignored if allow_origin is set. 59 | #c.NotebookApp.allow_origin_pat = '' 60 | 61 | ## Whether to allow the user to run the notebook as root. 62 | #c.NotebookApp.allow_root = False 63 | 64 | ## DEPRECATED use base_url 65 | #c.NotebookApp.base_project_url = '/' 66 | 67 | ## The base URL for the notebook server. 68 | # 69 | # Leading and trailing slashes can be omitted, and will automatically be added. 70 | #c.NotebookApp.base_url = '/' 71 | 72 | ## Specify what command to use to invoke a web browser when opening the notebook. 73 | # If not specified, the default browser will be determined by the `webbrowser` 74 | # standard library module, which allows setting of the BROWSER environment 75 | # variable to override it. 76 | #c.NotebookApp.browser = '' 77 | 78 | ## The full path to an SSL/TLS certificate file. 79 | #c.NotebookApp.certfile = '' 80 | 81 | ## The full path to a certificate authority certificate for SSL/TLS client 82 | # authentication. 83 | #c.NotebookApp.client_ca = '' 84 | 85 | ## The config manager class to use 86 | #c.NotebookApp.config_manager_class = 'notebook.services.config.manager.ConfigManager' 87 | 88 | ## The notebook manager class to use. 89 | #c.NotebookApp.contents_manager_class = 'notebook.services.contents.largefilemanager.LargeFileManager' 90 | 91 | ## Extra keyword arguments to pass to `set_secure_cookie`. See tornado's 92 | # set_secure_cookie docs for details. 93 | #c.NotebookApp.cookie_options = {} 94 | 95 | ## The random bytes used to secure cookies. By default this is a new random 96 | # number every time you start the Notebook. Set it to a value in a config file 97 | # to enable logins to persist across server sessions. 
98 | # 99 | # Note: Cookie secrets should be kept private, do not share config files with 100 | # cookie_secret stored in plaintext (you can read the value from a file). 101 | #c.NotebookApp.cookie_secret = b'' 102 | 103 | ## The file where the cookie secret is stored. 104 | #c.NotebookApp.cookie_secret_file = '' 105 | 106 | ## The default URL to redirect to from `/` 107 | #c.NotebookApp.default_url = '/tree' 108 | 109 | ## Disable cross-site-request-forgery protection 110 | # 111 | # Jupyter notebook 4.3.1 introduces protection from cross-site request 112 | # forgeries, requiring API requests to either: 113 | # 114 | # - originate from pages served by this server (validated with XSRF cookie and 115 | # token), or - authenticate with a token 116 | # 117 | # Some anonymous compute resources still desire the ability to run code, 118 | # completely without authentication. These services can disable all 119 | # authentication and security checks, with the full knowledge of what that 120 | # implies. 121 | #c.NotebookApp.disable_check_xsrf = False 122 | 123 | ## Whether to enable MathJax for typesetting math/TeX 124 | # 125 | # MathJax is the javascript library Jupyter uses to render math/LaTeX. It is 126 | # very large, so you may want to disable it if you have a slow internet 127 | # connection, or for offline use of the notebook. 128 | # 129 | # When disabled, equations etc. will appear as their untransformed TeX source. 130 | #c.NotebookApp.enable_mathjax = True 131 | 132 | ## extra paths to look for Javascript notebook extensions 133 | #c.NotebookApp.extra_nbextensions_path = [] 134 | 135 | ## Extra paths to search for serving static files. 136 | # 137 | # This allows adding javascript/css to be available from the notebook server 138 | # machine, or overriding individual files in the IPython 139 | #c.NotebookApp.extra_static_paths = [] 140 | 141 | ## Extra paths to search for serving jinja templates. 142 | # 143 | # Can be used to override templates from notebook.templates. 144 | #c.NotebookApp.extra_template_paths = [] 145 | 146 | ## 147 | #c.NotebookApp.file_to_run = '' 148 | 149 | ## Deprecated: Use minified JS file or not, mainly use during dev to avoid JS 150 | # recompilation 151 | #c.NotebookApp.ignore_minified_js = False 152 | 153 | ## (bytes/sec) Maximum rate at which messages can be sent on iopub before they 154 | # are limited. 155 | #c.NotebookApp.iopub_data_rate_limit = 1000000 156 | 157 | ## (msgs/sec) Maximum rate at which messages can be sent on iopub before they are 158 | # limited. 159 | #c.NotebookApp.iopub_msg_rate_limit = 1000 160 | 161 | ## The IP address the notebook server will listen on. 162 | #c.NotebookApp.ip = 'localhost' 163 | 164 | ## Supply extra arguments that will be passed to Jinja environment. 165 | #c.NotebookApp.jinja_environment_options = {} 166 | 167 | ## Extra variables to supply to jinja templates when rendering. 168 | #c.NotebookApp.jinja_template_vars = {} 169 | 170 | ## The kernel manager class to use. 171 | #c.NotebookApp.kernel_manager_class = 'notebook.services.kernels.kernelmanager.MappingKernelManager' 172 | 173 | ## The kernel spec manager class to use. Should be a subclass of 174 | # `jupyter_client.kernelspec.KernelSpecManager`. 175 | # 176 | # The Api of KernelSpecManager is provisional and might change without warning 177 | # between this version of Jupyter and the next stable one. 
178 | #c.NotebookApp.kernel_spec_manager_class = 'jupyter_client.kernelspec.KernelSpecManager' 179 | 180 | ## The full path to a private key file for usage with SSL/TLS. 181 | #c.NotebookApp.keyfile = '' 182 | 183 | ## The login handler class to use. 184 | #c.NotebookApp.login_handler_class = 'notebook.auth.login.LoginHandler' 185 | 186 | ## The logout handler class to use. 187 | #c.NotebookApp.logout_handler_class = 'notebook.auth.logout.LogoutHandler' 188 | 189 | ## The MathJax.js configuration file that is to be used. 190 | #c.NotebookApp.mathjax_config = 'TeX-AMS-MML_HTMLorMML-full,Safe' 191 | 192 | ## A custom url for MathJax.js. Should be in the form of a case-sensitive url to 193 | # MathJax, for example: /static/components/MathJax/MathJax.js 194 | #c.NotebookApp.mathjax_url = '' 195 | 196 | ## Dict of Python modules to load as notebook server extensions.Entry values can 197 | # be used to enable and disable the loading ofthe extensions. The extensions 198 | # will be loaded in alphabetical order. 199 | #c.NotebookApp.nbserver_extensions = {} 200 | 201 | ## The directory to use for notebooks and kernels. 202 | #c.NotebookApp.notebook_dir = '' 203 | 204 | ## Whether to open in a browser after starting. The specific browser used is 205 | # platform dependent and determined by the python standard library `webbrowser` 206 | # module, unless it is overridden using the --browser (NotebookApp.browser) 207 | # configuration option. 208 | #c.NotebookApp.open_browser = True 209 | 210 | ## Hashed password to use for web authentication. 211 | # 212 | # To generate, type in a python/IPython shell: 213 | # 214 | # from notebook.auth import passwd; passwd() 215 | # 216 | # The string should be of the form type:salt:hashed-password. 217 | #c.NotebookApp.password = '' 218 | 219 | ## Forces users to use a password for the Notebook server. This is useful in a 220 | # multi user environment, for instance when everybody in the LAN can access each 221 | # other's machine though ssh. 222 | # 223 | # In such a case, server the notebook server on localhost is not secure since 224 | # any user can connect to the notebook server via ssh. 225 | #c.NotebookApp.password_required = False 226 | 227 | ## The port the notebook server will listen on. 228 | #c.NotebookApp.port = 8888 229 | 230 | ## The number of additional ports to try if the specified port is not available. 231 | #c.NotebookApp.port_retries = 50 232 | 233 | ## DISABLED: use %pylab or %matplotlib in the notebook to enable matplotlib. 234 | #c.NotebookApp.pylab = 'disabled' 235 | 236 | ## (sec) Time window used to check the message and data rate limits. 237 | #c.NotebookApp.rate_limit_window = 3 238 | 239 | ## Reraise exceptions encountered loading server extensions? 240 | #c.NotebookApp.reraise_server_extension_failures = False 241 | 242 | ## DEPRECATED use the nbserver_extensions dict instead 243 | #c.NotebookApp.server_extensions = [] 244 | 245 | ## The session manager class to use. 246 | #c.NotebookApp.session_manager_class = 'notebook.services.sessions.sessionmanager.SessionManager' 247 | 248 | ## Supply SSL options for the tornado HTTPServer. See the tornado docs for 249 | # details. 250 | #c.NotebookApp.ssl_options = {} 251 | 252 | ## Supply overrides for terminado. Currently only supports "shell_command". 253 | #c.NotebookApp.terminado_settings = {} 254 | 255 | ## Token used for authenticating first-time connections to the server. 256 | # 257 | # When no password is enabled, the default is to generate a new, random token. 
258 | # 259 | # Setting to an empty string disables authentication altogether, which is NOT 260 | # RECOMMENDED. 261 | #c.NotebookApp.token = '' 262 | 263 | ## Supply overrides for the tornado.web.Application that the Jupyter notebook 264 | # uses. 265 | #c.NotebookApp.tornado_settings = {} 266 | 267 | ## Whether to trust or not X-Scheme/X-Forwarded-Proto and X-Real-Ip/X-Forwarded- 268 | # For headerssent by the upstream reverse proxy. Necessary if the proxy handles 269 | # SSL 270 | #c.NotebookApp.trust_xheaders = False 271 | 272 | ## DEPRECATED, use tornado_settings 273 | #c.NotebookApp.webapp_settings = {} 274 | 275 | ## The base URL for websockets, if it differs from the HTTP server (hint: it 276 | # almost certainly doesn't). 277 | # 278 | # Should be in the form of an HTTP origin: ws[s]://hostname[:port] 279 | #c.NotebookApp.websocket_url = '' 280 | 281 | #------------------------------------------------------------------------------ 282 | # ConnectionFileMixin(LoggingConfigurable) configuration 283 | #------------------------------------------------------------------------------ 284 | 285 | ## Mixin for configurable classes that work with connection files 286 | 287 | ## JSON file in which to store connection info [default: kernel-.json] 288 | # 289 | # This file will contain the IP, ports, and authentication key needed to connect 290 | # clients to this kernel. By default, this file will be created in the security 291 | # dir of the current profile, but can be specified by absolute path. 292 | #c.ConnectionFileMixin.connection_file = '' 293 | 294 | ## set the control (ROUTER) port [default: random] 295 | #c.ConnectionFileMixin.control_port = 0 296 | 297 | ## set the heartbeat port [default: random] 298 | #c.ConnectionFileMixin.hb_port = 0 299 | 300 | ## set the iopub (PUB) port [default: random] 301 | #c.ConnectionFileMixin.iopub_port = 0 302 | 303 | ## Set the kernel's IP address [default localhost]. If the IP address is 304 | # something other than localhost, then Consoles on other machines will be able 305 | # to connect to the Kernel, so be careful! 306 | #c.ConnectionFileMixin.ip = '' 307 | 308 | ## set the shell (ROUTER) port [default: random] 309 | #c.ConnectionFileMixin.shell_port = 0 310 | 311 | ## set the stdin (ROUTER) port [default: random] 312 | #c.ConnectionFileMixin.stdin_port = 0 313 | 314 | ## 315 | #c.ConnectionFileMixin.transport = 'tcp' 316 | 317 | #------------------------------------------------------------------------------ 318 | # KernelManager(ConnectionFileMixin) configuration 319 | #------------------------------------------------------------------------------ 320 | 321 | ## Manages a single kernel in a subprocess on this host. 322 | # 323 | # This version starts kernels with Popen. 324 | 325 | ## Should we autorestart the kernel if it dies. 326 | #c.KernelManager.autorestart = True 327 | 328 | ## DEPRECATED: Use kernel_name instead. 329 | # 330 | # The Popen Command to launch the kernel. Override this if you have a custom 331 | # kernel. If kernel_cmd is specified in a configuration file, Jupyter does not 332 | # pass any arguments to the kernel, because it cannot make any assumptions about 333 | # the arguments that the kernel understands. In particular, this means that the 334 | # kernel does not receive the option --debug if it given on the Jupyter command 335 | # line. 336 | #c.KernelManager.kernel_cmd = [] 337 | 338 | ## Time to wait for a kernel to terminate before killing it, in seconds. 
339 | #c.KernelManager.shutdown_wait_time = 5.0 340 | 341 | #------------------------------------------------------------------------------ 342 | # Session(Configurable) configuration 343 | #------------------------------------------------------------------------------ 344 | 345 | ## Object for handling serialization and sending of messages. 346 | # 347 | # The Session object handles building messages and sending them with ZMQ sockets 348 | # or ZMQStream objects. Objects can communicate with each other over the 349 | # network via Session objects, and only need to work with the dict-based IPython 350 | # message spec. The Session will handle serialization/deserialization, security, 351 | # and metadata. 352 | # 353 | # Sessions support configurable serialization via packer/unpacker traits, and 354 | # signing with HMAC digests via the key/keyfile traits. 355 | # 356 | # Parameters ---------- 357 | # 358 | # debug : bool 359 | # whether to trigger extra debugging statements 360 | # packer/unpacker : str : 'json', 'pickle' or import_string 361 | # importstrings for methods to serialize message parts. If just 362 | # 'json' or 'pickle', predefined JSON and pickle packers will be used. 363 | # Otherwise, the entire importstring must be used. 364 | # 365 | # The functions must accept at least valid JSON input, and output *bytes*. 366 | # 367 | # For example, to use msgpack: 368 | # packer = 'msgpack.packb', unpacker='msgpack.unpackb' 369 | # pack/unpack : callables 370 | # You can also set the pack/unpack callables for serialization directly. 371 | # session : bytes 372 | # the ID of this Session object. The default is to generate a new UUID. 373 | # username : unicode 374 | # username added to message headers. The default is to ask the OS. 375 | # key : bytes 376 | # The key used to initialize an HMAC signature. If unset, messages 377 | # will not be signed or checked. 378 | # keyfile : filepath 379 | # The file containing a key. If this is set, `key` will be initialized 380 | # to the contents of the file. 381 | 382 | ## Threshold (in bytes) beyond which an object's buffer should be extracted to 383 | # avoid pickling. 384 | #c.Session.buffer_threshold = 1024 385 | 386 | ## Whether to check PID to protect against calls after fork. 387 | # 388 | # This check can be disabled if fork-safety is handled elsewhere. 389 | #c.Session.check_pid = True 390 | 391 | ## Threshold (in bytes) beyond which a buffer should be sent without copying. 392 | #c.Session.copy_threshold = 65536 393 | 394 | ## Debug output in the Session 395 | #c.Session.debug = False 396 | 397 | ## The maximum number of digests to remember. 398 | # 399 | # The digest history will be culled when it exceeds this value. 400 | #c.Session.digest_history_size = 65536 401 | 402 | ## The maximum number of items for a container to be introspected for custom 403 | # serialization. Containers larger than this are pickled outright. 404 | #c.Session.item_threshold = 64 405 | 406 | ## execution key, for signing messages. 407 | #c.Session.key = b'' 408 | 409 | ## path to file containing execution key. 410 | #c.Session.keyfile = '' 411 | 412 | ## Metadata dictionary, which serves as the default top-level metadata dict for 413 | # each message. 414 | #c.Session.metadata = {} 415 | 416 | ## The name of the packer for serializing messages. Should be one of 'json', 417 | # 'pickle', or an import name for a custom callable serializer. 418 | #c.Session.packer = 'json' 419 | 420 | ## The UUID identifying this session. 
421 | #c.Session.session = '' 422 | 423 | ## The digest scheme used to construct the message signatures. Must have the form 424 | # 'hmac-HASH'. 425 | #c.Session.signature_scheme = 'hmac-sha256' 426 | 427 | ## The name of the unpacker for unserializing messages. Only used with custom 428 | # functions for `packer`. 429 | #c.Session.unpacker = 'json' 430 | 431 | ## Username for the Session. Default is your system username. 432 | #c.Session.username = 'username' 433 | 434 | #------------------------------------------------------------------------------ 435 | # MultiKernelManager(LoggingConfigurable) configuration 436 | #------------------------------------------------------------------------------ 437 | 438 | ## A class for managing multiple kernels. 439 | 440 | ## The name of the default kernel to start 441 | #c.MultiKernelManager.default_kernel_name = 'python3' 442 | 443 | ## The kernel manager class. This is configurable to allow subclassing of the 444 | # KernelManager for customized behavior. 445 | #c.MultiKernelManager.kernel_manager_class = 'jupyter_client.ioloop.IOLoopKernelManager' 446 | 447 | #------------------------------------------------------------------------------ 448 | # MappingKernelManager(MultiKernelManager) configuration 449 | #------------------------------------------------------------------------------ 450 | 451 | ## A KernelManager that handles notebook mapping and HTTP error handling 452 | 453 | ## 454 | #c.MappingKernelManager.root_dir = '' 455 | 456 | #------------------------------------------------------------------------------ 457 | # ContentsManager(LoggingConfigurable) configuration 458 | #------------------------------------------------------------------------------ 459 | 460 | ## Base class for serving files and directories. 461 | # 462 | # This serves any text or binary file, as well as directories, with special 463 | # handling for JSON notebook documents. 464 | # 465 | # Most APIs take a path argument, which is always an API-style unicode path, and 466 | # always refers to a directory. 467 | # 468 | # - unicode, not url-escaped 469 | # - '/'-separated 470 | # - leading and trailing '/' will be stripped 471 | # - if unspecified, path defaults to '', 472 | # indicating the root path. 473 | 474 | ## 475 | #c.ContentsManager.checkpoints = None 476 | 477 | ## 478 | #c.ContentsManager.checkpoints_class = 'notebook.services.contents.checkpoints.Checkpoints' 479 | 480 | ## 481 | #c.ContentsManager.checkpoints_kwargs = {} 482 | 483 | ## Glob patterns to hide in file and directory listings. 484 | #c.ContentsManager.hide_globs = ['__pycache__', '*.pyc', '*.pyo', '.DS_Store', '*.so', '*.dylib', '*~'] 485 | 486 | ## Python callable or importstring thereof 487 | # 488 | # To be called on a contents model prior to save. 489 | # 490 | # This can be used to process the structure, such as removing notebook outputs 491 | # or other side effects that should not be saved. 492 | # 493 | # It will be called as (all arguments passed by keyword):: 494 | # 495 | # hook(path=path, model=model, contents_manager=self) 496 | # 497 | # - model: the model to be saved. Includes file contents. 498 | # Modifying this dict will affect the file that is stored. 499 | # - path: the API path of the save destination 500 | # - contents_manager: this ContentsManager instance 501 | #c.ContentsManager.pre_save_hook = None 502 | 503 | ## 504 | #c.ContentsManager.root_dir = '/' 505 | 506 | ## The base name used when creating untitled directories. 
507 | #c.ContentsManager.untitled_directory = 'Untitled Folder' 508 | 509 | ## The base name used when creating untitled files. 510 | #c.ContentsManager.untitled_file = 'untitled' 511 | 512 | ## The base name used when creating untitled notebooks. 513 | #c.ContentsManager.untitled_notebook = 'Untitled' 514 | 515 | #------------------------------------------------------------------------------ 516 | # FileManagerMixin(Configurable) configuration 517 | #------------------------------------------------------------------------------ 518 | 519 | ## Mixin for ContentsAPI classes that interact with the filesystem. 520 | # 521 | # Provides facilities for reading, writing, and copying both notebooks and 522 | # generic files. 523 | # 524 | # Shared by FileContentsManager and FileCheckpoints. 525 | # 526 | # Note ---- Classes using this mixin must provide the following attributes: 527 | # 528 | # root_dir : unicode 529 | # A directory against against which API-style paths are to be resolved. 530 | # 531 | # log : logging.Logger 532 | 533 | ## By default notebooks are saved on disk on a temporary file and then if 534 | # succefully written, it replaces the old ones. This procedure, namely 535 | # 'atomic_writing', causes some bugs on file system whitout operation order 536 | # enforcement (like some networked fs). If set to False, the new notebook is 537 | # written directly on the old one which could fail (eg: full filesystem or quota 538 | # ) 539 | #c.FileManagerMixin.use_atomic_writing = True 540 | 541 | #------------------------------------------------------------------------------ 542 | # FileContentsManager(FileManagerMixin,ContentsManager) configuration 543 | #------------------------------------------------------------------------------ 544 | 545 | ## Python callable or importstring thereof 546 | # 547 | # to be called on the path of a file just saved. 548 | # 549 | # This can be used to process the file on disk, such as converting the notebook 550 | # to a script or HTML via nbconvert. 551 | # 552 | # It will be called as (all arguments passed by keyword):: 553 | # 554 | # hook(os_path=os_path, model=model, contents_manager=instance) 555 | # 556 | # - path: the filesystem path to the file just written - model: the model 557 | # representing the file - contents_manager: this ContentsManager instance 558 | #c.FileContentsManager.post_save_hook = None 559 | 560 | ## 561 | #c.FileContentsManager.root_dir = '' 562 | 563 | ## DEPRECATED, use post_save_hook. Will be removed in Notebook 5.0 564 | #c.FileContentsManager.save_script = False 565 | 566 | #------------------------------------------------------------------------------ 567 | # NotebookNotary(LoggingConfigurable) configuration 568 | #------------------------------------------------------------------------------ 569 | 570 | ## A class for computing and verifying notebook signatures. 571 | 572 | ## The hashing algorithm used to sign notebooks. 573 | #c.NotebookNotary.algorithm = 'sha256' 574 | 575 | ## The sqlite file in which to store notebook signatures. By default, this will 576 | # be in your Jupyter data directory. You can set it to ':memory:' to disable 577 | # sqlite writing to the filesystem. 578 | #c.NotebookNotary.db_file = '' 579 | 580 | ## The secret key with which notebooks are signed. 581 | #c.NotebookNotary.secret = b'' 582 | 583 | ## The file where the secret key is stored. 584 | #c.NotebookNotary.secret_file = '' 585 | 586 | ## A callable returning the storage backend for notebook signatures. 
The default 587 | # uses an SQLite database. 588 | #c.NotebookNotary.store_factory = traitlets.Undefined 589 | 590 | #------------------------------------------------------------------------------ 591 | # KernelSpecManager(LoggingConfigurable) configuration 592 | #------------------------------------------------------------------------------ 593 | 594 | ## If there is no Python kernelspec registered and the IPython kernel is 595 | # available, ensure it is added to the spec list. 596 | #c.KernelSpecManager.ensure_native_kernel = True 597 | 598 | ## The kernel spec class. This is configurable to allow subclassing of the 599 | # KernelSpecManager for customized behavior. 600 | #c.KernelSpecManager.kernel_spec_class = 'jupyter_client.kernelspec.KernelSpec' 601 | 602 | ## Whitelist of allowed kernel names. 603 | # 604 | # By default, all installed kernels are allowed. 605 | #c.KernelSpecManager.whitelist = set() 606 | -------------------------------------------------------------------------------- /aml_config/local.compute: -------------------------------------------------------------------------------- 1 | type: "local" 2 | pythonLocation: "python" 3 | sparkSubmitLocation: "spark-submit" 4 | nativeSharedDirectory: "~/.azureml/share/" -------------------------------------------------------------------------------- /aml_config/local.runconfig: -------------------------------------------------------------------------------- 1 | ArgumentVector: 2 | - "$file" 3 | Target: "local" 4 | EnvironmentVariables: 5 | "EXAMPLE_ENV_VAR": "Example Value" 6 | Framework: "Python" 7 | CondaDependenciesFile: "aml_config/conda_dependencies.yml" 8 | SparkDependenciesFile: "aml_config/spark_dependencies.yml" 9 | PrepareEnvironment: true 10 | TrackedRun: true -------------------------------------------------------------------------------- /aml_config/spark_dependencies.yml: -------------------------------------------------------------------------------- 1 | # Spark configuration and packages specification. The dependencies defined in 2 | # this file will be automatically provisioned for each run that uses Spark. 3 | 4 | # Spark configuration values can be set through the configuration dictionary. 5 | # Spark packages can be added through the repositories and packages lists. 6 | 7 | # For third-party python libraries, see conda_dependencies.yml. 
8 | 
9 | configuration:
10 |   "spark.app.name": "AzureML Experiment"
11 | repositories:
12 |   - "https://mmlspark.azureedge.net/maven"
13 | packages:
14 |   - group: "com.microsoft.ml.spark"
15 |     artifact: "mmlspark_2.11"
16 |     version: "0.7.91"
17 | 
--------------------------------------------------------------------------------
/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .feature_extractor import *
2 | from .phrase_learning import *
--------------------------------------------------------------------------------
/modules/feature_extractor.py:
--------------------------------------------------------------------------------
1 | ################################################
2 | # User Defined Functions for Feature Extraction
3 | ################################################
4 | 
5 | import pandas as pd
6 | import numpy as np
7 | from azureml.logging import get_azureml_logger
8 | 
9 | run_logger = get_azureml_logger()
10 | run_logger.log('amlrealworld.QnA-matching.feature-extractor','true')
11 | 
12 | 
13 | # get Token to ID mapping: {Token: tokenId}
14 | def tokensToIds(tokens, featureHash):
15 |     token2IdHash = {}
16 |     for i in range(len(tokens)):
17 |         tokenList = tokens.iloc[i].split(',')
18 |         if featureHash is None:
19 |             for t in tokenList:
20 |                 if t not in token2IdHash.keys():
21 |                     token2IdHash[t] = len(token2IdHash)
22 |         else:
23 |             for t in tokenList:
24 |                 if t not in token2IdHash.keys() and t in list(featureHash.keys()):
25 |                     token2IdHash[t] = len(token2IdHash)
26 | 
27 |     return token2IdHash
28 | 
29 | # create a matrix to store the token frequency.
30 | def countMatrix(frame, token2IdHash, labelColumnName=None, uniqueLabel=None):
31 |     # create an empty matrix with the shape of:
32 |     # num_row = num of unique tokens
33 |     # num_column = num of unique answerIds (N_wA) or num of questions in testQ (tfMatrix)
34 |     # rowIdx = token2IdHash.values()
35 |     # colIdx = index of uniqueClass (N_wA) or index of questions in testQ (tfMatrix)
36 |     num_row = len(token2IdHash)
37 |     if uniqueLabel is not None: # get N_wA
38 |         num_column = len(uniqueLabel)
39 |     else:
40 |         num_column = len(frame)
41 |     countMatrix = np.zeros(shape=(num_row, num_column))
42 | 
43 |     # loop through each question in the frame to fill in the countMatrix with corresponding counts
44 |     for i in range(len(frame)):
45 |         tokens = frame['Tokens'].iloc[i].split(',')
46 |         if uniqueLabel is not None: # get N_wA
47 |             label = frame[labelColumnName].iloc[i]
48 |             colIdx = uniqueLabel.index(label)
49 |         else:
50 |             colIdx = i
51 | 
52 |         for t in tokens:
53 |             if t in token2IdHash.keys():
54 |                 rowIdx = token2IdHash[t]
55 |                 countMatrix[rowIdx, colIdx] += 1
56 | 
57 |     return countMatrix
58 | 
59 | # calculate the prior probability of each answer class P(A): [P_A1, P_A2, ...]
60 | def priorProbabilityAnswer(answerIds, uniqueLabel):
61 |     P_A = []
62 |     # convert a pandas series to a list
63 |     answerIds = list(answerIds)
64 | 
65 |     for id in uniqueLabel:
66 |         P_A.append(answerIds.count(id)/len(answerIds))
67 |     return np.array(P_A)
68 | 
69 | # calculate the conditional probability of each answer class given a token P(A|w).
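# Worked example (illustration only, with made-up counts): posterioriProb below
# smooths each token's answer distribution toward the class priors using
# P(A|w) = (N_w|A + N_A * P(A)) / (N_w + N_A).
# For a single token w with counts N_w|A = [2, 0] over two answer classes and
# priors P(A) = [0.5, 0.5], the numerator is [2, 0] + 2 * [0.5, 0.5] = [3, 1] and
# the denominator is 2 + 2 = 4, giving P(A|w) = [0.75, 0.25]; answer classes never
# observed with w therefore still receive non-zero probability.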
70 | def posterioriProb(N_wAInit, P_A, uniqueLabel):
71 |     # N_A is the total number of answers
72 |     N_A = len(uniqueLabel)
73 |     # N_w is the total number of times w appears over all documents
74 |     # rowSum of count matrix (N_wAInit)
75 |     N_wInit = np.sum(N_wAInit, axis = 1)
76 |     # P(A|w) = (N_w|A + N_A * P(A))/(N_w + N_A)
77 |     N = N_wAInit + N_A * P_A
78 |     D = N_wInit + N_A
79 |     P_Aw = np.divide(N.T, D).T
80 | 
81 |     return P_Aw
82 | 
83 | # select the top N tokens w which maximize P(A|w) for each A.
84 | # get FeatureHash: {token: 1}
85 | def feature_selection(P_Aw, token2IdHashInit, topN):
86 |     featureHash = {}
87 |     # for each answer A, sort tokens w by P(A|w)
88 |     sortedIdxMatrix = np.argsort(P_Aw, axis=0)[::-1]
89 |     # select top N tokens for each answer A
90 |     topMatrix = sortedIdxMatrix[0:topN, :]
91 |     # for each token w in topMatrix, add w to FeatureHash if it has not already been included
92 |     topTokenIdList = np.reshape(topMatrix, topMatrix.shape[0] * topMatrix.shape[1])
93 |     # get ID to Token mapping: {tokenId: Token}
94 |     Id2TokenHashInit = {y:x for x, y in token2IdHashInit.items()}
95 | 
96 |     for tokenId in topTokenIdList:
97 |         token = Id2TokenHashInit[tokenId]
98 |         if token not in featureHash.keys():
99 |             featureHash[token] = 1
100 |     return featureHash
101 | 
102 | # calculate the weight for each feature.
103 | def featureWeights(N_wA, alpha):
104 |     # N_w is the total number of times w appears over all documents
105 |     # rowSum of count matrix (N_wA)
106 |     N_w = np.sum(N_wA, axis = 1)
107 |     # N_W is the total count of all words
108 |     N_W = np.sum(N_wA)
109 |     # N_V is the count of unique words in the vocabulary
110 |     N_V = N_wA.shape[0]
111 |     # P(w) = (N_w + 1*alpha) / (N_W + N_V*alpha)
112 |     N2 = N_w + 1 * alpha
113 |     D2 = N_W + alpha * N_V
114 |     P_w = N2/D2
115 | 
116 |     return P_w
117 | 
118 | # calculate the conditional probability of each token within an answer class P(w|A).
119 | def wordProbabilityInAnswer(N_wA, P_w, beta):
120 |     # N_V is the count of unique words in the vocabulary
121 |     N_V = N_wA.shape[0]
122 |     # N_WA is the total count of all words in questions on answer A
123 |     # colSum of count matrix (N_wA)
124 |     N_WA = np.sum(N_wA, axis=0)
125 |     # P(w|A) = (N_w|A + beta * N_V * P(w))/(N_W|A + beta * N_V)
126 |     N = (N_wA.T + beta * N_V * P_w).T
127 |     D = N_WA + beta * N_V
128 |     P_wA = N / D
129 | 
130 |     return P_wA
131 | 
132 | # calculate the conditional probability of each token not within an answer class P(w|notA).
133 | def wordProbabilityNotinAnswer(N_wA, P_w, beta):
134 |     # N_V is the count of unique words in the vocabulary
135 |     N_V = N_wA.shape[0]
136 |     # N_wNotA is the count of w over all documents but not on answer A
137 |     # N_wNotA = N_w - N_wA
138 |     N_w = np.sum(N_wA, axis = 1)
139 |     N_wNotA = (N_w - N_wA.T).T
140 |     # N_WNotA is the count of all words over all documents but not on answer A
141 |     # N_WNotA = N_W - N_WA
142 |     N_W = np.sum(N_wA)
143 |     N_WA = np.sum(N_wA, axis=0)
144 |     N_WNotA = N_W - N_WA
145 |     # P(w|NotA) = (N_w|NotA + beta * N_V * P(w))/(N_W|NotA + beta * N_V)
146 |     N = (N_wNotA.T + beta * N_V * P_w).T
147 |     D = N_WNotA + beta * N_V
148 |     P_wNotA = N / D
149 | 
150 |     return P_wNotA
151 | 
152 | # calculate the normalized Term Frequency.
153 | def normalizeTF(frame, token2IdHash):
154 | 
155 |     N_wQ = countMatrix(frame, token2IdHash)
156 |     N_WQ = np.sum(N_wQ, axis=0)
157 | 
158 |     # find the index where N_WQ is zero
159 |     zeroIdx = np.where(N_WQ == 0)[0]
160 | 
161 |     # if N_WQ is zero, then the x_w for that particular question would be zero.
162 |     # for a simple calculation, we convert the N_WQ to 1 in those cases so the denominator is not zero.
163 |     if len(zeroIdx) > 0:
164 |         N_WQ[zeroIdx] = 1
165 | 
166 |     # x_w = P_wd = count(w)/sum(count(i in V))
167 |     x_w = N_wQ / N_WQ
168 | 
169 |     return x_w
170 | 
171 | # calculate the Inverse Document Frequency.
172 | def getIDF(N_wQ):
173 |     # N is total number of documents in the corpus
174 |     # N_V is the number of tokens in the vocabulary
175 |     N_V, N = N_wQ.shape
176 |     # D is the number of documents where the token w appears
177 |     D = np.zeros(shape=(0, N_V))
178 |     for i in range(N_V):
179 |         D = np.append(D, len(np.nonzero(N_wQ[i, ])[0]))
180 |     return np.log(N/D)
181 | 
182 | # create a softmax function.
183 | def softmax(scores2D):
184 |     # input: scores from different models
185 |     # row: test example
186 |     # column: label
187 |     return np.exp(scores2D)/np.sum(np.exp(scores2D), axis=1)[:, None]
--------------------------------------------------------------------------------
/modules/phrase_learning.py:
--------------------------------------------------------------------------------
1 | ##############################################
2 | # User Defined Functions for Phrase Learning
3 | ##############################################
4 | 
5 | import pandas as pd
6 | import numpy as np
7 | import re, nltk, time, gc, math
8 | from azureml.logging import get_azureml_logger
9 | 
10 | run_logger = get_azureml_logger()
11 | run_logger.log('amlrealworld.QnA-matching.phrase-learning','true')
12 | 
13 | 
14 | def CleanAndSplitText(frame):
15 | 
16 |     global EMPTY, SPACE, NLTK_PUNKT_EN, SENTENCE_BREAKER
17 |     EMPTY = ''
18 |     SPACE = ' '
19 |     nltk.download("punkt")
20 |     NLTK_PUNKT_EN = 'tokenizers/punkt/english.pickle'
21 |     SENTENCE_BREAKER = nltk.data.load(NLTK_PUNKT_EN)
22 | 
23 |     textDataOut = []
24 | 
25 |     # This regular expression is for punctuation that we wish to clean out
26 |     # We also will split sentences into smaller phrase-like units using this expression
27 |     rePhraseBreaks = re.compile("[\"\!\?\)\]\}\,\:\;\*\-]*\s+\([0-9]+\)\s+[\(\[\{\"\*\-]*"
28 |                                 "|[\"\!\?\)\]\}\,\:\;\*\-]+\s+[\(\[\{\"\*\-]*"
29 |                                 "|\.\.+"  # ..
30 | "|\s*\-\-+\s*" # -- 31 | "|\s+\-\s+" # - 32 | "|\:\:+" # :: 33 | "|\s+[\/\(\[\{\"\-\*]+\s*" 34 | "|[\,!\?\"\)\(\]\[\}\{\:\;\*](?=[a-zA-Z])" 35 | "|[\"\!\?\)\]\}\,\:\;]+[\.]*$" 36 | ) 37 | 38 | # Regex for underbars 39 | regexUnderbar = re.compile('_|_+') 40 | 41 | # Regex for space 42 | regexSpace = re.compile(' +') 43 | 44 | # Regex for sentence final period 45 | regexPeriod = re.compile("\.$") 46 | 47 | # Regex for parentheses 48 | regexParentheses = re.compile("\(\$?") 49 | 50 | # Regex for equal sign 51 | regexEqual = re.compile("=") 52 | 53 | # Iterate through each document and do: 54 | # (1) Split documents into sections based on section headers and remove section headers 55 | # (2) Split the sections into sentences using NLTK sentence tokenizer 56 | # (3) Further split sentences into phrasal units based on punctuation and remove punctuation 57 | # (4) Remove sentence final periods when not part of a abbreviation 58 | 59 | for i in range(0,len(frame)): 60 | 61 | # Extract one document from frame 62 | docID = frame.index.values[i] 63 | docText = frame['Text'].iloc[i] 64 | 65 | # Set counter for output line count for this document 66 | lineIndex=0 67 | 68 | sentences = SENTENCE_BREAKER.tokenize(docText) 69 | 70 | for sentence in sentences: 71 | 72 | # Split each sentence into phrase level chunks based on punctuation 73 | textSegs = rePhraseBreaks.split(sentence) 74 | numSegs = len(textSegs) 75 | 76 | for j in range(0,numSegs): 77 | if len(textSegs[j])>0: 78 | # Convert underbars to spaces 79 | # Underbars are reserved for building the compound word phrases 80 | textSegs[j] = regexUnderbar.sub(" ",textSegs[j]) 81 | 82 | # Split out the words so we can specially handle the last word 83 | words = regexSpace.split(textSegs[j]) 84 | 85 | # Remove parentheses and equal signs 86 | words = [regexEqual.sub("", regexParentheses.sub("", w)) for w in words] 87 | 88 | phraseOut = "" 89 | last = len(words) -1 90 | for i in range(0, last): 91 | phraseOut += words[i] + " " 92 | # If the last word ends in a period then remove the period 93 | lastWord = regexPeriod.sub("", words[last]) 94 | # If the last word is an abbreviation like "U.S." 95 | # then add the word final perios back on 96 | if "\." in lastWord: 97 | lastWord += "." 98 | phraseOut += lastWord 99 | 100 | textDataOut.append([docID,lineIndex,phraseOut, phraseOut.lower()]) 101 | lineIndex += 1 102 | 103 | # Convert to pandas frame 104 | frameOut = pd.DataFrame(textDataOut, columns=['DocID','DocLine','CleanedText', 'LowercaseText']) 105 | 106 | return frameOut 107 | 108 | # count the number of occurances of all 2-gram, 3-ngram, and 4-gram word sequences. 109 | def ComputeNgramStats(textData,functionwordHash,blacklistHash): 110 | 111 | # Create an array to store the total count of all ngrams up to 4-grams 112 | # Array element 0 is unused, element 1 is unigrams, element 2 is bigrams, etc. 113 | ngramCounts = [0]*5; 114 | 115 | # Create a list of structures to tabulate ngram count statistics 116 | # Array element 0 is the array of total ngram counts, 117 | # Array element 1 is a hash table of individual unigram counts 118 | # Array element 2 is a hash table of individual bigram counts 119 | # Array element 3 is a hash table of individual trigram counts 120 | # Array element 4 is a hash table of individual 4-gram counts 121 | ngramStats = [ngramCounts, {}, {}, {}, {}] 122 | 123 | # Create a regular expression for assessing validity of words 124 | # for phrase modeling. 
The expression says words in phrases 125 | # must either: 126 | # (1) contain an alphabetic character, or 127 | # (2) be the single charcater '&', or 128 | # (3) be a one or two digit number 129 | reWordIsValid = re.compile('[A-Za-z]|^&$|^\d\d?$') 130 | 131 | # Go through the text data line by line collecting count statistics 132 | # for all valid n-grams that could appear in a potential phrase 133 | numLines = len(textData) 134 | for i in range(0, numLines): 135 | 136 | # Split the text line into an array of words 137 | wordArray = textData[i].split() 138 | numWords = len(wordArray) 139 | 140 | # Create an array marking each word as valid or invalid 141 | validArray = []; 142 | for word in wordArray: 143 | validArray.append(reWordIsValid.match(word) != None) 144 | 145 | # Tabulate total raw ngrams for this line into counts for each ngram bin 146 | # The total ngrams counts include the counts of all ngrams including those 147 | # that we won't consider as parts of phrases 148 | for j in range(1,5): 149 | if j<=numWords: 150 | ngramCounts[j] += numWords - j + 1 151 | 152 | # Collect counts for viable phrase ngrams and left context sub-phrases 153 | for j in range(0,numWords): 154 | word = wordArray[j] 155 | 156 | # Only bother counting the ngrams that start with a valid content word 157 | # i.e., valids words not in the function word list or the black list 158 | if ( ( word not in functionwordHash ) and ( word not in blacklistHash ) and validArray[j] ): 159 | 160 | # Initialize ngram string with first content word and add it to unigram counts 161 | ngramSeq = word 162 | if ngramSeq in ngramStats[1]: 163 | ngramStats[1][ngramSeq] += 1 164 | else: 165 | ngramStats[1][ngramSeq] = 1 166 | 167 | # Count valid ngrams from bigrams up to 4-grams 168 | stop = 0 169 | k = 1 170 | while (k<4) and (j+k= minCount: 202 | wordArray = ngram.split() 203 | # If the final word in the ngram is not a function word then 204 | # the ngram is a valid phrase candidate we want to score 205 | if wordArray[i] not in functionwordHash: 206 | leftNgram = wordArray[0] 207 | for j in range(1,i): 208 | leftNgram += ' ' + wordArray[j] 209 | rightWord = wordArray[i] 210 | 211 | # Compute the weighted pointwise mutual information (WPMI) for the phrase 212 | probNgram = float(ngramStats[n][ngram])/float(ngramStats[0][n]) 213 | probLeftNgram = float(ngramStats[n-1][leftNgram])/float(ngramStats[0][n-1]) 214 | probRightWord = float(ngramStats[1][rightWord])/float(ngramStats[0][1]) 215 | WPMI = probNgram * math.log(probNgram/(probLeftNgram*probRightWord)); 216 | 217 | # Add the phrase into the list of scored phrases only if WMPI is positive 218 | if WPMI > 0: 219 | ngramWPMIHash[ngram] = WPMI 220 | 221 | # Create a sorted list of the phrase candidates 222 | rankedNgrams = sorted(ngramWPMIHash, key=ngramWPMIHash.__getitem__, reverse=True) 223 | 224 | # Force a memory clean-up 225 | ngramWPMIHash = None 226 | gc.collect() 227 | 228 | return rankedNgrams 229 | 230 | # apply the phrase rewrites to training data. 
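# Illustrative sketch (hypothetical text, not taken from the sample data): when a
# ranked n-gram such as "azure machine learning" is accepted, the rewrite turns it
# into a single compound token by joining its words with underscores, e.g.
#   before: " how do i use azure machine learning workbench "
#   after:  " how do i use azure_machine_learning workbench "
# Candidate phrases whose left/right context words overlap with a phrase already
# accepted in the same iteration are skipped and reconsidered on the next pass.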
231 | def ApplyPhraseRewrites(rankedNgrams,textData,learnedPhrases, 232 | maxPhrasesToAdd,maxPhraseLength,verbose): 233 | 234 | if len(rankedNgrams) == 0: 235 | return 236 | 237 | # This function will consider at most maxRewrite 238 | # new phrases to be added into the learned phrase 239 | # list as specified by the calling fuinction 240 | maxRewrite=maxPhrasesToAdd 241 | 242 | # If the remaining number of proposed ngram phrases is less 243 | # than the max allowed, then reset maxRewrite to the size of 244 | # the proposed ngram phrases list 245 | numNgrams = len(rankedNgrams) 246 | if numNgrams < maxRewrite: 247 | maxRewrite = numNgrams 248 | 249 | # Create empty hash tables to keep track of phrase overlap conflicts 250 | leftConflictHash = {} 251 | rightConflictHash = {} 252 | 253 | # Create an empty hash table collecting the set of rewrite rules 254 | # to be applied during this iteration of phrase learning 255 | ngramRewriteHash = {} 256 | 257 | # Precompile the regex for finding spaces in ngram phrases 258 | regexSpace = re.compile(' ') 259 | 260 | # Initialize some bookkeeping variables 261 | numLines = len(textData) 262 | numPhrasesAdded = 0 263 | numConsidered = 0 264 | lastSkippedNgram = "" 265 | lastAddedNgram = "" 266 | 267 | # Collect list up to maxRewrite ngram phrase rewrites 268 | stop = False 269 | index = 0 270 | while not stop: 271 | 272 | # Get the next phrase to consider adding to the phrase list 273 | inputNgram = rankedNgrams[index] 274 | 275 | # Create the output compound word version of the phrase 276 | # The extra space is added to make the regex rewrite easier 277 | outputNgram = " " + regexSpace.sub("_",inputNgram) 278 | 279 | # Count the total number of words in the proposed phrase 280 | numWords = len(outputNgram.split("_")) 281 | 282 | # Only add phrases that don't exceed the max phrase length 283 | if (numWords <= maxPhraseLength): 284 | 285 | # Keep count of phrases considered for inclusion during this iteration 286 | numConsidered += 1 287 | 288 | # Extract the left and right words in the phrase to use 289 | # in checks for phrase overlap conflicts 290 | ngramArray = inputNgram.split() 291 | leftWord = ngramArray[0] 292 | rightWord = ngramArray[len(ngramArray)-1] 293 | 294 | # Skip any ngram phrases that conflict with earlier phrases added 295 | # These ngram phrases will be reconsidered in the next iteration 296 | if (leftWord in leftConflictHash) or (rightWord in rightConflictHash): 297 | if verbose: 298 | print ("(%d) Skipping (context conflict): %s" % (numConsidered,inputNgram)) 299 | lastSkippedNgram = inputNgram 300 | 301 | # If no conflict exists then add this phrase into the list of phrase rewrites 302 | else: 303 | if verbose: 304 | print ("(%d) Adding: %s" % (numConsidered,inputNgram)) 305 | ngramRewriteHash[" " + inputNgram] = outputNgram 306 | learnedPhrases.append(inputNgram) 307 | lastAddedNgram = inputNgram 308 | numPhrasesAdded += 1 309 | 310 | # Keep track of all context words that might conflict with upcoming 311 | # propose phrases (even when phrases are skipped instead of added) 312 | leftConflictHash[rightWord] = 1 313 | rightConflictHash[leftWord] = 1 314 | 315 | # Stop when we've considered the maximum number of phrases per iteration 316 | if ( numConsidered >= maxRewrite ): 317 | stop = True 318 | 319 | # Increment to next phrase 320 | index += 1 321 | 322 | # Stop if we've reached the end of the ranked ngram list 323 | if index >= len(rankedNgrams): 324 | stop = True 325 | 326 | # Now do the phrase rewrites over the entire set of 
text data 327 | if numPhrasesAdded == 1: 328 | # If only one phrase to add use a single regex rule to do this phrase rewrite 329 | inputNgram = " " + lastAddedNgram 330 | outputNgram = ngramRewriteHash[inputNgram] 331 | regexNgram = re.compile (r'%s(?= )' % re.escape(inputNgram)) 332 | # Apply the regex over the full data set 333 | for j in range(0,numLines): 334 | textData[j] = regexNgram.sub(outputNgram, textData[j]) 335 | elif numPhrasesAdded > 1: 336 | # Compile a single regex rule from the collected set of phrase rewrites for this iteration 337 | ngramRegex = re.compile(r'%s(?= )' % "|".join(map(re.escape, ngramRewriteHash.keys()))) 338 | # Apply the regex over the full data set 339 | for i in range(0,len(textData)): 340 | # The regex substituion looks up the output string rewrite 341 | # in the hash table for each matched input phrase regex 342 | textData[i] = ngramRegex.sub(lambda mo: ngramRewriteHash[mo.string[mo.start():mo.end()]], textData[i]) 343 | 344 | return 345 | 346 | # run the full iterative phrase learning process. 347 | def ApplyPhraseLearning(textData, learnedPhrases, maxNumPhrases=200, maxPhraseLength=7, maxPhrasesPerIter=50, 348 | minCount=5, functionwordHash={}, blacklistHash={}, verbose=False): 349 | 350 | stop = 0 351 | iterNum = 0 352 | 353 | # Start timing the process 354 | functionStartTime = time.clock() 355 | 356 | numPhrasesLearned = len(learnedPhrases) 357 | print ("Start phrase learning with %d phrases of %d phrases learned" % (numPhrasesLearned,maxNumPhrases)) 358 | 359 | while not stop: 360 | iterNum += 1 361 | 362 | # Start timing this iteration 363 | startTime = time.clock() 364 | 365 | # Collect ngram stats 366 | ngramStats = ComputeNgramStats(textData,functionwordHash,blacklistHash) 367 | 368 | # Rank ngrams 369 | rankedNgrams = RankNgrams(ngramStats,functionwordHash,minCount) 370 | 371 | # Incorporate top ranked phrases into phrase list 372 | # and rewrite the text to use these phrases 373 | maxPhrasesToAdd = maxNumPhrases - numPhrasesLearned 374 | if maxPhrasesToAdd > maxPhrasesPerIter: 375 | maxPhrasesToAdd = maxPhrasesPerIter 376 | ApplyPhraseRewrites(rankedNgrams,textData,learnedPhrases,maxPhrasesToAdd,maxPhraseLength,verbose) 377 | numPhrasesAdded = len(learnedPhrases) - numPhrasesLearned 378 | 379 | # Garbage collect 380 | ngramStats = None 381 | rankedNgrams = None 382 | gc.collect(); 383 | 384 | elapsedTime = time.clock() - startTime 385 | 386 | numPhrasesLearned = len(learnedPhrases) 387 | print ("Iteration %d: Added %d new phrases in %.2f seconds (Learned %d of max %d)" % 388 | (iterNum,numPhrasesAdded,elapsedTime,numPhrasesLearned,maxNumPhrases)) 389 | 390 | if numPhrasesAdded >= maxPhrasesToAdd or numPhrasesAdded == 0: 391 | stop = 1 392 | 393 | # Remove the space padding at the start and end of each line 394 | regexSpacePadding = re.compile('^ +| +$') 395 | for i in range(0,len(textData)): 396 | textData[i] = regexSpacePadding.sub("",textData[i]) 397 | 398 | gc.collect() 399 | 400 | elapsedTime = time.clock() - functionStartTime 401 | elapsedTimeHours = elapsedTime/3600.0; 402 | print ("*** Phrase learning completed in %.2f hours ***" % elapsedTimeHours) 403 | 404 | return 405 | 406 | # apply the learned phrases to test data. 
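# ---------------------------------------------------------------------------
# Illustrative sketch only (added for clarity; not called by this module or
# the notebooks): a minimal end-to-end run on a few invented, lower-cased,
# space-padded lines. ApplyPhraseLearning rewrites the training list in place
# and fills learnedPhrases; ApplyPhraseRewritesInPlace (defined below) then
# applies those phrases to held-out text. With an empty function-word hash
# every word is a phrase candidate, so this demonstrates the calling
# convention rather than the quality of the learned phrases.
def _example_learn_and_apply():
    import pandas as pd   # local import keeps the sketch self-contained

    trainingLines = [" stack overflow is a question and answer site ",
                     " you can ask a question on stack overflow ",
                     " stack overflow users answer the question "]
    learnedPhrases = []
    ApplyPhraseLearning(trainingLines, learnedPhrases,
                        maxNumPhrases=10, maxPhrasesPerIter=5, minCount=2)

    # If a phrase like "stack overflow" was learned, it is rewritten to the
    # compound token "stack_overflow" in the held-out text as well.
    testFrame = pd.DataFrame({'LowercaseText': ["ask on stack overflow"]})
    rewrittenTest = ApplyPhraseRewritesInPlace(testFrame, 'LowercaseText', learnedPhrases)
    return learnedPhrases, trainingLines, rewrittenTest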
407 | def ApplyPhraseRewritesInPlace(textFrame, textColumnName, phraseRules): 408 | 409 | # Get text data column from frame 410 | textData = textFrame[textColumnName] 411 | numLines = len(textData) 412 | 413 | # Initialize a list to store the output text 414 | textOutput = [None] * numLines 415 | 416 | # Add leading and trailing spaces to make regex matching easier 417 | for i in range(0,numLines): 418 | textOutput[i] = " " + textData[i] + " " 419 | 420 | # Make sure we have phrases to add 421 | numPhraseRules = len(phraseRules) 422 | if numPhraseRules == 0: 423 | print ("Warning: phrase rule list is empty - no phrases being applied to text data") 424 | return 425 | 426 | # Precompile the regex for finding spaces in ngram phrases 427 | regexSpace = re.compile(' ') 428 | 429 | # Initialize some bookkeeping variables 430 | 431 | # Iterate through full set of phrases to find sets of 432 | # non-conflicting phrases that can be applied simultaneously 433 | index = 0 434 | outerStop = False 435 | while not outerStop: 436 | 437 | # Create empty hash tables to keep track of phrase overlap conflicts 438 | leftConflictHash = {} 439 | rightConflictHash = {} 440 | prevConflictHash = {} 441 | 442 | # Create an empty hash table collecting the next set of rewrite rules 443 | # to be applied during this iteration of phrase rewriting 444 | phraseRewriteHash = {} 445 | 446 | # Progress through phrases until the next conflicting phrase is found 447 | innerStop = 0 448 | numPhrasesAdded = 0 449 | while not innerStop: 450 | 451 | # Get the next phrase to consider adding to the phrase list 452 | nextPhrase = phraseRules[index] 453 | 454 | # Extract the left and right sides of the phrase to use 455 | # in checks for phrase overlap conflicts 456 | ngramArray = nextPhrase.split() 457 | leftWord = ngramArray[0] 458 | rightWord = ngramArray[len(ngramArray)-1] 459 | 460 | # Stop if we reach any phrase that conflicts with earlier phrases in this iteration 461 | # These ngram phrases will be reconsidered in the next iteration 462 | if ((leftWord in leftConflictHash) or (rightWord in rightConflictHash) 463 | or (leftWord in prevConflictHash) or (rightWord in prevConflictHash)): 464 | innerStop = True 465 | 466 | # If no conflict exists then add this phrase into the list of phrase rewrites 467 | else: 468 | # Create the output compound word version of the phrase 469 | 470 | outputPhrase = regexSpace.sub("_",nextPhrase); 471 | 472 | # Keep track of all context words that might conflict with upcoming 473 | # proposed phrases (even when phrases are skipped instead of added) 474 | leftConflictHash[rightWord] = 1 475 | rightConflictHash[leftWord] = 1 476 | prevConflictHash[outputPhrase] = 1 477 | 478 | # Add extra space to input and output versions of the current phrase 479 | # to make the regex rewrite easier 480 | outputPhrase = " " + outputPhrase 481 | lastAddedPhrase = " " + nextPhrase 482 | 483 | # Add the phrase to the rewrite hash 484 | phraseRewriteHash[lastAddedPhrase] = outputPhrase 485 | 486 | # Increment to next phrase 487 | index += 1 488 | numPhrasesAdded += 1 489 | 490 | # Stop if we've reached the end of the phrases list 491 | if index >= numPhraseRules: 492 | innerStop = True 493 | outerStop = True 494 | 495 | # Now do the phrase rewrites over the entire set of text data 496 | if numPhrasesAdded == 1: 497 | 498 | # If only one phrase to add use a single regex rule to do this phrase rewrite 499 | outputPhrase = phraseRewriteHash[lastAddedPhrase] 500 | regexPhrase = re.compile (r'%s(?= )' % re.escape(lastAddedPhrase)) 501
| 502 | # Apply the regex over the full data set 503 | for j in range(0,numLines): 504 | textOutput[j] = regexPhrase.sub(outputPhrase, textOutput[j]) 505 | 506 | elif numPhrasesAdded > 1: 507 | # Compile a single regex rule from the collected set of phrase rewrites for this iteration 508 | regexPhrase = re.compile(r'%s(?= )' % "|".join(map(re.escape, phraseRewriteHash.keys()))) 509 | 510 | # Apply the regex over the full data set 511 | for i in range(0,numLines): 512 | # The regex substitution looks up the output string rewrite 513 | # in the hash table for each matched input phrase regex 514 | textOutput[i] = regexPhrase.sub(lambda mo: phraseRewriteHash[mo.string[mo.start():mo.end()]], textOutput[i]) 515 | 516 | # Remove the space padding at the start and end of each line 517 | regexSpacePadding = re.compile('^ +| +$') 518 | for i in range(0,len(textOutput)): 519 | textOutput[i] = regexSpacePadding.sub("",textOutput[i]) 520 | 521 | return textOutput 522 | 523 | # reconstruct the full processed text and put it back into a new data frame. 524 | def ReconstituteDocsFromChunks(textData, idColumnName, textColumnName): 525 | dataOut = [] 526 | 527 | currentDoc = ""; 528 | currentDocID = ""; 529 | 530 | for i in range(0,len(textData)): 531 | textChunk = textData[textColumnName][i] 532 | docID = textData[idColumnName][i] 533 | if docID != currentDocID: 534 | if currentDocID != "": 535 | dataOut.append(currentDoc) 536 | currentDoc = textChunk 537 | currentDocID = docID 538 | else: 539 | currentDoc += " " + textChunk 540 | dataOut.append(currentDoc) 541 | 542 | return dataOut 543 | 544 | # create the Vocabulary with some filtering criteria. 545 | def CreateVocabForTopicModeling(textData,stopwordHash): 546 | 547 | print ("Counting words") 548 | numDocs = len(textData) 549 | globalWordCountHash = {} 550 | globalDocCountHash = {} 551 | for textLine in textData: 552 | docWordCountHash = {} 553 | for word in textLine.split(): 554 | if word in globalWordCountHash: 555 | globalWordCountHash[word] += 1 556 | else: 557 | globalWordCountHash[word] = 1 558 | if word not in docWordCountHash: 559 | docWordCountHash[word] = 1 560 | if word in globalDocCountHash: 561 | globalDocCountHash[word] += 1 562 | else: 563 | globalDocCountHash[word] = 1 564 | 565 | minWordCount = 5; 566 | minDocCount = 2; 567 | maxDocFreq = .25; 568 | vocabCount = 0; 569 | vocabHash = {} 570 | 571 | excStopword = 0 572 | excNonalphabetic = 0 573 | excMinwordcount = 0 574 | excNotindochash = 0 575 | excMindoccount = 0 576 | excMaxdocfreq = 0 577 | 578 | print ("Building vocab") 579 | for word in globalWordCountHash.keys(): 580 | # Test vocabulary exclusion criteria for each word 581 | if ( word in stopwordHash ): 582 | excStopword += 1 583 | elif ( not re.search(r'[a-zA-Z]', word, 0) ): 584 | excNonalphabetic += 1 585 | elif ( globalWordCountHash[word] < minWordCount ): 586 | excMinwordcount += 1 587 | elif ( word not in globalDocCountHash ): 588 | print ("Warning: Word '%s' not in doc count hash" % (word)) 589 | excNotindochash += 1 590 | elif ( globalDocCountHash[word] < minDocCount ): 591 | excMindoccount += 1 592 | elif ( float(globalDocCountHash[word])/float(numDocs) > maxDocFreq ): 593 | excMaxdocfreq += 1 594 | else: 595 | # Add word to vocab 596 | vocabHash[word]= globalWordCountHash[word]; 597 | vocabCount += 1 598 | print ("Excluded %d stop words" % (excStopword)) 599 | print ("Excluded %d non-alphabetic words" % (excNonalphabetic)) 600 | print ("Excluded %d words below word count threshold" % (excMinwordcount)) 601 | print
("Excluded %d words below doc count threshold" % (excMindoccount)) 602 | print ("Excluded %d words above max doc frequency" % (excMaxdocfreq)) 603 | print ("Final Vocab Size: %d words" % vocabCount) 604 | 605 | return vocabHash -------------------------------------------------------------------------------- /notebooks/Part_1_Data_Preparation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 1: Data Preparation\n", 8 | "\n", 9 | "Please make sure you have __notebook__ and __nltk__ Python packages installed in the compute context you choose as kernel. For demonstration purpose, this series of notebooks uses the `local` compute context.\n", 10 | "\n", 11 | "**NOTE**: Python 3 kernel doesn't include Azure Machine Learning Workbench functionalities. Please switch the kernel to `local` before continuing further. \n", 12 | "\n", 13 | "To install __notebook__ and __nltk__, please uncomment and run the following script." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# !pip install --upgrade notebook\n", 25 | "# !pip install --upgrade nltk" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Import Required Python Modules" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "import pandas as pd\n", 44 | "import numpy as np\n", 45 | "import re, os, gzip, requests, warnings\n", 46 | "from azureml.logging import get_azureml_logger\n", 47 | "warnings.filterwarnings(\"ignore\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "run_logger = get_azureml_logger()\n", 59 | "run_logger.log('amlrealworld.QnA-matching.part1-data-preparation','true')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Access Sample Data\n", 67 | "\n", 68 | "In this example, we have collected a set of Q&A pairs from Stack Overflow site tagged as `JavaScript` questions. The data contains 1,201 original Q&A pairs as well as many duplicate questions, i.e. new questions that Stack Overflow users have linked back to pre-existing Q&A pairs that effectively provide answers to these new questions. 
The data schema of the original questions (Q), duplicate questions (D), and answers (A) can be found in the following table:\n", 69 | "\n", 70 | "| Dataset | Field | Type | Description\n", 71 | "| ----------|------------|------------|--------\n", 72 | "| question (Q) | Id | String | The unique question ID (primary key)\n", 73 | "| | AnswerId | String | The unique answer ID per question\n", 74 | "| | Text0 | String | The raw text data including the question's title and body\n", 75 | "| | CreationDate | Timestamp | The timestamp of when the question has been asked\n", 76 | "| dupes (D) | Id | String | The unique duplication ID (primary key)\n", 77 | "| | AnswerId | String | The answer ID associated with the duplication\n", 78 | "| | Text0 | String | The raw text data including the duplication's title and body\n", 79 | "| | CreationDate | Timestamp | The timestamp of when the duplication has been asked\n", 80 | "| answers (A) | Id | String | The unique answer ID (primary key)\n", 81 | "| | text0 | String | The raw text data of the answer\n", 82 | "\n", 83 | "The datasets are compressed and stored in Azure Blob storage as `.tsv.gz` files and this section provides you the code to retreive the data in the notebook." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "# load raw data from a .tsv.gz file into Pandas data frame.\n", 95 | "def read_csv_gz(url, **kwargs):\n", 96 | " df = pd.read_csv(gzip.open(requests.get(url, stream=True).raw, mode='rb'), sep='\\t', encoding='utf8', **kwargs)\n", 97 | " return df.set_index('Id')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "# URLs to Original questions, Duplications, and Answers.\n", 109 | "questions_url = 'https://bostondata.blob.core.windows.net/stackoverflow/orig-q.tsv.gz'\n", 110 | "dupes_url = 'https://bostondata.blob.core.windows.net/stackoverflow/dup-q.tsv.gz'\n", 111 | "answers_url = 'https://bostondata.blob.core.windows.net/stackoverflow/ans.tsv.gz'\n", 112 | "\n", 113 | "# load datasets.\n", 114 | "questions = read_csv_gz(questions_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n", 115 | "dupes = read_csv_gz(dupes_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n", 116 | "answers = read_csv_gz(answers_url, names=('Id', 'Text0'))" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "To provide some example, here are the first five rows of the __questions__ table:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 5, 129 | "metadata": { 130 | "collapsed": false, 131 | "scrolled": true 132 | }, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/html": [ 137 | "
\n", 138 | "\n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
AnswerIdText0CreationDate
Id
220231220233Accessing the web page's HTTP Headers in JavaS...2008-10-20 22:54:38.767
391979810461Get client IP using just JavaScript?. <p>I nee...2008-12-24 18:22:30.780
109086109091Stop setInterval call in JavaScript. <p>I am u...2008-09-20 19:29:55.377
4615546181Validate email address in JavaScript?. <p>How ...2008-09-05 16:10:11.093
121499121708When onblur occurs, how can I find out which e...2008-09-23 14:48:43.483
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | " AnswerId Text0 \\\n", 190 | "Id \n", 191 | "220231 220233 Accessing the web page's HTTP Headers in JavaS... \n", 192 | "391979 810461 Get client IP using just JavaScript?.

I nee... \n", 193 | "109086 109091 Stop setInterval call in JavaScript.

I am u... \n", 194 | "46155 46181 Validate email address in JavaScript?.

How ... \n", 195 | "121499 121708 When onblur occurs, how can I find out which e... \n", 196 | "\n", 197 | " CreationDate \n", 198 | "Id \n", 199 | "220231 2008-10-20 22:54:38.767 \n", 200 | "391979 2008-12-24 18:22:30.780 \n", 201 | "109086 2008-09-20 19:29:55.377 \n", 202 | "46155 2008-09-05 16:10:11.093 \n", 203 | "121499 2008-09-23 14:48:43.483 " 204 | ] 205 | }, 206 | "execution_count": 5, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "questions.head(5)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Here is the full text of one __original__ question, whose is `Id` is `220231`. The `AnswerId` associated with this question is `220233`." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 6, 225 | "metadata": { 226 | "collapsed": false, 227 | "scrolled": true 228 | }, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "Accessing the web page's HTTP Headers in JavaScript.

How do I access a page's HTTP response headers via JavaScript?

Related to this question, which was modified to ask about accessing two specific HTTP headers.

Related:
How do I access the HTTP request header fields via JavaScript?

\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "# This text include the HTML code.\n", 240 | "print(questions[\"Text0\"][220231])" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "Here is the full text of the __answer__ associated with the above original question:" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 7, 253 | "metadata": { 254 | "collapsed": false, 255 | "scrolled": true 256 | }, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "

Unfortunately, there isn't an API to give you the HTTP response headers for your initial page request. That was the original question posted here. It has been repeatedly asked, too, because some people would like to get the actual response headers of the original page request without issuing another one.


For AJAX Requests:

If an HTTP request is made over AJAX, it is possible to get the response headers with the getAllResponseHeaders() method. It's part of the XMLHttpRequest API. To see how this can be applied, check out the fetchSimilarHeaders() function below. Note that this is a work-around to the problem that won't be reliable for some applications.

myXMLHttpRequest.getAllResponseHeaders(); 

This will not give you information about the original page request's HTTP response headers, but it could be used to make educated guesses about what those headers were. More on that is described next.


Getting header values from the Initial Page Request:

This question was first asked several years ago, asking specifically about how to get at the original HTTP response headers for the current page (i.e. the same page inside of which the javascript was running). This is quite a different question than simply getting the response headers for any HTTP request. For the initial page request, the headers aren't readily available to javascript. Whether the header values you need will be reliably and sufficiently consistent if you request the same page again via AJAX will depend on your particular application.

The following are a few suggestions for getting around that problem.


1. Requests on Resources which are largely static

If the response is largely static and the headers are not expected to change much between requests, you could make an AJAX request for the same page you're currently on and assume that they're they are the same values which were part of the page's HTTP response. This could allow you to access the headers you need using the nice XMLHttpRequest API described above.

function fetchSimilarHeaders (callback) { var request = new XMLHttpRequest(); request.onreadystatechange = function () { if (request.readyState === 4) { // // The following headers may often be similar // to those of the original page request... // if (callback && typeof callback === 'function') { callback(request.getAllResponseHeaders()); } } }; // // Re-request the same page (document.location) // We hope to get the same or similar response headers to those which // came with the current page, but we have no guarantee. // Since we are only after the headers, a HEAD request may be sufficient. // request.open('HEAD', document.location, true); request.send(null); } 

This approach will be problematic if you truly have to rely on the values being consistent between requests, since you can't fully guarantee that they are the same. It's going to depend on your specific application and\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "print(answers[\"Text0\"][220233])" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "__Duplicate__ questions share the same `AnswerId` as the original question they link to. Here is the first duplicate question linked to the above original question:" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 8, 280 | "metadata": { 281 | "collapsed": false, 282 | "scrolled": true 283 | }, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "Monitoring http request header on a page.

Possible Duplicates:
Accessing HTTP Headers in Javascript?
How do I access the HTTP request header fields via JavaScript?

We can use httpwatch on IE or httpfox on Firefox to monitor http activity

If i don't want to use any plugs on browser...

Is it possible to monitor http request header on a page just by javascript?

\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "print(dupes.query(\"AnswerId == 220233\").iloc[0][\"Text0\"])" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "## Pre-process Text Data\n", 302 | "\n", 303 | "### Clean up text\n", 304 | "\n", 305 | "The raw data is in `HTML` format and needs to be cleaned up for any further analysis. We exclude HTML tags, links and code snippets from the data." 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 9, 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "# remove embedded code chunks, HTML tags and links/URLs.\n", 317 | "def clean_text(text):\n", 318 | " global EMPTY\n", 319 | " EMPTY = ''\n", 320 | " \n", 321 | " if not isinstance(text, str): \n", 322 | " return text\n", 323 | " text = re.sub('
.*?
', EMPTY, text)\n", 324 | "\n", 325 | " def replace_link(match):\n", 326 | " return EMPTY if re.match('[a-z]+://', match.group(1)) else match.group(1)\n", 327 | " \n", 328 | " text = re.sub(']+>(.*)', replace_link, text)\n", 329 | " return re.sub('<[^>]+>', EMPTY, text)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 10, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "for df in (questions, dupes, answers):\n", 341 | " df['Text'] = df['Text0'].apply(clean_text).str.lower()\n", 342 | " df['NumChars'] = df['Text'].str.len()" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "### Set data selection criteria\n", 350 | "\n", 351 | "To obtain the high quality datasets for phrase learning and model training, we requires a minimum length of characters in the text field. Different thresholds are considered for original questions, duplications, and answers, respectively. Also, each Q&A pair in our set must have a minimum of 3 additional semantically equivalent duplicate questions linked to it. " 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 11, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "# find the AnswerIds has at least 3 dupes.\n", 363 | "def find_answerId(answersC, dupesC, num_dupes):\n", 364 | " \n", 365 | " countHash = {}\n", 366 | " for i in dupesC.AnswerId:\n", 367 | " if i not in answersC.index.values:\n", 368 | " continue\n", 369 | " if i not in countHash.keys():\n", 370 | " countHash[i] = 1\n", 371 | " else:\n", 372 | " countHash[i] += 1\n", 373 | " \n", 374 | " countHash = {k: v for k, v in countHash.items() if v >= num_dupes}\n", 375 | " commonAnswerId = countHash.keys()\n", 376 | " \n", 377 | " return commonAnswerId\n", 378 | "\n", 379 | "# extract data based on the selection criteria.\n", 380 | "def select_data(questions, dupes, answers):\n", 381 | " # exclude the records without any text\n", 382 | " questions_nz = questions.query('NumChars > 0')\n", 383 | " dupes_nz = dupes.query('NumChars > 0')\n", 384 | " answers_nz = answers.query('NumChars > 0')\n", 385 | "\n", 386 | " # get the 10th percentile of text length as the minimum length of characters to consider in the text field\n", 387 | " minLenQ = questions_nz.quantile(.1)['NumChars']\n", 388 | " minLenD = dupes_nz.quantile(.1)['NumChars']\n", 389 | " minLenA = answers_nz.quantile(.1)['NumChars']\n", 390 | " \n", 391 | " # eliminate records with text less than the minimum length\n", 392 | " questionsC = questions.query('NumChars >' + str(int(minLenQ)))\n", 393 | " dupesC = dupes.query('NumChars >' + str(minLenD))\n", 394 | " answersC = answers.query('NumChars >' + str(minLenA))\n", 395 | " \n", 396 | " # remove the records in dupesC whose questionId has already existed in questionsC\n", 397 | " duplicatedIndex = list(set(questionsC.index).intersection(set(dupesC.index)))\n", 398 | " dupesC.drop(duplicatedIndex, inplace=True)\n", 399 | " \n", 400 | " # make sure Questions 1:1 match with Answers \n", 401 | " matches = questionsC.merge(answersC, left_on = 'AnswerId', right_index = True)\n", 402 | " questionsC = matches[['AnswerId', 'Text0_x', 'CreationDate', 'Text_x', 'NumChars_x']]\n", 403 | " questionsC.columns = ['AnswerId', 'Text0', 'CreationDate', 'Text', 'NumChars']\n", 404 | "\n", 405 | " answersC = matches[['Text0_y', 'Text_y', 'NumChars_y']]\n", 406 | " answersC.index = matches['AnswerId']\n", 407 | " 
answersC.columns = ['Text0', 'Text', 'NumChars']\n", 408 | " \n", 409 | " # find the AnswerIds has at least 3 dupes\n", 410 | " commonAnswerId = find_answerId(answersC, dupesC, 3)\n", 411 | " \n", 412 | " # select the records with those AnswerIds\n", 413 | " questionsC = questionsC.loc[questionsC.AnswerId.isin(commonAnswerId)]\n", 414 | " dupesC = dupesC.loc[dupesC.AnswerId.isin(commonAnswerId)]\n", 415 | " \n", 416 | " return questionsC, dupesC" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 12, 422 | "metadata": { 423 | "collapsed": true 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "# some questions have been linked to multiple AnswerIds.\n", 428 | "# we keep the first AnswerId associated with that question and remove the rest.\n", 429 | "questions = questions.groupby(questions.index).first()\n", 430 | "dupes = dupes.groupby(dupes.index).first()\n", 431 | "\n", 432 | "# execute the data selection function on questions, dupes and answers.\n", 433 | "questionsC, dupesC = select_data(questions, dupes, answers)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## Prepare Training and Test datasets\n", 441 | "\n", 442 | "In this example, we retain original question and 75% of the duplicate questions for training, and hold-out the most recently posted 25% of duplicate questions as test data. The training and test data are split by `CreationDate`.\n", 443 | "\n", 444 | "- training set = Original questions + 75% of oldest Duplications per original question\n", 445 | "- test set = remaining 25% of Duplications per original question" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 13, 451 | "metadata": { 452 | "collapsed": true 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "# split Original questions and their Duplications into training and test sets.\n", 457 | "def split_data(questions, dupes, frac):\n", 458 | " trainQ = questions\n", 459 | " testQ = pd.DataFrame(columns = dupes.columns.values) # create an empty data frame\n", 460 | "\n", 461 | " for answerId in np.unique(dupes.AnswerId):\n", 462 | " df = dupes.query('AnswerId == ' + str(answerId))\n", 463 | " totalCount = len(df)\n", 464 | " splitPoint = int(totalCount * frac)\n", 465 | " dfSort = df.sort_values(by = ['CreationDate'])\n", 466 | " trainQ = trainQ.append(dfSort.head(splitPoint)) # oldest N percent of duplications\n", 467 | " testQ = testQ.append(dfSort.tail(totalCount - splitPoint))\n", 468 | "\n", 469 | " # convert data type to int\n", 470 | " testQ[[\"AnswerId\", \"NumChars\"]] = testQ[[\"AnswerId\", \"NumChars\"]].astype(int) \n", 471 | " # rename the index \n", 472 | " testQ.index.rename(\"Id\", inplace=True)\n", 473 | " \n", 474 | " return trainQ, testQ" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 14, 480 | "metadata": { 481 | "collapsed": true 482 | }, 483 | "outputs": [], 484 | "source": [ 485 | "trainQ, testQ = split_data(questionsC, dupesC, 0.75)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 15, 491 | "metadata": { 492 | "collapsed": false 493 | }, 494 | "outputs": [ 495 | { 496 | "data": { 497 | "text/html": [ 498 | "
\n", 499 | "\n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | "
AnswerIdText0CreationDateTextNumChars
Id
37133777Call ASP.NET Function From Javascript?. <p>I'm...2008-08-06 17:16:36.630call asp.net function from javascript?. i'm wr...227
52236700Length of a JavaScript object (that is, associ...2008-08-07 19:42:21.060length of a javascript object (that is, associ...313
74777523Autosizing textarea using Prototype. <p>I'm cu...2008-08-11 01:43:13.493autosizing textarea using prototype. i'm curre...1664
180821830844Validate decimal numbers in JavaScript - IsNum...2008-08-20 14:21:13.793validate decimal numbers in javascript - isnum...231
21294242607Dynamically load a JavaScript file. <p>How can...2008-08-21 21:59:31.080dynamically load a javascript file. how can yo...980
\n", 561 | "
" 562 | ], 563 | "text/plain": [ 564 | " AnswerId Text0 \\\n", 565 | "Id \n", 566 | "3713 3777 Call ASP.NET Function From Javascript?.

I'm... \n", 567 | "5223 6700 Length of a JavaScript object (that is, associ... \n", 568 | "7477 7523 Autosizing textarea using Prototype.

I'm cu... \n", 569 | "18082 1830844 Validate decimal numbers in JavaScript - IsNum... \n", 570 | "21294 242607 Dynamically load a JavaScript file.

How can... \n", 571 | "\n", 572 | " CreationDate \\\n", 573 | "Id \n", 574 | "3713 2008-08-06 17:16:36.630 \n", 575 | "5223 2008-08-07 19:42:21.060 \n", 576 | "7477 2008-08-11 01:43:13.493 \n", 577 | "18082 2008-08-20 14:21:13.793 \n", 578 | "21294 2008-08-21 21:59:31.080 \n", 579 | "\n", 580 | " Text NumChars \n", 581 | "Id \n", 582 | "3713 call asp.net function from javascript?. i'm wr... 227 \n", 583 | "5223 length of a javascript object (that is, associ... 313 \n", 584 | "7477 autosizing textarea using prototype. i'm curre... 1664 \n", 585 | "18082 validate decimal numbers in javascript - isnum... 231 \n", 586 | "21294 dynamically load a javascript file. how can yo... 980 " 587 | ] 588 | }, 589 | "execution_count": 15, 590 | "metadata": {}, 591 | "output_type": "execute_result" 592 | } 593 | ], 594 | "source": [ 595 | "trainQ.head(5)" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "## Select Subsets with Sufficient Training Questions per Answer Class\n", 603 | "\n", 604 | "In our past experiments, we noticed that some Q&A pairs only link to a small number of duplicate questions. This means those answer classes may contain an insufficient number of examples to train an accurate model. We examine how the number of duplicate questions available for training each Q&A pair affects performance. \n", 605 | "\n", 606 | "\n", 607 | "\n", 608 | "The above Figure shows results for questions relative to the number of training examples available for the correct Q&A pair that should be returned. Most of our Q&A pairs (857 out of 1201) have 5 or fewer known duplicate questions available for training. Performance on these questions is relatively weak, with the correct Q&A pair landing in the top 10 results less than 40% of the time. However, when greater numbers of duplicate questions are available for training, performance improves dramatically; when Q&A pairs have 50 or more duplicate questions available for training, the classification model places these pairs in the top 10 of the retrieved results 98% of the time when they correctly match the query. The most duplicated question contains 962 duplications. \n", 609 | "\n", 610 | "For the study in this notebook, we only consider the answer classes that have more than 13 training questions (original and duplicate questions). This reduces the entire dataset to 5,153 training questions, 1,735 test questions, and 103 unique answer classes."
611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 16, 616 | "metadata": { 617 | "collapsed": true 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "countPerAns = pd.DataFrame({\"NumTrain\" : trainQ.groupby(\"AnswerId\").size()})\n", 622 | "trainQwithCount = trainQ.merge(countPerAns, left_on=\"AnswerId\", right_index=True)\n", 623 | "testQwithCount = testQ.merge(countPerAns, left_on=\"AnswerId\", right_index=True)\n", 624 | "\n", 625 | "# for each Answer class, we request more than 13 training questions.\n", 626 | "trainQ = trainQwithCount[trainQwithCount[\"NumTrain\"] > 13]\n", 627 | "testQ = testQwithCount[testQwithCount[\"NumTrain\"] > 13]" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 17, 633 | "metadata": { 634 | "collapsed": false, 635 | "scrolled": false 636 | }, 637 | "outputs": [ 638 | { 639 | "name": "stdout", 640 | "output_type": "stream", 641 | "text": [ 642 | "# of training examples: 5153\n", 643 | "# of testing examples: 1735\n", 644 | "\n", 645 | "A quick glance of the training data: \n", 646 | "\n" 647 | ] 648 | }, 649 | { 650 | "data": { 651 | "text/html": [ 652 | "

\n", 653 | "\n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | "
AnswerIdText
Id
6991369984why don't self-closing script tags work?. what...
39256169984firefox script tag error. while adding some ve...
129730869984weird javascript/jquery behavior. possible du...
335218269984html: why script tags should always have full ...
535586769984loading scripts in javascript. possible dupli...
\n", 694 | "
" 695 | ], 696 | "text/plain": [ 697 | " AnswerId Text\n", 698 | "Id \n", 699 | "69913 69984 why don't self-closing script tags work?. what...\n", 700 | "392561 69984 firefox script tag error. while adding some ve...\n", 701 | "1297308 69984 weird javascript/jquery behavior. possible du...\n", 702 | "3352182 69984 html: why script tags should always have full ...\n", 703 | "5355867 69984 loading scripts in javascript. possible dupli..." 704 | ] 705 | }, 706 | "execution_count": 17, 707 | "metadata": {}, 708 | "output_type": "execute_result" 709 | } 710 | ], 711 | "source": [ 712 | "print(\"# of training examples: \" + str(len(trainQ)))\n", 713 | "print(\"# of testing examples: \" + str(len(testQ)) + \"\\n\")\n", 714 | "print(\"A quick glance of the training data: \\n\")\n", 715 | "trainQ[[\"AnswerId\", \"Text\"]].head(5)" 716 | ] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": {}, 721 | "source": [ 722 | "## Save Outputs to a Share Directory in the Workbench\n", 723 | "\n", 724 | "Azure Machine Learning Workbench provides a flexible way of saving intermediate files. `os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')` retrieves a share directory where the files are stored. Those files can be accessed from other notebooks or Python files." 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 18, 730 | "metadata": { 731 | "collapsed": true 732 | }, 733 | "outputs": [], 734 | "source": [ 735 | "workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')\n", 736 | "trainQ.to_csv(os.path.join(workfolder, 'trainQ_part1'), sep='\\t', header=True, index=True, index_label='Id')\n", 737 | "testQ.to_csv(os.path.join(workfolder, 'testQ_part1'), sep='\\t', header=True, index=True, index_label='Id')" 738 | ] 739 | } 740 | ], 741 | "metadata": { 742 | "kernelspec": { 743 | "display_name": "Python [default]", 744 | "language": "python", 745 | "name": "python3" 746 | }, 747 | "language_info": { 748 | "codemirror_mode": { 749 | "name": "ipython", 750 | "version": 3 751 | }, 752 | "file_extension": ".py", 753 | "mimetype": "text/x-python", 754 | "name": "python", 755 | "nbconvert_exporter": "python", 756 | "pygments_lexer": "ipython3", 757 | "version": "3.5.2" 758 | } 759 | }, 760 | "nbformat": 4, 761 | "nbformat_minor": 2 762 | } 763 | -------------------------------------------------------------------------------- /notebooks/Part_2_Phrase_Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 2: Phrase Learning\n", 8 | "\n", 9 | "If you haven't complete the **Part 1: Data Preparation**, please complete it before moving forward with **Part 2: Phrase Learning**. Part 2 requires files created from Part 1.\n", 10 | "\n", 11 | "Please make sure you have __notebook__ and __nltk__ Python packages installed in the compute context you choose as kernel. For demonstration purpose, this series of notebooks uses the `local` compute context.\n", 12 | "\n", 13 | "**NOTE**: Python 3 kernel doesn't include Azure Machine Learning Workbench functionalities. Please switch the kernel to `local` before continuing further. \n", 14 | "\n", 15 | "To install __notebook__ and __nltk__, please uncomment and run the following script." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "# !pip install --upgrade notebook\n", 27 | "# !pip install --upgrade nltk" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Import Required Python Modules\n", 35 | "\n", 36 | "`modules.phrase_learning` contains a list of Python user-defined Python modules to learn informative phrases that are used in this examples. You can find the source code of those modules in the directory of `modules/phrase_learning.py`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "import pandas as pd\n", 48 | "import numpy as np\n", 49 | "import re, os, requests, warnings\n", 50 | "from collections import (namedtuple, Counter)\n", 51 | "from modules.phrase_learning import (CleanAndSplitText, ComputeNgramStats, RankNgrams, ApplyPhraseRewrites,\n", 52 | " ApplyPhraseLearning, ApplyPhraseRewritesInPlace, ReconstituteDocsFromChunks,\n", 53 | " CreateVocabForTopicModeling)\n", 54 | "from azureml.logging import get_azureml_logger\n", 55 | "warnings.filterwarnings(\"ignore\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "run_logger = get_azureml_logger()\n", 67 | "run_logger.log('amlrealworld.QnA-matching.part2-phrase-learning','true')" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Access trainQ and testQ from Part 1\n", 75 | "\n", 76 | "As we have prepared the _trainQ_ and _testQ_ from the `Part 1: Data Preparation`, we retrieve the datasets here for the further process.\n", 77 | "\n", 78 | "_trainQ_ contains 5,153 training examples and _testQ_ contains 1,735 test examples. Also, there are 103 unique answer classes in both datasets." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 3, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# load non-content bearing function words (.txt file) into a Python dictionary. 
\n", 90 | "def LoadListAsHash(fileURL):\n", 91 | " response = requests.get(fileURL, stream=True)\n", 92 | " wordsList = response.text.split('\\n')\n", 93 | "\n", 94 | " # Read in lines one by one and strip away extra spaces, \n", 95 | " # leading spaces, and trailing spaces and inserting each\n", 96 | " # cleaned up line into a hash table.\n", 97 | " listHash = {}\n", 98 | " re1 = re.compile(' +')\n", 99 | " re2 = re.compile('^ +| +$')\n", 100 | " for stringIn in wordsList:\n", 101 | " term = re2.sub(\"\",re1.sub(\" \",stringIn.strip('\\n')))\n", 102 | " if term != '':\n", 103 | " listHash[term] = 1\n", 104 | " return listHash" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')\n", 116 | "\n", 117 | "# paths to trainQ, testQ and function words.\n", 118 | "trainQ_path = os.path.join(workfolder, 'trainQ_part1')\n", 119 | "testQ_path = os.path.join(workfolder, 'testQ_part1')\n", 120 | "function_words_url = 'https://bostondata.blob.core.windows.net/stackoverflow/function_words.txt'\n", 121 | "\n", 122 | "# load the training and test data.\n", 123 | "trainQ = pd.read_csv(trainQ_path, sep='\\t', index_col='Id', encoding='latin1')\n", 124 | "testQ = pd.read_csv(testQ_path, sep='\\t', index_col='Id', encoding='latin1')\n", 125 | "\n", 126 | "# Load the list of non-content bearing function words.\n", 127 | "functionwordHash = LoadListAsHash(function_words_url)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Clean and Split the Text\n", 135 | "\n", 136 | "The CleanAndSplitText function from __phrase_learning__ takes as input a list where each row element is a single cohesive long string of text, i.e. a \"question\". The function first splits each string by various forms of punctuation into chunks of text that are likely sentences, phrases or sub-phrases. The splitting is designed to prohibit the phrase learning process from using cross-sentence or cross-phrase word strings when learning phrases.\n", 137 | "\n", 138 | "The function returns a table where each row represents a chunk of text from the questions. The `DocID` coulmn indicates the original row index from associated question in the input from which the chunk of text originated. The `DocLine` column contains the original text excluding the punctuation marks and `HTML` markup that have been during the cleaning process. The `Lowercase Taxt` column contains a fully lower-cased version of the text in the `CleanedText` column." 
139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 5, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "[nltk_data] Downloading package punkt to\n", 153 | "[nltk_data] C:\\Users\\mez\\AppData\\Roaming\\nltk_data...\n", 154 | "[nltk_data] Package punkt is already up-to-date!\n", 155 | "[nltk_data] Downloading package punkt to\n", 156 | "[nltk_data] C:\\Users\\mez\\AppData\\Roaming\\nltk_data...\n", 157 | "[nltk_data] Package punkt is already up-to-date!\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "CleanedTrainQ = CleanAndSplitText(trainQ)\n", 163 | "CleanedTestQ = CleanAndSplitText(testQ)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 6, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/html": [ 176 | "
\n", 177 | "\n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | "
DocIDDocLineCleanedTextLowercaseText
0699130why don't self-closing script tags workwhy don't self-closing script tags work
1699131what is the reason browsers do not correctly r...what is the reason browsers do not correctly r...
2699132only this is recognizedonly this is recognized
3699133does this break the concept of xhtml supportdoes this break the concept of xhtml support
4699134notenote
\n", 225 | "
" 226 | ], 227 | "text/plain": [ 228 | " DocID DocLine CleanedText \\\n", 229 | "0 69913 0 why don't self-closing script tags work \n", 230 | "1 69913 1 what is the reason browsers do not correctly r... \n", 231 | "2 69913 2 only this is recognized \n", 232 | "3 69913 3 does this break the concept of xhtml support \n", 233 | "4 69913 4 note \n", 234 | "\n", 235 | " LowercaseText \n", 236 | "0 why don't self-closing script tags work \n", 237 | "1 what is the reason browsers do not correctly r... \n", 238 | "2 only this is recognized \n", 239 | "3 does this break the concept of xhtml support \n", 240 | "4 note " 241 | ] 242 | }, 243 | "execution_count": 6, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "CleanedTrainQ.head(5)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "## Learn Informative Phrases \n", 257 | "The phrases can be treated as single compound word units in down-stream processes such as discriminative training. To learn the phrases, we have implemented the basic framework for key phrase learning as described in the paper entitled [\"Modeling Multiword Phrases with Constrained Phrases Tree for Improved Topic Modeling of Conversational Speech\"](http://people.csail.mit.edu/hazen/publications/Hazen-SLT-2012.pdf) which was originally presented in the 2012 IEEE Workshop on Spoken Language Technology. Although the paper examines the use of the technology for analyzing human-to-human conversations, the techniques are quite general and can be applied to a wide range of natural language data including news stories, legal documents, research publications, social media forum discussions, customer feedback forms, product reviews, and many more.\n", 258 | "\n", 259 | "`ApplyPhraseLearning` module takes the following arguments:\n", 260 | "- `textData`: array, a list of text data.\n", 261 | "- `learnedPhrases`: array, a list of learned phrases. For initialization, an empty list should be given.\n", 262 | "- `maxNumPhrases`: int, (default=200), maximium number of phrases to learn. If you want to test the code out quickly then set this to a small value (e.g. 100) and set `verbose` to True when running the quick test.\n", 263 | "- `maxPhraseLength`: int, (default=7), maximum number of words allowed in the learned phrases.\n", 264 | "- `maxPhrasesPerIter`: int, (default=50), maximum number of phrases to learn per iteration. Increasing this number may speed up processing but will affect the ordering of the phrases learned and good phrases could be by-passed if the maxNumPhrases is set to a small number.\n", 265 | "- `minCount`: int, (default=5), minimum number of times a phrase must occur in the data to be considered during the phrase learning process.\n", 266 | "- `functionwordHash`: dict, (default={}), a precreated hash table containing the list of function words used during phrase learning. \n", 267 | "- `blacklistHash`: dict, (default={}), a precreated hash table containing the list of black list words to be ignored during phrase learning.\n", 268 | "- `verbose`: boolean, (default=False). If verbose=True, it prints out the learned phrases to stdout buffer while its learning. This will generate a lot of text to stdout, so best to turn this off except for testing and debugging." 
269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 7, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "Start phrase learning with 0 phrases of 200 phrases learned\n", 283 | "Iteration 1: Added 42 new phrases in 1.34 seconds (Learned 42 of max 200)\n", 284 | "Iteration 2: Added 35 new phrases in 1.41 seconds (Learned 77 of max 200)\n", 285 | "Iteration 3: Added 32 new phrases in 1.14 seconds (Learned 109 of max 200)\n", 286 | "Iteration 4: Added 34 new phrases in 1.34 seconds (Learned 143 of max 200)\n", 287 | "Iteration 5: Added 31 new phrases in 1.27 seconds (Learned 174 of max 200)\n", 288 | "Iteration 6: Added 11 new phrases in 1.36 seconds (Learned 185 of max 200)\n", 289 | "Iteration 7: Added 3 new phrases in 1.14 seconds (Learned 188 of max 200)\n", 290 | "Iteration 8: Added 4 new phrases in 1.21 seconds (Learned 192 of max 200)\n", 291 | "Iteration 9: Added 1 new phrases in 1.27 seconds (Learned 193 of max 200)\n", 292 | "Iteration 10: Added 1 new phrases in 1.16 seconds (Learned 194 of max 200)\n", 293 | "Iteration 11: Added 1 new phrases in 1.23 seconds (Learned 195 of max 200)\n", 294 | "Iteration 12: Added 1 new phrases in 1.16 seconds (Learned 196 of max 200)\n", 295 | "Iteration 13: Added 1 new phrases in 1.23 seconds (Learned 197 of max 200)\n", 296 | "Iteration 14: Added 1 new phrases in 1.32 seconds (Learned 198 of max 200)\n", 297 | "Iteration 15: Added 1 new phrases in 1.19 seconds (Learned 199 of max 200)\n", 298 | "Iteration 16: Added 1 new phrases in 1.19 seconds (Learned 200 of max 200)\n", 299 | "*** Phrase learning completed in 0.01 hours ***\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "# Initialize an empty list of learned phrases\n", 305 | "# If you have completed a partial run of phrase learning\n", 306 | "# and want to add more phrases, you can use the pre-learned \n", 307 | "# phrases as a starting point instead and the new phrases\n", 308 | "# will be appended to the list\n", 309 | "learnedPhrasesQ = []\n", 310 | "\n", 311 | "# Create a copy of the original text data that will be used during learning\n", 312 | "# The copy is needed because the algorithm does in-place replacement of learned\n", 313 | "# phrases directly on the text data structure it is provided\n", 314 | "phraseTextDataQ = []\n", 315 | "for textLine in CleanedTrainQ['LowercaseText']:\n", 316 | " phraseTextDataQ.append(' ' + textLine + ' ')\n", 317 | "\n", 318 | "# Run the phrase learning algorithm.\n", 319 | "ApplyPhraseLearning(phraseTextDataQ, learnedPhrasesQ, maxNumPhrases=200, maxPhraseLength=7, maxPhrasesPerIter=50,\n", 320 | " minCount=5, functionwordHash=functionwordHash)\n", 321 | "\n", 322 | "# Add text with learned phrases back into data frame\n", 323 | "CleanedTrainQ['TextWithPhrases'] = phraseTextDataQ\n", 324 | "\n", 325 | "# Apply the phrase learning to test data.\n", 326 | "CleanedTestQ['TextWithPhrases'] = ApplyPhraseRewritesInPlace(CleanedTestQ, 'LowercaseText', learnedPhrasesQ)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 8, 332 | "metadata": { 333 | "collapsed": false 334 | }, 335 | "outputs": [ 336 | { 337 | "name": "stdout", 338 | "output_type": "stream", 339 | "text": [ 340 | "\n", 341 | "Here are some phrases we learned in this part of the tutorial: \n", 342 | "\n", 343 | "['possible duplicate', \"i'm trying\", 'works fine', 'doing wrong', 'click event', 'following code', 'using jquery', 'uncaught 
typeerror', 'ajax request', 'global variable', 'div class', 'json object', 'callback function', \"i'm not sure\", 'anonymous function', 'php file', 'return value', 'user clicks', 'dynamically created', 'input type']\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "print(\"\\nHere are some phrases we learned in this part of the tutorial: \\n\")\n", 349 | "print(learnedPhrasesQ[:20])" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "## Reconstruct the Full Processed Text\n", 357 | "\n", 358 | "After replacing the text with learned phrases, we reconstruct the sentences from the chunks of text and insert the sentences in the `TextWithPhrases` field. " 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 9, 364 | "metadata": { 365 | "collapsed": true 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "# reconstitue the text from seperated chunks.\n", 370 | "trainQ['TextWithPhrases'] = ReconstituteDocsFromChunks(CleanedTrainQ, 'DocID', 'TextWithPhrases')\n", 371 | "testQ['TextWithPhrases'] = ReconstituteDocsFromChunks(CleanedTestQ, 'DocID', 'TextWithPhrases')" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "## Tokenize Text with Learned Phrases\n", 379 | "\n", 380 | "We learn a vocabulary by considering some text exclusion criteria, such as stop words, non-alphabetic words, the words below word count threshold, etc. \n", 381 | "\n", 382 | "`TokenizeText` module breaks the reconstituted text into individual tokens and excludes any word that doesn't exist in the vocabulary." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 10, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "def TokenizeText(textData, vocabHash):\n", 394 | " tokenizedText = ''\n", 395 | " for token in textData.split():\n", 396 | " if token in vocabHash:\n", 397 | " tokenizedText += (token.strip() + ',')\n", 398 | " return tokenizedText.strip(',')" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 11, 404 | "metadata": { 405 | "collapsed": false 406 | }, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "Counting words\n", 413 | "Building vocab\n", 414 | "Excluded 307 stop words\n", 415 | "Excluded 911 non-alphabetic words\n", 416 | "Excluded 15265 words below word count threshold\n", 417 | "Excluded 142 words below doc count threshold\n", 418 | "Excluded 3 words above max doc frequency\n", 419 | "Final Vocab Size: 3115 words\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "# create the vocabulary.\n", 425 | "vocabHashQ = CreateVocabForTopicModeling(trainQ['TextWithPhrases'], functionwordHash)\n", 426 | "\n", 427 | "# tokenize the text.\n", 428 | "trainQ['Tokens'] = trainQ['TextWithPhrases'].apply(lambda x: TokenizeText(x, vocabHashQ))\n", 429 | "testQ['Tokens'] = testQ['TextWithPhrases'].apply(lambda x: TokenizeText(x, vocabHashQ))" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 12, 435 | "metadata": { 436 | "collapsed": false, 437 | "scrolled": true 438 | }, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/html": [ 443 | "
\n", 444 | "\n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | "
AnswerIdTokens
Id
6991369984self-closing,script,tags,work,reason,browsers,...
39256169984firefox,script,tag,error,adding,basic,script,t...
129730869984weird,javascript/jquery,behavior,possible_dupl...
335218269984html,script,tags,ending,possible_duplicate,t,s...
535586769984loading,scripts,possible_duplicate,don&#39,t,s...
\n", 485 | "
" 486 | ], 487 | "text/plain": [ 488 | " AnswerId Tokens\n", 489 | "Id \n", 490 | "69913 69984 self-closing,script,tags,work,reason,browsers,...\n", 491 | "392561 69984 firefox,script,tag,error,adding,basic,script,t...\n", 492 | "1297308 69984 weird,javascript/jquery,behavior,possible_dupl...\n", 493 | "3352182 69984 html,script,tags,ending,possible_duplicate,t,s...\n", 494 | "5355867 69984 loading,scripts,possible_duplicate,don',t,s..." 495 | ] 496 | }, 497 | "execution_count": 12, 498 | "metadata": {}, 499 | "output_type": "execute_result" 500 | } 501 | ], 502 | "source": [ 503 | "trainQ[['AnswerId', 'Tokens']].head(5)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "## Save Outputs to a Share Directory in the Workbench" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 13, 516 | "metadata": { 517 | "collapsed": true 518 | }, 519 | "outputs": [], 520 | "source": [ 521 | "trainQ.to_csv(os.path.join(workfolder, 'trainQ_part2'), sep='\\t', header=True, index=True, index_label='Id')\n", 522 | "testQ.to_csv(os.path.join(workfolder, 'testQ_part2'), sep='\\t', header=True, index=True, index_label='Id')" 523 | ] 524 | } 525 | ], 526 | "metadata": { 527 | "kernelspec": { 528 | "display_name": "Python [default]", 529 | "language": "python", 530 | "name": "python3" 531 | }, 532 | "language_info": { 533 | "codemirror_mode": { 534 | "name": "ipython", 535 | "version": 3 536 | }, 537 | "file_extension": ".py", 538 | "mimetype": "text/x-python", 539 | "name": "python", 540 | "nbconvert_exporter": "python", 541 | "pygments_lexer": "ipython3", 542 | "version": "3.5.2" 543 | } 544 | }, 545 | "nbformat": 4, 546 | "nbformat_minor": 2 547 | } 548 | -------------------------------------------------------------------------------- /notebooks/Part_3_Model_Training_and_Evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 3: Model Training and Evaluation\n", 8 | "\n", 9 | "If you haven't complete the **Part 1: Data Preparation** and **Part 2: Phrase Learning**, please complete them before moving forward with **Part 3: Model Training and Evaluation**.\n", 10 | "\n", 11 | "**NOTE**: Python 3 kernel doesn't include Azure Machine Learning Workbench functionalities. Please switch the kernel to `local` before continuing further. \n", 12 | "\n", 13 | "This example is designed to score new questions against the pre-existing Q&A pairs by training text classification models where each pre-existing Q&A pair is a unique class and a subset of the duplicate questions for each Q&A pair are available as training material. \n", 14 | "\n", 15 | "In the Part 3, the classification model uses an ensemble method to aggregate the following three base classifiers. In each base classifier, the `AnswerId` is used as the class label and the BOWs representations is used as the features.\n", 16 | "\n", 17 | "1. Naive Bayes Classifier\n", 18 | "2. Support Vector Machine (TF-IDF as features)\n", 19 | "3. Random Forest (NB Scores as features)\n", 20 | "\n", 21 | "Two different evaluation metrics are used to assess performance.\n", 22 | "1. `Average Rank (AR)`: indicates the average position where the correct answer is found in the list of retrieved Q&A pairs (out of the full set of 103 answer classes). \n", 23 | "2. 
`Top 3 Percentage`: indicates the percentage of test questions that the correct answer can be retrieved in the top three choices in the returned ranked list. \n", 24 | "\n", 25 | "`Average Rank (AR)` and `Top 3 Percentage` on the test set are calculated using the following formula:\n", 26 | "\n", 27 | "" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Import Required Python Modules\n", 35 | "\n", 36 | "`modules.feature_extractor` contains a list of user-defined Python modules to extract effective features that are used in this examples. You can find the source code of those modules in the directory of `modules/feature_extractor.py`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "import pandas as pd\n", 48 | "import numpy as np\n", 49 | "import os, warnings\n", 50 | "from sklearn import svm\n", 51 | "from sklearn.ensemble import RandomForestClassifier\n", 52 | "from modules.feature_extractor import (tokensToIds, countMatrix, priorProbabilityAnswer, posterioriProb, \n", 53 | " feature_selection, featureWeights, wordProbabilityInAnswer, \n", 54 | " wordProbabilityNotinAnswer, normalizeTF, getIDF, softmax)\n", 55 | "from azureml.logging import get_azureml_logger\n", 56 | "warnings.filterwarnings(\"ignore\")" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "run_logger = get_azureml_logger()\n", 68 | "run_logger.log('amlrealworld.QnA-matching.part3-model-training-eval','true')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Access trainQ and testQ from Part 2\n", 76 | "\n", 77 | "As we have prepared the _trainQ_ and _testQ_ with learned phrases and tokens from `Part 2: Phrase Learning`, we retrieve the datasets here for the further process.\n", 78 | "\n", 79 | "_trainQ_ contains 5,153 training examples and _testQ_ contains 1,735 test examples. Also, there are 103 unique answer classes in both datasets." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 2, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY')\n", 91 | "\n", 92 | "# paths to trainQ and testQ.\n", 93 | "trainQ_path = os.path.join(workfolder, 'trainQ_part2')\n", 94 | "testQ_path = os.path.join(workfolder, 'testQ_part2')\n", 95 | "\n", 96 | "# load the training and test data.\n", 97 | "trainQ = pd.read_csv(trainQ_path, sep='\\t', index_col='Id', encoding='latin1')\n", 98 | "testQ = pd.read_csv(testQ_path, sep='\\t', index_col='Id', encoding='latin1')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "## Extract Features\n", 106 | "\n", 107 | "Selecting the right set of features is very critical for the model training. In this section, we show you several feature extraction approaches that have proved to yield good performance in text classification use cases.\n", 108 | "\n", 109 | "### Term Frequency and Inverse Document Frequency (TF-IDF) \n", 110 | "\n", 111 | "TF-IDF is a commonly used feature weighting approach for text classification. \n", 112 | "\n", 113 | "Each question `d` is typically represented by a feature vector `x` that represents the contents of `d`. 
Because different questions may have different lengths, it can be useful to apply L1 normalization to the feature vector `x`. Therefore, a normalized `Term Frequency` matrix can be obtained based on the following formula.\n", 114 | "\n", 115 | "\n", 116 | "\n", 117 | "Considering all tokens observed in the training questions, we compute the `Inverse Document Frequency` for each token based on the following formula.\n", 118 | "\n", 119 | "\n", 120 | "\n", 121 | "By knowing the `Term Frequency (TF)` matrix and the `Inverse Document Frequency (IDF)` vector, we can simply compute the `TF-IDF` matrix by multiplying them together.\n", 122 | "\n", 123 | "" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 3, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "token2IdHashInit = tokensToIds(trainQ['Tokens'], featureHash=None)\n", 135 | "\n", 136 | "# get unique answerId in ascending order\n", 137 | "uniqueAnswerId = list(np.unique(trainQ['AnswerId']))\n", 138 | "\n", 139 | "N_wQ = countMatrix(trainQ, token2IdHashInit)\n", 140 | "idf = getIDF(N_wQ)\n", 141 | "\n", 142 | "x_wTrain = normalizeTF(trainQ, token2IdHashInit)\n", 143 | "x_wTest = normalizeTF(testQ, token2IdHashInit)\n", 144 | "\n", 145 | "tfidfTrain = (x_wTrain.T * idf).T\n", 146 | "tfidfTest = (x_wTest.T * idf).T" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### Naive Bayes Scores\n", 154 | "\n", 155 | "Besides using the IDF as the word weighting mechanism, a hypothesis-testing likelihood ratio approach is also implemented here. \n", 156 | "\n", 157 | "In this approach, the word weights are associated with the answer classes and are calculated using the following formula.\n", 158 | "\n", 159 | "\n", 160 | "\n", 161 | "\n", 162 | "\n", 163 | "By knowing the `Term Frequency (TF)` matrix and the `Weight` vector for each class, we can simply compute the `Naive Bayes Scores` matrix for each class by multiplying them together.\n", 164 | "\n", 165 | "#### Feature selection\n", 166 | "\n", 167 | "Text classification models often pre-select a set of features (i.e., tokens) that carry the most class-relevant information for further processing, while ignoring words that carry little to no value for identifying classes. A variety of feature selection methods have been previously explored for text processing. In this example, we have had the most success selecting features based on the estimated class posterior probability `P(A|w)`, where `A` is a specific answer class and `w` is a specific token. The maximum a posteriori probability (MAP) estimate of `P(A|w)` is expressed as\n", 168 | "\n", 169 | "\n", 170 | "\n", 171 | "Feature selection in this example is performed by selecting, for each answer class, the top `N` tokens that maximize `P(A|w)`. In order to determine the best value for the `TopN` parameter, you can simply run `scripts/naive_bayes.py` with the `local` compute context in the Azure Machine Learning Workbench and enter different integer values as `Arguments`.\n", 172 | "\n", 173 | "\n", 174 | "\n", 175 | "Based on our experiments, `TopN = 19` yields the best result and is demonstrated in this notebook. 
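The `feature_selection` helper used in the next cell lives in `modules/feature_extractor.py`, and its internals are not shown in this notebook. As a rough illustration of the idea only (not the repository's exact implementation; the smoothing constant and variable names are illustrative assumptions), a posterior-based top-`N` selection over a token-by-class count matrix could be sketched as follows:

```python
import numpy as np

def select_top_tokens(N_wA, P_A, id2token, topN=19):
    """Illustrative top-N feature selection based on P(A|w).

    N_wA:     (num_tokens, num_classes) token count matrix per answer class.
    P_A:      (num_classes,) prior probability of each answer class.
    id2token: dict mapping a token's row index to the token string.
    """
    # Column-normalize the counts to get P(w|A) for every class, with a small
    # smoothing term so unseen tokens do not produce zero probabilities.
    smoothed = N_wA + 1e-4
    P_wA = smoothed / smoothed.sum(axis=0, keepdims=True)

    # Bayes rule: P(A|w) is proportional to P(w|A) * P(A); normalize over classes.
    joint = P_wA * P_A[np.newaxis, :]
    P_Aw = joint / joint.sum(axis=1, keepdims=True)

    # For every class, keep the topN tokens with the largest posterior.
    selected = set()
    for a in range(P_Aw.shape[1]):
        for idx in np.argsort(-P_Aw[:, a])[:topN]:
            selected.add(id2token[idx])
    return selected
```

The union of the per-class top-`N` token lists then plays the role of the reduced vocabulary that the notebook passes to `tokensToIds` as `featureHash`.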
" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 4, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "# calculate the count matrix of all training questions.\n", 187 | "N_wAInit = countMatrix(trainQ, token2IdHashInit, 'AnswerId', uniqueAnswerId)\n", 188 | "\n", 189 | "P_A = priorProbabilityAnswer(trainQ['AnswerId'], uniqueAnswerId)\n", 190 | "P_Aw = posterioriProb(N_wAInit, P_A, uniqueAnswerId)\n", 191 | "\n", 192 | "# select top N important tokens per answer class.\n", 193 | "featureHash = feature_selection(P_Aw, token2IdHashInit, topN=19)\n", 194 | "token2IdHash = tokensToIds(trainQ['Tokens'], featureHash=featureHash)\n", 195 | "\n", 196 | "N_wA = countMatrix(trainQ, token2IdHash, 'AnswerId', uniqueAnswerId)\n", 197 | "\n", 198 | "alpha = 0.0001\n", 199 | "P_w = featureWeights(N_wA, alpha)\n", 200 | "\n", 201 | "beta = 0.0001\n", 202 | "P_wA = wordProbabilityInAnswer(N_wA, P_w, beta)\n", 203 | "P_wNotA = wordProbabilityNotinAnswer(N_wA, P_w, beta)\n", 204 | "\n", 205 | "NBWeights = np.log(P_wA / P_wNotA)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Train Classification Models and Predict on Test Data" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "### Naive Bayes Classifier\n", 220 | "\n", 221 | "We implement the _Naive Bayes Classifier_ as described in the paper entitled [\"MCE Training Techniques for Topic Identification of Spoken Audio Documents\"](http://ieeexplore.ieee.org/abstract/document/5742980/)." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 5, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "beta_A = 0\n", 233 | "\n", 234 | "x_wTest = normalizeTF(testQ, token2IdHash)\n", 235 | "Y_test_prob1 = softmax(-beta_A + np.dot(x_wTest.T, NBWeights))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### Support Vector Machine (TF-IDF as features)\n", 243 | "\n", 244 | "Traditionally, Support Vector Machine (SVM) model finds a hyperplane which maximally seperates positive and negative training tokens in a vector space. In its standard form, an SVM is a two-class classifier. To create a SVM model for a problem with multiple classes, a one-versus-rest (OVR) SVM classifier is typically learned for each answer class.\n", 245 | "\n", 246 | "The `sklearn` Python package implement such a classifier and we use the implementation in this example. More information about this `LinearSVC` classifier can be found [here](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html)." 
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 6, 252 | "metadata": { 253 | "collapsed": true 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "X_train, Y_train = tfidfTrain.T, np.array(trainQ['AnswerId'])\n", 258 | "clf = svm.LinearSVC(dual=True, multi_class='ovr', penalty='l2', C=1, loss=\"squared_hinge\", random_state=1)\n", 259 | "clf.fit(X_train, Y_train)\n", 260 | "\n", 261 | "X_test = tfidfTest.T\n", 262 | "Y_test_prob2 = softmax(clf.decision_function(X_test))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "### Random Forest (NB Scores as features)\n", 270 | "\n", 271 | "Similar to the above one-versus-rest SVM classifier, we also implement a one-versus-rest Random Forest classifier built from a base two-class Random Forest classifier from `sklearn`. More information about the `RandomForestClassifier` can be found [here](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html).\n", 272 | "\n", 273 | "In each base classifier, we dynamically compute the Naive Bayes scores for the positive class as the features. Since the number of negative examples is much larger than the number of positive examples, we keep all positive examples and randomly select negative examples based on a negative-to-positive ratio to obtain a more balanced training set. This is controlled by the `ratio` parameter in the `ovrClassifier` function below.\n", 274 | "\n", 275 | "In this classifier, we need to tune two hyper-parameters: `TopN` and `n_estimators`. `TopN` is the same parameter we tuned in the _Feature Selection_ step, and `n_estimators` indicates the number of trees to be constructed in the Random Forest classifier. To identify the best values for the hyper-parameters, you can run `scripts/random_forest.py` with the `local` compute context in the Azure Machine Learning Workbench and enter different integer values as `Arguments`. The values of `TopN` and `n_estimators` should be space-delimited.\n", 276 | "\n", 277 | "\n", 278 | "\n", 279 | "Based on our experiments, `TopN = 19` and `n_estimators = 250` yield the best result and are demonstrated in this notebook."
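If you would rather explore `TopN` and `n_estimators` inside the notebook instead of through the Workbench `Arguments` box, a small grid search can be assembled from the same helpers. This is only a sketch: it assumes the `ovrClassifier` and `rank` functions defined later in this notebook are in scope, the candidate values are arbitrary, and runtimes grow quickly with `n_estimators`:

```python
# Illustrative grid search over the two hyper-parameters. The candidate values
# below are arbitrary; TopN = 19 and n_estimators = 250 are the values the
# tutorial reports as best.
for topN_c in (10, 19, 30):
    # Re-select features and recompute the NB weights for this TopN.
    featureHash_c = feature_selection(P_Aw, token2IdHashInit, topN=topN_c)
    token2IdHash_c = tokensToIds(trainQ['Tokens'], featureHash=featureHash_c)
    N_wA_c = countMatrix(trainQ, token2IdHash_c, 'AnswerId', uniqueAnswerId)
    P_w_c = featureWeights(N_wA_c, alpha)
    NBWeights_c = np.log(wordProbabilityInAnswer(N_wA_c, P_w_c, beta) /
                         wordProbabilityNotinAnswer(N_wA_c, P_w_c, beta))
    x_wTrain_c = normalizeTF(trainQ, token2IdHash_c)
    x_wTest_c = normalizeTF(testQ, token2IdHash_c)

    for n_estimators_c in (100, 250):
        rf = RandomForestClassifier(n_estimators=n_estimators_c,
                                    criterion='entropy', random_state=1)
        probs = ovrClassifier(trainQ['AnswerId'], x_wTrain_c, x_wTest_c,
                              NBWeights_c, rf, ratio=3)
        # Evaluate on a copy so the original testQ frame is not modified.
        ranked = rank(testQ.copy(), probs, uniqueAnswerId)
        AR_c = np.floor(ranked['Rank'].mean())
        top3_c = round(len(ranked.query('Rank <= 3')) / len(ranked), 3)
        print(topN_c, n_estimators_c, AR_c, top3_c)
```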
280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 7, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "# train one-vs-rest classifier using NB scores as features.\n", 291 | "def ovrClassifier(trainLabels, x_wTrain, x_wTest, NBWeights, clf, ratio):\n", 292 | " uniqueLabel = np.unique(trainLabels)\n", 293 | " dummyLabels = pd.get_dummies(trainLabels)\n", 294 | " numTest = x_wTest.shape[1]\n", 295 | " Y_test_prob = np.zeros(shape=(numTest, len(uniqueLabel)))\n", 296 | "\n", 297 | " for i in range(len(uniqueLabel)):\n", 298 | " X_train_all, Y_train_all = x_wTrain.T * NBWeights[:, i], dummyLabels.iloc[:, i]\n", 299 | " X_test = x_wTest.T * NBWeights[:, i]\n", 300 | " \n", 301 | " # with sample selection.\n", 302 | " if ratio is not None:\n", 303 | " # ratio = # of Negative/# of Positive\n", 304 | " posIdx = np.where(Y_train_all == 1)[0]\n", 305 | " negIdx = np.random.choice(np.where(Y_train_all == 0)[0], ratio*len(posIdx))\n", 306 | " allIdx = np.concatenate([posIdx, negIdx])\n", 307 | " X_train, Y_train = X_train_all[allIdx], Y_train_all.iloc[allIdx]\n", 308 | " else: # without sample selection.\n", 309 | " X_train, Y_train = X_train_all, Y_train_all\n", 310 | " \n", 311 | " clf.fit(X_train, Y_train)\n", 312 | " if hasattr(clf, \"decision_function\"):\n", 313 | " Y_test_prob[:, i] = clf.decision_function(X_test)\n", 314 | " else:\n", 315 | " Y_test_prob[:, i] = clf.predict_proba(X_test)[:, 1]\n", 316 | "\n", 317 | " return softmax(Y_test_prob)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 8, 323 | "metadata": { 324 | "collapsed": true 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "x_wTrain = normalizeTF(trainQ, token2IdHash)\n", 329 | "x_wTest = normalizeTF(testQ, token2IdHash)\n", 330 | "\n", 331 | "clf = RandomForestClassifier(n_estimators=250, criterion='entropy', random_state=1)\n", 332 | "Y_test_prob3 = ovrClassifier(trainQ[\"AnswerId\"], x_wTrain, x_wTest, NBWeights, clf, ratio=3)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "### Ensemble Model\n", 340 | "\n", 341 | "We build an ensemble model by aggregating the predicted probabilities from three previously trained classifiers. The base classifiers are equally weighted in this ensemble method. " 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 9, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "Y_test_prob_aggr = np.mean([Y_test_prob1, Y_test_prob2, Y_test_prob3], axis=0)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "## Evaluate Model Performance\n", 360 | "\n", 361 | "Two different evaluation metrics are used to assess performance. \n", 362 | "1. `Average Rank (AR)`: indicates the average position where the correct answer is found in the list of retrieved Q&A pairs (out of the full set of 103 answer classes). \n", 363 | "2. `Top 3 Percentage`: indicates the percentage of test questions that the correct answer can be retrieved in the top three choices in the returned ranked list. " 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 10, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "# get the rank of answerIds for a given question. 
\n", 375 | "def rank(frame, scores, uniqueAnswerId):\n", 376 | " frame['SortedAnswers'] = list(np.array(uniqueAnswerId)[np.argsort(-scores, axis=1)])\n", 377 | " \n", 378 | " rankList = []\n", 379 | " for i in range(len(frame)):\n", 380 | " rankList.append(np.where(frame['SortedAnswers'].iloc[i] == frame['AnswerId'].iloc[i])[0][0] + 1)\n", 381 | " frame['Rank'] = rankList\n", 382 | " \n", 383 | " return frame" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 11, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | "Average of rank: 5.0\n", 398 | "Percentage of questions find answers in the first 3 choices: 0.684\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "testQ = rank(testQ, Y_test_prob_aggr, uniqueAnswerId)\n", 404 | "\n", 405 | "AR = np.floor(testQ['Rank'].mean())\n", 406 | "top3 = round(len(testQ.query('Rank <= 3'))/len(testQ), 3)\n", 407 | " \n", 408 | "print('Average of rank: ' + str(AR))\n", 409 | "print('Percentage of questions find answers in the first 3 choices: ' + str(top3))" 410 | ] 411 | } 412 | ], 413 | "metadata": { 414 | "kernelspec": { 415 | "display_name": "Python [default]", 416 | "language": "python", 417 | "name": "python3" 418 | }, 419 | "language_info": { 420 | "codemirror_mode": { 421 | "name": "ipython", 422 | "version": 3 423 | }, 424 | "file_extension": ".py", 425 | "mimetype": "text/x-python", 426 | "name": "python", 427 | "nbconvert_exporter": "python", 428 | "pygments_lexer": "ipython3", 429 | "version": "3.5.2" 430 | } 431 | }, 432 | "nbformat": 4, 433 | "nbformat_minor": 2 434 | } 435 | -------------------------------------------------------------------------------- /scripts/naive_bayes.py: -------------------------------------------------------------------------------- 1 | ################################################################################################################################### 2 | # This script is used for hyperparameter tunning of the Naive Bayes model described in the Part 3: Model Training and Evaluation. 3 | # To run this script, please enter a non-negative integer in the "Argument" box above the screen. 4 | # This argument indicates the number of important tokens that we selected for each answer class. 5 | # The total number of tokens that are selected for all classes construct the whole feature space. 6 | ################################################################################################################################### 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import os, sys, warnings 11 | from azureml.logging import get_azureml_logger 12 | sys.path.append("") 13 | from modules.feature_extractor import (tokensToIds, countMatrix, priorProbabilityAnswer, posterioriProb, 14 | feature_selection, featureWeights, wordProbabilityInAnswer, 15 | wordProbabilityNotinAnswer, normalizeTF, softmax) 16 | warnings.filterwarnings("ignore") 17 | 18 | run_logger = get_azureml_logger() 19 | run_logger.log('amlrealworld.QnA-matching.naive-bayes','true') 20 | 21 | 22 | 23 | ######################################### 24 | # User Defined Functions 25 | ######################################### 26 | 27 | # get the rank of answerIds for a given question. 
28 | def rank(frame, scores, uniqueAnswerId): 29 | frame['SortedAnswers'] = list(np.array(uniqueAnswerId)[np.argsort(-scores, axis=1)]) 30 | 31 | rankList = [] 32 | for i in range(len(frame)): 33 | rankList.append(np.where(frame['SortedAnswers'].iloc[i] == frame['AnswerId'].iloc[i])[0][0] + 1) 34 | frame['Rank'] = rankList 35 | 36 | return frame 37 | 38 | 39 | ######################################### 40 | # Main Function 41 | ######################################### 42 | 43 | def main(): 44 | 45 | ######################################### 46 | # Accept One Argument as Input 47 | ######################################### 48 | 49 | try: 50 | topN = int(sys.argv[1]) 51 | except IndexError: 52 | print("This script takes one argument. Please enter a valid non-negative integer number.\n") 53 | raise 54 | 55 | 56 | ######################################### 57 | # Access trainQ and testQ from Part 2 58 | ######################################### 59 | 60 | workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY') 61 | 62 | # paths to trainQ and testQ. 63 | trainQ_path = os.path.join(workfolder, 'trainQ_part2') 64 | testQ_path = os.path.join(workfolder, 'testQ_part2') 65 | 66 | # load the training and test data. 67 | trainQ = pd.read_csv(trainQ_path, sep='\t', index_col='Id', encoding='latin1') 68 | testQ = pd.read_csv(testQ_path, sep='\t', index_col='Id', encoding='latin1') 69 | 70 | 71 | ######################################### 72 | # Extract Features 73 | ######################################### 74 | 75 | token2IdHashInit = tokensToIds(trainQ['Tokens'], featureHash=None) 76 | 77 | # get unique answerId in ascending order 78 | uniqueAnswerId = list(np.unique(trainQ['AnswerId'])) 79 | 80 | # calculate the count matrix of all training questions. 81 | N_wAInit = countMatrix(trainQ, token2IdHashInit, 'AnswerId', uniqueAnswerId) 82 | 83 | P_A = priorProbabilityAnswer(trainQ['AnswerId'], uniqueAnswerId) 84 | P_Aw = posterioriProb(N_wAInit, P_A, uniqueAnswerId) 85 | 86 | # select top N important tokens per answer class. 87 | featureHash = feature_selection(P_Aw, token2IdHashInit, topN=topN) 88 | token2IdHash = tokensToIds(trainQ['Tokens'], featureHash=featureHash) 89 | 90 | N_wA = countMatrix(trainQ, token2IdHash, 'AnswerId', uniqueAnswerId) 91 | 92 | alpha = 0.0001 93 | P_w = featureWeights(N_wA, alpha) 94 | 95 | beta = 0.0001 96 | P_wA = wordProbabilityInAnswer(N_wA, P_w, beta) 97 | P_wNotA = wordProbabilityNotinAnswer(N_wA, P_w, beta) 98 | 99 | 100 | ######################################### 101 | # Train Naive Bayes Classifier 102 | ######################################### 103 | 104 | NBWeights = np.log(P_wA / P_wNotA) 105 | 106 | 107 | ######################################### 108 | # Predict Probabilities on Test 109 | ######################################### 110 | 111 | beta_A = 0 112 | x_wTest = normalizeTF(testQ, token2IdHash) 113 | Y_test_prob = softmax(-beta_A + np.dot(x_wTest.T, NBWeights)) 114 | 115 | 116 | ######################################### 117 | # Evaluate Model Performance 118 | ######################################### 119 | # We use two evaluation matrices (Average Rank and Top 3 Percentage) to test our model performance. 120 | # The Average Rank can be interpreted as in average at which position we can find the correct answer among all available answers for a given question. 121 | # The Top 3 Percentage can be interpreted as how many percentage of the new questions that we can find their correct answers in the first 3 choices. 
122 | # sort the similarity scores in descending order and map them to the corresponding AnswerId in Answer set 123 | 124 | testQ = rank(testQ, Y_test_prob, uniqueAnswerId) 125 | 126 | AR = np.floor(testQ['Rank'].mean()) 127 | top3 = round(len(testQ.query('Rank <= 3'))/len(testQ), 3) 128 | 129 | print('Top %d important tokens selected per Class.' %topN) 130 | print('Average of rank: ' + str(AR)) 131 | print('Percentage of questions find answers in the first 3 choices: ' + str(top3)) 132 | 133 | 134 | ######################################### 135 | # Log Parameters and Performance 136 | ######################################### 137 | 138 | # initialize the logger 139 | run_logger = get_azureml_logger() 140 | 141 | # log performance. 142 | run_logger.log("Top N Tokens Selected", topN) 143 | run_logger.log("Average Rank", AR) 144 | run_logger.log("Top 3 Percentage", top3) 145 | 146 | 147 | 148 | if __name__ == "__main__": 149 | main() 150 | print("\nRun is complete!") 151 | -------------------------------------------------------------------------------- /scripts/random_forest.py: -------------------------------------------------------------------------------- 1 | ################################################################################################################################### 2 | # This script is used for hyperparameter tunning of the Random Forest model described in the Part 3: Model Training and Evaluation. 3 | # To run this script, please enter two non-negative integers seperated by a space in the "Argument" box above the screen. 4 | # The first argument indicates the number of important tokens that we selected for each answer class. 5 | # The second argument indicates the number of trees to be constructed in the Random Forest model. 6 | ################################################################################################################################### 7 | 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import os, sys, warnings 12 | from azureml.logging import get_azureml_logger 13 | from sklearn.ensemble import RandomForestClassifier 14 | sys.path.append("") 15 | from modules.feature_extractor import (tokensToIds, countMatrix, priorProbabilityAnswer, posterioriProb, 16 | feature_selection, featureWeights, wordProbabilityInAnswer, 17 | wordProbabilityNotinAnswer, normalizeTF, softmax) 18 | from naive_bayes import (rank) 19 | warnings.filterwarnings("ignore") 20 | 21 | run_logger = get_azureml_logger() 22 | run_logger.log('amlrealworld.QnA-matching.random-forest','true') 23 | 24 | ######################################### 25 | # User Defined Functions 26 | ######################################### 27 | 28 | # train one-vs-rest classifier using NB scores as features. 29 | def ovrClassifier(trainLabels, x_wTrain, x_wTest, NBWeights, clf, ratio): 30 | uniqueLabel = np.unique(trainLabels) 31 | dummyLabels = pd.get_dummies(trainLabels) 32 | numTest = x_wTest.shape[1] 33 | Y_test_prob = np.zeros(shape=(numTest, len(uniqueLabel))) 34 | 35 | for i in range(len(uniqueLabel)): 36 | X_train_all, Y_train_all = x_wTrain.T * NBWeights[:, i], dummyLabels.iloc[:, i] 37 | X_test = x_wTest.T * NBWeights[:, i] 38 | 39 | # with sample selection. 
40 | if ratio is not None: 41 | # ratio = # of Negative/# of Positive 42 | posIdx = np.where(Y_train_all == 1)[0] 43 | negIdx = np.random.choice(np.where(Y_train_all == 0)[0], ratio*len(posIdx)) 44 | allIdx = np.concatenate([posIdx, negIdx]) 45 | X_train, Y_train = X_train_all[allIdx], Y_train_all.iloc[allIdx] 46 | else: # without sample selection. 47 | X_train, Y_train = X_train_all, Y_train_all 48 | 49 | clf.fit(X_train, Y_train) 50 | if hasattr(clf, "decision_function"): 51 | Y_test_prob[:, i] = clf.decision_function(X_test) 52 | else: 53 | Y_test_prob[:, i] = clf.predict_proba(X_test)[:, 1] 54 | 55 | return softmax(Y_test_prob) 56 | 57 | 58 | ######################################### 59 | # Main Function 60 | ######################################### 61 | 62 | def main(): 63 | 64 | ######################################### 65 | # Accept One Argument as Input 66 | ######################################### 67 | 68 | try: 69 | topN = int(sys.argv[1]) 70 | n_estimators = int(sys.argv[2]) 71 | except IndexError: 72 | print("This script takes two arguments. Please enter valid non-negative integer numbers.\n") 73 | raise 74 | 75 | 76 | ######################################### 77 | # Access trainQ and testQ from Part 2 78 | ######################################### 79 | 80 | workfolder = os.environ.get('AZUREML_NATIVE_SHARE_DIRECTORY') 81 | 82 | # paths to trainQ and testQ. 83 | trainQ_path = os.path.join(workfolder, 'trainQ_part2') 84 | testQ_path = os.path.join(workfolder, 'testQ_part2') 85 | 86 | # load the training and test data. 87 | trainQ = pd.read_csv(trainQ_path, sep='\t', index_col='Id', encoding='latin1') 88 | testQ = pd.read_csv(testQ_path, sep='\t', index_col='Id', encoding='latin1') 89 | 90 | 91 | ######################################### 92 | # Extract Features 93 | ######################################### 94 | 95 | token2IdHashInit = tokensToIds(trainQ['Tokens'], featureHash=None) 96 | 97 | # get unique answerId in ascending order 98 | uniqueAnswerId = list(np.unique(trainQ['AnswerId'])) 99 | 100 | # calculate the count matrix of all training questions. 101 | N_wAInit = countMatrix(trainQ, token2IdHashInit, 'AnswerId', uniqueAnswerId) 102 | 103 | P_A = priorProbabilityAnswer(trainQ['AnswerId'], uniqueAnswerId) 104 | P_Aw = posterioriProb(N_wAInit, P_A, uniqueAnswerId) 105 | 106 | # select top N important tokens per answer class. 
107 | featureHash = feature_selection(P_Aw, token2IdHashInit, topN=topN) 108 | token2IdHash = tokensToIds(trainQ['Tokens'], featureHash=featureHash) 109 | 110 | N_wA = countMatrix(trainQ, token2IdHash, 'AnswerId', uniqueAnswerId) 111 | 112 | alpha = 0.0001 113 | P_w = featureWeights(N_wA, alpha) 114 | 115 | beta = 0.0001 116 | P_wA = wordProbabilityInAnswer(N_wA, P_w, beta) 117 | P_wNotA = wordProbabilityNotinAnswer(N_wA, P_w, beta) 118 | 119 | x_wTrain = normalizeTF(trainQ, token2IdHash) 120 | x_wTest = normalizeTF(testQ, token2IdHash) 121 | 122 | 123 | ######################################### 124 | # Train Naive Bayes Classifier 125 | ######################################### 126 | 127 | NBWeights = np.log(P_wA / P_wNotA) 128 | clf = RandomForestClassifier(n_estimators=n_estimators, criterion='entropy', random_state=1) 129 | 130 | 131 | ######################################### 132 | # Predict Probabilities on Test 133 | ######################################### 134 | 135 | Y_test_prob = ovrClassifier(trainQ["AnswerId"], x_wTrain, x_wTest, NBWeights, clf, ratio=3) 136 | 137 | 138 | ######################################### 139 | # Evaluate Model Performance 140 | ######################################### 141 | # We use two evaluation matrices (Average Rank and Top 3 Percentage) to test our model performance. 142 | # The Average Rank can be interpreted as in average at which position we can find the correct answer among all available answers for a given question. 143 | # The Top 3 Percentage can be interpreted as how many percentage of the new questions that we can find their correct answers in the first 3 choices. 144 | # sort the similarity scores in descending order and map them to the corresponding AnswerId in Answer set 145 | 146 | testQ = rank(testQ, Y_test_prob, uniqueAnswerId) 147 | 148 | AR = np.floor(testQ['Rank'].mean()) 149 | top3 = round(len(testQ.query('Rank <= 3'))/len(testQ), 3) 150 | 151 | print('Top %d important tokens selected per Class.' %topN) 152 | print('# of trees in the Random Forest: ' + str(n_estimators)) 153 | print('Average of rank: ' + str(AR)) 154 | print('Percentage of questions find answers in the first 3 choices: ' + str(top3)) 155 | 156 | 157 | ######################################### 158 | # Log Parameters and Performance 159 | ######################################### 160 | 161 | # initialize the logger. 162 | run_logger = get_azureml_logger() 163 | 164 | # log performance. 165 | run_logger.log("Top N Tokens Selected", topN) 166 | run_logger.log("Number of Trees", n_estimators) 167 | run_logger.log("Average Rank", AR) 168 | run_logger.log("Top 3 Percentage", top3) 169 | 170 | 171 | if __name__ == "__main__": 172 | main() 173 | print("\nRun is complete!") --------------------------------------------------------------------------------
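To make the two evaluation metrics reported by the notebook and by both scripts concrete, here is a small, purely illustrative example using made-up ranks for five hypothetical test questions:

```python
import numpy as np
import pandas as pd

# Hypothetical ranks of the correct answer for five test questions.
toy = pd.DataFrame({'Rank': [1, 2, 7, 3, 12]})

# Average Rank: floor of the mean rank -> floor(25 / 5) = 5.0
AR = np.floor(toy['Rank'].mean())

# Top 3 Percentage: fraction of questions whose correct answer is ranked <= 3
# -> 3 of the 5 questions, i.e. 0.6
top3 = round(len(toy.query('Rank <= 3')) / len(toy), 3)

print('Average of rank: ' + str(AR))
print('Percentage of questions with the answer in the first 3 choices: ' + str(top3))
```

A lower Average Rank and a higher Top 3 Percentage both indicate better question-to-answer matching.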