├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ └── python-publish.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── data_preparation.py ├── data_transformations.py ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ ├── data_transformations.rst │ ├── define_multi_task_model.rst │ ├── examples.rst │ ├── index.rst │ ├── infering.rst │ ├── license.rst │ ├── multi_task.png │ ├── quickstart.rst │ ├── shared_encoder.rst │ ├── task_formats.rst │ └── training.rst ├── examples ├── answerability_detection │ ├── answerability_detection_msmarco.ipynb │ ├── tasks_file_answerability.yml │ └── transform_file_answerability.yml ├── entailment_detection │ ├── entailment_snli.ipynb │ ├── tasks_file_snli.yml │ └── transform_file_snli.yml ├── intent_ner_fragment │ ├── intent_ner_fragment.ipynb │ ├── snips_data │ │ ├── snips_dev.txt │ │ ├── snips_test.txt │ │ └── snips_train.txt │ ├── tasks_file_snips.yml │ └── transform_file_snips.yml ├── ner_pos_tagging │ ├── coNLL_data │ │ ├── coNLL_testa.txt │ │ ├── coNLL_testb.txt │ │ └── coNLL_train.txt │ ├── ner_pos_tagging_conll.ipynb │ ├── tasks_file_conll.yml │ └── transform_file_conll.yml ├── query_correctness │ ├── query_correctness.ipynb │ ├── query_correctness_data │ │ ├── dev.tsv │ │ ├── test.tsv │ │ └── train.tsv │ ├── tasks_file_query_correctness.yml │ └── transform_file_query_correctness.yml ├── query_pair_similarity │ ├── query_similarity_qqp.ipynb │ ├── tasks_file_qqp.yml │ └── transform_file_qqp.yml ├── query_type_detection │ ├── query_type_detection.ipynb │ ├── tasks_file_querytype.yml │ └── transform_file_querytype.yml └── sentiment_analysis │ ├── IMDb_sentiment_analysis.ipynb │ ├── tasks_file_imdb.yml │ └── transform_file_imdb.yml ├── infer_pipeline.py ├── logger_.py ├── models ├── data_manager.py ├── dropout.py ├── eval.py ├── loss.py └── model.py ├── requirements.txt ├── run_inference.py ├── train.py └── utils ├── data_utils.py ├── eval_metrics.py ├── task_utils.py ├── tranform_functions.py └── transform_utils.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 
39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS Files 2 | .DS_Store 3 | 4 | #jupyter lab checkpoints 5 | .ipynb_checkpoints 6 | 7 | #docs 8 | docs/source/_build 9 | docs/build 10 | 11 | #data 12 | data 13 | 14 | #vs code 15 | .vscode 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | pip-wheel-metadata/ 40 | share/python-wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | MANIFEST 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .nox/ 60 | .coverage 61 | .coverage.* 62 | .cache 63 | nosetests.xml 64 | coverage.xml 65 | *.cover 66 | *.py,cover 67 | .hypothesis/ 68 | .pytest_cache/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | local_settings.py 77 | db.sqlite3 78 | db.sqlite3-journal 79 | 80 | # Flask stuff: 81 | instance/ 82 | .webassets-cache 83 | 84 | # Scrapy stuff: 85 | .scrapy 86 | 87 | # Sphinx documentation 88 | docs/_build/ 89 | 90 | # PyBuilder 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 111 | __pypackages__/ 112 | 113 | # Celery stuff 114 | celerybeat-schedule 115 | celerybeat.pid 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at swapan@haptik.co. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /data_transformations.py: -------------------------------------------------------------------------------- 1 | ''' 2 | For transforming raw data in different formats into the standard tsv format 3 | consumed for multi-task training 4 | ''' 5 | import argparse 6 | import os 7 | from utils.transform_utils import TransformParams 8 | from utils.data_utils import TRANSFORM_FUNCS 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--transform_file', type=str, required=True, 13 | default='transform_file.yml', help="path to the yml transform file") 14 | args = parser.parse_args() 15 | # making transform params 16 | transformParams = TransformParams(args.transform_file) 17 | 18 | for transformName, transformFn in transformParams.transformFnMap.items(): 19 | transformParameters = transformParams.transformParamsMap[transformName] 20 | dataDir = transformParams.readDirMap[transformName] 21 | assert os.path.exists(dataDir), "{} doesn't exist".format(dataDir) 22 | saveDir = transformParams.saveDirMap[transformName] 23 | if not os.path.exists(saveDir): 24 | os.makedirs(saveDir) 25 | isTrain = True 26 | for file in transformParams.readFileNamesMap[transformName]: 27 | # calling the respective transform function on the file 28 | TRANSFORM_FUNCS[transformFn](dataDir = dataDir, readFile=file, 29 | wrtDir=saveDir, transParamDict=transformParameters, 30 | isTrainFile=isTrain) 31 | # only the first file will be considered as train file for making label map 32 | isTrain = False 33 | 34 | 35 | if __name__ == "__main__": 36 | main() -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('../../')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | import sphinx_rtd_theme 23 | project = 'multi-task-NLP' 24 | copyright = '2020, Jio Haptik Technologies Limited' 25 | author = 'saransh mehta' 26 | 27 | # The short X.Y version 28 | version = '' 29 | # The full version, including alpha/beta/rc tags 30 | release = '0.0.1' 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.napoleon', 45 | 'sphinx.ext.todo', 46 | 'sphinx.ext.mathjax', 47 | 'sphinx.ext.viewcode', 48 | 'sphinx.ext.githubpages', 49 | 'sphinx_rtd_theme', 50 | 'sphinx.ext.autosectionlabel' 51 | ] 52 | 53 | # Add any paths that contain templates here, relative to this directory. 54 | templates_path = ['_templates'] 55 | 56 | # The suffix(es) of source filenames. 57 | # You can specify multiple suffix as a list of string: 58 | # 59 | source_suffix = ['.rst', '.md'] 60 | #source_suffix = '.rst' 61 | 62 | # The master toctree document. 63 | master_doc = 'index' 64 | 65 | # The language for content autogenerated by Sphinx. 
Refer to documentation 66 | # for a list of supported languages. 67 | # 68 | # This is also used if you do content translation via gettext catalogs. 69 | # Usually you set "language" from the command line for these cases. 70 | language = None 71 | 72 | # List of patterns, relative to source directory, that match files and 73 | # directories to ignore when looking for source files. 74 | # This pattern also affects html_static_path and html_extra_path. 75 | exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] 76 | 77 | # The name of the Pygments (syntax highlighting) style to use. 78 | pygments_style = None 79 | 80 | 81 | # -- Options for HTML output ------------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | 87 | html_theme = "sphinx_rtd_theme" 88 | 89 | # Theme options are theme-specific and customize the look and feel of a theme 90 | # further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | # Custom sidebar templates, must be a dictionary that maps document names 101 | # to template names. 102 | # 103 | # The default sidebars (for documents that don't match any pattern) are 104 | # defined by theme itself. Builtin themes are using these templates by 105 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 106 | # 'searchbox.html']``. 107 | # 108 | # html_sidebars = {} 109 | 110 | 111 | # -- Options for HTMLHelp output --------------------------------------------- 112 | 113 | # Output file base name for HTML help builder. 114 | htmlhelp_basename = 'multi_task_NLPdoc' 115 | 116 | 117 | # -- Options for LaTeX output ------------------------------------------------ 118 | 119 | latex_elements = { 120 | # The paper size ('letterpaper' or 'a4paper'). 121 | # 122 | # 'papersize': 'letterpaper', 123 | 124 | # The font size ('10pt', '11pt' or '12pt'). 125 | # 126 | # 'pointsize': '10pt', 127 | 128 | # Additional stuff for the LaTeX preamble. 129 | # 130 | # 'preamble': '', 131 | 132 | # Latex figure (float) alignment 133 | # 134 | # 'figure_align': 'htbp', 135 | } 136 | 137 | # Grouping the document tree into LaTeX files. List of tuples 138 | # (source start file, target name, title, 139 | # author, documentclass [howto, manual, or own class]). 140 | latex_documents = [ 141 | (master_doc, 'multi_task_NLP.tex', 'multi\\_task\\_NLP Documentation', 142 | 'saransh mehta', 'manual'), 143 | ] 144 | 145 | 146 | # -- Options for manual page output ------------------------------------------ 147 | 148 | # One entry per manual page. List of tuples 149 | # (source start file, name, description, authors, manual section). 150 | man_pages = [ 151 | (master_doc, 'multi_task_nlp', 'multi_task_NLP Documentation', 152 | [author], 1) 153 | ] 154 | 155 | 156 | # -- Options for Texinfo output ---------------------------------------------- 157 | 158 | # Grouping the document tree into Texinfo files. 
List of tuples 159 | # (source start file, target name, title, author, 160 | # dir menu entry, description, category) 161 | texinfo_documents = [ 162 | (master_doc, 'multi_task_NLP', 'multi_task_NLP Documentation', 163 | author, 'multi_task_NLP', 'One line description of project.', 164 | 'Miscellaneous'), 165 | ] 166 | 167 | 168 | # -- Options for Epub output ------------------------------------------------- 169 | 170 | # Bibliographic Dublin Core info. 171 | epub_title = project 172 | 173 | # The unique identifier of the text. This can be an ISBN number 174 | # or the project homepage. 175 | # 176 | # epub_identifier = '' 177 | 178 | # A unique identification for the text. 179 | # 180 | # epub_uid = '' 181 | 182 | # A list of files that should not be packed into the epub file. 183 | epub_exclude_files = ['search.html'] 184 | 185 | 186 | # -- Extension configuration ------------------------------------------------- 187 | 188 | # -- Options for todo extension ---------------------------------------------- 189 | 190 | # If true, `todo` and `todoList` produce output, else they produce nothing. 191 | todo_include_todos = True 192 | -------------------------------------------------------------------------------- /docs/source/data_transformations.rst: -------------------------------------------------------------------------------- 1 | Data transformations 2 | ==================== 3 | 4 | It is very likely that the data you have is not in the format required by the library. 5 | Hence, data transformations provide a way to convert raw data into the required standard tsv format. 6 | 7 | Transform functions 8 | ------------------- 9 | 10 | Transform functions are predefined functions that perform these transformations. 11 | Each function is defined to take raw data in a certain format, perform the defined transformation steps and 12 | create the respective ``tsv`` file. 13 | 14 | Sample transform functions 15 | ^^^^^^^^^^^^^^^^^^^^^^^^^^ 16 | .. automodule:: utils.tranform_functions 17 | :members: snips_intent_ner_to_tsv, snli_entailment_to_tsv, create_fragment_detection_tsv, 18 | msmarco_answerability_detection_to_tsv, msmarco_query_type_to_tsv, bio_ner_to_tsv, coNLL_ner_pos_to_tsv, qqp_query_similarity_to_tsv, 19 | query_correctness_to_tsv, imdb_sentiment_data_to_tsv 20 | 21 | Your own transform function 22 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 23 | In case you need to convert data in some custom format into the standard tsv format, you can do so 24 | by writing your own transform function. Keep the following points in mind while writing your function: 25 | 26 | - The function must take the standard input arguments like the :ref:`sample transform functions`. Any extra function-specific parameters can be passed through the ``transParamDict`` argument. 27 | 28 | - You should add the function to the ``utils/tranform_functions.py`` file. 29 | 30 | - You should add a name map for the function in the ``utils/data_utils.py`` file under the ``TRANSFORM_FUNCS`` map. This 31 | step is required for the transform file to recognize your function. 32 | 33 | - You can then use your function in the :ref:`transform file`; a minimal sketch of a custom transform function follows below. 34 |
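For illustration, here is a minimal sketch of what a custom transform function could look like. The signature mirrors the standard arguments with which ``data_transformations.py`` invokes every entry of the ``TRANSFORM_FUNCS`` map (``dataDir``, ``readFile``, ``wrtDir``, ``transParamDict``, ``isTrainFile``); the function name, the assumed raw input format and the output tsv layout are purely hypothetical and should be adapted to your data.

.. code-block:: python

    import os

    def my_custom_to_tsv(dataDir, readFile, wrtDir, transParamDict, isTrainFile=False):
        # 'col_sep' is an assumed extra parameter supplied via transform_params
        sep = transParamDict.get('col_sep', ',')
        wrtFile = os.path.join(wrtDir, '{}.tsv'.format(readFile.split('.')[0]))
        with open(os.path.join(dataDir, readFile)) as inFile, open(wrtFile, 'w') as outFile:
            for uid, line in enumerate(inFile):
                # assumed raw format: <label><sep><sentence> per line
                label, sentence = line.strip().split(sep, 1)
                outFile.write('{}\t{}\t{}\n'.format(uid, label, sentence))
        # when isTrainFile is True, this is also the place to dump a label map
        # file if your task needs one (cf. data_transformations.py)

Once registered under a name in the ``TRANSFORM_FUNCS`` map, a transform file can refer to this function via its ``transform_func`` parameter.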
35 | Transform File 36 | -------------- 37 | 38 | You can easily use the sample transformation functions or your own transformation function 39 | by defining a YAML format ``transform_file``. Say you want to perform these transformations - 40 | **sample_transform1**, **sample_transform2**, ..., **sample_transform5**. 41 | 42 | Following is an example of the transform file, 43 | :: 44 | 45 | sample_transform1: 46 | transform_func: snips_intent_ner_to_tsv 47 | read_file_names: 48 | - snips_train.txt 49 | - snips_dev.txt 50 | - snips_test.txt 51 | read_dir: snips_data 52 | save_dir: demo_transform 53 | 54 | 55 | sample_transform2: 56 | transform_func: snli_entailment_to_tsv 57 | read_file_names: 58 | - snli_train.jsonl 59 | - snli_dev.jsonl 60 | - snli_test.jsonl 61 | read_dir : snli_data 62 | save_dir: demo_transform 63 | 64 | sample_transform3: 65 | transform_func: bio_ner_to_tsv 66 | transform_params: 67 | save_prefix : sample 68 | tag_col : 1 69 | col_sep : " " 70 | sen_sep : "\n" 71 | read_file_names: 72 | - coNLL_train.txt 73 | - coNLL_testa.txt 74 | - coNLL_testb.txt 75 | 76 | read_dir: coNLL_data 77 | save_dir: demo_transform 78 | 79 | sample_transform4: 80 | transform_func: fragment_detection_to_tsv 81 | transform_params: 82 | data_frac : 0.2 83 | seq_len_right : 3 84 | seq_len_left : 2 85 | sep : "\t" 86 | query_col : 2 87 | read_file_names: 88 | - int_snips_train.tsv 89 | - int_snips_dev.tsv 90 | - int_snips_test.tsv 91 | 92 | read_dir: data 93 | save_dir: demo_transform 94 | 95 | sample_transform5: 96 | transform_func: msmarco_query_type_to_tsv 97 | transform_params: 98 | data_frac : 0.2 99 | read_file_names: 100 | - train_v2.1.json 101 | - dev_v2.1.json 102 | - eval_v2.1_public.json 103 | 104 | read_dir: msmarco_qna_data 105 | save_dir: demo_transform 106 | 107 | 108 | NOTE:- The transform names (sample_transform1, sample_transform2, ...) are unique identifiers for the transforms, hence they must always be distinct. 109 | 110 | Transform file parameters 111 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 112 | 113 | Detailed description of the parameters available in the transform file. 114 | 115 | - ``transform_func`` `(required)` : Name of the :ref:`transform function` to use. 116 | - ``transform_params`` `(optional)` : Dictionary of function-specific parameters which will go into the ``transParamDict`` parameter of the function. 117 | - ``read_file_names`` `(required)` : List of raw data files for transformations. The first file will be considered as the **train file** and will be used to create the label 118 | map file when required. 119 | - ``read_dir`` `(required)` : Directory containing the input files. 120 | - ``save_dir`` `(required)` : Directory to save the transformed tsv/label map files. 121 | 122 | 123 | Running data transformations 124 | ---------------------------- 125 | 126 | Once you have made the :ref:`transform file` with all the transform operations, 127 | you can run data transformations with the following terminal command. 128 | 129 | .. code-block:: console 130 | 131 | $ python data_transformations.py \ 132 | --transform_file 'transform_file.yml' 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /docs/source/define_multi_task_model.rst: -------------------------------------------------------------------------------- 1 | How to define your multi-task model? 2 | ==================================== 3 | 4 | Let’s consider you have three tasks - **TaskA**, **TaskB** and **TaskC** - to train together. TaskA is of single sentence classification type, 5 | TaskB is of NER type and TaskC is of sentence pair classification type. 6 | You can define a task file mentioning the required details about the tasks in the following YAML format.
7 | :: 8 | 9 | TaskA: 10 | model_type: BERT 11 | config_name: bert-base-uncased 12 | dropout_prob: 0.05 13 | label_map_or_file: 14 | - label1 15 | - label2 16 | - label3 17 | metrics: 18 | - accuracy 19 | loss_type: CrossEntropyLoss 20 | task_type: SingleSenClassification 21 | file_names: 22 | - taskA_train.tsv 23 | - taskA_dev.tsv 24 | - taskA_test.tsv 25 | 26 | TaskB: 27 | model_type: BERT 28 | config_name: bert-base-uncased 29 | dropout_prob: 0.3 30 | label_map_or_file: data/taskB_train_label_map.joblib 31 | metrics: 32 | - seq_f1 33 | - seq_precision 34 | - seq_recall 35 | loss_type: NERLoss 36 | task_type: NER 37 | file_names: 38 | - taskB_train.tsv 39 | - taskB_dev.tsv 40 | - taskB_test.tsv 41 | 42 | TaskC: 43 | model_type: BERT 44 | config_name: bert-base-uncased 45 | dropout_prob: 0.05 46 | metrics: 47 | - accuracy 48 | loss_type: CrossEntropyLoss 49 | class_num: 2 50 | task_type: SentencePairClassification 51 | file_names: 52 | - taskC_train.tsv 53 | - taskC_dev.tsv 54 | - taskC_test.tsv 55 | 56 | A few points to keep in mind while making the task file: 57 | 58 | - This file should contain all the tasks for which you want to train a single model. 59 | - The file can have either a single task or multiple tasks. In case only a single task is mentioned, the model will act like a single-task model. 60 | - The task names (TaskA, TaskB and TaskC) are unique identifiers for the tasks, hence they must always be distinct. 61 | - The model type for all the tasks mentioned in the file must be the same, as the library uses a single shared encoder model for all these tasks. 62 | 63 | Task file parameters 64 | -------------------- 65 | 66 | Detailed description of the parameters available in the task file. 67 | 68 | - ``task_type`` `(required)` : Format of the task as described in :ref:`Task types`. 69 | 70 | - ``file_names`` `(required)` : List of standard data tsv file names required for the task. The first file is considered as the **train** file, the second file as the **dev** file and the third file as the **test** file. 71 | 72 | - ``model_type`` `(required)` : Type of shared encoder model to use. The model type for all the tasks mentioned in the file must be the same. You can refer to :ref:`Model type` for selecting the model type. 73 | 74 | - ``config_name`` `(optional)` : Config of the encoder model. You can refer to :ref:`Model type` for selecting the model type config. In case this parameter is not present, the default config will be used. 75 | 76 | - ``class_num`` `(required/optional)` : Number of classes present for classification. This parameter is optional if label_map_or_file is provided, required otherwise. 77 | 78 | - ``label_map_or_file`` `(required/optional)` : 79 | 80 | - In case labels are strings, this is the list of unique labels. 81 | - You can also give a joblib dumped dictionary map file like {‘label1’:0, ‘label2’:1, ..}; a sketch for creating such a file follows this list. 82 | - If you’re using :ref:`Data Transformations` to create the data files, give the path to the label_map file created along with the transformed files here. 83 | 84 | - ``loss_type`` `(required)` : Type of loss for training as defined in :ref:`Losses`. 85 | 86 | - ``dropout_prob`` `(optional)`: Dropout probability to use between encoder hidden outputs and task-specific headers. 87 | 88 | - ``metrics`` `(optional)` : List of metrics to use during evaluation as defined in :ref:`Metrics`. 89 | 90 | - ``loss_weight`` `(optional)`: Loss weight value (between 0 and 1) for the individual task.
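To illustrate the ``label_map_or_file`` parameter above, here is a minimal sketch of creating a joblib dumped label map file. The label names and path are hypothetical (mirroring the TaskB example); only ``joblib.dump``/``joblib.load`` are assumed.

.. code-block:: python

    import joblib

    # hypothetical label map; the values become the class indices
    label_map = {'label1': 0, 'label2': 1, 'label3': 2}
    joblib.dump(label_map, 'data/taskA_label_map.joblib')

    # the task file can now point to this path under label_map_or_file
    # instead of listing the labels inline
    assert joblib.load('data/taskA_label_map.joblib') == label_map

Note that when a label map file is given this way, ``class_num`` can be omitted, since the number of classes is implied by the map.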
91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | =========== 3 | Here you can find various NLP (especially conversational AI) tasks as examples, which you can train in either a multi-task or a single-task manner by following the simple steps mentioned in the notebooks. 4 | 5 | Example-1 Intent detection, NER, Fragment detection 6 | --------------------------------------------------- 7 | 8 | **Tasks Description** 9 | 10 | ``Intent Detection`` :- This is a single sentence classification task where an `intent` specifies which class the data sample belongs to. 11 | 12 | ``NER`` :- This is a Named Entity Recognition/ Sequence Labelling/ Slot filling task where individual words of the sentence are tagged with the entity label they belong to. The words which don't belong to any entity label are simply labeled as "O". 13 | 14 | ``Fragment Detection`` :- This is modeled as a single sentence classification task which detects whether a sentence is incomplete (fragment) or not (non-fragment). 15 | 16 | **Conversational Utility** :- Intent detection is one of the fundamental components of a conversational system as it gives a broad understanding of the category/domain the sentence/query belongs to. 17 | 18 | NER helps in extracting values for required entities (eg. location, date-time) from the query. 19 | 20 | Fragment detection is a very useful piece in a conversational system as knowing if a query/sentence is incomplete can aid in discarding bad queries beforehand. 21 | 22 | **Intent Detection** 23 | 24 | Query: I need a reservation for a bar in bangladesh on feb the 11th 2032 25 | 26 | Intent: BookRestaurant 27 | 28 | **NER** 29 | 30 | 31 | Query: ['book', 'a', 'spot', 'for', 'ten', 'at', 'a', 'top-rated', 'caucasian', 'restaurant', 'not', 'far', 'from', 'selmer'] 32 | 33 | NER tags: ['O', 'O', 'O', 'O', 'B-party_size_number', 'O', 'O', 'B-sort', 'B-cuisine', 'B-restaurant_type', 'B-spatial_relation', 'I-spatial_relation', 'O', 'B-city'] 34 | 35 | 36 | **Fragment Detection** 37 | 38 | 39 | Query: a reservation for 40 | 41 | Label: fragment 42 | 43 | 44 | **Notebook** :- `intent_ner_fragment `_ 45 | 46 | **Transform file** :- `transform_file_snips `_ 47 | 48 | **Tasks file** :- `tasks_file_snips `_ 49 | 50 | Example-2 Recognising Textual Entailment 51 | ---------------------------------------- 52 | 53 | **Tasks Description** 54 | 55 | ``Entailment`` :- This is a sentence pair classification task which determines whether the second sentence in a sample can be inferred from the first. 56 | 57 | **Conversational Utility** :- In a conversational AI context, this task can be seen as determining whether the second sentence is similar to the first or not. Additionally, the probability score can also be used as a similarity score between the sentences. 58 | 59 | Query1: An old man with a package poses in front of an advertisement. 60 | 61 | Query2: A man poses in front of an ad. 62 | 63 | Label: entailment 64 | 65 | Query1: An old man with a package poses in front of an advertisement. 66 | 67 | Query2: A man poses in front of an ad for beer.
68 | 69 | Label: non-entailment 70 | 71 | 72 | 73 | **Notebook** :- `entailment_snli `_ 74 | 75 | **Transform file** :- `transform_file_snli `_ 76 | 77 | **Tasks file** :- `tasks_file_snli `_ 78 | 79 | 80 | 81 | Example-3 Answerability detection 82 | --------------------------------- 83 | **Tasks Description** 84 | 85 | ``answerability`` :- This is modeled as a sentence pair classification task where the first sentence is a query and the second sentence is a context passage. The objective of this task is to determine whether the query can be answered from the context passage or not. 86 | 87 | **Conversational Utility** :- This can be a useful component for building a question-answering/ machine comprehension based system. In such cases, it becomes very important to determine whether the given query can be answered with the given context passage or not before extracting/abstracting an answer from it. Performing question-answering for a query which is not answerable from the context could lead to incorrect answer extraction. 88 | 89 | Query: how much money did evander holyfield make 90 | 91 | Context: Evander Holyfield Net Worth. How much is Evander Holyfield Worth? Evander Holyfield Net Worth: Evander Holyfield is a retired American professional boxer who has a net worth of $500 thousand. A professional boxer, Evander Holyfield has fought at the Heavyweight, Cruiserweight, and Light-Heavyweight Divisions, and won a Bronze medal a the 1984 Olympic Games. 92 | 93 | Label: answerable 94 | 95 | **Notebook** :- `answerability_detection_msmarco `_ 96 | 97 | **Transform file** :- `transform_file_answerability `_ 98 | 99 | **Tasks file** :- `tasks_file_answerability `_ 100 | 101 | Example-4 Query type detection 102 | ------------------------------ 103 | 104 | **Tasks Description** 105 | 106 | ``querytype`` :- This is a single sentence classification task to determine what type (category) of answer is expected for the given query. The queries are divided into 5 major classes according to the answer expected for them. 107 | 108 | **Conversational Utility** :- While returning a response for a query, knowing what kind of answer is expected for the query can help in both curating and cross-verifying an answer according to the type. 109 | 110 | Query: what's the distance between destin florida and birmingham alabama? 111 | 112 | Label: NUMERIC 113 | 114 | Query: who is suing scott wolter 115 | 116 | Label: PERSON 117 | 118 | 119 | 120 | **Notebook** :- `query_type_detection `_ 121 | 122 | **Transform file** :- `transform_file_querytype `_ 123 | 124 | **Tasks file** :- `tasks_file_querytype `_ 125 | 126 | Example-5 POS tagging, NER tagging 127 | ---------------------------------- 128 | 129 | **Tasks Description** 130 | 131 | ``NER`` :- This is a Named Entity Recognition task where individual words of the sentence are tagged with the entity label they belong to. The words which don't belong to any entity label are simply labeled as "O". 132 | 133 | ``POS`` :- This is a Part of Speech tagging task. A part of speech is a category of words that have similar grammatical properties. Each word of the sentence is tagged with the part of speech label it belongs to. The words which don't belong to any part of speech label are simply labeled as "O". 134 | 135 | **Conversational Utility** :- In a conversational AI context, determining the syntactic parts of the sentence can help in extracting noun-phrases or important keyphrases from the sentence.
136 | 137 | Query: ['Despite', 'winning', 'the', 'Asian', 'Games', 'title', 'two', 'years', 'ago', ',', 'Uzbekistan', 'are', 'in', 'the', 'finals', 'as', 'outsiders', '.'] 138 | 139 | NER tags: ['O', 'O', 'O', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] 140 | 141 | POS tags: ['I-PP', 'I-VP', 'I-NP', 'I-NP', 'I-NP', 'I-NP', 'B-NP', 'I-NP', 'I-ADVP', 'O', 'I-NP', 'I-VP', 'I-PP', 'I-NP', 'I-NP', 'I-SBAR', 'I-NP', 'O'] 142 | 143 | 144 | 145 | **Notebook** :- `ner_pos_tagging_conll `_ 146 | 147 | **Transform file** :- `transform_file_conll `_ 148 | 149 | **Tasks file** :- `tasks_file_conll `_ 150 | 151 | Example-6 Query correctness 152 | --------------------------- 153 | 154 | **Tasks Description** 155 | 156 | ``querycorrectness`` :- This is modeled as a single sentence classification task identifying whether or not a query is structurally well formed. 157 | 158 | **Conversational Utility** :- Determining whether a query is well structured would help in enhancing query understanding and improve the reliability of tasks which depend on query structure to extract information. 159 | 160 | Query: What places have the oligarchy government ? 161 | 162 | Label: well-formed 163 | 164 | Query: What day of Diwali in 1980 ? 165 | 166 | Label: not well-formed 167 | 168 | 169 | 170 | **Notebook** :- `query_correctness `_ 171 | 172 | **Transform file** :- `transform_file_query_correctness `_ 173 | 174 | **Tasks file** :- `tasks_file_query_correctness `_ 175 | 176 | 177 | Example-7 Query similarity 178 | -------------------------- 179 | 180 | **Tasks Description** 181 | 182 | ``Query similarity`` :- This is a sentence pair classification task which determines whether the two queries in a sample are similar or not. 183 | 184 | **Conversational Utility** :- In a conversational AI context, this task can be seen as determining whether the second query is similar to the first or not. Additionally, the probability score can also be used as a similarity score between the queries. 185 | 186 | 187 | Query1: What is the most used word in Malayalam? 188 | 189 | Query2: What is meaning of the Malayalam word ""thumbatthu""? 190 | 191 | Label: not similar 192 | 193 | Query1: Which is the best compliment you have ever received? 194 | 195 | Query2: What's the best compliment you've got? 196 | 197 | Label: similar 198 | 199 | 200 | **Notebook** :- `query_similarity `_ 201 | 202 | **Transform file** :- `transform_file_qqp `_ 203 | 204 | **Tasks file** :- `tasks_file_qqp `_ 205 | 206 | Example-8 Sentiment Analysis 207 | ---------------------------- 208 | 209 | **Tasks Description** 210 | 211 | ``sentiment`` :- This is modeled as a single sentence classification task to determine whether a piece of text conveys a positive or negative sentiment. 212 | 213 | **Conversational Utility** :- To determine whether a review is positive or negative. 214 | 215 | Review: What I enjoyed most in this film was the scenery of Corfu, being Greek I adore my country and I liked the flattering director's point of view. Based on a true story during the years when Greece was struggling to stand on her own two feet through war, Nazis and hardship. 216 | An Italian soldier and a Greek girl fall in love but the times are hard and they have a lot of sacrifices to make. Nicholas Cage looking great in a uniform gives a passionate account of this unfulfilled (in the beginning) love.
I adored Christian Bale playing Mandras 217 | the heroine's husband-to-be, he looks very very good as a Greek, his personality matched the one of the Greek patriot! A true fighter in there, or what! One of the movies I would like to buy and keep it in my collection...for ever! 218 | 219 | Label: positive 220 | 221 | 222 | 223 | **Notebook** :- `IMDb_sentiment_analysis `_ 224 | 225 | **Transform file** :- `transform_file_imdb `_ 226 | 227 | **Tasks file** :- `tasks_file_imdb `_ 228 | 229 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | 2 | ============== 3 | multi-task-NLP 4 | ============== 5 | 6 | multi_task_NLP is a utility toolkit enabling NLP developers to easily train and infer a single model for multiple tasks. 7 | We support various data formats for the majority of NLI tasks and multiple transformer-based encoders (eg. BERT, Distil-BERT, ALBERT, RoBERTa, XLNET etc.) 8 | 9 | .. image:: multi_task.png 10 | :scale: 75% 11 | :align: center 12 | 13 | What is multi_task_NLP about? 14 | ----------------------------- 15 | 16 | Any conversational AI system involves building multiple components to perform various tasks and a pipeline to stitch all components together. 17 | Given the recent effectiveness of transformer-based models in NLP, it’s very common to build a transformer-based model to solve your use case. 18 | But having multiple such models running together for a conversational AI system can lead to expensive resource consumption and increased prediction latencies, and can make the system difficult to manage. 19 | This poses a real challenge for anyone who wants to build a conversational AI system in a simple way. 20 | 21 | multi_task_NLP gives you the capability to define multiple tasks together and train a single model which simultaneously learns on all defined tasks. 22 | This means one can perform multiple tasks with latency and resource consumption equivalent to a single task. 23 | 24 | Installation 25 | ------------ 26 | 27 | To use multi-task-NLP, you can clone the repository into the desired location on your system 28 | with the following terminal command. 29 | 30 | .. code-block:: console 31 | 32 | $ cd /desired/location/ 33 | $ git clone https://github.com/hellohaptik/multi-task-NLP.git 34 | $ cd multi-task-NLP 35 | $ pip install -r requirements.txt 36 | 37 | NOTE:- The library is built and tested using ``Python 3.7.3``. It is recommended to install the requirements in a virtual environment. 38 | 39 | Quickstart Guide 40 | ---------------- 41 | A quick guide to show how a single model can be trained for multiple NLI tasks in just 3 simple steps 42 | and with **no requirement to code!!** 43 | 44 | .. toctree:: 45 | quickstart 46 | 47 | Examples Guide 48 | -------------- 49 | We provide exemplar notebooks to demonstrate some conversational AI tasks which can be performed using our library. 50 | You can follow along with the `notebooks `_ to understand and train a multi-task model for the tasks. 51 | 52 | .. toctree:: 53 | :maxdepth: 2 54 | 55 | examples 56 | 57 | Step by Step Guide 58 | ------------------ 59 | A complete guide explaining all the components of multi-task-NLP in sequential order. 60 | 61 | ..
toctree:: 62 | :maxdepth: 2 63 | 64 | task_formats 65 | data_transformations 66 | shared_encoder 67 | define_multi_task_model 68 | training 69 | infering 70 | license 71 | 72 | -------------------------------------------------------------------------------- /docs/source/infering.rst: -------------------------------------------------------------------------------- 1 | How to Infer? 2 | ============= 3 | 4 | Once you have a multi-task model trained on your tasks, we provide a convenient and easy way to use it for getting 5 | predictions on samples through the **inference pipeline**. 6 | 7 | .. autoclass:: infer_pipeline.inferPipeline 8 | :members: -------------------------------------------------------------------------------- /docs/source/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | Apache License 5 | Version 2.0, January 2004 6 | http://www.apache.org/licenses/ 7 | 8 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, 13 | and distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by 16 | the copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all 19 | other entities that control, are controlled by, or are under common 20 | control with that entity. For the purposes of this definition, 21 | "control" means (i) the power, direct or indirect, to cause the 22 | direction or management of such entity, whether by contract or 23 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 24 | outstanding shares, or (iii) beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity 27 | exercising permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation 31 | source, and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but 35 | not limited to compiled object code, generated documentation, 36 | and conversions to other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or 39 | Object form, made available under the License, as indicated by a 40 | copyright notice that is included in or attached to the work 41 | (an example is provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object 44 | form, that is based on (or derived from) the Work and for which the 45 | editorial revisions, annotations, elaborations, or other modifications 46 | represent, as a whole, an original work of authorship. For the purposes 47 | of this License, Derivative Works shall not include works that remain 48 | separable from, or merely link (or bind by name) to the interfaces of, 49 | the Work and Derivative Works thereof. 50 | 51 | "Contribution" shall mean any work of authorship, including 52 | the original version of the Work and any modifications or additions 53 | to that Work or Derivative Works thereof, that is intentionally 54 | submitted to Licensor for inclusion in the Work by the copyright owner 55 | or by an individual or Legal Entity authorized to submit on behalf of 56 | the copyright owner. 
For the purposes of this definition, "submitted" 57 | means any form of electronic, verbal, or written communication sent 58 | to the Licensor or its representatives, including but not limited to 59 | communication on electronic mailing lists, source code control systems, 60 | and issue tracking systems that are managed by, or on behalf of, the 61 | Licensor for the purpose of discussing and improving the Work, but 62 | excluding communication that is conspicuously marked or otherwise 63 | designated in writing by the copyright owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity 66 | on behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of 70 | this License, each Contributor hereby grants to You a perpetual, 71 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 72 | copyright license to reproduce, prepare Derivative Works of, 73 | publicly display, publicly perform, sublicense, and distribute the 74 | Work and such Derivative Works in Source or Object form. 75 | 76 | 3. Grant of Patent License. Subject to the terms and conditions of 77 | this License, each Contributor hereby grants to You a perpetual, 78 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 79 | (except as stated in this section) patent license to make, have made, 80 | use, offer to sell, sell, import, and otherwise transfer the Work, 81 | where such license applies only to those patent claims licensable 82 | by such Contributor that are necessarily infringed by their 83 | Contribution(s) alone or by combination of their Contribution(s) 84 | with the Work to which such Contribution(s) was submitted. If You 85 | institute patent litigation against any entity (including a 86 | cross-claim or counterclaim in a lawsuit) alleging that the Work 87 | or a Contribution incorporated within the Work constitutes direct 88 | or contributory patent infringement, then any patent licenses 89 | granted to You under this License for that Work shall terminate 90 | as of the date such litigation is filed. 91 | 92 | 4. Redistribution. 
You may reproduce and distribute copies of the 93 | Work or Derivative Works thereof in any medium, with or without 94 | modifications, and in Source or Object form, provided that You 95 | meet the following conditions: 96 | 97 | (a) You must give any other recipients of the Work or 98 | Derivative Works a copy of this License; and 99 | 100 | (b) You must cause any modified files to carry prominent notices 101 | stating that You changed the files; and 102 | 103 | (c) You must retain, in the Source form of any Derivative Works 104 | that You distribute, all copyright, patent, trademark, and 105 | attribution notices from the Source form of the Work, 106 | excluding those notices that do not pertain to any part of 107 | the Derivative Works; and 108 | 109 | (d) If the Work includes a "NOTICE" text file as part of its 110 | distribution, then any Derivative Works that You distribute must 111 | include a readable copy of the attribution notices contained 112 | within such NOTICE file, excluding those notices that do not 113 | pertain to any part of the Derivative Works, in at least one 114 | of the following places: within a NOTICE text file distributed 115 | as part of the Derivative Works; within the Source form or 116 | documentation, if provided along with the Derivative Works; or, 117 | within a display generated by the Derivative Works, if and 118 | wherever such third-party notices normally appear. The contents 119 | of the NOTICE file are for informational purposes only and 120 | do not modify the License. You may add Your own attribution 121 | notices within Derivative Works that You distribute, alongside 122 | or as an addendum to the NOTICE text from the Work, provided 123 | that such additional attribution notices cannot be construed 124 | as modifying the License. 125 | 126 | You may add Your own copyright statement to Your modifications and 127 | may provide additional or different license terms and conditions 128 | for use, reproduction, or distribution of Your modifications, or 129 | for any such Derivative Works as a whole, provided Your use, 130 | reproduction, and distribution of the Work otherwise complies with 131 | the conditions stated in this License. 132 | 133 | 5. Submission of Contributions. Unless You explicitly state otherwise, 134 | any Contribution intentionally submitted for inclusion in the Work 135 | by You to the Licensor shall be under the terms and conditions of 136 | this License, without any additional terms or conditions. 137 | Notwithstanding the above, nothing herein shall supersede or modify 138 | the terms of any separate license agreement you may have executed 139 | with Licensor regarding such Contributions. 140 | 141 | 6. Trademarks. This License does not grant permission to use the trade 142 | names, trademarks, service marks, or product names of the Licensor, 143 | except as required for reasonable and customary use in describing the 144 | origin of the Work and reproducing the content of the NOTICE file. 145 | 146 | 7. Disclaimer of Warranty. Unless required by applicable law or 147 | agreed to in writing, Licensor provides the Work (and each 148 | Contributor provides its Contributions) on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 150 | implied, including, without limitation, any warranties or conditions 151 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 152 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 153 | appropriateness of using or redistributing the Work and assume any 154 | risks associated with Your exercise of permissions under this License. 155 | 156 | 8. Limitation of Liability. In no event and under no legal theory, 157 | whether in tort (including negligence), contract, or otherwise, 158 | unless required by applicable law (such as deliberate and grossly 159 | negligent acts) or agreed to in writing, shall any Contributor be 160 | liable to You for damages, including any direct, indirect, special, 161 | incidental, or consequential damages of any character arising as a 162 | result of this License or out of the use or inability to use the 163 | Work (including but not limited to damages for loss of goodwill, 164 | work stoppage, computer failure or malfunction, or any and all 165 | other commercial damages or losses), even if such Contributor 166 | has been advised of the possibility of such damages. 167 | 168 | 9. Accepting Warranty or Additional Liability. While redistributing 169 | the Work or Derivative Works thereof, You may choose to offer, 170 | and charge a fee for, acceptance of support, warranty, indemnity, 171 | or other liability obligations and/or rights consistent with this 172 | License. However, in accepting such obligations, You may act only 173 | on Your own behalf and on Your sole responsibility, not on behalf 174 | of any other Contributor, and only if You agree to indemnify, 175 | defend, and hold each Contributor harmless for any liability 176 | incurred by, or claims asserted against, such Contributor by reason 177 | of your accepting any such warranty or additional liability. 178 | 179 | END OF TERMS AND CONDITIONS 180 | 181 | APPENDIX: How to apply the Apache License to your work. 182 | 183 | To apply the Apache License to your work, attach the following 184 | boilerplate notice, with the fields enclosed by brackets "[]" 185 | replaced with your own identifying information. (Don't include 186 | the brackets!) The text should be enclosed in the appropriate 187 | comment syntax for the file format. We also recommend that a 188 | file or class name and description of purpose be included on the 189 | same "printed page" as the copyright notice for easier 190 | identification within third-party archives. 191 | 192 | Copyright [yyyy] [name of copyright owner] 193 | 194 | Licensed under the Apache License, Version 2.0 (the "License"); 195 | you may not use this file except in compliance with the License. 196 | You may obtain a copy of the License at 197 | 198 | http://www.apache.org/licenses/LICENSE-2.0 199 | 200 | Unless required by applicable law or agreed to in writing, software 201 | distributed under the License is distributed on an "AS IS" BASIS, 202 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 203 | See the License for the specific language governing permissions and 204 | limitations under the License. 
205 |
--------------------------------------------------------------------------------
/docs/source/multi_task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hellohaptik/multi-task-NLP/b8ae9c051437213245b51b9b1a5bea10565c38e8/docs/source/multi_task.png
--------------------------------------------------------------------------------
/docs/source/quickstart.rst:
--------------------------------------------------------------------------------
1 | Quickstart
2 | ===========
3 | Follow these 3 simple steps to train your multi-task model!
4 |
5 | Step 1 - Define your task file
6 | ------------------------------
7 |
8 | The task file is a YAML format file where you can add all the tasks for which you want to train a multi-task model.
9 |
10 | ::
11 |
12 | TaskA:
13 | model_type: BERT
14 | config_name: bert-base-uncased
15 | dropout_prob: 0.05
16 | label_map_or_file:
17 | - label1
18 | - label2
19 | - label3
20 | metrics:
21 | - accuracy
22 | loss_type: CrossEntropyLoss
23 | task_type: SingleSenClassification
24 | file_names:
25 | - taskA_train.tsv
26 | - taskA_dev.tsv
27 | - taskA_test.tsv
28 |
29 | TaskB:
30 | model_type: BERT
31 | config_name: bert-base-uncased
32 | dropout_prob: 0.3
33 | label_map_or_file: data/taskB_train_label_map.joblib
34 | metrics:
35 | - seq_f1
36 | - seq_precision
37 | - seq_recall
38 | loss_type: NERLoss
39 | task_type: NER
40 | file_names:
41 | - taskB_train.tsv
42 | - taskB_dev.tsv
43 | - taskB_test.tsv
44 |
45 | To learn about the task file parameters needed to make your own task file, refer :ref:`here`.
46 |
47 | Step 2 - Run data preparation
48 | -----------------------------
49 |
50 | After defining the task file in :ref:`Step 1`, run the following command to prepare the data.
51 |
52 | .. code-block:: console
53 |
54 | $ python data_preparation.py \
55 | --task_file 'sample_task_file.yml' \
56 | --data_dir 'data' \
57 | --max_seq_len 50
58 |
59 | To learn about the ``data_preparation.py`` script and its arguments, refer :ref:`here`.
60 |
61 | Step 3 - Run train
62 | ------------------
63 |
64 | Finally, you can start your training using the following command.
65 |
66 | .. code-block:: console
67 |
68 | $ python train.py \
69 | --data_dir 'data/bert-base-uncased_prepared_data' \
70 | --task_file 'sample_task_file.yml' \
71 | --out_dir 'sample_out' \
72 | --epochs 5 \
73 | --train_batch_size 4 \
74 | --eval_batch_size 8 \
75 | --grad_accumulation_steps 2 \
76 | --log_per_updates 25 \
77 | --save_per_updates 1000 \
78 | --eval_while_train True \
79 | --test_while_train True \
80 | --max_seq_len 50 \
81 | --silent True
82 |
83 | To learn about the ``train.py`` script and its arguments, refer :ref:`here`.
84 |
85 |
--------------------------------------------------------------------------------
/docs/source/shared_encoder.rst:
--------------------------------------------------------------------------------
1 | Shared Encoder
2 | ==============
3 |
4 | What is a shared encoder?
5 | -------------------------
6 |
7 | The concept of this library is to provide a single model for multiple tasks.
8 | To achieve this, we place a transformer-based encoder at the centre. Data for all tasks goes through this central encoder.
9 | This encoder is called shared as it is responsible for the majority of the learning across all the tasks.
10 | Further, task specific headers are built on top of the shared encoder.
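A minimal sketch of this arrangement is shown below. It is only a simplified illustration (not the library's actual ``models/model.py``), assuming one linear classification header per task on top of a ``transformers`` encoder; the ``task_num_labels`` mapping is hypothetical.

.. code-block:: python

    import torch.nn as nn
    from transformers import AutoModel

    class SharedEncoderSketch(nn.Module):
        """One shared encoder, one lightweight header per task."""

        def __init__(self, config_name, task_num_labels):
            super().__init__()
            # shared transformer encoder, e.g. config_name = 'bert-base-uncased'
            self.encoder = AutoModel.from_pretrained(config_name)
            hidden_size = self.encoder.config.hidden_size
            # hypothetical mapping, e.g. task_num_labels = {'intent': 7, 'ner': 21}
            self.headers = nn.ModuleDict(
                {task: nn.Linear(hidden_size, n) for task, n in task_num_labels.items()}
            )

        def forward(self, task_name, input_ids, attention_mask):
            # data for every task passes through the same shared encoder ...
            hidden_states = self.encoder(input_ids, attention_mask=attention_mask)[0]
            # ... and then only through the header of the task the batch belongs to
            return self.headers[task_name](hidden_states)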
11 |
12 | Task specific headers
13 | ---------------------
14 |
15 | The encoder hidden states are consumed by task specific layers defined to output logits in the format required by the task.
16 | A forward pass for a data batch belonging to, say, taskA occurs through the shared encoder and the header for taskA.
17 | The computed loss (called the 'task loss') is back-propagated through the same path.
18 |
19 | Choice of shared encoder
20 | ------------------------
21 |
22 | We support multiple transformer-based encoder models.
23 | For ease of use, we've integrated the encoders from the `transformers `_ library.
24 | The available encoders with their config names are mentioned below.
25 |
26 | +------------------+---------------------------+---------------------------+
27 | | Model type       | Config name               | Default config            |
28 | +==================+===========================+===========================+
29 | |                  | distilbert-base-uncased   |                           |
30 | | DISTILBERT       +---------------------------+ distilbert-base-uncased   |
31 | |                  | distilbert-base-cased     |                           |
32 | +------------------+---------------------------+---------------------------+
33 | |                  | bert-base-uncased         |                           |
34 | |                  +---------------------------+                           |
35 | |                  | bert-base-cased           |                           |
36 | | BERT             +---------------------------+ bert-base-uncased         |
37 | |                  | bert-large-uncased        |                           |
38 | |                  +---------------------------+                           |
39 | |                  | bert-large-cased          |                           |
40 | +------------------+---------------------------+---------------------------+
41 | |                  | roberta-base              |                           |
42 | | ROBERTA          +---------------------------+ roberta-base              |
43 | |                  | roberta-large             |                           |
44 | +------------------+---------------------------+---------------------------+
45 | |                  | albert-base-v1            |                           |
46 | |                  +---------------------------+                           |
47 | |                  | albert-large-v1           |                           |
48 | |                  +---------------------------+                           |
49 | |                  | albert-xlarge-v1          |                           |
50 | |                  +---------------------------+                           |
51 | |                  | albert-xxlarge-v1         |                           |
52 | | ALBERT           +---------------------------+ albert-base-v1            |
53 | |                  | albert-base-v2            |                           |
54 | |                  +---------------------------+                           |
55 | |                  | albert-large-v2           |                           |
56 | |                  +---------------------------+                           |
57 | |                  | albert-xlarge-v2          |                           |
58 | |                  +---------------------------+                           |
59 | |                  | albert-xxlarge-v2         |                           |
60 | +------------------+---------------------------+---------------------------+
61 | |                  | xlnet-base-cased          |                           |
62 | | XLNET            +---------------------------+ xlnet-base-cased          |
63 | |                  | xlnet-large-cased         |                           |
64 | +------------------+---------------------------+---------------------------+
65 |
66 | Losses
67 | ------
68 | We support the following two types of loss functions.
69 |
70 | .. autoclass:: models.loss.CrossEntropyLoss
71 | :members: forward
72 |
73 | .. autoclass:: models.loss.NERLoss
74 | :members: forward
75 |
76 | Metrics
77 | -------
78 | For evaluating the performance on the dev and test sets during training, we provide the following standard metrics.
79 |
80 | .. automodule:: utils.eval_metrics
81 | :members: classification_accuracy, classification_f1_score, seqeval_f1_score,
82 | seqeval_precision, seqeval_recall, snips_f1_score, snips_precision, snips_recall, classification_recall_score
83 |
84 |
--------------------------------------------------------------------------------
/docs/source/task_formats.rst:
--------------------------------------------------------------------------------
1 | Task Formats
2 | ============
3 |
4 | - To standardize the data input files, all the tasks require ``tsv`` format files as input data files.
5 | - The tsv data files shouldn't contain any headers.
Detailed tsv formats required for specific task types are mentioned in the following subsection.
6 |
7 | Task types
8 | ----------
9 | Input data formats for different NLI tasks can vary from task to task. We support the following three task types.
10 | The majority of NLI tasks can be modeled using one of these task types.
11 |
12 | - ``SingleSenClassification``: This task type is to be used for classification of single sentences. The data files need to have the following columns separated by **"\\t"**
13 | in the order mentioned below.
14 |
15 | 1. **Unique id** :- an id to uniquely identify each row/sample.
16 | 2. **Label** :- label for the sentence. Labels can be numeric or strings. In case labels are strings, a label mapping needs to be provided.
17 | 3. **Sentence** :- The sentence which needs to be classified.
18 |
19 | - ``SentencePairClassification``: This task type is to be used for classification of sentence pairs (two sentences). The data files need to have the following columns separated by **"\\t"**
20 | in the order mentioned below.
21 |
22 | 1. **Unique id** :- an id to uniquely identify each row/sample.
23 | 2. **Label** :- label for the sentence pair. Labels can be numeric or strings. In case labels are strings, a label mapping needs to be provided.
24 | 3. **SentenceA** :- First sentence of the sentence pair.
25 | 4. **SentenceB** :- Second sentence of the sentence pair.
26 |
27 | - ``NER`` : This task type is to be used for sequence labelling tasks like Named Entity Recognition, entity mention detection, keyphrase extraction etc. The data files need to have the following columns separated by **"\\t"** in the order mentioned below.
28 |
29 | 1. **Unique id** :- an id to uniquely identify each row/sample.
30 | 2. **Label** :- List of tags for the words in the sentence.
31 | 3. **Sentence** :- List of words in the sentence.
32 |
33 |
34 |
35 | NOTE:- The tsv data files must not have header names.
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/docs/source/training.rst:
--------------------------------------------------------------------------------
1 | How to train?
2 | =============
3 |
4 | Once you have made the task file with the tasks you want to train for,
5 | the next step is to run ``data_preparation.py`` and ``train.py``.
6 |
7 | Running data preparation
8 | ------------------------
9 |
10 | - The job of this script is to convert the given tsv data files to model inputs such as **Token Ids**, **Attention Masks** and **Token Type Ids** based on the shared encoder type.
11 |
12 | - The script uses **multi-processing** which effectively reduces the data preparation time for large data files.
13 |
14 | - It stores the prepared data in json files under the directory name **prepared_data** prefixed with the shared encoder config name.
15 |
16 | The script takes the following arguments,
17 |
18 | - ``task_file`` `(required)` :- Path to the created task file for which you want to train.
19 |
20 | - ``data_dir`` `(required)` :- Path to the directory where the data files mentioned in the task file are present.
21 |
22 | - ``do_lower_case`` `(optional, default True)` :- Set this to False in case you are using a `cased` config for model type.
23 |
24 | - ``max_seq_len`` `(optional, default 128)` :- Maximum sequence length for inputs. Truncating or padding will occur accordingly.
25 |
26 | You can use the following terminal command with your own argument values to run.
27 |
28 | ..
code-block:: console
29 |
30 | $ python data_preparation.py \
31 | --task_file 'sample_task_file.yml' \
32 | --data_dir 'data' \
33 | --max_seq_len 50
34 |
35 | Running train
36 | -------------
37 |
38 | After ``data_preparation.py`` has finished running, it will store the respective prepared files
39 | under the directory name 'prepared_data' prefixed with the shared encoder config name.
40 | ``train.py`` can then be run from the terminal to start the training. The following arguments are
41 | available
42 |
43 | - ``data_dir`` `(required)` :- Path to the directory where the prepared data is stored. (e.g. bert-base-uncased_prepared_data)
44 | - ``task_file`` `(required)` :- Path to the task file for training.
45 | - ``out_dir`` `(required)` :- Path to save the multi-task model checkpoints.
46 | - ``epochs`` `(required)` :- Number of epochs to train.
47 | - ``train_batch_size`` `(optional, default 8)` :- Batch size for training.
48 | - ``eval_batch_size`` `(optional, default 32)` :- Batch size for evaluation.
49 | - ``grad_accumulation_steps`` `(optional, default 1)` :- Number of batches to accumulate before an update.
50 | - ``log_per_updates`` `(optional, default 10)` :- Number of updates after which to log the loss.
51 | - ``silent`` `(optional, default True)` :- Set to False for logs to be shown on the terminal output as well.
52 | - ``max_seq_len`` `(optional, default 128)` :- Maximum sequence length which was used during data preparation.
53 | - ``save_per_updates`` `(optional, default 0)` :- Number of update steps after which a model checkpoint is saved. The model is always saved at the end of every epoch.
54 | - ``load_saved_model`` `(optional, default None)` :- Path to a saved model in case of loading.
55 | - ``resume_train`` `(optional, default False)` :- Set to True to resume training from the saved model. Training will resume from the step at which the loaded model was saved.
56 |
57 | You can use the following terminal command with your own argument values to run.
58 |
59 | .. code-block:: console
60 |
61 | $ python train.py \
62 | --data_dir 'data/bert-base-uncased_prepared_data' \
63 | --task_file 'sample_task_file.yml' \
64 | --out_dir 'sample_out' \
65 | --epochs 5 \
66 | --train_batch_size 4 \
67 | --eval_batch_size 8 \
68 | --grad_accumulation_steps 2 \
69 | --max_seq_len 50 \
70 | --log_per_updates 25 \
71 | --save_per_updates 1000 \
72 | --eval_while_train \
73 | --test_while_train \
74 | --silent
75 |
76 | Logs and tensorboard
77 | --------------------
78 |
79 | - Logs for the training are saved in a time-stamp named directory (e.g. 05_05-17_30).
80 | - The tensorboard logs are also present in the same directory and tensorboard can be started with the following command
81 |
82 | .. code-block:: console
83 |
84 | $ tensorboard --logdir 05_05-17_30/tb_logs
85 |
86 |
87 |
--------------------------------------------------------------------------------
/examples/answerability_detection/answerability_detection_msmarco.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## EXAMPLE - 3\n",
8 | "\n",
9 | "**Tasks :- Answerability detection**\n",
10 | "\n",
11 | "**Tasks Description**\n",
12 | "\n",
13 | "``answerability`` :- This is modeled as a sentence pair classification task where the first sentence is a query and the second sentence is a context passage.
The objective of this task is to determine whether the query can be answered from the context passage or not.\n",
14 | "\n",
15 | "**Conversational Utility** :- This can be a useful component for building a question-answering / machine comprehension based system. In such cases, it becomes very important to determine whether the given query can be answered from the given context passage or not before extracting/abstracting an answer from it. Performing question-answering for a query which is not answerable from the context could lead to incorrect answer extraction.\n",
16 | "\n",
17 | "**Data** :- In this example, we are using the MSMARCO triples data which contains sentence pairs and labels.\n",
18 | "The data contains triplets where the first entry is the query, the second is a context passage from which the query can be answered (positive passage), while the third entry is a context passage from which the query cannot be answered (negative passage).\n",
19 | "\n",
20 | "Data is transformed into sentence pair classification format, with the query-positive context pair labeled as 1 (answerable) and the query-negative context pair labeled as 0 (non-answerable).\n",
21 | "\n",
22 | "The data can be downloaded using the following ``wget`` command and extracted using the ``tar`` command. The data is fairly large to download (7.4GB). "
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "!wget https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz -P msmarco_data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "!tar -xvzf msmarco_data/triples.train.small.tar.gz -C msmarco_data/"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "!rm msmarco_data/triples.train.small.tar.gz"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "# Step - 1: Transforming data\n",
57 | "\n",
58 | "The data is present in *TSV* format where each row is a triplet: the query, a positive context passage and a negative context passage.\n",
59 | "\n",
60 | "We already provide a sample transformation function ``msmarco_answerability_detection_to_tsv`` to convert this data to the required tsv format. Data is transformed into sentence pair classification format, with the query-positive context pair labeled as 1 (answerable) and the query-negative context pair labeled as 0 (non-answerable).\n",
61 | "\n",
62 | "Running data transformations will save the required train, dev and test tsv data files under the ``data`` directory in the root of the library. For more details on the data transformation process, refer to data transformations in the documentation.\n",
63 | "\n",
64 | "The transformation file should have the following details and is already created at ``transform_file_answerability.yml``.\n",
65 | "\n",
66 | "```\n",
67 | "transform1:\n",
68 | " transform_func: msmarco_answerability_detection_to_tsv\n",
69 | " transform_params:\n",
70 | " data_frac : 0.02\n",
71 | " read_file_names:\n",
72 | " - triples.train.small.tsv\n",
73 | " read_dir : msmarco_data\n",
74 | " save_dir: ../../data\n",
75 | " \n",
76 | " ```\n",
77 | " The following command can be used to run the data transformation for the task."
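,
" \n",
" *(The code cell for this step is analogous to the other example notebooks; the command below is a sketch based on that pattern, using the ``transform_file_answerability.yml`` shown above.)*\n",
" \n",
" ```\n",
" !python ../../data_transformations.py \\\n",
"     --transform_file 'transform_file_answerability.yml'\n",
" ```"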
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "# Step - 2 Data Preparation\n",
85 | "\n",
86 | "For more details on the data preparation process, refer to data preparation in the documentation.\n",
87 | "\n",
88 | "Defining the tasks file for training a single model for the answerability task. The file is already created at ``tasks_file_answerability.yml``\n",
89 | "```\n",
90 | "answerability:\n",
91 | " model_type: BERT\n",
92 | " config_name: bert-base-uncased\n",
93 | " dropout_prob: 0.2\n",
94 | " class_num: 2\n",
95 | " metrics:\n",
96 | " - classification_accuracy\n",
97 | " loss_type: CrossEntropyLoss\n",
98 | " task_type: SentencePairClassification\n",
99 | " file_names:\n",
100 | " - msmarco_answerability_train.tsv\n",
101 | " - msmarco_answerability_dev.tsv\n",
102 | " - msmarco_answerability_test.tsv\n",
103 | "```"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "!python ../../data_preparation.py \\\n",
113 | " --task_file 'tasks_file_answerability.yml' \\\n",
114 | " --data_dir '../../data' \\\n",
115 | " --max_seq_len 324"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "# Step - 3 Running train\n",
123 | "\n",
124 | "The following command will start the training for the task. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n",
125 | "\n",
126 | "For more details about the training process, refer to running training in the documentation."
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {},
133 | "outputs": [],
134 | "source": [
135 | "!python ../../train.py \\\n",
136 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
137 | " --task_file 'tasks_file_answerability.yml' \\\n",
138 | " --out_dir 'msmarco_answerability_bert_base' \\\n",
139 | " --epochs 3 \\\n",
140 | " --train_batch_size 8 \\\n",
141 | " --eval_batch_size 16 \\\n",
142 | " --grad_accumulation_steps 2 \\\n",
143 | " --log_per_updates 250 \\\n",
144 | " --max_seq_len 324 \\\n",
145 | " --save_per_updates 16000 \\\n",
146 | " --eval_while_train \\\n",
147 | " --test_while_train \\\n",
148 | " --silent"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "# Step - 4 Infering\n",
156 | "\n",
157 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n",
158 | "The trained model and maximum sequence length to be used need to be specified.\n",
159 | "\n",
160 | "For more details about infering, refer to the infer pipeline in the documentation."
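,
"\n",
"A minimal usage sketch is shown below. The checkpoint path is hypothetical, and the ``modelPath``/``maxSeqLen`` argument names and the ``infer`` call are assumptions based on the inference pipeline documentation:\n",
"\n",
"```python\n",
"# hypothetical checkpoint path inside the output directory\n",
"pipe = inferPipeline(modelPath='msmarco_answerability_bert_base/multi_task_model.pt', maxSeqLen=324)\n",
"# each sample is a [query, context passage] pair for this task\n",
"samples = [['when was the eiffel tower built', 'The Eiffel Tower was constructed from 1887 to 1889.']]\n",
"pipe.infer(samples, ['answerability'])\n",
"```"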
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "import sys\n",
170 | "sys.path.insert(1, '../../')\n",
171 | "from infer_pipeline import inferPipeline"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": []
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": []
187 | }
188 | ],
189 | "metadata": {
190 | "kernelspec": {
191 | "display_name": "Python 3",
192 | "language": "python",
193 | "name": "python3"
194 | },
195 | "language_info": {
196 | "codemirror_mode": {
197 | "name": "ipython",
198 | "version": 3
199 | },
200 | "file_extension": ".py",
201 | "mimetype": "text/x-python",
202 | "name": "python",
203 | "nbconvert_exporter": "python",
204 | "pygments_lexer": "ipython3",
205 | "version": "3.7.3"
206 | }
207 | },
208 | "nbformat": 4,
209 | "nbformat_minor": 4
210 | }
211 |
--------------------------------------------------------------------------------
/examples/answerability_detection/tasks_file_answerability.yml:
--------------------------------------------------------------------------------
1 | answerability:
2 | model_type: BERT
3 | config_name: bert-base-uncased
4 | dropout_prob: 0.2
5 | class_num: 2
6 | metrics:
7 | - classification_accuracy
8 | loss_type: CrossEntropyLoss
9 | task_type: SentencePairClassification
10 | file_names:
11 | - msmarco_answerability_train.tsv
12 | - msmarco_answerability_dev.tsv
13 | - msmarco_answerability_test.tsv
--------------------------------------------------------------------------------
/examples/answerability_detection/transform_file_answerability.yml:
--------------------------------------------------------------------------------
1 | transform1:
2 | transform_func: msmarco_answerability_detection_to_tsv
3 | transform_params:
4 | data_frac : 0.02
5 | read_file_names:
6 | - triples.train.small.tsv
7 | read_dir : msmarco_data
8 | save_dir: ../../data
--------------------------------------------------------------------------------
/examples/entailment_detection/entailment_snli.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# EXAMPLE - 2\n",
8 | "\n",
9 | "**Tasks :- Entailment detection**\n",
10 | "\n",
11 | "**Tasks Description**\n",
12 | "\n",
13 | "``Entailment`` :- This is a sentence pair classification task which determines whether the second sentence in a sample can be inferred from the first.\n",
14 | "\n",
15 | "**Conversational Utility** :- In a conversational AI context, this task can be seen as determining whether the second sentence is similar to the first or not. Additionally, the probability score can also be used as a similarity score between the sentences. \n",
16 | "\n",
17 | "**Data** :- In this example, we are using the SNLI data which contains sentence pairs and labels.\n",
18 | "\n",
19 | "The data can be downloaded using the following ``wget`` command and unzipped using the ``unzip`` command."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "!unzip snli_1.0.zip"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "# Step - 1: Transforming data\n",
45 | "\n",
46 | "The data is present in *JSONL* format where each object contains a sample having the two sentences as ``sentence1`` and ``sentence2``. We consider the ``gold_label`` field as the label, which can have the values: entailment, contradiction or neutral.\n",
47 | "\n",
48 | "We already provide a sample transformation function ``snli_entailment_to_tsv`` to convert this data to the required tsv format. The contradiction and neutral labels are mapped to 0, representing the non-entailment scenario. Only the entailment label is mapped to 1.\n",
49 | "\n",
50 | "Running data transformations will save the required train, dev and test tsv data files under the ``data`` directory in the root of the library. For more details on the data transformation process, refer to data transformations in the documentation.\n",
51 | "\n",
52 | "The transformation file should have the following details and is already created at ``transform_file_snli.yml``.\n",
53 | "\n",
54 | "```\n",
55 | "transform1:\n",
56 | " transform_func: snli_entailment_to_tsv\n",
57 | " read_file_names:\n",
58 | " - snli_1.0_train.jsonl\n",
59 | " - snli_1.0_dev.jsonl\n",
60 | " - snli_1.0_test.jsonl\n",
61 | " read_dir : snli_1.0\n",
62 | " save_dir: ../../data\n",
63 | " \n",
64 | " ```\n",
65 | " The following command can be used to run the data transformation for the task."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "!python ../../data_transformations.py \\\n",
75 | " --transform_file 'transform_file_snli.yml'"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "# Step - 2 Data Preparation\n",
83 | "\n",
84 | "For more details on the data preparation process, refer to data preparation in the documentation.\n",
85 | "\n",
86 | "Defining the tasks file for training a single model for the entailment task. The file is already created at ``tasks_file_snli.yml``\n",
87 | "```\n",
88 | "entailmentsnli:\n",
89 | " model_type: BERT\n",
90 | " config_name: bert-base-uncased\n",
91 | " dropout_prob: 0.2\n",
92 | " metrics:\n",
93 | " - classification_accuracy\n",
94 | " loss_type: CrossEntropyLoss\n",
95 | " class_num: 2\n",
96 | " task_type: SentencePairClassification\n",
97 | " file_names:\n",
98 | " - entailment_snli_1.0_train.tsv\n",
99 | " - entailment_snli_1.0_dev.tsv\n",
100 | " - entailment_snli_1.0_test.tsv\n",
101 | "```"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "!python ../../data_preparation.py \\\n",
111 | " --task_file 'tasks_file_snli.yml' \\\n",
112 | " --data_dir '../../data' \\\n",
113 | " --max_seq_len 128"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "metadata": {},
119 | "source": [
120 | "# Step - 3 Running train\n",
121 | "\n",
122 | "The following command will start the training for the task. 
The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n",
123 | "\n",
124 | "For more details about the training process, refer to running training in the documentation."
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "!python ../../train.py \\\n",
134 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
135 | " --task_file 'tasks_file_snli.yml' \\\n",
136 | " --out_dir 'snli_entailment_bert_base' \\\n",
137 | " --epochs 3 \\\n",
138 | " --train_batch_size 64 \\\n",
139 | " --eval_batch_size 64 \\\n",
140 | " --grad_accumulation_steps 1 \\\n",
141 | " --log_per_updates 100 \\\n",
142 | " --max_seq_len 128 \\\n",
143 | " --eval_while_train \\\n",
144 | " --test_while_train \\\n",
145 | " --silent"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "# Step - 4 Infering\n",
153 | "\n",
154 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n",
155 | "The trained model and maximum sequence length to be used need to be specified.\n",
156 | "\n",
157 | "For more details about infering, refer to the infer pipeline in the documentation."
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "import sys\n",
167 | "sys.path.insert(1, '../../')\n",
168 | "from infer_pipeline import inferPipeline"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": []
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": []
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": []
191 | }
192 | ],
193 | "metadata": {
194 | "kernelspec": {
195 | "display_name": "Python 3",
196 | "language": "python",
197 | "name": "python3"
198 | },
199 | "language_info": {
200 | "codemirror_mode": {
201 | "name": "ipython",
202 | "version": 3
203 | },
204 | "file_extension": ".py",
205 | "mimetype": "text/x-python",
206 | "name": "python",
207 | "nbconvert_exporter": "python",
208 | "pygments_lexer": "ipython3",
209 | "version": "3.7.3"
210 | }
211 | },
212 | "nbformat": 4,
213 | "nbformat_minor": 4
214 | }
215 |
--------------------------------------------------------------------------------
/examples/entailment_detection/tasks_file_snli.yml:
--------------------------------------------------------------------------------
1 | entailmentsnli:
2 | model_type: BERT
3 | config_name: bert-base-uncased
4 | dropout_prob: 0.2
5 | metrics:
6 | - classification_accuracy
7 | loss_type: CrossEntropyLoss
8 | class_num: 2
9 | task_type: SentencePairClassification
10 | file_names:
11 | - entailment_snli_1.0_train.tsv
12 | - entailment_snli_1.0_dev.tsv
13 | - entailment_snli_1.0_test.tsv
--------------------------------------------------------------------------------
/examples/entailment_detection/transform_file_snli.yml:
--------------------------------------------------------------------------------
1 | transform1:
2 | transform_func: snli_entailment_to_tsv
3 | read_file_names:
4 | - snli_1.0_train.jsonl
5 | - snli_1.0_dev.jsonl
6 | - snli_1.0_test.jsonl
7 | read_dir : snli_1.0
8 | save_dir: ../../data
--------------------------------------------------------------------------------
/examples/intent_ner_fragment/intent_ner_fragment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# EXAMPLE - 1\n",
8 | "\n",
9 | "**Tasks :- Intent Detection, NER, Fragment Detection**\n",
10 | "\n",
11 | "**Tasks Description**\n",
12 | "\n",
13 | "``Intent Detection`` :- This is a single sentence classification task where an `intent` specifies which class the data sample belongs to. \n",
14 | "\n",
15 | "``NER`` :- This is a Named Entity Recognition/ Sequence Labelling/ Slot filling task where individual words of the sentence are tagged with the entity label they belong to. The words which don't belong to any entity label are simply labeled as \"O\". \n",
16 | "\n",
17 | "``Fragment Detection`` :- This is modeled as a single sentence classification task which detects whether a sentence is incomplete (fragment) or not (non-fragment).\n",
18 | "\n",
19 | "**Conversational Utility** :- Intent detection is one of the fundamental components of a conversational system as it gives a broad understanding of the category/domain the sentence/query belongs to.\n",
20 | "\n",
21 | "NER helps in extracting values for required entities (e.g. location, date-time) from the query.\n",
22 | "\n",
23 | "Fragment detection is a very useful piece in a conversational system as knowing if a query/sentence is incomplete can aid in discarding bad queries beforehand.\n",
24 | "\n",
25 | "\n",
26 | "**Data** :- In this example, we are using the SNIPS data for intent and entity detection. For the sake of simplicity, we provide \n",
27 | "the data in a simpler form under the ``snips_data`` directory, taken from here.\n"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "metadata": {},
33 | "source": [
34 | "# Step - 1: Transforming data\n",
35 | "\n",
36 | "The data is present in *BIO* format where each word in a sentence is tagged with the corresponding entity. \n",
37 | "Sentences are separated by \" \" and at the end of each sentence, the intent class to which the sentence belongs is mentioned. We already provide a sample transformation function ``snips_intent_ner_to_tsv`` to convert this data to the required tsv data files.\n",
38 | "Fragment detection data is generated from the intent detection data created using the transform function\n",
39 | "``create_fragment_detection_tsv``. \n",
40 | "\n",
41 | "Running data transformations will save the required train, dev and test tsv data files under the ``data`` directory in the root of the library. 
For more details on the data transformation process, refer to data transformations in the documentation.\n",
42 | "\n",
43 | "The transformation file should have the following details and is already created at ``transform_file_snips.yml``.\n",
44 | "\n",
45 | "```\n",
46 | "transform1:\n",
47 | " transform_func: snips_intent_ner_to_tsv\n",
48 | " read_file_names:\n",
49 | " - snips_train.txt\n",
50 | " - snips_dev.txt\n",
51 | " - snips_test.txt\n",
52 | " read_dir: snips_data\n",
53 | " save_dir: ../../data\n",
54 | " \n",
55 | "transform2:\n",
56 | " transform_func: create_fragment_detection_tsv\n",
57 | " read_file_names:\n",
58 | " - intent_snips_train.tsv\n",
59 | " - intent_snips_dev.tsv\n",
60 | " - intent_snips_test.tsv\n",
61 | " read_dir: ../../data\n",
62 | " save_dir: ../../data\n",
63 | " \n",
64 | " ```\n",
65 | " The following command can be used to run the data transformation for the tasks."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "!python ../../data_transformations.py \\\n",
75 | " --transform_file 'transform_file_snips.yml'"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "# Step - 2 Data Preparation\n",
83 | "\n",
84 | "Here we are training the three tasks together for demonstration. This means we will have a single\n",
85 | "multi-task model capable of performing all the three tasks. You can also train the tasks separately \n",
86 | "by mentioning a single task in the task file.\n",
87 | "\n",
88 | "For more details on the data preparation process, refer to data preparation in the documentation.\n",
89 | "\n",
90 | "Defining the tasks file for training a single model for multiple tasks - intent detection, NER and fragment detection. The file is already created at ``tasks_file_snips.yml``\n",
91 | "\n",
92 | "```\n",
93 | "ner:\n",
94 | " model_type: BERT\n",
95 | " config_name: bert-base-uncased\n",
96 | " dropout_prob: 0.3\n",
97 | " label_map_or_file: ../../data/ner_snips_train_label_map.joblib\n",
98 | " metrics:\n",
99 | " - snips_f1_score\n",
100 | " - snips_precision\n",
101 | " - snips_recall\n",
102 | " loss_type: NERLoss\n",
103 | " task_type: NER\n",
104 | " file_names:\n",
105 | " - ner_snips_train.tsv\n",
106 | " - ner_snips_dev.tsv\n",
107 | " - ner_snips_test.tsv\n",
108 | "\n",
109 | "intent:\n",
110 | " model_type: BERT\n",
111 | " config_name: bert-base-uncased\n",
112 | " dropout_prob: 0.3\n",
113 | " label_map_or_file: ../../data/int_snips_train_label_map.joblib\n",
114 | " metrics:\n",
115 | " - classification_accuracy\n",
116 | " loss_type: CrossEntropyLoss\n",
117 | " task_type: SingleSenClassification\n",
118 | " file_names:\n",
119 | " - intent_snips_train.tsv\n",
120 | " - intent_snips_dev.tsv\n",
121 | " - intent_snips_test.tsv\n",
122 | "\n",
123 | " \n",
124 | "fragdetect:\n",
125 | " model_type: BERT\n",
126 | " config_name: bert-base-uncased\n",
127 | " dropout_prob: 0.2\n",
128 | " class_num: 2\n",
129 | " metrics:\n",
130 | " - classification_accuracy\n",
131 | " loss_type: CrossEntropyLoss\n",
132 | " task_type: SingleSenClassification\n",
133 | " file_names:\n",
134 | " - fragment_snips_train.tsv\n",
135 | " - fragment_snips_dev.tsv\n",
136 | " - fragment_snips_test.tsv\n",
137 | "```\n",
138 | "\n",
139 | "The following command can be used to run the data preparation for the tasks."
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "!python ../../data_preparation.py \\\n",
149 | " --task_file 'tasks_file_snips.yml' \\\n",
150 | " --data_dir '../../data' \\\n",
151 | " --max_seq_len 50"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "# Step - 3 Running train\n",
159 | "\n",
160 | "The following command will start the training for the tasks. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory. For demonstration, we've put up sample logs under the ``train_logs`` directory.\n",
161 | "\n",
162 | "For more details about the training process, refer to running training in the documentation."
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "!python ../../train.py \\\n",
172 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
173 | " --task_file 'tasks_file_snips.yml' \\\n",
174 | " --out_dir 'snips_intent_ner_fragment_bert_base' \\\n",
175 | " --epochs 3 \\\n",
176 | " --train_batch_size 16 \\\n",
177 | " --eval_batch_size 32 \\\n",
178 | " --grad_accumulation_steps 2 \\\n",
179 | " --log_per_updates 50 \\\n",
180 | " --max_seq_len 50 \\\n",
181 | " --eval_while_train \\\n",
182 | " --test_while_train \\\n",
183 | " --silent "
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "metadata": {},
189 | "source": [
190 | "# Step - 4 Infering\n",
191 | "\n",
192 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n",
193 | "The trained model and maximum sequence length to be used need to be specified.\n",
194 | "\n",
195 | "For more details about infering, refer to the infer pipeline in the documentation."
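,
"\n",
"A minimal usage sketch for the multi-task case is shown below. The checkpoint path is hypothetical, and the argument names are assumptions based on the inference pipeline documentation:\n",
"\n",
"```python\n",
"# hypothetical checkpoint path inside the output directory\n",
"pipe = inferPipeline(modelPath='snips_intent_ner_fragment_bert_base/multi_task_model.pt', maxSeqLen=50)\n",
"samples = [['book a table for two at an italian restaurant']]\n",
"# a single forward pass through the shared encoder can serve all three tasks\n",
"pipe.infer(samples, ['intent', 'ner', 'fragdetect'])\n",
"```"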
196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "import sys\n", 205 | "sys.path.insert(1, '../../')\n", 206 | "from infer_pipeline import inferPipeline" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [] 229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "Python 3", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.7.3" 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 4 252 | } 253 | -------------------------------------------------------------------------------- /examples/intent_ner_fragment/tasks_file_snips.yml: -------------------------------------------------------------------------------- 1 | ner: 2 | model_type: BERT 3 | config_name: bert-base-uncased 4 | dropout_prob: 0.3 5 | label_map_or_file: ../../data/ner_snips_train_label_map.joblib 6 | metrics: 7 | - snips_f1_score 8 | - snips_precision 9 | - snips_recall 10 | loss_type: NERLoss 11 | task_type: NER 12 | file_names: 13 | - ner_snips_train.tsv 14 | - ner_snips_dev.tsv 15 | - ner_snips_test.tsv 16 | 17 | intent: 18 | model_type: BERT 19 | config_name: bert-base-uncased 20 | dropout_prob: 0.3 21 | label_map_or_file: ../../data/int_snips_train_label_map.joblib 22 | metrics: 23 | - classification_accuracy 24 | loss_type: CrossEntropyLoss 25 | task_type: SingleSenClassification 26 | file_names: 27 | - intent_snips_train.tsv 28 | - intent_snips_dev.tsv 29 | - intent_snips_test.tsv 30 | 31 | 32 | fragdetect: 33 | model_type: BERT 34 | config_name: bert-base-uncased 35 | dropout_prob: 0.2 36 | class_num: 2 37 | metrics: 38 | - classification_accuracy 39 | loss_type: CrossEntropyLoss 40 | task_type: SingleSenClassification 41 | file_names: 42 | - fragment_snips_train.tsv 43 | - fragment_snips_dev.tsv 44 | - fragment_snips_test.tsv -------------------------------------------------------------------------------- /examples/intent_ner_fragment/transform_file_snips.yml: -------------------------------------------------------------------------------- 1 | transform1: 2 | transform_func: snips_intent_ner_to_tsv 3 | read_file_names: 4 | - snips_train.txt 5 | - snips_dev.txt 6 | - snips_test.txt 7 | read_dir: snips_data 8 | save_dir: ../../data 9 | 10 | transform2: 11 | transform_func: create_fragment_detection_tsv 12 | read_file_names: 13 | - intent_snips_train.tsv 14 | - intent_snips_dev.tsv 15 | - intent_snips_test.tsv 16 | read_dir: ../../data 17 | save_dir: ../../data -------------------------------------------------------------------------------- /examples/ner_pos_tagging/ner_pos_tagging_conll.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EXAMPLE - 5\n", 8 | "\n", 9 | "**Tasks :- NER tagging, POS 
tagging**\n", 10 | "\n", 11 | "**Tasks Description**\n", 12 | "\n", 13 | "``NER`` :-This is a Named Entity Recognition task where individual words of the sentence are tagged with an entity label it belongs to. The words which don't belong to any entity label are simply labeled as \"O\".\n", 14 | "\n", 15 | "``POS`` :- This is a Part of Speech tagging task. A part of speech is a category of words that have similar grammatical properties. Each word of the sentence is tagged with the part of speech label it belongs to. The words which don't belong to any part of speech label are simply labeled as \"O\".\n", 16 | "\n", 17 | "**Conversational Utility** :- In conversational AI context, determining the syntactic parts of the sentence can help in extracting noun-phrases or important keyphrases from the sentence.\n", 18 | "\n", 19 | "**Data** :- In this example, we are using the coNLL 2003 data which is BIO tagged format with the POS and NER tags separated by space.\n", 20 | "\n", 21 | "The data is already present in ``coNLL_data`` directory." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Step - 1: Transforming data\n", 29 | "\n", 30 | "Raw data is in BIO tagged format with the POS and NER tags separated by space.\n", 31 | "\n", 32 | "We already provide a sample transformation function ``coNLL_ner_pos_to_tsv`` to convert this data to required tsv format. \n", 33 | "\n", 34 | "Running data transformations will save the required train, dev and test tsv data files under ``data`` directory in root of library. For more details on the data transformation process, refer to data transformations in documentation.\n", 35 | "\n", 36 | "The transformation file should have the following details which is already created ``transform_file_conll.yml``.\n", 37 | "\n", 38 | "```\n", 39 | "transform1:\n", 40 | " transform_func: coNLL_ner_pos_to_tsv\n", 41 | " read_file_names:\n", 42 | " - coNLL_train.txt\n", 43 | " - coNLL_testa.txt\n", 44 | " - coNLL_testb.txt\n", 45 | " read_dir: coNLL_data\n", 46 | " save_dir: ../../data\n", 47 | " ```\n", 48 | " Following command can be used to run the data transformation for the tasks." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "!python ../../data_transformations.py \\\n", 58 | " --transform_file 'transform_file_conll.yml'" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "# Step -2 Data Preparation\n", 66 | "\n", 67 | "For more details on the data preparation process, refer to data preparation in documentation.\n", 68 | "\n", 69 | "Defining tasks file for training single model for entailment task. 
The file is already created at ``tasks_file_conll.yml``\n",
70 | "```\n",
71 | "conllner:\n",
72 | " model_type: BERT\n",
73 | " config_name: bert-base-uncased\n",
74 | " dropout_prob: 0.2\n",
75 | " label_map_or_file: ../../data/ner_coNLL_train_label_map.joblib\n",
76 | " metrics:\n",
77 | " - seqeval_f1_score\n",
78 | " - seqeval_precision\n",
79 | " - seqeval_recall\n",
80 | " loss_type: NERLoss\n",
81 | " task_type: NER\n",
82 | " file_names:\n",
83 | " - ner_coNLL_train.tsv\n",
84 | " - ner_coNLL_testa.tsv\n",
85 | " - ner_coNLL_testb.tsv\n",
86 | "\n",
87 | "conllpos:\n",
88 | " model_type: BERT\n",
89 | " config_name: bert-base-uncased\n",
90 | " dropout_prob: 0.2\n",
91 | " label_map_or_file: ../../data/pos_coNLL_train_label_map.joblib\n",
92 | " metrics:\n",
93 | " - seqeval_f1_score\n",
94 | " - seqeval_precision\n",
95 | " - seqeval_recall\n",
96 | " loss_type: NERLoss\n",
97 | " task_type: NER\n",
98 | " file_names:\n",
99 | " - pos_coNLL_train.tsv\n",
100 | " - pos_coNLL_testa.tsv\n",
101 | " - pos_coNLL_testb.tsv\n",
102 | "```"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "!python ../../data_preparation.py \\\n",
112 | " --task_file 'tasks_file_conll.yml' \\\n",
113 | " --data_dir '../../data' \\\n",
114 | " --max_seq_len 50"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "# Step - 3 Running train"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "!python ../../train.py \\\n",
131 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
132 | " --task_file 'tasks_file_conll.yml' \\\n",
133 | " --out_dir 'conll_ner_pos_bert_base' \\\n",
134 | " --epochs 10 \\\n",
135 | " --train_batch_size 32 \\\n",
136 | " --eval_batch_size 32 \\\n",
137 | " --grad_accumulation_steps 1 \\\n",
138 | " --log_per_updates 50 \\\n",
139 | " --max_seq_len 50 \\\n",
140 | " --eval_while_train \\\n",
141 | " --test_while_train \\\n",
142 | " --silent"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "# Step - 4 Infering\n",
150 | "\n",
151 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n",
152 | "The trained model and maximum sequence length to be used need to be specified.\n",
153 | "\n",
154 | "For more details about infering, refer to the infer pipeline in the documentation."
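,
"\n",
"A minimal usage sketch (hypothetical checkpoint path; argument names assumed from the inference pipeline documentation):\n",
"\n",
"```python\n",
"pipe = inferPipeline(modelPath='conll_ner_pos_bert_base/multi_task_model.pt', maxSeqLen=50)\n",
"samples = [['John lives in Berlin']]\n",
"# one pass yields both the NER tags and the POS tags for each word\n",
"pipe.infer(samples, ['conllner', 'conllpos'])\n",
"```"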
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "import sys\n",
164 | "sys.path.insert(1, '../../')\n",
165 | "from infer_pipeline import inferPipeline"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": []
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": []
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": []
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": []
195 | }
196 | ],
197 | "metadata": {
198 | "kernelspec": {
199 | "display_name": "Python 3",
200 | "language": "python",
201 | "name": "python3"
202 | },
203 | "language_info": {
204 | "codemirror_mode": {
205 | "name": "ipython",
206 | "version": 3
207 | },
208 | "file_extension": ".py",
209 | "mimetype": "text/x-python",
210 | "name": "python",
211 | "nbconvert_exporter": "python",
212 | "pygments_lexer": "ipython3",
213 | "version": "3.7.3"
214 | }
215 | },
216 | "nbformat": 4,
217 | "nbformat_minor": 4
218 | }
219 |
--------------------------------------------------------------------------------
/examples/ner_pos_tagging/tasks_file_conll.yml:
--------------------------------------------------------------------------------
1 | conllner:
2 | model_type: BERT
3 | config_name: bert-base-uncased
4 | dropout_prob: 0.2
5 | label_map_or_file: ../../data/ner_coNLL_train_label_map.joblib
6 | metrics:
7 | - seqeval_f1_score
8 | - seqeval_precision
9 | - seqeval_recall
10 | loss_type: NERLoss
11 | task_type: NER
12 | file_names:
13 | - ner_coNLL_train.tsv
14 | - ner_coNLL_testa.tsv
15 | - ner_coNLL_testb.tsv
16 |
17 | conllpos:
18 | model_type: BERT
19 | config_name: bert-base-uncased
20 | dropout_prob: 0.2
21 | label_map_or_file: ../../data/pos_coNLL_train_label_map.joblib
22 | metrics:
23 | - seqeval_f1_score
24 | - seqeval_precision
25 | - seqeval_recall
26 | loss_type: NERLoss
27 | task_type: NER
28 | file_names:
29 | - pos_coNLL_train.tsv
30 | - pos_coNLL_testa.tsv
31 | - pos_coNLL_testb.tsv
--------------------------------------------------------------------------------
/examples/ner_pos_tagging/transform_file_conll.yml:
--------------------------------------------------------------------------------
1 | transform1:
2 | transform_func: coNLL_ner_pos_to_tsv
3 | read_file_names:
4 | - coNLL_train.txt
5 | - coNLL_testa.txt
6 | - coNLL_testb.txt
7 | read_dir: coNLL_data
8 | save_dir: ../../data
--------------------------------------------------------------------------------
/examples/query_correctness/query_correctness.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# EXAMPLE - 6\n",
8 | "\n",
9 | "**Tasks :- query correctness**\n",
10 | "\n",
11 | "**Tasks Description**\n",
12 | "\n",
13 | "``querycorrectness`` :- This is modeled as a single sentence classification task identifying whether or not a query is structurally well formed. 
Knowing whether a query is well formed can enhance query understanding.\n",
 14 | "\n",
 15 | "**Conversational Utility** :- Determining how well-structured a query is would help in enhancing query understanding and improve the reliability of tasks which depend on query structure to extract information.\n",
 16 | "\n",
 17 | "**Data** :- In this example, we are using the Query-wellformedness data where every query was annotated by five raters each with a 1/0 rating of whether or not the query is well-formed.\n",
 18 | "\n",
 19 | "The data is already present under the directory ``query_correctness_data``"
 20 | ]
 21 | },
 22 | {
 23 | "cell_type": "markdown",
 24 | "metadata": {},
 25 | "source": [
 26 | "# Step - 1: Transforming data\n",
 27 | "\n",
 28 | "```\n",
 29 | "transform1:\n",
 30 | " transform_func: query_correctness_to_tsv\n",
 31 | " read_file_names:\n",
 32 | " - train.tsv\n",
 33 | " - dev.tsv\n",
 34 | " - test.tsv\n",
 35 | "\n",
 36 | " read_dir: query_correctness_data\n",
 37 | " save_dir: ../../data\n",
 38 | "```"
 39 | ]
 40 | },
 41 | {
 42 | "cell_type": "code",
 43 | "execution_count": null,
 44 | "metadata": {},
 45 | "outputs": [],
 46 | "source": [
 47 | "!python ../../data_transformations.py \\\n",
 48 | " --transform_file 'transform_file_query_correctness.yml'"
 49 | ]
 50 | },
 51 | {
 52 | "cell_type": "markdown",
 53 | "metadata": {},
 54 | "source": [
 55 | "# Step - 2 Data Preparation\n",
 56 | "\n",
 57 | "```\n",
 58 | "querycorrectness:\n",
 59 | " model_type: BERT\n",
 60 | " config_name: bert-base-uncased\n",
 61 | " dropout_prob: 0.2\n",
 62 | " class_num : 2\n",
 63 | " metrics:\n",
 64 | " - classification_accuracy\n",
 65 | " loss_type: CrossEntropyLoss\n",
 66 | " task_type: SingleSenClassification\n",
 67 | " file_names:\n",
 68 | " - query_correctness_train.tsv\n",
 69 | " - query_correctness_dev.tsv\n",
 70 | " - query_correctness_test.tsv\n",
 71 | "```"
 72 | ]
 73 | },
 74 | {
 75 | "cell_type": "code",
 76 | "execution_count": null,
 77 | "metadata": {},
 78 | "outputs": [],
 79 | "source": [
 80 | "!python ../../data_preparation.py \\\n",
 81 | " --task_file 'tasks_file_query_correctness.yml' \\\n",
 82 | " --data_dir '../../data' \\\n",
 83 | " --max_seq_len 50"
 84 | ]
 85 | },
 86 | {
 87 | "cell_type": "markdown",
 88 | "metadata": {},
 89 | "source": [
 90 | "# Step - 3 Running train"
 91 | ]
 92 | },
 93 | {
 94 | "cell_type": "code",
 95 | "execution_count": null,
 96 | "metadata": {},
 97 | "outputs": [],
 98 | "source": [
 99 | "!python ../../train.py \\\n",
 100 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n",
 101 | " --task_file 'tasks_file_query_correctness.yml' \\\n",
 102 | " --out_dir 'query_correctness_bert_base' \\\n",
 103 | " --epochs 10 \\\n",
 104 | " --train_batch_size 16 \\\n",
 105 | " --eval_batch_size 32 \\\n",
 106 | " --grad_accumulation_steps 1 \\\n",
 107 | " --log_per_updates 20 \\\n",
 108 | " --max_seq_len 50 \\\n",
 109 | " --eval_while_train \\\n",
 110 | " --test_while_train \\\n",
 111 | " --silent"
 112 | ]
 113 | },
 114 | {
 115 | "cell_type": "markdown",
 116 | "metadata": {},
 117 | "source": [
 118 | "# Step - 4 Infering"
 119 | ]
 120 | },
 121 | {
 122 | "cell_type": "code",
 123 | "execution_count": null,
 124 | "metadata": {},
 125 | "outputs": [],
 126 | "source": [
 127 | "import sys\n",
 128 | "sys.path.insert(1, '../../')\n",
 129 | "from infer_pipeline import inferPipeline"
 130 | ]
 131 | },
 132 | {
 133 | "cell_type": "code",
 134 | "execution_count": null,
 135 | "metadata": {},
 136 | "outputs": [],
 137 | "source": []
 138 | }
 139 | ],
 140 | "metadata": {
 141 | "kernelspec": {
 142 | "display_name": "Python 3",
 143 | 
"language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.7.3" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 4 161 | } 162 | -------------------------------------------------------------------------------- /examples/query_correctness/tasks_file_query_correctness.yml: -------------------------------------------------------------------------------- 1 | querycorrectness: 2 | model_type: BERT 3 | config_name: bert-base-uncased 4 | dropout_prob: 0.2 5 | class_num : 2 6 | metrics: 7 | - classification_accuracy 8 | loss_type: CrossEntropyLoss 9 | task_type: SingleSenClassification 10 | file_names: 11 | - query_correctness_train.tsv 12 | - query_correctness_dev.tsv 13 | - query_correctness_test.tsv -------------------------------------------------------------------------------- /examples/query_correctness/transform_file_query_correctness.yml: -------------------------------------------------------------------------------- 1 | transform1: 2 | transform_func: query_correctness_to_tsv 3 | read_file_names: 4 | - train.tsv 5 | - dev.tsv 6 | - test.tsv 7 | 8 | read_dir: query_correctness_data 9 | save_dir: ../../data -------------------------------------------------------------------------------- /examples/query_pair_similarity/query_similarity_qqp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EXAMPLE - 7\n", 8 | "\n", 9 | "**Tasks :- Query similarity**\n", 10 | "\n", 11 | "**Tasks Description**\n", 12 | "\n", 13 | "``Query similarity`` :- This is a sentence pair classification task which determines whether the second sentence in a sample can be inferred from the first.\n", 14 | "\n", 15 | "**Conversational Utility** :- In conversational AI context, this task can be seen as determining whether the second sentence is similar to first or not. Additionally, the probability score can also be used as a similarity score between the sentences. \n", 16 | "\n", 17 | "**Data** :- In this example, we are using the SNLI data which is having sentence pairs and labels.\n", 18 | "\n", 19 | "The data can be downloaded using the following ``wget`` command and unzipped using ``unzip`` command." 
20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "!wget qim.fs.quoracdn.net/quora_duplicate_questions.tsv -P qqp_data/" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Step -1 Data Transformations\n", 36 | "\n", 37 | "Defining transform file\n", 38 | "\n", 39 | "```\n", 40 | "sample_transform:\n", 41 | " transform_func: qqp_query_similarity_to_tsv\n", 42 | " read_file_names:\n", 43 | " - quora_duplicate_questions.tsv\n", 44 | " read_dir : qqp_data\n", 45 | " save_dir: ../../data\n", 46 | "```" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "!python ../../data_transformations.py \\\n", 56 | " --transform_file 'transform_file_qqp.yml'" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "# Step -2 Data Preparation\n", 64 | "\n", 65 | "Defining task file for query similarity detection with QQP data\n", 66 | "\n", 67 | "```\n", 68 | "querysimilarity:\n", 69 | " model_type: BERT\n", 70 | " config_name: bert-base-uncased\n", 71 | " dropout_prob: 0.2\n", 72 | " metrics:\n", 73 | " - classification_accuracy\n", 74 | " loss_type: CrossEntropyLoss\n", 75 | " class_num: 2\n", 76 | " task_type: SentencePairClassification\n", 77 | " file_names:\n", 78 | " - qqp_query_similarity_train.tsv\n", 79 | " - qqp_query_similarity_dev.tsv\n", 80 | " - qqp_query_similarity_test.tsv\n", 81 | "```" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "!python ../../data_preparation.py \\\n", 91 | " --task_file 'tasks_file_qqp.yml' \\\n", 92 | " --data_dir '../../data' \\\n", 93 | " --max_seq_len 200" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "# Step - 3 Running train\n", 101 | "\n", 102 | "Following command will start the training for the tasks. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n", 103 | "\n", 104 | "For knowing more details about the train process, refer to running training in documentation." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "!python ../../train.py \\\n", 114 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n", 115 | " --task_file 'tasks_file_qqp.yml' \\\n", 116 | " --out_dir 'qqp_query_similarity_bert_base' \\\n", 117 | " --epochs 3 \\\n", 118 | " --train_batch_size 32 \\\n", 119 | " --eval_batch_size 32 \\\n", 120 | " --grad_accumulation_steps 2 \\\n", 121 | " --log_per_updates 100 \\\n", 122 | " --save_per_updates 3000 \\\n", 123 | " --limit_save 6 \\\n", 124 | " --max_seq_len 200 \\\n", 125 | " --eval_while_train \\\n", 126 | " --test_while_train \\\n", 127 | " --silent" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "# Step - 4 Infering\n", 135 | "\n", 136 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n", 137 | "The trained model and maximum sequence length to be used needs to be specified.\n", 138 | "\n", 139 | "For knowing more details about infering, refer to infer pipeline in documentation." 
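,
    "\n",
    "A minimal usage sketch is given below, assuming a checkpoint saved under ``qqp_query_similarity_bert_base``. The file name and the question pair are illustrative assumptions; use the model file saved by your training run.\n",
    "\n",
    "```\n",
    "pipe = inferPipeline(modelPath = 'qqp_query_similarity_bert_base/multi_task_model.pt', maxSeqLen = 200)\n",
    "samples = [['How do I learn python?', 'What is the best way to learn python?']]\n",
    "print(pipe.infer(samples, ['querysimilarity']))\n",
    "```"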
140 | ]
 141 | },
 142 | {
 143 | "cell_type": "code",
 144 | "execution_count": null,
 145 | "metadata": {},
 146 | "outputs": [],
 147 | "source": [
 148 | "import sys\n",
 149 | "sys.path.insert(1, '../../')\n",
 150 | "from infer_pipeline import inferPipeline"
 151 | ]
 152 | },
 153 | {
 154 | "cell_type": "code",
 155 | "execution_count": null,
 156 | "metadata": {},
 157 | "outputs": [],
 158 | "source": []
 159 | }
 160 | ],
 161 | "metadata": {
 162 | "kernelspec": {
 163 | "display_name": "Python 3",
 164 | "language": "python",
 165 | "name": "python3"
 166 | },
 167 | "language_info": {
 168 | "codemirror_mode": {
 169 | "name": "ipython",
 170 | "version": 3
 171 | },
 172 | "file_extension": ".py",
 173 | "mimetype": "text/x-python",
 174 | "name": "python",
 175 | "nbconvert_exporter": "python",
 176 | "pygments_lexer": "ipython3",
 177 | "version": "3.7.3"
 178 | }
 179 | },
 180 | "nbformat": 4,
 181 | "nbformat_minor": 4
 182 | }
 183 | 
-------------------------------------------------------------------------------- /examples/query_pair_similarity/tasks_file_qqp.yml: --------------------------------------------------------------------------------
 1 | querysimilarity:
 2 | model_type: BERT
 3 | config_name: bert-base-uncased
 4 | dropout_prob: 0.2
 5 | metrics:
 6 | - classification_accuracy
 7 | loss_type: CrossEntropyLoss
 8 | class_num: 2
 9 | task_type: SentencePairClassification
 10 | file_names:
 11 | - qqp_query_similarity_train.tsv
 12 | - qqp_query_similarity_dev.tsv
 13 | - qqp_query_similarity_test.tsv
-------------------------------------------------------------------------------- /examples/query_pair_similarity/transform_file_qqp.yml: --------------------------------------------------------------------------------
 1 | sample_transform:
 2 | transform_func: qqp_query_similarity_to_tsv
 3 | read_file_names:
 4 | - quora_duplicate_questions.tsv
 5 | read_dir : qqp_data
 6 | save_dir: ../../data
-------------------------------------------------------------------------------- /examples/query_type_detection/query_type_detection.ipynb: --------------------------------------------------------------------------------
 1 | {
 2 | "cells": [
 3 | {
 4 | "cell_type": "markdown",
 5 | "metadata": {},
 6 | "source": [
 7 | "# Example - 4\n",
 8 | "\n",
 9 | "**Tasks :- Query type detection**\n",
 10 | "\n",
 11 | "**Tasks Description**\n",
 12 | "\n",
 13 | "``querytype`` :- This is a single sentence classification task to determine what type (category) of answer is expected for the given query. The queries are divided into 5 major classes according to the answer expected for them.\n",
 14 | "\n",
 15 | "**Conversational Utility** :- While returning a response for a query, knowing what kind of answer is expected for the query can help in both curating and cross-verifying an answer according to the type.\n",
 16 | "\n",
 17 | "**Data** :- In this example, we are using the MSMARCO QnA data. Queries are divided into 5 query types - NUMERIC, LOCATION, ENTITY, DESCRIPTION, PERSON.\n",
 18 | "\n",
 19 | "The data can be downloaded using the following ``wget`` commands and decompressed using the ``gunzip`` command."
20 | ]
 21 | },
 22 | {
 23 | "cell_type": "code",
 24 | "execution_count": null,
 25 | "metadata": {},
 26 | "outputs": [],
 27 | "source": [
 28 | "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz -P msmarco_qna_data\n",
 29 | "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz -P msmarco_qna_data\n",
 30 | "!wget https://msmarco.blob.core.windows.net/msmarco/eval_v2.1_public.json.gz -P msmarco_qna_data"
 31 | ]
 32 | },
 33 | {
 34 | "cell_type": "code",
 35 | "execution_count": null,
 36 | "metadata": {},
 37 | "outputs": [],
 38 | "source": [
 39 | "!gunzip msmarco_qna_data/train_v2.1.json.gz\n",
 40 | "!gunzip msmarco_qna_data/dev_v2.1.json.gz\n",
 41 | "!gunzip msmarco_qna_data/eval_v2.1_public.json.gz"
 42 | ]
 43 | },
 44 | {
 45 | "cell_type": "markdown",
 46 | "metadata": {},
 47 | "source": [
 48 | "# Step - 1: Transforming data\n",
 49 | "\n",
 50 | "The data is present in *JSON* format containing various data fields for each sample. We only consider the ``query`` and ``query_type`` in this example. The data is fairly large, hence we set ``data_frac`` to 0.2 by default. You can change this in case you want to consider more data.\n",
 51 | "\n",
 52 | "We already provide a sample transformation function ``msmarco_query_type_to_tsv`` to convert this data to the required tsv format. \n",
 53 | "\n",
 54 | "Running data transformations will save the required train, dev and test tsv data files under the ``data`` directory in the root of the library. For more details on the data transformation process, refer to data transformations in documentation.\n",
 55 | "\n",
 56 | "The transformation file should have the following details; it is already created at ``transform_file_querytype.yml``.\n",
 57 | "\n",
 58 | "```\n",
 59 | "transform1:\n",
 60 | " transform_func: msmarco_query_type_to_tsv\n",
 61 | " transform_params:\n",
 62 | " data_frac : 0.2\n",
 63 | " read_file_names:\n",
 64 | " - train_v2.1.json\n",
 65 | " - dev_v2.1.json\n",
 66 | " - eval_v2.1_public.json\n",
 67 | "\n",
 68 | " read_dir: msmarco_qna_data\n",
 69 | " save_dir: ../../data\n",
 70 | " ```\n",
 71 | " The following command can be used to run the data transformation for the tasks."
 72 | ]
 73 | },
 74 | {
 75 | "cell_type": "code",
 76 | "execution_count": null,
 77 | "metadata": {},
 78 | "outputs": [],
 79 | "source": [
 80 | "!python ../../data_transformations.py \\\n",
 81 | " --transform_file 'transform_file_querytype.yml'"
 82 | ]
 83 | },
 84 | {
 85 | "cell_type": "markdown",
 86 | "metadata": {},
 87 | "source": [
 88 | "# Step - 2 Data Preparation\n",
 89 | "\n",
 90 | "For more details on the data preparation process, refer to data preparation in documentation.\n",
 91 | "\n",
 92 | "Defining the tasks file for training a single model for the query type detection task. 
The file is already created at ``tasks_file_querytype.yml``\n", 93 | "```\n", 94 | "querytype:\n", 95 | " model_type: BERT\n", 96 | " config_name: bert-base-uncased\n", 97 | " dropout_prob: 0.2\n", 98 | " label_map_or_file:\n", 99 | " - DESCRIPTION\n", 100 | " - ENTITY\n", 101 | " - LOCATION\n", 102 | " - NUMERIC\n", 103 | " - PERSON\n", 104 | " metrics:\n", 105 | " - classification_accuracy\n", 106 | " loss_type: CrossEntropyLoss\n", 107 | " task_type: SingleSenClassification\n", 108 | " file_names:\n", 109 | " - querytype_train_v2.1.tsv\n", 110 | " - querytype_dev_v2.1.tsv\n", 111 | " - querytype_eval_v2.1_public.tsv\n", 112 | "```" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "!python ../../data_preparation.py \\\n", 122 | " --task_file 'tasks_file_querytype.yml' \\\n", 123 | " --data_dir '../../data' \\\n", 124 | " --max_seq_len 60" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# Step - 3 Running train\n", 132 | "\n", 133 | "Following command will start the training for the tasks. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n", 134 | "\n", 135 | "For knowing more details about the train process, refer to running training in documentation." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "!python ../../train.py \\\n", 145 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n", 146 | " --task_file 'tasks_file_querytype.yml' \\\n", 147 | " --out_dir 'msmarco_querytype_bert_base' \\\n", 148 | " --epochs 4 \\\n", 149 | " --train_batch_size 64 \\\n", 150 | " --eval_batch_size 64 \\\n", 151 | " --grad_accumulation_steps 1 \\\n", 152 | " --log_per_updates 100 \\\n", 153 | " --max_seq_len 60 \\\n", 154 | " --eval_while_train \\\n", 155 | " --test_while_train \\\n", 156 | " --silent" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "# Step - 4 Infering\n", 164 | "\n", 165 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n", 166 | "The trained model and maximum sequence length to be used needs to be specified.\n", 167 | "\n", 168 | "For knowing more details about infering, refer to infer pipeline in documentation." 
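,
    "\n",
    "A minimal usage sketch is given below, assuming a checkpoint saved under ``msmarco_querytype_bert_base``. The file name and the query are illustrative assumptions; use the model file saved by your training run.\n",
    "\n",
    "```\n",
    "pipe = inferPipeline(modelPath = 'msmarco_querytype_bert_base/multi_task_model.pt', maxSeqLen = 60)\n",
    "samples = [['where is mount everest located']]\n",
    "print(pipe.infer(samples, ['querytype']))\n",
    "```"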
169 | ]
 170 | },
 171 | {
 172 | "cell_type": "code",
 173 | "execution_count": 1,
 174 | "metadata": {},
 175 | "outputs": [
 176 | {
 177 | "name": "stderr",
 178 | "output_type": "stream",
 179 | "text": [
 180 | "Using TensorFlow backend.\n"
 181 | ]
 182 | }
 183 | ],
 184 | "source": [
 185 | "import sys\n",
 186 | "sys.path.insert(1, '../../')\n",
 187 | "from infer_pipeline import inferPipeline"
 188 | ]
 189 | },
 190 | {
 191 | "cell_type": "code",
 192 | "execution_count": null,
 193 | "metadata": {},
 194 | "outputs": [],
 195 | "source": []
 196 | },
 197 | {
 198 | "cell_type": "code",
 199 | "execution_count": null,
 200 | "metadata": {},
 201 | "outputs": [],
 202 | "source": []
 203 | },
 204 | {
 205 | "cell_type": "code",
 206 | "execution_count": null,
 207 | "metadata": {},
 208 | "outputs": [],
 209 | "source": []
 210 | }
 211 | ],
 212 | "metadata": {
 213 | "kernelspec": {
 214 | "display_name": "Python 3",
 215 | "language": "python",
 216 | "name": "python3"
 217 | },
 218 | "language_info": {
 219 | "codemirror_mode": {
 220 | "name": "ipython",
 221 | "version": 3
 222 | },
 223 | "file_extension": ".py",
 224 | "mimetype": "text/x-python",
 225 | "name": "python",
 226 | "nbconvert_exporter": "python",
 227 | "pygments_lexer": "ipython3",
 228 | "version": "3.7.3"
 229 | }
 230 | },
 231 | "nbformat": 4,
 232 | "nbformat_minor": 4
 233 | }
 234 | 
-------------------------------------------------------------------------------- /examples/query_type_detection/tasks_file_querytype.yml: --------------------------------------------------------------------------------
 1 | querytype:
 2 | model_type: BERT
 3 | config_name: bert-base-uncased
 4 | dropout_prob: 0.2
 5 | label_map_or_file:
 6 | - DESCRIPTION
 7 | - ENTITY
 8 | - LOCATION
 9 | - NUMERIC
 10 | - PERSON
 11 | metrics:
 12 | - classification_accuracy
 13 | loss_type: CrossEntropyLoss
 14 | task_type: SingleSenClassification
 15 | file_names:
 16 | - querytype_train_v2.1.tsv
 17 | - querytype_dev_v2.1.tsv
 18 | - querytype_eval_v2.1_public.tsv
-------------------------------------------------------------------------------- /examples/query_type_detection/transform_file_querytype.yml: --------------------------------------------------------------------------------
 1 | transform1:
 2 | transform_func: msmarco_query_type_to_tsv
 3 | transform_params:
 4 | data_frac : 0.2
 5 | read_file_names:
 6 | - train_v2.1.json
 7 | - dev_v2.1.json
 8 | - eval_v2.1_public.json
 9 | 
 10 | read_dir: msmarco_qna_data
 11 | save_dir: ../../data
-------------------------------------------------------------------------------- /examples/sentiment_analysis/IMDb_sentiment_analysis.ipynb: --------------------------------------------------------------------------------
 1 | {
 2 | "cells": [
 3 | {
 4 | "cell_type": "markdown",
 5 | "metadata": {},
 6 | "source": [
 7 | "# EXAMPLE - 8\n",
 8 | "\n",
 9 | "**Tasks :- Sentiment analysis**\n",
 10 | "\n",
 11 | "**Tasks Description**\n",
 12 | "\n",
 13 | "``sentiment`` :- This is modeled as a single sentence classification task to determine whether a piece of text conveys a positive or negative sentiment.\n",
 14 | "\n",
 15 | "**Conversational Utility** :- To determine whether a review is positive or negative.\n",
 16 | "\n",
 17 | "**Data** :- In this example, we are using the IMDB data which can be downloaded after accepting the terms and saved under `imdb_data` directory. 
The data has a total of 50k samples labeled as positive or negative.\n"
 18 | ]
 19 | },
 20 | {
 21 | "cell_type": "code",
 22 | "execution_count": null,
 23 | "metadata": {},
 24 | "outputs": [],
 25 | "source": [
 26 | "!unzip imdb_data/134715_320111_bundle_archive.zip -d imdb_data/"
 27 | ]
 28 | },
 29 | {
 30 | "cell_type": "code",
 31 | "execution_count": null,
 32 | "metadata": {},
 33 | "outputs": [],
 34 | "source": [
 35 | "!mv imdb_data/IMDB\\ Dataset.csv imdb_data/imdb_sentiment_data.csv"
 36 | ]
 37 | },
 38 | {
 39 | "cell_type": "markdown",
 40 | "metadata": {},
 41 | "source": [
 42 | "# Step - 1: Transforming data\n",
 43 | "The data file `imdb_sentiment_data.csv` has 50k samples with two columns - review and sentiment. Sentiment is the label which can be positive or negative.\n",
 44 | "We already provide a sample transformation function ``imdb_sentiment_data_to_tsv`` to convert this data to the required tsv format.\n",
 45 | "Running data transformations will save the required train and test tsv data files under the ``data`` directory in the root of the library. For more details on the data transformation process, refer to data transformations in documentation.\n",
 46 | "\n",
 47 | "The transformation file should have the following details; it is already created at ``transform_file_imdb.yml``.\n",
 48 | "\n",
 49 | "```\n",
 50 | "transform1:\n",
 51 | " transform_func: imdb_sentiment_data_to_tsv\n",
 52 | " read_file_names:\n",
 53 | " - imdb_sentiment_data.csv\n",
 54 | " read_dir: imdb_data\n",
 55 | " save_dir: ../../data\n",
 56 | "```"
 57 | ]
 58 | },
 59 | {
 60 | "cell_type": "code",
 61 | "execution_count": null,
 62 | "metadata": {},
 63 | "outputs": [],
 64 | "source": [
 65 | "!python ../../data_transformations.py \\\n",
 66 | " --transform_file 'transform_file_imdb.yml'"
 67 | ]
 68 | },
 69 | {
 70 | "cell_type": "markdown",
 71 | "metadata": {},
 72 | "source": [
 73 | "# Step - 2 Data Preparation\n",
 74 | "\n",
 75 | "For more details on the data preparation process, refer to data preparation in documentation.\n",
 76 | "\n",
 77 | "Defining the tasks file for training a single model for the sentiment task. The file is already created at ``tasks_file_imdb.yml``\n",
 78 | "\n",
 79 | "```\n",
 80 | "sentiment:\n",
 81 | " model_type: BERT\n",
 82 | " config_name: bert-base-uncased\n",
 83 | " dropout_prob: 0.2\n",
 84 | " label_map_or_file:\n",
 85 | " - negative\n",
 86 | " - positive\n",
 87 | " class_num: 2\n",
 88 | " metrics:\n",
 89 | " - classification_accuracy\n",
 90 | " loss_type: CrossEntropyLoss\n",
 91 | " task_type: SingleSenClassification\n",
 92 | " file_names:\n",
 93 | " - imdb_sentiment_train.tsv\n",
 94 | " - imdb_sentiment_test.tsv\n",
 95 | "```"
 96 | ]
 97 | },
 98 | {
 99 | "cell_type": "code",
 100 | "execution_count": null,
 101 | "metadata": {},
 102 | "outputs": [],
 103 | "source": [
 104 | "!python ../../data_preparation.py \\\n",
 105 | " --task_file 'tasks_file_imdb.yml' \\\n",
 106 | " --data_dir '../../data' \\\n",
 107 | " --max_seq_len 200"
 108 | ]
 109 | },
 110 | {
 111 | "cell_type": "markdown",
 112 | "metadata": {},
 113 | "source": [
 114 | "# Step - 3 Running train\n",
 115 | "\n",
 116 | "Following command will start the training for the tasks. The log file reporting the loss, metrics and the tensorboard logs will be present in a time-stamped directory.\n",
 117 | "\n",
 118 | "For knowing more details about the train process, refer to running training in documentation."
119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "!python ../../train.py \\\n", 128 | " --data_dir '../../data/bert-base-uncased_prepared_data' \\\n", 129 | " --task_file 'tasks_file_imdb.yml' \\\n", 130 | " --out_dir 'imdb_sentiment_bert_base' \\\n", 131 | " --epochs 8 \\\n", 132 | " --train_batch_size 32 \\\n", 133 | " --eval_batch_size 32 \\\n", 134 | " --max_seq_len 200 \\\n", 135 | " --grad_accumulation_steps 1 \\\n", 136 | " --log_per_updates 50 \\\n", 137 | " --eval_while_train \\\n", 138 | " --silent" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# Step - 4 Infering\n", 146 | "\n", 147 | "You can import and use the ``inferPipeline`` to get predictions for the required tasks.\n", 148 | "The trained model and maximum sequence length to be used needs to be specified.\n", 149 | "\n", 150 | "For knowing more details about infering, refer to infer pipeline in documentation." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "import sys\n", 160 | "sys.path.insert(1, '../../')\n", 161 | "from infer_pipeline import inferPipeline" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.7.3" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 4 193 | } 194 | -------------------------------------------------------------------------------- /examples/sentiment_analysis/tasks_file_imdb.yml: -------------------------------------------------------------------------------- 1 | sentiment: 2 | model_type: BERT 3 | config_name: bert-base-uncased 4 | dropout_prob: 0.2 5 | label_map_or_file: 6 | - negative 7 | - positive 8 | class_num: 2 9 | metrics: 10 | - classification_accuracy 11 | loss_type: CrossEntropyLoss 12 | task_type: SingleSenClassification 13 | file_names: 14 | - imdb_sentiment_train.tsv 15 | - imdb_sentiment_test.tsv -------------------------------------------------------------------------------- /examples/sentiment_analysis/transform_file_imdb.yml: -------------------------------------------------------------------------------- 1 | transform1: 2 | transform_func: imdb_sentiment_data_to_tsv 3 | read_file_names: 4 | - imdb_sentiment_data.csv 5 | read_dir: imdb_data 6 | save_dir: ../../data -------------------------------------------------------------------------------- /infer_pipeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline for inference on batch for multi-task 3 | """ 4 | from utils.task_utils import TasksParam 5 | from utils.data_utils import TaskType, ModelType, NLP_MODELS 6 | from models.eval import evaluate 7 | from models.model import multiTaskModel 8 | from data_preparation import * 9 | from models.data_manager import allTasksDataset, Batcher, batchUtils 10 | from torch.utils.data import Dataset, DataLoader, BatchSampler 11 | import 
argparse 12 | import os 13 | import torch 14 | import logging 15 | logger = logging.getLogger("multi_task") 16 | 17 | class inferPipeline: 18 | 19 | """ 20 | For running inference on samples using a trained model for say TaskA, TaskB and TaskC, 21 | you can import this class and load the corresponding multi-task model by making an 22 | object of this class with the following arguments 23 | 24 | Args: 25 | modelPath (:obj:`str`) : Path to the trained multi-task model for required tasks. 26 | maxSeqLen (:obj:`int`, defaults to :obj:`128`) : maximum sequence length to be considered for samples. 27 | Truncating and padding will happen accordingly. 28 | 29 | Example:: 30 | 31 | >>> from infer_pipeline import inferPipeline 32 | >>> pipe = inferPipeline(modelPath = 'sample_out_dir/multi_task_model.pt', maxSeqLen = 50) 33 | 34 | """ 35 | 36 | def __init__(self, modelPath, maxSeqLen = 128): 37 | 38 | device = torch.device('cpu') 39 | if torch.cuda.is_available(): 40 | device = torch.device('cuda') 41 | 42 | self.maxSeqLen = maxSeqLen 43 | self.modelPath = modelPath 44 | assert os.path.exists(self.modelPath), "saved model not present at {}".format(self.modelPath) 45 | 46 | loadedDict = torch.load(self.modelPath, map_location=device) 47 | self.taskParams = loadedDict['task_params'] 48 | logger.info('Task Params loaded from saved model.') 49 | 50 | modelName = self.taskParams.modelType.name.lower() 51 | _, _ , tokenizerClass, defaultName = NLP_MODELS[modelName] 52 | configName = self.taskParams.modelConfig 53 | if configName is None: 54 | configName = defaultName 55 | #making tokenizer for model 56 | self.tokenizer = tokenizerClass.from_pretrained(configName) 57 | logger.info('{} model tokenizer loaded for config {}'.format(modelName, configName)) 58 | 59 | allParams = {} 60 | allParams['task_params'] = self.taskParams 61 | allParams['gpu'] = torch.cuda.is_available() 62 | # dummy values 63 | allParams['num_train_steps'] = 10 64 | allParams['warmup_steps'] = 0 65 | allParams['learning_rate'] = 2e-5 66 | allParams['epsilon'] = 1e-8 67 | 68 | #making and loading model 69 | self.model = multiTaskModel(allParams) 70 | self.model.load_multi_task_model(loadedDict) 71 | 72 | def make_feature_samples(self, dataList, taskType, taskName): 73 | allData = [] 74 | for i, sample in enumerate(dataList): 75 | if taskType == TaskType.SingleSenClassification: 76 | inputIds, typeIds, inputMask = standard_data_converter(self.maxSeqLen, self.tokenizer, sample[0]) 77 | features = { 78 | 'uid': i, 79 | 'label': 0, 80 | 'token_id': inputIds, 81 | 'type_id': typeIds, 82 | 'mask': inputMask} 83 | 84 | elif taskType == TaskType.SentencePairClassification: 85 | inputIds, typeIds, inputMask = standard_data_converter(self.maxSeqLen, self.tokenizer, sample[0], sample[1]) 86 | features = { 87 | 'uid': i, 88 | 'label': 0, 89 | 'token_id': inputIds, 90 | 'type_id': typeIds, 91 | 'mask': inputMask} 92 | 93 | elif taskType == TaskType.NER: 94 | 95 | splitSample = sample[0].split() 96 | label = ["O"]*len(splitSample) 97 | tempTokens = ['[CLS]'] 98 | tempLabels = ['[CLS]'] 99 | for word, label in zip(splitSample, label): 100 | tokens = self.tokenizer.tokenize(word) 101 | for m, token in enumerate(tokens): 102 | tempTokens.append(token) 103 | #only first piece would be marked with label 104 | if m==0: 105 | tempLabels.append(label) 106 | else: 107 | tempLabels.append('X') 108 | # adding [SEP] at end 109 | tempTokens.append('[SEP]') 110 | tempLabels.append('[SEP]') 111 | 112 | out = self.tokenizer.encode_plus(text = tempTokens, 
add_special_tokens=False, 113 | truncation_strategy ='only_first', 114 | max_length = self.maxSeqLen, pad_to_max_length=True) 115 | typeIds = None 116 | inputMask = None 117 | tokenIds = out['input_ids'] 118 | if 'token_type_ids' in out.keys(): 119 | typeIds = out['token_type_ids'] 120 | if 'attention_mask' in out.keys(): 121 | inputMask = out['attention_mask'] 122 | 123 | labelMap = self.taskParams.labelMap[taskName] 124 | tempLabelsEnc = pad_sequences([ [labelMap[l] for l in tempLabels] ], 125 | maxlen=self.maxSeqLen, value=labelMap["O"], padding="post", 126 | dtype="long", truncating="post").tolist()[0] 127 | #print(tempLabelsEnc) 128 | assert len(tempLabelsEnc) == len(tokenIds), "mismatch between processed tokens and labels" 129 | features = { 130 | 'uid': i, 131 | 'label': tempLabelsEnc, 132 | 'token_id': tokenIds, 133 | 'type_id': typeIds, 134 | 'mask': inputMask} 135 | else: 136 | raise ValueError(taskType) 137 | 138 | allData.append(features) 139 | 140 | return allData 141 | def format_ner_output(self, sample, result): 142 | assert len(sample) == len(result), "length of sample and result list not same" 143 | returnList = [] 144 | for i, (sam, res) in enumerate(zip(sample, result)): 145 | if res not in ["O", "[CLS]", "[SEP]", "X"]: 146 | curr = res.split('-')[-1] 147 | if len(returnList)>0: 148 | if curr == returnList[len(returnList)-1][0]: 149 | returnList[len(returnList)-1].append(sam) 150 | else: 151 | returnList.append([curr, sam]) 152 | else: 153 | returnList.append([curr, sam]) 154 | #print(returnList) 155 | outList = [] 156 | for finalSam in returnList: 157 | #print(finalSam) 158 | outS = ' '.join(finalSam[1:]) 159 | #print(outS) 160 | outList.append((finalSam[0], outS)) 161 | #print('{} : {}'.format(finalSam[0], outS)) 162 | 163 | return outList 164 | 165 | def format_output(self, dataList, allIds, allPreds, allScores): 166 | returnList = [] 167 | for sampleId in range(len(dataList)): 168 | resDict = {} 169 | #print("\nInput Sample : ", dataList[sampleId]) 170 | resDict['Query'] = dataList[sampleId] 171 | for i in range(len(allIds)): 172 | taskName = self.taskParams.taskIdNameMap[i] 173 | taskType = self.taskParams.taskTypeMap[taskName] 174 | if allPreds[i] == []: 175 | continue 176 | 177 | if taskType == TaskType.NER: 178 | result = allPreds[i][sampleId] 179 | inpp = dataList[sampleId][0].split() 180 | #print("{} : ".format(taskName)) 181 | result = self.format_ner_output(inpp, result) 182 | else: 183 | result = [allPreds[i][sampleId], allScores[i][sampleId]] 184 | 185 | resDict[taskName] = result 186 | #else: 187 | #print("{} : {}".format(taskName, result)) 188 | returnList.append(resDict) 189 | #print(returnList) 190 | return returnList 191 | 192 | 193 | def infer(self, dataList, taskNamesList, batchSize = 8, seed=42): 194 | 195 | """ 196 | This is the function which can be called to get the predictions for input samples 197 | for the mentioned tasks. 198 | 199 | - Samples can be packed in a ``list of lists`` manner as the function processes inputs in batch. 200 | - In case, an input sample requires sentence pair, the two sentences can be kept as elements of the list. 201 | - In case of single sentence classification or NER tasks, only the first element of a sample will be used. 202 | - For NER, the infer function automatically splits the sentence into tokens. 203 | - All the tasks mentioned in ``taskNamesList`` are performed for all the input samples. 204 | 205 | Args: 206 | 207 | dataList (:obj:`list of lists`) : A batch of input samples. For eg. 
208 | 
 209 | [
 210 | [<sample1 sentenceA>, <sample1 sentenceB>],
 211 | 
 212 | [<sample2 sentenceA>, <sample2 sentenceB>],
 213 | 
 214 | ]
 215 | 
 216 | or in case all the tasks just require single sentence inputs,
 217 | 
 218 | [
 219 | [<sample1 sentence>],
 220 | 
 221 | [<sample2 sentence>],
 222 | 
 223 | ]
 224 | 
 225 | taskNamesList (:obj:`list`) : List of tasks to be performed on dataList samples. For eg.
 226 | 
 227 | ['TaskA', 'TaskB', 'TaskC']
 228 | 
 229 | You can choose the tasks you want to infer. For eg.
 230 | 
 231 | ['TaskB']
 232 | 
 233 | batchSize (:obj:`int`, defaults to :obj:`8`) : Batch size for running inference.
 234 | 
 235 | 
 236 | Return:
 237 | 
 238 | outList (:obj:`list of objects`) :
 239 | List of dictionary objects where each object contains one corresponding input sample and its task outputs. The task outputs
 240 | can also contain the confidence scores. For eg.
 241 | 
 242 | [
 243 | {'Query' : [<input sample>],
 244 | 
 245 | 'TaskA' : <TaskA output>,
 246 | 
 247 | 'TaskB' : <TaskB output>,
 248 | 
 249 | 'TaskC' : <TaskC output>},
 250 | 
 251 | ]
 252 | 
 253 | Example::
 254 | 
 255 | >>> samples = [ ['sample_sentence_1'], ['sample_sentence_2'] ]
 256 | >>> tasks = ['TaskA', 'TaskB']
 257 | >>> pipe.infer(samples, tasks)
 258 | 
 259 | """
 260 | #print(dataList)
 261 | #print(taskNamesList)
 262 | allTasksList = []
 263 | for taskName in taskNamesList:
 264 | assert taskName in self.taskParams.taskIdNameMap.values(), "task Name not in task names for loaded model"
 265 | taskId = [taskId for taskId, tName in self.taskParams.taskIdNameMap.items() if tName==taskName][0]
 266 | taskType = self.taskParams.taskTypeMap[taskName]
 267 | 
 268 | taskData = self.make_feature_samples(dataList, taskType, taskName)
 269 | #print('task data :', taskData)
 270 | 
 271 | tasksDict = {"data_task_id" : int(taskId),
 272 | "data_" : taskData,
 273 | "data_task_type" : taskType,
 274 | "data_task_name" : taskName}
 275 | allTasksList.append(tasksDict)
 276 | 
 277 | allData = allTasksDataset(allTasksList, pipeline=True)
 278 | batchSampler = Batcher(allData, batchSize=batchSize, seed =seed,
 279 | shuffleBatch=False, shuffleTask=False)
 280 | # VERY IMPORTANT TO TURN OFF BATCH SHUFFLE IN INFERENCE.
ELSE PREDICTION SCORES
 281 | # WILL GET JUMBLED
 282 | 
 283 | batchSamplerUtils = batchUtils(isTrain = False, modelType= self.taskParams.modelType,
 284 | maxSeqLen = self.maxSeqLen)
 285 | inferDataLoader = DataLoader(allData, batch_sampler=batchSampler,
 286 | collate_fn=batchSamplerUtils.collate_fn,
 287 | pin_memory=torch.cuda.is_available())
 288 | 
 289 | with torch.no_grad():
 290 | allIds, allPreds, allScores = evaluate(allData, batchSampler, inferDataLoader, self.taskParams,
 291 | self.model, gpu=torch.cuda.is_available(), evalBatchSize=batchSize, needMetrics=False, hasTrueLabels=False,
 292 | returnPred=True)
 293 | 
 294 | finalOutList = self.format_output(dataList, allIds, allPreds, allScores)
 295 | #print(finalOutList)
 296 | return finalOutList
 297 | 
-------------------------------------------------------------------------------- /logger_.py: --------------------------------------------------------------------------------
 1 | '''
 2 | Custom log object to use across all files
 3 | '''
 4 | import logging
 5 | 
 6 | def make_logger(name, logFile, debugMode = False, silent = False):
 7 | 
 8 | # Create a custom log
 9 | log = logging.getLogger(name)
 10 | log.setLevel(logging.DEBUG)
 11 | log.propagate = False
 12 | # Create handlers
 13 | #setting level
 14 | if debugMode:
 15 | c_handler = logging.StreamHandler()
 16 | f_handler = logging.FileHandler(logFile)
 17 | c_handler.setLevel(logging.DEBUG)
 18 | f_handler.setLevel(logging.DEBUG)
 19 | elif silent:
 20 | f_handler = logging.FileHandler(logFile)
 21 | f_handler.setLevel(logging.INFO)
 22 | else:
 23 | c_handler = logging.StreamHandler()
 24 | f_handler = logging.FileHandler(logFile)
 25 | c_handler.setLevel(logging.INFO)
 26 | f_handler.setLevel(logging.INFO)
 27 | 
 28 | 
 29 | # Create formatters and add them to handlers
 30 | f_format = logging.Formatter('%(levelname)s - %(message)s')
 31 | f_handler.setFormatter(f_format)
 32 | # Add handlers to the log
 33 | log.addHandler(f_handler)
 34 | 
 35 | if not silent:
 36 | c_format = logging.Formatter('%(levelname)s - %(message)s')
 37 | c_handler.setFormatter(c_format)
 38 | log.addHandler(c_handler)
 39 | 
 40 | return log
-------------------------------------------------------------------------------- /models/data_manager.py: --------------------------------------------------------------------------------
 1 | '''
 2 | Script to manage datasets for multiple tasks
 3 | '''
 4 | from torch.utils.data import Dataset, DataLoader, BatchSampler
 5 | from utils.data_utils import TaskType, ModelType
 6 | import torch
 7 | import random
 8 | import logging
 9 | import json
 10 | logger = logging.getLogger("multi_task")
 11 | 
 12 | class allTasksDataset(Dataset):
 13 | '''
 14 | class to make a pytorch dataset of the processed data for a specific task
 15 | taskDict :- list of dictionaries. Each dictionary holds the details of a
 16 | dataset to be created for a task
 17 | [ {"data_task_id" : "", "data_path" : "", "data_task_type" : ""},
 18 | ...]
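For illustration, a single-task list could look like the following (the id, path and
name here are hypothetical, only meant to show the expected keys):
[ {"data_task_id" : 0, "data_path" : "../data/bert-base-uncased_prepared_data/conllner_ner_coNLL_train.json",
"data_task_type" : TaskType.NER, "data_task_name" : "conllner"} ]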
19 | ''' 20 | def __init__(self, taskDict, pipeline = False): 21 | self.taskDict = taskDict 22 | self.pipeline = pipeline 23 | self.allTasksData, self.taskIdTypeMap = self.make_all_datasets() 24 | 25 | def read_data(self, readPath): 26 | with open(readPath, 'r', encoding = 'utf-8') as file: 27 | logger.info('Reading data from file {}'.format(readPath)) 28 | taskData = [] 29 | for i, line in enumerate(file): 30 | #if i >=1000: 31 | #continue 32 | sample = json.loads(line) 33 | taskData.append(sample) 34 | return taskData 35 | 36 | def make_all_datasets(self): 37 | ''' 38 | For each dataset entry in the taskDict, this function makes them into corresponding dataset 39 | and returns a dictionary mapping like { : ,} 40 | ''' 41 | allTasksData = {} 42 | taskIdTypeMap = {} # mapping from task id to task type 43 | for task in self.taskDict: 44 | if self.pipeline: 45 | logger.info('Reading data for pipeline') 46 | data = task["data_"] 47 | else: 48 | data = self.read_data(task["data_path"]) 49 | allTasksData[task["data_task_id"]] = data 50 | taskIdTypeMap[task["data_task_id"]] = task["data_task_type"] 51 | logger.info('Read Data for Task Id: {} Task Name: {}. Samples {}'.format(task["data_task_id"], task["data_task_name"], len(data))) 52 | return allTasksData, taskIdTypeMap 53 | 54 | # some standard functions which need to be overridden from Dataset 55 | #class for item, len etc.. 56 | def __len__(self): 57 | return sum(len(v) for k, v in self.allTasksData.items()) 58 | 59 | # get item will be used to fetch a sample when required for the corresponding task id. 60 | def __getitem__(self, idx): 61 | taskId, sampleId = idx 62 | out = {"task": {"task_id": taskId, "task_type": self.taskIdTypeMap[taskId]}, 63 | "sample": self.allTasksData[taskId][sampleId]} 64 | return out 65 | 66 | class Batcher(BatchSampler): 67 | def __init__(self, dataObj, batchSize, shuffleTask = True, shuffleBatch = True, seed = 42): 68 | ''' 69 | dataObj :- An instance of allTasksDataset containing data for all tasks 70 | ''' 71 | self.dataObj = dataObj 72 | self.allTasksData = dataObj.allTasksData 73 | self.batchSize = batchSize 74 | # to shuffle the indices in a batch 75 | self.shuffleBatch = shuffleBatch 76 | # to shuffle the samples picked up among all the tasks 77 | self.shuffleTask = shuffleTask 78 | self.seed = seed 79 | 80 | self.allTasksDataBatchIdxs = [] 81 | self.taskIdxId = [] 82 | for taskId, data in self.allTasksData.items(): 83 | self.allTasksDataBatchIdxs.append(self.make_batches(len(data))) 84 | self.taskIdxId.append(taskId) 85 | 86 | def make_batches(self, dataSize): 87 | batchIdxs = [list(range(i, min(i+self.batchSize, dataSize))) for i in range(0, dataSize, self.batchSize)] 88 | if self.shuffleBatch: 89 | random.seed(self.seed) 90 | random.shuffle(batchIdxs) 91 | return batchIdxs 92 | 93 | def make_task_idxs(self): 94 | ''' 95 | This fn makes task indices for which a corresponding batch is created 96 | eg. [0, 0, 1, 3, 0, 2, 3, 1, 1, ..] 
if task ids are 0,1,2,3
 97 | '''
 98 | taskIdxs = []
 99 | for i in range(len(self.allTasksDataBatchIdxs)):
 100 | taskIdxs += [i]*len(self.allTasksDataBatchIdxs[i])
 101 | if self.shuffleTask:
 102 | random.seed(self.seed)
 103 | random.shuffle(taskIdxs)
 104 | return taskIdxs
 105 | 
 106 | #overriding BatchSampler functions to generate iterators for all tasks
 107 | # and iterate
 108 | def __len__(self):
 109 | return sum(len(data) for taskId, data in self.allTasksData.items())
 110 | 
 111 | def __iter__(self):
 112 | allTasksIters = [iter(item) for item in self.allTasksDataBatchIdxs]
 113 | #all_iters = [iter(item) for item in self._train_data_list]
 114 | allIdxs = self.make_task_idxs()
 115 | for taskIdx in allIdxs:
 116 | # this batch belongs to a specific task id
 117 | batchTaskId = self.taskIdxId[taskIdx]
 118 | batch = next(allTasksIters[taskIdx])
 119 | yield [(batchTaskId, sampleIdx) for sampleIdx in batch]
 120 | 
 121 | def patch_data(self, batch_info, batch_data, gpu = None):
 122 | if gpu:
 123 | for i, part in enumerate(batch_data):
 124 | if part is not None:
 125 | if isinstance(part, torch.Tensor):
 126 | batch_data[i] = part.pin_memory().cuda(non_blocking=True)
 127 | elif isinstance(part, tuple):
 128 | batch_data[i] = tuple(sub_part.pin_memory().cuda(non_blocking=True) for sub_part in part)
 129 | elif isinstance(part, list):
 130 | batch_data[i] = [sub_part.pin_memory().cuda(non_blocking=True) for sub_part in part]
 131 | else:
 132 | raise TypeError("unknown batch data type at %s: %s" % (i, part))
 133 | 
 134 | return batch_info, batch_data
 135 | 
 136 | class batchUtils:
 137 | '''
 138 | This class performs functions which help complete the batch data
 139 | when DataLoader creates batch using allTasksDataset and Batcher.
 140 | Main function would be
 141 | 1. A function to get the various components of input in batch samples and make them into
 142 | Pytorch Tensors like token_id, type_ids, masks.
 143 | 
 144 | 2. Collater function :- This function will use the above function to convert the batch into
 145 | pytorch tensor inputs. Converting all the data into pytorch tensors beforehand might not be a good
 146 | idea memory-wise, hence this custom function converts the batches into tensors on the fly
 147 | by acting as the custom collater function for the DataLoader
 148 | '''
 149 | 
 150 | def __init__(self, isTrain, modelType, maxSeqLen, dropout = 0.005):
 151 | self.isTrain = isTrain
 152 | self.modelType = modelType
 153 | self.maxSeqLen = maxSeqLen
 154 | #self.dropout = dropout
 155 | 
 156 | def check_samples_len(self, batch):
 157 | #function to check whether all samples have the mentioned maxSeqLen
 158 | for samp in batch:
 159 | assert len(samp['token_id']) == self.maxSeqLen, "token_id len doesn't match max seq len"
 160 | # for multiple encoders
 161 | if samp['type_id'] is not None:
 162 | assert len(samp['type_id']) == self.maxSeqLen, "type_id len doesn't match max seq len"
 163 | if samp['mask'] is not None:
 164 | assert len(samp['mask']) == self.maxSeqLen, "mask len doesn't match max seq len"
 165 | 
 166 | def make_batch_to_input_tensor(self, batch):
 167 | #check len in batch data
 168 | self.check_samples_len(batch)
 169 | batchSize = len(batch)
 170 | 
 171 | hasTypeIds = True
 172 | hasAttnMasks = True
 173 | if batch[0]['type_id'] is None:
 174 | hasTypeIds = False
 175 | if batch[0]['mask'] is None:
 176 | hasAttnMasks = False
 177 | 
 178 | #initializing token id, type id, attention mask tensors for this batch
 179 | tokenIdsBatchTensor = torch.LongTensor(batchSize, self.maxSeqLen).fill_(0)
 180 | typeIdsBatchTensor = torch.LongTensor(batchSize, self.maxSeqLen).fill_(0)
 181 | masksBatchTensor = torch.LongTensor(batchSize, self.maxSeqLen).fill_(0)
 182 | 
 183 | #filling in data from sample
 184 | for i, sample in enumerate(batch):
 185 | tokenIdsBatchTensor[i] = torch.LongTensor(sample['token_id'])
 186 | if hasTypeIds:
 187 | typeIdsBatchTensor[i] = torch.LongTensor(sample['type_id'])
 188 | if hasAttnMasks:
 189 | masksBatchTensor[i] = torch.LongTensor(sample['mask'])
 190 | 
 191 | # metadata will store more things like task id, task type etc.
 192 | batchMetaData = {"token_id_pos" : 0, "type_id_pos" : 1, "mask_pos" : 2}
 193 | batchData = [tokenIdsBatchTensor, None, None] #None, None in case type ids, attnMasks not required by model
 194 | if hasTypeIds:
 195 | batchData[1] = typeIdsBatchTensor
 196 | if hasAttnMasks:
 197 | batchData[2] = masksBatchTensor
 198 | return batchMetaData, batchData
 199 | 
 200 | def collate_fn(self, batch):
 201 | '''
 202 | This function will be used by DataLoader to return batches
 203 | '''
 204 | taskId = batch[0]["task"]["task_id"]
 205 | taskType = batch[0]["task"]["task_type"]
 206 | 
 207 | orgBatch = []
 208 | labels = []
 209 | for sample in batch:
 210 | assert sample["task"]["task_id"] == taskId
 211 | assert sample["task"]["task_type"] == taskType
 212 | orgBatch.append(sample["sample"])
 213 | labels.append(sample["sample"]["label"])
 214 | 
 215 | batch = orgBatch
 216 | #making tensor batch data
 217 | batchMetaData, batchData = self.make_batch_to_input_tensor(batch)
 218 | batchMetaData['task_id'] = taskId
 219 | batchMetaData['task_type'] = taskType
 220 | 
 221 | #adding label tensor when training (as they'll be used for loss calculation and update)
 222 | # and in evaluation, it won't go with batch data, rather will keep it with meta data for metrics
 223 | if self.isTrain:
 224 | 
 225 | if taskType in (TaskType.SingleSenClassification, TaskType.SentencePairClassification, TaskType.NER):
 226 | batchData.append(torch.LongTensor(labels))
 227 | 
 228 | #position for label
 229 | batchMetaData['label_pos'] = len(batchData) - 1
 230 | else:
 231 | # for test/eval labels won't be added into batch, but kept in meta data
 232 | # so metric evaluation can be done
 233 | #batchData :- [tokenIdsBatchTensor, typeIdsBatchTensor, MasksBatchTensor]
 234 | batchMetaData['label'] = labels
 235 | 
 236 | batchMetaData['uids'] = [sample['uid'] for sample in batch] # used in scoring
 237 | return batchMetaData, batchData
-------------------------------------------------------------------------------- /models/dropout.py: --------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | class DropoutWrapper(nn.Module):
 6 | """
 7 | This is a dropout wrapper which supports the fixed mask dropout
 8 | """
 9 | def __init__(self, dropout_p=0, enable_vbp=True):
 10 | super(DropoutWrapper, self).__init__()
 11 | """variational dropout means a fixed dropout mask
 12 | ref: https://discuss.pytorch.org/t/dropout-for-rnns/633/11
 13 | """
 14 | self.enable_variational_dropout = enable_vbp
 15 | self.dropout_p = dropout_p
 16 | 
 17 | def forward(self, x):
 18 | """
 19 | :param x: batch * len * input_size
 20 | """
 21 | if self.training == False or self.dropout_p == 0:
 22 | return x
 23 | 
 24 | if len(x.size()) == 3:
 25 | mask = 1.0 / (1-self.dropout_p) * torch.bernoulli((1-self.dropout_p) * (x.data.new(x.size(0), x.size(2)).zero_() + 1))
 26 | mask.requires_grad = False
 27 | return mask.unsqueeze(1).expand_as(x) * x
 28 | else:
 29 | return F.dropout(x, p=self.dropout_p, training=self.training)
-------------------------------------------------------------------------------- /models/eval.py: --------------------------------------------------------------------------------
 1 | import logging
 2 | import pandas as pd
 3 | from utils.data_utils import METRICS, TaskType
 4 | import math
 5 | import os
 6 | from tqdm import tqdm
 7 | logger = logging.getLogger("multi_task")
 8 | 
 9 | def evaluate(dataSet, batchSampler, dataLoader, taskParams,
 10 | model, gpu, evalBatchSize, needMetrics, 
hasTrueLabels, 11 | wrtDir=None, wrtPredPath = None, returnPred=False): 12 | ''' 13 | Function to make predictions on the given data. The provided data can be multiple tasks or single task 14 | It will seprate out the predictions based on task id for metrics evaluation 15 | ''' 16 | numTasks = len(taskParams.taskIdNameMap) 17 | numStep = math.ceil(len(dataLoader)/evalBatchSize) 18 | allPreds = [[] for _ in range(numTasks)] 19 | allLabels = [[] for _ in range(numTasks)] 20 | allScores = [[] for _ in range(numTasks)] 21 | allIds = [[] for _ in range(numTasks)] 22 | 23 | for batchMetaData, batchData in tqdm(dataLoader, total=numStep, desc = 'Eval'): 24 | batchMetaData, batchData = batchSampler.patch_data(batchMetaData,batchData, gpu = gpu) 25 | prediction, scores = model.predict_step(batchMetaData, batchData) 26 | 27 | logger.debug("predictions in eval: {}".format(prediction)) 28 | batchTaskId = int(batchMetaData['task_id']) 29 | 30 | orgLabels = batchMetaData['label'] 31 | allLabels[batchTaskId].extend(orgLabels) 32 | 33 | logger.debug("batch task id in eval: {}".format(batchTaskId)) 34 | allPreds[batchTaskId].extend(prediction) 35 | allScores[batchTaskId].extend(scores) 36 | allIds[batchTaskId].extend(batchMetaData['uids']) 37 | 38 | for i in range(len(allPreds)): 39 | if allPreds[i] == []: 40 | continue 41 | taskName = taskParams.taskIdNameMap[i] 42 | taskType = taskParams.taskTypeMap[taskName] 43 | labMap = taskParams.labelMap[taskName] 44 | 45 | if taskType == TaskType.NER: 46 | # NER requires label clipping. We''ve already clipped our predictions 47 | #using attn Masks, so we will clip labels to predictions len 48 | # Also we need to remove the extra tokens from predictions based on labels 49 | #print(labMap) 50 | labMapRevN = {v:k for k,v in labMap.items()} 51 | 52 | for j, (p, l) in enumerate(zip(allPreds[i], allLabels[i])): 53 | allLabels[i][j] = l[:len(p)] 54 | allPreds[i][j] = [labMapRevN[int(ele)] for ele in p] 55 | allLabels[i][j] = [labMapRevN[int(ele)] for ele in allLabels[i][j]] 56 | #allPreds[i] = [ [ labMapRev[int(p)] for p in pp ] for pp in allPreds[i] ] 57 | #allLabels[i] = [ [labMapRev[int(l)] for l in ll] for ll in allLabels[i] ] 58 | 59 | newPreds = [] 60 | newLabels = [] 61 | newScores = [] 62 | for m, samp in enumerate(allLabels[i]): 63 | Preds = [] 64 | Labels = [] 65 | Scores = [] 66 | for n, ele in enumerate(samp): 67 | #print(ele) 68 | if ele != '[CLS]' and ele != '[SEP]' and ele != 'X': 69 | #print('inside') 70 | Preds.append(allPreds[i][m][n]) 71 | Labels.append(ele) 72 | Scores.append(allScores[i][m][n]) 73 | #del allLabels[i][m][n] 74 | #del allPreds[i][m][n] 75 | newPreds.append(Preds) 76 | newLabels.append(Labels) 77 | newScores.append(Scores) 78 | 79 | allLabels[i] = newLabels 80 | allPreds[i] = newPreds 81 | allScores[i] = newScores 82 | 83 | if taskType == TaskType.SingleSenClassification and labMap is not None: 84 | 85 | labMapRevC = {v:k for k,v in labMap.items()} 86 | allPreds[i] = [labMapRevC[int(ele)] for ele in allPreds[i]] 87 | allLabels[i] = [labMapRevC[int(ele)] for ele in allLabels[i]] 88 | 89 | if needMetrics: 90 | # fetch metrics from task id 91 | for i in range(len(allPreds)): 92 | if allPreds[i] == []: 93 | continue 94 | taskName = taskParams.taskIdNameMap[i] 95 | metrics = taskParams.metricsMap[taskName] 96 | if metrics is None: 97 | logger.info("No metrics are provided in task params (file)") 98 | continue 99 | 100 | logger.info("********** {} Evaluation************\n".format(taskName)) 101 | for m in metrics: 102 | metricVal = 
METRICS[m](allLabels[i], allPreds[i]) 103 | logger.info("{} : {}".format(m, metricVal)) 104 | 105 | if wrtPredPath is not None and wrtDir is not None: 106 | for i in range(len(allPreds)): 107 | if allPreds[i] == []: 108 | continue 109 | taskName = taskParams.taskIdNameMap[i] 110 | if hasTrueLabels: 111 | df = pd.DataFrame({"uid" : allIds[i], "prediction" : allPreds[i], "label" : allLabels[i]}) 112 | else: 113 | df = pd.DataFrame({"uid" : allIds[i], "prediction" : allPreds[i]}) 114 | 115 | savePath = os.path.join(wrtDir, "{}_{}".format(taskName, wrtPredPath)) 116 | df.to_csv(savePath, sep = "\t", index = False) 117 | logger.info("Predictions File saved at {}".format(savePath)) 118 | 119 | if returnPred: 120 | return allIds, allPreds, allScores -------------------------------------------------------------------------------- /models/loss.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.loss import _Loss 2 | import torch.nn.functional as F 3 | import torch 4 | from enum import IntEnum 5 | 6 | class CrossEntropyLoss(_Loss): 7 | def __init__(self, alpha=1.0, name='Cross Entropy Loss'): 8 | super().__init__() 9 | 10 | self.alpha = alpha 11 | self.name = name 12 | self.ignore_index = -1 13 | def forward(self, inp, target, attnMasks = None): 14 | """ 15 | This is the standard cross entropy loss as defined in pytorch. 16 | This loss should be used for single sentence or sentence pair classification tasks. 17 | 18 | To use this loss for training, set ``loss_type`` : **CrossEntropyLoss** in task file 19 | """ 20 | loss = F.cross_entropy(inp, target, ignore_index=self.ignore_index) 21 | loss *= self.alpha 22 | return loss 23 | 24 | class NERLoss(_Loss): 25 | def __init__(self, alpha=1.0, name='Cross Entropy Loss'): 26 | super().__init__() 27 | 28 | self.alpha = alpha 29 | self.name = name 30 | self.ignore_index = -1 #used to return 0 loss for such values 31 | def forward(self, inp, target, attnMasks = None): 32 | 33 | """ 34 | This loss is a modified version of cross entropy loss for NER/sequence labelling tasks. 35 | This loss ignores extra ‘O’ values through attention masks. 36 | 37 | To use this loss for training, set ``loss_type`` : **NERLoss** in task file 38 | """ 39 | 40 | ''' 41 | inp shape would be (batchSize, maxSeqlen, classNum). But for loss calculation 42 | we need (batchSize, classNum). Hence we will squeeze the batchSize and maxSeqlen together. 43 | 44 | In NER, we have to ignore the loss created for the extra padding that 45 | has been done for making labels till max seq length. 
Hence we use
46 | attention masks to ignore the loss at those positions
47 | '''
48 | if attnMasks is not None:
49 | activeMask = attnMasks.view(-1) == 1
50 | nerLogits = inp.view(-1, inp.size(-1))
51 | nerLabels = torch.where(
52 | activeMask, target.view(-1), torch.tensor(self.ignore_index).type_as(target)
53 | )
54 | finalLoss = F.cross_entropy(nerLogits, nerLabels, ignore_index=self.ignore_index)
55 | 
56 | else:
57 | finalLoss = F.cross_entropy(inp.view(-1, inp.size(-1)), target.view(-1),
58 | ignore_index=self.ignore_index)
59 | 
60 | finalLoss *= self.alpha
61 | return finalLoss
62 | 
63 | class SpanLoss(_Loss):
64 | def __init__(self, alpha=1.0, name='Span Cross Entropy Loss'):
65 | super().__init__()
66 | 
67 | self.alpha = alpha
68 | self.name = name
69 | self.ignore_index = -1
70 | def forward(self, inp, target, attnMasks = None):
71 | 
72 | # assert that inp and target both contain start and end values
73 | assert len(inp) == 2, "start and end logits should be present for span loss calc"
74 | assert len(target) == 2, "start and end targets should be present for span loss calc"
75 | 
76 | startInp, endInp = inp
77 | startTarg, endTarg = target
78 | 
79 | startLoss = F.cross_entropy(startInp, startTarg, ignore_index=self.ignore_index)
80 | endLoss = F.cross_entropy(endInp, endTarg, ignore_index=self.ignore_index)
81 | 
82 | loss = 0.5 * (startLoss + endLoss) * self.alpha
83 | return loss
84 | 
85 | 
86 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | seqeval==0.0.12
2 | tqdm==4.30.0
3 | ipywidgets==7.4.2
4 | Keras==2.3.1
5 | transformers==2.8.0
6 | joblib==0.13.2
7 | torch==1.2.0
8 | tensorflow==1.15.2
9 | numpy==1.18.1
10 | sphinx_rtd_theme==0.4.3
11 | pandas==1.0.1
12 | scikit_learn==0.23.1
13 | PyYAML==5.3.1
--------------------------------------------------------------------------------
/run_inference.py:
--------------------------------------------------------------------------------
1 | """
2 | Script for making inference on a test file with a saved multi-task model.
3 | The input file has to be a tsv in the standard data format for the corresponding task.
4 | 
5 | To get predictions on a test file (say test.tsv), pass it via --pred_file_path along with the saved model path and task name.
6 | """
7 | from utils.task_utils import TasksParam
8 | from utils.data_utils import TaskType, ModelType, NLP_MODELS
9 | from models.eval import evaluate
10 | from models.model import multiTaskModel
11 | from data_preparation import *
12 | from models.data_manager import allTasksDataset, Batcher, batchUtils
13 | from torch.utils.data import Dataset, DataLoader, BatchSampler
14 | import argparse
15 | import os
16 | import torch
17 | import logging
18 | logger = logging.getLogger("multi_task")
19 | device = torch.device('cpu')
20 | if torch.cuda.is_available():
21 | device = torch.device('cuda')
22 | 
23 | def main():
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument('--pred_file_path', type=str, required=True,
26 | help="path to the tsv file on which predictions are to be made")
27 | parser.add_argument('--out_dir', type = str, required=True,
28 | help="path to save the predictions")
29 | parser.add_argument('--has_labels', type=lambda s: str(s).lower() == 'true', default=False,  # parse 'True'/'False' strings into a real bool
30 | help = "pass True if labels are present in the file, else False")
31 | parser.add_argument('--task_name', type=str, required = True,
32 | help = "task name for which prediction is required.")
33 | parser.add_argument('--saved_model_path', type=str, required = True,
34 | help = "path to 
the trained model to load") 35 | parser.add_argument('--eval_batch_size', type=int, default = 32, 36 | help = "batch size for prediction") 37 | parser.add_argument('--max_seq_len', type=int, 38 | help = "max seq len used during training of model") 39 | parser.add_argument('--seed', type=int, default = 42, 40 | help = "seed") 41 | args = parser.parse_args() 42 | 43 | allParams = vars(args) 44 | assert os.path.exists(args.saved_model_path), "saved model not present at {}".format(args.saved_model_path) 45 | assert os.path.exists(args.pred_file_path), "prediction tsv file not present at {}".format(args.pred_file_path) 46 | loadedDict = torch.load(args.saved_model_path, map_location=device) 47 | taskParamsModel = loadedDict['task_params'] 48 | logger.info('Task Params loaded from saved model.') 49 | 50 | assert args.task_name in taskParamsModel.taskIdNameMap.values(), "task Name not in task names for loaded model" 51 | 52 | taskId = [taskId for taskId, taskName in taskParamsModel.taskIdNameMap.items() if taskName==args.task_name][0] 53 | taskType = taskParamsModel.taskTypeMap[args.task_name] 54 | 55 | # preparing data from tsv file 56 | rows = load_data(args.pred_file_path, taskType, hasLabels = args.has_labels) 57 | 58 | modelName = taskParamsModel.modelType.name.lower() 59 | _, _ , tokenizerClass, defaultName = NLP_MODELS[modelName] 60 | configName = taskParamsModel.modelConfig 61 | if configName is None: 62 | configName = defaultName 63 | 64 | #making tokenizer for model 65 | tokenizer = tokenizerClass.from_pretrained(configName) 66 | logger.info('{} model tokenizer loaded for config {}'.format(modelName, configName)) 67 | 68 | dataPath = os.path.join(args.out_dir, '{}_prediction_data'.format(configName)) 69 | if not os.path.exists(dataPath): 70 | os.makedirs(dataPath) 71 | wrtFile = os.path.join(dataPath, '{}.json'.format(args.pred_file_path.split('/')[-1].split('.')[0])) 72 | print('Processing Started...') 73 | create_data_multithreaded(rows, wrtFile, tokenizer, taskParamsModel, args.task_name, 74 | args.max_seq_len, multithreaded = True) 75 | print('Data Processing done for {}. 
File saved at {}'.format(args.task_name, wrtFile)) 76 | 77 | allTaskslist = [ 78 | {"data_task_id" : int(taskId), 79 | "data_path" : wrtFile, 80 | "data_task_type" : taskType, 81 | "data_task_name" : args.task_name} 82 | ] 83 | allData = allTasksDataset(allTaskslist) 84 | batchSampler = Batcher(allData, batchSize=args.eval_batch_size, seed = args.seed) 85 | batchSamplerUtils = batchUtils(isTrain = False, modelType= taskParamsModel.modelType, 86 | maxSeqLen = args.max_seq_len) 87 | inferDataLoader = DataLoader(allData, batch_sampler=batchSampler, 88 | collate_fn=batchSamplerUtils.collate_fn, 89 | pin_memory=torch.cuda.is_available()) 90 | 91 | allParams['task_params'] = taskParamsModel 92 | allParams['gpu'] = torch.cuda.is_available() 93 | # dummy values 94 | allParams['num_train_steps'] = 10 95 | allParams['warmup_steps'] = 0 96 | allParams['learning_rate'] = 2e-5 97 | allParams['epsilon'] = 1e-8 98 | 99 | #making and loading model 100 | model = multiTaskModel(allParams) 101 | model.load_multi_task_model(loadedDict) 102 | 103 | with torch.no_grad(): 104 | wrtPredFile = 'predictions.tsv' 105 | evaluate(allData, batchSampler, inferDataLoader, taskParamsModel, 106 | model, gpu=allParams['gpu'], evalBatchSize=args.eval_batch_size, needMetrics=False, hasTrueLabels=False, 107 | wrtDir=args.out_dir, wrtPredPath=wrtPredFile) 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | from enum import IntEnum 2 | from transformers import * 3 | from models.loss import * 4 | from utils.eval_metrics import * 5 | from utils.tranform_functions import * 6 | 7 | NLP_MODELS = { 8 | "bert": (BertConfig, BertModel, BertTokenizer, 'bert-base-uncased'), 9 | "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'), 10 | "albert": (AlbertConfig, AlbertModel, AlbertTokenizer, 'albert-base-v2'), 11 | "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer, 'roberta-base'), 12 | "xlnet" : (XLNetConfig, XLNetModel, XLNetTokenizer, 'xlnet-base-cased'), 13 | "electra" : (ElectraConfig, ElectraModel, ElectraTokenizer, 'google/electra-small-generator') 14 | } 15 | LOSSES = { 16 | "crossentropyloss" : CrossEntropyLoss, 17 | "nerloss" : NERLoss 18 | } 19 | 20 | METRICS = { 21 | "classification_accuracy": classification_accuracy, 22 | "classification_f1_score": classification_f1_score, 23 | "seqeval_f1_score" : seqeval_f1_score, 24 | "seqeval_precision" : seqeval_precision, 25 | "seqeval_recall" : seqeval_recall, 26 | "snips_f1_score" : snips_f1_score, 27 | "snips_precision" : snips_precision, 28 | "snips_recall" : snips_recall, 29 | "classification_recall" : classification_recall 30 | } 31 | 32 | TRANSFORM_FUNCS = { 33 | "snips_intent_ner_to_tsv" : snips_intent_ner_to_tsv, 34 | "coNLL_ner_pos_to_tsv" : coNLL_ner_pos_to_tsv, 35 | "snli_entailment_to_tsv" : snli_entailment_to_tsv, 36 | "bio_ner_to_tsv" : bio_ner_to_tsv, 37 | "create_fragment_detection_tsv" : create_fragment_detection_tsv, 38 | "msmarco_query_type_to_tsv" : msmarco_query_type_to_tsv, 39 | "imdb_sentiment_data_to_tsv" : imdb_sentiment_data_to_tsv, 40 | "qqp_query_similarity_to_tsv" : qqp_query_similarity_to_tsv, 41 | "msmarco_answerability_detection_to_tsv" : msmarco_answerability_detection_to_tsv, 42 | "query_correctness_to_tsv" : query_correctness_to_tsv, 43 | "clinc_out_of_scope_to_tsv" : clinc_out_of_scope_to_tsv 44 | } 45 | 46 | 
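# Illustrative use of the registries above (a reader-facing sketch, not code
# that is called in this file): each NLP_MODELS entry maps a lowercase model
# name to its transformers config/model/tokenizer classes plus a default
# pretrained checkpoint name, e.g.
#
#   configClass, modelClass, tokenizerClass, defaultName = NLP_MODELS["bert"]
#   tokenizer = tokenizerClass.from_pretrained(defaultName)  # 'bert-base-uncased'
#   encoder = modelClass.from_pretrained(defaultName)
#
# LOSSES, METRICS and TRANSFORM_FUNCS are looked up the same way, keyed by the
# lowercase names given under ``loss_type``, ``metrics`` and ``transform_func``
# in the task and transform YAML files.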
class ModelType(IntEnum):
47 | BERT = 1
48 | DISTILBERT = 2
49 | ALBERT = 3
50 | ROBERTA = 4
51 | XLNET = 5
52 | ELECTRA = 6
53 | 
54 | class TaskType(IntEnum):
55 | SingleSenClassification = 1
56 | SentencePairClassification = 2
57 | NER = 3
58 | 
59 | class LossType(IntEnum):
60 | CrossEntropyLoss = 0
61 | NERLoss = 1
62 | 
63 | 
--------------------------------------------------------------------------------
/utils/eval_metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | File for creating metric functions
3 | """
4 | from sklearn.metrics import accuracy_score, f1_score
5 | from sklearn.metrics import recall_score as class_recall_score
6 | from seqeval.metrics import f1_score as seq_f1
7 | from seqeval.metrics import precision_score, recall_score
8 | 
9 | def classification_accuracy(yTrue, yPred):
10 | """
11 | Accuracy score for classification tasks, computed from the labels provided in the file and the predictions from the multi-task model.
12 | It takes a batch of predictions and labels.
13 | 
14 | To use this metric, add **classification_accuracy** into the list of ``metrics`` in the task file.
15 | 
16 | Args:
17 | yTrue (:obj:`list`) : [0, 1, 2, 3]
18 | yPred (:obj:`list`) : [0, 2, 1, 3]
19 | 
20 | """
21 | return accuracy_score(yTrue, yPred)*100
22 | 
23 | def classification_f1_score(yTrue, yPred):
24 | """
25 | Standard f1 score from sklearn for classification tasks.
26 | It takes a batch of predictions and labels.
27 | 
28 | To use this metric, add **classification_f1_score** into the list of ``metrics`` in the task file.
29 | 
30 | Args:
31 | yTrue (:obj:`list`) : [0, 1, 2, 3]
32 | yPred (:obj:`list`) : [0, 2, 1, 3]
33 | 
34 | """
35 | return f1_score(yTrue, yPred, average='micro')
36 | 
37 | def classification_recall(yTrue, yPred):
38 | """
39 | Standard recall score from sklearn for classification tasks.
40 | It takes a batch of predictions and labels.
41 | 
42 | To use this metric, add **classification_recall** into the list of ``metrics`` in the task file.
43 | 
44 | Args:
45 | yTrue (:obj:`list`) : [0, 1, 2, 3]
46 | yPred (:obj:`list`) : [0, 2, 1, 3]
47 | 
48 | """
49 | return class_recall_score(yTrue, yPred, average='micro')
50 | 
51 | def seqeval_f1_score(yTrue, yPred):
52 | """
53 | f1 score for NER/sequence labelling tasks taken from the `seqeval <https://github.com/chakki-works/seqeval>`_ library.
54 | 
55 | To use this metric, add **seqeval_f1_score** into the list of ``metrics`` in the task file.
56 | 
57 | Args:
58 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
59 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
60 | """
61 | return seq_f1(yTrue, yPred)
62 | 
63 | def seqeval_precision(yTrue, yPred):
64 | """
65 | Precision score for NER/sequence labelling tasks taken from the `seqeval <https://github.com/chakki-works/seqeval>`_ library.
66 | 
67 | To use this metric, add **seqeval_precision** into the list of ``metrics`` in the task file.
68 | 
69 | Args:
70 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
71 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
72 | """
73 | return precision_score(yTrue, yPred)
74 | 
75 | def seqeval_recall(yTrue, yPred):
76 | 
77 | """
78 | Recall score for NER/sequence labelling tasks taken from the `seqeval <https://github.com/chakki-works/seqeval>`_ library.
79 | 
80 | To use this metric, add **seqeval_recall** into the list of ``metrics`` in the task file. 
81 | 
82 | Args:
83 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
84 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
85 | """
86 | return recall_score(yTrue, yPred)
87 | 
88 | 
89 | # computeF1Score below is modified from conlleval.pl
90 | def __startOfChunk(prevTag, tag, prevTagType, tagType, chunkStart = False):
91 | if prevTag == 'B' and tag == 'B':
92 | chunkStart = True
93 | if prevTag == 'I' and tag == 'B':
94 | chunkStart = True
95 | if prevTag == 'O' and tag == 'B':
96 | chunkStart = True
97 | if prevTag == 'O' and tag == 'I':
98 | chunkStart = True
99 | 
100 | if prevTag == 'E' and tag == 'E':
101 | chunkStart = True
102 | if prevTag == 'E' and tag == 'I':
103 | chunkStart = True
104 | if prevTag == 'O' and tag == 'E':
105 | chunkStart = True
106 | 
107 | 
108 | 
109 | if tag != 'O' and tag != '.' and prevTagType != tagType:
110 | chunkStart = True
111 | return chunkStart
112 | 
113 | def __endOfChunk(prevTag, tag, prevTagType, tagType, chunkEnd = False):
114 | if prevTag == 'B' and tag == 'B':
115 | chunkEnd = True
116 | if prevTag == 'B' and tag == 'O':
117 | chunkEnd = True
118 | if prevTag == 'I' and tag == 'B':
119 | chunkEnd = True
120 | if prevTag == 'I' and tag == 'O':
121 | chunkEnd = True
122 | 
123 | if prevTag == 'E' and tag == 'E':
124 | chunkEnd = True
125 | if prevTag == 'E' and tag == 'I':
126 | chunkEnd = True
127 | if prevTag == 'E' and tag == 'O':
128 | chunkEnd = True
129 | 
130 | 
131 | 
132 | if prevTag != 'O' and prevTag != '.' and prevTagType != tagType:
133 | chunkEnd = True
134 | return chunkEnd
135 | 
136 | def __splitTagType(tag):
137 | s = tag.split('-')
138 | if len(s) > 2 or len(s) == 0:
139 | raise ValueError('wrong tag format: 
it must be like O, B-xxx or I-xxx')
140 | if len(s) == 1:
141 | tag = s[0]
142 | tagType = ""
143 | else:
144 | tag = s[0]
145 | tagType = s[1]
146 | return tag, tagType
147 | 
148 | def computeF1Score(correct_slots, pred_slots):
149 | 
150 | correctChunk = {}
151 | correctChunkCnt = 0
152 | foundCorrect = {}
153 | foundCorrectCnt = 0
154 | foundPred = {}
155 | foundPredCnt = 0
156 | correctTags = 0
157 | tokenCount = 0
158 | for correct_slot, pred_slot in zip(correct_slots, pred_slots):
159 | inCorrect = False
160 | lastCorrectTag = 'O'
161 | lastCorrectType = ''
162 | lastPredTag = 'O'
163 | lastPredType = ''
164 | for c, p in zip(correct_slot, pred_slot):
165 | correctTag, correctType = __splitTagType(c)
166 | predTag, predType = __splitTagType(p)
167 | 
168 | if inCorrect:
169 | if __endOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) and \
170 | __endOfChunk(lastPredTag, predTag, lastPredType, predType) and \
171 | (lastCorrectType == lastPredType):
172 | inCorrect = False
173 | correctChunkCnt += 1
174 | if lastCorrectType in correctChunk:
175 | correctChunk[lastCorrectType] += 1
176 | else:
177 | correctChunk[lastCorrectType] = 1
178 | elif __endOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) != \
179 | __endOfChunk(lastPredTag, predTag, lastPredType, predType) or \
180 | (correctType != predType):
181 | inCorrect = False
182 | 
183 | if __startOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType) and \
184 | __startOfChunk(lastPredTag, predTag, lastPredType, predType) and \
185 | (correctType == predType):
186 | inCorrect = True
187 | 
188 | if __startOfChunk(lastCorrectTag, correctTag, lastCorrectType, correctType):
189 | foundCorrectCnt += 1
190 | if correctType in foundCorrect:
191 | foundCorrect[correctType] += 1
192 | else:
193 | foundCorrect[correctType] = 1
194 | 
195 | if __startOfChunk(lastPredTag, predTag, lastPredType, predType):
196 | foundPredCnt += 1
197 | if predType in foundPred:
198 | foundPred[predType] += 1
199 | else:
200 | foundPred[predType] = 1
201 | 
202 | if correctTag == predTag and correctType == predType:
203 | correctTags += 1
204 | 
205 | tokenCount += 1
206 | 
207 | lastCorrectTag = correctTag
208 | lastCorrectType = correctType
209 | lastPredTag = predTag
210 | lastPredType = predType
211 | 
212 | if inCorrect:
213 | correctChunkCnt += 1
214 | if lastCorrectType in correctChunk:
215 | correctChunk[lastCorrectType] += 1
216 | else:
217 | correctChunk[lastCorrectType] = 1
218 | 
219 | if foundPredCnt > 0:
220 | precision = 100*correctChunkCnt/foundPredCnt
221 | else:
222 | precision = 0
223 | 
224 | if foundCorrectCnt > 0:
225 | recall = 100*correctChunkCnt/foundCorrectCnt
226 | else:
227 | recall = 0
228 | 
229 | if (precision+recall) > 0:
230 | f1 = (2*precision*recall)/(precision+recall)
231 | else:
232 | f1 = 0
233 | 
234 | return f1, precision, recall
235 | 
236 | def snips_f1_score(yTrue, yPred):
237 | 
238 | """
239 | f1 score for the SNIPS NER/slot filling task, taken from the `MiuLab SlotGated-SLU <https://github.com/MiuLab/SlotGated-SLU>`_ repository.
240 | 
241 | To use this metric, add **snips_f1_score** into the list of ``metrics`` in the task file. 
242 | 
243 | Args:
244 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
245 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
246 | 
247 | """
248 | 
249 | snipsF1, _, _ = computeF1Score(yTrue, yPred)
250 | return snipsF1
251 | 
252 | def snips_precision(yTrue, yPred):
253 | """
254 | Precision score for the SNIPS NER/slot filling task, taken from the `MiuLab SlotGated-SLU <https://github.com/MiuLab/SlotGated-SLU>`_ repository.
255 | 
256 | To use this metric, add **snips_precision** into the list of ``metrics`` in the task file.
257 | 
258 | Args:
259 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
260 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
261 | 
262 | """
263 | 
264 | _, snipsPrecision, _ = computeF1Score(yTrue, yPred)
265 | return snipsPrecision
266 | 
267 | def snips_recall(yTrue, yPred):
268 | 
269 | """
270 | Recall score for the SNIPS NER/slot filling task, taken from the `MiuLab SlotGated-SLU <https://github.com/MiuLab/SlotGated-SLU>`_ repository.
271 | 
272 | To use this metric, add **snips_recall** into the list of ``metrics`` in the task file.
273 | 
274 | Args:
275 | yTrue (:obj:`list of list`) : [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
276 | yPred (:obj:`list of list`) : [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
277 | 
278 | """
279 | _, _, snipsRecall = computeF1Score(yTrue, yPred)
280 | return snipsRecall
281 | 
282 | 
--------------------------------------------------------------------------------
/utils/task_utils.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import os
3 | import joblib
4 | from collections import OrderedDict
5 | from utils.data_utils import TaskType, ModelType, LossType, METRICS
6 | 
7 | class TasksParam:
8 | '''
9 | This class keeps the details mentioned in the tasks yml file as attributes.
10 | '''
11 | def __init__(self, taskFilePath):
12 | # dictionary holding all the tasks details with
13 | # task name as key. 
14 | # The idea of storing and retrieving task information in a yaml file, and processing it
15 | # with dictionary maps and IntEnum classes, is inspired by Microsoft's mt-dnn
16 | 
17 | self.taskDetails = yaml.safe_load(open(taskFilePath))
18 | self.modelType = self.validity_checks()
19 | 
20 | classNumMap = {}
21 | taskTypeMap = {}
22 | taskNameIdMap = {}
23 | taskIdNameMap = OrderedDict()
24 | metricsMap = {}
25 | dropoutProbMap = {}
26 | lossMap = {}
27 | labelMap = {}
28 | lossWeightMap = {}
29 | fileNamesMap = {}
30 | 
31 | for i, (taskName, taskVals) in enumerate(self.taskDetails.items()):
32 | taskNameIdMap[taskName] = i
33 | taskIdNameMap[i] = taskName
34 | taskTypeMap[taskName] = TaskType[taskVals["task_type"]]
35 | fileNamesMap[taskName] = list(taskVals["file_names"])
36 | 
37 | modelConfig = None
38 | dropoutProbMap[taskName] = 0.05
39 | lossMap[taskName] = None
40 | lossWeightMap[taskName] = float(1.0)
41 | labelMap[taskName] = None
42 | metricsMap[taskName] = None
43 | 
44 | if "class_num" in taskVals:
45 | classNumMap[taskName] = taskVals["class_num"]
46 | 
47 | if "config_name" in taskVals:
48 | modelConfig = taskVals["config_name"]
49 | 
50 | if "dropout_prob" in taskVals:
51 | dropoutProbMap[taskName] = taskVals["dropout_prob"]
52 | 
53 | if "metrics" in taskVals:
54 | metricsMap[taskName] = [m.lower() for m in taskVals["metrics"]]
55 | 
56 | # loss map
57 | if "loss_type" in taskVals:
58 | lossMap[taskName] = LossType[taskVals["loss_type"]]
59 | 
60 | if "label_map_or_file" in taskVals:
61 | '''
62 | Label Map is the list of label names (or tag names in NER) which are
63 | present in the data. We convert it into a dict mapping each label to an index.
64 | This dict is used to create the label-to-index map, so maintaining order is
65 | important. It is required in case of NER. For classification tasks, a label map
66 | is not needed when the labels in the data are already numeric, and required otherwise.
67 | 
68 | DO NOT ADD EXTRA SPECIAL TOKENS LIKE '[CLS]', '[SEP]', 'X' TO THE LABEL MAP OR COUNT THEM IN CLASS NUMBER
69 | 
70 | It can also take the label map joblib file generated by the data transformations
71 | '''
72 | if isinstance(taskVals["label_map_or_file"], list):
73 | labelMap[taskName] = {lab:i for i, lab in enumerate(taskVals["label_map_or_file"])}
74 | 
75 | elif isinstance(taskVals["label_map_or_file"], str):
76 | labelMap[taskName] = joblib.load(taskVals["label_map_or_file"])
77 | 
78 | else:
79 | raise ValueError("label_map_or_file not recognized")
80 | 
81 | if taskTypeMap[taskName] == TaskType.NER:
82 | labelMap[taskName]['[CLS]'] = len(labelMap[taskName])
83 | labelMap[taskName]['[SEP]'] = len(labelMap[taskName])
84 | labelMap[taskName]['X'] = len(labelMap[taskName])
85 | if "O" not in labelMap[taskName]:
86 | labelMap[taskName]["O"] = len(labelMap[taskName])
87 | 
88 | classNumMap[taskName] = len(labelMap[taskName])
89 | 
90 | if "loss_weight" in taskVals:
91 | '''
92 | Loss weight for an individual task. 
This factor
93 | is multiplied directly with the loss calculated
94 | for backpropagation
95 | '''
96 | lossWeightMap[taskName] = float(taskVals["loss_weight"])
97 | else:
98 | lossWeightMap[taskName] = float(1.0)
99 | 
100 | self.classNumMap = classNumMap
101 | self.taskTypeMap = taskTypeMap
102 | self.taskNameIdMap = taskNameIdMap
103 | self.taskIdNameMap = taskIdNameMap
104 | self.modelConfig = modelConfig
105 | self.metricsMap = metricsMap
106 | self.fileNamesMap = fileNamesMap
107 | self.dropoutProbMap = dropoutProbMap
108 | self.lossMap = lossMap
109 | self.labelMap = labelMap
110 | self.lossWeightMap = lossWeightMap
111 | 
112 | def validity_checks(self):
113 | '''
114 | Check whether the yml file is well-formed.
115 | '''
116 | requiredParams = {"task_type", "loss_type", "file_names"}
117 | uniqueModel = set()
118 | uniqueConfig = set()
119 | for taskName, taskVals in self.taskDetails.items():
120 | # check task name
121 | assert taskName.isalpha(), "only alphabets are allowed in task name. No special characters/numbers/whitespaces allowed. Task Name: %s" % taskName
122 | 
123 | # check all required arguments
124 | assert len(requiredParams.intersection(set(taskVals.keys()))) == len(requiredParams), "following parameters are required {}".format(requiredParams)
125 | 
126 | # check if loss and model type are valid
127 | try:
128 | LossType[taskVals["loss_type"]]
129 | ModelType[taskVals["model_type"]]
130 | except KeyError:
131 | print("allowed loss {}".format(list(LossType)))
132 | print("allowed model type {}".format(list(ModelType)))
133 | raise
134 | 
135 | # check metrics if present
136 | if "metrics" in taskVals:
137 | for m in taskVals["metrics"]:
138 | assert m.lower() in METRICS, "allowed metrics are {}".format(METRICS.keys())
139 | 
140 | # check model type; only one model type is allowed for all tasks
141 | uniqueModel.add(ModelType[taskVals["model_type"]])
142 | if "config_name" in taskVals:
143 | uniqueConfig.add(taskVals["config_name"])
144 | 
145 | # check if all data files exist for the task
146 | #for fileName in taskVals['file_names']:
147 | #assert os.path.exists(fileName)
148 | 
149 | # either label_map_or_file or class_num is required
150 | assert "label_map_or_file" in taskVals or "class_num" in taskVals, "either class_num or label_map_or_file is required"
151 | 
152 | # we definitely require a label mapping for NER tasks
153 | if taskVals["task_type"] == 'NER':
154 | assert "label_map_or_file" in taskVals, "Unique Tags/Labels or map file needs to be mentioned in label_map_or_file for NER"
155 | 
156 | assert len(uniqueModel) == 1, "Only one type of model can be shared across all tasks"
157 | assert len(uniqueConfig) <= 1, "Model config has to be the same across all tasks"
158 | 
159 | # return the shared model type
160 | return list(uniqueModel)[0]
161 | 
162 | 
163 | 
--------------------------------------------------------------------------------
/utils/transform_utils.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import os
3 | import joblib
4 | from utils.data_utils import TRANSFORM_FUNCS
5 | 
6 | class TransformParams:
7 | '''
8 | This class keeps the details mentioned in the transform yaml file for the
9 | case when data transformation is required to be performed. 
10 | '''
11 | def __init__(self, transformFilePath):
12 | 
13 | self.transformDetails = yaml.safe_load(open(transformFilePath))
14 | self.validity_checks()
15 | transformFnMap = {}
16 | transformParamsMap = {}
17 | readFileNamesMap = {}
18 | readDirMap = {}
19 | saveDirMap = {}
20 | 
21 | for transformName, transformVals in self.transformDetails.items():
22 | transformFnMap[transformName] = transformVals['transform_func']
23 | transformParamsMap[transformName] = {}
24 | readFileNamesMap[transformName] = list(transformVals['read_file_names'])
25 | readDirMap[transformName] = transformVals['read_dir']
26 | saveDirMap[transformName] = transformVals['save_dir']
27 | 
28 | if 'transform_params' in transformVals:
29 | transformParamsMap[transformName] = dict(transformVals['transform_params'])
30 | 
31 | self.transformFnMap = transformFnMap
32 | self.transformParamsMap = transformParamsMap
33 | self.readFileNamesMap = readFileNamesMap
34 | self.readDirMap = readDirMap
35 | self.saveDirMap = saveDirMap
36 | 
37 | def validity_checks(self):
38 | '''
39 | Check whether the transform yml file is well-formed.
40 | '''
41 | requiredParams = {"transform_func", "read_dir", "read_file_names", "save_dir"}
42 | for transformName, transformVals in self.transformDetails.items():
43 | # check all required arguments
44 | assert len(requiredParams.intersection(set(transformVals.keys()))) == len(requiredParams), "following parameters are required {}".format(requiredParams)
45 | 
46 | # check if the transform function is among the defined transform functions
47 | assert transformVals['transform_func'] in TRANSFORM_FUNCS.keys(), "{} transform fn is not in following defined functions {}".format(transformVals['transform_func'],
48 | TRANSFORM_FUNCS.keys())
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
--------------------------------------------------------------------------------
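To make the YAML-driven configuration above concrete, here is a minimal, self-contained sketch of how TasksParam digests a task file. The task name, tsv file names and output path are hypothetical placeholders; every key used is one handled in utils/task_utils.py above.

from utils.task_utils import TasksParam

# a single-task spec; 'sentiment' and the tsv names are placeholders
spec = """\
sentiment:
  model_type: BERT
  task_type: SingleSenClassification
  file_names:
    - imdb_sentiment_train.tsv
    - imdb_sentiment_dev.tsv
  class_num: 2
  loss_type: CrossEntropyLoss
  metrics:
    - classification_accuracy
"""
with open("tasks_file_demo.yml", "w") as f:
    f.write(spec)

taskParams = TasksParam("tasks_file_demo.yml")
print(taskParams.taskIdNameMap)   # OrderedDict([(0, 'sentiment')])
print(taskParams.taskTypeMap)     # {'sentiment': <TaskType.SingleSenClassification: 1>}
print(taskParams.classNumMap)     # {'sentiment': 2}
print(taskParams.lossMap)         # {'sentiment': <LossType.CrossEntropyLoss: 0>}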